xref: /illumos-gate/usr/src/uts/common/fs/ufs/ufs_dir.c (revision aba1133a5077b2daf9217c517f6aa15731135d8e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 
41 #pragma ident	"%Z%%M%	%I%	%E% SMI"
42 
43 /*
44  * Directory manipulation routines.
45  *
46  * When manipulating directories, the i_rwlock provides serialization
47  * since directories cannot be mmapped. The i_contents lock is redundant.
48  */
49 
50 #include <sys/types.h>
51 #include <sys/t_lock.h>
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/signal.h>
55 #include <sys/cred.h>
56 #include <sys/proc.h>
57 #include <sys/disp.h>
58 #include <sys/user.h>
59 #include <sys/vfs.h>
60 #include <sys/vnode.h>
61 #include <sys/stat.h>
62 #include <sys/mode.h>
63 #include <sys/buf.h>
64 #include <sys/uio.h>
65 #include <sys/dnlc.h>
66 #include <sys/fs/ufs_inode.h>
67 #include <sys/fs/ufs_fs.h>
68 #include <sys/mount.h>
69 #include <sys/fs/ufs_fsdir.h>
70 #include <sys/fs/ufs_trans.h>
71 #include <sys/fs/ufs_panic.h>
72 #include <sys/fs/ufs_quota.h>
73 #include <sys/errno.h>
74 #include <sys/debug.h>
75 #include <vm/seg.h>
76 #include <sys/sysmacros.h>
77 #include <sys/cmn_err.h>
78 #include <sys/cpuvar.h>
79 #include <sys/unistd.h>
80 #include <sys/policy.h>
81 
82 /*
83  * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
 * (see the endsearch computation in ufs_dirlook()).  Fail the build
 * outright if DIRBLKSIZ is ever changed to a value that is not a
 * power of 2.
84  */
85 #if !ISP2(DIRBLKSIZ)
86 #error	"DIRBLKSIZ not a power of 2"
87 #endif
88 
89 /*
90  * A virgin directory.
 *
 * Template for the contents of a newly made directory: a "." record
 * followed by a ".." record.  Each initializer row presumably supplies
 * (inode number, record length, name length, name) -- struct
 * dirtemplate is declared elsewhere, TODO confirm the field order.
 * Under that reading the inode numbers are left as 0 here, the "."
 * record is 12 bytes long, and the ".." record takes the remaining
 * DIRBLKSIZ - 12 bytes.
91  */
92 static struct dirtemplate mastertemplate = {
93 	0, 12, 1, ".",
94 	0, DIRBLKSIZ - 12, 2, ".."
95 };
96 
/*
 * LDIRSIZ(len): number of bytes needed for a directory record whose name
 * is len characters long.  That is the size of struct direct without its
 * MAXNAMLEN + 1 bytes of name storage, plus room for the name and one
 * terminating byte, rounded up to a multiple of 4.
 *
 * MAX_DIR_NAME_LEN(len): the reverse bound -- the longest name that fits
 * in a record of len bytes once the same fixed part and the one
 * terminating byte have been subtracted.
 *
 * Both macros parenthesize every use of their argument so that an
 * expression argument (e.g. "n << 1" or "c ? a : b") is evaluated as a
 * unit before the arithmetic is applied.
 */
#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + (((len) + 1 + 3) & ~3))
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
101 
/*
 * dnlc directory-cache handles are 64 bits wide.  For ufs each handle
 * carries two 32 bit values: the entry's inumber in the low half and a
 * directory offset in the high half.  A 32 bit offset limits cacheable
 * directories to 4GB, which is far larger than anything the directory
 * caching routines could hold in memory, so the limit is harmless.
 *
 *	H_TO_INO(h)		extract the inumber from a handle
 *	H_TO_OFF(h)		extract the directory offset from a handle
 *	INO_OFF_TO_H(ino, off)	build a handle from an inumber and offset
 */
#define	H_TO_INO(h)	((uint32_t)((h) & UINT_MAX))
#define	H_TO_OFF(h)	((off_t)((h) >> 32))
#define	INO_OFF_TO_H(ino, off) \
	((uint64_t)((ino) | ((uint64_t)(off) << 32)))
113 
114 /*
115  * The average size of a typical on disk directory entry is about 16 bytes
116  * and so defines AV_DIRECT_SHIFT : log2(16)
117  * This define is only used to approximate the number of entries
118  * in a directory. This is needed for dnlc_dir_start() which will immediately
119  * return an error if the value is not within its acceptable range of
120  * number of files in a directory.
121  */
122 #define	AV_DIRECT_SHIFT 4
123 /*
124  * If the directory size (from i_size) is greater than the ufs_min_dir_cache
125  * tunable then we request dnlc directory caching.
126  * This has been found to be profitable after 1024 file names.
 * The default is therefore 1024 entries scaled by the average entry
 * size, i.e. expressed in bytes of directory.
127  */
128 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;
129 
130 /* The time point the dnlc directory caching was disabled */
 /* (set from gethrtime() when dnlc_dir_start() reports DNOMEM) */
131 static hrtime_t ufs_dc_disable_at;
132 /* directory caching disable duration */
 /* (5 * NANOSEC; presumably five seconds in gethrtime() units) */
133 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5;
134 
 /*
  * dirchk: when non-zero, ufs_dirlook() runs the full dirmangled()
  * validation on every entry it scans instead of only on entries whose
  * record length is misaligned.  On by default in DEBUG kernels only.
  */
135 #ifdef DEBUG
136 int dirchk = 1;
137 #else /* !DEBUG */
138 int dirchk = 0;
139 #endif /* DEBUG */
 /*
  * ufs_negative_cache: when non-zero, a failed lookup in a directory
  * that still has links enters DNLC_NO_VNODE into the dnlc for the name.
  */
140 int ufs_negative_cache = 1;
 /* Not referenced in this part of the file; by its name a retry counter. */
141 uint64_t ufs_dirremove_retry_cnt;
142 
 /*
  * Forward declarations of the file-local helpers.  These are old-style
  * declarations with empty parameter lists, so the compiler does not
  * check the argument lists used at the call sites in this file.
  */
143 static void dirbad();
144 static int ufs_dirrename();
145 static int ufs_diraddentry();
146 static int ufs_dirempty();
147 static int ufs_dirscan();
148 static int ufs_dirclrdotdot();
149 static int ufs_dirfixdotdot();
150 static int ufs_dirpurgedotdot();
151 static int dirprepareentry();
152 static int ufs_dirmakedirect();
153 static int dirbadname();
154 static int dirmangled();
155 
156 /*
157  * Look for a given name in a directory.  On successful return, *ipp
158  * will point to the VN_HELD inode.
 *
 *	dp	  directory (or attribute directory) inode to search
 *	namep	  NUL-terminated component name; asserted to be non-empty
 *	ipp	  out: the held inode for namep on success
 *	cr	  credentials for the IEXEC check and for ufs_iget_alloced()
 *	skipdnlc  non-zero to bypass the per-name dnlc_lookup() stage
 *
 * Returns 0 on success; ENOTDIR if dp is neither IFDIR nor IFATTRDIR;
 * the ufs_iaccess() error if search permission is denied; ENOENT if the
 * name is not found; otherwise an error from blkatoff() or
 * ufs_iget_alloced().
 *
 * The search has three stages: the per-name dnlc (dnlc_lookup), the
 * whole-directory cache (dnlc_dir_lookup), and finally a linear scan of
 * the directory blocks, which may also build the whole-directory cache.
 * dp->i_rwlock is taken as reader in here once the first stage has
 * missed, and is always dropped again before returning.
159  */
160 int
161 ufs_dirlook(
162 	struct inode *dp,
163 	char *namep,
164 	struct inode **ipp,
165 	struct cred *cr,
166 	int skipdnlc)			/* skip the 1st level dnlc */
167 {
168 	uint64_t handle;
169 	struct fbuf *fbp;		/* a buffer of directory entries */
170 	struct direct *ep;		/* the current directory entry */
171 	struct vnode *vp;
172 	struct vnode *dvp;		/* directory vnode ptr */
173 	dcanchor_t *dcap;
174 	off_t endsearch;		/* offset to end directory search */
175 	off_t offset;
176 	off_t start_off;		/* starting offset from middle search */
177 	off_t last_offset;		/* last offset */
178 	int entryoffsetinblock;		/* offset of ep in addr's buffer */
179 	int numdirpasses;		/* strategy for directory search */
180 	int namlen;			/* length of name */
181 	int err;
182 	int doingchk;
183 	int i;
184 	int caching;
185 	ino_t ep_ino;			/* entry i number */
186 	ino_t chkino;
187 	ushort_t ep_reclen;		/* direct local d_reclen */
188 
189 	ASSERT(*namep != '\0'); /* All callers ensure namep is a non-empty string */
190 
191 	/*
192 	 * Check accessibility of directory.
193 	 */
194 	if (((dp->i_mode & IFMT) != IFDIR) &&
195 	    ((dp->i_mode & IFMT) != IFATTRDIR))
196 		return (ENOTDIR);
197 
198 	if (err = ufs_iaccess(dp, IEXEC, cr))
199 		return (err);
200 
201 	/*
202 	 * Check the directory name lookup cache, first for individual files
203 	 * then for complete directories.
204 	 */
205 	dvp = ITOV(dp);
206 	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
207 		/* vp is already held from dnlc_lookup */
208 		if (vp == DNLC_NO_VNODE) {
			/*
			 * Negative cache hit (DNLC_NO_VNODE is what the
			 * ENOENT paths below enter); drop the hold.
			 */
209 			VN_RELE(vp);
210 			return (ENOENT);
211 		}
212 		*ipp = VTOI(vp);
213 		return (0);
214 	}
215 
216 	dcap = &dp->i_danchor;
217 
218 	/*
219 	 * Grab the reader lock on the directory data before checking
220 	 * the dnlc to avoid a race with ufs_dirremove() & friends.
221 	 */
222 	rw_enter(&dp->i_rwlock, RW_READER);
223 
224 	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
225 	case DFOUND:
		/* The handle packs the entry's inumber and an offset. */
226 		ep_ino = (ino_t)H_TO_INO(handle);
227 		if (dp->i_number == ep_ino) {
228 			VN_HOLD(dvp);	/* want ourself, "." */
229 			*ipp = dp;
230 			rw_exit(&dp->i_rwlock);
231 			return (0);
232 		}
233 		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
234 			uint64_t handle2;
235 			/*
236 			 * release the lock on the dir we are searching
237 			 * to avoid a deadlock when grabbing the
238 			 * i_contents lock in ufs_iget_alloced().
239 			 */
240 			rw_exit(&dp->i_rwlock);
241 			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
242 			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
243 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
244 			/*
245 			 * must recheck as we dropped dp->i_rwlock
			 * (the cached ".." is accepted only if a second
			 * cache lookup yields the very same handle)
246 			 */
247 			rw_enter(&dp->i_rwlock, RW_READER);
248 			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
249 			    == DFOUND) && (handle == handle2)) {
250 				dnlc_update(dvp, namep, ITOV(*ipp));
251 				rw_exit(&dp->i_rwlock);
252 				return (0);
253 			}
254 			/* check failed, read the actual directory */
255 			if (!err) {
256 				VN_RELE(ITOV(*ipp));
257 			}
258 			goto restart;
259 		}
260 		/* usual case of not "." nor ".." */
261 		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
262 		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
263 		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
264 		if (err) {
265 			rw_exit(&dp->i_rwlock);
266 			return (err);
267 		}
268 		dnlc_update(dvp, namep, ITOV(*ipp));
269 		rw_exit(&dp->i_rwlock);
270 		return (0);
271 	case DNOENT:
		/*
		 * Remember the miss in the dnlc, unless the directory
		 * itself has no links left.
		 */
272 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
273 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
274 		}
275 		rw_exit(&dp->i_rwlock);
276 		return (ENOENT);
277 	default:
		/* any other status: fall through to scanning the directory */
278 		break;
279 	}
280 restart:
	/*
	 * Scan the directory blocks themselves.  i_rwlock is held as
	 * reader here on every path that reaches this label.
	 */
281 
282 	fbp = NULL;
283 	doingchk = 0;
284 	chkino = 0;
285 	caching = 0;
286 
287 	/*
288 	 * Attempt to cache any directories greater than the tunable
289 	 * ufs_min_dir_cache. If it fails due to memory shortage (DNOMEM),
290 	 * disable caching for this directory and record the system time.
291 	 * Any attempt after the disable time has expired will enable
292 	 * the caching again.
293 	 */
294 	if (dp->i_size >= ufs_min_dir_cache) {
295 		/*
296 		 * if the directory caching disable time has expired
297 		 * enable the caching again.
298 		 */
299 		if (dp->i_cachedir == CD_DISABLED_NOMEM &&
300 		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
301 			ufs_dc_disable_at = 0;
302 			dp->i_cachedir = CD_ENABLED;
303 		}
304 		if (dp->i_cachedir == CD_ENABLED) {
			/* second argument: estimated number of entries */
305 			switch (dnlc_dir_start(dcap, dp->i_size >>
306 				AV_DIRECT_SHIFT)) {
307 			case DNOMEM:
308 				dp->i_cachedir = CD_DISABLED_NOMEM;
309 				ufs_dc_disable_at = gethrtime();
310 				break;
311 			case DTOOBIG:
312 				dp->i_cachedir = CD_DISABLED_TOOBIG;
313 				break;
314 			case DOK:
315 				caching = 1;
316 				break;
317 			default:
318 				break;
319 			}
320 		}
321 	}
322 	/*
323 	 * If caching we don't stop when the file has been
324 	 * found, but need to know later, so clear *ipp now
325 	 */
326 	*ipp = NULL;
327 
328 recheck:
	/*
	 * Choose where the scan starts.  When building the cache every
	 * entry must be visited, so start at offset 0 and make one pass.
	 * Otherwise start at the offset of the last successful lookup
	 * (i_diroff) and, if that is not 0, make a second pass over the
	 * part of the directory that precedes it.
	 */
329 	if (caching) {
330 		offset = 0;
331 		entryoffsetinblock = 0;
332 		numdirpasses = 1;
333 	} else {
334 		/*
335 		 * Take care to look at dp->i_diroff only once, as it
336 		 * may be changing due to other threads/cpus.
337 		 */
338 		offset = dp->i_diroff;
339 		if (offset > dp->i_size) {
340 			offset = 0;
341 		}
342 		if (offset == 0) {
343 			entryoffsetinblock = 0;
344 			numdirpasses = 1;
345 		} else {
346 			start_off = offset;
347 
348 			entryoffsetinblock = blkoff(dp->i_fs, offset);
			/*
			 * The loop below only maps a block when offset is
			 * block aligned, so when starting in mid-block the
			 * block has to be mapped here.
			 */
349 			if (entryoffsetinblock != 0) {
350 				err = blkatoff(dp, offset, (char **)0, &fbp);
351 				if (err)
352 					goto bad;
353 			}
354 			numdirpasses = 2;
355 		}
356 	}
357 	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
358 	namlen = strlen(namep);
359 	last_offset = 0;
360 
361 searchloop:
362 	while (offset < endsearch) {
363 		/*
364 		 * If offset is on a block boundary,
365 		 * read the next directory block.
366 		 * Release previous if it exists.
367 		 */
368 		if (blkoff(dp->i_fs, offset) == 0) {
369 			if (fbp != NULL) {
370 				fbrelse(fbp, S_OTHER);
371 			}
372 			err = blkatoff(dp, offset, (char **)0, &fbp);
373 			if (err)
374 				goto bad;
375 			entryoffsetinblock = 0;
376 		}
377 
378 		/*
379 		 * If the offset to the next entry is invalid or if the
380 		 * next entry is a zero length record or if the record
381 		 * length is invalid, then skip to the next directory
382 		 * block.  Complete validation checks are done if the
383 		 * record length is invalid.
384 		 *
385 		 * Full validation checks are slow so they are disabled
386 		 * by default.  Complete checks can be run by patching
387 		 * "dirchk" to be true.
388 		 *
389 		 * We have to check the validity of entryoffsetinblock
390 		 * here because it can be set to i_diroff above.
		 *
		 * Note that && binds tighter than ||: a misaligned
		 * entryoffsetinblock or a zero d_reclen skips the block
		 * unconditionally, while dirmangled() is consulted only
		 * when dirchk is set or d_reclen is not a multiple of 4.
391 		 */
392 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
393 		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
394 		    (dirchk || (ep->d_reclen & 0x3)) &&
395 		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
			/* i = distance to the next DIRBLKSIZ boundary */
396 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
397 			offset += i;
398 			entryoffsetinblock += i;
			/* entries were skipped: give up building the cache */
399 			if (caching) {
400 				dnlc_dir_purge(dcap);
401 				caching = 0;
402 			}
403 			continue;
404 		}
405 
406 		ep_reclen = ep->d_reclen;
407 
408 		/*
409 		 * Add named entries and free space into the directory cache
410 		 */
411 		if (caching) {
412 			ushort_t extra;
413 			off_t off2;
414 
415 			if (ep->d_ino == 0) {
				/*
				 * Unused record: all of it is free space.
				 * If it is not the first record of its
				 * DIRBLKSIZ block, purge the cache and mark
				 * this directory CD_DISABLED.
				 */
416 				extra = ep_reclen;
417 				if (offset & (DIRBLKSIZ - 1)) {
418 					dnlc_dir_purge(dcap);
419 					dp->i_cachedir = CD_DISABLED;
420 					caching = 0;
421 				}
422 			} else {
423 				/*
424 				 * entries hold the previous offset except the
425 				 * 1st which holds the offset + 1
426 				 */
427 				if (offset & (DIRBLKSIZ - 1)) {
428 					off2 = last_offset;
429 				} else {
430 					off2 = offset + 1;
431 				}
432 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
433 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
				/* slack in this record beyond DIRSIZ(ep) */
434 				extra = ep_reclen - DIRSIZ(ep);
435 			}
			/* only record free space that could hold an entry */
436 			if (caching && (extra >= LDIRSIZ(1))) {
437 				caching = (dnlc_dir_add_space(dcap, extra,
438 				    (uint64_t)offset) == DOK);
439 			}
440 		}
441 
442 		/*
443 		 * Check for a name match.
444 		 * We have the parent inode read locked with i_rwlock.
445 		 */
446 		if (ep->d_ino && ep->d_namlen == namlen &&
447 		    *namep == *ep->d_name &&	/* fast chk 1st chr */
448 		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
449 
450 			/*
451 			 * We have to release the fbp early here to avoid
452 			 * a possible deadlock situation where we have the
453 			 * fbp and want the directory inode and someone doing
454 			 * a ufs_direnter_* has the directory inode and wants
455 			 * the fbp.  XXX - is this still needed?
456 			 */
457 			ep_ino = (ino_t)ep->d_ino;
458 			ASSERT(fbp != NULL);
459 			fbrelse(fbp, S_OTHER);
460 			fbp = NULL;
461 
462 			/*
463 			 * Atomic update (read lock held)
			 * Remembers where this lookup succeeded; a later
			 * non-caching scan starts from here (see recheck).
464 			 */
465 			dp->i_diroff = offset;
466 
467 			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
468 				struct timeval32 omtime;
469 
470 				if (caching) {
471 					dnlc_dir_purge(dcap);
472 					caching = 0;
473 				}
474 				if (doingchk) {
475 					/*
476 					 * if the inumber didn't change
477 					 * continue with already found inode.
478 					 */
479 					if (ep_ino == chkino)
480 						goto checkok;
481 					else {
482 						VN_RELE(ITOV(*ipp));
483 						/* *ipp is nulled at restart */
484 						goto restart;
485 					}
486 				}
487 				/*
488 				 * release the lock on the dir we are searching
489 				 * to avoid a deadlock when grabbing the
490 				 * i_contents lock in ufs_iget_alloced().
				 * The directory's mtime is sampled first so
				 * that a change made while the lock is
				 * dropped can be detected afterwards.
491 				 */
492 				omtime = dp->i_mtime;
493 				rw_exit(&dp->i_rwlock);
494 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
495 						RW_READER);
496 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
497 				    cr);
498 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
499 				rw_enter(&dp->i_rwlock, RW_READER);
500 				if (err)
501 					goto bad;
502 				/*
503 				 * Since we released the lock on the directory,
504 				 * we must check that the same inode is still
505 				 * the ".." entry for this directory.
506 				 */
507 				/*CSTYLED*/
508 				if (timercmp(&omtime, &dp->i_mtime, !=)) {
509 					/*
510 					 * Modification time changed on the
511 					 * directory, we must go check if
512 					 * the inumber changed for ".."
513 					 */
514 					doingchk = 1;
515 					chkino = ep_ino;
516 					entryoffsetinblock = 0;
517 					if (caching) {
518 						/*
519 						 * Forget directory caching
520 						 * for this rare case
521 						 */
522 						dnlc_dir_purge(dcap);
523 						caching = 0;
524 					}
525 					goto recheck;
526 				}
527 			} else if (dp->i_number == ep_ino) {
528 				VN_HOLD(dvp);	/* want ourself, "." */
529 				*ipp = dp;
530 				if (caching) {
531 					dnlc_dir_purge(dcap);
532 					caching = 0;
533 				}
534 			} else {
535 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
536 						RW_READER);
537 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
538 				    cr);
539 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
540 				if (err)
541 					goto bad;
542 			}
543 checkok:
544 			ASSERT(*ipp);
545 			dnlc_update(dvp, namep, ITOV(*ipp));
546 			/*
547 			 * If we are not caching then just return the entry
548 			 * otherwise complete loading up the cache
549 			 */
550 			if (!caching) {
551 				rw_exit(&dp->i_rwlock);
552 				return (0);
553 			}
			/*
			 * fbp was released above; map the block again so
			 * the scan can carry on from this entry.
			 */
554 			err = blkatoff(dp, offset, (char **)0, &fbp);
555 			if (err)
556 				goto bad;
557 		}
558 		last_offset = offset;
559 		offset += ep_reclen;
560 		entryoffsetinblock += ep_reclen;
561 	}
562 	/*
563 	 * If we started in the middle of the directory and failed
564 	 * to find our target, we must check the beginning as well.
565 	 */
566 	if (numdirpasses == 2) {
567 		numdirpasses--;
568 		offset = 0;
569 		endsearch = start_off;
570 		goto searchloop;
571 	}
572 
573 	/*
574 	 * If whole directory caching is on (or was originally on) then
575 	 * the entry may have been found.
576 	 */
577 	if (*ipp == NULL) {
578 		err = ENOENT;
579 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
580 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
581 		}
582 	}
	/* every block was scanned: mark the directory cache complete */
583 	if (caching) {
584 		dnlc_dir_complete(dcap);
585 		caching = 0;
586 	}
587 
588 bad:
	/*
	 * Common exit: also reached by falling through on success when
	 * the entry was found while the cache was being built.
	 */
589 	if (err && *ipp) {
590 		/*
591 		 * err and *ipp can both be set if we were attempting to
592 		 * cache the directory, and we found the entry, then later
593 		 * while trying to complete the directory cache encountered
594 		 * an error (eg reading a directory sector).
595 		 */
596 		VN_RELE(ITOV(*ipp));
597 		*ipp = NULL;
598 	}
599 
600 	if (fbp)
601 		fbrelse(fbp, S_OTHER);
602 	rw_exit(&dp->i_rwlock);
	/* still caching here means an error cut the scan short */
603 	if (caching)
604 		dnlc_dir_purge(dcap);
605 	return (err);
606 }
607 
608 /*
609  * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
 *
 * The caller must hold tdp->i_rwlock as writer (asserted).  The lock is
 * dropped and re-taken here only when namep is "." or "..".
 *
 * flags carries two things: the IQUIET bit, which is set in
 * tdp->i_flag around the inode allocation and cleared again on the way
 * out, and (in the remaining bits) a "noentry" indication that is
 * handed to ufs_dircheckforname() to say the name is known not to
 * exist, so only space needs to be found.
 *
 * Returns 0 with *ipp set to the new inode on success.  Returns EEXIST
 * with *ipp set to the existing (held) inode if the name is already
 * present.  Other returns visible here: EINVAL for a directory or a
 * device/door/socket/fifo in an attribute directory, EACCES if namep
 * contains '/', ENOENT if tdp has no links left, ENOTDIR if tdp is not
 * a directory, plus errors passed up from ufs_iaccess(),
 * ufs_dircheckforname(), ufs_dirmakeinode() and ufs_diraddentry().
610  */
611 int
612 ufs_direnter_cm(
613 	struct inode *tdp,	/* target directory to make entry in */
614 	char *namep,		/* name of entry */
615 	enum de_op op,		/* entry operation */
616 	struct vattr *vap,	/* attributes if new inode needed */
617 	struct inode **ipp,	/* return entered inode here */
618 	struct cred *cr,	/* user credentials */
619 	int flags)		/* no entry exists */
620 {
621 	struct inode *tip;	/* inode of (existing) target file */
622 	char *s;
623 	struct slot slot;	/* slot info to pass around */
624 	int namlen;		/* length of name */
625 	int err;		/* error number */
626 	struct inode *nip;	/* new inode */
627 	int do_rele_nip = 0;	/* release nip */
628 	int noentry = flags & ~IQUIET;
629 	int quiet = flags & IQUIET;	/* Suppress out of inodes message */
630 
631 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
632 
	/*
	 * In an attribute directory, refuse mkdir and refuse creation of
	 * character/block devices, doors, sockets and fifos.
	 */
633 	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
634 	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
635 	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
636 	    (vap->va_type == VFIFO))))
637 		return (EINVAL);
638 
639 	/* don't allow '/' characters in pathname component */
	/* (the same loop also computes namlen) */
640 	for (s = namep, namlen = 0; *s; s++, namlen++)
641 		if (*s == '/')
642 			return (EACCES);
643 	ASSERT(namlen);
644 
645 	/*
646 	 * If name is "." or ".." then if this is a create look it up
647 	 * and return EEXIST.
648 	 */
649 	if (namep[0] == '.' &&
650 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
651 		/*
652 		 * ufs_dirlook will acquire the i_rwlock
		 * as reader, so our writer hold is dropped around the
		 * call and re-taken before returning on either path.
653 		 */
654 		rw_exit(&tdp->i_rwlock);
655 		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) {
656 			rw_enter(&tdp->i_rwlock, RW_WRITER);
657 			return (err);
658 		}
659 		rw_enter(&tdp->i_rwlock, RW_WRITER);
660 		return (EEXIST);
661 	}
662 
663 	/*
664 	 * If target directory has not been removed, then we can consider
665 	 * allowing file to be created.
666 	 */
667 	if (tdp->i_nlink <= 0) {
668 		return (ENOENT);
669 	}
670 
671 	/*
672 	 * Check accessibility of directory.
673 	 */
674 	if (((tdp->i_mode & IFMT) != IFDIR) &&
675 	    ((tdp->i_mode & IFMT) != IFATTRDIR)) {
676 		return (ENOTDIR);
677 	}
678 
679 	/*
680 	 * Execute access is required to search the directory.
681 	 */
682 	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
683 		return (err);
684 	}
685 
686 	/*
687 	 * Search for the entry. Return VN_HELD tip if found.
	 * From here on every exit goes through "out", which releases
	 * slot.fbp, i_contents and vfs_dqrwlock.
688 	 */
689 	tip = NULL;
690 	slot.fbp = NULL;
691 	slot.status = NONE;
692 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
693 	rw_enter(&tdp->i_contents, RW_WRITER);
694 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
695 	if (err)
696 		goto out;
697 	if (tip) {
698 		ASSERT(!noentry);
699 		*ipp = tip;
700 		err = EEXIST;
701 	} else {
702 		/*
703 		 * The entry does not exist. Check write permission in
704 		 * directory to see if entry can be created.
705 		 */
706 		if (err = ufs_iaccess(tdp, IWRITE, cr))
707 			goto out;
708 		/*
709 		 * Make new inode and directory entry.
710 		 */
711 		tdp->i_flag |= quiet;
		/*
		 * NOTE(review): nip is not initialized in this function,
		 * so the nip != NULL test below relies on
		 * ufs_dirmakeinode() storing through &nip on every
		 * failing path -- confirm against that routine.
		 */
712 		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
713 			if (nip != NULL)
714 				do_rele_nip = 1;
715 			goto out;
716 		}
717 		if (err = ufs_diraddentry(tdp, namep, op,
718 		    namlen, &slot, nip, NULL, cr)) {
719 			/*
720 			 * Unmake the inode we just made.
			 * If it was a directory, the parent's link count
			 * is decremented as well (presumably undoing the
			 * link added for the new directory's "..").
721 			 */
722 			rw_enter(&nip->i_contents, RW_WRITER);
723 			if (((nip->i_mode & IFMT) == IFDIR) ||
724 			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
725 				tdp->i_nlink--;
726 				ufs_setreclaim(tdp);
727 				tdp->i_flag |= ICHG;
728 				tdp->i_seq++;
729 				TRANS_INODE(tdp->i_ufsvfs, tdp);
730 				ITIMES_NOLOCK(tdp);
731 			}
732 			nip->i_nlink = 0;
733 			ufs_setreclaim(nip);
734 			TRANS_INODE(nip->i_ufsvfs, nip);
735 			nip->i_flag |= ICHG;
736 			nip->i_seq++;
737 			ITIMES_NOLOCK(nip);
738 			rw_exit(&nip->i_contents);
739 			do_rele_nip = 1;
740 		} else {
741 			*ipp = nip;
742 		}
743 	}
744 
745 out:
746 	if (slot.fbp)
747 		fbrelse(slot.fbp, S_OTHER);
748 
	/* clear IQUIET again if it was set above */
749 	tdp->i_flag &= ~quiet;
750 	rw_exit(&tdp->i_contents);
751 
752 	/*
753 	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
754 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
755 	 */
756 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
757 
758 	if (do_rele_nip) {
759 		VN_RELE(ITOV(nip));
760 	}
761 
762 	return (err);
763 }
764 
765 /*
766  * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
767  * If tvpp is non-null, return with the pointer to the target vnode.
 *
 * The caller must hold tdp->i_rwlock as writer (asserted).
 *
 * For DE_LINK and DE_RENAME the source inode's link count is bumped and
 * the inode is pushed to disk before the directory is touched; if
 * anything later fails, the bump is undone at "out2".  For DE_SYMLINK
 * the link count is left alone throughout.
 *
 * Returns 0 on success.  Errors visible here: EACCES if namep contains
 * '/'; EINVAL for a rename to "." or ".."; EEXIST for a link or symlink
 * whose name already exists (including "." and ".."); ENOENT if sip or
 * tdp has no links left; EMLINK if sip is already at MAXLINK; ENOTDIR
 * if tdp is not a directory; plus errors passed up from the sync,
 * access-check, lookup, rename and add-entry helpers.
768  */
769 int
770 ufs_direnter_lr(
771 	struct inode *tdp,	/* target directory to make entry in */
772 	char *namep,		/* name of entry */
773 	enum de_op op,		/* entry operation */
774 	struct inode *sdp,	/* source inode parent if rename */
775 	struct inode *sip,	/* source inode */
776 	struct cred *cr,	/* user credentials */
777 	vnode_t **tvpp)		/* Return: (held) vnode of (existing) target */
778 {
779 	struct inode *tip;	/* inode of (existing) target file */
780 	char *s;
781 	struct slot slot;	/* slot info to pass around */
782 	int namlen;		/* length of name */
783 	int err;		/* error number */
784 
785 	/* don't allow '/' characters in pathname component */
	/* (the same loop also computes namlen) */
786 	for (s = namep, namlen = 0; *s; s++, namlen++)
787 		if (*s == '/')
788 			return (EACCES);
789 	ASSERT(namlen);
790 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
791 
792 	/*
793 	 * "." and ".." are not looked up here: a link or symlink to
794 	 * either name fails with EEXIST, and rename TO "." or ".." is
	 * forbidden outright.
795 	 */
796 	if (namep[0] == '.' &&
797 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
798 		if (op == DE_RENAME) {
799 			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
800 		}
801 		return (EEXIST);
802 	}
803 	/*
804 	 * For link and rename lock the source entry and check the link count
805 	 * to see if it has been removed while it was unlocked.  If not, we
806 	 * increment the link count and force the inode to disk to make sure
807 	 * that it is there before any directory entry that points to it.
808 	 *
809 	 * In the case of a symbolic link, we are dealing with a new inode
810 	 * which does not yet have any links.  We've created it with a link
811 	 * count of 1, and we don't want to increment it since this will be
812 	 * its first link.
813 	 *
814 	 * We are about to push the inode to disk. We make sure
815 	 * that the inode's data blocks are flushed first so the
816 	 * inode and its data blocks are always in sync.  This
817 	 * adds some robustness in the event of a power failure
818 	 * or panic where sync fails. If we panic before the
819 	 * inode is updated, then the inode still refers to the
820 	 * old data blocks (or none for a new file). If we panic
821 	 * after the inode is updated, then the inode refers to
822 	 * the new data blocks.
823 	 *
824 	 * We do this before grabbing the i_contents lock because
825 	 * ufs_syncip() will want that lock. We could do the data
826 	 * syncing after the removal checks, but upon return from
827 	 * the data sync we would have to repeat the removal
828 	 * checks.
829 	 */
830 	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
831 		return (err);
832 	}
833 
834 	rw_enter(&sip->i_contents, RW_WRITER);
835 	if (sip->i_nlink <= 0) {
836 		rw_exit(&sip->i_contents);
837 		return (ENOENT);
838 	}
839 	if (sip->i_nlink == MAXLINK) {
840 		rw_exit(&sip->i_contents);
841 		return (EMLINK);
842 	}
843 
844 	/*
845 	 * Sync the indirect blocks associated with the file
846 	 * for the same reasons as described above.  Since this
847 	 * call wants the i_contents lock held for it we can do
848 	 * this here with no extra work.
849 	 */
850 	if (err = ufs_sync_indir(sip)) {
851 		rw_exit(&sip->i_contents);
852 		return (err);
853 	}
854 
	/*
	 * Bump the link count (except for a symlink) and push the inode
	 * to disk.  Every failure after this point must leave through
	 * "out" or "out2" so that the bump is undone.
	 */
855 	if (op != DE_SYMLINK)
856 		sip->i_nlink++;
857 	TRANS_INODE(sip->i_ufsvfs, sip);
858 	sip->i_flag |= ICHG;
859 	sip->i_seq++;
860 	ufs_iupdat(sip, I_SYNC);
861 	rw_exit(&sip->i_contents);
862 
863 	/*
864 	 * If target directory has not been removed, then we can consider
865 	 * allowing file to be created.
866 	 */
867 	if (tdp->i_nlink <= 0) {
868 		err = ENOENT;
869 		goto out2;
870 	}
871 	/*
872 	 * Check accessibility of directory.
873 	 */
874 	if (((tdp->i_mode & IFMT) != IFDIR) &&
875 	    (tdp->i_mode & IFMT) != IFATTRDIR) {
876 		err = ENOTDIR;
877 		goto out2;
878 	}
879 	/*
880 	 * Execute access is required to search the directory.
881 	 */
882 	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
883 		goto out2;
884 	}
885 
886 	/*
887 	 * Search for the entry. Return VN_HELD tip if found.
	 * Failures from here on leave through "out", which releases
	 * slot.fbp, i_contents and vfs_dqrwlock and disposes of tip.
888 	 */
889 	tip = NULL;
890 	slot.status = NONE;
891 	slot.fbp = NULL;
892 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
893 	rw_enter(&tdp->i_contents, RW_WRITER);
894 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
895 	if (err)
896 		goto out;
897 
898 	if (tip) {
899 		switch (op) {
900 		case DE_RENAME:
901 			err = ufs_dirrename(sdp, sip, tdp, namep,
902 			    tip, &slot, cr);
903 			break;
904 
905 		case DE_LINK:
906 		case DE_SYMLINK:
907 			/*
908 			 * Can't link to an existing file.
909 			 */
910 			err = EEXIST;
911 			break;
912 		default:
913 			break;
914 		}
915 	} else {
916 		/*
917 		 * The entry does not exist. Check write permission in
918 		 * directory to see if entry can be created.
919 		 */
920 		if (err = ufs_iaccess(tdp, IWRITE, cr))
921 			goto out;
922 		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
923 		    cr);
924 	}
925 
926 out:
927 	if (slot.fbp)
928 		fbrelse(slot.fbp, S_OTHER);
929 
930 	rw_exit(&tdp->i_contents);
931 
932 	/*
933 	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
934 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
935 	 */
936 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
937 
938 	/*
939 	 * If we renamed a file over the top of an existing file,
940 	 * or linked a file to an existing file (or tried to),
941 	 * then set *tvpp to the target vnode, if tvpp is non-null
942 	 * otherwise, release and delete (or just release) the inode.
943 	 *
944 	 * N.B., by returning the target's vnode pointer to the caller,
945 	 * that caller becomes responsible for doing the VN_RELE.
	 * On error the hold is always released here and *tvpp is left
	 * untouched.
946 	 */
947 	if (tip) {
948 		if ((err == 0) && (tvpp != NULL)) {
949 			*tvpp = ITOV(tip);
950 		} else {
951 			VN_RELE(ITOV(tip));
952 		}
953 	}
954 
955 out2:
	/*
	 * Entered directly by the early failures above, which happen
	 * before tip, slot and the directory locks are set up.
	 */
956 	if (err) {
957 		/*
958 		 * Undo bumped link count.
		 * (nothing to undo for DE_SYMLINK, which was not bumped)
959 		 */
960 		if (op != DE_SYMLINK) {
961 			rw_enter(&sip->i_contents, RW_WRITER);
962 			sip->i_nlink--;
963 			ufs_setreclaim(sip);
964 			TRANS_INODE(sip->i_ufsvfs, sip);
965 			sip->i_flag |= ICHG;
966 			sip->i_seq++;
967 			ITIMES_NOLOCK(sip);
968 			rw_exit(&sip->i_contents);
969 		}
970 	}
971 	return (err);
972 }
973 
974 /*
975  * Check for the existence of a name in a directory (unless noentry
976  * is set), or else for an empty slot in which an entry may be made.
977  * If the requested name is found, then on return *ipp points at the
978  * (held) inode and *slotp describes the entry's location in the
979  * directory.  If the name is not found, then *ipp will be NULL and
980  * *slotp will contain information about a directory slot in
981  * which an entry may be made (either an empty slot, or the first position
982  * past the end of the directory).
983  * The target directory inode (tdp) is supplied write locked (i_rwlock).
984  *
985  * This may not be used on "." or "..", but aliases of "." are ok.
986  */
987 int
988 ufs_dircheckforname(
989 	struct inode *tdp,	/* inode of directory being checked */
990 	char *namep,		/* name we're checking for */
991 	int namlen,		/* length of name, excluding null */
992 	struct slot *slotp,	/* slot structure */
993 	struct inode **ipp,	/* return inode if we find one */
994 	struct cred *cr,
995 	int noentry)		/* noentry - just look for space */
996 {
997 	uint64_t handle;
998 	struct fbuf *fbp;	/* pointer to directory block */
999 	struct direct *ep;	/* directory entry */
1000 	struct direct *nep;	/* next directory entry */
1001 	dcanchor_t *dcap;
1002 	vnode_t *dvp;		/* directory vnode ptr */
1003 	off_t dirsize;		/* size of the directory */
1004 	off_t offset;		/* offset in the directory */
1005 	off_t last_offset;	/* last offset */
1006 	off_t enduseful;	/* pointer past last used dir slot */
1007 	int entryoffsetinblk;	/* offset of ep in fbp's buffer */
1008 	int i;			/* length of mangled entry */
1009 	int needed;
1010 	int err;
1011 	int first;
1012 	int caching;
1013 	int stat;
1014 	ino_t ep_ino;
1015 	slotstat_t initstat = slotp->status;
1016 
1017 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1018 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1019 	ASSERT(*ipp == NULL);
1020 	fbp = NULL;
1021 
1022 	/*
1023 	 * First check if there is a complete cache of the directory.
1024 	 */
1025 	dvp = ITOV(tdp);
1026 
1027 	dcap = &tdp->i_danchor;
1028 	if (noentry) {
1029 		/*
1030 		 * We know from the 1st level dnlc cache that the entry
1031 		 * doesn't exist, so don't bother searching the directory
1032 		 * cache, but just look for space (possibly in the directory
1033 		 * cache).
1034 		 */
1035 		stat = DNOENT;
1036 	} else {
1037 		stat = dnlc_dir_lookup(dcap, namep, &handle);
1038 	}
1039 	switch (stat) {
1040 	case DFOUND:
1041 		ep_ino = (ino_t)H_TO_INO(handle);
1042 		if (tdp->i_number == ep_ino) {
1043 			*ipp = tdp;	/* we want ourself, ie "." */
1044 			VN_HOLD(dvp);
1045 		} else {
1046 			err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
1047 			if (err)
1048 				return (err);
1049 		}
1050 		offset = H_TO_OFF(handle);
1051 		first = 0;
1052 		if (offset & 1) {
1053 			/* This is the first entry in the block */
1054 			first = 1;
1055 			offset -= 1;
1056 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1057 		}
1058 		err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1059 		if (err) {
1060 			VN_RELE(ITOV(*ipp));
1061 			*ipp = NULL;
1062 			return (err);
1063 		}
1064 		/*
1065 		 * Check the validity of the entry.
1066 		 * If it's bad, then throw away the cache and
1067 		 * continue without it. The dirmangled() routine
1068 		 * will then be called upon it.
1069 		 */
1070 		if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1071 			VN_RELE(ITOV(*ipp));
1072 			*ipp = NULL;
1073 			dnlc_dir_purge(dcap);
1074 			break;
1075 		}
1076 		/*
1077 		 * Remember the returned offset is the offset of the
1078 		 * preceding record (unless this is the 1st record
1079 		 * in the DIRBLKSIZ sized block (disk sector)), then it's
1080 		 * offset + 1. Note, no real offsets are on odd boundaries.
1081 		 */
1082 		if (first) {
1083 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1084 			slotp->offset = offset;
1085 			slotp->size = 0;
1086 			slotp->ep = ep;
1087 		} else {
1088 			/* get the next entry */
1089 			nep = (struct direct *)((char *)ep + ep->d_reclen);
1090 			/*
1091 			 * Check the validity of this entry as well
1092 			 * If it's bad, then throw away the cache and
1093 			 * continue without it. The dirmangled() routine
1094 			 * will then be called upon it.
1095 			 */
1096 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1097 			    (nep->d_ino != ep_ino)) {
1098 				VN_RELE(ITOV(*ipp));
1099 				*ipp = NULL;
1100 				dnlc_dir_purge(dcap);
1101 				break;
1102 			}
1103 			slotp->offset = offset + ep->d_reclen;
1104 			slotp->size = ep->d_reclen;
1105 			slotp->ep = nep;
1106 		}
1107 		slotp->status = EXIST;
1108 		slotp->fbp = fbp;
1109 		slotp->endoff = 0;
1110 		slotp->cached = 1;
1111 		dnlc_update(dvp, namep, ITOV(*ipp));
1112 		return (0);
1113 	case DNOENT:
1114 		/*
1115 		 * The caller gets to set the initial slot status to
1116 		 * indicate whether it's interested in getting a
1117 		 * empty slot. For example, the status can be set
1118 		 * to FOUND when an entry is being deleted.
1119 		 */
1120 		ASSERT(slotp->fbp == NULL);
1121 		if (slotp->status == FOUND) {
1122 			return (0);
1123 		}
1124 		switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
1125 		    &handle)) {
1126 		case DFOUND:
1127 			offset = (off_t)handle;
1128 			err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1129 			if (err) {
1130 				dnlc_dir_purge(dcap);
1131 				ASSERT(*ipp == NULL);
1132 				return (err);
1133 			}
1134 			/*
1135 			 * Check the validity of the entry.
1136 			 * If it's bad, then throw away the cache and
1137 			 * continue without it. The dirmangled() routine
1138 			 * will then be called upon it.
1139 			 */
1140 			if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1141 				dnlc_dir_purge(dcap);
1142 				break;
1143 			}
1144 			/*
1145 			 * Remember the returned offset is the offset of the
1146 			 * containing record.
1147 			 */
1148 			slotp->status = FOUND;
1149 			slotp->ep = ep;
1150 			slotp->offset = offset;
1151 			slotp->fbp = fbp;
1152 			slotp->size = ep->d_reclen;
1153 			/*
1154 			 * Set end offset to 0. Truncation is handled
1155 			 * because the dnlc cache will blow away the
1156 			 * cached directory when an entry is removed
1157 			 * that drops the entries left to less than half
1158 			 * the minumum number (dnlc_min_dir_cache).
1159 			 */
1160 			slotp->endoff = 0;
1161 			slotp->cached = 1;
1162 			return (0);
1163 		case DNOENT:
1164 			slotp->status = NONE;
1165 			slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
1166 			    DIRBLKSIZ, u_offset_t);
1167 			slotp->size = DIRBLKSIZ;
1168 			slotp->endoff = 0;
1169 			slotp->cached = 1;
1170 			return (0);
1171 		default:
1172 			break;
1173 		}
1174 		break;
1175 	}
1176 	slotp->cached = 0;
1177 	caching = NULL;
1178 	if (!noentry && tdp->i_size >= ufs_min_dir_cache) {
1179 		/*
1180 		 * if the directory caching disable time has expired
1181 		 * enable caching again.
1182 		 */
1183 		if (tdp->i_cachedir == CD_DISABLED_NOMEM &&
1184 		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
1185 			ufs_dc_disable_at = 0;
1186 			tdp->i_cachedir = CD_ENABLED;
1187 		}
1188 		/*
1189 		 * Attempt to cache any directories greater than the tunable
1190 		 * ufs_min_cache_dir. If it fails due to memory shortage
1191 		 * (DNOMEM), disable caching for this directory and record
1192 		 * the system time. Any attempt after the disable time has
1193 		 * expired will enable the caching again.
1194 		 */
1195 		if (tdp->i_cachedir == CD_ENABLED) {
1196 			switch (dnlc_dir_start(dcap,
1197 			    tdp->i_size >> AV_DIRECT_SHIFT)) {
1198 			case DNOMEM:
1199 				tdp->i_cachedir = CD_DISABLED_NOMEM;
1200 				ufs_dc_disable_at = gethrtime();
1201 				break;
1202 			case DTOOBIG:
1203 				tdp->i_cachedir = CD_DISABLED_TOOBIG;
1204 				break;
1205 			case DOK:
1206 				caching = 1;
1207 				break;
1208 			default:
1209 				break;
1210 			}
1211 		}
1212 	}
1213 
1214 	/*
1215 	 * No point in using i_diroff since we must search whole directory
1216 	 */
1217 	dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
1218 	enduseful = 0;
1219 	offset = last_offset = 0;
1220 	entryoffsetinblk = 0;
1221 	needed = (int)LDIRSIZ(namlen);
1222 	while (offset < dirsize) {
1223 		/*
1224 		 * If offset is on a block boundary,
1225 		 * read the next directory block.
1226 		 * Release previous if it exists.
1227 		 */
1228 		if (blkoff(tdp->i_fs, offset) == 0) {
1229 			if (fbp != NULL)
1230 				fbrelse(fbp, S_OTHER);
1231 
1232 			err = blkatoff(tdp, offset, (char **)0, &fbp);
1233 			if (err) {
1234 				ASSERT(*ipp == NULL);
1235 				if (caching) {
1236 					dnlc_dir_purge(dcap);
1237 				}
1238 				return (err);
1239 			}
1240 			entryoffsetinblk = 0;
1241 		}
1242 		/*
1243 		 * If still looking for a slot, and at a DIRBLKSIZ
1244 		 * boundary, have to start looking for free space
1245 		 * again.
1246 		 */
1247 		if (slotp->status == NONE &&
1248 		    (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
1249 			slotp->offset = -1;
1250 		}
1251 		/*
1252 		 * If the next entry is a zero length record or if the
1253 		 * record length is invalid, then skip to the next
1254 		 * directory block.  Complete validation checks are
1255 		 * done if the record length is invalid.
1256 		 *
1257 		 * Full validation checks are slow so they are disabled
1258 		 * by default.  Complete checks can be run by patching
1259 		 * "dirchk" to be true.
1260 		 *
1261 		 * We do not have to check the validity of
1262 		 * entryoffsetinblk here because it starts out as zero
1263 		 * and is only incremented by d_reclen values that we
1264 		 * validate here.
1265 		 */
1266 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1267 		if (ep->d_reclen == 0 ||
1268 		    (dirchk || (ep->d_reclen & 0x3)) &&
1269 		    dirmangled(tdp, ep, entryoffsetinblk, offset)) {
1270 			i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
1271 			offset += i;
1272 			entryoffsetinblk += i;
1273 			if (caching) {
1274 				dnlc_dir_purge(dcap);
1275 				caching = 0;
1276 			}
1277 			continue;
1278 		}
1279 
1280 		/*
1281 		 * Add named entries and free space into the directory cache
1282 		 */
1283 		if (caching) {
1284 			ushort_t extra;
1285 			off_t off2;
1286 
1287 			if (ep->d_ino == 0) {
1288 				extra = ep->d_reclen;
1289 				if (offset & (DIRBLKSIZ - 1)) {
1290 					dnlc_dir_purge(dcap);
1291 					caching = 0;
1292 				}
1293 			} else {
1294 				/*
1295 				 * entries hold the previous offset if
1296 				 * not the 1st one
1297 				 */
1298 				if (offset & (DIRBLKSIZ - 1)) {
1299 					off2 = last_offset;
1300 				} else {
1301 					off2 = offset + 1;
1302 				}
1303 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
1304 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
1305 				extra = ep->d_reclen - DIRSIZ(ep);
1306 			}
1307 			if (caching && (extra >= LDIRSIZ(1))) {
1308 				caching = (dnlc_dir_add_space(dcap, extra,
1309 				    (uint64_t)offset) == DOK);
1310 			}
1311 		}
1312 
1313 		/*
1314 		 * If an appropriate sized slot has not yet been found,
1315 		 * check to see if one is available.
1316 		 */
1317 		if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
1318 			int size = ep->d_reclen;
1319 
1320 			if (ep->d_ino != 0)
1321 				size -= DIRSIZ(ep);
1322 			if (size > 0) {
1323 				if (size >= needed) {
1324 					slotp->offset = offset;
1325 					slotp->size = ep->d_reclen;
1326 					if (noentry) {
1327 						slotp->ep = ep;
1328 						slotp->fbp = fbp;
1329 						slotp->status = FOUND;
1330 						slotp->endoff = 0;
1331 						return (0);
1332 					}
1333 					slotp->status = FOUND;
1334 				} else if (slotp->status == NONE) {
1335 					if (slotp->offset == -1)
1336 						slotp->offset = offset;
1337 				}
1338 			}
1339 		}
1340 		/*
1341 		 * Check for a name match.
1342 		 */
1343 		if (ep->d_ino && ep->d_namlen == namlen &&
1344 		    *namep == *ep->d_name &&	/* fast chk 1st char */
1345 		    bcmp(namep, ep->d_name, namlen) == 0) {
1346 
1347 			tdp->i_diroff = offset;
1348 
1349 			if (tdp->i_number == ep->d_ino) {
1350 				*ipp = tdp;	/* we want ourself, ie "." */
1351 				VN_HOLD(dvp);
1352 			} else {
1353 				err = ufs_iget_alloced(tdp->i_vfs,
1354 				    (ino_t)ep->d_ino, ipp, cr);
1355 				if (err) {
1356 					fbrelse(fbp, S_OTHER);
1357 					if (caching)
1358 						dnlc_dir_purge(dcap);
1359 					return (err);
1360 				}
1361 			}
1362 			slotp->status = EXIST;
1363 			slotp->offset = offset;
1364 			slotp->size = (int)(offset - last_offset);
1365 			slotp->fbp = fbp;
1366 			slotp->ep = ep;
1367 			slotp->endoff = 0;
1368 			if (caching)
1369 				dnlc_dir_purge(dcap);
1370 			return (0);
1371 		}
1372 		last_offset = offset;
1373 		offset += ep->d_reclen;
1374 		entryoffsetinblk += ep->d_reclen;
1375 		if (ep->d_ino)
1376 			enduseful = offset;
1377 	}
1378 	if (fbp) {
1379 		fbrelse(fbp, S_OTHER);
1380 	}
1381 
1382 	if (caching) {
1383 		dnlc_dir_complete(dcap);
1384 		slotp->cached = 1;
1385 		if (slotp->status == FOUND) {
1386 			if (initstat == FOUND) {
1387 				return (0);
1388 			}
1389 			(void) dnlc_dir_rem_space_by_handle(dcap,
1390 			    slotp->offset);
1391 			slotp->endoff = 0;
1392 			return (0);
1393 		}
1394 	}
1395 
1396 	if (slotp->status == NONE) {
1397 		/*
1398 		 * We didn't find a slot; the new directory entry should be put
1399 		 * at the end of the directory.  Return an indication of where
1400 		 * this is, and set "endoff" to zero; since we're going to have
1401 		 * to extend the directory, we're certainly not going to
1402 		 * truncate it.
1403 		 */
1404 		slotp->offset = dirsize;
1405 		slotp->size = DIRBLKSIZ;
1406 		slotp->endoff = 0;
1407 	} else {
1408 		/*
1409 		 * We found a slot, and will return an indication of where that
1410 		 * slot is, as any new directory entry will be put there.
1411 		 * Since that slot will become a useful entry, if the last
1412 		 * useful entry we found was before this one, update the offset
1413 		 * of the last useful entry.
1414 		 */
1415 		if (enduseful < slotp->offset + slotp->size)
1416 			enduseful = slotp->offset + slotp->size;
1417 		slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
1418 	}
1419 	*ipp = NULL;
1420 	return (0);
1421 }
1422 
1423 uint64_t ufs_dirrename_retry_cnt;
1424 
1425 /*
1426  * Rename the entry in the directory tdp so that it points to
1427  * sip instead of tip.
1428  */
1429 static int
1430 ufs_dirrename(
1431 	struct inode *sdp,	/* parent directory of source */
1432 	struct inode *sip,	/* source inode */
1433 	struct inode *tdp,	/* parent directory of target */
1434 	char *namep,		/* entry we are trying to change */
1435 	struct inode *tip,	/* target inode */
1436 	struct slot *slotp,	/* slot for entry */
1437 	struct cred *cr)	/* credentials */
1438 {
1439 	vnode_t *tdvp;
1440 	off_t offset;
1441 	int err;
1442 	int doingdirectory;
1443 
1444 	ASSERT(sdp->i_ufsvfs != NULL);
1445 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1446 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1447 	/*
1448 	 * Short circuit rename of something to itself.
1449 	 */
1450 	if (sip->i_number == tip->i_number) {
1451 		return (ESAME); /* special KLUDGE error code */
1452 	}
1453 
1454 	/*
1455 	 * We're locking 2 peer level locks, so must use tryenter
1456 	 * on the 2nd to avoid deadlocks that would occur
1457 	 * if we renamed a->b and b->a concurrently.
1458 	 */
1459 retry:
1460 	rw_enter(&tip->i_contents, RW_WRITER);
1461 	if (!rw_tryenter(&sip->i_contents, RW_READER)) {
1462 		/*
1463 		 * drop tip and wait (sleep) until we stand a chance
1464 		 * of holding sip
1465 		 */
1466 		rw_exit(&tip->i_contents);
1467 		rw_enter(&sip->i_contents, RW_READER);
1468 		/*
1469 		 * Reverse the lock grabs in case we have heavy
1470 		 * contention on the 2nd lock.
1471 		 */
1472 		if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
1473 			ufs_dirrename_retry_cnt++;
1474 			rw_exit(&sip->i_contents);
1475 			goto retry;
1476 		}
1477 	}
1478 
1479 	/*
1480 	 * Check that everything is on the same filesystem.
1481 	 */
1482 	if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
1483 	    (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
1484 		err = EXDEV;		/* XXX archaic */
1485 		goto out;
1486 	}
1487 	/*
1488 	 * Must have write permission to rewrite target entry.
1489 	 * Perform additional checks for sticky directories.
1490 	 */
1491 	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 ||
1492 	    (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
1493 		goto out;
1494 
1495 	/*
1496 	 * Ensure source and target are compatible (both directories
1497 	 * or both not directories).  If target is a directory it must
1498 	 * be empty and have no links to it; in addition it must not
1499 	 * be a mount point, and both the source and target must be
1500 	 * writable.
1501 	 */
1502 	doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
1503 	    ((sip->i_mode & IFMT) == IFATTRDIR));
1504 	if (((tip->i_mode & IFMT) == IFDIR) ||
1505 	    ((tip->i_mode & IFMT) == IFATTRDIR)) {
1506 		if (!doingdirectory) {
1507 			err = EISDIR;
1508 			goto out;
1509 		}
1510 		/*
1511 		 * vn_vfswlock will prevent mounts from using the directory
1512 		 * until we are done.
1513 		 */
1514 		if (vn_vfswlock(ITOV(tip))) {
1515 			err = EBUSY;
1516 			goto out;
1517 		}
1518 		if (vn_mountedvfs(ITOV(tip)) != NULL) {
1519 			vn_vfsunlock(ITOV(tip));
1520 			err = EBUSY;
1521 			goto out;
1522 		}
1523 		if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
1524 			vn_vfsunlock(ITOV(tip));
1525 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
1526 			goto out;
1527 		}
1528 	} else if (doingdirectory) {
1529 		err = ENOTDIR;
1530 		goto out;
1531 	}
1532 
1533 	/*
1534 	 * Rewrite the inode pointer for target name entry
1535 	 * from the target inode (ip) to the source inode (sip).
1536 	 * This prevents the target entry from disappearing
1537 	 * during a crash. Mark the directory inode to reflect the changes.
1538 	 */
1539 	tdvp = ITOV(tdp);
1540 	slotp->ep->d_ino = (int32_t)sip->i_number;
1541 	dnlc_update(tdvp, namep, ITOV(sip));
1542 	if (slotp->size) {
1543 		offset = slotp->offset - slotp->size;
1544 	} else {
1545 		offset = slotp->offset + 1;
1546 	}
1547 	if (slotp->cached) {
1548 		(void) dnlc_dir_update(&tdp->i_danchor, namep,
1549 		    INO_OFF_TO_H(slotp->ep->d_ino, offset));
1550 	}
1551 
1552 	err = TRANS_DIR(tdp, slotp->offset);
1553 	if (err)
1554 		fbrelse(slotp->fbp, S_OTHER);
1555 	else
1556 		err = ufs_fbwrite(slotp->fbp, tdp);
1557 
1558 	slotp->fbp = NULL;
1559 	if (err) {
1560 		if (doingdirectory)
1561 			vn_vfsunlock(ITOV(tip));
1562 		goto out;
1563 	}
1564 
1565 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1566 	tdp->i_flag |= IUPD|ICHG;
1567 	tdp->i_seq++;
1568 	ITIMES_NOLOCK(tdp);
1569 
1570 	/*
1571 	 * Decrement the link count of the target inode.
1572 	 * Fix the ".." entry in sip to point to dp.
1573 	 * This is done after the new entry is on the disk.
1574 	 */
1575 	tip->i_nlink--;
1576 	TRANS_INODE(tip->i_ufsvfs, tip);
1577 	tip->i_flag |= ICHG;
1578 	tip->i_seq++;
1579 	ITIMES_NOLOCK(tip);
1580 	if (doingdirectory) {
1581 		/*
1582 		 * The entry for tip no longer exists so I can unlock the
1583 		 * vfslock.
1584 		 */
1585 		vn_vfsunlock(ITOV(tip));
1586 		/*
1587 		 * Decrement target link count once more if it was a directory.
1588 		 */
1589 		if (--tip->i_nlink != 0) {
1590 			err = ufs_fault(ITOV(tip),
1591 		    "ufs_dirrename: target directory link count != 0 (%s)",
1592 			    tip->i_fs->fs_fsmnt);
1593 			rw_exit(&tip->i_contents);
1594 			return (err);
1595 		}
1596 		TRANS_INODE(tip->i_ufsvfs, tip);
1597 		ufs_setreclaim(tip);
1598 		/*
1599 		 * Renaming a directory with the parent different
1600 		 * requires that ".." be rewritten.  The window is
1601 		 * still there for ".." to be inconsistent, but this
1602 		 * is unavoidable, and a lot shorter than when it was
1603 		 * done in a user process.  We decrement the link
1604 		 * count in the new parent as appropriate to reflect
1605 		 * the just-removed target.  If the parent is the
1606 		 * same, this is appropriate since the original
1607 		 * directory is going away.  If the new parent is
1608 		 * different, ufs_dirfixdotdot() will bump the link count
1609 		 * back.
1610 		 */
1611 		tdp->i_nlink--;
1612 		ufs_setreclaim(tdp);
1613 		TRANS_INODE(tdp->i_ufsvfs, tdp);
1614 		tdp->i_flag |= ICHG;
1615 		tdp->i_seq++;
1616 		ITIMES_NOLOCK(tdp);
1617 		if (sdp != tdp) {
1618 			rw_exit(&tip->i_contents);
1619 			rw_exit(&sip->i_contents);
1620 			err = ufs_dirfixdotdot(sip, sdp, tdp);
1621 			return (err);
1622 		}
1623 	} else
1624 		ufs_setreclaim(tip);
1625 out:
1626 	rw_exit(&tip->i_contents);
1627 	rw_exit(&sip->i_contents);
1628 	return (err);
1629 }
1630 
1631 /*
1632  * Fix the ".." entry of the child directory so that it points
1633  * to the new parent directory instead of the old one.  Routine
1634  * assumes that dp is a directory and that all the inodes are on
1635  * the same file system.
1636  */
1637 static int
1638 ufs_dirfixdotdot(
1639 	struct inode *dp,	/* child directory */
1640 	struct inode *opdp,	/* old parent directory */
1641 	struct inode *npdp)	/* new parent directory */
1642 {
1643 	struct fbuf *fbp;
1644 	struct dirtemplate *dirp;
1645 	vnode_t *dvp;
1646 	int err;
1647 
1648 	ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
1649 	ASSERT(RW_WRITE_HELD(&npdp->i_contents));
1650 
1651 	/*
1652 	 * We hold the child directory's i_contents lock before calling
1653 	 * blkatoff so that we honor correct locking protocol which is
1654 	 * i_contents lock and then page lock. (blkatoff will call
1655 	 * ufs_getpage where we want the page lock)
1656 	 * We hold the child directory's i_rwlock before i_contents (as
1657 	 * per the locking protocol) since we are modifying the ".." entry
1658 	 * of the child directory.
1659 	 * We hold the i_rwlock and i_contents lock until we record
1660 	 * this directory delta to the log (via ufs_trans_dir) and have
1661 	 * done fbrelse.
1662 	 */
1663 	rw_enter(&dp->i_rwlock, RW_WRITER);
1664 	rw_enter(&dp->i_contents, RW_WRITER);
1665 	err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
1666 	if (err)
1667 		goto bad;
1668 
1669 	if (dp->i_nlink <= 0 ||
1670 	    dp->i_size < sizeof (struct dirtemplate)) {
1671 		err = ENOENT;
1672 		goto bad;
1673 	}
1674 
1675 	if (dirp->dotdot_namlen != 2 ||
1676 	    dirp->dotdot_name[0] != '.' ||
1677 	    dirp->dotdot_name[1] != '.') {	/* Sanity check. */
1678 		dirbad(dp, "mangled .. entry", (off_t)0);
1679 		err = ENOTDIR;
1680 		goto bad;
1681 	}
1682 
1683 	/*
1684 	 * Increment the link count in the new parent inode and force it out.
1685 	 */
1686 	if (npdp->i_nlink == MAXLINK) {
1687 		err = EMLINK;
1688 		goto bad;
1689 	}
1690 	npdp->i_nlink++;
1691 	TRANS_INODE(npdp->i_ufsvfs, npdp);
1692 	npdp->i_flag |= ICHG;
1693 	npdp->i_seq++;
1694 	ufs_iupdat(npdp, I_SYNC);
1695 
1696 	/*
1697 	 * Rewrite the child ".." entry and force it out.
1698 	 */
1699 	dvp = ITOV(dp);
1700 	dirp->dotdot_ino = (uint32_t)npdp->i_number;
1701 	dnlc_update(dvp, "..", ITOV(npdp));
1702 	(void) dnlc_dir_update(&dp->i_danchor, "..",
1703 	    INO_OFF_TO_H(dirp->dotdot_ino, 0));
1704 
1705 	err = TRANS_DIR(dp, 0);
1706 	if (err)
1707 		fbrelse(fbp, S_OTHER);
1708 	else
1709 		err = ufs_fbwrite(fbp, dp);
1710 
1711 	fbp = NULL;
1712 	if (err)
1713 		goto bad;
1714 
1715 	rw_exit(&dp->i_contents);
1716 	rw_exit(&dp->i_rwlock);
1717 
1718 	/*
1719 	 * Decrement the link count of the old parent inode and force it out.
1720 	 */
1721 	ASSERT(opdp);
1722 	rw_enter(&opdp->i_contents, RW_WRITER);
1723 	ASSERT(opdp->i_nlink > 0);
1724 	opdp->i_nlink--;
1725 	ufs_setreclaim(opdp);
1726 	TRANS_INODE(opdp->i_ufsvfs, opdp);
1727 	opdp->i_flag |= ICHG;
1728 	opdp->i_seq++;
1729 	ufs_iupdat(opdp, I_SYNC);
1730 	rw_exit(&opdp->i_contents);
1731 	return (0);
1732 
1733 bad:
1734 	if (fbp)
1735 		fbrelse(fbp, S_OTHER);
1736 	rw_exit(&dp->i_contents);
1737 	rw_exit(&dp->i_rwlock);
1738 	return (err);
1739 }
1740 
1741 /*
1742  * Enter the file sip in the directory tdp with name namep.
1743  */
1744 static int
1745 ufs_diraddentry(
1746 	struct inode *tdp,
1747 	char *namep,
1748 	enum de_op op,
1749 	int namlen,
1750 	struct slot *slotp,
1751 	struct inode *sip,
1752 	struct inode *sdp,
1753 	struct cred *cr)
1754 {
1755 	struct direct *ep, *nep;
1756 	vnode_t *tdvp;
1757 	dcanchor_t *dcap = &tdp->i_danchor;
1758 	off_t offset;
1759 	int err;
1760 	ushort_t extra;
1761 
1762 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1763 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1764 	/*
1765 	 * Prepare a new entry.  If the caller has not supplied an
1766 	 * existing inode, make a new one.
1767 	 */
1768 	err = dirprepareentry(tdp, slotp, cr);
1769 	if (err) {
1770 		if (slotp->fbp) {
1771 			fbrelse(slotp->fbp, S_OTHER);
1772 			slotp->fbp = NULL;
1773 		}
1774 		return (err);
1775 	}
1776 	/*
1777 	 * Check inode to be linked to see if it is in the
1778 	 * same filesystem.
1779 	 */
1780 	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
1781 		err = EXDEV;
1782 		goto bad;
1783 	}
1784 
1785 	/*
1786 	 * If renaming a directory then fix up the ".." entry in the
1787 	 * directory to point to the new parent.
1788 	 */
1789 	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
1790 	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
1791 		err = ufs_dirfixdotdot(sip, sdp, tdp);
1792 		if (err)
1793 			goto bad;
1794 	}
1795 
1796 	/*
1797 	 * Fill in entry data.
1798 	 */
1799 	ep = slotp->ep;
1800 	ep->d_namlen = (ushort_t)namlen;
1801 	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
1802 	ep->d_ino = (uint32_t)sip->i_number;
1803 	tdvp = ITOV(tdp);
1804 	dnlc_update(tdvp, namep, ITOV(sip));
1805 	/*
1806 	 * Note the offset supplied for any named entry is
1807 	 * the offset of the previous one, unless it's the 1st.
1808 	 * slotp->size is used to pass the length to
1809 	 * the previous entry.
1810 	 */
1811 	if (slotp->size) {
1812 		offset = slotp->offset - slotp->size;
1813 	} else {
1814 		offset = slotp->offset + 1;
1815 	}
1816 
1817 	if (slotp->cached) {
1818 		/*
1819 		 * Add back any usable unused space to the dnlc directory
1820 		 * cache.
1821 		 */
1822 		extra = ep->d_reclen - DIRSIZ(ep);
1823 		if (extra >= LDIRSIZ(1)) {
1824 			(void) dnlc_dir_add_space(dcap, extra,
1825 			    (uint64_t)slotp->offset);
1826 		}
1827 
1828 		(void) dnlc_dir_add_entry(dcap, namep,
1829 		    INO_OFF_TO_H(ep->d_ino, offset));
1830 
1831 		/* adjust the previous offset of the next entry */
1832 		nep = (struct direct *)((char *)ep + ep->d_reclen);
1833 		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
1834 			/*
1835 			 * Not a new block.
1836 			 *
1837 			 * Check the validity of the next entry.
1838 			 * If it's bad, then throw away the cache, and
1839 			 * continue as before directory caching.
1840 			 */
1841 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1842 			    dnlc_dir_update(dcap, nep->d_name,
1843 			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
1844 			    == DNOENT) {
1845 				dnlc_dir_purge(dcap);
1846 				slotp->cached = 0;
1847 			}
1848 		}
1849 	}
1850 
1851 	/*
1852 	 * Write out the directory block.
1853 	 */
1854 	err = TRANS_DIR(tdp, slotp->offset);
1855 	if (err)
1856 		fbrelse(slotp->fbp, S_OTHER);
1857 	else
1858 		err = ufs_fbwrite(slotp->fbp, tdp);
1859 
1860 	slotp->fbp = NULL;
1861 	/*
1862 	 * If this is a rename of a directory, then we have already
1863 	 * fixed the ".." entry to refer to the new parent. If err
1864 	 * is true at this point, we have failed to update the new
1865 	 * parent to refer to the renamed directory.
1866 	 * XXX - we need to unwind the ".." fix.
1867 	 */
1868 	if (err)
1869 		return (err);
1870 
1871 	/*
1872 	 * Mark the directory inode to reflect the changes.
1873 	 * Truncate the directory to chop off blocks of empty entries.
1874 	 */
1875 
1876 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1877 	tdp->i_flag |= IUPD|ICHG;
1878 	tdp->i_seq++;
1879 	tdp->i_diroff = 0;
1880 	ITIMES_NOLOCK(tdp);
1881 	/*
1882 	 * If the directory grew then dirprepareentry() will have
1883 	 * set IATTCHG in tdp->i_flag, then the directory inode must
1884 	 * be flushed out. This is because if fsync() is used later
1885 	 * the directory size must be correct, otherwise a crash would
1886 	 * cause fsck to move the file to lost+found. Also because later
1887 	 * a file may be linked in more than one directory, then there
1888 	 * is no way to flush the original directory. So it must be
1889 	 * flushed out on creation. See bug 4293809.
1890 	 */
1891 	if (tdp->i_flag & IATTCHG) {
1892 		ufs_iupdat(tdp, I_SYNC);
1893 	}
1894 
1895 	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
1896 		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
1897 			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
1898 						cr);
1899 		}
1900 	}
1901 
1902 
1903 	return (0);
1904 
1905 bad:
1906 	if (slotp->cached) {
1907 		dnlc_dir_purge(dcap);
1908 		fbrelse(slotp->fbp, S_OTHER);
1909 		slotp->cached = 0;
1910 		slotp->fbp = NULL;
1911 		return (err);
1912 	}
1913 
1914 	/*
1915 	 * Clear out entry prepared by dirprepareent.
1916 	 */
1917 	slotp->ep->d_ino = 0;
1918 	slotp->ep->d_namlen = 0;
1919 
1920 	/*
1921 	 * Don't touch err so we don't clobber the real error that got us here.
1922 	 */
1923 	if (TRANS_DIR(tdp, slotp->offset))
1924 		fbrelse(slotp->fbp, S_OTHER);
1925 	else
1926 		(void) ufs_fbwrite(slotp->fbp, tdp);
1927 	slotp->fbp = NULL;
1928 	return (err);
1929 }
1930 
1931 /*
1932  * Prepare a directory slot to receive an entry.
1933  */
1934 static int
1935 dirprepareentry(
1936 	struct inode *dp,	/* directory we are working in */
1937 	struct slot *slotp,	/* available slot info */
1938 	struct cred *cr)
1939 {
1940 	struct direct *ep, *nep;
1941 	off_t entryend;
1942 	int err;
1943 	slotstat_t status = slotp->status;
1944 	ushort_t dsize;
1945 
1946 	ASSERT((status == NONE) || (status == FOUND));
1947 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
1948 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
1949 	/*
1950 	 * If we didn't find a slot, then indicate that the
1951 	 * new slot belongs at the end of the directory.
1952 	 * If we found a slot, then the new entry can be
1953 	 * put at slotp->offset.
1954 	 */
1955 	entryend = slotp->offset + slotp->size;
1956 	if (status == NONE) {
1957 		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
1958 		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
1959 			err = ufs_fault(ITOV(dp),
1960 			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
1961 			    " > dp->i_fs->fs_fsize: %d (%s)",
1962 			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
1963 			return (err);
1964 		}
1965 		/*
1966 		 * Allocate the new block.
1967 		 */
1968 		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
1969 		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
1970 		if (err) {
1971 			return (err);
1972 		}
1973 		dp->i_size = entryend;
1974 		TRANS_INODE(dp->i_ufsvfs, dp);
1975 		dp->i_flag |= IUPD|ICHG|IATTCHG;
1976 		dp->i_seq++;
1977 		ITIMES_NOLOCK(dp);
1978 	} else if (entryend > dp->i_size) {
1979 		/*
1980 		 * Adjust directory size, if needed. This should never
1981 		 * push the size past a new multiple of DIRBLKSIZ.
1982 		 * This is an artifact of the old (4.2BSD) way of initializing
1983 		 * directory sizes to be less than DIRBLKSIZ.
1984 		 */
1985 		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
1986 		TRANS_INODE(dp->i_ufsvfs, dp);
1987 		dp->i_flag |= IUPD|ICHG|IATTCHG;
1988 		dp->i_seq++;
1989 		ITIMES_NOLOCK(dp);
1990 	}
1991 
1992 	/*
1993 	 * Get the block containing the space for the new directory entry.
1994 	 */
1995 	if (slotp->fbp == NULL) {
1996 		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
1997 		    &slotp->fbp);
1998 		if (err) {
1999 			return (err);
2000 		}
2001 	}
2002 	ep = slotp->ep;
2003 
2004 	switch (status) {
2005 	case NONE:
2006 		/*
2007 		 * No space in the directory. slotp->offset will be on a
2008 		 * directory block boundary and we will write the new entry
2009 		 * into a fresh block.
2010 		 */
2011 		ep->d_reclen = DIRBLKSIZ;
2012 		slotp->size = 0; /* length of previous entry */
2013 		break;
2014 	case FOUND:
2015 		/*
2016 		 * An entry of the required size has been found. Use it.
2017 		 */
2018 		if (ep->d_ino == 0) {
2019 			/* this is the 1st record in a block */
2020 			slotp->size = 0; /* length of previous entry */
2021 		} else {
2022 			dsize = DIRSIZ(ep);
2023 			nep = (struct direct *)((char *)ep + dsize);
2024 			nep->d_reclen = ep->d_reclen - dsize;
2025 			ep->d_reclen = dsize;
2026 			slotp->ep = nep;
2027 			slotp->offset += dsize;
2028 			slotp->size = dsize; /* length of previous entry */
2029 		}
2030 		break;
2031 	default:
2032 		break;
2033 	}
2034 	return (0);
2035 }
2036 
2037 /*
2038  * Allocate and initialize a new inode that will go into directory tdp.
2039  * This routine is called from ufs_symlink(), as well as within this file.
2040  */
2041 int
2042 ufs_dirmakeinode(
2043 	struct inode *tdp,
2044 	struct inode **ipp,
2045 	struct vattr *vap,
2046 	enum de_op op,
2047 	struct cred *cr)
2048 {
2049 	struct inode *ip;
2050 	enum vtype type;
2051 	int imode;			/* mode and format as in inode */
2052 	ino_t ipref;
2053 	int err;
2054 	timestruc_t now;
2055 
2056 	ASSERT(vap != NULL);
2057 	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
2058 		op == DE_SYMLINK);
2059 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
2060 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
2061 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
2062 	/*
2063 	 * Allocate a new inode.
2064 	 */
2065 	type = vap->va_type;
2066 	if (type == VDIR) {
2067 		ipref = dirpref(tdp);
2068 	} else {
2069 		ipref = tdp->i_number;
2070 	}
2071 	if (op == DE_ATTRDIR)
2072 		imode = vap->va_mode;
2073 	else
2074 		imode = MAKEIMODE(type, vap->va_mode);
2075 	*ipp = NULL;
2076 	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
2077 	if (err)
2078 		return (err);
2079 
2080 	/*
2081 	 * We don't need to grab vfs_dqrwlock here because it is held
2082 	 * in ufs_direnter_*() above us.
2083 	 */
2084 	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
2085 	rw_enter(&ip->i_contents, RW_WRITER);
2086 	if (ip->i_dquot != NULL) {
2087 		err = ufs_fault(ITOV(ip),
2088 		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
2089 				    tdp->i_fs->fs_fsmnt);
2090 		rw_exit(&ip->i_contents);
2091 		return (err);
2092 	}
2093 	*ipp = ip;
2094 	ip->i_mode = (o_mode_t)imode;
2095 	if (type == VBLK || type == VCHR) {
2096 		dev_t d = vap->va_rdev;
2097 		dev32_t dev32;
2098 
2099 		/*
2100 		 * Don't allow a special file to be created with a
2101 		 * dev_t that cannot be represented by this filesystem
2102 		 * format on disk.
2103 		 */
2104 		if (!cmpldev(&dev32, d)) {
2105 			err = EOVERFLOW;
2106 			goto fail;
2107 		}
2108 
2109 		ITOV(ip)->v_rdev = ip->i_rdev = d;
2110 
2111 		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
2112 			ip->i_ordev = dev32; /* can't use old format */
2113 		} else {
2114 			ip->i_ordev = cmpdev(d);
2115 		}
2116 	}
2117 	ITOV(ip)->v_type = type;
2118 	ufs_reset_vnode(ip->i_vnode);
2119 	if (type == VDIR) {
2120 		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
2121 	} else {
2122 		ip->i_nlink = 1;
2123 	}
2124 
2125 	if (op == DE_ATTRDIR) {
2126 		ip->i_uid = vap->va_uid;
2127 		ip->i_gid = vap->va_gid;
2128 	} else
2129 		ip->i_uid = crgetuid(cr);
2130 	/*
2131 	 * To determine the group-id of the created file:
2132 	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
2133 	 *	clients are not likely to set the gid), then use it if
2134 	 *	the process is privileged, belongs to the target group,
2135 	 *	or the group is the same as the parent directory.
2136 	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
2137 	 *	GRPID option, and the directory's set-gid bit is clear,
2138 	 *	then use the process's gid.
2139 	 *   3) Otherwise, set the group-id to the gid of the parent directory.
2140 	 */
2141 	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
2142 	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
2143 	    secpolicy_vnode_create_gid(cr) == 0)) {
2144 		/*
2145 		 * XXX - is this only the case when a 4.0 NFS client, or a
2146 		 * client derived from that code, makes a call over the wire?
2147 		 */
2148 		ip->i_gid = vap->va_gid;
2149 	} else
2150 		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);
2151 
2152 	/*
2153 	 * For SunOS 5.0->5.4, the lines below read:
2154 	 *
2155 	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
2156 	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
2157 	 *
2158 	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
2159 	 */
2160 	ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
2161 		UID_LONG : ip->i_uid;
2162 	ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
2163 		GID_LONG : ip->i_gid;
2164 
2165 	/*
2166 	 * If we're creating a directory, and the parent directory has the
2167 	 * set-GID bit set, set it on the new directory.
2168 	 * Otherwise, if the user is neither privileged nor a member of the
2169 	 * file's new group, clear the file's set-GID bit.
2170 	 */
2171 	if ((tdp->i_mode & ISGID) && (type == VDIR))
2172 		ip->i_mode |= ISGID;
2173 	else {
2174 		if ((ip->i_mode & ISGID) &&
2175 		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
2176 			ip->i_mode &= ~ISGID;
2177 	}
2178 
2179 	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2180 	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2181 		err = EOVERFLOW;
2182 		goto fail;
2183 	}
2184 
2185 	/*
2186 	 * Extended attribute directories are not subject to quotas.
2187 	 */
2188 	if (op != DE_ATTRDIR)
2189 		ip->i_dquot = getinoquota(ip);
2190 	else
2191 		ip->i_dquot = NULL;
2192 
2193 	if (op == DE_MKDIR || op == DE_ATTRDIR) {
2194 		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
2195 		if (err)
2196 			goto fail;
2197 	}
2198 
2199 	/*
2200 	 * generate the shadow inode and attach it to the new object
2201 	 */
2202 	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
2203 	    (!tdp->i_shadow && !tdp->i_ufs_acl));
2204 	if (tdp->i_shadow && tdp->i_ufs_acl &&
2205 	    (((tdp->i_mode & IFMT) == IFDIR) ||
2206 	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
2207 		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
2208 		if (err) {
2209 			if (op == DE_MKDIR) {
2210 				/*
2211 				 * clean up parent directory
2212 				 *
2213 				 * tdp->i_contents already locked from
2214 				 * ufs_direnter_*()
2215 				 */
2216 				tdp->i_nlink--;
2217 				TRANS_INODE(tdp->i_ufsvfs, tdp);
2218 				tdp->i_flag |= ICHG;
2219 				tdp->i_seq++;
2220 				ufs_iupdat(tdp, I_SYNC);
2221 			}
2222 			goto fail;
2223 		}
2224 	}
2225 
2226 	/*
2227 	 * If the passed in attributes contain atime and/or mtime
2228 	 * settings, then use them instead of using the current
2229 	 * high resolution time.
2230 	 */
2231 	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
2232 		if (vap->va_mask & AT_ATIME) {
2233 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2234 			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2235 			ip->i_flag &= ~IACC;
2236 		} else
2237 			ip->i_flag |= IACC;
2238 		if (vap->va_mask & AT_MTIME) {
2239 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2240 			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2241 			gethrestime(&now);
2242 			if (now.tv_sec > TIME32_MAX) {
2243 				/*
2244 				 * In 2038, ctime sticks forever..
2245 				 */
2246 				ip->i_ctime.tv_sec = TIME32_MAX;
2247 				ip->i_ctime.tv_usec = 0;
2248 			} else {
2249 				ip->i_ctime.tv_sec = now.tv_sec;
2250 				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2251 			}
2252 			ip->i_flag &= ~(IUPD|ICHG);
2253 			ip->i_flag |= IMODTIME;
2254 		} else
2255 			ip->i_flag |= IUPD|ICHG;
2256 		ip->i_flag |= IMOD;
2257 	} else
2258 		ip->i_flag |= IACC|IUPD|ICHG;
2259 	ip->i_seq++;
2260 
2261 	/*
2262 	 * If this is an attribute tag it as one.
2263 	 */
2264 	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
2265 		ip->i_cflags |= IXATTR;
2266 	}
2267 
2268 	/*
2269 	 * push inode before it's name appears in a directory
2270 	 */
2271 	TRANS_INODE(ip->i_ufsvfs, ip);
2272 	ufs_iupdat(ip, I_SYNC);
2273 	rw_exit(&ip->i_contents);
2274 	return (0);
2275 
2276 fail:
2277 	/* Throw away inode we just allocated. */
2278 	ip->i_nlink = 0;
2279 	ufs_setreclaim(ip);
2280 	TRANS_INODE(ip->i_ufsvfs, ip);
2281 	ip->i_flag |= ICHG;
2282 	ip->i_seq++;
2283 	ITIMES_NOLOCK(ip);
2284 	rw_exit(&ip->i_contents);
2285 	return (err);
2286 }
2287 
2288 /*
2289  * Write a prototype directory into the empty inode ip, whose parent is dp.
2290  */
2291 static int
2292 ufs_dirmakedirect(
2293 	struct inode *ip,		/* new directory */
2294 	struct inode *dp,		/* parent directory */
2295 	int	attrdir,
2296 	struct cred *cr)
2297 {
2298 	struct dirtemplate *dirp;
2299 	struct fbuf *fbp;
2300 	int err;
2301 
2302 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
2303 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2304 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
2305 	/*
2306 	 * Allocate space for the directory we're creating.
2307 	 */
2308 	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
2309 	if (err)
2310 		return (err);
2311 	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
2312 		err = ufs_fault(ITOV(dp),
2313 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
2314 					DIRBLKSIZ, dp->i_fs->fs_fsize,
2315 					dp->i_fs->fs_fsmnt);
2316 		return (err);
2317 	}
2318 	ip->i_size = DIRBLKSIZ;
2319 	TRANS_INODE(ip->i_ufsvfs, ip);
2320 	ip->i_flag |= IUPD|ICHG|IATTCHG;
2321 	ip->i_seq++;
2322 	ITIMES_NOLOCK(ip);
2323 	/*
2324 	 * Update the tdp link count and write out the change.
2325 	 * This reflects the ".." entry we'll soon write.
2326 	 */
2327 	if (dp->i_nlink == MAXLINK)
2328 		return (EMLINK);
2329 	if (attrdir == 0)
2330 		dp->i_nlink++;
2331 	TRANS_INODE(dp->i_ufsvfs, dp);
2332 	dp->i_flag |= ICHG;
2333 	dp->i_seq++;
2334 	ufs_iupdat(dp, I_SYNC);
2335 	/*
2336 	 * Initialize directory with "."
2337 	 * and ".." from static template.
2338 	 *
2339 	 * Since the parent directory is locked, we don't have to
2340 	 * worry about anything changing when we drop the write
2341 	 * lock on (ip).
2342 	 *
2343 	 */
2344 	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
2345 	    S_READ, &fbp);
2346 
2347 	if (err) {
2348 		goto fail;
2349 	}
2350 	dirp = (struct dirtemplate *)fbp->fb_addr;
2351 	/*
2352 	 * Now initialize the directory we're creating
2353 	 * with the "." and ".." entries.
2354 	 */
2355 	*dirp = mastertemplate;			/* structure assignment */
2356 	dirp->dot_ino = (uint32_t)ip->i_number;
2357 	dirp->dotdot_ino = (uint32_t)dp->i_number;
2358 
2359 	err = TRANS_DIR(ip, 0);
2360 	if (err) {
2361 		fbrelse(fbp, S_OTHER);
2362 		goto fail;
2363 	}
2364 
2365 	err = ufs_fbwrite(fbp, ip);
2366 	if (err) {
2367 		goto fail;
2368 	}
2369 
2370 	return (0);
2371 
2372 fail:
2373 	if (attrdir == 0)
2374 		dp->i_nlink--;
2375 	TRANS_INODE(dp->i_ufsvfs, dp);
2376 	dp->i_flag |= ICHG;
2377 	dp->i_seq++;
2378 	ufs_iupdat(dp, I_SYNC);
2379 	return (err);
2380 }
2381 
2382 /*
2383  * Delete a directory entry.  If oip is nonzero the entry is checked
2384  * to make sure it still reflects oip.
2385  *
2386  * If vpp is non-null, return the ptr of the (held) vnode associated with
2387  * the removed name.  The caller is responsible for doing the VN_RELE().
2388  */
2389 int
2390 ufs_dirremove(
2391 	struct inode *dp,
2392 	char *namep,
2393 	struct inode *oip,
2394 	struct vnode *cdir,
2395 	enum dr_op op,
2396 	struct cred *cr,
2397 	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
2398 {
2399 	struct direct *ep, *pep, *nep;
2400 	struct inode *ip;
2401 	vnode_t *dvp, *vp;
2402 	struct slot slot;
2403 	int namlen;
2404 	int err;
2405 	int mode;
2406 	ushort_t extra;
2407 
2408 	namlen = (int)strlen(namep);
2409 	if (namlen == 0)
2410 		return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0"));
2411 	/*
2412 	 * return error when removing . and ..
2413 	 */
2414 	if (namep[0] == '.') {
2415 		if (namlen == 1)
2416 			return (EINVAL);
2417 		else if (namlen == 2 && namep[1] == '.') {
2418 			return (EEXIST);	/* SIGH should be ENOTEMPTY */
2419 		}
2420 	}
2421 
2422 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2423 	/*
2424 	 * Check accessibility of directory.
2425 	 */
2426 retry:
2427 	if (((dp->i_mode & IFMT) != IFDIR) &&
2428 	    ((dp->i_mode & IFMT) != IFATTRDIR)) {
2429 		return (ENOTDIR);
2430 	}
2431 
2432 	/*
2433 	 * Execute access is required to search the directory.
2434 	 * Access for write is interpreted as allowing
2435 	 * deletion of files in the directory.
2436 	 */
2437 	if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) {
2438 		return (err);
2439 	}
2440 
2441 	ip = NULL;
2442 	slot.fbp = NULL;
2443 	slot.status = FOUND;	/* don't need to look for empty slot */
2444 	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
2445 	rw_enter(&dp->i_contents, RW_WRITER);
2446 	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
2447 	if (err)
2448 		goto out_novfs;
2449 	if (ip == NULL) {
2450 		err = ENOENT;
2451 		goto out_novfs;
2452 	}
2453 	vp = ITOV(ip);
2454 	if (oip && oip != ip) {
2455 		err = ENOENT;
2456 		goto out_novfs;
2457 	}
2458 
2459 	mode = ip->i_mode & IFMT;
2460 	if (mode == IFDIR || mode == IFATTRDIR) {
2461 
2462 		/*
2463 		 * vn_vfswlock() prevents races between mount and rmdir.
2464 		 */
2465 		if (vn_vfswlock(vp)) {
2466 			err = EBUSY;
2467 			goto out_novfs;
2468 		}
2469 		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
2470 			err = EBUSY;
2471 			goto out;
2472 		}
2473 		/*
2474 		 * If we are removing a directory, get a lock on it.
2475 		 * Taking a writer lock prevents a parallel ufs_dirlook from
2476 		 * incorrectly entering a negative cache vnode entry in the dnlc
2477 		 * If the directory is empty, it will stay empty until
2478 		 * we can remove it.
2479 		 */
2480 		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
2481 			/*
2482 			 * It is possible that a thread in rename would have
2483 			 * acquired this rwlock. To prevent a deadlock we
2484 			 * do a rw_tryenter. If we fail to get the lock
2485 			 * we drop all the locks we have acquired, wait
2486 			 * for 2 ticks and reacquire the
2487 			 * directory's (dp) i_rwlock and try again.
2488 			 * If we dont drop dp's i_rwlock then we will panic
2489 			 * with a "Deadlock: cycle in blocking chain"
2490 			 * since in ufs_dircheckpath we want dp's i_rwlock.
2491 			 * dp is guaranteed to exist since ufs_dirremove is
2492 			 * called after a VN_HOLD(dp) has been done.
2493 			 */
2494 			ufs_dirremove_retry_cnt++;
2495 			vn_vfsunlock(vp);
2496 			if (slot.fbp)
2497 				fbrelse(slot.fbp, S_OTHER);
2498 			rw_exit(&dp->i_contents);
2499 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
2500 			rw_exit(&dp->i_rwlock);
2501 			VN_RELE(vp);
2502 			delay(2);
2503 			rw_enter(&dp->i_rwlock, RW_WRITER);
2504 			goto retry;
2505 		}
2506 	}
2507 	rw_enter(&ip->i_contents, RW_READER);
2508 
2509 	/*
2510 	 * Now check the restrictions that apply on sticky directories.
2511 	 */
2512 	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
2513 		rw_exit(&ip->i_contents);
2514 		if (mode == IFDIR || mode == IFATTRDIR)
2515 			rw_exit(&ip->i_rwlock);
2516 		goto out;
2517 	}
2518 
2519 	if (op == DR_RMDIR) {
2520 		/*
2521 		 * For rmdir(2), some special checks are required.
2522 		 * (a) Don't remove any alias of the parent (e.g. ".").
2523 		 * (b) Don't remove the current directory.
2524 		 * (c) Make sure the entry is (still) a directory.
2525 		 * (d) Make sure the directory is empty.
2526 		 */
2527 
2528 		if (dp == ip || vp == cdir)
2529 			err = EINVAL;
2530 		else if (((ip->i_mode & IFMT) != IFDIR) &&
2531 		    ((ip->i_mode & IFMT) != IFATTRDIR))
2532 			err = ENOTDIR;
2533 		else if ((ip->i_nlink > 2) ||
2534 		    !ufs_dirempty(ip, dp->i_number, cr)) {
2535 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
2536 		}
2537 
2538 		if (err) {
2539 			rw_exit(&ip->i_contents);
2540 			if (mode == IFDIR || mode == IFATTRDIR)
2541 				rw_exit(&ip->i_rwlock);
2542 			goto out;
2543 		}
2544 	} else if (op == DR_REMOVE)  {
2545 		/*
2546 		 * unlink(2) requires a different check: allow only
2547 		 * privileged users to unlink a directory.
2548 		 */
2549 		if (vp->v_type == VDIR &&
2550 		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
2551 			err = EPERM;
2552 			rw_exit(&ip->i_contents);
2553 			rw_exit(&ip->i_rwlock);
2554 			goto out;
2555 		}
2556 	}
2557 
2558 	rw_exit(&ip->i_contents);
2559 
2560 	/*
2561 	 * Remove the cache'd entry, if any.
2562 	 */
2563 	dvp = ITOV(dp);
2564 	dnlc_remove(dvp, namep);
2565 	ep = slot.ep;
2566 	ep->d_ino = 0;
2567 
2568 	if (slot.cached) {
2569 		dcanchor_t *dcap = &dp->i_danchor;
2570 
2571 		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
2572 		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
2573 			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
2574 		}
2575 		if (slot.offset & (DIRBLKSIZ - 1)) {
2576 			/*
2577 			 * Collapse new free space into previous entry.
2578 			 * Note, the previous entry has already been
2579 			 * validated in ufs_dircheckforname().
2580 			 */
2581 			ASSERT(slot.size);
2582 			pep = (struct direct *)((char *)ep - slot.size);
2583 			if ((pep->d_ino == 0) &&
2584 			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
2585 				dnlc_dir_purge(dcap);
2586 				slot.cached = 0;
2587 				goto nocache;
2588 			}
2589 			if (pep->d_ino) {
2590 				extra = pep->d_reclen - DIRSIZ(pep);
2591 			} else {
2592 				extra = pep->d_reclen;
2593 			}
2594 			if (extra >= LDIRSIZ(1)) {
2595 				(void) dnlc_dir_rem_space_by_handle(dcap,
2596 				    (uint64_t)(slot.offset - slot.size));
2597 			}
2598 			pep->d_reclen += ep->d_reclen;
2599 			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
2600 				(uint64_t)(slot.offset - slot.size));
2601 			/* adjust the previous pointer in the next entry */
2602 			nep = (struct direct *)((char *)ep + ep->d_reclen);
2603 			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
2604 				/*
2605 				 * Not a new block.
2606 				 *
2607 				 * Check the validity of the entry.
2608 				 * If it's bad, then throw away the cache and
2609 				 * continue.
2610 				 */
2611 				if ((nep->d_reclen == 0) ||
2612 				    (nep->d_reclen & 0x3) ||
2613 				    (dnlc_dir_update(dcap, nep->d_name,
2614 				    INO_OFF_TO_H(nep->d_ino,
2615 				    slot.offset - slot.size)) == DNOENT)) {
2616 					dnlc_dir_purge(dcap);
2617 					slot.cached = 0;
2618 				}
2619 			}
2620 		} else {
2621 			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
2622 			(uint64_t)slot.offset);
2623 		}
2624 	} else {
2625 		/*
2626 		 * If the entry isn't the first in the directory, we must
2627 		 * reclaim the space of the now empty record by adding
2628 		 * the record size to the size of the previous entry.
2629 		 */
2630 		if (slot.offset & (DIRBLKSIZ - 1)) {
2631 			/*
2632 			 * Collapse new free space into previous entry.
2633 			 */
2634 			pep = (struct direct *)((char *)ep - slot.size);
2635 			pep->d_reclen += ep->d_reclen;
2636 		}
2637 	}
2638 nocache:
2639 
2640 
2641 	err = TRANS_DIR(dp, slot.offset);
2642 	if (err)
2643 		fbrelse(slot.fbp, S_OTHER);
2644 	else
2645 		err = ufs_fbwrite(slot.fbp, dp);
2646 	slot.fbp = NULL;
2647 
2648 	/*
2649 	 * If we were removing a directory, it is 'gone' now, but we cannot
2650 	 * unlock it as a thread may be waiting for the lock in ufs_create. If
2651 	 * we did, it could then create a file in a deleted directory.
2652 	 */
2653 
2654 	if (err) {
2655 		if (mode == IFDIR || mode == IFATTRDIR)
2656 			rw_exit(&ip->i_rwlock);
2657 		goto out;
2658 	}
2659 
2660 	rw_enter(&ip->i_contents, RW_WRITER);
2661 
2662 	dp->i_flag |= IUPD|ICHG;
2663 	dp->i_seq++;
2664 	ip->i_flag |= ICHG;
2665 	ip->i_seq++;
2666 
2667 	TRANS_INODE(dp->i_ufsvfs, dp);
2668 	TRANS_INODE(ip->i_ufsvfs, ip);
2669 	/*
2670 	 * Now dispose of the inode.
2671 	 */
2672 	if (ip->i_nlink > 0) {
2673 		/*
2674 		 * This is not done for IFATTRDIR's because they don't
2675 		 * have entries in the dnlc and the link counts are
2676 		 * not incremented when they are created.
2677 		 */
2678 		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
2679 			/*
2680 			 * Decrement by 2 because we're trashing the "."
2681 			 * entry as well as removing the entry in dp.
2682 			 * Clear the directory entry, but there may be
2683 			 * other hard links so don't free the inode.
2684 			 * Decrement the dp linkcount because we're
2685 			 * trashing the ".." entry.
2686 			 */
2687 			ip->i_nlink -= 2;
2688 			dp->i_nlink--;
2689 			ufs_setreclaim(dp);
2690 			/*
2691 			 * XXX need to discard negative cache entries
2692 			 * for vp.  See comment in ufs_delete().
2693 			 */
2694 			dnlc_remove(vp, ".");
2695 			dnlc_remove(vp, "..");
2696 			/*
2697 			 * The return value is ignored here bacause if
2698 			 * the directory purge fails we don't want to
2699 			 * stop the delete. If ufs_dirpurgedotdot fails
2700 			 * the delete will continue with the preexiting
2701 			 * behavior.
2702 			 */
2703 			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
2704 		} else {
2705 			ip->i_nlink--;
2706 		}
2707 		ufs_setreclaim(ip);
2708 	}
2709 	ITIMES_NOLOCK(dp);
2710 	ITIMES_NOLOCK(ip);
2711 
2712 	if (!TRANS_ISTRANS(dp->i_ufsvfs))
2713 		ufs_iupdat(dp, I_SYNC);
2714 	if (!TRANS_ISTRANS(ip->i_ufsvfs))
2715 		ufs_iupdat(ip, I_SYNC);
2716 
2717 	rw_exit(&ip->i_contents);
2718 	if (mode == IFDIR || mode == IFATTRDIR)
2719 		rw_exit(&ip->i_rwlock);
2720 out:
2721 	if (mode == IFDIR || mode == IFATTRDIR) {
2722 		vn_vfsunlock(vp);
2723 	}
2724 out_novfs:
2725 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
2726 
2727 	if (slot.fbp)
2728 		fbrelse(slot.fbp, S_OTHER);
2729 
2730 	rw_exit(&dp->i_contents);
2731 	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
2732 
2733 	/*
2734 	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
2735 	 * The caller becomes responsible for the VN_RELE().  Otherwise,
2736 	 * Release (and delete) the inode after we drop vfs_dqrwlock to
2737 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
2738 	 */
2739 	if (ip) {
2740 		if ((err == 0) && (vpp != NULL)) {
2741 			*vpp = ITOV(ip);
2742 		} else {
2743 			VN_RELE(vp);
2744 		}
2745 	}
2746 
2747 	return (err);
2748 }
2749 
2750 /*
2751  * Return buffer with contents of block "offset"
2752  * from the beginning of directory "ip".  If "res"
2753  * is non-zero, fill it in with a pointer to the
2754  * remaining space in the directory.
2755  *
2756  */
2757 
2758 int
2759 blkatoff(
2760 	struct inode *ip,
2761 	off_t offset,
2762 	char **res,
2763 	struct fbuf **fbpp)
2764 {
2765 	struct fs *fs;
2766 	struct fbuf *fbp;
2767 	daddr_t lbn;
2768 	uint_t bsize;
2769 	int err;
2770 
2771 	CPU_STATS_ADD_K(sys, ufsdirblk, 1);
2772 	fs = ip->i_fs;
2773 	lbn = (daddr_t)lblkno(fs, offset);
2774 	bsize = (uint_t)blksize(fs, ip, lbn);
2775 	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
2776 			bsize, S_READ, &fbp);
2777 	if (err) {
2778 		*fbpp = (struct fbuf *)NULL;
2779 		return (err);
2780 	}
2781 	if (res)
2782 		*res = fbp->fb_addr + blkoff(fs, offset);
2783 	*fbpp = fbp;
2784 	return (0);
2785 }
2786 
2787 /*
2788  * Do consistency checking:
2789  *	record length must be multiple of 4
2790  *	entry must fit in rest of its DIRBLKSIZ block
2791  *	record must be large enough to contain entry
2792  *	name is not longer than MAXNAMLEN
2793  *	name must be as long as advertised, and null terminated
2794  * NOTE: record length must not be zero (should be checked previously).
2795  *       This routine is only called if dirchk is true.
2796  *       It would be nice to set the FSBAD flag in the super-block when
2797  *       this routine fails so that a fsck is forced on next reboot,
2798  *       but locking is a problem.
2799  */
2800 static int
2801 dirmangled(
2802 	struct inode *dp,
2803 	struct direct *ep,
2804 	int entryoffsetinblock,
2805 	off_t offset)
2806 {
2807 	int i;
2808 
2809 	i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
2810 	if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
2811 	    (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
2812 	    ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
2813 		dirbad(dp, "mangled entry", offset);
2814 		return (1);
2815 	}
2816 	return (0);
2817 }
2818 
2819 static void
2820 dirbad(struct inode *ip, char *how, off_t offset)
2821 {
2822 	cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
2823 	    ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
2824 }
2825 
/*
 * Check that the name at sp is exactly l bytes long.
 *
 * Returns 1 if a null byte appears within the first l bytes.  Otherwise
 * returns the byte at sp[l], which is zero (name is fine) only when the
 * name is null terminated right after its advertised length.
 */
static int
dirbadname(char *sp, int l)
{
	for (; l != 0; l--, sp++) {
		if (*sp == '\0')	/* embedded null */
			return (1);
	}
	return (*sp);			/* must be the terminating null */
}
2836 
2837 /*
2838  * Check if a directory is empty or not.
2839  */
2840 static int
2841 ufs_dirempty(
2842 	struct inode *ip,
2843 	ino_t parentino,
2844 	struct cred *cr)
2845 {
2846 	return (ufs_dirscan(ip, parentino, cr, 0));
2847 }
2848 
2849 /*
2850  * clear the .. directory entry.
2851  */
2852 static int
2853 ufs_dirpurgedotdot(
2854 	struct inode *ip,
2855 	ino_t parentino,
2856 	struct cred *cr)
2857 {
2858 	return (ufs_dirscan(ip, parentino, cr, 1));
2859 }
2860 
2861 /*
2862  * Scan the directoy. If clr_dotdot is true clear the ..
2863  * directory else check to see if the directory is empty.
2864  *
2865  * Using a struct dirtemplate here is not precisely
2866  * what we want, but better than using a struct direct.
2867  *
2868  * clr_dotdot is used as a flag to tell us if we need
2869  * to clear the dotdot entry
2870  *
2871  * N.B.: does not handle corrupted directories.
2872  */
2873 static int
2874 ufs_dirscan(
2875 	struct inode *ip,
2876 	ino_t parentino,
2877 	struct cred *cr,
2878 	int clr_dotdot)
2879 {
2880 	offset_t off;
2881 	struct dirtemplate dbuf;
2882 	struct direct *dp = (struct direct *)&dbuf;
2883 	int err, count;
2884 	int empty = 1;	/* Assume it's empty */
2885 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
2886 
2887 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2888 
2889 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
2890 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
2891 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
2892 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
2893 		/*
2894 		 * Since we read MINDIRSIZ, residual must
2895 		 * be 0 unless we're at end of file.
2896 		 */
2897 		if (err || count != 0 || dp->d_reclen == 0) {
2898 			empty = 0;
2899 			break;
2900 		}
2901 		/* skip empty entries */
2902 		if (dp->d_ino == 0)
2903 			continue;
2904 		/* accept only "." and ".." */
2905 		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
2906 			empty = 0;
2907 			break;
2908 		}
2909 		/*
2910 		 * At this point d_namlen must be 1 or 2.
2911 		 * 1 implies ".", 2 implies ".." if second
2912 		 * char is also "."
2913 		 */
2914 		if (dp->d_namlen == 1)
2915 			continue;
2916 		if (dp->d_name[1] == '.' &&
2917 		    (ino_t)dp->d_ino == parentino) {
2918 			/*
2919 			 * If we're doing a purge we need to check for
2920 			 * the . and .. entries and clear the d_ino for ..
2921 			 *
2922 			 * if clr_dotdot is set ufs_dirscan does not
2923 			 * check for an empty directory.
2924 			 */
2925 			if (clr_dotdot) {
2926 				/*
2927 				 * Have to actually zap the ..
2928 				 * entry in the directory, as
2929 				 * otherwise someone might have
2930 				 * dp as its cwd and try to
2931 				 * open .., which now points to
2932 				 * an unallocated inode.
2933 				 */
2934 				empty = ufs_dirclrdotdot(ip, parentino);
2935 				break;
2936 			} else {
2937 				continue;
2938 			}
2939 		}
2940 		empty = 0;
2941 		break;
2942 	}
2943 	return (empty);
2944 }
2945 
2946 clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */
2947 uint64_t dircheck_retry_cnt;
2948 /*
2949  * Check if source directory inode is in the path of the target directory.
2950  * Target is supplied locked.
2951  *
2952  * The source and target inode's should be different upon entry.
2953  */
2954 int
2955 ufs_dircheckpath(
2956 	ino_t source_ino,
2957 	struct inode *target,
2958 	struct inode *sdp,
2959 	struct cred *cr)
2960 {
2961 	struct fbuf *fbp;
2962 	struct dirtemplate *dirp;
2963 	struct inode *ip;
2964 	struct ufsvfs *ufsvfsp;
2965 	struct inode *tip;
2966 	ino_t dotdotino;
2967 	int err;
2968 
2969 	ASSERT(target->i_ufsvfs != NULL);
2970 	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
2971 	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));
2972 
2973 	ip = target;
2974 	if (ip->i_number == source_ino) {
2975 		err = EINVAL;
2976 		goto out;
2977 	}
2978 	if (ip->i_number == UFSROOTINO) {
2979 		err = 0;
2980 		goto out;
2981 	}
2982 	/*
2983 	 * Search back through the directory tree, using the ".." entries.
2984 	 * Fail any attempt to move a directory into an ancestor directory.
2985 	 */
2986 	fbp = NULL;
2987 	for (;;) {
2988 		struct vfs	*vfs;
2989 
2990 		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
2991 		if (err)
2992 			break;
2993 		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
2994 		    ip->i_size < sizeof (struct dirtemplate)) {
2995 			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
2996 			err = ENOTDIR;
2997 			break;
2998 		}
2999 		if (dirp->dotdot_namlen != 2 ||
3000 		    dirp->dotdot_name[0] != '.' ||
3001 		    dirp->dotdot_name[1] != '.') {
3002 			dirbad(ip, "mangled .. entry", (off_t)0);
3003 			err = ENOTDIR;		/* Sanity check */
3004 			break;
3005 		}
3006 		dotdotino = (ino_t)dirp->dotdot_ino;
3007 		if (dotdotino == source_ino) {
3008 			err = EINVAL;
3009 			break;
3010 		}
3011 		if (dotdotino == UFSROOTINO)
3012 			break;
3013 		if (fbp) {
3014 			fbrelse(fbp, S_OTHER);
3015 			fbp = NULL;
3016 		}
3017 		vfs = ip->i_vfs;
3018 		ufsvfsp = ip->i_ufsvfs;
3019 
3020 		if (ip != target) {
3021 			rw_exit(&ip->i_rwlock);
3022 			VN_RELE(ITOV(ip));
3023 		}
3024 		/*
3025 		 * Race to get the inode.
3026 		 */
3027 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3028 		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
3029 			rw_exit(&ufsvfsp->vfs_dqrwlock);
3030 			ip = NULL;
3031 			break;
3032 		}
3033 		rw_exit(&ufsvfsp->vfs_dqrwlock);
3034 		/*
3035 		 * If the directory of the source inode (also a directory)
3036 		 * is the same as this next entry up the chain, then
3037 		 * we know the source directory itself can't be in the
3038 		 * chain. This also prevents a panic because we already
3039 		 * have sdp->i_rwlock locked.
3040 		 */
3041 		if (tip == sdp) {
3042 			VN_RELE(ITOV(tip));
3043 			ip = NULL;
3044 			break;
3045 		}
3046 		ip = tip;
3047 
3048 		/*
3049 		 * If someone has set the WRITE_WANTED bit in this lock and if
3050 		 * this happens to be a sdp or tdp of another parallel rename
3051 		 * which is executing  the same code and in similar situation
3052 		 * we end up in a 4 way deadlock. We need to make sure that
3053 		 * the WRITE_WANTED bit is not  set.
3054 		 */
3055 retry_lock:
3056 		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
3057 			/*
3058 			 * If the lock held as WRITER thats fine but if it
3059 			 * has WRITE_WANTED bit set we might end up in a
3060 			 * deadlock. If WRITE_WANTED is set we return
3061 			 * with EAGAIN else we just go back and try.
3062 			 */
3063 			if (RW_ISWRITER(&ip->i_rwlock) &&
3064 					!(RW_WRITE_HELD(&ip->i_rwlock))) {
3065 				err = EAGAIN;
3066 				if (fbp) {
3067 					fbrelse(fbp, S_OTHER);
3068 				}
3069 				VN_RELE(ITOV(ip));
3070 				return (err);
3071 			} else {
3072 				/*
3073 				 * The lock is being write held. We could
3074 				 * just do a rw_enter here but there is a
3075 				 * window between the check and now, where
3076 				 * the status could have changed, so to
3077 				 * avoid looping we backoff and go back to
3078 				 * try for the lock.
3079 				 */
3080 				delay(retry_backoff_delay);
3081 				dircheck_retry_cnt++;
3082 				goto retry_lock;
3083 			}
3084 		}
3085 	}
3086 	if (fbp) {
3087 		fbrelse(fbp, S_OTHER);
3088 	}
3089 out:
3090 	if (ip) {
3091 		if (ip != target) {
3092 			rw_exit(&ip->i_rwlock);
3093 			VN_RELE(ITOV(ip));
3094 		}
3095 	}
3096 	return (err);
3097 }
3098 
3099 int
3100 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
3101 {
3102 	offset_t off;
3103 	struct dirtemplate dbuf;
3104 	struct direct *dp = (struct direct *)&dbuf;
3105 	int err, count;
3106 	int empty = 1;	/* Assume it's empty */
3107 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
3108 
3109 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3110 
3111 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
3112 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
3113 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
3114 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
3115 		/*
3116 		 * Since we read MINDIRSIZ, residual must
3117 		 * be 0 unless we're at end of file.
3118 		 */
3119 
3120 		if (err || count != 0 || dp->d_reclen == 0) {
3121 			empty = 0;
3122 			break;
3123 		}
3124 		/* skip empty entries */
3125 		if (dp->d_ino == 0)
3126 			continue;
3127 		/*
3128 		 * At this point d_namlen must be 1 or 2.
3129 		 * 1 implies ".", 2 implies ".." if second
3130 		 * char is also "."
3131 		 */
3132 
3133 		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
3134 				(ino_t)dp->d_ino == parentino)
3135 			continue;
3136 
3137 		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
3138 			dp->d_name[1] == '.') {
3139 			continue;
3140 		}
3141 		empty = 0;
3142 		break;
3143 	}
3144 	return (empty);
3145 }
3146 
3147 
3148 /*
3149  * Allocate and initialize a new shadow inode to contain extended attributes.
3150  */
3151 int
3152 ufs_xattrmkdir(
3153 	struct inode *tdp,
3154 	struct inode **ipp,
3155 	int flags,
3156 	struct cred *cr)
3157 {
3158 	struct inode *ip;
3159 	struct vattr va;
3160 	int err;
3161 	int retry = 1;
3162 	struct ufsvfs *ufsvfsp;
3163 	struct ulockfs *ulp;
3164 	int issync;
3165 	int trans_size;
3166 	int dorwlock;		/* 0 = not yet taken, */
3167 				/* 1 = taken outside the transaction, */
3168 				/* 2 = taken inside the transaction */
3169 
3170 	/*
3171 	 * Validate permission to create attribute directory
3172 	 */
3173 
3174 	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) {
3175 		return (err);
3176 	}
3177 
3178 	if (vn_is_readonly(ITOV(tdp)))
3179 		return (EROFS);
3180 
3181 	/*
3182 	 * No need to re-init err after again:, since it's set before
3183 	 * the next use of it.
3184 	 */
3185 again:
3186 	dorwlock = 0;
3187 	va.va_type = VDIR;
3188 	va.va_uid = tdp->i_uid;
3189 	va.va_gid = tdp->i_gid;
3190 
3191 	if ((tdp->i_mode & IFMT) == IFDIR) {
3192 		va.va_mode = (o_mode_t)IFATTRDIR;
3193 		va.va_mode |= tdp->i_mode & 0777;
3194 	} else {
3195 		va.va_mode = (o_mode_t)IFATTRDIR|0700;
3196 		if (tdp->i_mode & 0040)
3197 			va.va_mode |= 0750;
3198 		if (tdp->i_mode & 0004)
3199 			va.va_mode |= 0705;
3200 	}
3201 	va.va_mask = AT_TYPE|AT_MODE;
3202 
3203 	ufsvfsp = tdp->i_ufsvfs;
3204 
3205 	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3206 	if (err)
3207 		return (err);
3208 
3209 	/*
3210 	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
3211 	 * This follows the protocol for read()/write().
3212 	 */
3213 	if (ITOV(tdp)->v_type != VDIR) {
3214 		rw_enter(&tdp->i_rwlock, RW_WRITER);
3215 		dorwlock = 1;
3216 	}
3217 
3218 	if (ulp) {
3219 		trans_size = (int)TOP_MKDIR_SIZE(tdp);
3220 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
3221 	}
3222 
3223 	/*
3224 	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
3225 	 * This follows the protocol established by
3226 	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
3227 	 */
3228 	if (dorwlock == 0) {
3229 		rw_enter(&tdp->i_rwlock, RW_WRITER);
3230 		dorwlock = 2;
3231 	}
3232 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3233 	rw_enter(&tdp->i_contents, RW_WRITER);
3234 
3235 	/*
3236 	 * Suppress out of inodes messages if we will retry.
3237 	 */
3238 	if (retry)
3239 		tdp->i_flag |= IQUIET;
3240 	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
3241 	tdp->i_flag &= ~IQUIET;
3242 
3243 	if (err)
3244 		goto fail;
3245 
3246 	if (flags) {
3247 
3248 		/*
3249 		 * Now attach it to src file.
3250 		 */
3251 
3252 		tdp->i_oeftflag = ip->i_number;
3253 	}
3254 
3255 	ip->i_cflags |= IXATTR;
3256 	ITOV(ip)->v_flag |= V_XATTRDIR;
3257 	TRANS_INODE(ufsvfsp, tdp);
3258 	tdp->i_flag |= ICHG | IUPD;
3259 	tdp->i_seq++;
3260 	ufs_iupdat(tdp, I_SYNC);
3261 	rw_exit(&tdp->i_contents);
3262 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3263 
3264 	rw_enter(&ip->i_rwlock, RW_WRITER);
3265 	rw_enter(&ip->i_contents, RW_WRITER);
3266 	TRANS_INODE(ufsvfsp, ip);
3267 	ip->i_flag |= ICHG| IUPD;
3268 	ip->i_seq++;
3269 	ufs_iupdat(ip, I_SYNC);
3270 	rw_exit(&ip->i_contents);
3271 	rw_exit(&ip->i_rwlock);
3272 	if (dorwlock == 2)
3273 		rw_exit(&tdp->i_rwlock);
3274 	if (ulp) {
3275 		int terr = 0;
3276 
3277 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3278 		ufs_lockfs_end(ulp);
3279 		if (err == 0)
3280 			err = terr;
3281 	}
3282 	if (dorwlock == 1)
3283 		rw_exit(&tdp->i_rwlock);
3284 	*ipp = ip;
3285 	return (err);
3286 
3287 fail:
3288 	rw_exit(&tdp->i_contents);
3289 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3290 	if (dorwlock == 2)
3291 		rw_exit(&tdp->i_rwlock);
3292 	if (ulp) {
3293 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3294 		ufs_lockfs_end(ulp);
3295 	}
3296 	if (dorwlock == 1)
3297 		rw_exit(&tdp->i_rwlock);
3298 	if (ip != NULL)
3299 		VN_RELE(ITOV(ip));
3300 
3301 	/*
3302 	 * No inodes?  See if any are tied up in pending deletions.
3303 	 * This has to be done outside of any of the above, because
3304 	 * the draining operation can't be done from inside a transaction.
3305 	 */
3306 	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3307 		ufs_delete_drain_wait(ufsvfsp, 1);
3308 		retry = 0;
3309 		goto again;
3310 	}
3311 
3312 	return (err);
3313 }
3314 
3315 /*
3316  * clear the dotdot directory entry.
3317  * Used by ufs_dirscan when clr_dotdot
3318  * flag is set and we're deleting a
3319  * directory.
3320  */
3321 static int
3322 ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
3323 {
3324 	struct fbuf *fbp;
3325 	struct direct *dotp, *dotdotp;
3326 	int err = 0;
3327 
3328 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
3329 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3330 	err = blkatoff(ip, 0, NULL, &fbp);
3331 	if (err) {
3332 		return (err);
3333 	}
3334 
3335 	dotp = (struct direct *)fbp->fb_addr;
3336 	if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
3337 	    ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
3338 		dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
3339 		if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
3340 		    ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {
3341 
3342 			dotp->d_reclen += dotdotp->d_reclen;
3343 			if (parentino == dotdotp->d_ino) {
3344 				dotdotp->d_ino = 0;
3345 				dotdotp->d_namlen = 0;
3346 				dotdotp->d_reclen = 0;
3347 			}
3348 
3349 			err = TRANS_DIR(ip, 0);
3350 			if (err) {
3351 				fbrelse(fbp, S_OTHER);
3352 			} else {
3353 				err = ufs_fbwrite(fbp, ip);
3354 			}
3355 		}
3356 	} else {
3357 		err = -1;
3358 	}
3359 	return (err);
3360 }
3361