xref: /titanic_50/usr/src/uts/common/fs/ufs/ufs_dir.c (revision c77a61a72b5ecdc507d6cf104142edd371a16c84)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 /*
43  * Directory manipulation routines.
44  *
45  * When manipulating directories, the i_rwlock provides serialization
46  * since directories cannot be mmapped. The i_contents lock is redundant.
47  */
48 
49 #include <sys/types.h>
50 #include <sys/t_lock.h>
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/signal.h>
54 #include <sys/cred.h>
55 #include <sys/proc.h>
56 #include <sys/disp.h>
57 #include <sys/user.h>
58 #include <sys/vfs.h>
59 #include <sys/vnode.h>
60 #include <sys/stat.h>
61 #include <sys/mode.h>
62 #include <sys/buf.h>
63 #include <sys/uio.h>
64 #include <sys/dnlc.h>
65 #include <sys/fs/ufs_inode.h>
66 #include <sys/fs/ufs_fs.h>
67 #include <sys/mount.h>
68 #include <sys/fs/ufs_fsdir.h>
69 #include <sys/fs/ufs_trans.h>
70 #include <sys/fs/ufs_panic.h>
71 #include <sys/fs/ufs_quota.h>
72 #include <sys/errno.h>
73 #include <sys/debug.h>
74 #include <vm/seg.h>
75 #include <sys/sysmacros.h>
76 #include <sys/cmn_err.h>
77 #include <sys/cpuvar.h>
78 #include <sys/unistd.h>
79 #include <sys/policy.h>
80 
81 /*
82  * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
83  */
84 #if !ISP2(DIRBLKSIZ)
85 #error	"DIRBLKSIZ not a power of 2"
86 #endif
87 
88 /*
89  * A virgin directory.
 *
 * Template holding the two records of a brand-new directory block:
 * a 12-byte record named "." followed by a record named ".." whose
 * length (DIRBLKSIZ - 12) covers the remainder of the block.
 * Each row presumably mirrors struct direct (inode number, record
 * length, name length, name); the leading 0 would then be an inode
 * number placeholder filled in by whoever copies the template --
 * confirm against struct dirtemplate, which is declared elsewhere.
90  */
91 static struct dirtemplate mastertemplate = {
92 	0, 12, 1, ".",
93 	0, DIRBLKSIZ - 12, 2, ".."
94 };
95 
/*
 * LDIRSIZ(len): bytes needed for a directory record holding a name of
 * `len' characters.  This is the fixed part of struct direct (everything
 * except the (MAXNAMLEN + 1)-byte name array) plus the name and its
 * terminating NUL, rounded up to a multiple of 4.
 * The argument is parenthesized so that expressions such as `a >> 1' or
 * `c ? x : y' are evaluated as a unit.
 */
#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + (((len) + 1 + 3) &~ 3))
/*
 * MAX_DIR_NAME_LEN(len): longest name (excluding the NUL) that fits in a
 * record of `len' bytes, i.e. `len' minus the fixed part of struct direct
 * minus one byte for the terminator.
 */
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
100 
101 /*
102  * The dnlc directory cache allows a 64 bit handle for directory entries.
103  * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
104  * into the handle. Note, a 32 bit offset allows a 4GB directory, which
105  * is way beyond what could be cached in memory by the directory
106  * caching routines. So we are quite safe with this limit.
107  * The macros below pack and unpack the handle.
108  */
/*
 * Handle layout: bits 0-31 hold the inode number, bits 32-63 the offset.
 * Each expansion is fully parenthesized so it behaves as a single
 * operand, and INO_OFF_TO_H() masks the inode number to 32 bits so that a
 * sign-extended or wider-than-32-bit `ino' cannot spill into the offset
 * half of the handle.
 */
#define	H_TO_INO(h) ((uint32_t)((h) & UINT_MAX))
#define	H_TO_OFF(h) ((off_t)((uint64_t)(h) >> 32))
#define	INO_OFF_TO_H(ino, off) \
	((uint64_t)(((uint64_t)(off) << 32) | ((uint64_t)(ino) & UINT_MAX)))
112 
113 /*
114  * The average size of a typical on disk directory entry is about 16 bytes
115  * and so defines AV_DIRECT_SHIFT : log2(16)
116  * This define is only used to approximate the number of entries
117  * in a directory. This is needed for dnlc_dir_start() which will immediately
118  * return an error if the value is not within its acceptable range of
119  * number of files in a directory.
120  */
121 #define	AV_DIRECT_SHIFT 4
122 /*
123  * If the directory size (from i_size) is at least the ufs_min_dir_cache
124  * tunable then we request dnlc directory caching.
125  * This has been found to be profitable after 1024 file names.
 * The default is 1024 << AV_DIRECT_SHIFT, i.e. 16384 bytes.
126  */
127 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;
128 
129 /* The time point the dnlc directory caching was disabled */
/*
 * Set from gethrtime() when dnlc_dir_start() returns DNOMEM in
 * ufs_dirlook() and reset to 0 when caching is re-enabled.  Note this is
 * one file-wide value, whereas the disabled state (i_cachedir) is kept
 * per inode.
 */
130 static hrtime_t ufs_dc_disable_at;
131 /* directory caching disable duration */
/* Five times NANOSEC, compared against gethrtime() deltas in ufs_dirlook(). */
132 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5;
133 
/*
 * dirchk: when non-zero, ufs_dirlook() runs dirmangled() on every entry
 * it scans instead of only on entries with a misaligned d_reclen.
 * Defaults to on for DEBUG kernels only.
 */
134 #ifdef DEBUG
135 int dirchk = 1;
136 #else /* !DEBUG */
137 int dirchk = 0;
138 #endif /* DEBUG */
/*
 * ufs_negative_cache: when non-zero, a failed lookup in a directory whose
 * i_nlink is still positive is entered in the dnlc as DNLC_NO_VNODE.
 */
139 int ufs_negative_cache = 1;
/*
 * Not referenced in this part of the file; presumably a counter of
 * retries in ufs_dirremove() -- TODO confirm.
 */
140 uint64_t ufs_dirremove_retry_cnt;
141 
/*
 * Forward declarations of file-local helpers.  These use empty parameter
 * lists rather than prototypes, so the compiler does not check argument
 * counts or types at the call sites below.
 */
142 static void dirbad();
143 static int ufs_dirrename();
144 static int ufs_diraddentry();
145 static int ufs_dirempty();
146 static int ufs_dirscan();
147 static int ufs_dirclrdotdot();
148 static int ufs_dirfixdotdot();
149 static int ufs_dirpurgedotdot();
150 static int dirprepareentry();
151 static int ufs_dirmakedirect();
152 static int dirbadname();
153 static int dirmangled();
154 
155 /*
156  * Look for a given name in a directory.  On successful return, *ipp
157  * will point to the VN_HELD inode.
 *
 * dp must be a directory or attribute directory (else ENOTDIR) and cr
 * must pass the IEXEC check in ufs_iaccess().  namep must be non-empty.
 * A non-zero skipdnlc bypasses the initial dnlc_lookup().
 *
 * The search proceeds in up to three stages:
 *   1. dnlc_lookup() on the per-name cache (unless skipdnlc);
 *   2. dnlc_dir_lookup() on the directory's cache anchor (i_danchor);
 *   3. a scan of the directory blocks.  When not building a directory
 *      cache the scan starts at dp->i_diroff and wraps round to offset 0;
 *      when building one it always starts at offset 0.
 *
 * dp->i_rwlock is taken as reader here (after stage 1) and is not held
 * on return.  For ".." it is dropped around ufs_iget_alloced() and the
 * result is re-validated once the lock is held again.
 *
 * Returns 0, ENOENT if the name is absent, EAGAIN when ufs_tryirwlock()
 * leaves indeadlock set, or an error from ufs_iaccess(), blkatoff() or
 * ufs_iget_alloced().
158  */
159 int
160 ufs_dirlook(
161 	struct inode *dp,
162 	char *namep,
163 	struct inode **ipp,
164 	struct cred *cr,
165 	int skipdnlc)			/* skip the 1st level dnlc */
166 {
167 	uint64_t handle;
168 	struct fbuf *fbp;		/* a buffer of directory entries */
169 	struct direct *ep;		/* the current directory entry */
170 	struct vnode *vp;
171 	struct vnode *dvp;		/* directory vnode ptr */
172 	struct ulockfs *ulp;
173 	dcanchor_t *dcap;
174 	off_t endsearch;		/* offset to end directory search */
175 	off_t offset;
176 	off_t start_off;		/* starting offset from middle search */
177 	off_t last_offset;		/* last offset */
178 	int entryoffsetinblock;		/* offset of ep in addr's buffer */
179 	int numdirpasses;		/* strategy for directory search */
180 	int namlen;			/* length of name */
181 	int err;
182 	int doingchk;
183 	int i;
184 	int caching;
185 	int indeadlock;
186 	ino_t ep_ino;			/* entry i number */
187 	ino_t chkino;
188 	ushort_t ep_reclen;		/* direct local d_reclen */
189 
190 	ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */
191 
	/*
	 * ulp and indeadlock are never assigned or read by name anywhere
	 * else in this function; presumably the ufs_tryirwlock() macro
	 * uses them, together with the label name passed as its third
	 * argument -- confirm against the macro definition.
	 * NOTE(review): ulp stays uninitialized when dp->i_ufsvfs is NULL.
	 */
192 	if (dp->i_ufsvfs)
193 		ulp = &dp->i_ufsvfs->vfs_ulockfs;
194 	/*
195 	 * Check accessibility of directory.
196 	 */
197 	if (((dp->i_mode & IFMT) != IFDIR) &&
198 	    ((dp->i_mode & IFMT) != IFATTRDIR))
199 		return (ENOTDIR);
200 
201 	if (err = ufs_iaccess(dp, IEXEC, cr))
202 		return (err);
203 
204 	/*
205 	 * Check the directory name lookup cache, first for individual files
206 	 * then for complete directories.
207 	 */
208 	dvp = ITOV(dp);
209 	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
210 		/* vp is already held from dnlc_lookup */
211 		if (vp == DNLC_NO_VNODE) {
212 			VN_RELE(vp);
213 			return (ENOENT);
214 		}
215 		*ipp = VTOI(vp);
216 		return (0);
217 	}
218 
219 	dcap = &dp->i_danchor;
220 
221 	/*
222 	 * Grab the reader lock on the directory data before checking
223 	 * the dnlc to avoid a race with ufs_dirremove() & friends.
224 	 *
225 	 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
226 	 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
227 	 * possible, retries the operation.
228 	 */
229 	ufs_tryirwlock((&dp->i_rwlock), RW_READER, retry_dircache);
230 	if (indeadlock)
231 		return (EAGAIN);
232 
233 	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
234 	case DFOUND:
235 		ep_ino = (ino_t)H_TO_INO(handle);
236 		if (dp->i_number == ep_ino) {
237 			VN_HOLD(dvp);	/* want ourself, "." */
238 			*ipp = dp;
239 			rw_exit(&dp->i_rwlock);
240 			return (0);
241 		}
242 		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
243 			uint64_t handle2;
244 			/*
245 			 * release the lock on the dir we are searching
246 			 * to avoid a deadlock when grabbing the
247 			 * i_contents lock in ufs_iget_alloced().
248 			 */
249 			rw_exit(&dp->i_rwlock);
250 			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
251 			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
252 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
253 			/*
254 			 * must recheck as we dropped dp->i_rwlock
255 			 */
256 			ufs_tryirwlock(&dp->i_rwlock, RW_READER, retry_parent);
257 			if (indeadlock) {
258 				if (!err)
259 					VN_RELE(ITOV(*ipp));
260 				return (EAGAIN);
261 			}
			/*
			 * Accept the inode only if the directory cache still
			 * maps ".." to the very same handle as before.
			 */
262 			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
263 			    == DFOUND) && (handle == handle2)) {
264 				dnlc_update(dvp, namep, ITOV(*ipp));
265 				rw_exit(&dp->i_rwlock);
266 				return (0);
267 			}
268 			/* check failed, read the actual directory */
269 			if (!err) {
270 				VN_RELE(ITOV(*ipp));
271 			}
272 			goto restart;
273 		}
274 		/* usual case of not "." nor ".." */
275 		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
276 		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
277 		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
278 		if (err) {
279 			rw_exit(&dp->i_rwlock);
280 			return (err);
281 		}
282 		dnlc_update(dvp, namep, ITOV(*ipp));
283 		rw_exit(&dp->i_rwlock);
284 		return (0);
285 	case DNOENT:
286 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
287 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
288 		}
289 		rw_exit(&dp->i_rwlock);
290 		return (ENOENT);
291 	default:
		/* any other result: fall through to scanning the directory */
292 		break;
293 	}
294 restart:
295 
296 	fbp = NULL;
297 	doingchk = 0;
298 	chkino = 0;
299 	caching = 0;
300 
301 	/*
302 	 * Attempt to cache any directories greater than the tunable
303 	 * ufs_min_dir_cache. If it fails due to memory shortage (DNOMEM),
304 	 * disable caching for this directory and record the system time.
305 	 * Any attempt after the disable time has expired will enable
306 	 * the caching again.
307 	 */
308 	if (dp->i_size >= ufs_min_dir_cache) {
309 		/*
310 		 * if the directory caching disable time has expired
311 		 * enable the caching again.
312 		 */
313 		if (dp->i_cachedir == CD_DISABLED_NOMEM &&
314 		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
315 			ufs_dc_disable_at = 0;
316 			dp->i_cachedir = CD_ENABLED;
317 		}
318 		if (dp->i_cachedir == CD_ENABLED) {
319 			switch (dnlc_dir_start(dcap, dp->i_size >>
320 				AV_DIRECT_SHIFT)) {
321 			case DNOMEM:
322 				dp->i_cachedir = CD_DISABLED_NOMEM;
323 				ufs_dc_disable_at = gethrtime();
324 				break;
325 			case DTOOBIG:
326 				dp->i_cachedir = CD_DISABLED_TOOBIG;
327 				break;
328 			case DOK:
329 				caching = 1;
330 				break;
331 			default:
332 				break;
333 			}
334 		}
335 	}
336 	/*
337 	 * If caching we don't stop when the file has been
338 	 * found, but need to know later, so clear *ipp now
339 	 */
340 	*ipp = NULL;
341 
342 recheck:
343 	if (caching) {
344 		offset = 0;
345 		entryoffsetinblock = 0;
346 		numdirpasses = 1;
347 	} else {
348 		/*
349 		 * Take care to look at dp->i_diroff only once, as it
350 		 * may be changing due to other threads/cpus.
351 		 */
352 		offset = dp->i_diroff;
353 		if (offset > dp->i_size) {
354 			offset = 0;
355 		}
356 		if (offset == 0) {
357 			entryoffsetinblock = 0;
358 			numdirpasses = 1;
359 		} else {
360 			start_off = offset;
361 
362 			entryoffsetinblock = blkoff(dp->i_fs, offset);
363 			if (entryoffsetinblock != 0) {
364 				err = blkatoff(dp, offset, (char **)0, &fbp);
365 				if (err)
366 					goto bad;
367 			}
368 			numdirpasses = 2;
369 		}
370 	}
371 	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
372 	namlen = strlen(namep);
373 	last_offset = 0;
374 
375 searchloop:
376 	while (offset < endsearch) {
377 		/*
378 		 * If offset is on a block boundary,
379 		 * read the next directory block.
380 		 * Release previous if it exists.
381 		 */
382 		if (blkoff(dp->i_fs, offset) == 0) {
383 			if (fbp != NULL) {
384 				fbrelse(fbp, S_OTHER);
385 			}
386 			err = blkatoff(dp, offset, (char **)0, &fbp);
387 			if (err)
388 				goto bad;
389 			entryoffsetinblock = 0;
390 		}
391 
392 		/*
393 		 * If the offset to the next entry is invalid or if the
394 		 * next entry is a zero length record or if the record
395 		 * length is invalid, then skip to the next directory
396 		 * block.  Complete validation checks are done if the
397 		 * record length is invalid.
398 		 *
399 		 * Full validation checks are slow so they are disabled
400 		 * by default.  Complete checks can be run by patching
401 		 * "dirchk" to be true.
402 		 *
403 		 * We have to check the validity of entryoffsetinblock
404 		 * here because it can be set to i_diroff above.
405 		 */
406 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
		/*
		 * Note && binds tighter than ||: dirmangled() is called
		 * only when dirchk is set or d_reclen is not a multiple
		 * of 4; the first two tests skip the block on their own.
		 */
407 		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
408 		    (dirchk || (ep->d_reclen & 0x3)) &&
409 		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
			/* advance to the start of the next DIRBLKSIZ chunk */
410 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
411 			offset += i;
412 			entryoffsetinblock += i;
413 			if (caching) {
414 				dnlc_dir_purge(dcap);
415 				caching = 0;
416 			}
417 			continue;
418 		}
419 
420 		ep_reclen = ep->d_reclen;
421 
422 		/*
423 		 * Add named entries and free space into the directory cache
424 		 */
425 		if (caching) {
426 			ushort_t extra;
427 			off_t off2;
428 
429 			if (ep->d_ino == 0) {
430 				extra = ep_reclen;
				/*
				 * An unused entry that is not at the start
				 * of a DIRBLKSIZ chunk: give up caching
				 * this directory.
				 */
431 				if (offset & (DIRBLKSIZ - 1)) {
432 					dnlc_dir_purge(dcap);
433 					dp->i_cachedir = CD_DISABLED;
434 					caching = 0;
435 				}
436 			} else {
437 				/*
438 				 * entries hold the previous offset except the
439 				 * 1st which holds the offset + 1
440 				 */
441 				if (offset & (DIRBLKSIZ - 1)) {
442 					off2 = last_offset;
443 				} else {
444 					off2 = offset + 1;
445 				}
446 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
447 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
448 				extra = ep_reclen - DIRSIZ(ep);
449 			}
			/* record free space big enough for a 1-char name */
450 			if (caching && (extra >= LDIRSIZ(1))) {
451 				caching = (dnlc_dir_add_space(dcap, extra,
452 				    (uint64_t)offset) == DOK);
453 			}
454 		}
455 
456 		/*
457 		 * Check for a name match.
458 		 * We have the parent inode read locked with i_rwlock.
459 		 */
460 		if (ep->d_ino && ep->d_namlen == namlen &&
461 		    *namep == *ep->d_name &&	/* fast chk 1st chr */
462 		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
463 
464 			/*
465 			 * We have to release the fbp early here to avoid
466 			 * a possible deadlock situation where we have the
467 			 * fbp and want the directory inode and someone doing
468 			 * a ufs_direnter_* has the directory inode and wants
469 			 * the fbp.  XXX - is this still needed?
470 			 */
471 			ep_ino = (ino_t)ep->d_ino;
472 			ASSERT(fbp != NULL);
473 			fbrelse(fbp, S_OTHER);
474 			fbp = NULL;
475 
476 			/*
477 			 * Atomic update (read lock held)
478 			 */
479 			dp->i_diroff = offset;
480 
481 			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
482 				struct timeval32 omtime;
483 
484 				if (caching) {
485 					dnlc_dir_purge(dcap);
486 					caching = 0;
487 				}
488 				if (doingchk) {
489 					/*
490 					 * if the inumber didn't change
491 					 * continue with already found inode.
492 					 */
493 					if (ep_ino == chkino)
494 						goto checkok;
495 					else {
496 						VN_RELE(ITOV(*ipp));
497 						/* *ipp is nulled at restart */
498 						goto restart;
499 					}
500 				}
501 				/*
502 				 * release the lock on the dir we are searching
503 				 * to avoid a deadlock when grabbing the
504 				 * i_contents lock in ufs_iget_alloced().
505 				 */
506 				omtime = dp->i_mtime;
507 				rw_exit(&dp->i_rwlock);
508 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
509 						RW_READER);
510 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
511 				    cr);
512 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
513 				ufs_tryirwlock(&dp->i_rwlock, RW_READER,
514 						retry_disk);
515 				if (indeadlock) {
516 					if (!err)
517 						VN_RELE(ITOV(*ipp));
518 					return (EAGAIN);
519 				}
520 				if (err)
521 					goto bad;
522 				/*
523 				 * Since we released the lock on the directory,
524 				 * we must check that the same inode is still
525 				 * the ".." entry for this directory.
526 				 */
527 				/*CSTYLED*/
528 				if (timercmp(&omtime, &dp->i_mtime, !=)) {
529 					/*
530 					 * Modification time changed on the
531 					 * directory, we must go check if
532 					 * the inumber changed for ".."
533 					 */
534 					doingchk = 1;
535 					chkino = ep_ino;
536 					entryoffsetinblock = 0;
537 					if (caching) {
538 						/*
539 						 * Forget directory caching
540 						 * for this rare case
541 						 */
542 						dnlc_dir_purge(dcap);
543 						caching = 0;
544 					}
545 					goto recheck;
546 				}
547 			} else if (dp->i_number == ep_ino) {
548 				VN_HOLD(dvp);	/* want ourself, "." */
549 				*ipp = dp;
550 				if (caching) {
551 					dnlc_dir_purge(dcap);
552 					caching = 0;
553 				}
554 			} else {
555 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
556 						RW_READER);
557 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
558 				    cr);
559 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
560 				if (err)
561 					goto bad;
562 			}
563 checkok:
564 			ASSERT(*ipp);
565 			dnlc_update(dvp, namep, ITOV(*ipp));
566 			/*
567 			 * If we are not caching then just return the entry
568 			 * otherwise complete loading up the cache
569 			 */
570 			if (!caching) {
571 				rw_exit(&dp->i_rwlock);
572 				return (0);
573 			}
			/* fbp was released above; map the block again */
574 			err = blkatoff(dp, offset, (char **)0, &fbp);
575 			if (err)
576 				goto bad;
577 		}
578 		last_offset = offset;
579 		offset += ep_reclen;
580 		entryoffsetinblock += ep_reclen;
581 	}
582 	/*
583 	 * If we started in the middle of the directory and failed
584 	 * to find our target, we must check the beginning as well.
585 	 */
586 	if (numdirpasses == 2) {
587 		numdirpasses--;
588 		offset = 0;
589 		endsearch = start_off;
590 		goto searchloop;
591 	}
592 
593 	/*
594 	 * If whole directory caching is on (or was originally on) then
595 	 * the entry may have been found.
596 	 */
597 	if (*ipp == NULL) {
598 		err = ENOENT;
599 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
600 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
601 		}
602 	}
603 	if (caching) {
604 		dnlc_dir_complete(dcap);
605 		caching = 0;
606 	}
607 
608 bad:
609 	if (err && *ipp) {
610 		/*
611 		 * err and *ipp can both be set if we were attempting to
612 		 * cache the directory, and we found the entry, then later
613 		 * while trying to complete the directory cache encountered
614 		 * an error (eg reading a directory sector).
615 		 */
616 		VN_RELE(ITOV(*ipp));
617 		*ipp = NULL;
618 	}
619 
620 	if (fbp)
621 		fbrelse(fbp, S_OTHER);
622 	rw_exit(&dp->i_rwlock);
	/* still caching here means the cache was never completed */
623 	if (caching)
624 		dnlc_dir_purge(dcap);
625 	return (err);
626 }
627 
628 /*
629  * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
 *
 * The caller must hold tdp->i_rwlock as writer (asserted below).
 * On success *ipp is the newly made inode.  If the name already exists
 * the function returns EEXIST with *ipp set to the existing, held inode.
 *
 * Other returns visible here: EINVAL when tdp is an attribute directory
 * and op is DE_MKDIR or vap asks for a device, door, socket or fifo;
 * EACCES when namep contains '/'; ENOENT when tdp->i_nlink <= 0;
 * ENOTDIR when tdp is not a directory; EAGAIN from the "." / ".."
 * path; otherwise whatever ufs_iaccess(), ufs_dircheckforname(),
 * ufs_dirmakeinode() or ufs_diraddentry() report.
 *
 * NOTE: for "." and ".." tdp->i_rwlock is dropped before calling
 * ufs_dirlook().  When that call returns EAGAIN this function returns
 * without having re-taken the lock.
630  */
631 int
632 ufs_direnter_cm(
633 	struct inode *tdp,	/* target directory to make entry in */
634 	char *namep,		/* name of entry */
635 	enum de_op op,		/* entry operation */
636 	struct vattr *vap,	/* attributes if new inode needed */
637 	struct inode **ipp,	/* return entered inode here */
638 	struct cred *cr,	/* user credentials */
639 	int flags)		/* no entry exists */
640 {
641 	struct inode *tip;	/* inode of (existing) target file */
642 	char *s;
643 	struct ufs_slot slot;	/* slot info to pass around */
644 	int namlen;		/* length of name */
645 	int err;		/* error number */
646 	struct inode *nip;	/* new inode */
647 	int do_rele_nip = 0;	/* release nip */
	/* every flag bit except IQUIET is treated as "no entry exists" */
648 	int noentry = flags & ~IQUIET;
649 	int quiet = flags & IQUIET;	/* Suppress out of inodes message */
	/*
	 * indeadlock and ulp are not referenced by name except as noted;
	 * presumably they are used by the ufs_tryirwlock() macro -- confirm.
	 */
650 	int indeadlock;
651 	struct ulockfs *ulp;
652 
653 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
654 
655 	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
656 	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
657 	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
658 	    (vap->va_type == VFIFO))))
659 		return (EINVAL);
660 
661 	/* don't allow '/' characters in pathname component */
	/* the same pass also computes namlen */
662 	for (s = namep, namlen = 0; *s; s++, namlen++)
663 		if (*s == '/')
664 			return (EACCES);
665 	ASSERT(namlen);
666 
667 	/*
668 	 * If name is "." or ".." then if this is a create look it up
669 	 * and return EEXIST.
670 	 */
671 	if (namep[0] == '.' &&
672 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
673 		/*
674 		 * ufs_dirlook will acquire the i_rwlock
675 		 */
676 		if (tdp->i_ufsvfs)
677 			ulp = &tdp->i_ufsvfs->vfs_ulockfs;
678 		rw_exit(&tdp->i_rwlock);
679 		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) {
			/* returns with tdp->i_rwlock not held */
680 			if (err == EAGAIN)
681 				return (err);
682 
683 			/*
684 			 * ufs_tryirwlock uses rw_tryenter and checks for
685 			 * SLOCK to avoid i_rwlock, ufs_lockfs_begin deadlock.
686 			 * If deadlock possible, retries the operation.
687 			 */
688 			ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry_err);
689 			if (indeadlock)
690 				return (EAGAIN);
691 
692 			return (err);
693 		}
694 		ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry);
695 		if (indeadlock) {
			/* drop the hold ufs_dirlook() gave us on *ipp */
696 			VN_RELE(ITOV(*ipp));
697 			return (EAGAIN);
698 		}
699 		return (EEXIST);
700 	}
701 
702 	/*
703 	 * If target directory has not been removed, then we can consider
704 	 * allowing file to be created.
705 	 */
706 	if (tdp->i_nlink <= 0) {
707 		return (ENOENT);
708 	}
709 
710 	/*
711 	 * Check accessibility of directory.
712 	 */
713 	if (((tdp->i_mode & IFMT) != IFDIR) &&
714 	    ((tdp->i_mode & IFMT) != IFATTRDIR)) {
715 		return (ENOTDIR);
716 	}
717 
718 	/*
719 	 * Execute access is required to search the directory.
720 	 */
721 	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
722 		return (err);
723 	}
724 
725 	/*
726 	 * Search for the entry. Return VN_HELD tip if found.
727 	 */
728 	tip = NULL;
729 	slot.fbp = NULL;
730 	slot.status = NONE;
	/* lock order: vfs_dqrwlock (reader), then tdp->i_contents (writer) */
731 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
732 	rw_enter(&tdp->i_contents, RW_WRITER);
733 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
734 	if (err)
735 		goto out;
736 	if (tip) {
737 		ASSERT(!noentry);
738 		*ipp = tip;
739 		err = EEXIST;
740 	} else {
741 		/*
742 		 * The entry does not exist. Check write permission in
743 		 * directory to see if entry can be created.
744 		 */
745 		if (err = ufs_iaccess(tdp, IWRITE, cr))
746 			goto out;
747 		/*
748 		 * Make new inode and directory entry.
749 		 */
		/* IQUIET is set in i_flag here and cleared again at out: */
750 		tdp->i_flag |= quiet;
751 		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
			/*
			 * NOTE(review): nip has no initializer, so this test
			 * relies on ufs_dirmakeinode() storing NULL or a
			 * valid inode in it even on failure -- confirm.
			 */
752 			if (nip != NULL)
753 				do_rele_nip = 1;
754 			goto out;
755 		}
756 		if (err = ufs_diraddentry(tdp, namep, op,
757 		    namlen, &slot, nip, NULL, cr)) {
758 			/*
759 			 * Unmake the inode we just made.
760 			 */
761 			rw_enter(&nip->i_contents, RW_WRITER);
			/* a new directory had bumped the parent's link count */
762 			if (((nip->i_mode & IFMT) == IFDIR) ||
763 			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
764 				tdp->i_nlink--;
765 				ufs_setreclaim(tdp);
766 				tdp->i_flag |= ICHG;
767 				tdp->i_seq++;
768 				TRANS_INODE(tdp->i_ufsvfs, tdp);
769 				ITIMES_NOLOCK(tdp);
770 			}
771 			nip->i_nlink = 0;
772 			ufs_setreclaim(nip);
773 			TRANS_INODE(nip->i_ufsvfs, nip);
774 			nip->i_flag |= ICHG;
775 			nip->i_seq++;
776 			ITIMES_NOLOCK(nip);
777 			rw_exit(&nip->i_contents);
778 			do_rele_nip = 1;
779 		} else {
780 			*ipp = nip;
781 		}
782 	}
783 
784 out:
785 	if (slot.fbp)
786 		fbrelse(slot.fbp, S_OTHER);
787 
788 	tdp->i_flag &= ~quiet;
789 	rw_exit(&tdp->i_contents);
790 
791 	/*
792 	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
793 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
794 	 */
795 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
796 
797 	if (do_rele_nip) {
798 		VN_RELE(ITOV(nip));
799 	}
800 
801 	return (err);
802 }
803 
804 /*
805  * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
806  * If tvpp is non-null, return with the pointer to the target vnode.
 *
 * The caller must hold tdp->i_rwlock as writer (asserted below).
 * Unless op is DE_SYMLINK, the link count of sip is incremented and the
 * inode written synchronously before the directory is touched; the
 * increment is undone if anything later fails.
 *
 * Returns visible here: EACCES when namep contains '/'; EINVAL for a
 * rename onto "." or ".."; EEXIST for a link/symlink onto "." or ".."
 * or onto an existing name; ENOENT when sip->i_nlink or tdp->i_nlink
 * is <= 0; EMLINK when sip->i_nlink equals MAXLINK (checked for every
 * op, including DE_SYMLINK); ENOTDIR when tdp is not a directory;
 * otherwise whatever TRANS_SYNCIP(), ufs_sync_indir(), ufs_iaccess(),
 * ufs_dircheckforname(), ufs_dirrename() or ufs_diraddentry() report.
807  */
808 int
809 ufs_direnter_lr(
810 	struct inode *tdp,	/* target directory to make entry in */
811 	char *namep,		/* name of entry */
812 	enum de_op op,		/* entry operation */
813 	struct inode *sdp,	/* source inode parent if rename */
814 	struct inode *sip,	/* source inode */
815 	struct cred *cr,	/* user credentials */
816 	vnode_t **tvpp)		/* Return: (held) vnode of (existing) target */
817 {
818 	struct inode *tip;	/* inode of (existing) target file */
819 	char *s;
820 	struct ufs_slot slot;	/* slot info to pass around */
821 	int namlen;		/* length of name */
822 	int err;		/* error number */
823 
824 	/* don't allow '/' characters in pathname component */
	/* the same pass also computes namlen */
825 	for (s = namep, namlen = 0; *s; s++, namlen++)
826 		if (*s == '/')
827 			return (EACCES);
828 	ASSERT(namlen);
829 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
830 
831 	/*
832 	 * If name is "." or ".." then if this is a create look it up
833 	 * and return EEXIST.  Rename or link TO "." or ".." is forbidden.
834 	 */
835 	if (namep[0] == '.' &&
836 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
837 		if (op == DE_RENAME) {
838 			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
839 		}
840 		return (EEXIST);
841 	}
842 	/*
843 	 * For link and rename lock the source entry and check the link count
844 	 * to see if it has been removed while it was unlocked.  If not, we
845 	 * increment the link count and force the inode to disk to make sure
846 	 * that it is there before any directory entry that points to it.
847 	 *
848 	 * In the case of a symbolic link, we are dealing with a new inode
849 	 * which does not yet have any links.  We've created it with a link
850 	 * count of 1, and we don't want to increment it since this will be
851 	 * its first link.
852 	 *
853 	 * We are about to push the inode to disk. We make sure
854 	 * that the inode's data blocks are flushed first so the
855 	 * inode and its data blocks are always in sync.  This
856 	 * adds some robustness in the event of a power failure
857 	 * or panic where sync fails. If we panic before the
858 	 * inode is updated, then the inode still refers to the
859 	 * old data blocks (or none for a new file). If we panic
860 	 * after the inode is updated, then the inode refers to
861 	 * the new data blocks.
862 	 *
863 	 * We do this before grabbing the i_contents lock because
864 	 * ufs_syncip() will want that lock. We could do the data
865 	 * syncing after the removal checks, but upon return from
866 	 * the data sync we would have to repeat the removal
867 	 * checks.
868 	 */
869 	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
870 		return (err);
871 	}
872 
873 	rw_enter(&sip->i_contents, RW_WRITER);
874 	if (sip->i_nlink <= 0) {
875 		rw_exit(&sip->i_contents);
876 		return (ENOENT);
877 	}
878 	if (sip->i_nlink == MAXLINK) {
879 		rw_exit(&sip->i_contents);
880 		return (EMLINK);
881 	}
882 
883 	/*
884 	 * Sync the indirect blocks associated with the file
885 	 * for the same reasons as described above.  Since this
886 	 * call wants the i_contents lock held for it we can do
887 	 * this here with no extra work.
888 	 */
889 	if (err = ufs_sync_indir(sip)) {
890 		rw_exit(&sip->i_contents);
891 		return (err);
892 	}
893 
	/*
	 * From here on every failure must leave through out: or out2: so
	 * that the link count bumped below is undone.
	 */
894 	if (op != DE_SYMLINK)
895 		sip->i_nlink++;
896 	TRANS_INODE(sip->i_ufsvfs, sip);
897 	sip->i_flag |= ICHG;
898 	sip->i_seq++;
899 	ufs_iupdat(sip, I_SYNC);
900 	rw_exit(&sip->i_contents);
901 
902 	/*
903 	 * If target directory has not been removed, then we can consider
904 	 * allowing file to be created.
905 	 */
906 	if (tdp->i_nlink <= 0) {
907 		err = ENOENT;
908 		goto out2;
909 	}
910 	/*
911 	 * Check accessibility of directory.
912 	 */
913 	if (((tdp->i_mode & IFMT) != IFDIR) &&
914 	    (tdp->i_mode & IFMT) != IFATTRDIR) {
915 		err = ENOTDIR;
916 		goto out2;
917 	}
918 	/*
919 	 * Execute access is required to search the directory.
920 	 */
921 	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
922 		goto out2;
923 	}
924 
925 	/*
926 	 * Search for the entry. Return VN_HELD tip if found.
927 	 */
928 	tip = NULL;
929 	slot.status = NONE;
930 	slot.fbp = NULL;
	/* lock order: vfs_dqrwlock (reader), then tdp->i_contents (writer) */
931 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
932 	rw_enter(&tdp->i_contents, RW_WRITER);
933 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
934 	if (err)
935 		goto out;
936 
937 	if (tip) {
938 		switch (op) {
939 		case DE_RENAME:
940 			err = ufs_dirrename(sdp, sip, tdp, namep,
941 			    tip, &slot, cr);
942 			break;
943 
944 		case DE_LINK:
945 		case DE_SYMLINK:
946 			/*
947 			 * Can't link to an existing file.
948 			 */
949 			err = EEXIST;
950 			break;
951 		default:
952 			break;
953 		}
954 	} else {
955 		/*
956 		 * The entry does not exist. Check write permission in
957 		 * directory to see if entry can be created.
958 		 */
959 		if (err = ufs_iaccess(tdp, IWRITE, cr))
960 			goto out;
961 		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
962 		    cr);
963 	}
964 
965 out:
966 	if (slot.fbp)
967 		fbrelse(slot.fbp, S_OTHER);
968 
969 	rw_exit(&tdp->i_contents);
970 
971 	/*
972 	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
973 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
974 	 */
975 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
976 
977 	/*
978 	 * If we renamed a file over the top of an existing file,
979 	 * or linked a file to an existing file (or tried to),
980 	 * then set *tvpp to the target vnode, if tvpp is non-null
981 	 * otherwise, release and delete (or just release) the inode.
982 	 *
983 	 * N.B., by returning the target's vnode pointer to the caller,
984 	 * that caller becomes responsible for doing the VN_RELE.
985 	 */
986 	if (tip) {
987 		if ((err == 0) && (tvpp != NULL)) {
988 			*tvpp = ITOV(tip);
989 		} else {
990 			VN_RELE(ITOV(tip));
991 		}
992 	}
993 
	/*
	 * out2 is also reached directly from the early checks above, before
	 * tip and slot are initialized, so only err, op and sip are used
	 * from here on.
	 */
994 out2:
995 	if (err) {
996 		/*
997 		 * Undo bumped link count.
998 		 */
999 		if (op != DE_SYMLINK) {
1000 			rw_enter(&sip->i_contents, RW_WRITER);
1001 			sip->i_nlink--;
1002 			ufs_setreclaim(sip);
1003 			TRANS_INODE(sip->i_ufsvfs, sip);
1004 			sip->i_flag |= ICHG;
1005 			sip->i_seq++;
1006 			ITIMES_NOLOCK(sip);
1007 			rw_exit(&sip->i_contents);
1008 		}
1009 	}
1010 	return (err);
1011 }
1012 
1013 /*
1014  * Check for the existence of a name in a directory (unless noentry
1015  * is set) , or else of an empty
1016  * slot in which an entry may be made.  If the requested name is found,
1017  * then on return *ipp points at the inode and *offp contains
1018  * its offset in the directory.  If the name is not found, then *ipp
1019  * will be NULL and *slotp will contain information about a directory slot in
1020  * which an entry may be made (either an empty slot, or the first position
1021  * past the end of the directory).
1022  * The target directory inode (tdp) is supplied write locked (i_rwlock).
1023  *
1024  * This may not be used on "." or "..", but aliases of "." are ok.
1025  */
1026 int
1027 ufs_dircheckforname(
1028 	struct inode *tdp,	/* inode of directory being checked */
1029 	char *namep,		/* name we're checking for */
1030 	int namlen,		/* length of name, excluding null */
1031 	struct ufs_slot *slotp,	/* slot structure */
1032 	struct inode **ipp,	/* return inode if we find one */
1033 	struct cred *cr,
1034 	int noentry)		/* noentry - just look for space */
1035 {
1036 	uint64_t handle;
1037 	struct fbuf *fbp;	/* pointer to directory block */
1038 	struct direct *ep;	/* directory entry */
1039 	struct direct *nep;	/* next directory entry */
1040 	dcanchor_t *dcap;
1041 	vnode_t *dvp;		/* directory vnode ptr */
1042 	off_t dirsize;		/* size of the directory */
1043 	off_t offset;		/* offset in the directory */
1044 	off_t last_offset;	/* last offset */
1045 	off_t enduseful;	/* pointer past last used dir slot */
1046 	int entryoffsetinblk;	/* offset of ep in fbp's buffer */
1047 	int i;			/* length of mangled entry */
1048 	int needed;
1049 	int err;
1050 	int first;
1051 	int caching;
1052 	int stat;
1053 	ino_t ep_ino;
1054 	slotstat_t initstat = slotp->status;
1055 
1056 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1057 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1058 	ASSERT(*ipp == NULL);
1059 	fbp = NULL;
1060 
1061 	/*
1062 	 * First check if there is a complete cache of the directory.
1063 	 */
1064 	dvp = ITOV(tdp);
1065 
1066 	dcap = &tdp->i_danchor;
1067 	if (noentry) {
1068 		/*
1069 		 * We know from the 1st level dnlc cache that the entry
1070 		 * doesn't exist, so don't bother searching the directory
1071 		 * cache, but just look for space (possibly in the directory
1072 		 * cache).
1073 		 */
1074 		stat = DNOENT;
1075 	} else {
1076 		stat = dnlc_dir_lookup(dcap, namep, &handle);
1077 	}
1078 	switch (stat) {
1079 	case DFOUND:
1080 		ep_ino = (ino_t)H_TO_INO(handle);
1081 		if (tdp->i_number == ep_ino) {
1082 			*ipp = tdp;	/* we want ourself, ie "." */
1083 			VN_HOLD(dvp);
1084 		} else {
1085 			err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
1086 			if (err)
1087 				return (err);
1088 		}
1089 		offset = H_TO_OFF(handle);
1090 		first = 0;
1091 		if (offset & 1) {
1092 			/* This is the first entry in the block */
1093 			first = 1;
1094 			offset -= 1;
1095 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1096 		}
1097 		err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1098 		if (err) {
1099 			VN_RELE(ITOV(*ipp));
1100 			*ipp = NULL;
1101 			return (err);
1102 		}
1103 		/*
1104 		 * Check the validity of the entry.
1105 		 * If it's bad, then throw away the cache and
1106 		 * continue without it. The dirmangled() routine
1107 		 * will then be called upon it.
1108 		 */
1109 		if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1110 			VN_RELE(ITOV(*ipp));
1111 			*ipp = NULL;
1112 			dnlc_dir_purge(dcap);
1113 			break;
1114 		}
1115 		/*
1116 		 * Remember the returned offset is the offset of the
1117 		 * preceding record (unless this is the 1st record
1118 		 * in the DIRBLKSIZ sized block (disk sector)), then it's
1119 		 * offset + 1. Note, no real offsets are on odd boundaries.
1120 		 */
1121 		if (first) {
1122 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1123 			slotp->offset = offset;
1124 			slotp->size = 0;
1125 			slotp->ep = ep;
1126 		} else {
1127 			/* get the next entry */
1128 			nep = (struct direct *)((char *)ep + ep->d_reclen);
1129 			/*
1130 			 * Check the validity of this entry as well
1131 			 * If it's bad, then throw away the cache and
1132 			 * continue without it. The dirmangled() routine
1133 			 * will then be called upon it.
1134 			 */
1135 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1136 			    (nep->d_ino != ep_ino)) {
1137 				VN_RELE(ITOV(*ipp));
1138 				*ipp = NULL;
1139 				dnlc_dir_purge(dcap);
1140 				break;
1141 			}
1142 			slotp->offset = offset + ep->d_reclen;
1143 			slotp->size = ep->d_reclen;
1144 			slotp->ep = nep;
1145 		}
1146 		slotp->status = EXIST;
1147 		slotp->fbp = fbp;
1148 		slotp->endoff = 0;
1149 		slotp->cached = 1;
1150 		dnlc_update(dvp, namep, ITOV(*ipp));
1151 		return (0);
1152 	case DNOENT:
1153 		/*
1154 		 * The caller gets to set the initial slot status to
1155 		 * indicate whether it's interested in getting a
1156 		 * empty slot. For example, the status can be set
1157 		 * to FOUND when an entry is being deleted.
1158 		 */
1159 		ASSERT(slotp->fbp == NULL);
1160 		if (slotp->status == FOUND) {
1161 			return (0);
1162 		}
1163 		switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
1164 		    &handle)) {
1165 		case DFOUND:
1166 			offset = (off_t)handle;
1167 			err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1168 			if (err) {
1169 				dnlc_dir_purge(dcap);
1170 				ASSERT(*ipp == NULL);
1171 				return (err);
1172 			}
1173 			/*
1174 			 * Check the validity of the entry.
1175 			 * If it's bad, then throw away the cache and
1176 			 * continue without it. The dirmangled() routine
1177 			 * will then be called upon it.
1178 			 */
1179 			if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1180 				dnlc_dir_purge(dcap);
1181 				break;
1182 			}
1183 			/*
1184 			 * Remember the returned offset is the offset of the
1185 			 * containing record.
1186 			 */
1187 			slotp->status = FOUND;
1188 			slotp->ep = ep;
1189 			slotp->offset = offset;
1190 			slotp->fbp = fbp;
1191 			slotp->size = ep->d_reclen;
1192 			/*
1193 			 * Set end offset to 0. Truncation is handled
1194 			 * because the dnlc cache will blow away the
1195 			 * cached directory when an entry is removed
1196 			 * that drops the entries left to less than half
1197 			 * the minumum number (dnlc_min_dir_cache).
1198 			 */
1199 			slotp->endoff = 0;
1200 			slotp->cached = 1;
1201 			return (0);
1202 		case DNOENT:
1203 			slotp->status = NONE;
1204 			slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
1205 			    DIRBLKSIZ, u_offset_t);
1206 			slotp->size = DIRBLKSIZ;
1207 			slotp->endoff = 0;
1208 			slotp->cached = 1;
1209 			return (0);
1210 		default:
1211 			break;
1212 		}
1213 		break;
1214 	}
1215 	slotp->cached = 0;
1216 	caching = NULL;
1217 	if (!noentry && tdp->i_size >= ufs_min_dir_cache) {
1218 		/*
1219 		 * if the directory caching disable time has expired
1220 		 * enable caching again.
1221 		 */
1222 		if (tdp->i_cachedir == CD_DISABLED_NOMEM &&
1223 		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
1224 			ufs_dc_disable_at = 0;
1225 			tdp->i_cachedir = CD_ENABLED;
1226 		}
1227 		/*
1228 		 * Attempt to cache any directories greater than the tunable
1229 		 * ufs_min_cache_dir. If it fails due to memory shortage
1230 		 * (DNOMEM), disable caching for this directory and record
1231 		 * the system time. Any attempt after the disable time has
1232 		 * expired will enable the caching again.
1233 		 */
1234 		if (tdp->i_cachedir == CD_ENABLED) {
1235 			switch (dnlc_dir_start(dcap,
1236 			    tdp->i_size >> AV_DIRECT_SHIFT)) {
1237 			case DNOMEM:
1238 				tdp->i_cachedir = CD_DISABLED_NOMEM;
1239 				ufs_dc_disable_at = gethrtime();
1240 				break;
1241 			case DTOOBIG:
1242 				tdp->i_cachedir = CD_DISABLED_TOOBIG;
1243 				break;
1244 			case DOK:
1245 				caching = 1;
1246 				break;
1247 			default:
1248 				break;
1249 			}
1250 		}
1251 	}
1252 
1253 	/*
1254 	 * No point in using i_diroff since we must search whole directory
1255 	 */
1256 	dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
1257 	enduseful = 0;
1258 	offset = last_offset = 0;
1259 	entryoffsetinblk = 0;
1260 	needed = (int)LDIRSIZ(namlen);
1261 	while (offset < dirsize) {
1262 		/*
1263 		 * If offset is on a block boundary,
1264 		 * read the next directory block.
1265 		 * Release previous if it exists.
1266 		 */
1267 		if (blkoff(tdp->i_fs, offset) == 0) {
1268 			if (fbp != NULL)
1269 				fbrelse(fbp, S_OTHER);
1270 
1271 			err = blkatoff(tdp, offset, (char **)0, &fbp);
1272 			if (err) {
1273 				ASSERT(*ipp == NULL);
1274 				if (caching) {
1275 					dnlc_dir_purge(dcap);
1276 				}
1277 				return (err);
1278 			}
1279 			entryoffsetinblk = 0;
1280 		}
1281 		/*
1282 		 * If still looking for a slot, and at a DIRBLKSIZ
1283 		 * boundary, have to start looking for free space
1284 		 * again.
1285 		 */
1286 		if (slotp->status == NONE &&
1287 		    (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
1288 			slotp->offset = -1;
1289 		}
1290 		/*
1291 		 * If the next entry is a zero length record or if the
1292 		 * record length is invalid, then skip to the next
1293 		 * directory block.  Complete validation checks are
1294 		 * done if the record length is invalid.
1295 		 *
1296 		 * Full validation checks are slow so they are disabled
1297 		 * by default.  Complete checks can be run by patching
1298 		 * "dirchk" to be true.
1299 		 *
1300 		 * We do not have to check the validity of
1301 		 * entryoffsetinblk here because it starts out as zero
1302 		 * and is only incremented by d_reclen values that we
1303 		 * validate here.
1304 		 */
1305 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1306 		if (ep->d_reclen == 0 ||
1307 		    (dirchk || (ep->d_reclen & 0x3)) &&
1308 		    dirmangled(tdp, ep, entryoffsetinblk, offset)) {
1309 			i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
1310 			offset += i;
1311 			entryoffsetinblk += i;
1312 			if (caching) {
1313 				dnlc_dir_purge(dcap);
1314 				caching = 0;
1315 			}
1316 			continue;
1317 		}
1318 
1319 		/*
1320 		 * Add named entries and free space into the directory cache
1321 		 */
1322 		if (caching) {
1323 			ushort_t extra;
1324 			off_t off2;
1325 
1326 			if (ep->d_ino == 0) {
1327 				extra = ep->d_reclen;
1328 				if (offset & (DIRBLKSIZ - 1)) {
1329 					dnlc_dir_purge(dcap);
1330 					caching = 0;
1331 				}
1332 			} else {
1333 				/*
1334 				 * entries hold the previous offset if
1335 				 * not the 1st one
1336 				 */
1337 				if (offset & (DIRBLKSIZ - 1)) {
1338 					off2 = last_offset;
1339 				} else {
1340 					off2 = offset + 1;
1341 				}
1342 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
1343 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
1344 				extra = ep->d_reclen - DIRSIZ(ep);
1345 			}
1346 			if (caching && (extra >= LDIRSIZ(1))) {
1347 				caching = (dnlc_dir_add_space(dcap, extra,
1348 				    (uint64_t)offset) == DOK);
1349 			}
1350 		}
1351 
1352 		/*
1353 		 * If an appropriate sized slot has not yet been found,
1354 		 * check to see if one is available.
1355 		 */
1356 		if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
1357 			int size = ep->d_reclen;
1358 
1359 			if (ep->d_ino != 0)
1360 				size -= DIRSIZ(ep);
1361 			if (size > 0) {
1362 				if (size >= needed) {
1363 					slotp->offset = offset;
1364 					slotp->size = ep->d_reclen;
1365 					if (noentry) {
1366 						slotp->ep = ep;
1367 						slotp->fbp = fbp;
1368 						slotp->status = FOUND;
1369 						slotp->endoff = 0;
1370 						return (0);
1371 					}
1372 					slotp->status = FOUND;
1373 				} else if (slotp->status == NONE) {
1374 					if (slotp->offset == -1)
1375 						slotp->offset = offset;
1376 				}
1377 			}
1378 		}
1379 		/*
1380 		 * Check for a name match.
1381 		 */
1382 		if (ep->d_ino && ep->d_namlen == namlen &&
1383 		    *namep == *ep->d_name &&	/* fast chk 1st char */
1384 		    bcmp(namep, ep->d_name, namlen) == 0) {
1385 
1386 			tdp->i_diroff = offset;
1387 
1388 			if (tdp->i_number == ep->d_ino) {
1389 				*ipp = tdp;	/* we want ourself, ie "." */
1390 				VN_HOLD(dvp);
1391 			} else {
1392 				err = ufs_iget_alloced(tdp->i_vfs,
1393 				    (ino_t)ep->d_ino, ipp, cr);
1394 				if (err) {
1395 					fbrelse(fbp, S_OTHER);
1396 					if (caching)
1397 						dnlc_dir_purge(dcap);
1398 					return (err);
1399 				}
1400 			}
1401 			slotp->status = EXIST;
1402 			slotp->offset = offset;
1403 			slotp->size = (int)(offset - last_offset);
1404 			slotp->fbp = fbp;
1405 			slotp->ep = ep;
1406 			slotp->endoff = 0;
1407 			if (caching)
1408 				dnlc_dir_purge(dcap);
1409 			return (0);
1410 		}
1411 		last_offset = offset;
1412 		offset += ep->d_reclen;
1413 		entryoffsetinblk += ep->d_reclen;
1414 		if (ep->d_ino)
1415 			enduseful = offset;
1416 	}
1417 	if (fbp) {
1418 		fbrelse(fbp, S_OTHER);
1419 	}
1420 
1421 	if (caching) {
1422 		dnlc_dir_complete(dcap);
1423 		slotp->cached = 1;
1424 		if (slotp->status == FOUND) {
1425 			if (initstat == FOUND) {
1426 				return (0);
1427 			}
1428 			(void) dnlc_dir_rem_space_by_handle(dcap,
1429 			    slotp->offset);
1430 			slotp->endoff = 0;
1431 			return (0);
1432 		}
1433 	}
1434 
1435 	if (slotp->status == NONE) {
1436 		/*
1437 		 * We didn't find a slot; the new directory entry should be put
1438 		 * at the end of the directory.  Return an indication of where
1439 		 * this is, and set "endoff" to zero; since we're going to have
1440 		 * to extend the directory, we're certainly not going to
1441 		 * truncate it.
1442 		 */
1443 		slotp->offset = dirsize;
1444 		slotp->size = DIRBLKSIZ;
1445 		slotp->endoff = 0;
1446 	} else {
1447 		/*
1448 		 * We found a slot, and will return an indication of where that
1449 		 * slot is, as any new directory entry will be put there.
1450 		 * Since that slot will become a useful entry, if the last
1451 		 * useful entry we found was before this one, update the offset
1452 		 * of the last useful entry.
1453 		 */
1454 		if (enduseful < slotp->offset + slotp->size)
1455 			enduseful = slotp->offset + slotp->size;
1456 		slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
1457 	}
1458 	*ipp = NULL;
1459 	return (0);
1460 }
1461 
1462 uint64_t ufs_dirrename_retry_cnt;
1463 
1464 /*
1465  * Rename the entry in the directory tdp so that it points to
1466  * sip instead of tip.
1467  */
1468 static int
1469 ufs_dirrename(
1470 	struct inode *sdp,	/* parent directory of source */
1471 	struct inode *sip,	/* source inode */
1472 	struct inode *tdp,	/* parent directory of target */
1473 	char *namep,		/* entry we are trying to change */
1474 	struct inode *tip,	/* target inode */
1475 	struct ufs_slot *slotp,	/* slot for entry */
1476 	struct cred *cr)	/* credentials */
1477 {
1478 	vnode_t *tdvp;
1479 	off_t offset;
1480 	int err;
1481 	int doingdirectory;
1482 
1483 	ASSERT(sdp->i_ufsvfs != NULL);
1484 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1485 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1486 	/*
1487 	 * Short circuit rename of something to itself.
1488 	 */
1489 	if (sip->i_number == tip->i_number) {
1490 		return (ESAME); /* special KLUDGE error code */
1491 	}
1492 
1493 	/*
1494 	 * We're locking 2 peer level locks, so must use tryenter
1495 	 * on the 2nd to avoid deadlocks that would occur
1496 	 * if we renamed a->b and b->a concurrently.
1497 	 */
1498 retry:
1499 	rw_enter(&tip->i_contents, RW_WRITER);
1500 	if (!rw_tryenter(&sip->i_contents, RW_READER)) {
1501 		/*
1502 		 * drop tip and wait (sleep) until we stand a chance
1503 		 * of holding sip
1504 		 */
1505 		rw_exit(&tip->i_contents);
1506 		rw_enter(&sip->i_contents, RW_READER);
1507 		/*
1508 		 * Reverse the lock grabs in case we have heavy
1509 		 * contention on the 2nd lock.
1510 		 */
1511 		if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
1512 			ufs_dirrename_retry_cnt++;
1513 			rw_exit(&sip->i_contents);
1514 			goto retry;
1515 		}
1516 	}
1517 
1518 	/*
1519 	 * Check that everything is on the same filesystem.
1520 	 */
1521 	if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
1522 	    (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
1523 		err = EXDEV;		/* XXX archaic */
1524 		goto out;
1525 	}
1526 	/*
1527 	 * Must have write permission to rewrite target entry.
1528 	 * Perform additional checks for sticky directories.
1529 	 */
1530 	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 ||
1531 	    (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
1532 		goto out;
1533 
1534 	/*
1535 	 * Ensure source and target are compatible (both directories
1536 	 * or both not directories).  If target is a directory it must
1537 	 * be empty and have no links to it; in addition it must not
1538 	 * be a mount point, and both the source and target must be
1539 	 * writable.
1540 	 */
1541 	doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
1542 	    ((sip->i_mode & IFMT) == IFATTRDIR));
1543 	if (((tip->i_mode & IFMT) == IFDIR) ||
1544 	    ((tip->i_mode & IFMT) == IFATTRDIR)) {
1545 		if (!doingdirectory) {
1546 			err = EISDIR;
1547 			goto out;
1548 		}
1549 		/*
1550 		 * vn_vfsrlock will prevent mounts from using the directory
1551 		 * until we are done.
1552 		 */
1553 		if (vn_vfsrlock(ITOV(tip))) {
1554 			err = EBUSY;
1555 			goto out;
1556 		}
1557 		if (vn_mountedvfs(ITOV(tip)) != NULL) {
1558 			vn_vfsunlock(ITOV(tip));
1559 			err = EBUSY;
1560 			goto out;
1561 		}
1562 		if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
1563 			vn_vfsunlock(ITOV(tip));
1564 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
1565 			goto out;
1566 		}
1567 	} else if (doingdirectory) {
1568 		err = ENOTDIR;
1569 		goto out;
1570 	}
1571 
1572 	/*
1573 	 * Rewrite the inode pointer for target name entry
1574 	 * from the target inode (ip) to the source inode (sip).
1575 	 * This prevents the target entry from disappearing
1576 	 * during a crash. Mark the directory inode to reflect the changes.
1577 	 */
1578 	tdvp = ITOV(tdp);
1579 	slotp->ep->d_ino = (int32_t)sip->i_number;
1580 	dnlc_update(tdvp, namep, ITOV(sip));
1581 	if (slotp->size) {
1582 		offset = slotp->offset - slotp->size;
1583 	} else {
1584 		offset = slotp->offset + 1;
1585 	}
1586 	if (slotp->cached) {
1587 		(void) dnlc_dir_update(&tdp->i_danchor, namep,
1588 		    INO_OFF_TO_H(slotp->ep->d_ino, offset));
1589 	}
1590 
1591 	err = TRANS_DIR(tdp, slotp->offset);
1592 	if (err)
1593 		fbrelse(slotp->fbp, S_OTHER);
1594 	else
1595 		err = ufs_fbwrite(slotp->fbp, tdp);
1596 
1597 	slotp->fbp = NULL;
1598 	if (err) {
1599 		if (doingdirectory)
1600 			vn_vfsunlock(ITOV(tip));
1601 		goto out;
1602 	}
1603 
1604 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1605 	tdp->i_flag |= IUPD|ICHG;
1606 	tdp->i_seq++;
1607 	ITIMES_NOLOCK(tdp);
1608 
1609 	/*
1610 	 * Decrement the link count of the target inode.
1611 	 * Fix the ".." entry in sip to point to dp.
1612 	 * This is done after the new entry is on the disk.
1613 	 */
1614 	tip->i_nlink--;
1615 	TRANS_INODE(tip->i_ufsvfs, tip);
1616 	tip->i_flag |= ICHG;
1617 	tip->i_seq++;
1618 	ITIMES_NOLOCK(tip);
1619 	if (doingdirectory) {
1620 		/*
1621 		 * The entry for tip no longer exists so I can unlock the
1622 		 * vfslock.
1623 		 */
1624 		vn_vfsunlock(ITOV(tip));
1625 		/*
1626 		 * Decrement target link count once more if it was a directory.
1627 		 */
1628 		if (--tip->i_nlink != 0) {
1629 			err = ufs_fault(ITOV(tip),
1630 		    "ufs_dirrename: target directory link count != 0 (%s)",
1631 			    tip->i_fs->fs_fsmnt);
1632 			rw_exit(&tip->i_contents);
1633 			return (err);
1634 		}
1635 		TRANS_INODE(tip->i_ufsvfs, tip);
1636 		ufs_setreclaim(tip);
1637 		/*
1638 		 * Renaming a directory with the parent different
1639 		 * requires that ".." be rewritten.  The window is
1640 		 * still there for ".." to be inconsistent, but this
1641 		 * is unavoidable, and a lot shorter than when it was
1642 		 * done in a user process.  We decrement the link
1643 		 * count in the new parent as appropriate to reflect
1644 		 * the just-removed target.  If the parent is the
1645 		 * same, this is appropriate since the original
1646 		 * directory is going away.  If the new parent is
1647 		 * different, ufs_dirfixdotdot() will bump the link count
1648 		 * back.
1649 		 */
1650 		tdp->i_nlink--;
1651 		ufs_setreclaim(tdp);
1652 		TRANS_INODE(tdp->i_ufsvfs, tdp);
1653 		tdp->i_flag |= ICHG;
1654 		tdp->i_seq++;
1655 		ITIMES_NOLOCK(tdp);
1656 		if (sdp != tdp) {
1657 			rw_exit(&tip->i_contents);
1658 			rw_exit(&sip->i_contents);
1659 			err = ufs_dirfixdotdot(sip, sdp, tdp);
1660 			return (err);
1661 		}
1662 	} else
1663 		ufs_setreclaim(tip);
1664 out:
1665 	rw_exit(&tip->i_contents);
1666 	rw_exit(&sip->i_contents);
1667 	return (err);
1668 }
1669 
1670 /*
1671  * Fix the ".." entry of the child directory so that it points
1672  * to the new parent directory instead of the old one.  Routine
1673  * assumes that dp is a directory and that all the inodes are on
1674  * the same file system.
1675  */
1676 static int
1677 ufs_dirfixdotdot(
1678 	struct inode *dp,	/* child directory */
1679 	struct inode *opdp,	/* old parent directory */
1680 	struct inode *npdp)	/* new parent directory */
1681 {
1682 	struct fbuf *fbp;
1683 	struct dirtemplate *dirp;
1684 	vnode_t *dvp;
1685 	int err;
1686 
1687 	ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
1688 	ASSERT(RW_WRITE_HELD(&npdp->i_contents));
1689 
1690 	/*
1691 	 * We hold the child directory's i_contents lock before calling
1692 	 * blkatoff so that we honor correct locking protocol which is
1693 	 * i_contents lock and then page lock. (blkatoff will call
1694 	 * ufs_getpage where we want the page lock)
1695 	 * We hold the child directory's i_rwlock before i_contents (as
1696 	 * per the locking protocol) since we are modifying the ".." entry
1697 	 * of the child directory.
1698 	 * We hold the i_rwlock and i_contents lock until we record
1699 	 * this directory delta to the log (via ufs_trans_dir) and have
1700 	 * done fbrelse.
1701 	 */
1702 	rw_enter(&dp->i_rwlock, RW_WRITER);
1703 	rw_enter(&dp->i_contents, RW_WRITER);
1704 	err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
1705 	if (err)
1706 		goto bad;
1707 
1708 	if (dp->i_nlink <= 0 ||
1709 	    dp->i_size < sizeof (struct dirtemplate)) {
1710 		err = ENOENT;
1711 		goto bad;
1712 	}
1713 
1714 	if (dirp->dotdot_namlen != 2 ||
1715 	    dirp->dotdot_name[0] != '.' ||
1716 	    dirp->dotdot_name[1] != '.') {	/* Sanity check. */
1717 		dirbad(dp, "mangled .. entry", (off_t)0);
1718 		err = ENOTDIR;
1719 		goto bad;
1720 	}
1721 
1722 	/*
1723 	 * Increment the link count in the new parent inode and force it out.
1724 	 */
1725 	if (npdp->i_nlink == MAXLINK) {
1726 		err = EMLINK;
1727 		goto bad;
1728 	}
1729 	npdp->i_nlink++;
1730 	TRANS_INODE(npdp->i_ufsvfs, npdp);
1731 	npdp->i_flag |= ICHG;
1732 	npdp->i_seq++;
1733 	ufs_iupdat(npdp, I_SYNC);
1734 
1735 	/*
1736 	 * Rewrite the child ".." entry and force it out.
1737 	 */
1738 	dvp = ITOV(dp);
1739 	dirp->dotdot_ino = (uint32_t)npdp->i_number;
1740 	dnlc_update(dvp, "..", ITOV(npdp));
1741 	(void) dnlc_dir_update(&dp->i_danchor, "..",
1742 	    INO_OFF_TO_H(dirp->dotdot_ino, 0));
1743 
1744 	err = TRANS_DIR(dp, 0);
1745 	if (err)
1746 		fbrelse(fbp, S_OTHER);
1747 	else
1748 		err = ufs_fbwrite(fbp, dp);
1749 
1750 	fbp = NULL;
1751 	if (err)
1752 		goto bad;
1753 
1754 	rw_exit(&dp->i_contents);
1755 	rw_exit(&dp->i_rwlock);
1756 
1757 	/*
1758 	 * Decrement the link count of the old parent inode and force it out.
1759 	 */
1760 	ASSERT(opdp);
1761 	rw_enter(&opdp->i_contents, RW_WRITER);
1762 	ASSERT(opdp->i_nlink > 0);
1763 	opdp->i_nlink--;
1764 	ufs_setreclaim(opdp);
1765 	TRANS_INODE(opdp->i_ufsvfs, opdp);
1766 	opdp->i_flag |= ICHG;
1767 	opdp->i_seq++;
1768 	ufs_iupdat(opdp, I_SYNC);
1769 	rw_exit(&opdp->i_contents);
1770 	return (0);
1771 
1772 bad:
1773 	if (fbp)
1774 		fbrelse(fbp, S_OTHER);
1775 	rw_exit(&dp->i_contents);
1776 	rw_exit(&dp->i_rwlock);
1777 	return (err);
1778 }
1779 
1780 /*
1781  * Enter the file sip in the directory tdp with name namep.
1782  */
1783 static int
1784 ufs_diraddentry(
1785 	struct inode *tdp,
1786 	char *namep,
1787 	enum de_op op,
1788 	int namlen,
1789 	struct ufs_slot *slotp,
1790 	struct inode *sip,
1791 	struct inode *sdp,
1792 	struct cred *cr)
1793 {
1794 	struct direct *ep, *nep;
1795 	vnode_t *tdvp;
1796 	dcanchor_t *dcap = &tdp->i_danchor;
1797 	off_t offset;
1798 	int err;
1799 	ushort_t extra;
1800 
1801 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1802 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1803 	/*
1804 	 * Prepare a new entry.  If the caller has not supplied an
1805 	 * existing inode, make a new one.
1806 	 */
1807 	err = dirprepareentry(tdp, slotp, cr);
1808 	if (err) {
1809 		if (slotp->fbp) {
1810 			fbrelse(slotp->fbp, S_OTHER);
1811 			slotp->fbp = NULL;
1812 		}
1813 		return (err);
1814 	}
1815 	/*
1816 	 * Check inode to be linked to see if it is in the
1817 	 * same filesystem.
1818 	 */
1819 	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
1820 		err = EXDEV;
1821 		goto bad;
1822 	}
1823 
1824 	/*
1825 	 * If renaming a directory then fix up the ".." entry in the
1826 	 * directory to point to the new parent.
1827 	 */
1828 	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
1829 	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
1830 		err = ufs_dirfixdotdot(sip, sdp, tdp);
1831 		if (err)
1832 			goto bad;
1833 	}
1834 
1835 	/*
1836 	 * Fill in entry data.
1837 	 */
1838 	ep = slotp->ep;
1839 	ep->d_namlen = (ushort_t)namlen;
1840 	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
1841 	ep->d_ino = (uint32_t)sip->i_number;
1842 	tdvp = ITOV(tdp);
1843 	dnlc_update(tdvp, namep, ITOV(sip));
1844 	/*
1845 	 * Note the offset supplied for any named entry is
1846 	 * the offset of the previous one, unless it's the 1st.
1847 	 * slotp->size is used to pass the length to
1848 	 * the previous entry.
1849 	 */
1850 	if (slotp->size) {
1851 		offset = slotp->offset - slotp->size;
1852 	} else {
1853 		offset = slotp->offset + 1;
1854 	}
1855 
1856 	if (slotp->cached) {
1857 		/*
1858 		 * Add back any usable unused space to the dnlc directory
1859 		 * cache.
1860 		 */
1861 		extra = ep->d_reclen - DIRSIZ(ep);
1862 		if (extra >= LDIRSIZ(1)) {
1863 			(void) dnlc_dir_add_space(dcap, extra,
1864 			    (uint64_t)slotp->offset);
1865 		}
1866 
1867 		(void) dnlc_dir_add_entry(dcap, namep,
1868 		    INO_OFF_TO_H(ep->d_ino, offset));
1869 
1870 		/* adjust the previous offset of the next entry */
1871 		nep = (struct direct *)((char *)ep + ep->d_reclen);
1872 		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
1873 			/*
1874 			 * Not a new block.
1875 			 *
1876 			 * Check the validity of the next entry.
1877 			 * If it's bad, then throw away the cache, and
1878 			 * continue as before directory caching.
1879 			 */
1880 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1881 			    dnlc_dir_update(dcap, nep->d_name,
1882 			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
1883 			    == DNOENT) {
1884 				dnlc_dir_purge(dcap);
1885 				slotp->cached = 0;
1886 			}
1887 		}
1888 	}
1889 
1890 	/*
1891 	 * Write out the directory block.
1892 	 */
1893 	err = TRANS_DIR(tdp, slotp->offset);
1894 	if (err)
1895 		fbrelse(slotp->fbp, S_OTHER);
1896 	else
1897 		err = ufs_fbwrite(slotp->fbp, tdp);
1898 
1899 	slotp->fbp = NULL;
1900 	/*
1901 	 * If this is a rename of a directory, then we have already
1902 	 * fixed the ".." entry to refer to the new parent. If err
1903 	 * is true at this point, we have failed to update the new
1904 	 * parent to refer to the renamed directory.
1905 	 * XXX - we need to unwind the ".." fix.
1906 	 */
1907 	if (err)
1908 		return (err);
1909 
1910 	/*
1911 	 * Mark the directory inode to reflect the changes.
1912 	 * Truncate the directory to chop off blocks of empty entries.
1913 	 */
1914 
1915 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1916 	tdp->i_flag |= IUPD|ICHG;
1917 	tdp->i_seq++;
1918 	tdp->i_diroff = 0;
1919 	ITIMES_NOLOCK(tdp);
1920 	/*
1921 	 * If the directory grew then dirprepareentry() will have
1922 	 * set IATTCHG in tdp->i_flag, then the directory inode must
1923 	 * be flushed out. This is because if fsync() is used later
1924 	 * the directory size must be correct, otherwise a crash would
1925 	 * cause fsck to move the file to lost+found. Also because later
1926 	 * a file may be linked in more than one directory, then there
1927 	 * is no way to flush the original directory. So it must be
1928 	 * flushed out on creation. See bug 4293809.
1929 	 */
1930 	if (tdp->i_flag & IATTCHG) {
1931 		ufs_iupdat(tdp, I_SYNC);
1932 	}
1933 
1934 	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
1935 		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
1936 			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
1937 						cr);
1938 		}
1939 	}
1940 
1941 
1942 	return (0);
1943 
1944 bad:
1945 	if (slotp->cached) {
1946 		dnlc_dir_purge(dcap);
1947 		fbrelse(slotp->fbp, S_OTHER);
1948 		slotp->cached = 0;
1949 		slotp->fbp = NULL;
1950 		return (err);
1951 	}
1952 
1953 	/*
1954 	 * Clear out entry prepared by dirprepareent.
1955 	 */
1956 	slotp->ep->d_ino = 0;
1957 	slotp->ep->d_namlen = 0;
1958 
1959 	/*
1960 	 * Don't touch err so we don't clobber the real error that got us here.
1961 	 */
1962 	if (TRANS_DIR(tdp, slotp->offset))
1963 		fbrelse(slotp->fbp, S_OTHER);
1964 	else
1965 		(void) ufs_fbwrite(slotp->fbp, tdp);
1966 	slotp->fbp = NULL;
1967 	return (err);
1968 }
1969 
1970 /*
1971  * Prepare a directory slot to receive an entry.
1972  */
1973 static int
1974 dirprepareentry(
1975 	struct inode *dp,	/* directory we are working in */
1976 	struct ufs_slot *slotp,	/* available slot info */
1977 	struct cred *cr)
1978 {
1979 	struct direct *ep, *nep;
1980 	off_t entryend;
1981 	int err;
1982 	slotstat_t status = slotp->status;
1983 	ushort_t dsize;
1984 
1985 	ASSERT((status == NONE) || (status == FOUND));
1986 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
1987 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
1988 	/*
1989 	 * If we didn't find a slot, then indicate that the
1990 	 * new slot belongs at the end of the directory.
1991 	 * If we found a slot, then the new entry can be
1992 	 * put at slotp->offset.
1993 	 */
1994 	entryend = slotp->offset + slotp->size;
1995 	if (status == NONE) {
1996 		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
1997 		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
1998 			err = ufs_fault(ITOV(dp),
1999 			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
2000 			    " > dp->i_fs->fs_fsize: %d (%s)",
2001 			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
2002 			return (err);
2003 		}
2004 		/*
2005 		 * Allocate the new block.
2006 		 */
2007 		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
2008 		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
2009 		if (err) {
2010 			return (err);
2011 		}
2012 		dp->i_size = entryend;
2013 		TRANS_INODE(dp->i_ufsvfs, dp);
2014 		dp->i_flag |= IUPD|ICHG|IATTCHG;
2015 		dp->i_seq++;
2016 		ITIMES_NOLOCK(dp);
2017 	} else if (entryend > dp->i_size) {
2018 		/*
2019 		 * Adjust directory size, if needed. This should never
2020 		 * push the size past a new multiple of DIRBLKSIZ.
2021 		 * This is an artifact of the old (4.2BSD) way of initializing
2022 		 * directory sizes to be less than DIRBLKSIZ.
2023 		 */
2024 		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
2025 		TRANS_INODE(dp->i_ufsvfs, dp);
2026 		dp->i_flag |= IUPD|ICHG|IATTCHG;
2027 		dp->i_seq++;
2028 		ITIMES_NOLOCK(dp);
2029 	}
2030 
2031 	/*
2032 	 * Get the block containing the space for the new directory entry.
2033 	 */
2034 	if (slotp->fbp == NULL) {
2035 		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
2036 		    &slotp->fbp);
2037 		if (err) {
2038 			return (err);
2039 		}
2040 	}
2041 	ep = slotp->ep;
2042 
2043 	switch (status) {
2044 	case NONE:
2045 		/*
2046 		 * No space in the directory. slotp->offset will be on a
2047 		 * directory block boundary and we will write the new entry
2048 		 * into a fresh block.
2049 		 */
2050 		ep->d_reclen = DIRBLKSIZ;
2051 		slotp->size = 0; /* length of previous entry */
2052 		break;
2053 	case FOUND:
2054 		/*
2055 		 * An entry of the required size has been found. Use it.
2056 		 */
2057 		if (ep->d_ino == 0) {
2058 			/* this is the 1st record in a block */
2059 			slotp->size = 0; /* length of previous entry */
2060 		} else {
2061 			dsize = DIRSIZ(ep);
2062 			nep = (struct direct *)((char *)ep + dsize);
2063 			nep->d_reclen = ep->d_reclen - dsize;
2064 			ep->d_reclen = dsize;
2065 			slotp->ep = nep;
2066 			slotp->offset += dsize;
2067 			slotp->size = dsize; /* length of previous entry */
2068 		}
2069 		break;
2070 	default:
2071 		break;
2072 	}
2073 	return (0);
2074 }
2075 
2076 /*
2077  * Allocate and initialize a new inode that will go into directory tdp.
2078  * This routine is called from ufs_symlink(), as well as within this file.
2079  */
2080 int
2081 ufs_dirmakeinode(
2082 	struct inode *tdp,
2083 	struct inode **ipp,
2084 	struct vattr *vap,
2085 	enum de_op op,
2086 	struct cred *cr)
2087 {
2088 	struct inode *ip;
2089 	enum vtype type;
2090 	int imode;			/* mode and format as in inode */
2091 	ino_t ipref;
2092 	int err;
2093 	timestruc_t now;
2094 
2095 	ASSERT(vap != NULL);
2096 	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
2097 		op == DE_SYMLINK);
2098 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
2099 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
2100 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
2101 	/*
2102 	 * Allocate a new inode.
2103 	 */
2104 	type = vap->va_type;
2105 	if (type == VDIR) {
2106 		ipref = dirpref(tdp);
2107 	} else {
2108 		ipref = tdp->i_number;
2109 	}
2110 	if (op == DE_ATTRDIR)
2111 		imode = vap->va_mode;
2112 	else
2113 		imode = MAKEIMODE(type, vap->va_mode);
2114 	*ipp = NULL;
2115 	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
2116 	if (err)
2117 		return (err);
2118 
2119 	/*
2120 	 * We don't need to grab vfs_dqrwlock here because it is held
2121 	 * in ufs_direnter_*() above us.
2122 	 */
2123 	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
2124 	rw_enter(&ip->i_contents, RW_WRITER);
2125 	if (ip->i_dquot != NULL) {
2126 		err = ufs_fault(ITOV(ip),
2127 		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
2128 				    tdp->i_fs->fs_fsmnt);
2129 		rw_exit(&ip->i_contents);
2130 		return (err);
2131 	}
2132 	*ipp = ip;
2133 	ip->i_mode = (o_mode_t)imode;
2134 	if (type == VBLK || type == VCHR) {
2135 		dev_t d = vap->va_rdev;
2136 		dev32_t dev32;
2137 
2138 		/*
2139 		 * Don't allow a special file to be created with a
2140 		 * dev_t that cannot be represented by this filesystem
2141 		 * format on disk.
2142 		 */
2143 		if (!cmpldev(&dev32, d)) {
2144 			err = EOVERFLOW;
2145 			goto fail;
2146 		}
2147 
2148 		ITOV(ip)->v_rdev = ip->i_rdev = d;
2149 
2150 		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
2151 			ip->i_ordev = dev32; /* can't use old format */
2152 		} else {
2153 			ip->i_ordev = cmpdev(d);
2154 		}
2155 	}
2156 	ITOV(ip)->v_type = type;
2157 	ufs_reset_vnode(ip->i_vnode);
2158 	if (type == VDIR) {
2159 		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
2160 	} else {
2161 		ip->i_nlink = 1;
2162 	}
2163 
2164 	if (op == DE_ATTRDIR) {
2165 		ip->i_uid = vap->va_uid;
2166 		ip->i_gid = vap->va_gid;
2167 	} else
2168 		ip->i_uid = crgetuid(cr);
2169 	/*
2170 	 * To determine the group-id of the created file:
2171 	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
2172 	 *	clients are not likely to set the gid), then use it if
2173 	 *	the process is privileged, belongs to the target group,
2174 	 *	or the group is the same as the parent directory.
2175 	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
2176 	 *	GRPID option, and the directory's set-gid bit is clear,
2177 	 *	then use the process's gid.
2178 	 *   3) Otherwise, set the group-id to the gid of the parent directory.
2179 	 */
2180 	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
2181 	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
2182 	    secpolicy_vnode_create_gid(cr) == 0)) {
2183 		/*
2184 		 * XXX - is this only the case when a 4.0 NFS client, or a
2185 		 * client derived from that code, makes a call over the wire?
2186 		 */
2187 		ip->i_gid = vap->va_gid;
2188 	} else
2189 		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);
2190 
2191 	/*
2192 	 * For SunOS 5.0->5.4, the lines below read:
2193 	 *
2194 	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
2195 	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
2196 	 *
2197 	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
2198 	 */
2199 	ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
2200 		UID_LONG : ip->i_uid;
2201 	ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
2202 		GID_LONG : ip->i_gid;
2203 
2204 	/*
2205 	 * If we're creating a directory, and the parent directory has the
2206 	 * set-GID bit set, set it on the new directory.
2207 	 * Otherwise, if the user is neither privileged nor a member of the
2208 	 * file's new group, clear the file's set-GID bit.
2209 	 */
2210 	if ((tdp->i_mode & ISGID) && (type == VDIR))
2211 		ip->i_mode |= ISGID;
2212 	else {
2213 		if ((ip->i_mode & ISGID) &&
2214 		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
2215 			ip->i_mode &= ~ISGID;
2216 	}
2217 
2218 	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2219 	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2220 		err = EOVERFLOW;
2221 		goto fail;
2222 	}
2223 
2224 	/*
2225 	 * Extended attribute directories are not subject to quotas.
2226 	 */
2227 	if (op != DE_ATTRDIR)
2228 		ip->i_dquot = getinoquota(ip);
2229 	else
2230 		ip->i_dquot = NULL;
2231 
2232 	if (op == DE_MKDIR || op == DE_ATTRDIR) {
2233 		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
2234 		if (err)
2235 			goto fail;
2236 	}
2237 
2238 	/*
2239 	 * generate the shadow inode and attach it to the new object
2240 	 */
2241 	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
2242 	    (!tdp->i_shadow && !tdp->i_ufs_acl));
2243 	if (tdp->i_shadow && tdp->i_ufs_acl &&
2244 	    (((tdp->i_mode & IFMT) == IFDIR) ||
2245 	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
2246 		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
2247 		if (err) {
2248 			if (op == DE_MKDIR) {
2249 				/*
2250 				 * clean up parent directory
2251 				 *
2252 				 * tdp->i_contents already locked from
2253 				 * ufs_direnter_*()
2254 				 */
2255 				tdp->i_nlink--;
2256 				TRANS_INODE(tdp->i_ufsvfs, tdp);
2257 				tdp->i_flag |= ICHG;
2258 				tdp->i_seq++;
2259 				ufs_iupdat(tdp, I_SYNC);
2260 			}
2261 			goto fail;
2262 		}
2263 	}
2264 
2265 	/*
2266 	 * If the passed in attributes contain atime and/or mtime
2267 	 * settings, then use them instead of using the current
2268 	 * high resolution time.
2269 	 */
2270 	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
2271 		if (vap->va_mask & AT_ATIME) {
2272 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2273 			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2274 			ip->i_flag &= ~IACC;
2275 		} else
2276 			ip->i_flag |= IACC;
2277 		if (vap->va_mask & AT_MTIME) {
2278 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2279 			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2280 			gethrestime(&now);
2281 			if (now.tv_sec > TIME32_MAX) {
2282 				/*
2283 				 * In 2038, ctime sticks forever..
2284 				 */
2285 				ip->i_ctime.tv_sec = TIME32_MAX;
2286 				ip->i_ctime.tv_usec = 0;
2287 			} else {
2288 				ip->i_ctime.tv_sec = now.tv_sec;
2289 				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2290 			}
2291 			ip->i_flag &= ~(IUPD|ICHG);
2292 			ip->i_flag |= IMODTIME;
2293 		} else
2294 			ip->i_flag |= IUPD|ICHG;
2295 		ip->i_flag |= IMOD;
2296 	} else
2297 		ip->i_flag |= IACC|IUPD|ICHG;
2298 	ip->i_seq++;
2299 
2300 	/*
2301 	 * If this is an attribute tag it as one.
2302 	 */
2303 	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
2304 		ip->i_cflags |= IXATTR;
2305 	}
2306 
2307 	/*
2308 	 * push inode before it's name appears in a directory
2309 	 */
2310 	TRANS_INODE(ip->i_ufsvfs, ip);
2311 	ufs_iupdat(ip, I_SYNC);
2312 	rw_exit(&ip->i_contents);
2313 	return (0);
2314 
2315 fail:
2316 	/* Throw away inode we just allocated. */
2317 	ip->i_nlink = 0;
2318 	ufs_setreclaim(ip);
2319 	TRANS_INODE(ip->i_ufsvfs, ip);
2320 	ip->i_flag |= ICHG;
2321 	ip->i_seq++;
2322 	ITIMES_NOLOCK(ip);
2323 	rw_exit(&ip->i_contents);
2324 	return (err);
2325 }
2326 
2327 /*
2328  * Write a prototype directory into the empty inode ip, whose parent is dp.
2329  */
2330 static int
2331 ufs_dirmakedirect(
2332 	struct inode *ip,		/* new directory */
2333 	struct inode *dp,		/* parent directory */
2334 	int	attrdir,
2335 	struct cred *cr)
2336 {
2337 	struct dirtemplate *dirp;
2338 	struct fbuf *fbp;
2339 	int err;
2340 
2341 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
2342 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2343 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
2344 	/*
2345 	 * Allocate space for the directory we're creating.
2346 	 */
2347 	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
2348 	if (err)
2349 		return (err);
2350 	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
2351 		err = ufs_fault(ITOV(dp),
2352 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
2353 					DIRBLKSIZ, dp->i_fs->fs_fsize,
2354 					dp->i_fs->fs_fsmnt);
2355 		return (err);
2356 	}
2357 	ip->i_size = DIRBLKSIZ;
2358 	TRANS_INODE(ip->i_ufsvfs, ip);
2359 	ip->i_flag |= IUPD|ICHG|IATTCHG;
2360 	ip->i_seq++;
2361 	ITIMES_NOLOCK(ip);
2362 	/*
2363 	 * Update the tdp link count and write out the change.
2364 	 * This reflects the ".." entry we'll soon write.
2365 	 */
2366 	if (dp->i_nlink == MAXLINK)
2367 		return (EMLINK);
2368 	if (attrdir == 0)
2369 		dp->i_nlink++;
2370 	TRANS_INODE(dp->i_ufsvfs, dp);
2371 	dp->i_flag |= ICHG;
2372 	dp->i_seq++;
2373 	ufs_iupdat(dp, I_SYNC);
2374 	/*
2375 	 * Initialize directory with "."
2376 	 * and ".." from static template.
2377 	 *
2378 	 * Since the parent directory is locked, we don't have to
2379 	 * worry about anything changing when we drop the write
2380 	 * lock on (ip).
2381 	 *
2382 	 */
2383 	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
2384 	    S_READ, &fbp);
2385 
2386 	if (err) {
2387 		goto fail;
2388 	}
2389 	dirp = (struct dirtemplate *)fbp->fb_addr;
2390 	/*
2391 	 * Now initialize the directory we're creating
2392 	 * with the "." and ".." entries.
2393 	 */
2394 	*dirp = mastertemplate;			/* structure assignment */
2395 	dirp->dot_ino = (uint32_t)ip->i_number;
2396 	dirp->dotdot_ino = (uint32_t)dp->i_number;
2397 
2398 	err = TRANS_DIR(ip, 0);
2399 	if (err) {
2400 		fbrelse(fbp, S_OTHER);
2401 		goto fail;
2402 	}
2403 
2404 	err = ufs_fbwrite(fbp, ip);
2405 	if (err) {
2406 		goto fail;
2407 	}
2408 
2409 	return (0);
2410 
2411 fail:
2412 	if (attrdir == 0)
2413 		dp->i_nlink--;
2414 	TRANS_INODE(dp->i_ufsvfs, dp);
2415 	dp->i_flag |= ICHG;
2416 	dp->i_seq++;
2417 	ufs_iupdat(dp, I_SYNC);
2418 	return (err);
2419 }
2420 
2421 /*
2422  * Delete a directory entry.  If oip is nonzero the entry is checked
2423  * to make sure it still reflects oip.
2424  *
2425  * If vpp is non-null, return the ptr of the (held) vnode associated with
2426  * the removed name.  The caller is responsible for doing the VN_RELE().
2427  */
2428 int
2429 ufs_dirremove(
2430 	struct inode *dp,
2431 	char *namep,
2432 	struct inode *oip,
2433 	struct vnode *cdir,
2434 	enum dr_op op,
2435 	struct cred *cr,
2436 	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
2437 {
2438 	struct direct *ep, *pep, *nep;
2439 	struct inode *ip;
2440 	vnode_t *dvp, *vp;
2441 	struct ufs_slot slot;
2442 	int namlen;
2443 	int err;
2444 	int mode;
2445 	ushort_t extra;
2446 
2447 	namlen = (int)strlen(namep);
2448 	if (namlen == 0)
2449 		return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0"));
2450 	/*
2451 	 * return error when removing . and ..
2452 	 */
2453 	if (namep[0] == '.') {
2454 		if (namlen == 1)
2455 			return (EINVAL);
2456 		else if (namlen == 2 && namep[1] == '.') {
2457 			return (EEXIST);	/* SIGH should be ENOTEMPTY */
2458 		}
2459 	}
2460 
2461 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2462 	/*
2463 	 * Check accessibility of directory.
2464 	 */
2465 retry:
2466 	if (((dp->i_mode & IFMT) != IFDIR) &&
2467 	    ((dp->i_mode & IFMT) != IFATTRDIR)) {
2468 		return (ENOTDIR);
2469 	}
2470 
2471 	/*
2472 	 * Execute access is required to search the directory.
2473 	 * Access for write is interpreted as allowing
2474 	 * deletion of files in the directory.
2475 	 */
2476 	if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) {
2477 		return (err);
2478 	}
2479 
2480 	ip = NULL;
2481 	slot.fbp = NULL;
2482 	slot.status = FOUND;	/* don't need to look for empty slot */
2483 	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
2484 	rw_enter(&dp->i_contents, RW_WRITER);
2485 	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
2486 	if (err)
2487 		goto out_novfs;
2488 	if (ip == NULL) {
2489 		err = ENOENT;
2490 		goto out_novfs;
2491 	}
2492 	vp = ITOV(ip);
2493 	if (oip && oip != ip) {
2494 		err = ENOENT;
2495 		goto out_novfs;
2496 	}
2497 
2498 	mode = ip->i_mode & IFMT;
2499 	if (mode == IFDIR || mode == IFATTRDIR) {
2500 
2501 		/*
2502 		 * vn_vfsrlock() prevents races between mount and rmdir.
2503 		 */
2504 		if (vn_vfsrlock(vp)) {
2505 			err = EBUSY;
2506 			goto out_novfs;
2507 		}
2508 		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
2509 			err = EBUSY;
2510 			goto out;
2511 		}
2512 		/*
2513 		 * If we are removing a directory, get a lock on it.
2514 		 * Taking a writer lock prevents a parallel ufs_dirlook from
2515 		 * incorrectly entering a negative cache vnode entry in the dnlc
2516 		 * If the directory is empty, it will stay empty until
2517 		 * we can remove it.
2518 		 */
2519 		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
2520 			/*
2521 			 * It is possible that a thread in rename would have
2522 			 * acquired this rwlock. To prevent a deadlock we
2523 			 * do a rw_tryenter. If we fail to get the lock
2524 			 * we drop all the locks we have acquired, wait
2525 			 * for 2 ticks and reacquire the
2526 			 * directory's (dp) i_rwlock and try again.
2527 			 * If we dont drop dp's i_rwlock then we will panic
2528 			 * with a "Deadlock: cycle in blocking chain"
2529 			 * since in ufs_dircheckpath we want dp's i_rwlock.
2530 			 * dp is guaranteed to exist since ufs_dirremove is
2531 			 * called after a VN_HOLD(dp) has been done.
2532 			 */
2533 			ufs_dirremove_retry_cnt++;
2534 			vn_vfsunlock(vp);
2535 			if (slot.fbp)
2536 				fbrelse(slot.fbp, S_OTHER);
2537 			rw_exit(&dp->i_contents);
2538 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
2539 			rw_exit(&dp->i_rwlock);
2540 			VN_RELE(vp);
2541 			delay(2);
2542 			rw_enter(&dp->i_rwlock, RW_WRITER);
2543 			goto retry;
2544 		}
2545 	}
2546 	rw_enter(&ip->i_contents, RW_READER);
2547 
2548 	/*
2549 	 * Now check the restrictions that apply on sticky directories.
2550 	 */
2551 	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
2552 		rw_exit(&ip->i_contents);
2553 		if (mode == IFDIR || mode == IFATTRDIR)
2554 			rw_exit(&ip->i_rwlock);
2555 		goto out;
2556 	}
2557 
2558 	if (op == DR_RMDIR) {
2559 		/*
2560 		 * For rmdir(2), some special checks are required.
2561 		 * (a) Don't remove any alias of the parent (e.g. ".").
2562 		 * (b) Don't remove the current directory.
2563 		 * (c) Make sure the entry is (still) a directory.
2564 		 * (d) Make sure the directory is empty.
2565 		 */
2566 
2567 		if (dp == ip || vp == cdir)
2568 			err = EINVAL;
2569 		else if (((ip->i_mode & IFMT) != IFDIR) &&
2570 		    ((ip->i_mode & IFMT) != IFATTRDIR))
2571 			err = ENOTDIR;
2572 		else if ((ip->i_nlink > 2) ||
2573 		    !ufs_dirempty(ip, dp->i_number, cr)) {
2574 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
2575 		}
2576 
2577 		if (err) {
2578 			rw_exit(&ip->i_contents);
2579 			if (mode == IFDIR || mode == IFATTRDIR)
2580 				rw_exit(&ip->i_rwlock);
2581 			goto out;
2582 		}
2583 	} else if (op == DR_REMOVE)  {
2584 		/*
2585 		 * unlink(2) requires a different check: allow only
2586 		 * privileged users to unlink a directory.
2587 		 */
2588 		if (vp->v_type == VDIR &&
2589 		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
2590 			err = EPERM;
2591 			rw_exit(&ip->i_contents);
2592 			rw_exit(&ip->i_rwlock);
2593 			goto out;
2594 		}
2595 	}
2596 
2597 	rw_exit(&ip->i_contents);
2598 
2599 	/*
2600 	 * Remove the cache'd entry, if any.
2601 	 */
2602 	dvp = ITOV(dp);
2603 	dnlc_remove(dvp, namep);
2604 	ep = slot.ep;
2605 	ep->d_ino = 0;
2606 
2607 	if (slot.cached) {
2608 		dcanchor_t *dcap = &dp->i_danchor;
2609 
2610 		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
2611 		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
2612 			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
2613 		}
2614 		if (slot.offset & (DIRBLKSIZ - 1)) {
2615 			/*
2616 			 * Collapse new free space into previous entry.
2617 			 * Note, the previous entry has already been
2618 			 * validated in ufs_dircheckforname().
2619 			 */
2620 			ASSERT(slot.size);
2621 			pep = (struct direct *)((char *)ep - slot.size);
2622 			if ((pep->d_ino == 0) &&
2623 			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
2624 				dnlc_dir_purge(dcap);
2625 				slot.cached = 0;
2626 				goto nocache;
2627 			}
2628 			if (pep->d_ino) {
2629 				extra = pep->d_reclen - DIRSIZ(pep);
2630 			} else {
2631 				extra = pep->d_reclen;
2632 			}
2633 			if (extra >= LDIRSIZ(1)) {
2634 				(void) dnlc_dir_rem_space_by_handle(dcap,
2635 				    (uint64_t)(slot.offset - slot.size));
2636 			}
2637 			pep->d_reclen += ep->d_reclen;
2638 			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
2639 				(uint64_t)(slot.offset - slot.size));
2640 			/* adjust the previous pointer in the next entry */
2641 			nep = (struct direct *)((char *)ep + ep->d_reclen);
2642 			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
2643 				/*
2644 				 * Not a new block.
2645 				 *
2646 				 * Check the validity of the entry.
2647 				 * If it's bad, then throw away the cache and
2648 				 * continue.
2649 				 */
2650 				if ((nep->d_reclen == 0) ||
2651 				    (nep->d_reclen & 0x3) ||
2652 				    (dnlc_dir_update(dcap, nep->d_name,
2653 				    INO_OFF_TO_H(nep->d_ino,
2654 				    slot.offset - slot.size)) == DNOENT)) {
2655 					dnlc_dir_purge(dcap);
2656 					slot.cached = 0;
2657 				}
2658 			}
2659 		} else {
2660 			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
2661 			(uint64_t)slot.offset);
2662 		}
2663 	} else {
2664 		/*
2665 		 * If the entry isn't the first in the directory, we must
2666 		 * reclaim the space of the now empty record by adding
2667 		 * the record size to the size of the previous entry.
2668 		 */
2669 		if (slot.offset & (DIRBLKSIZ - 1)) {
2670 			/*
2671 			 * Collapse new free space into previous entry.
2672 			 */
2673 			pep = (struct direct *)((char *)ep - slot.size);
2674 			pep->d_reclen += ep->d_reclen;
2675 		}
2676 	}
2677 nocache:
2678 
2679 
2680 	err = TRANS_DIR(dp, slot.offset);
2681 	if (err)
2682 		fbrelse(slot.fbp, S_OTHER);
2683 	else
2684 		err = ufs_fbwrite(slot.fbp, dp);
2685 	slot.fbp = NULL;
2686 
2687 	/*
2688 	 * If we were removing a directory, it is 'gone' now, but we cannot
2689 	 * unlock it as a thread may be waiting for the lock in ufs_create. If
2690 	 * we did, it could then create a file in a deleted directory.
2691 	 */
2692 
2693 	if (err) {
2694 		if (mode == IFDIR || mode == IFATTRDIR)
2695 			rw_exit(&ip->i_rwlock);
2696 		goto out;
2697 	}
2698 
2699 	rw_enter(&ip->i_contents, RW_WRITER);
2700 
2701 	dp->i_flag |= IUPD|ICHG;
2702 	dp->i_seq++;
2703 	ip->i_flag |= ICHG;
2704 	ip->i_seq++;
2705 
2706 	TRANS_INODE(dp->i_ufsvfs, dp);
2707 	TRANS_INODE(ip->i_ufsvfs, ip);
2708 	/*
2709 	 * Now dispose of the inode.
2710 	 */
2711 	if (ip->i_nlink > 0) {
2712 		/*
2713 		 * This is not done for IFATTRDIR's because they don't
2714 		 * have entries in the dnlc and the link counts are
2715 		 * not incremented when they are created.
2716 		 */
2717 		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
2718 			/*
2719 			 * Decrement by 2 because we're trashing the "."
2720 			 * entry as well as removing the entry in dp.
2721 			 * Clear the directory entry, but there may be
2722 			 * other hard links so don't free the inode.
2723 			 * Decrement the dp linkcount because we're
2724 			 * trashing the ".." entry.
2725 			 */
2726 			ip->i_nlink -= 2;
2727 			dp->i_nlink--;
2728 			ufs_setreclaim(dp);
2729 			/*
2730 			 * XXX need to discard negative cache entries
2731 			 * for vp.  See comment in ufs_delete().
2732 			 */
2733 			dnlc_remove(vp, ".");
2734 			dnlc_remove(vp, "..");
2735 			/*
2736 			 * The return value is ignored here bacause if
2737 			 * the directory purge fails we don't want to
2738 			 * stop the delete. If ufs_dirpurgedotdot fails
2739 			 * the delete will continue with the preexiting
2740 			 * behavior.
2741 			 */
2742 			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
2743 		} else {
2744 			ip->i_nlink--;
2745 		}
2746 		ufs_setreclaim(ip);
2747 	}
2748 	ITIMES_NOLOCK(dp);
2749 	ITIMES_NOLOCK(ip);
2750 
2751 	if (!TRANS_ISTRANS(dp->i_ufsvfs))
2752 		ufs_iupdat(dp, I_SYNC);
2753 	if (!TRANS_ISTRANS(ip->i_ufsvfs))
2754 		ufs_iupdat(ip, I_SYNC);
2755 
2756 	rw_exit(&ip->i_contents);
2757 	if (mode == IFDIR || mode == IFATTRDIR)
2758 		rw_exit(&ip->i_rwlock);
2759 out:
2760 	if (mode == IFDIR || mode == IFATTRDIR) {
2761 		vn_vfsunlock(vp);
2762 	}
2763 out_novfs:
2764 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
2765 
2766 	if (slot.fbp)
2767 		fbrelse(slot.fbp, S_OTHER);
2768 
2769 	rw_exit(&dp->i_contents);
2770 	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
2771 
2772 	/*
2773 	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
2774 	 * The caller becomes responsible for the VN_RELE().  Otherwise,
2775 	 * Release (and delete) the inode after we drop vfs_dqrwlock to
2776 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
2777 	 */
2778 	if (ip) {
2779 		if ((err == 0) && (vpp != NULL)) {
2780 			*vpp = ITOV(ip);
2781 		} else {
2782 			VN_RELE(vp);
2783 		}
2784 	}
2785 
2786 	return (err);
2787 }
2788 
2789 /*
2790  * Return buffer with contents of block "offset"
2791  * from the beginning of directory "ip".  If "res"
2792  * is non-zero, fill it in with a pointer to the
2793  * remaining space in the directory.
2794  *
2795  */
2796 
2797 int
2798 blkatoff(
2799 	struct inode *ip,
2800 	off_t offset,
2801 	char **res,
2802 	struct fbuf **fbpp)
2803 {
2804 	struct fs *fs;
2805 	struct fbuf *fbp;
2806 	daddr_t lbn;
2807 	uint_t bsize;
2808 	int err;
2809 
2810 	CPU_STATS_ADD_K(sys, ufsdirblk, 1);
2811 	fs = ip->i_fs;
2812 	lbn = (daddr_t)lblkno(fs, offset);
2813 	bsize = (uint_t)blksize(fs, ip, lbn);
2814 	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
2815 			bsize, S_READ, &fbp);
2816 	if (err) {
2817 		*fbpp = (struct fbuf *)NULL;
2818 		return (err);
2819 	}
2820 	if (res)
2821 		*res = fbp->fb_addr + blkoff(fs, offset);
2822 	*fbpp = fbp;
2823 	return (0);
2824 }
2825 
2826 /*
2827  * Do consistency checking:
2828  *	record length must be multiple of 4
2829  *	entry must fit in rest of its DIRBLKSIZ block
2830  *	record must be large enough to contain entry
2831  *	name is not longer than MAXNAMLEN
2832  *	name must be as long as advertised, and null terminated
2833  * NOTE: record length must not be zero (should be checked previously).
2834  *       This routine is only called if dirchk is true.
2835  *       It would be nice to set the FSBAD flag in the super-block when
2836  *       this routine fails so that a fsck is forced on next reboot,
2837  *       but locking is a problem.
2838  */
2839 static int
2840 dirmangled(
2841 	struct inode *dp,
2842 	struct direct *ep,
2843 	int entryoffsetinblock,
2844 	off_t offset)
2845 {
2846 	int i;
2847 
2848 	i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
2849 	if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
2850 	    (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
2851 	    ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
2852 		dirbad(dp, "mangled entry", offset);
2853 		return (1);
2854 	}
2855 	return (0);
2856 }
2857 
2858 static void
2859 dirbad(struct inode *ip, char *how, off_t offset)
2860 {
2861 	cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
2862 	    ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
2863 }
2864 
/*
 * Verify that the name at sp is exactly l characters long.
 *
 * Returns non-zero if the name is bad: 1 when a null byte shows up within
 * the first l characters, otherwise the value of the byte following them
 * (which is 0, i.e. "good", only when the name is null terminated there).
 */
static int
dirbadname(char *sp, int l)
{
	char *cp;

	for (cp = sp; l != 0; l--, cp++) {	/* check for nulls */
		if (*cp == '\0') {
			return (1);
		}
	}
	return (*cp);			/* check for terminating null */
}
2875 
2876 /*
2877  * Check if a directory is empty or not.
2878  */
2879 static int
2880 ufs_dirempty(
2881 	struct inode *ip,
2882 	ino_t parentino,
2883 	struct cred *cr)
2884 {
2885 	return (ufs_dirscan(ip, parentino, cr, 0));
2886 }
2887 
2888 /*
2889  * clear the .. directory entry.
2890  */
2891 static int
2892 ufs_dirpurgedotdot(
2893 	struct inode *ip,
2894 	ino_t parentino,
2895 	struct cred *cr)
2896 {
2897 	return (ufs_dirscan(ip, parentino, cr, 1));
2898 }
2899 
2900 /*
2901  * Scan the directoy. If clr_dotdot is true clear the ..
2902  * directory else check to see if the directory is empty.
2903  *
2904  * Using a struct dirtemplate here is not precisely
2905  * what we want, but better than using a struct direct.
2906  *
2907  * clr_dotdot is used as a flag to tell us if we need
2908  * to clear the dotdot entry
2909  *
2910  * N.B.: does not handle corrupted directories.
2911  */
2912 static int
2913 ufs_dirscan(
2914 	struct inode *ip,
2915 	ino_t parentino,
2916 	struct cred *cr,
2917 	int clr_dotdot)
2918 {
2919 	offset_t off;
2920 	struct dirtemplate dbuf;
2921 	struct direct *dp = (struct direct *)&dbuf;
2922 	int err, count;
2923 	int empty = 1;	/* Assume it's empty */
2924 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
2925 
2926 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2927 
2928 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
2929 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
2930 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
2931 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
2932 		/*
2933 		 * Since we read MINDIRSIZ, residual must
2934 		 * be 0 unless we're at end of file.
2935 		 */
2936 		if (err || count != 0 || dp->d_reclen == 0) {
2937 			empty = 0;
2938 			break;
2939 		}
2940 		/* skip empty entries */
2941 		if (dp->d_ino == 0)
2942 			continue;
2943 		/* accept only "." and ".." */
2944 		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
2945 			empty = 0;
2946 			break;
2947 		}
2948 		/*
2949 		 * At this point d_namlen must be 1 or 2.
2950 		 * 1 implies ".", 2 implies ".." if second
2951 		 * char is also "."
2952 		 */
2953 		if (dp->d_namlen == 1)
2954 			continue;
2955 		if (dp->d_name[1] == '.' &&
2956 		    (ino_t)dp->d_ino == parentino) {
2957 			/*
2958 			 * If we're doing a purge we need to check for
2959 			 * the . and .. entries and clear the d_ino for ..
2960 			 *
2961 			 * if clr_dotdot is set ufs_dirscan does not
2962 			 * check for an empty directory.
2963 			 */
2964 			if (clr_dotdot) {
2965 				/*
2966 				 * Have to actually zap the ..
2967 				 * entry in the directory, as
2968 				 * otherwise someone might have
2969 				 * dp as its cwd and try to
2970 				 * open .., which now points to
2971 				 * an unallocated inode.
2972 				 */
2973 				empty = ufs_dirclrdotdot(ip, parentino);
2974 				break;
2975 			} else {
2976 				continue;
2977 			}
2978 		}
2979 		empty = 0;
2980 		break;
2981 	}
2982 	return (empty);
2983 }
2984 
2985 clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */
2986 uint64_t dircheck_retry_cnt;
2987 /*
2988  * Check if source directory inode is in the path of the target directory.
2989  * Target is supplied locked.
2990  *
2991  * The source and target inode's should be different upon entry.
2992  */
2993 int
2994 ufs_dircheckpath(
2995 	ino_t source_ino,
2996 	struct inode *target,
2997 	struct inode *sdp,
2998 	struct cred *cr)
2999 {
3000 	struct fbuf *fbp;
3001 	struct dirtemplate *dirp;
3002 	struct inode *ip;
3003 	struct ufsvfs *ufsvfsp;
3004 	struct inode *tip;
3005 	ino_t dotdotino;
3006 	int err;
3007 
3008 	ASSERT(target->i_ufsvfs != NULL);
3009 	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
3010 	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));
3011 
3012 	ip = target;
3013 	if (ip->i_number == source_ino) {
3014 		err = EINVAL;
3015 		goto out;
3016 	}
3017 	if (ip->i_number == UFSROOTINO) {
3018 		err = 0;
3019 		goto out;
3020 	}
3021 	/*
3022 	 * Search back through the directory tree, using the ".." entries.
3023 	 * Fail any attempt to move a directory into an ancestor directory.
3024 	 */
3025 	fbp = NULL;
3026 	for (;;) {
3027 		struct vfs	*vfs;
3028 
3029 		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
3030 		if (err)
3031 			break;
3032 		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
3033 		    ip->i_size < sizeof (struct dirtemplate)) {
3034 			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
3035 			err = ENOTDIR;
3036 			break;
3037 		}
3038 		if (dirp->dotdot_namlen != 2 ||
3039 		    dirp->dotdot_name[0] != '.' ||
3040 		    dirp->dotdot_name[1] != '.') {
3041 			dirbad(ip, "mangled .. entry", (off_t)0);
3042 			err = ENOTDIR;		/* Sanity check */
3043 			break;
3044 		}
3045 		dotdotino = (ino_t)dirp->dotdot_ino;
3046 		if (dotdotino == source_ino) {
3047 			err = EINVAL;
3048 			break;
3049 		}
3050 		if (dotdotino == UFSROOTINO)
3051 			break;
3052 		if (fbp) {
3053 			fbrelse(fbp, S_OTHER);
3054 			fbp = NULL;
3055 		}
3056 		vfs = ip->i_vfs;
3057 		ufsvfsp = ip->i_ufsvfs;
3058 
3059 		if (ip != target) {
3060 			rw_exit(&ip->i_rwlock);
3061 			VN_RELE(ITOV(ip));
3062 		}
3063 		/*
3064 		 * Race to get the inode.
3065 		 */
3066 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3067 		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
3068 			rw_exit(&ufsvfsp->vfs_dqrwlock);
3069 			ip = NULL;
3070 			break;
3071 		}
3072 		rw_exit(&ufsvfsp->vfs_dqrwlock);
3073 		/*
3074 		 * If the directory of the source inode (also a directory)
3075 		 * is the same as this next entry up the chain, then
3076 		 * we know the source directory itself can't be in the
3077 		 * chain. This also prevents a panic because we already
3078 		 * have sdp->i_rwlock locked.
3079 		 */
3080 		if (tip == sdp) {
3081 			VN_RELE(ITOV(tip));
3082 			ip = NULL;
3083 			break;
3084 		}
3085 		ip = tip;
3086 
3087 		/*
3088 		 * If someone has set the WRITE_WANTED bit in this lock and if
3089 		 * this happens to be a sdp or tdp of another parallel rename
3090 		 * which is executing  the same code and in similar situation
3091 		 * we end up in a 4 way deadlock. We need to make sure that
3092 		 * the WRITE_WANTED bit is not  set.
3093 		 */
3094 retry_lock:
3095 		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
3096 			/*
3097 			 * If the lock held as WRITER thats fine but if it
3098 			 * has WRITE_WANTED bit set we might end up in a
3099 			 * deadlock. If WRITE_WANTED is set we return
3100 			 * with EAGAIN else we just go back and try.
3101 			 */
3102 			if (RW_ISWRITER(&ip->i_rwlock) &&
3103 					!(RW_WRITE_HELD(&ip->i_rwlock))) {
3104 				err = EAGAIN;
3105 				if (fbp) {
3106 					fbrelse(fbp, S_OTHER);
3107 				}
3108 				VN_RELE(ITOV(ip));
3109 				return (err);
3110 			} else {
3111 				/*
3112 				 * The lock is being write held. We could
3113 				 * just do a rw_enter here but there is a
3114 				 * window between the check and now, where
3115 				 * the status could have changed, so to
3116 				 * avoid looping we backoff and go back to
3117 				 * try for the lock.
3118 				 */
3119 				delay(retry_backoff_delay);
3120 				dircheck_retry_cnt++;
3121 				goto retry_lock;
3122 			}
3123 		}
3124 	}
3125 	if (fbp) {
3126 		fbrelse(fbp, S_OTHER);
3127 	}
3128 out:
3129 	if (ip) {
3130 		if (ip != target) {
3131 			rw_exit(&ip->i_rwlock);
3132 			VN_RELE(ITOV(ip));
3133 		}
3134 	}
3135 	return (err);
3136 }
3137 
3138 int
3139 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
3140 {
3141 	offset_t off;
3142 	struct dirtemplate dbuf;
3143 	struct direct *dp = (struct direct *)&dbuf;
3144 	int err, count;
3145 	int empty = 1;	/* Assume it's empty */
3146 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
3147 
3148 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3149 
3150 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
3151 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
3152 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
3153 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
3154 		/*
3155 		 * Since we read MINDIRSIZ, residual must
3156 		 * be 0 unless we're at end of file.
3157 		 */
3158 
3159 		if (err || count != 0 || dp->d_reclen == 0) {
3160 			empty = 0;
3161 			break;
3162 		}
3163 		/* skip empty entries */
3164 		if (dp->d_ino == 0)
3165 			continue;
3166 		/*
3167 		 * At this point d_namlen must be 1 or 2.
3168 		 * 1 implies ".", 2 implies ".." if second
3169 		 * char is also "."
3170 		 */
3171 
3172 		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
3173 				(ino_t)dp->d_ino == parentino)
3174 			continue;
3175 
3176 		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
3177 			dp->d_name[1] == '.') {
3178 			continue;
3179 		}
3180 		empty = 0;
3181 		break;
3182 	}
3183 	return (empty);
3184 }
3185 
3186 
3187 /*
3188  * Allocate and initialize a new shadow inode to contain extended attributes.
3189  */
3190 int
3191 ufs_xattrmkdir(
3192 	struct inode *tdp,
3193 	struct inode **ipp,
3194 	int flags,
3195 	struct cred *cr)
3196 {
3197 	struct inode *ip;
3198 	struct vattr va;
3199 	int err;
3200 	int retry = 1;
3201 	struct ufsvfs *ufsvfsp;
3202 	struct ulockfs *ulp;
3203 	int issync;
3204 	int trans_size;
3205 	int dorwlock;		/* 0 = not yet taken, */
3206 				/* 1 = taken outside the transaction, */
3207 				/* 2 = taken inside the transaction */
3208 
3209 	/*
3210 	 * Validate permission to create attribute directory
3211 	 */
3212 
3213 	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) {
3214 		return (err);
3215 	}
3216 
3217 	if (vn_is_readonly(ITOV(tdp)))
3218 		return (EROFS);
3219 
3220 	/*
3221 	 * No need to re-init err after again:, since it's set before
3222 	 * the next use of it.
3223 	 */
3224 again:
3225 	dorwlock = 0;
3226 	va.va_type = VDIR;
3227 	va.va_uid = tdp->i_uid;
3228 	va.va_gid = tdp->i_gid;
3229 
3230 	if ((tdp->i_mode & IFMT) == IFDIR) {
3231 		va.va_mode = (o_mode_t)IFATTRDIR;
3232 		va.va_mode |= tdp->i_mode & 0777;
3233 	} else {
3234 		va.va_mode = (o_mode_t)IFATTRDIR|0700;
3235 		if (tdp->i_mode & 0040)
3236 			va.va_mode |= 0750;
3237 		if (tdp->i_mode & 0004)
3238 			va.va_mode |= 0705;
3239 	}
3240 	va.va_mask = AT_TYPE|AT_MODE;
3241 
3242 	ufsvfsp = tdp->i_ufsvfs;
3243 
3244 	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3245 	if (err)
3246 		return (err);
3247 
3248 	/*
3249 	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
3250 	 * This follows the protocol for read()/write().
3251 	 */
3252 	if (ITOV(tdp)->v_type != VDIR) {
3253 		rw_enter(&tdp->i_rwlock, RW_WRITER);
3254 		dorwlock = 1;
3255 	}
3256 
3257 	if (ulp) {
3258 		trans_size = (int)TOP_MKDIR_SIZE(tdp);
3259 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
3260 	}
3261 
3262 	/*
3263 	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
3264 	 * This follows the protocol established by
3265 	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
3266 	 */
3267 	if (dorwlock == 0) {
3268 		rw_enter(&tdp->i_rwlock, RW_WRITER);
3269 		dorwlock = 2;
3270 	}
3271 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3272 	rw_enter(&tdp->i_contents, RW_WRITER);
3273 
3274 	/*
3275 	 * Suppress out of inodes messages if we will retry.
3276 	 */
3277 	if (retry)
3278 		tdp->i_flag |= IQUIET;
3279 	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
3280 	tdp->i_flag &= ~IQUIET;
3281 
3282 	if (err)
3283 		goto fail;
3284 
3285 	if (flags) {
3286 
3287 		/*
3288 		 * Now attach it to src file.
3289 		 */
3290 
3291 		tdp->i_oeftflag = ip->i_number;
3292 	}
3293 
3294 	ip->i_cflags |= IXATTR;
3295 	ITOV(ip)->v_flag |= V_XATTRDIR;
3296 	TRANS_INODE(ufsvfsp, tdp);
3297 	tdp->i_flag |= ICHG | IUPD;
3298 	tdp->i_seq++;
3299 	ufs_iupdat(tdp, I_SYNC);
3300 	rw_exit(&tdp->i_contents);
3301 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3302 
3303 	rw_enter(&ip->i_rwlock, RW_WRITER);
3304 	rw_enter(&ip->i_contents, RW_WRITER);
3305 	TRANS_INODE(ufsvfsp, ip);
3306 	ip->i_flag |= ICHG| IUPD;
3307 	ip->i_seq++;
3308 	ufs_iupdat(ip, I_SYNC);
3309 	rw_exit(&ip->i_contents);
3310 	rw_exit(&ip->i_rwlock);
3311 	if (dorwlock == 2)
3312 		rw_exit(&tdp->i_rwlock);
3313 	if (ulp) {
3314 		int terr = 0;
3315 
3316 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3317 		ufs_lockfs_end(ulp);
3318 		if (err == 0)
3319 			err = terr;
3320 	}
3321 	if (dorwlock == 1)
3322 		rw_exit(&tdp->i_rwlock);
3323 	*ipp = ip;
3324 	return (err);
3325 
3326 fail:
3327 	rw_exit(&tdp->i_contents);
3328 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3329 	if (dorwlock == 2)
3330 		rw_exit(&tdp->i_rwlock);
3331 	if (ulp) {
3332 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3333 		ufs_lockfs_end(ulp);
3334 	}
3335 	if (dorwlock == 1)
3336 		rw_exit(&tdp->i_rwlock);
3337 	if (ip != NULL)
3338 		VN_RELE(ITOV(ip));
3339 
3340 	/*
3341 	 * No inodes?  See if any are tied up in pending deletions.
3342 	 * This has to be done outside of any of the above, because
3343 	 * the draining operation can't be done from inside a transaction.
3344 	 */
3345 	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3346 		ufs_delete_drain_wait(ufsvfsp, 1);
3347 		retry = 0;
3348 		goto again;
3349 	}
3350 
3351 	return (err);
3352 }
3353 
3354 /*
3355  * clear the dotdot directory entry.
3356  * Used by ufs_dirscan when clr_dotdot
3357  * flag is set and we're deleting a
3358  * directory.
3359  */
3360 static int
3361 ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
3362 {
3363 	struct fbuf *fbp;
3364 	struct direct *dotp, *dotdotp;
3365 	int err = 0;
3366 
3367 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
3368 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3369 	err = blkatoff(ip, 0, NULL, &fbp);
3370 	if (err) {
3371 		return (err);
3372 	}
3373 
3374 	dotp = (struct direct *)fbp->fb_addr;
3375 	if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
3376 	    ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
3377 		dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
3378 		if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
3379 		    ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {
3380 
3381 			dotp->d_reclen += dotdotp->d_reclen;
3382 			if (parentino == dotdotp->d_ino) {
3383 				dotdotp->d_ino = 0;
3384 				dotdotp->d_namlen = 0;
3385 				dotdotp->d_reclen = 0;
3386 			}
3387 
3388 			err = TRANS_DIR(ip, 0);
3389 			if (err) {
3390 				fbrelse(fbp, S_OTHER);
3391 			} else {
3392 				err = ufs_fbwrite(fbp, ip);
3393 			}
3394 		}
3395 	} else {
3396 		err = -1;
3397 	}
3398 	return (err);
3399 }
3400