xref: /titanic_52/usr/src/uts/common/fs/ufs/ufs_thread.c (revision 9db67a327daf1243e630c20b81978ffd2a7baad7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include <sys/types.h>
37 #include <sys/systm.h>
38 #include <sys/errno.h>
39 #include <sys/kmem.h>
40 #include <sys/buf.h>
41 #include <sys/vnode.h>
42 #include <sys/vfs.h>
43 #include <sys/user.h>
44 #include <sys/callb.h>
45 #include <sys/cpuvar.h>
46 #include <sys/fs/ufs_inode.h>
47 #include <sys/fs/ufs_log.h>
48 #include <sys/fs/ufs_trans.h>
49 #include <sys/fs/ufs_acl.h>
50 #include <sys/fs/ufs_bio.h>
51 #include <sys/fs/ufs_fsdir.h>
52 #include <sys/debug.h>
53 #include <sys/cmn_err.h>
54 #include <sys/sysmacros.h>
55 
56 extern pri_t 			minclsyspri;
57 extern int			hash2ints();
58 extern struct kmem_cache	*inode_cache;	/* cache of free inodes */
59 extern int			ufs_idle_waiters;
60 extern struct instats		ins;
61 
62 static void ufs_attr_purge(struct inode *);
63 
64 /*
65  * initialize a thread's queue struct
66  */
67 void
68 ufs_thread_init(struct ufs_q *uq, int lowat)
69 {
70 	bzero((caddr_t)uq, sizeof (*uq));
71 	cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL);
72 	mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL);
73 	uq->uq_lowat = lowat;
74 	uq->uq_hiwat = 2 * lowat;
75 	uq->uq_threadp = NULL;
76 }
77 
78 /*
79  * start a thread for a queue (assumes success)
80  */
81 void
82 ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp)
83 {
84 	mutex_enter(&uq->uq_mutex);
85 	if (uq->uq_threadp == NULL) {
86 		uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0,
87 		    TS_RUN, minclsyspri);
88 		uq->uq_flags = 0;
89 	}
90 	mutex_exit(&uq->uq_mutex);
91 }
92 
93 /*
94  * wait for the thread to exit
95  */
96 void
97 ufs_thread_exit(struct ufs_q *uq)
98 {
99 	kt_did_t ufs_thread_did = 0;
100 
101 	mutex_enter(&uq->uq_mutex);
102 	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
103 	if (uq->uq_threadp != NULL) {
104 		ufs_thread_did = uq->uq_threadp->t_did;
105 		uq->uq_flags |= (UQ_EXIT|UQ_WAIT);
106 		cv_broadcast(&uq->uq_cv);
107 	}
108 	mutex_exit(&uq->uq_mutex);
109 
110 	/*
111 	 * It's safe to call thread_join() with an already-gone
112 	 * t_did, but we have to obtain it before the kernel
113 	 * thread structure is freed. We do so above under the
114 	 * protection of the uq_mutex when we're sure the thread
115 	 * still exists and it's save to de-reference it.
116 	 * We also have to check if ufs_thread_did is != 0
117 	 * before calling thread_join() since thread 0 in the system
118 	 * gets a t_did of 0.
119 	 */
120 	if (ufs_thread_did)
121 		thread_join(ufs_thread_did);
122 }
123 
124 /*
125  * wait for a thread to suspend itself on the caller's behalf
126  *	the caller is responsible for continuing the thread
127  */
128 void
129 ufs_thread_suspend(struct ufs_q *uq)
130 {
131 	mutex_enter(&uq->uq_mutex);
132 	if (uq->uq_threadp != NULL) {
133 		/*
134 		 * wait while another thread is suspending this thread.
135 		 * no need to do a cv_broadcast(), as whoever suspended
136 		 * the thread must continue it at some point.
137 		 */
138 		while ((uq->uq_flags & UQ_SUSPEND) &&
139 		    (uq->uq_threadp != NULL)) {
140 			/*
141 			 * We can't use cv_signal() because if our
142 			 * signal doesn't happen to hit the desired
143 			 * thread but instead some other waiter like
144 			 * ourselves, we'll wait forever for a
145 			 * response.  Well, at least an indeterminate
146 			 * amount of time until we just happen to get
147 			 * lucky from whomever did get signalled doing
148 			 * a cv_signal() of their own.  This is an
149 			 * unfortunate performance lossage.
150 			 */
151 			uq->uq_flags |= UQ_WAIT;
152 			cv_wait(&uq->uq_cv, &uq->uq_mutex);
153 		}
154 
155 		uq->uq_flags |= (UQ_SUSPEND | UQ_WAIT);
156 
157 		/*
158 		 * wait for the thread to suspend itself
159 		 */
160 		if ((uq->uq_flags & UQ_SUSPENDED) == 0 &&
161 		    (uq->uq_threadp != NULL)) {
162 			cv_broadcast(&uq->uq_cv);
163 		}
164 
165 		while (((uq->uq_flags & UQ_SUSPENDED) == 0) &&
166 		    (uq->uq_threadp != NULL)) {
167 			cv_wait(&uq->uq_cv, &uq->uq_mutex);
168 		}
169 	}
170 	mutex_exit(&uq->uq_mutex);
171 }
172 
173 /*
174  * allow a thread to continue from a ufs_thread_suspend()
175  *	This thread must be the same as the thread that called
176  *	ufs_thread_suspend.
177  */
178 void
179 ufs_thread_continue(struct ufs_q *uq)
180 {
181 	mutex_enter(&uq->uq_mutex);
182 	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
183 	cv_broadcast(&uq->uq_cv);
184 	mutex_exit(&uq->uq_mutex);
185 }
186 
187 /*
188  * some common code for managing a threads execution
189  *	uq is locked at entry and return
190  *	may sleep
191  *	may exit
192  */
193 /*
194  * Kind of a hack passing in the callb_cpr_t * here.
195  * It should really be part of the ufs_q structure.
196  * I did not put it in there because we are already in beta
197  * and I was concerned that changing ufs_inode.h to include
198  * callb.h might break something.
199  */
200 int
201 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop)
202 {
203 again:
204 	ASSERT(uq->uq_ne >= 0);
205 
206 	if (uq->uq_flags & UQ_SUSPEND) {
207 		uq->uq_flags |= UQ_SUSPENDED;
208 	} else if (uq->uq_flags & UQ_EXIT) {
209 		/*
210 		 * exiting; empty the queue (may infinite loop)
211 		 */
212 		if (uq->uq_ne)
213 			return (uq->uq_ne);
214 		uq->uq_threadp = NULL;
215 		if (uq->uq_flags & UQ_WAIT) {
216 			cv_broadcast(&uq->uq_cv);
217 		}
218 		uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT);
219 		CALLB_CPR_EXIT(cprinfop);
220 		thread_exit();
221 	} else if (uq->uq_ne >= uq->uq_lowat) {
222 		/*
223 		 * process a block of entries until below high water mark
224 		 */
225 		return (uq->uq_ne - (uq->uq_lowat >> 1));
226 	}
227 	if (uq->uq_flags & UQ_WAIT) {
228 		uq->uq_flags &= ~UQ_WAIT;
229 		cv_broadcast(&uq->uq_cv);
230 	}
231 	CALLB_CPR_SAFE_BEGIN(cprinfop);
232 	cv_wait(&uq->uq_cv, &uq->uq_mutex);
233 	CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex);
234 	goto again;
235 }
236 
237 /*
238  * DELETE INODE
239  * The following routines implement the protocol for freeing the resources
240  * held by an idle and deleted inode.
241  */
242 void
243 ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs)
244 {
245 	ushort_t	mode;
246 	struct vnode	*vp	= ITOV(ip);
247 	struct ulockfs	*ulp;
248 	int		trans_size;
249 	int		dorwlock = ((ip->i_mode & IFMT) == IFREG);
250 	int		issync;
251 	int		err;
252 	struct inode	*dp;
253 	struct ufs_q    *delq = &ufsvfsp->vfs_delete;
254 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
255 
256 	/*
257 	 * not on a trans device or not part of a transaction
258 	 */
259 	ASSERT(!TRANS_ISTRANS(ufsvfsp) ||
260 	    ((curthread->t_flag & T_DONTBLOCK) == 0));
261 
262 	/*
263 	 * Ignore if deletes are not allowed (wlock/hlock)
264 	 */
265 	if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
266 		mutex_enter(&delq->uq_mutex);
267 		delq_info->delq_unreclaimed_blocks -= ip->i_blocks;
268 		delq_info->delq_unreclaimed_files--;
269 		mutex_exit(&delq->uq_mutex);
270 		VN_RELE(vp);
271 		return;
272 	}
273 
274 	if ((vp->v_count > 1) || (ip->i_mode == 0)) {
275 		mutex_enter(&delq->uq_mutex);
276 		delq_info->delq_unreclaimed_blocks -= ip->i_blocks;
277 		delq_info->delq_unreclaimed_files--;
278 		mutex_exit(&delq->uq_mutex);
279 		VN_RELE(vp);
280 		return;
281 	}
282 	/*
283 	 * If we are called as part of setting a fs lock, then only
284 	 * do part of the lockfs protocol.  In other words, don't hang.
285 	 */
286 	if (dolockfs) {
287 		if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK))
288 			return;
289 	} else {
290 		/*
291 		 * check for recursive VOP call
292 		 */
293 		if (curthread->t_flag & T_DONTBLOCK) {
294 			ulp = NULL;
295 		} else {
296 			ulp = &ufsvfsp->vfs_ulockfs;
297 			curthread->t_flag |= T_DONTBLOCK;
298 		}
299 	}
300 
301 	/*
302 	 * Hold rwlock to synchronize with (nfs) writes
303 	 */
304 	if (dorwlock)
305 		rw_enter(&ip->i_rwlock, RW_WRITER);
306 
307 	/*
308 	 * Delete the attribute directory.
309 	 */
310 	if (ip->i_oeftflag != 0) {
311 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
312 		    trans_size = (int)TOP_REMOVE_SIZE(ip));
313 		rw_enter(&ip->i_contents, RW_WRITER);
314 		err = ufs_iget(ip->i_vfs, ip->i_oeftflag,
315 		    &dp, CRED());
316 		if (err == 0) {
317 			rw_enter(&dp->i_rwlock, RW_WRITER);
318 			rw_enter(&dp->i_contents, RW_WRITER);
319 			dp->i_flag |= IUPD|ICHG;
320 			dp->i_seq++;
321 			TRANS_INODE(dp->i_ufsvfs, dp);
322 			dp->i_nlink -= 2;
323 			ufs_setreclaim(dp);
324 			/*
325 			 * Should get rid of any negative cache entries that
326 			 * might be lingering, as well as ``.'' and
327 			 * ``..''.  If we don't, the VN_RELE() below
328 			 * won't actually put dp on the delete queue
329 			 * and it'll hang out until someone forces it
330 			 * (lockfs -f, umount, ...).  The only reliable
331 			 * way of doing this at the moment is to call
332 			 * dnlc_purge_vp(ITOV(dp)), which is unacceptably
333 			 * slow, so we'll just note the problem in this
334 			 * comment for now.
335 			 */
336 			dnlc_remove(ITOV(dp), ".");
337 			dnlc_remove(ITOV(dp), "..");
338 			ITIMES_NOLOCK(dp);
339 			if (!TRANS_ISTRANS(ufsvfsp)) {
340 				ufs_iupdat(dp, I_SYNC);
341 			}
342 			rw_exit(&dp->i_contents);
343 			rw_exit(&dp->i_rwlock);
344 			VN_RELE(ITOV(dp));
345 		}
346 		/*
347 		 * Clear out attribute pointer
348 		 */
349 		ip->i_oeftflag = 0;
350 		rw_exit(&ip->i_contents);
351 		TRANS_END_CSYNC(ufsvfsp, err, issync,
352 		    TOP_REMOVE, trans_size);
353 		dnlc_remove(ITOV(ip), XATTR_DIR_NAME);
354 	}
355 
356 	if ((ip->i_mode & IFMT) == IFATTRDIR) {
357 		ufs_attr_purge(ip);
358 	}
359 
360 	(void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED());
361 
362 	/*
363 	 * the inode's space has been freed; now free the inode
364 	 */
365 	if (ulp) {
366 		trans_size = TOP_IFREE_SIZE(ip);
367 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
368 	}
369 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
370 	rw_enter(&ip->i_contents, RW_WRITER);
371 	TRANS_INODE(ufsvfsp, ip);
372 	mode = ip->i_mode;
373 	ip->i_mode = 0;
374 	ip->i_rdev = 0;
375 	ip->i_ordev = 0;
376 	ip->i_flag |= IMOD;
377 	if (ip->i_ufs_acl) {
378 		(void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED());
379 		ip->i_ufs_acl = NULL;
380 		ip->i_shadow = 0;
381 	}
382 
383 	/*
384 	 * This inode is torn down but still retains it's identity
385 	 * (inode number).  It could get recycled soon so it's best
386 	 * to clean up the vnode just in case.
387 	 */
388 	mutex_enter(&vp->v_lock);
389 	vn_recycle(vp);
390 	mutex_exit(&vp->v_lock);
391 
392 	/*
393 	 * free the inode
394 	 */
395 	ufs_ifree(ip, ip->i_number, mode);
396 	/*
397 	 * release quota resources; can't fail
398 	 */
399 	(void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data,
400 	    /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(),
401 	    (char **)NULL, (size_t *)NULL);
402 	dqrele(ip->i_dquot);
403 	ip->i_dquot = NULL;
404 	ip->i_flag &= ~(IDEL | IDIRECTIO);
405 	ip->i_cflags = 0;
406 	if (!TRANS_ISTRANS(ufsvfsp)) {
407 		ufs_iupdat(ip, I_SYNC);
408 	} else {
409 		mutex_enter(&delq->uq_mutex);
410 		delq_info->delq_unreclaimed_files--;
411 		mutex_exit(&delq->uq_mutex);
412 	}
413 	rw_exit(&ip->i_contents);
414 	rw_exit(&ufsvfsp->vfs_dqrwlock);
415 	if (dorwlock)
416 		rw_exit(&ip->i_rwlock);
417 	VN_RELE(vp);
418 
419 	/*
420 	 * End of transaction
421 	 */
422 	if (ulp) {
423 		TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
424 		if (dolockfs)
425 			ufs_lockfs_end(ulp);
426 		else
427 			curthread->t_flag &= ~T_DONTBLOCK;
428 	}
429 }
430 
431 /*
432  * Create the delete thread and init the delq_info for this fs
433  */
434 void
435 ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat)
436 {
437 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
438 
439 	ufs_thread_init(&ufsvfsp->vfs_delete, lowat);
440 	(void) memset((void *)delq_info, 0, sizeof (*delq_info));
441 }
442 
443 /*
444  * thread that frees up deleted inodes
445  */
446 void
447 ufs_thread_delete(struct vfs *vfsp)
448 {
449 	struct ufsvfs	*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
450 	struct ufs_q	*uq = &ufsvfsp->vfs_delete;
451 	struct inode	*ip;
452 	long		ne;
453 	callb_cpr_t	cprinfo;
454 
455 	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
456 	    "ufsdelete");
457 
458 	mutex_enter(&uq->uq_mutex);
459 again:
460 	/*
461 	 * Sleep until there is work to do.  Only do one entry at
462 	 * a time, to reduce the wait time for checking for a suspend
463 	 * request.  The ?: is for pedantic portability.
464 	 */
465 	ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0;
466 
467 	/*
468 	 * process an entry, if there are any
469 	 */
470 	if (ne && (ip = uq->uq_ihead)) {
471 		/*
472 		 * process first entry on queue.  Assumed conditions are:
473 		 *	ip is held (v_count >= 1)
474 		 *	ip is referenced (i_flag & IREF)
475 		 *	ip is free (i_nlink <= 0)
476 		 */
477 		if ((uq->uq_ihead = ip->i_freef) == ip)
478 			uq->uq_ihead = NULL;
479 		ip->i_freef->i_freeb = ip->i_freeb;
480 		ip->i_freeb->i_freef = ip->i_freef;
481 		ip->i_freef = ip;
482 		ip->i_freeb = ip;
483 		uq->uq_ne--;
484 		mutex_exit(&uq->uq_mutex);
485 		ufs_delete(ufsvfsp, ip, 1);
486 		mutex_enter(&uq->uq_mutex);
487 	}
488 	goto again;
489 }
490 
491 /*
492  * drain ne entries off the delete queue.  As new queue entries may
493  * be added while we're working, ne is interpreted as follows:
494  *
495  * ne > 0   => remove up to ne entries
496  * ne == 0  => remove all entries currently on the queue
497  * ne == -1 => remove entries until the queue is empty
498  */
499 void
500 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs)
501 {
502 	struct ufsvfs	*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
503 	struct ufs_q	*uq;
504 	struct inode	*ip;
505 	int		drain_cnt = 0;
506 	int		done;
507 
508 	/*
509 	 * if forcibly unmounted; ignore
510 	 */
511 	if (ufsvfsp == NULL)
512 		return;
513 
514 	uq = &ufsvfsp->vfs_delete;
515 	mutex_enter(&uq->uq_mutex);
516 	if (ne == 0)
517 		drain_cnt = uq->uq_ne;
518 	else if (ne > 0)
519 		drain_cnt = ne;
520 
521 	/*
522 	 * process up to ne entries
523 	 */
524 
525 	done = 0;
526 	while (!done && (ip = uq->uq_ihead)) {
527 		if (ne != -1)
528 			drain_cnt--;
529 		if (ne != -1 && drain_cnt == 0)
530 			done = 1;
531 		if ((uq->uq_ihead = ip->i_freef) == ip)
532 			uq->uq_ihead = NULL;
533 		ip->i_freef->i_freeb = ip->i_freeb;
534 		ip->i_freeb->i_freef = ip->i_freef;
535 		ip->i_freef = ip;
536 		ip->i_freeb = ip;
537 		uq->uq_ne--;
538 		mutex_exit(&uq->uq_mutex);
539 		ufs_delete(ufsvfsp, ip, dolockfs);
540 		mutex_enter(&uq->uq_mutex);
541 	}
542 	mutex_exit(&uq->uq_mutex);
543 }
544 
545 void
546 ufs_sync_with_thread(struct ufs_q *uq)
547 {
548 	mutex_enter(&uq->uq_mutex);
549 
550 	/*
551 	 * Wake up delete thread to free up space.
552 	 */
553 	if ((uq->uq_flags & UQ_WAIT) == 0) {
554 		uq->uq_flags |= UQ_WAIT;
555 		cv_broadcast(&uq->uq_cv);
556 	}
557 
558 	while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) {
559 		cv_wait(&uq->uq_cv, &uq->uq_mutex);
560 	}
561 
562 	mutex_exit(&uq->uq_mutex);
563 }
564 
565 /*
566  * Get rid of everything that's currently in the delete queue,
567  * plus whatever the delete thread is working on at the moment.
568  *
569  * This ability is required for providing true POSIX semantics
570  * regarding close(2), unlink(2), etc, even when logging is enabled.
571  * The standard requires that the released space be immediately
572  * observable (statvfs(2)) and allocatable (e.g., write(2)).
573  */
574 void
575 ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs)
576 {
577 	struct ufs_q *uq = &ufsvfsp->vfs_delete;
578 	int	error;
579 	struct ufs_q    *delq = &ufsvfsp->vfs_delete;
580 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
581 
582 	/*
583 	 * If there is something on delq or delete thread
584 	 * working on delq.
585 	 */
586 	mutex_enter(&delq->uq_mutex);
587 	if (delq_info->delq_unreclaimed_files > 0) {
588 		mutex_exit(&delq->uq_mutex);
589 		(void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs);
590 		ufs_sync_with_thread(uq);
591 	} else {
592 		ASSERT(delq_info->delq_unreclaimed_files == 0);
593 		mutex_exit(&delq->uq_mutex);
594 		return;
595 	}
596 
597 	/*
598 	 * Commit any outstanding transactions to make sure
599 	 * any canceled freed blocks are available for allocation.
600 	 */
601 	curthread->t_flag |= T_DONTBLOCK;
602 	TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error);
603 	if (!error) {
604 		TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE,
605 		    TOP_COMMIT_SIZE);
606 	}
607 	curthread->t_flag &= ~T_DONTBLOCK;
608 }
609 
610 /*
611  * Adjust the resource usage in a struct statvfs based on
612  * what's in the delete queue.
613  *
614  * We do not consider the impact of ACLs or extended attributes
615  * that may be deleted as a side-effect of deleting a file.
616  * Those are metadata, and their sizes aren't reflected in the
617  * sizes returned by stat(), so this is not a problem.
618  */
619 void
620 ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp)
621 {
622 	struct ufs_q *uq = &ufsvfsp->vfs_delete;
623 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
624 
625 	mutex_enter(&uq->uq_mutex);
626 	/*
627 	 * The blocks accounted for in the delete queue info are
628 	 * counted in DEV_BSIZE chunks, but ufs_statvfs counts in
629 	 * filesystem fragments, so a conversion is required here.
630 	 */
631 	sp->f_bfree += dbtofsb(ufsvfsp->vfs_fs,
632 	    delq_info->delq_unreclaimed_blocks);
633 	sp->f_ffree += delq_info->delq_unreclaimed_files;
634 	mutex_exit(&uq->uq_mutex);
635 }
636 
637 /*
638  * IDLE INODE
639  * The following routines implement the protocol for maintaining an
640  * LRU list of idle inodes and for moving the idle inodes to the
641  * reuse list when the number of allocated inodes exceeds the user
642  * tunable high-water mark (ufs_ninode).
643  */
644 
645 /*
646  * clean an idle inode and move it to the reuse list
647  */
648 static void
649 ufs_idle_free(struct inode *ip)
650 {
651 	int			pages;
652 	int			hno;
653 	kmutex_t		*ihm;
654 	struct ufsvfs		*ufsvfsp	= ip->i_ufsvfs;
655 	struct vnode		*vp		= ITOV(ip);
656 
657 	/*
658 	 * inode is held
659 	 */
660 
661 	/*
662 	 * remember `pages' for stats below
663 	 */
664 	pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR);
665 
666 	/*
667 	 * start the dirty pages to disk and then invalidate them
668 	 * unless the inode is invalid (ISTALE)
669 	 */
670 	if ((ip->i_flag & ISTALE) == 0) {
671 		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE);
672 		(void) TRANS_SYNCIP(ip,
673 		    (TRANS_ISERROR(ufsvfsp)) ? B_INVAL | B_FORCE : B_INVAL,
674 		    I_ASYNC, TOP_SYNCIP_FREE);
675 	}
676 
677 	/*
678 	 * wait for any current ufs_iget to finish and block future ufs_igets
679 	 */
680 	ASSERT(ip->i_number != 0);
681 	hno = INOHASH(ip->i_number);
682 	ihm = &ih_lock[hno];
683 	mutex_enter(ihm);
684 
685 	/*
686 	 * It must be guaranteed that v_count >= 2, otherwise
687 	 * something must be wrong with this vnode already.
688 	 * That is why we use v_count-- instead of VN_RELE().
689 	 * Acquire the vnode lock in case another thread is in
690 	 * VN_RELE().
691 	 */
692 	mutex_enter(&vp->v_lock);
693 
694 	if (vp->v_count < 2)
695 		cmn_err(CE_PANIC,
696 		    "ufs_idle_free: vnode ref count is less than 2");
697 
698 	vp->v_count--;
699 	if ((vp->v_type != VCHR && vn_has_cached_data(vp)) ||
700 	    vp->v_count != 1 ||
701 	    ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)) {
702 		/*
703 		 * Another thread has referenced this inode while
704 		 * we are trying to free it. Call VN_RELE() to
705 		 * release our reference.
706 		 */
707 		mutex_exit(&vp->v_lock);
708 		mutex_exit(ihm);
709 		VN_RELE(vp);
710 	} else {
711 		/*
712 		 * The inode is currently unreferenced and can not
713 		 * acquire further references because it has no pages
714 		 * and the hash is locked.  Inodes acquire references
715 		 * via the hash list or via their pages.
716 		 */
717 
718 		mutex_exit(&vp->v_lock);
719 
720 		/*
721 		 * remove it from the cache
722 		 */
723 		remque(ip);
724 		mutex_exit(ihm);
725 		/*
726 		 * Stale inodes have no valid ufsvfs
727 		 */
728 		if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) {
729 			TRANS_DQRELE(ufsvfsp, ip->i_dquot);
730 			ip->i_dquot = NULL;
731 		}
732 		ufs_si_del(ip);
733 		if (pages) {
734 			CPU_STATS_ADDQ(CPU, sys, ufsipage, 1);
735 		} else {
736 			CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1);
737 		}
738 		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
739 
740 		/*
741 		 * We had better not have a vnode reference count > 1
742 		 * at this point, if we do then something is broken as
743 		 * this inode/vnode acquired a reference underneath of us.
744 		 */
745 		ASSERT(vp->v_count == 1);
746 
747 		ufs_free_inode(ip);
748 	}
749 }
750 
751 /*
752  * this thread processes the global idle queue
753  */
754 iqhead_t *ufs_junk_iq;
755 iqhead_t *ufs_useful_iq;
756 int ufs_njunk_iq = 0;
757 int ufs_nuseful_iq = 0;
758 int ufs_niqhash;
759 int ufs_iqhashmask;
760 struct ufs_q	ufs_idle_q;
761 
762 void
763 ufs_thread_idle(void)
764 {
765 	callb_cpr_t cprinfo;
766 	int i;
767 	int ne;
768 
769 	ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN;
770 	ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */
771 	ufs_iqhashmask = ufs_niqhash - 1;
772 	ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq),
773 	    KM_SLEEP);
774 	ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq),
775 	    KM_SLEEP);
776 
777 	/* Initialize hash queue headers */
778 	for (i = 0; i < ufs_niqhash; i++) {
779 		ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i];
780 		ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i];
781 		ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i];
782 		ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i];
783 	}
784 
785 	CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr,
786 	    "ufsidle");
787 again:
788 	/*
789 	 * Whenever the idle thread is awakened, it repeatedly gives
790 	 * back half of the idle queue until the idle queue falls
791 	 * below lowat.
792 	 */
793 	mutex_enter(&ufs_idle_q.uq_mutex);
794 	if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) {
795 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
796 		cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex);
797 		CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex);
798 	}
799 	mutex_exit(&ufs_idle_q.uq_mutex);
800 
801 	/*
802 	 * Give back 1/2 of the idle queue
803 	 */
804 	ne = ufs_idle_q.uq_ne >> 1;
805 	ins.in_tidles.value.ul += ne;
806 	ufs_idle_some(ne);
807 	goto again;
808 }
809 
810 /*
811  * Reclaim callback for ufs inode cache.
812  * Invoked by the kernel memory allocator when memory gets tight.
813  */
814 /*ARGSUSED*/
815 void
816 ufs_inode_cache_reclaim(void *cdrarg)
817 {
818 	/*
819 	 * If we are low on memory and the idle queue is over its
820 	 * halfway mark, then free 50% of the idle q
821 	 *
822 	 * We don't free all of the idle inodes because the inodes
823 	 * for popular NFS files may have been kicked from the dnlc.
824 	 * The inodes for these files will end up on the idle queue
825 	 * after every NFS access.
826 	 *
827 	 * If we repeatedly push them from the idle queue then
828 	 * NFS users may be unhappy as an extra buf cache operation
829 	 * is incurred for every NFS operation to these files.
830 	 *
831 	 * It's not common, but I have seen it happen.
832 	 *
833 	 */
834 	if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1))
835 		return;
836 	mutex_enter(&ufs_idle_q.uq_mutex);
837 	cv_broadcast(&ufs_idle_q.uq_cv);
838 	mutex_exit(&ufs_idle_q.uq_mutex);
839 }
840 
841 /*
842  * Free up some idle inodes
843  */
844 void
845 ufs_idle_some(int ne)
846 {
847 	int i;
848 	struct inode *ip;
849 	struct vnode *vp;
850 	static int junk_rotor = 0;
851 	static int useful_rotor = 0;
852 
853 	for (i = 0; i < ne; ++i) {
854 		mutex_enter(&ufs_idle_q.uq_mutex);
855 
856 		if (ufs_njunk_iq) {
857 			while (ufs_junk_iq[junk_rotor].i_freef ==
858 			    (inode_t *)&ufs_junk_iq[junk_rotor]) {
859 				junk_rotor = IQNEXT(junk_rotor);
860 			}
861 			ip = ufs_junk_iq[junk_rotor].i_freef;
862 			ASSERT(ip->i_flag & IJUNKIQ);
863 		} else if (ufs_nuseful_iq) {
864 			while (ufs_useful_iq[useful_rotor].i_freef ==
865 			    (inode_t *)&ufs_useful_iq[useful_rotor]) {
866 				useful_rotor = IQNEXT(useful_rotor);
867 			}
868 			ip = ufs_useful_iq[useful_rotor].i_freef;
869 			ASSERT(!(ip->i_flag & IJUNKIQ));
870 		} else {
871 			mutex_exit(&ufs_idle_q.uq_mutex);
872 			return;
873 		}
874 
875 		/*
876 		 * emulate ufs_iget
877 		 */
878 		vp = ITOV(ip);
879 		VN_HOLD(vp);
880 		mutex_exit(&ufs_idle_q.uq_mutex);
881 		rw_enter(&ip->i_contents, RW_WRITER);
882 		/*
883 		 * VN_RELE should not be called if
884 		 * ufs_rmidle returns true, as it will
885 		 * effectively be done in ufs_idle_free.
886 		 */
887 		if (ufs_rmidle(ip)) {
888 			rw_exit(&ip->i_contents);
889 			ufs_idle_free(ip);
890 		} else {
891 			rw_exit(&ip->i_contents);
892 			VN_RELE(vp);
893 		}
894 	}
895 }
896 
897 /*
898  * drain entries for vfsp from the idle queue
899  * vfsp == NULL means drain the entire thing
900  */
901 void
902 ufs_idle_drain(struct vfs *vfsp)
903 {
904 	struct inode	*ip, *nip;
905 	struct inode	*ianchor = NULL;
906 	int		i;
907 
908 	mutex_enter(&ufs_idle_q.uq_mutex);
909 	if (ufs_njunk_iq) {
910 		/* for each hash q */
911 		for (i = 0; i < ufs_niqhash; i++) {
912 			/* search down the hash q */
913 			for (ip = ufs_junk_iq[i].i_freef;
914 			    ip != (inode_t *)&ufs_junk_iq[i];
915 			    ip = ip->i_freef) {
916 				if (ip->i_vfs == vfsp || vfsp == NULL) {
917 					/* found a matching entry */
918 					VN_HOLD(ITOV(ip));
919 					mutex_exit(&ufs_idle_q.uq_mutex);
920 					rw_enter(&ip->i_contents, RW_WRITER);
921 					/*
922 					 * See comments in ufs_idle_some()
923 					 * as we will call ufs_idle_free()
924 					 * after scanning both queues.
925 					 */
926 					if (ufs_rmidle(ip)) {
927 						rw_exit(&ip->i_contents);
928 						ip->i_freef = ianchor;
929 						ianchor = ip;
930 					} else {
931 						rw_exit(&ip->i_contents);
932 						VN_RELE(ITOV(ip));
933 					}
934 					/* restart this hash q */
935 					ip = (inode_t *)&ufs_junk_iq[i];
936 					mutex_enter(&ufs_idle_q.uq_mutex);
937 				}
938 			}
939 		}
940 	}
941 	if (ufs_nuseful_iq) {
942 		/* for each hash q */
943 		for (i = 0; i < ufs_niqhash; i++) {
944 			/* search down the hash q */
945 			for (ip = ufs_useful_iq[i].i_freef;
946 			    ip != (inode_t *)&ufs_useful_iq[i];
947 			    ip = ip->i_freef) {
948 				if (ip->i_vfs == vfsp || vfsp == NULL) {
949 					/* found a matching entry */
950 					VN_HOLD(ITOV(ip));
951 					mutex_exit(&ufs_idle_q.uq_mutex);
952 					rw_enter(&ip->i_contents, RW_WRITER);
953 					/*
954 					 * See comments in ufs_idle_some()
955 					 * as we will call ufs_idle_free()
956 					 * after scanning both queues.
957 					 */
958 					if (ufs_rmidle(ip)) {
959 						rw_exit(&ip->i_contents);
960 						ip->i_freef = ianchor;
961 						ianchor = ip;
962 					} else {
963 						rw_exit(&ip->i_contents);
964 						VN_RELE(ITOV(ip));
965 					}
966 					/* restart this hash q */
967 					ip = (inode_t *)&ufs_useful_iq[i];
968 					mutex_enter(&ufs_idle_q.uq_mutex);
969 				}
970 			}
971 		}
972 	}
973 
974 	mutex_exit(&ufs_idle_q.uq_mutex);
975 	/* no more matching entries, release those we have found (if any) */
976 	for (ip = ianchor; ip; ip = nip) {
977 		nip = ip->i_freef;
978 		ip->i_freef = ip;
979 		ufs_idle_free(ip);
980 	}
981 }
982 
983 /*
984  * RECLAIM DELETED INODES
985  * The following thread scans the file system once looking for deleted files
986  */
987 void
988 ufs_thread_reclaim(struct vfs *vfsp)
989 {
990 	struct ufsvfs		*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
991 	struct ufs_q		*uq	= &ufsvfsp->vfs_reclaim;
992 	struct fs		*fs	= ufsvfsp->vfs_fs;
993 	struct buf		*bp	= 0;
994 	int			err	= 0;
995 	daddr_t			bno;
996 	ino_t			ino;
997 	struct dinode		*dp;
998 	struct inode		*ip;
999 	callb_cpr_t		cprinfo;
1000 
1001 	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
1002 	    "ufsreclaim");
1003 
1004 	/*
1005 	 * mount decided that we don't need a reclaim thread
1006 	 */
1007 	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
1008 		err++;
1009 
1010 	/*
1011 	 * don't reclaim if readonly
1012 	 */
1013 	if (fs->fs_ronly)
1014 		err++;
1015 
1016 	for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) {
1017 
1018 		/*
1019 		 * Check whether we are the target of another
1020 		 * thread having called ufs_thread_exit() or
1021 		 * ufs_thread_suspend().
1022 		 */
1023 		mutex_enter(&uq->uq_mutex);
1024 again:
1025 		if (uq->uq_flags & UQ_EXIT) {
1026 			err++;
1027 			mutex_exit(&uq->uq_mutex);
1028 			break;
1029 		} else if (uq->uq_flags & UQ_SUSPEND) {
1030 			uq->uq_flags |= UQ_SUSPENDED;
1031 			/*
1032 			 * Release the buf before we cv_wait()
1033 			 * otherwise we may deadlock with the
1034 			 * thread that called ufs_thread_suspend().
1035 			 */
1036 			if (bp) {
1037 				brelse(bp);
1038 				bp = 0;
1039 			}
1040 			if (uq->uq_flags & UQ_WAIT) {
1041 				uq->uq_flags &= ~UQ_WAIT;
1042 				cv_broadcast(&uq->uq_cv);
1043 			}
1044 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1045 			cv_wait(&uq->uq_cv, &uq->uq_mutex);
1046 			CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex);
1047 			goto again;
1048 		}
1049 		mutex_exit(&uq->uq_mutex);
1050 
1051 		/*
1052 		 * if we don't already have the buf; get it
1053 		 */
1054 		bno = fsbtodb(fs, itod(fs, ino));
1055 		if ((bp == 0) || (bp->b_blkno != bno)) {
1056 			if (bp)
1057 				brelse(bp);
1058 			bp = UFS_BREAD(ufsvfsp,
1059 			    ufsvfsp->vfs_dev, bno, fs->fs_bsize);
1060 			bp->b_flags |= B_AGE;
1061 		}
1062 		if (bp->b_flags & B_ERROR) {
1063 			err++;
1064 			continue;
1065 		}
1066 		/*
1067 		 * nlink <= 0 and mode != 0 means deleted
1068 		 */
1069 		dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino);
1070 		if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) {
1071 			/*
1072 			 * can't hold the buf (deadlock)
1073 			 */
1074 			brelse(bp);
1075 			bp = 0;
1076 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1077 			/*
1078 			 * iget/iput sequence will put inode on ifree
1079 			 * thread queue if it is idle.  This is a nop
1080 			 * for busy (open, deleted) inodes
1081 			 */
1082 			if (ufs_iget(vfsp, ino, &ip, CRED()))
1083 				err++;
1084 			else
1085 				VN_RELE(ITOV(ip));
1086 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1087 		}
1088 	}
1089 
1090 	if (bp)
1091 		brelse(bp);
1092 	if (!err) {
1093 		/*
1094 		 * reset the reclaiming-bit
1095 		 */
1096 		mutex_enter(&ufsvfsp->vfs_lock);
1097 		fs->fs_reclaim &= ~FS_RECLAIMING;
1098 		mutex_exit(&ufsvfsp->vfs_lock);
1099 		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM);
1100 	}
1101 
1102 	/*
1103 	 * exit the reclaim thread
1104 	 */
1105 	mutex_enter(&uq->uq_mutex);
1106 	uq->uq_threadp = NULL;
1107 	uq->uq_flags &= ~UQ_WAIT;
1108 	cv_broadcast(&uq->uq_cv);
1109 	CALLB_CPR_EXIT(&cprinfo);
1110 	thread_exit();
1111 }
1112 /*
1113  * HLOCK FILE SYSTEM
1114  *	hlock the file system's whose logs have device errors
1115  */
1116 struct ufs_q	ufs_hlock;
1117 /*ARGSUSED*/
1118 void
1119 ufs_thread_hlock(void *ignore)
1120 {
1121 	int		retry;
1122 	callb_cpr_t	cprinfo;
1123 
1124 	CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr,
1125 	    "ufshlock");
1126 
1127 	for (;;) {
1128 		/*
1129 		 * sleep until there is work to do
1130 		 */
1131 		mutex_enter(&ufs_hlock.uq_mutex);
1132 		(void) ufs_thread_run(&ufs_hlock, &cprinfo);
1133 		ufs_hlock.uq_ne = 0;
1134 		mutex_exit(&ufs_hlock.uq_mutex);
1135 		/*
1136 		 * hlock the error'ed fs's
1137 		 *	retry after a bit if another app is doing lockfs stuff
1138 		 */
1139 		do {
1140 			retry = ufs_trans_hlock();
1141 			if (retry) {
1142 				mutex_enter(&ufs_hlock.uq_mutex);
1143 				CALLB_CPR_SAFE_BEGIN(&cprinfo);
1144 				(void) cv_timedwait(&ufs_hlock.uq_cv,
1145 				    &ufs_hlock.uq_mutex, lbolt + hz);
1146 				CALLB_CPR_SAFE_END(&cprinfo,
1147 				    &ufs_hlock.uq_mutex);
1148 				mutex_exit(&ufs_hlock.uq_mutex);
1149 			}
1150 		} while (retry);
1151 	}
1152 }
1153 
1154 static void
1155 ufs_attr_purge(struct inode *dp)
1156 {
1157 	int	err;
1158 	int	error;
1159 	off_t 	dirsize;			/* size of the directory */
1160 	off_t 	offset;	/* offset in the directory */
1161 	int entryoffsetinblk;		/* offset of ep in fbp's buffer */
1162 	struct inode *tp;
1163 	struct fbuf *fbp;	/* pointer to directory block */
1164 	struct direct *ep;	/* directory entry */
1165 	int trans_size;
1166 	int issync;
1167 	struct ufsvfs	*ufsvfsp = dp->i_ufsvfs;
1168 
1169 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1170 
1171 	fbp = NULL;
1172 	dirsize = roundup(dp->i_size, DIRBLKSIZ);
1173 	offset = 0;
1174 	entryoffsetinblk = 0;
1175 
1176 	/*
1177 	 * Purge directory cache
1178 	 */
1179 
1180 	dnlc_dir_purge(&dp->i_danchor);
1181 
1182 	while (offset < dirsize) {
1183 		/*
1184 		 * If offset is on a block boundary,
1185 		 * read the next directory block.
1186 		 * Release previous if it exists.
1187 		 */
1188 		if (blkoff(dp->i_fs, offset) == 0) {
1189 			if (fbp != NULL) {
1190 				fbrelse(fbp, S_OTHER);
1191 			}
1192 
1193 			err = blkatoff(dp, offset, (char **)0, &fbp);
1194 			if (err) {
1195 				goto out;
1196 			}
1197 			entryoffsetinblk = 0;
1198 		}
1199 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1200 		if (ep->d_ino == 0 || (ep->d_name[0] == '.' &&
1201 		    ep->d_name[1] == '\0') ||
1202 		    (ep->d_name[0] == '.' && ep->d_name[1] == '.' &&
1203 		    ep->d_name[2] == '\0')) {
1204 
1205 			entryoffsetinblk += ep->d_reclen;
1206 
1207 		} else {
1208 
1209 			if ((err = ufs_iget(dp->i_vfs, ep->d_ino,
1210 			    &tp, CRED())) != 0) {
1211 				goto out;
1212 			}
1213 
1214 			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
1215 			    trans_size = (int)TOP_REMOVE_SIZE(tp));
1216 
1217 			/*
1218 			 * Delete inode.
1219 			 */
1220 
1221 			dnlc_remove(ITOV(dp), ep->d_name);
1222 
1223 			rw_enter(&tp->i_contents, RW_WRITER);
1224 			tp->i_flag |= ICHG;
1225 			tp->i_seq++;
1226 			TRANS_INODE(tp->i_ufsvfs, tp);
1227 			tp->i_nlink--;
1228 			ufs_setreclaim(tp);
1229 			ITIMES_NOLOCK(tp);
1230 			rw_exit(&tp->i_contents);
1231 
1232 			VN_RELE(ITOV(tp));
1233 			entryoffsetinblk += ep->d_reclen;
1234 			TRANS_END_CSYNC(ufsvfsp, error,
1235 			    issync, TOP_REMOVE, trans_size);
1236 
1237 		}
1238 		offset += ep->d_reclen;
1239 	}
1240 
1241 	if (fbp) {
1242 		fbrelse(fbp, S_OTHER);
1243 	}
1244 
1245 out:
1246 	rw_exit(&ufsvfsp->vfs_dqrwlock);
1247 }
1248