xref: /titanic_50/usr/src/uts/common/fs/ufs/ufs_thread.c (revision 02e56f3f1bfc8d9977bafb8cb5202f576dcded27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 #include <sys/types.h>
38 #include <sys/systm.h>
39 #include <sys/errno.h>
40 #include <sys/kmem.h>
41 #include <sys/buf.h>
42 #include <sys/vnode.h>
43 #include <sys/vfs.h>
44 #include <sys/user.h>
45 #include <sys/callb.h>
46 #include <sys/cpuvar.h>
47 #include <sys/fs/ufs_inode.h>
48 #include <sys/fs/ufs_log.h>
49 #include <sys/fs/ufs_trans.h>
50 #include <sys/fs/ufs_acl.h>
51 #include <sys/fs/ufs_bio.h>
52 #include <sys/fs/ufs_fsdir.h>
53 #include <sys/debug.h>
54 #include <sys/cmn_err.h>
55 #include <sys/sysmacros.h>
56 
57 extern pri_t 			minclsyspri;
58 extern int			hash2ints();
59 extern struct kmem_cache	*inode_cache;	/* cache of free inodes */
60 extern int			ufs_idle_waiters;
61 extern struct instats		ins;
62 
63 static void ufs_attr_purge(struct inode *);
64 
65 /*
66  * initialize a thread's queue struct
67  */
68 void
69 ufs_thread_init(struct ufs_q *uq, int lowat)
70 {
71 	bzero((caddr_t)uq, sizeof (*uq));
72 	cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL);
73 	mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL);
74 	uq->uq_lowat = lowat;
75 	uq->uq_hiwat = 2 * lowat;
76 	uq->uq_threadp = NULL;
77 }
78 
79 /*
80  * start a thread for a queue (assumes success)
81  */
82 void
83 ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp)
84 {
85 	mutex_enter(&uq->uq_mutex);
86 	if (uq->uq_threadp == NULL) {
87 		uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0,
88 		    TS_RUN, minclsyspri);
89 		uq->uq_flags = 0;
90 	}
91 	mutex_exit(&uq->uq_mutex);
92 }
93 
94 /*
95  * wait for the thread to exit
96  */
97 void
98 ufs_thread_exit(struct ufs_q *uq)
99 {
100 	kt_did_t ufs_thread_did = 0;
101 
102 	mutex_enter(&uq->uq_mutex);
103 	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
104 	if (uq->uq_threadp != NULL) {
105 		ufs_thread_did = uq->uq_threadp->t_did;
106 		uq->uq_flags |= (UQ_EXIT|UQ_WAIT);
107 		cv_broadcast(&uq->uq_cv);
108 	}
109 	mutex_exit(&uq->uq_mutex);
110 
111 	/*
112 	 * It's safe to call thread_join() with an already-gone
113 	 * t_did, but we have to obtain it before the kernel
114 	 * thread structure is freed. We do so above under the
115 	 * protection of the uq_mutex when we're sure the thread
116 	 * still exists and it's save to de-reference it.
117 	 * We also have to check if ufs_thread_did is != 0
118 	 * before calling thread_join() since thread 0 in the system
119 	 * gets a t_did of 0.
120 	 */
121 	if (ufs_thread_did)
122 		thread_join(ufs_thread_did);
123 }
124 
125 /*
126  * wait for a thread to suspend itself on the caller's behalf
127  *	the caller is responsible for continuing the thread
128  */
129 void
130 ufs_thread_suspend(struct ufs_q *uq)
131 {
132 	mutex_enter(&uq->uq_mutex);
133 	if (uq->uq_threadp != NULL) {
134 		/*
135 		 * wait while another thread is suspending this thread.
136 		 * no need to do a cv_broadcast(), as whoever suspended
137 		 * the thread must continue it at some point.
138 		 */
139 		while ((uq->uq_flags & UQ_SUSPEND) &&
140 		    (uq->uq_threadp != NULL)) {
141 			/*
142 			 * We can't use cv_signal() because if our
143 			 * signal doesn't happen to hit the desired
144 			 * thread but instead some other waiter like
145 			 * ourselves, we'll wait forever for a
146 			 * response.  Well, at least an indeterminate
147 			 * amount of time until we just happen to get
148 			 * lucky from whomever did get signalled doing
149 			 * a cv_signal() of their own.  This is an
150 			 * unfortunate performance lossage.
151 			 */
152 			uq->uq_flags |= UQ_WAIT;
153 			cv_wait(&uq->uq_cv, &uq->uq_mutex);
154 		}
155 
156 		/*
157 		 * wait for the thread to suspend itself
158 		 */
159 		uq->uq_flags |= UQ_SUSPEND;
160 		while (((uq->uq_flags & UQ_SUSPENDED) == 0) &&
161 		    (uq->uq_threadp != NULL)) {
162 			uq->uq_flags |= UQ_WAIT;
163 			cv_broadcast(&uq->uq_cv);
164 			cv_wait(&uq->uq_cv, &uq->uq_mutex);
165 		}
166 	}
167 	mutex_exit(&uq->uq_mutex);
168 }
169 
170 /*
171  * allow a thread to continue from a ufs_thread_suspend()
172  *	This thread must be the same as the thread that called
173  *	ufs_thread_suspend.
174  */
175 void
176 ufs_thread_continue(struct ufs_q *uq)
177 {
178 	mutex_enter(&uq->uq_mutex);
179 	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
180 	cv_broadcast(&uq->uq_cv);
181 	mutex_exit(&uq->uq_mutex);
182 }
183 
184 /*
185  * some common code for managing a threads execution
186  *	uq is locked at entry and return
187  *	may sleep
188  *	may exit
189  */
190 /*
191  * Kind of a hack passing in the callb_cpr_t * here.
192  * It should really be part of the ufs_q structure.
193  * I did not put it in there because we are already in beta
194  * and I was concerned that changing ufs_inode.h to include
195  * callb.h might break something.
196  */
197 int
198 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop)
199 {
200 again:
201 	ASSERT(uq->uq_ne >= 0);
202 
203 	if (uq->uq_flags & UQ_SUSPEND) {
204 		uq->uq_flags |= UQ_SUSPENDED;
205 	} else if (uq->uq_flags & UQ_EXIT) {
206 		/*
207 		 * exiting; empty the queue (may infinite loop)
208 		 */
209 		if (uq->uq_ne)
210 			return (uq->uq_ne);
211 		uq->uq_threadp = NULL;
212 		if (uq->uq_flags & UQ_WAIT) {
213 			cv_broadcast(&uq->uq_cv);
214 		}
215 		uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT);
216 		CALLB_CPR_EXIT(cprinfop);
217 		thread_exit();
218 	} else if (uq->uq_ne >= uq->uq_lowat) {
219 		/*
220 		 * process a block of entries until below high water mark
221 		 */
222 		return (uq->uq_ne - (uq->uq_lowat >> 1));
223 	} else if (uq->uq_flags & UQ_FASTCLIENTS) {
224 		/*
225 		 * Let the fast acting clients through
226 		 */
227 		return (0);
228 	}
229 	if (uq->uq_flags & UQ_WAIT) {
230 		uq->uq_flags &= ~UQ_WAIT;
231 		cv_broadcast(&uq->uq_cv);
232 	}
233 	CALLB_CPR_SAFE_BEGIN(cprinfop);
234 	cv_wait(&uq->uq_cv, &uq->uq_mutex);
235 	CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex);
236 	goto again;
237 }
238 
239 /*
240  * DELETE INODE
241  * The following routines implement the protocol for freeing the resources
242  * held by an idle and deleted inode.
243  */
244 void
245 ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs)
246 {
247 	ushort_t	mode;
248 	struct vnode	*vp	= ITOV(ip);
249 	struct ulockfs	*ulp;
250 	int		trans_size;
251 	int		dorwlock = ((ip->i_mode & IFMT) == IFREG);
252 	int		issync;
253 	int		err;
254 	struct inode	*dp;
255 	struct ufs_q    *delq = &ufsvfsp->vfs_delete;
256 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
257 
258 	/*
259 	 * not on a trans device or not part of a transaction
260 	 */
261 	ASSERT(!TRANS_ISTRANS(ufsvfsp) ||
262 		((curthread->t_flag & T_DONTBLOCK) == 0));
263 
264 	/*
265 	 * Ignore if deletes are not allowed (wlock/hlock)
266 	 */
267 	if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
268 		VN_RELE(vp);
269 		return;
270 	}
271 
272 	if ((vp->v_count > 1) || (ip->i_mode == 0)) {
273 		VN_RELE(vp);
274 		return;
275 	}
276 	/*
277 	 * If we are called as part of setting a fs lock, then only
278 	 * do part of the lockfs protocol.  In other words, don't hang.
279 	 */
280 	if (dolockfs) {
281 		if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK))
282 			return;
283 	} else {
284 		/*
285 		 * check for recursive VOP call
286 		 */
287 		if (curthread->t_flag & T_DONTBLOCK) {
288 			ulp = NULL;
289 		} else {
290 			ulp = &ufsvfsp->vfs_ulockfs;
291 			curthread->t_flag |= T_DONTBLOCK;
292 		}
293 	}
294 
295 	/*
296 	 * Hold rwlock to synchronize with (nfs) writes
297 	 */
298 	if (dorwlock)
299 		rw_enter(&ip->i_rwlock, RW_WRITER);
300 
301 	/*
302 	 * Delete the attribute directory.
303 	 */
304 	if (ip->i_oeftflag != 0) {
305 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
306 		    trans_size = (int)TOP_REMOVE_SIZE(ip));
307 		rw_enter(&ip->i_contents, RW_WRITER);
308 		err = ufs_iget(ip->i_vfs, ip->i_oeftflag,
309 		    &dp, CRED());
310 		if (err == 0) {
311 			rw_enter(&dp->i_rwlock, RW_WRITER);
312 			rw_enter(&dp->i_contents, RW_WRITER);
313 			dp->i_flag |= IUPD|ICHG;
314 			dp->i_seq++;
315 			TRANS_INODE(dp->i_ufsvfs, dp);
316 			dp->i_nlink -= 2;
317 			ufs_setreclaim(dp);
318 			/*
319 			 * Should get rid of any negative cache entries that
320 			 * might be lingering, as well as ``.'' and
321 			 * ``..''.  If we don't, the VN_RELE() below
322 			 * won't actually put dp on the delete queue
323 			 * and it'll hang out until someone forces it
324 			 * (lockfs -f, umount, ...).  The only reliable
325 			 * way of doing this at the moment is to call
326 			 * dnlc_purge_vp(ITOV(dp)), which is unacceptably
327 			 * slow, so we'll just note the problem in this
328 			 * comment for now.
329 			 */
330 			dnlc_remove(ITOV(dp), ".");
331 			dnlc_remove(ITOV(dp), "..");
332 			ITIMES_NOLOCK(dp);
333 			if (!TRANS_ISTRANS(ufsvfsp)) {
334 				ufs_iupdat(dp, I_SYNC);
335 			}
336 			rw_exit(&dp->i_contents);
337 			rw_exit(&dp->i_rwlock);
338 			VN_RELE(ITOV(dp));
339 		}
340 		/*
341 		 * Clear out attribute pointer
342 		 */
343 		ip->i_oeftflag = 0;
344 		rw_exit(&ip->i_contents);
345 		TRANS_END_CSYNC(ufsvfsp, err, issync,
346 		    TOP_REMOVE, trans_size);
347 		dnlc_remove(ITOV(ip), XATTR_DIR_NAME);
348 	}
349 
350 	if ((ip->i_mode & IFMT) == IFATTRDIR) {
351 		ufs_attr_purge(ip);
352 	}
353 
354 	(void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED());
355 
356 	/*
357 	 * the inode's space has been freed; now free the inode
358 	 */
359 	if (ulp) {
360 		trans_size = TOP_IFREE_SIZE(ip);
361 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
362 	}
363 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
364 	rw_enter(&ip->i_contents, RW_WRITER);
365 	TRANS_INODE(ufsvfsp, ip);
366 	mode = ip->i_mode;
367 	ip->i_mode = 0;
368 	ip->i_rdev = 0;
369 	ip->i_ordev = 0;
370 	ip->i_flag |= IMOD;
371 	if (ip->i_ufs_acl) {
372 		(void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED());
373 		ip->i_ufs_acl = NULL;
374 		ip->i_shadow = 0;
375 	}
376 
377 	/*
378 	 * This inode is torn down but still retains it's identity
379 	 * (inode number).  It could get recycled soon so it's best
380 	 * to clean up the vnode just in case.
381 	 */
382 	mutex_enter(&vp->v_lock);
383 	vn_recycle(vp);
384 	mutex_exit(&vp->v_lock);
385 
386 	/*
387 	 * free the inode
388 	 */
389 	ufs_ifree(ip, ip->i_number, mode);
390 	/*
391 	 * release quota resources; can't fail
392 	 */
393 	(void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data,
394 		/* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(),
395 		(char **)NULL, (size_t *)NULL);
396 	dqrele(ip->i_dquot);
397 	ip->i_dquot = NULL;
398 	ip->i_flag &= ~(IDEL | IDIRECTIO);
399 	ip->i_cflags = 0;
400 	if (!TRANS_ISTRANS(ufsvfsp)) {
401 		ufs_iupdat(ip, I_SYNC);
402 	} else {
403 		mutex_enter(&delq->uq_mutex);
404 		delq_info->delq_unreclaimed_files--;
405 		mutex_exit(&delq->uq_mutex);
406 	}
407 	rw_exit(&ip->i_contents);
408 	rw_exit(&ufsvfsp->vfs_dqrwlock);
409 	if (dorwlock)
410 		rw_exit(&ip->i_rwlock);
411 	VN_RELE(vp);
412 
413 	/*
414 	 * End of transaction
415 	 */
416 	if (ulp) {
417 		TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
418 		if (dolockfs)
419 			ufs_lockfs_end(ulp);
420 		else
421 			curthread->t_flag &= ~T_DONTBLOCK;
422 	}
423 }
424 
425 /*
426  * Create the delete thread and init the delq_info for this fs
427  */
428 void
429 ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat)
430 {
431 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
432 
433 	ufs_thread_init(&ufsvfsp->vfs_delete, lowat);
434 	(void) memset((void *)delq_info, 0, sizeof (*delq_info));
435 	cv_init(&delq_info->delq_fast_cv, NULL, CV_DEFAULT, NULL);
436 }
437 
438 /*
439  * thread that frees up deleted inodes
440  */
441 void
442 ufs_thread_delete(struct vfs *vfsp)
443 {
444 	struct ufsvfs	*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
445 	struct ufs_q	*uq = &ufsvfsp->vfs_delete;
446 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
447 	struct inode	*ip;
448 	long		ne;
449 	callb_cpr_t	cprinfo;
450 
451 	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
452 	    "ufsdelete");
453 
454 	mutex_enter(&uq->uq_mutex);
455 again:
456 	/*
457 	 * Sleep until there is work to do.  Only do one entry at
458 	 * a time, to reduce the wait time for checking for a suspend
459 	 * or fast-client request.  The ?: is for pedantic portability.
460 	 */
461 	ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0;
462 
463 	/*
464 	 * process an entry, if there are any
465 	 */
466 	if (ne && (ip = uq->uq_ihead)) {
467 		/*
468 		 * process first entry on queue.  Assumed conditions are:
469 		 *	ip is held (v_count >= 1)
470 		 *	ip is referenced (i_flag & IREF)
471 		 *	ip is free (i_nlink <= 0)
472 		 */
473 		if ((uq->uq_ihead = ip->i_freef) == ip)
474 			uq->uq_ihead = NULL;
475 		ip->i_freef->i_freeb = ip->i_freeb;
476 		ip->i_freeb->i_freef = ip->i_freef;
477 		ip->i_freef = ip;
478 		ip->i_freeb = ip;
479 		uq->uq_ne--;
480 		mutex_exit(&uq->uq_mutex);
481 		ufs_delete(ufsvfsp, ip, 1);
482 		mutex_enter(&uq->uq_mutex);
483 	}
484 
485 	/*
486 	 * If there are any fast clients, let all of them through.
487 	 * Mainly intended for statvfs(), which doesn't need to do
488 	 * anything except look at the number of bytes/inodes that
489 	 * are in the queue.
490 	 */
491 	if (uq->uq_flags & UQ_FASTCLIENTS) {
492 		uq->uq_flags &= ~UQ_FASTCLIENTS;
493 		/*
494 		 * Give clients a chance.  The lock exit/entry
495 		 * allows waiting statvfs threads through.
496 		 */
497 		cv_broadcast(&delq_info->delq_fast_cv);
498 		mutex_exit(&uq->uq_mutex);
499 		mutex_enter(&uq->uq_mutex);
500 	}
501 	goto again;
502 }
503 
504 /*
505  * drain ne entries off the delete queue.  As new queue entries may
506  * be added while we're working, ne is interpreted as follows:
507  *
508  * ne > 0   => remove up to ne entries
509  * ne == 0  => remove all entries currently on the queue
510  * ne == -1 => remove entries until the queue is empty
511  */
512 void
513 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs)
514 {
515 	struct ufsvfs	*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
516 	struct ufs_q	*uq;
517 	struct inode	*ip;
518 	int		drain_cnt = 0;
519 	int		done;
520 
521 	/*
522 	 * if forcibly unmounted; ignore
523 	 */
524 	if (ufsvfsp == NULL)
525 		return;
526 
527 	uq = &ufsvfsp->vfs_delete;
528 	mutex_enter(&uq->uq_mutex);
529 	if (ne == 0)
530 		drain_cnt = uq->uq_ne;
531 	else if (ne > 0)
532 		drain_cnt = ne;
533 
534 	/*
535 	 * process up to ne entries
536 	 */
537 
538 	done = 0;
539 	while (!done && (ip = uq->uq_ihead)) {
540 		if (ne != -1)
541 			drain_cnt--;
542 		if (ne != -1 && drain_cnt == 0)
543 			done = 1;
544 		if ((uq->uq_ihead = ip->i_freef) == ip)
545 			uq->uq_ihead = NULL;
546 		ip->i_freef->i_freeb = ip->i_freeb;
547 		ip->i_freeb->i_freef = ip->i_freef;
548 		ip->i_freef = ip;
549 		ip->i_freeb = ip;
550 		uq->uq_ne--;
551 		mutex_exit(&uq->uq_mutex);
552 		ufs_delete(ufsvfsp, ip, dolockfs);
553 		mutex_enter(&uq->uq_mutex);
554 	}
555 	mutex_exit(&uq->uq_mutex);
556 }
557 
558 void
559 ufs_sync_with_thread(struct ufs_q *uq)
560 {
561 	mutex_enter(&uq->uq_mutex);
562 	uq->uq_flags |= UQ_WAIT;
563 	/*
564 	 * Someone other than the thread we're interested in might
565 	 * send a signal, so make sure the thread's given an
566 	 * acknowledgement.
567 	 */
568 	while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) {
569 		cv_broadcast(&uq->uq_cv);
570 		cv_wait(&uq->uq_cv, &uq->uq_mutex);
571 	}
572 	mutex_exit(&uq->uq_mutex);
573 }
574 
575 /*
576  * Get rid of everything that's currently in the delete queue,
577  * plus whatever the delete thread is working on at the moment.
578  *
579  * This ability is required for providing true POSIX semantics
580  * regarding close(2), unlink(2), etc, even when logging is enabled.
581  * The standard requires that the released space be immediately
582  * observable (statvfs(2)) and allocatable (e.g., write(2)).
583  */
584 void
585 ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs)
586 {
587 	struct ufs_q *uq = &ufsvfsp->vfs_delete;
588 	int	error;
589 
590 	(void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs);
591 	ufs_sync_with_thread(uq);
592 
593 	/*
594 	 * Commit any outstanding transactions to make sure
595 	 * any canceled freed blocks are available for allocation.
596 	 */
597 	curthread->t_flag |= T_DONTBLOCK;
598 	TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error);
599 	if (!error) {
600 		TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE,
601 			TOP_COMMIT_SIZE);
602 	}
603 	curthread->t_flag &= ~T_DONTBLOCK;
604 }
605 
606 /*
607  * Adjust the resource usage in a struct statvfs based on
608  * what's in the delete queue.
609  *
610  * We do not consider the impact of ACLs or extended attributes
611  * that may be deleted as a side-effect of deleting a file.
612  * Those are metadata, and their sizes aren't reflected in the
613  * sizes returned by stat(), so this is not a problem.
614  */
615 void
616 ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp)
617 {
618 	struct ufs_q *uq = &ufsvfsp->vfs_delete;
619 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
620 
621 	/*
622 	 * We'll get signalled when it's our turn.  However, if there's
623 	 * nothing going on, there's no point in waking up the delete
624 	 * thread and waiting for it to tell us to continue.
625 	 */
626 	mutex_enter(&uq->uq_mutex);
627 
628 	if ((uq->uq_flags & UQ_FASTCLIENTS) || (uq->uq_ne != 0)) {
629 		uq->uq_flags |= UQ_FASTCLIENTS;
630 		cv_broadcast(&uq->uq_cv);
631 		cv_wait(&delq_info->delq_fast_cv, &uq->uq_mutex);
632 	}
633 
634 	sp->f_bfree += delq_info->delq_unreclaimed_blocks;
635 	sp->f_ffree += delq_info->delq_unreclaimed_files;
636 	mutex_exit(&uq->uq_mutex);
637 }
638 
639 /*
640  * IDLE INODE
641  * The following routines implement the protocol for maintaining an
642  * LRU list of idle inodes and for moving the idle inodes to the
643  * reuse list when the number of allocated inodes exceeds the user
644  * tunable high-water mark (ufs_ninode).
645  */
646 
647 /*
648  * clean an idle inode and move it to the reuse list
649  */
650 static void
651 ufs_idle_free(struct inode *ip)
652 {
653 	int			pages;
654 	int			hno;
655 	kmutex_t		*ihm;
656 	struct ufsvfs		*ufsvfsp	= ip->i_ufsvfs;
657 	struct vnode		*vp		= ITOV(ip);
658 
659 	/*
660 	 * inode is held
661 	 */
662 
663 	/*
664 	 * remember `pages' for stats below
665 	 */
666 	pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR);
667 
668 	/*
669 	 * start the dirty pages to disk and then invalidate them
670 	 * unless the inode is invalid (ISTALE)
671 	 */
672 	if ((ip->i_flag & ISTALE) == 0) {
673 		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE);
674 		(void) TRANS_SYNCIP(ip,
675 				    (TRANS_ISERROR(ufsvfsp)) ?
676 				    B_INVAL | B_FORCE : B_INVAL,
677 				    I_ASYNC, TOP_SYNCIP_FREE);
678 	}
679 
680 	/*
681 	 * wait for any current ufs_iget to finish and block future ufs_igets
682 	 */
683 	ASSERT(ip->i_number != 0);
684 	hno = INOHASH(ip->i_number);
685 	ihm = &ih_lock[hno];
686 	mutex_enter(ihm);
687 
688 	/*
689 	 * It must be guaranteed that v_count >= 2, otherwise
690 	 * something must be wrong with this vnode already.
691 	 * That is why we use v_count-- instead of VN_RELE().
692 	 * Acquire the vnode lock in case another thread is in
693 	 * VN_RELE().
694 	 */
695 	mutex_enter(&vp->v_lock);
696 
697 	if (vp->v_count < 2)
698 		cmn_err(CE_PANIC,
699 			"ufs_idle_free: vnode ref count is less than 2");
700 
701 	vp->v_count--;
702 	if ((vp->v_type != VCHR && vn_has_cached_data(vp)) ||
703 		vp->v_count != 1 ||
704 		ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)) {
705 			/*
706 			 * Another thread has referenced this inode while
707 			 * we are trying to free it. Call VN_RELE() to
708 			 * release our reference.
709 			 */
710 			mutex_exit(&vp->v_lock);
711 			mutex_exit(ihm);
712 			VN_RELE(vp);
713 	} else {
714 		/*
715 		 * The inode is currently unreferenced and can not
716 		 * acquire further references because it has no pages
717 		 * and the hash is locked.  Inodes acquire references
718 		 * via the hash list or via their pages.
719 		 */
720 
721 		mutex_exit(&vp->v_lock);
722 
723 		/*
724 		 * remove it from the cache
725 		 */
726 		remque(ip);
727 		mutex_exit(ihm);
728 		/*
729 		 * Stale inodes have no valid ufsvfs
730 		 */
731 		if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) {
732 			TRANS_DQRELE(ufsvfsp, ip->i_dquot);
733 			ip->i_dquot = NULL;
734 		}
735 		ufs_si_del(ip);
736 		if (pages) {
737 			CPU_STATS_ADDQ(CPU, sys, ufsipage, 1);
738 		} else {
739 			CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1);
740 		}
741 		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
742 		ufs_free_inode(ip);
743 	}
744 }
745 
746 /*
747  * this thread processes the global idle queue
748  */
749 iqhead_t *ufs_junk_iq;
750 iqhead_t *ufs_useful_iq;
751 int ufs_njunk_iq = 0;
752 int ufs_nuseful_iq = 0;
753 int ufs_niqhash;
754 int ufs_iqhashmask;
755 struct ufs_q	ufs_idle_q;
756 
757 void
758 ufs_thread_idle(void)
759 {
760 	callb_cpr_t cprinfo;
761 	int i;
762 	int ne;
763 
764 	ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN;
765 	ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */
766 	ufs_iqhashmask = ufs_niqhash - 1;
767 	ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq),
768 	    KM_SLEEP);
769 	ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq),
770 	    KM_SLEEP);
771 
772 	/* Initialize hash queue headers */
773 	for (i = 0; i < ufs_niqhash; i++) {
774 		ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i];
775 		ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i];
776 		ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i];
777 		ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i];
778 	}
779 
780 	CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr,
781 	    "ufsidle");
782 again:
783 	/*
784 	 * Whenever the idle thread is awakened, it repeatedly gives
785 	 * back half of the idle queue until the idle queue falls
786 	 * below lowat.
787 	 */
788 	mutex_enter(&ufs_idle_q.uq_mutex);
789 	if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) {
790 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
791 		cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex);
792 		CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex);
793 	}
794 	mutex_exit(&ufs_idle_q.uq_mutex);
795 
796 	/*
797 	 * Give back 1/2 of the idle queue
798 	 */
799 	ne = ufs_idle_q.uq_ne >> 1;
800 	ins.in_tidles.value.ul += ne;
801 	ufs_idle_some(ne);
802 	goto again;
803 }
804 
805 /*
806  * Reclaim callback for ufs inode cache.
807  * Invoked by the kernel memory allocator when memory gets tight.
808  */
809 /*ARGSUSED*/
810 void
811 ufs_inode_cache_reclaim(void *cdrarg)
812 {
813 	/*
814 	 * If we are low on memory and the idle queue is over its
815 	 * halfway mark, then free 50% of the idle q
816 	 *
817 	 * We don't free all of the idle inodes because the inodes
818 	 * for popular NFS files may have been kicked from the dnlc.
819 	 * The inodes for these files will end up on the idle queue
820 	 * after every NFS access.
821 	 *
822 	 * If we repeatedly push them from the idle queue then
823 	 * NFS users may be unhappy as an extra buf cache operation
824 	 * is incurred for every NFS operation to these files.
825 	 *
826 	 * It's not common, but I have seen it happen.
827 	 *
828 	 */
829 	if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1))
830 		return;
831 	mutex_enter(&ufs_idle_q.uq_mutex);
832 	cv_broadcast(&ufs_idle_q.uq_cv);
833 	mutex_exit(&ufs_idle_q.uq_mutex);
834 }
835 
836 /*
837  * Free up some idle inodes
838  */
839 void
840 ufs_idle_some(int ne)
841 {
842 	int i;
843 	struct inode *ip;
844 	struct vnode *vp;
845 	static int junk_rotor = 0;
846 	static int useful_rotor = 0;
847 
848 	for (i = 0; i < ne; ++i) {
849 		mutex_enter(&ufs_idle_q.uq_mutex);
850 
851 		if (ufs_njunk_iq) {
852 			while (ufs_junk_iq[junk_rotor].i_freef ==
853 			    (inode_t *)&ufs_junk_iq[junk_rotor]) {
854 				junk_rotor = IQNEXT(junk_rotor);
855 			}
856 			ip = ufs_junk_iq[junk_rotor].i_freef;
857 			ASSERT(ip->i_flag & IJUNKIQ);
858 		} else if (ufs_nuseful_iq) {
859 			while (ufs_useful_iq[useful_rotor].i_freef ==
860 			    (inode_t *)&ufs_useful_iq[useful_rotor]) {
861 				useful_rotor = IQNEXT(useful_rotor);
862 			}
863 			ip = ufs_useful_iq[useful_rotor].i_freef;
864 			ASSERT(!(ip->i_flag & IJUNKIQ));
865 		} else {
866 			mutex_exit(&ufs_idle_q.uq_mutex);
867 			return;
868 		}
869 
870 		/*
871 		 * emulate ufs_iget
872 		 */
873 		vp = ITOV(ip);
874 		VN_HOLD(vp);
875 		mutex_exit(&ufs_idle_q.uq_mutex);
876 		rw_enter(&ip->i_contents, RW_WRITER);
877 		/*
878 		 * VN_RELE should not be called if
879 		 * ufs_rmidle returns true, as it will
880 		 * effectively be done in ufs_idle_free.
881 		 */
882 		if (ufs_rmidle(ip)) {
883 			rw_exit(&ip->i_contents);
884 			ufs_idle_free(ip);
885 		} else {
886 			rw_exit(&ip->i_contents);
887 			VN_RELE(vp);
888 		}
889 	}
890 }
891 
892 /*
893  * drain entries for vfsp from the idle queue
894  * vfsp == NULL means drain the entire thing
895  */
896 void
897 ufs_idle_drain(struct vfs *vfsp)
898 {
899 	struct inode	*ip, *nip;
900 	struct inode	*ianchor = NULL;
901 	int		i;
902 
903 	mutex_enter(&ufs_idle_q.uq_mutex);
904 	if (ufs_njunk_iq) {
905 		/* for each hash q */
906 		for (i = 0; i < ufs_niqhash; i++) {
907 			/* search down the hash q */
908 			for (ip = ufs_junk_iq[i].i_freef;
909 			    ip != (inode_t *)&ufs_junk_iq[i];
910 			    ip = ip->i_freef) {
911 				if (ip->i_vfs == vfsp || vfsp == NULL) {
912 					/* found a matching entry */
913 					VN_HOLD(ITOV(ip));
914 					mutex_exit(&ufs_idle_q.uq_mutex);
915 					rw_enter(&ip->i_contents, RW_WRITER);
916 					/*
917 					 * See comments in ufs_idle_some()
918 					 * as we will call ufs_idle_free()
919 					 * after scanning both queues.
920 					 */
921 					if (ufs_rmidle(ip)) {
922 						rw_exit(&ip->i_contents);
923 						ip->i_freef = ianchor;
924 						ianchor = ip;
925 					} else {
926 						rw_exit(&ip->i_contents);
927 						VN_RELE(ITOV(ip));
928 					}
929 					/* restart this hash q */
930 					ip = (inode_t *)&ufs_junk_iq[i];
931 					mutex_enter(&ufs_idle_q.uq_mutex);
932 				}
933 			}
934 		}
935 	}
936 	if (ufs_nuseful_iq) {
937 		/* for each hash q */
938 		for (i = 0; i < ufs_niqhash; i++) {
939 			/* search down the hash q */
940 			for (ip = ufs_useful_iq[i].i_freef;
941 			    ip != (inode_t *)&ufs_useful_iq[i];
942 			    ip = ip->i_freef) {
943 				if (ip->i_vfs == vfsp || vfsp == NULL) {
944 					/* found a matching entry */
945 					VN_HOLD(ITOV(ip));
946 					mutex_exit(&ufs_idle_q.uq_mutex);
947 					rw_enter(&ip->i_contents, RW_WRITER);
948 					/*
949 					 * See comments in ufs_idle_some()
950 					 * as we will call ufs_idle_free()
951 					 * after scanning both queues.
952 					 */
953 					if (ufs_rmidle(ip)) {
954 						rw_exit(&ip->i_contents);
955 						ip->i_freef = ianchor;
956 						ianchor = ip;
957 					} else {
958 						rw_exit(&ip->i_contents);
959 						VN_RELE(ITOV(ip));
960 					}
961 					/* restart this hash q */
962 					ip = (inode_t *)&ufs_useful_iq[i];
963 					mutex_enter(&ufs_idle_q.uq_mutex);
964 				}
965 			}
966 		}
967 	}
968 
969 	mutex_exit(&ufs_idle_q.uq_mutex);
970 	/* no more matching entries, release those we have found (if any) */
971 	for (ip = ianchor; ip; ip = nip) {
972 		nip = ip->i_freef;
973 		ip->i_freef = ip;
974 		ufs_idle_free(ip);
975 	}
976 }
977 
978 /*
979  * RECLAIM DELETED INODES
980  * The following thread scans the file system once looking for deleted files
981  */
982 void
983 ufs_thread_reclaim(struct vfs *vfsp)
984 {
985 	struct ufsvfs		*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
986 	struct ufs_q		*uq	= &ufsvfsp->vfs_reclaim;
987 	struct fs		*fs	= ufsvfsp->vfs_fs;
988 	struct buf		*bp	= 0;
989 	int			err	= 0;
990 	daddr_t			bno;
991 	ino_t			ino;
992 	struct dinode		*dp;
993 	struct inode		*ip;
994 	callb_cpr_t		cprinfo;
995 
996 	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
997 	    "ufsreclaim");
998 
999 	/*
1000 	 * mount decided that we don't need a reclaim thread
1001 	 */
1002 	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
1003 		err++;
1004 
1005 	/*
1006 	 * don't reclaim if readonly
1007 	 */
1008 	if (fs->fs_ronly)
1009 		err++;
1010 
1011 	for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) {
1012 
1013 		/*
1014 		 * Check whether we are the target of another
1015 		 * thread having called ufs_thread_exit() or
1016 		 * ufs_thread_suspend().
1017 		 */
1018 		mutex_enter(&uq->uq_mutex);
1019 again:
1020 		if (uq->uq_flags & UQ_EXIT) {
1021 			err++;
1022 			mutex_exit(&uq->uq_mutex);
1023 			break;
1024 		} else if (uq->uq_flags & UQ_SUSPEND) {
1025 			uq->uq_flags |= UQ_SUSPENDED;
1026 			/*
1027 			 * Release the buf before we cv_wait()
1028 			 * otherwise we may deadlock with the
1029 			 * thread that called ufs_thread_suspend().
1030 			 */
1031 			if (bp) {
1032 				brelse(bp);
1033 				bp = 0;
1034 			}
1035 			if (uq->uq_flags & UQ_WAIT) {
1036 				uq->uq_flags &= ~UQ_WAIT;
1037 				cv_broadcast(&uq->uq_cv);
1038 			}
1039 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1040 			cv_wait(&uq->uq_cv, &uq->uq_mutex);
1041 			CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex);
1042 			goto again;
1043 		}
1044 		mutex_exit(&uq->uq_mutex);
1045 
1046 		/*
1047 		 * if we don't already have the buf; get it
1048 		 */
1049 		bno = fsbtodb(fs, itod(fs, ino));
1050 		if ((bp == 0) || (bp->b_blkno != bno)) {
1051 			if (bp)
1052 				brelse(bp);
1053 			bp = UFS_BREAD(ufsvfsp,
1054 					ufsvfsp->vfs_dev, bno, fs->fs_bsize);
1055 			bp->b_flags |= B_AGE;
1056 		}
1057 		if (bp->b_flags & B_ERROR) {
1058 			err++;
1059 			continue;
1060 		}
1061 		/*
1062 		 * nlink <= 0 and mode != 0 means deleted
1063 		 */
1064 		dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino);
1065 		if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) {
1066 			/*
1067 			 * can't hold the buf (deadlock)
1068 			 */
1069 			brelse(bp);
1070 			bp = 0;
1071 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1072 			/*
1073 			 * iget/iput sequence will put inode on ifree
1074 			 * thread queue if it is idle.  This is a nop
1075 			 * for busy (open, deleted) inodes
1076 			 */
1077 			if (ufs_iget(vfsp, ino, &ip, CRED()))
1078 				err++;
1079 			else
1080 				VN_RELE(ITOV(ip));
1081 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1082 		}
1083 	}
1084 
1085 	if (bp)
1086 		brelse(bp);
1087 	if (!err) {
1088 		/*
1089 		 * reset the reclaiming-bit
1090 		 */
1091 		mutex_enter(&ufsvfsp->vfs_lock);
1092 		fs->fs_reclaim &= ~FS_RECLAIMING;
1093 		mutex_exit(&ufsvfsp->vfs_lock);
1094 		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM);
1095 	}
1096 
1097 	/*
1098 	 * exit the reclaim thread
1099 	 */
1100 	mutex_enter(&uq->uq_mutex);
1101 	uq->uq_threadp = NULL;
1102 	uq->uq_flags &= ~UQ_WAIT;
1103 	cv_broadcast(&uq->uq_cv);
1104 	CALLB_CPR_EXIT(&cprinfo);
1105 	thread_exit();
1106 }
1107 /*
1108  * HLOCK FILE SYSTEM
1109  *	hlock the file system's whose logs have device errors
1110  */
1111 struct ufs_q	ufs_hlock;
1112 /*ARGSUSED*/
1113 void
1114 ufs_thread_hlock(void *ignore)
1115 {
1116 	int		retry;
1117 	callb_cpr_t	cprinfo;
1118 
1119 	CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr,
1120 	    "ufshlock");
1121 
1122 	for (;;) {
1123 		/*
1124 		 * sleep until there is work to do
1125 		 */
1126 		mutex_enter(&ufs_hlock.uq_mutex);
1127 		(void) ufs_thread_run(&ufs_hlock, &cprinfo);
1128 		ufs_hlock.uq_ne = 0;
1129 		mutex_exit(&ufs_hlock.uq_mutex);
1130 		/*
1131 		 * hlock the error'ed fs's
1132 		 *	retry after a bit if another app is doing lockfs stuff
1133 		 */
1134 		do {
1135 			retry = ufs_trans_hlock();
1136 			if (retry) {
1137 				mutex_enter(&ufs_hlock.uq_mutex);
1138 				CALLB_CPR_SAFE_BEGIN(&cprinfo);
1139 				(void) cv_timedwait(&ufs_hlock.uq_cv,
1140 							&ufs_hlock.uq_mutex,
1141 							lbolt + hz);
1142 				CALLB_CPR_SAFE_END(&cprinfo,
1143 				    &ufs_hlock.uq_mutex);
1144 				mutex_exit(&ufs_hlock.uq_mutex);
1145 			}
1146 		} while (retry);
1147 	}
1148 }
1149 
1150 static void
1151 ufs_attr_purge(struct inode *dp)
1152 {
1153 	int	err;
1154 	int	error;
1155 	off_t 	dirsize;			/* size of the directory */
1156 	off_t 	offset;	/* offset in the directory */
1157 	int entryoffsetinblk;		/* offset of ep in fbp's buffer */
1158 	struct inode *tp;
1159 	struct fbuf *fbp;	/* pointer to directory block */
1160 	struct direct *ep;	/* directory entry */
1161 	int trans_size;
1162 	int issync;
1163 	struct ufsvfs	*ufsvfsp = dp->i_ufsvfs;
1164 
1165 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1166 
1167 	fbp = NULL;
1168 	dirsize = roundup(dp->i_size, DIRBLKSIZ);
1169 	offset = 0;
1170 	entryoffsetinblk = 0;
1171 
1172 	/*
1173 	 * Purge directory cache
1174 	 */
1175 
1176 	dnlc_dir_purge(&dp->i_danchor);
1177 
1178 	while (offset < dirsize) {
1179 		/*
1180 		 * If offset is on a block boundary,
1181 		 * read the next directory block.
1182 		 * Release previous if it exists.
1183 		 */
1184 		if (blkoff(dp->i_fs, offset) == 0) {
1185 			if (fbp != NULL) {
1186 				fbrelse(fbp, S_OTHER);
1187 			}
1188 
1189 			err = blkatoff(dp, offset, (char **)0, &fbp);
1190 			if (err) {
1191 				goto out;
1192 			}
1193 			entryoffsetinblk = 0;
1194 		}
1195 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1196 		if (ep->d_ino == 0 || (ep->d_name[0] == '.' &&
1197 		    ep->d_name[1] == '\0') ||
1198 		    (ep->d_name[0] == '.' && ep->d_name[1] == '.' &&
1199 		    ep->d_name[2] == '\0')) {
1200 
1201 			entryoffsetinblk += ep->d_reclen;
1202 
1203 		} else {
1204 
1205 			if ((err = ufs_iget(dp->i_vfs, ep->d_ino,
1206 			    &tp, CRED())) != 0) {
1207 				goto out;
1208 			}
1209 
1210 			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
1211 			    trans_size = (int)TOP_REMOVE_SIZE(tp));
1212 
1213 			/*
1214 			 * Delete inode.
1215 			 */
1216 
1217 			dnlc_remove(ITOV(dp), ep->d_name);
1218 
1219 			rw_enter(&tp->i_contents, RW_WRITER);
1220 			tp->i_flag |= ICHG;
1221 			tp->i_seq++;
1222 			TRANS_INODE(tp->i_ufsvfs, tp);
1223 			tp->i_nlink--;
1224 			ufs_setreclaim(tp);
1225 			ITIMES_NOLOCK(tp);
1226 			rw_exit(&tp->i_contents);
1227 
1228 			VN_RELE(ITOV(tp));
1229 			entryoffsetinblk += ep->d_reclen;
1230 			TRANS_END_CSYNC(ufsvfsp, error,
1231 			    issync, TOP_REMOVE, trans_size);
1232 
1233 		}
1234 		offset += ep->d_reclen;
1235 	}
1236 
1237 	if (fbp) {
1238 		fbrelse(fbp, S_OTHER);
1239 	}
1240 
1241 out:
1242 	rw_exit(&ufsvfsp->vfs_dqrwlock);
1243 }
1244