xref: /titanic_50/usr/src/uts/common/fs/ufs/ufs_thread.c (revision b7f45089ccbe01bab3d7c7377b49d80d2ae18a69)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 #include <sys/types.h>
38 #include <sys/systm.h>
39 #include <sys/errno.h>
40 #include <sys/kmem.h>
41 #include <sys/buf.h>
42 #include <sys/vnode.h>
43 #include <sys/vfs.h>
44 #include <sys/user.h>
45 #include <sys/callb.h>
46 #include <sys/cpuvar.h>
47 #include <sys/fs/ufs_inode.h>
48 #include <sys/fs/ufs_log.h>
49 #include <sys/fs/ufs_trans.h>
50 #include <sys/fs/ufs_acl.h>
51 #include <sys/fs/ufs_bio.h>
52 #include <sys/fs/ufs_fsdir.h>
53 #include <sys/debug.h>
54 #include <sys/cmn_err.h>
55 #include <sys/sysmacros.h>
56 
57 extern pri_t 			minclsyspri;
58 extern int			hash2ints();
59 extern struct kmem_cache	*inode_cache;	/* cache of free inodes */
60 extern int			ufs_idle_waiters;
61 extern struct instats		ins;
62 
63 static void ufs_attr_purge(struct inode *);
64 
65 /*
66  * initialize a thread's queue struct
67  */
68 void
69 ufs_thread_init(struct ufs_q *uq, int lowat)
70 {
71 	bzero((caddr_t)uq, sizeof (*uq));
72 	cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL);
73 	mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL);
74 	uq->uq_lowat = lowat;
75 	uq->uq_hiwat = 2 * lowat;
76 	uq->uq_threadp = NULL;
77 }
78 
79 /*
80  * start a thread for a queue (assumes success)
81  */
82 void
83 ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp)
84 {
85 	mutex_enter(&uq->uq_mutex);
86 	if (uq->uq_threadp == NULL) {
87 		uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0,
88 		    TS_RUN, minclsyspri);
89 		uq->uq_flags = 0;
90 	}
91 	mutex_exit(&uq->uq_mutex);
92 }
93 
94 /*
95  * wait for the thread to exit
96  */
97 void
98 ufs_thread_exit(struct ufs_q *uq)
99 {
100 	kt_did_t ufs_thread_did = 0;
101 
102 	mutex_enter(&uq->uq_mutex);
103 	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
104 	if (uq->uq_threadp != NULL) {
105 		ufs_thread_did = uq->uq_threadp->t_did;
106 		uq->uq_flags |= (UQ_EXIT|UQ_WAIT);
107 		cv_broadcast(&uq->uq_cv);
108 	}
109 	mutex_exit(&uq->uq_mutex);
110 
111 	/*
112 	 * It's safe to call thread_join() with an already-gone
113 	 * t_did, but we have to obtain it before the kernel
114 	 * thread structure is freed. We do so above under the
115 	 * protection of the uq_mutex when we're sure the thread
116 	 * still exists and it's save to de-reference it.
117 	 * We also have to check if ufs_thread_did is != 0
118 	 * before calling thread_join() since thread 0 in the system
119 	 * gets a t_did of 0.
120 	 */
121 	if (ufs_thread_did)
122 		thread_join(ufs_thread_did);
123 }
124 
125 /*
126  * wait for a thread to suspend itself on the caller's behalf
127  *	the caller is responsible for continuing the thread
128  */
129 void
130 ufs_thread_suspend(struct ufs_q *uq)
131 {
132 	mutex_enter(&uq->uq_mutex);
133 	if (uq->uq_threadp != NULL) {
134 		/*
135 		 * wait while another thread is suspending this thread.
136 		 * no need to do a cv_broadcast(), as whoever suspended
137 		 * the thread must continue at some point.
138 		 */
139 		while ((uq->uq_flags & UQ_SUSPEND) &&
140 		    (uq->uq_threadp != NULL)) {
141 			uq->uq_flags |= UQ_WAIT;
142 			cv_wait(&uq->uq_cv, &uq->uq_mutex);
143 		}
144 
145 		/*
146 		 * wait for the thread to suspend itself
147 		 */
148 		uq->uq_flags |= UQ_SUSPEND;
149 		while (((uq->uq_flags & UQ_SUSPENDED) == 0) &&
150 		    (uq->uq_threadp != NULL)) {
151 			uq->uq_flags |= UQ_WAIT;
152 			cv_broadcast(&uq->uq_cv);
153 			cv_wait(&uq->uq_cv, &uq->uq_mutex);
154 		}
155 	}
156 	mutex_exit(&uq->uq_mutex);
157 }
158 
159 /*
160  * allow a thread to continue from a ufs_thread_suspend()
161  *	This thread must be the same as the thread that called
162  *	ufs_thread_suspend.
163  */
164 void
165 ufs_thread_continue(struct ufs_q *uq)
166 {
167 	mutex_enter(&uq->uq_mutex);
168 	uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
169 	cv_broadcast(&uq->uq_cv);
170 	mutex_exit(&uq->uq_mutex);
171 }
172 
173 /*
174  * some common code for managing a threads execution
175  *	uq is locked at entry and return
176  *	may sleep
177  *	may exit
178  */
179 /*
180  * Kind of a hack passing in the callb_cpr_t * here.
181  * It should really be part of the ufs_q structure.
182  * I did not put it in there because we are already in beta
183  * and I was concerned that changing ufs_inode.h to include
184  * callb.h might break something.
185  */
186 int
187 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop)
188 {
189 again:
190 	ASSERT(uq->uq_ne >= 0);
191 
192 	if (uq->uq_flags & UQ_SUSPEND) {
193 		uq->uq_flags |= UQ_SUSPENDED;
194 	} else if (uq->uq_flags & UQ_EXIT) {
195 		/*
196 		 * exiting; empty the queue (may infinite loop)
197 		 */
198 		if (uq->uq_ne)
199 			return (uq->uq_ne);
200 		uq->uq_threadp = NULL;
201 		if (uq->uq_flags & UQ_WAIT)
202 			cv_broadcast(&uq->uq_cv);
203 		uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT);
204 		CALLB_CPR_EXIT(cprinfop);
205 		thread_exit();
206 	} else if (uq->uq_ne >= uq->uq_lowat) {
207 		/*
208 		 * process a block of entries until below high water mark
209 		 */
210 		return (uq->uq_ne - (uq->uq_lowat >> 1));
211 	}
212 	if (uq->uq_flags & UQ_WAIT) {
213 		uq->uq_flags &= ~UQ_WAIT;
214 		cv_broadcast(&uq->uq_cv);
215 	}
216 	CALLB_CPR_SAFE_BEGIN(cprinfop);
217 	cv_wait(&uq->uq_cv, &uq->uq_mutex);
218 	CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex);
219 	goto again;
220 }
221 
222 /*
223  * DELETE INODE
224  * The following routines implement the protocol for freeing the resources
225  * held by an idle and deleted inode.
226  */
227 void
228 ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs)
229 {
230 	ushort_t	mode;
231 	struct vnode	*vp	= ITOV(ip);
232 	struct ulockfs	*ulp;
233 	int		trans_size;
234 	int		dorwlock = ((ip->i_mode & IFMT) == IFREG);
235 	int		issync;
236 	int		err;
237 	struct inode	*dp;
238 
239 	/*
240 	 * not on a trans device or not part of a transaction
241 	 */
242 	ASSERT(!TRANS_ISTRANS(ufsvfsp) ||
243 		((curthread->t_flag & T_DONTBLOCK) == 0));
244 
245 	/*
246 	 * Ignore if deletes are not allowed (wlock/hlock)
247 	 */
248 	if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
249 		VN_RELE(vp);
250 		return;
251 	}
252 
253 	if ((vp->v_count > 1) || (ip->i_mode == 0)) {
254 		VN_RELE(vp);
255 		return;
256 	}
257 	/*
258 	 * If we are called as part of setting a fs lock, then only
259 	 * do part of the lockfs protocol.  In other words, don't hang.
260 	 */
261 	if (dolockfs) {
262 		if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK))
263 			return;
264 	} else {
265 		/*
266 		 * check for recursive VOP call
267 		 */
268 		if (curthread->t_flag & T_DONTBLOCK) {
269 			ulp = NULL;
270 		} else {
271 			ulp = &ufsvfsp->vfs_ulockfs;
272 			curthread->t_flag |= T_DONTBLOCK;
273 		}
274 	}
275 
276 	/*
277 	 * Hold rwlock to synchronize with (nfs) writes
278 	 */
279 	if (dorwlock)
280 		rw_enter(&ip->i_rwlock, RW_WRITER);
281 
282 	/*
283 	 * Delete the attribute directory.
284 	 */
285 	if (ip->i_oeftflag != 0) {
286 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
287 		    trans_size = (int)TOP_REMOVE_SIZE(ip));
288 		rw_enter(&ip->i_contents, RW_WRITER);
289 		err = ufs_iget(ip->i_vfs, ip->i_oeftflag,
290 		    &dp, CRED());
291 		if (err == 0) {
292 			rw_enter(&dp->i_rwlock, RW_WRITER);
293 			rw_enter(&dp->i_contents, RW_WRITER);
294 			dp->i_flag |= IUPD|ICHG;
295 			dp->i_seq++;
296 			TRANS_INODE(dp->i_ufsvfs, dp);
297 			dp->i_nlink -= 2;
298 			ufs_setreclaim(dp);
299 			/*
300 			 * Should get rid of any negative cache entries that
301 			 * might be lingering, as well as ``.'' and
302 			 * ``..''.  If we don't, the VN_RELE() below
303 			 * won't actually put dp on the delete queue
304 			 * and it'll hang out until someone forces it
305 			 * (lockfs -f, umount, ...).  The only reliable
306 			 * way of doing this at the moment is to call
307 			 * dnlc_purge_vp(ITOV(dp)), which is unacceptably
308 			 * slow, so we'll just note the problem in this
309 			 * comment for now.
310 			 */
311 			dnlc_remove(ITOV(dp), ".");
312 			dnlc_remove(ITOV(dp), "..");
313 			ITIMES_NOLOCK(dp);
314 			if (!TRANS_ISTRANS(ufsvfsp)) {
315 				ufs_iupdat(dp, I_SYNC);
316 			}
317 			rw_exit(&dp->i_contents);
318 			rw_exit(&dp->i_rwlock);
319 			VN_RELE(ITOV(dp));
320 		}
321 		/*
322 		 * Clear out attribute pointer
323 		 */
324 		ip->i_oeftflag = 0;
325 		rw_exit(&ip->i_contents);
326 		TRANS_END_CSYNC(ufsvfsp, err, issync,
327 		    TOP_REMOVE, trans_size);
328 		dnlc_remove(ITOV(ip), XATTR_DIR_NAME);
329 	}
330 
331 	if ((ip->i_mode & IFMT) == IFATTRDIR) {
332 		ufs_attr_purge(ip);
333 	}
334 
335 	(void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE, CRED());
336 
337 	/*
338 	 * the inode's space has been freed; now free the inode
339 	 */
340 	if (ulp) {
341 		trans_size = TOP_IFREE_SIZE(ip);
342 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
343 	}
344 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
345 	rw_enter(&ip->i_contents, RW_WRITER);
346 	TRANS_INODE(ufsvfsp, ip);
347 	mode = ip->i_mode;
348 	ip->i_mode = 0;
349 	ip->i_rdev = 0;
350 	ip->i_ordev = 0;
351 	ip->i_flag |= IMOD;
352 	if (ip->i_ufs_acl) {
353 		(void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED());
354 		ip->i_ufs_acl = NULL;
355 		ip->i_shadow = 0;
356 	}
357 
358 	/*
359 	 * This inode is torn down but still retains it's identity
360 	 * (inode number).  It could get recycled soon so it's best
361 	 * to clean up the vnode just in case.
362 	 */
363 	mutex_enter(&vp->v_lock);
364 	vn_recycle(vp);
365 	mutex_exit(&vp->v_lock);
366 
367 	/*
368 	 * free the inode
369 	 */
370 	ufs_ifree(ip, ip->i_number, mode);
371 	/*
372 	 * release quota resources; can't fail
373 	 */
374 	(void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data,
375 		/* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(),
376 		(char **)NULL, (size_t *)NULL);
377 	dqrele(ip->i_dquot);
378 	ip->i_dquot = NULL;
379 	ip->i_flag &= ~(IDEL | IDIRECTIO);
380 	ip->i_cflags = 0;
381 	if (!TRANS_ISTRANS(ufsvfsp)) {
382 		ufs_iupdat(ip, I_SYNC);
383 	}
384 	rw_exit(&ip->i_contents);
385 	rw_exit(&ufsvfsp->vfs_dqrwlock);
386 	if (dorwlock)
387 		rw_exit(&ip->i_rwlock);
388 	VN_RELE(vp);
389 
390 	/*
391 	 * End of transaction
392 	 */
393 	if (ulp) {
394 		TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
395 		if (dolockfs)
396 			ufs_lockfs_end(ulp);
397 		else
398 			curthread->t_flag &= ~T_DONTBLOCK;
399 	}
400 }
401 
402 /*
403  * thread that frees up deleted inodes
404  */
405 void
406 ufs_thread_delete(struct vfs *vfsp)
407 {
408 	struct ufsvfs	*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
409 	struct ufs_q	*uq	= &ufsvfsp->vfs_delete;
410 	struct inode	*ip;
411 	long		ne;
412 	callb_cpr_t	cprinfo;
413 
414 	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
415 	    "ufsdelete");
416 
417 	mutex_enter(&uq->uq_mutex);
418 again:
419 	/*
420 	 * sleep until there is work to do
421 	 */
422 	ne = ufs_thread_run(uq, &cprinfo);
423 	/*
424 	 * process up to ne entries
425 	 */
426 	while (ne-- && (ip = uq->uq_ihead)) {
427 		/*
428 		 * process first entry on queue.  Assumed conditions are:
429 		 *	ip is held (v_count >= 1)
430 		 *	ip is referenced (i_flag & IREF)
431 		 *	ip is free (i_nlink <= 0)
432 		 */
433 		if ((uq->uq_ihead = ip->i_freef) == ip)
434 			uq->uq_ihead = NULL;
435 		ip->i_freef->i_freeb = ip->i_freeb;
436 		ip->i_freeb->i_freef = ip->i_freef;
437 		ip->i_freef = ip;
438 		ip->i_freeb = ip;
439 		uq->uq_ne--;
440 		mutex_exit(&uq->uq_mutex);
441 		ufs_delete(ufsvfsp, ip, 1);
442 		mutex_enter(&uq->uq_mutex);
443 	}
444 	goto again;
445 }
446 
447 /*
448  * drain ne entries off the delete queue.  As new queue entries may
449  * be added while we're working, ne is interpreted as follows:
450  *
451  * ne > 0   => remove up to ne entries
452  * ne == 0  => remove all entries currently on the queue
453  * ne == -1 => remove entries until the queue is empty
454  */
455 void
456 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs)
457 {
458 	struct ufsvfs	*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
459 	struct ufs_q	*uq;
460 	struct inode	*ip;
461 	int		drain_cnt = 0;
462 	int		done;
463 
464 	/*
465 	 * if forcibly unmounted; ignore
466 	 */
467 	if (ufsvfsp == NULL)
468 		return;
469 
470 	uq = &ufsvfsp->vfs_delete;
471 	mutex_enter(&uq->uq_mutex);
472 	if (ne == 0)
473 		drain_cnt = uq->uq_ne;
474 	else if (ne > 0)
475 		drain_cnt = ne;
476 
477 	/*
478 	 * process up to ne entries
479 	 */
480 
481 	done = 0;
482 	while (!done && (ip = uq->uq_ihead)) {
483 		if (ne != -1)
484 			drain_cnt--;
485 		if (ne != -1 && drain_cnt == 0)
486 			done = 1;
487 		if ((uq->uq_ihead = ip->i_freef) == ip)
488 			uq->uq_ihead = NULL;
489 		ip->i_freef->i_freeb = ip->i_freeb;
490 		ip->i_freeb->i_freef = ip->i_freef;
491 		ip->i_freef = ip;
492 		ip->i_freeb = ip;
493 		uq->uq_ne--;
494 		mutex_exit(&uq->uq_mutex);
495 		ufs_delete(ufsvfsp, ip, dolockfs);
496 		mutex_enter(&uq->uq_mutex);
497 	}
498 	mutex_exit(&uq->uq_mutex);
499 }
500 
501 void
502 ufs_sync_with_thread(struct ufs_q *uq)
503 {
504 	mutex_enter(&uq->uq_mutex);
505 	uq->uq_flags |= UQ_WAIT;
506 	/*
507 	 * Someone other than the thread we're interested in might
508 	 * send a signal, so make sure the thread's given an
509 	 * acknowledgement.
510 	 */
511 	while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) {
512 		cv_broadcast(&uq->uq_cv);
513 		cv_wait(&uq->uq_cv, &uq->uq_mutex);
514 	}
515 	mutex_exit(&uq->uq_mutex);
516 }
517 
518 /*
519  * Get rid of everything that's currently in the delete queue,
520  * plus whatever the delete thread is working on at the moment.
521  *
522  * This ability is required for providing true POSIX semantics
523  * regarding close(2), unlink(2), etc, even when logging is enabled.
524  * The standard requires that the released space be immediately
525  * observable (statvfs(2)) and allocatable (e.g., write(2)).
526  */
527 void
528 ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs)
529 {
530 	struct ufs_q *uq = &ufsvfsp->vfs_delete;
531 	int	error;
532 
533 	(void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs);
534 	ufs_sync_with_thread(uq);
535 
536 	/*
537 	 * Commit any outstanding transactions to make sure
538 	 * any canceled freed blocks are available for allocation.
539 	 */
540 	curthread->t_flag |= T_DONTBLOCK;
541 	TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error);
542 	if (!error) {
543 		TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE,
544 			TOP_COMMIT_SIZE);
545 	}
546 	curthread->t_flag &= ~T_DONTBLOCK;
547 }
548 
549 /*
550  * Adjust the resource usage in a struct statvfs based on
551  * what's in the delete queue.  Assumes that the delete
552  * thread has been suspended.
553  *
554  * We do not consider the impact of ACLs or extended attributes
555  * that may be deleted as a side-effect of deleting a file.
556  * Those are metadata, and their sizes aren't reflected in the
557  * sizes returned by stat(), so this is not a problem.
558  */
559 void
560 ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp)
561 {
562 	struct inode *ip;
563 	struct fs *fs = ufsvfsp->vfs_fs;
564 	struct ufs_q *uq = &ufsvfsp->vfs_delete;
565 
566 	/*
567 	 * To be self-consistent with the existing contents of
568 	 * *sp, we have to keep the queue stable during our
569 	 * traversal.  mainly, this keeps anyone from doing a
570 	 * ufs_delete_drain() on top of us.
571 	 */
572 	mutex_enter(&uq->uq_mutex);
573 
574 	ip = uq->uq_ihead;
575 	if (ip != NULL) {
576 		do {
577 			sp->f_bfree += dbtofsb(fs, ip->i_blocks);
578 			sp->f_ffree += 1;
579 			ip = ip->i_freef;
580 		} while (ip != uq->uq_ihead);
581 	}
582 
583 	mutex_exit(&uq->uq_mutex);
584 }
585 
586 /*
587  * IDLE INODE
588  * The following routines implement the protocol for maintaining an
589  * LRU list of idle inodes and for moving the idle inodes to the
590  * reuse list when the number of allocated inodes exceeds the user
591  * tunable high-water mark (ufs_ninode).
592  */
593 
594 /*
595  * clean an idle inode and move it to the reuse list
596  */
597 static void
598 ufs_idle_free(struct inode *ip)
599 {
600 	int			pages;
601 	int			hno;
602 	kmutex_t		*ihm;
603 	struct ufsvfs		*ufsvfsp	= ip->i_ufsvfs;
604 	struct vnode		*vp		= ITOV(ip);
605 
606 	/*
607 	 * inode is held
608 	 */
609 
610 	/*
611 	 * remember `pages' for stats below
612 	 */
613 	pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR);
614 
615 	/*
616 	 * start the dirty pages to disk and then invalidate them
617 	 * unless the inode is invalid (ISTALE)
618 	 */
619 	if ((ip->i_flag & ISTALE) == 0) {
620 		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE);
621 		(void) TRANS_SYNCIP(ip,
622 				    (TRANS_ISERROR(ufsvfsp)) ?
623 				    B_INVAL | B_FORCE : B_INVAL,
624 				    I_ASYNC, TOP_SYNCIP_FREE);
625 	}
626 
627 	/*
628 	 * wait for any current ufs_iget to finish and block future ufs_igets
629 	 */
630 	ASSERT(ip->i_number != 0);
631 	hno = INOHASH(ip->i_number);
632 	ihm = &ih_lock[hno];
633 	mutex_enter(ihm);
634 
635 	/*
636 	 * It must be guaranteed that v_count >= 2, otherwise
637 	 * something must be wrong with this vnode already.
638 	 * That is why we use v_count-- instead of VN_RELE().
639 	 * Acquire the vnode lock in case another thread is in
640 	 * VN_RELE().
641 	 */
642 	mutex_enter(&vp->v_lock);
643 
644 	if (vp->v_count < 2)
645 		cmn_err(CE_PANIC,
646 			"ufs_idle_free: vnode ref count is less than 2");
647 
648 	vp->v_count--;
649 	if ((vp->v_type != VCHR && vn_has_cached_data(vp)) ||
650 		vp->v_count != 1 ||
651 		ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG)) {
652 			/*
653 			 * Another thread has referenced this inode while
654 			 * we are trying to free it. Call VN_RELE() to
655 			 * release our reference.
656 			 */
657 			mutex_exit(&vp->v_lock);
658 			mutex_exit(ihm);
659 			VN_RELE(vp);
660 	} else {
661 		/*
662 		 * The inode is currently unreferenced and can not
663 		 * acquire further references because it has no pages
664 		 * and the hash is locked.  Inodes acquire references
665 		 * via the hash list or via their pages.
666 		 */
667 
668 		mutex_exit(&vp->v_lock);
669 
670 		/*
671 		 * remove it from the cache
672 		 */
673 		remque(ip);
674 		mutex_exit(ihm);
675 		/*
676 		 * Stale inodes have no valid ufsvfs
677 		 */
678 		if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) {
679 			TRANS_DQRELE(ufsvfsp, ip->i_dquot);
680 			ip->i_dquot = NULL;
681 		}
682 		ufs_si_del(ip);
683 		if (pages) {
684 			CPU_STATS_ADDQ(CPU, sys, ufsipage, 1);
685 		} else {
686 			CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1);
687 		}
688 		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
689 		ufs_free_inode(ip);
690 	}
691 }
692 
693 /*
694  * this thread processes the global idle queue
695  */
696 iqhead_t *ufs_junk_iq;
697 iqhead_t *ufs_useful_iq;
698 int ufs_njunk_iq = 0;
699 int ufs_nuseful_iq = 0;
700 int ufs_niqhash;
701 int ufs_iqhashmask;
702 struct ufs_q	ufs_idle_q;
703 
704 void
705 ufs_thread_idle(void)
706 {
707 	callb_cpr_t cprinfo;
708 	int i;
709 	int ne;
710 
711 	ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN;
712 	ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */
713 	ufs_iqhashmask = ufs_niqhash - 1;
714 	ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq),
715 	    KM_SLEEP);
716 	ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq),
717 	    KM_SLEEP);
718 
719 	/* Initialize hash queue headers */
720 	for (i = 0; i < ufs_niqhash; i++) {
721 		ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i];
722 		ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i];
723 		ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i];
724 		ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i];
725 	}
726 
727 	CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr,
728 	    "ufsidle");
729 again:
730 	/*
731 	 * Whenever the idle thread is awakened, it repeatedly gives
732 	 * back half of the idle queue until the idle queue falls
733 	 * below lowat.
734 	 */
735 	mutex_enter(&ufs_idle_q.uq_mutex);
736 	if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) {
737 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
738 		cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex);
739 		CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex);
740 	}
741 	mutex_exit(&ufs_idle_q.uq_mutex);
742 
743 	/*
744 	 * Give back 1/2 of the idle queue
745 	 */
746 	ne = ufs_idle_q.uq_ne >> 1;
747 	ins.in_tidles.value.ul += ne;
748 	ufs_idle_some(ne);
749 	goto again;
750 }
751 
752 /*
753  * Reclaim callback for ufs inode cache.
754  * Invoked by the kernel memory allocator when memory gets tight.
755  */
756 /*ARGSUSED*/
757 void
758 ufs_inode_cache_reclaim(void *cdrarg)
759 {
760 	/*
761 	 * If we are low on memory and the idle queue is over its
762 	 * halfway mark, then free 50% of the idle q
763 	 *
764 	 * We don't free all of the idle inodes because the inodes
765 	 * for popular NFS files may have been kicked from the dnlc.
766 	 * The inodes for these files will end up on the idle queue
767 	 * after every NFS access.
768 	 *
769 	 * If we repeatedly push them from the idle queue then
770 	 * NFS users may be unhappy as an extra buf cache operation
771 	 * is incurred for every NFS operation to these files.
772 	 *
773 	 * It's not common, but I have seen it happen.
774 	 *
775 	 */
776 	if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1))
777 		return;
778 	mutex_enter(&ufs_idle_q.uq_mutex);
779 	cv_broadcast(&ufs_idle_q.uq_cv);
780 	mutex_exit(&ufs_idle_q.uq_mutex);
781 }
782 
783 /*
784  * Free up some idle inodes
785  */
786 void
787 ufs_idle_some(int ne)
788 {
789 	int i;
790 	struct inode *ip;
791 	struct vnode *vp;
792 	static int junk_rotor = 0;
793 	static int useful_rotor = 0;
794 
795 	for (i = 0; i < ne; ++i) {
796 		mutex_enter(&ufs_idle_q.uq_mutex);
797 
798 		if (ufs_njunk_iq) {
799 			while (ufs_junk_iq[junk_rotor].i_freef ==
800 			    (inode_t *)&ufs_junk_iq[junk_rotor]) {
801 				junk_rotor = IQNEXT(junk_rotor);
802 			}
803 			ip = ufs_junk_iq[junk_rotor].i_freef;
804 			ASSERT(ip->i_flag & IJUNKIQ);
805 		} else if (ufs_nuseful_iq) {
806 			while (ufs_useful_iq[useful_rotor].i_freef ==
807 			    (inode_t *)&ufs_useful_iq[useful_rotor]) {
808 				useful_rotor = IQNEXT(useful_rotor);
809 			}
810 			ip = ufs_useful_iq[useful_rotor].i_freef;
811 			ASSERT(!(ip->i_flag & IJUNKIQ));
812 		} else {
813 			mutex_exit(&ufs_idle_q.uq_mutex);
814 			return;
815 		}
816 
817 		/*
818 		 * emulate ufs_iget
819 		 */
820 		vp = ITOV(ip);
821 		VN_HOLD(vp);
822 		mutex_exit(&ufs_idle_q.uq_mutex);
823 		rw_enter(&ip->i_contents, RW_WRITER);
824 		/*
825 		 * VN_RELE should not be called if
826 		 * ufs_rmidle returns true, as it will
827 		 * effectively be done in ufs_idle_free.
828 		 */
829 		if (ufs_rmidle(ip)) {
830 			rw_exit(&ip->i_contents);
831 			ufs_idle_free(ip);
832 		} else {
833 			rw_exit(&ip->i_contents);
834 			VN_RELE(vp);
835 		}
836 	}
837 }
838 
839 /*
840  * drain entries for vfsp from the idle queue
841  * vfsp == NULL means drain the entire thing
842  */
843 void
844 ufs_idle_drain(struct vfs *vfsp)
845 {
846 	struct inode	*ip, *nip;
847 	struct inode	*ianchor = NULL;
848 	int		i;
849 
850 	mutex_enter(&ufs_idle_q.uq_mutex);
851 	if (ufs_njunk_iq) {
852 		/* for each hash q */
853 		for (i = 0; i < ufs_niqhash; i++) {
854 			/* search down the hash q */
855 			for (ip = ufs_junk_iq[i].i_freef;
856 			    ip != (inode_t *)&ufs_junk_iq[i];
857 			    ip = ip->i_freef) {
858 				if (ip->i_vfs == vfsp || vfsp == NULL) {
859 					/* found a matching entry */
860 					VN_HOLD(ITOV(ip));
861 					mutex_exit(&ufs_idle_q.uq_mutex);
862 					rw_enter(&ip->i_contents, RW_WRITER);
863 					/*
864 					 * See comments in ufs_idle_some()
865 					 * as we will call ufs_idle_free()
866 					 * after scanning both queues.
867 					 */
868 					if (ufs_rmidle(ip)) {
869 						rw_exit(&ip->i_contents);
870 						ip->i_freef = ianchor;
871 						ianchor = ip;
872 					} else {
873 						rw_exit(&ip->i_contents);
874 						VN_RELE(ITOV(ip));
875 					}
876 					/* restart this hash q */
877 					ip = (inode_t *)&ufs_junk_iq[i];
878 					mutex_enter(&ufs_idle_q.uq_mutex);
879 				}
880 			}
881 		}
882 	}
883 	if (ufs_nuseful_iq) {
884 		/* for each hash q */
885 		for (i = 0; i < ufs_niqhash; i++) {
886 			/* search down the hash q */
887 			for (ip = ufs_useful_iq[i].i_freef;
888 			    ip != (inode_t *)&ufs_useful_iq[i];
889 			    ip = ip->i_freef) {
890 				if (ip->i_vfs == vfsp || vfsp == NULL) {
891 					/* found a matching entry */
892 					VN_HOLD(ITOV(ip));
893 					mutex_exit(&ufs_idle_q.uq_mutex);
894 					rw_enter(&ip->i_contents, RW_WRITER);
895 					/*
896 					 * See comments in ufs_idle_some()
897 					 * as we will call ufs_idle_free()
898 					 * after scanning both queues.
899 					 */
900 					if (ufs_rmidle(ip)) {
901 						rw_exit(&ip->i_contents);
902 						ip->i_freef = ianchor;
903 						ianchor = ip;
904 					} else {
905 						rw_exit(&ip->i_contents);
906 						VN_RELE(ITOV(ip));
907 					}
908 					/* restart this hash q */
909 					ip = (inode_t *)&ufs_useful_iq[i];
910 					mutex_enter(&ufs_idle_q.uq_mutex);
911 				}
912 			}
913 		}
914 	}
915 
916 	mutex_exit(&ufs_idle_q.uq_mutex);
917 	/* no more matching entries, release those we have found (if any) */
918 	for (ip = ianchor; ip; ip = nip) {
919 		nip = ip->i_freef;
920 		ip->i_freef = ip;
921 		ufs_idle_free(ip);
922 	}
923 }
924 
925 /*
926  * RECLAIM DELETED INODES
927  * The following thread scans the file system once looking for deleted files
928  */
929 void
930 ufs_thread_reclaim(struct vfs *vfsp)
931 {
932 	struct ufsvfs		*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
933 	struct ufs_q		*uq	= &ufsvfsp->vfs_reclaim;
934 	struct fs		*fs	= ufsvfsp->vfs_fs;
935 	struct buf		*bp	= 0;
936 	int			err	= 0;
937 	daddr_t			bno;
938 	ino_t			ino;
939 	struct dinode		*dp;
940 	struct inode		*ip;
941 	callb_cpr_t		cprinfo;
942 
943 	CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
944 	    "ufsreclaim");
945 
946 	/*
947 	 * mount decided that we don't need a reclaim thread
948 	 */
949 	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
950 		err++;
951 
952 	/*
953 	 * don't reclaim if readonly
954 	 */
955 	if (fs->fs_ronly)
956 		err++;
957 
958 	for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) {
959 
960 		/*
961 		 * Check whether we are the target of another
962 		 * thread having called ufs_thread_exit() or
963 		 * ufs_thread_suspend().
964 		 */
965 		mutex_enter(&uq->uq_mutex);
966 again:
967 		if (uq->uq_flags & UQ_EXIT) {
968 			err++;
969 			mutex_exit(&uq->uq_mutex);
970 			break;
971 		} else if (uq->uq_flags & UQ_SUSPEND) {
972 			uq->uq_flags |= UQ_SUSPENDED;
973 			/*
974 			 * Release the buf before we cv_wait()
975 			 * otherwise we may deadlock with the
976 			 * thread that called ufs_thread_suspend().
977 			 */
978 			if (bp) {
979 				brelse(bp);
980 				bp = 0;
981 			}
982 			if (uq->uq_flags & UQ_WAIT) {
983 				uq->uq_flags &= ~UQ_WAIT;
984 				cv_broadcast(&uq->uq_cv);
985 			}
986 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
987 			cv_wait(&uq->uq_cv, &uq->uq_mutex);
988 			CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex);
989 			goto again;
990 		}
991 		mutex_exit(&uq->uq_mutex);
992 
993 		/*
994 		 * if we don't already have the buf; get it
995 		 */
996 		bno = fsbtodb(fs, itod(fs, ino));
997 		if ((bp == 0) || (bp->b_blkno != bno)) {
998 			if (bp)
999 				brelse(bp);
1000 			bp = UFS_BREAD(ufsvfsp,
1001 					ufsvfsp->vfs_dev, bno, fs->fs_bsize);
1002 			bp->b_flags |= B_AGE;
1003 		}
1004 		if (bp->b_flags & B_ERROR) {
1005 			err++;
1006 			continue;
1007 		}
1008 		/*
1009 		 * nlink <= 0 and mode != 0 means deleted
1010 		 */
1011 		dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino);
1012 		if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) {
1013 			/*
1014 			 * can't hold the buf (deadlock)
1015 			 */
1016 			brelse(bp);
1017 			bp = 0;
1018 			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1019 			/*
1020 			 * iget/iput sequence will put inode on ifree
1021 			 * thread queue if it is idle.  This is a nop
1022 			 * for busy (open, deleted) inodes
1023 			 */
1024 			if (ufs_iget(vfsp, ino, &ip, CRED()))
1025 				err++;
1026 			else
1027 				VN_RELE(ITOV(ip));
1028 			rw_exit(&ufsvfsp->vfs_dqrwlock);
1029 		}
1030 	}
1031 
1032 	if (bp)
1033 		brelse(bp);
1034 	if (!err) {
1035 		/*
1036 		 * reset the reclaiming-bit
1037 		 */
1038 		mutex_enter(&ufsvfsp->vfs_lock);
1039 		fs->fs_reclaim &= ~FS_RECLAIMING;
1040 		mutex_exit(&ufsvfsp->vfs_lock);
1041 		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM);
1042 	}
1043 
1044 	/*
1045 	 * exit the reclaim thread
1046 	 */
1047 	mutex_enter(&uq->uq_mutex);
1048 	uq->uq_threadp = NULL;
1049 	uq->uq_flags &= ~UQ_WAIT;
1050 	cv_broadcast(&uq->uq_cv);
1051 	CALLB_CPR_EXIT(&cprinfo);
1052 	thread_exit();
1053 }
1054 /*
1055  * HLOCK FILE SYSTEM
1056  *	hlock the file system's whose logs have device errors
1057  */
1058 struct ufs_q	ufs_hlock;
1059 /*ARGSUSED*/
1060 void
1061 ufs_thread_hlock(void *ignore)
1062 {
1063 	int		retry;
1064 	callb_cpr_t	cprinfo;
1065 
1066 	CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr,
1067 	    "ufshlock");
1068 
1069 	for (;;) {
1070 		/*
1071 		 * sleep until there is work to do
1072 		 */
1073 		mutex_enter(&ufs_hlock.uq_mutex);
1074 		(void) ufs_thread_run(&ufs_hlock, &cprinfo);
1075 		ufs_hlock.uq_ne = 0;
1076 		mutex_exit(&ufs_hlock.uq_mutex);
1077 		/*
1078 		 * hlock the error'ed fs's
1079 		 *	retry after a bit if another app is doing lockfs stuff
1080 		 */
1081 		do {
1082 			retry = ufs_trans_hlock();
1083 			if (retry) {
1084 				mutex_enter(&ufs_hlock.uq_mutex);
1085 				CALLB_CPR_SAFE_BEGIN(&cprinfo);
1086 				(void) cv_timedwait(&ufs_hlock.uq_cv,
1087 							&ufs_hlock.uq_mutex,
1088 							lbolt + hz);
1089 				CALLB_CPR_SAFE_END(&cprinfo,
1090 				    &ufs_hlock.uq_mutex);
1091 				mutex_exit(&ufs_hlock.uq_mutex);
1092 			}
1093 		} while (retry);
1094 	}
1095 }
1096 
1097 static void
1098 ufs_attr_purge(struct inode *dp)
1099 {
1100 	int	err;
1101 	int	error;
1102 	off_t 	dirsize;			/* size of the directory */
1103 	off_t 	offset;	/* offset in the directory */
1104 	int entryoffsetinblk;		/* offset of ep in fbp's buffer */
1105 	struct inode *tp;
1106 	struct fbuf *fbp;	/* pointer to directory block */
1107 	struct direct *ep;	/* directory entry */
1108 	int trans_size;
1109 	int issync;
1110 	struct ufsvfs	*ufsvfsp = dp->i_ufsvfs;
1111 
1112 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1113 
1114 	fbp = NULL;
1115 	dirsize = roundup(dp->i_size, DIRBLKSIZ);
1116 	offset = 0;
1117 	entryoffsetinblk = 0;
1118 
1119 	/*
1120 	 * Purge directory cache
1121 	 */
1122 
1123 	dnlc_dir_purge(&dp->i_danchor);
1124 
1125 	while (offset < dirsize) {
1126 		/*
1127 		 * If offset is on a block boundary,
1128 		 * read the next directory block.
1129 		 * Release previous if it exists.
1130 		 */
1131 		if (blkoff(dp->i_fs, offset) == 0) {
1132 			if (fbp != NULL) {
1133 				fbrelse(fbp, S_OTHER);
1134 			}
1135 
1136 			err = blkatoff(dp, offset, (char **)0, &fbp);
1137 			if (err) {
1138 				goto out;
1139 			}
1140 			entryoffsetinblk = 0;
1141 		}
1142 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1143 		if (ep->d_ino == 0 || (ep->d_name[0] == '.' &&
1144 		    ep->d_name[1] == '\0') ||
1145 		    (ep->d_name[0] == '.' && ep->d_name[1] == '.' &&
1146 		    ep->d_name[2] == '\0')) {
1147 
1148 			entryoffsetinblk += ep->d_reclen;
1149 
1150 		} else {
1151 
1152 			if ((err = ufs_iget(dp->i_vfs, ep->d_ino,
1153 			    &tp, CRED())) != 0) {
1154 				goto out;
1155 			}
1156 
1157 			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
1158 			    trans_size = (int)TOP_REMOVE_SIZE(tp));
1159 
1160 			/*
1161 			 * Delete inode.
1162 			 */
1163 
1164 			dnlc_remove(ITOV(dp), ep->d_name);
1165 
1166 			rw_enter(&tp->i_contents, RW_WRITER);
1167 			tp->i_flag |= ICHG;
1168 			tp->i_seq++;
1169 			TRANS_INODE(tp->i_ufsvfs, tp);
1170 			tp->i_nlink--;
1171 			ufs_setreclaim(tp);
1172 			ITIMES_NOLOCK(tp);
1173 			rw_exit(&tp->i_contents);
1174 
1175 			VN_RELE(ITOV(tp));
1176 			entryoffsetinblk += ep->d_reclen;
1177 			TRANS_END_CSYNC(ufsvfsp, error,
1178 			    issync, TOP_REMOVE, trans_size);
1179 
1180 		}
1181 		offset += ep->d_reclen;
1182 	}
1183 
1184 	if (fbp) {
1185 		fbrelse(fbp, S_OTHER);
1186 	}
1187 
1188 out:
1189 	rw_exit(&ufsvfsp->vfs_dqrwlock);
1190 }
1191