1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /* copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29 /*
30 * Portions of this source code were derived from Berkeley 4.3 BSD
31 * under license from the Regents of the University of California.
32 */
33
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/errno.h>
37 #include <sys/kmem.h>
38 #include <sys/buf.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/user.h>
42 #include <sys/callb.h>
43 #include <sys/cpuvar.h>
44 #include <sys/fs/ufs_inode.h>
45 #include <sys/fs/ufs_log.h>
46 #include <sys/fs/ufs_trans.h>
47 #include <sys/fs/ufs_acl.h>
48 #include <sys/fs/ufs_bio.h>
49 #include <sys/fs/ufs_fsdir.h>
50 #include <sys/debug.h>
51 #include <sys/cmn_err.h>
52 #include <sys/sysmacros.h>
53 #include <vm/pvn.h>
54
55 extern pri_t minclsyspri;
56 extern int hash2ints();
57 extern struct kmem_cache *inode_cache; /* cache of free inodes */
58 extern int ufs_idle_waiters;
59 extern struct instats ins;
60
61 static void ufs_attr_purge(struct inode *);
62
63 /*
64 * initialize a thread's queue struct
65 */
66 void
ufs_thread_init(struct ufs_q * uq,int lowat)67 ufs_thread_init(struct ufs_q *uq, int lowat)
68 {
69 bzero((caddr_t)uq, sizeof (*uq));
70 cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL);
71 mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL);
72 uq->uq_lowat = lowat;
73 uq->uq_hiwat = 2 * lowat;
74 uq->uq_threadp = NULL;
75 }
76
77 /*
78 * start a thread for a queue (assumes success)
79 */
80 void
ufs_thread_start(struct ufs_q * uq,void (* func)(),struct vfs * vfsp)81 ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp)
82 {
83 mutex_enter(&uq->uq_mutex);
84 if (uq->uq_threadp == NULL) {
85 uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0,
86 TS_RUN, minclsyspri);
87 uq->uq_flags = 0;
88 }
89 mutex_exit(&uq->uq_mutex);
90 }
91
92 /*
93 * wait for the thread to exit
94 */
95 void
ufs_thread_exit(struct ufs_q * uq)96 ufs_thread_exit(struct ufs_q *uq)
97 {
98 kt_did_t ufs_thread_did = 0;
99
100 mutex_enter(&uq->uq_mutex);
101 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
102 if (uq->uq_threadp != NULL) {
103 ufs_thread_did = uq->uq_threadp->t_did;
104 uq->uq_flags |= (UQ_EXIT|UQ_WAIT);
105 cv_broadcast(&uq->uq_cv);
106 }
107 mutex_exit(&uq->uq_mutex);
108
109 /*
110 * It's safe to call thread_join() with an already-gone
111 * t_did, but we have to obtain it before the kernel
112 * thread structure is freed. We do so above under the
113 * protection of the uq_mutex when we're sure the thread
114 * still exists and it's save to de-reference it.
115 * We also have to check if ufs_thread_did is != 0
116 * before calling thread_join() since thread 0 in the system
117 * gets a t_did of 0.
118 */
119 if (ufs_thread_did)
120 thread_join(ufs_thread_did);
121 }
122
123 /*
124 * wait for a thread to suspend itself on the caller's behalf
125 * the caller is responsible for continuing the thread
126 */
127 void
ufs_thread_suspend(struct ufs_q * uq)128 ufs_thread_suspend(struct ufs_q *uq)
129 {
130 mutex_enter(&uq->uq_mutex);
131 if (uq->uq_threadp != NULL) {
132 /*
133 * wait while another thread is suspending this thread.
134 * no need to do a cv_broadcast(), as whoever suspended
135 * the thread must continue it at some point.
136 */
137 while ((uq->uq_flags & UQ_SUSPEND) &&
138 (uq->uq_threadp != NULL)) {
139 /*
140 * We can't use cv_signal() because if our
141 * signal doesn't happen to hit the desired
142 * thread but instead some other waiter like
143 * ourselves, we'll wait forever for a
144 * response. Well, at least an indeterminate
145 * amount of time until we just happen to get
146 * lucky from whomever did get signalled doing
147 * a cv_signal() of their own. This is an
148 * unfortunate performance lossage.
149 */
150 uq->uq_flags |= UQ_WAIT;
151 cv_wait(&uq->uq_cv, &uq->uq_mutex);
152 }
153
154 uq->uq_flags |= (UQ_SUSPEND | UQ_WAIT);
155
156 /*
157 * wait for the thread to suspend itself
158 */
159 if ((uq->uq_flags & UQ_SUSPENDED) == 0 &&
160 (uq->uq_threadp != NULL)) {
161 cv_broadcast(&uq->uq_cv);
162 }
163
164 while (((uq->uq_flags & UQ_SUSPENDED) == 0) &&
165 (uq->uq_threadp != NULL)) {
166 cv_wait(&uq->uq_cv, &uq->uq_mutex);
167 }
168 }
169 mutex_exit(&uq->uq_mutex);
170 }
171
172 /*
173 * allow a thread to continue from a ufs_thread_suspend()
174 * This thread must be the same as the thread that called
175 * ufs_thread_suspend.
176 */
177 void
ufs_thread_continue(struct ufs_q * uq)178 ufs_thread_continue(struct ufs_q *uq)
179 {
180 mutex_enter(&uq->uq_mutex);
181 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
182 cv_broadcast(&uq->uq_cv);
183 mutex_exit(&uq->uq_mutex);
184 }
185
186 /*
187 * some common code for managing a threads execution
188 * uq is locked at entry and return
189 * may sleep
190 * may exit
191 */
192 /*
193 * Kind of a hack passing in the callb_cpr_t * here.
194 * It should really be part of the ufs_q structure.
195 * I did not put it in there because we are already in beta
196 * and I was concerned that changing ufs_inode.h to include
197 * callb.h might break something.
198 */
199 int
ufs_thread_run(struct ufs_q * uq,callb_cpr_t * cprinfop)200 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop)
201 {
202 again:
203 ASSERT(uq->uq_ne >= 0);
204
205 if (uq->uq_flags & UQ_SUSPEND) {
206 uq->uq_flags |= UQ_SUSPENDED;
207 } else if (uq->uq_flags & UQ_EXIT) {
208 /*
209 * exiting; empty the queue (may infinite loop)
210 */
211 if (uq->uq_ne)
212 return (uq->uq_ne);
213 uq->uq_threadp = NULL;
214 if (uq->uq_flags & UQ_WAIT) {
215 cv_broadcast(&uq->uq_cv);
216 }
217 uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT);
218 CALLB_CPR_EXIT(cprinfop);
219 thread_exit();
220 } else if (uq->uq_ne >= uq->uq_lowat) {
221 /*
222 * process a block of entries until below high water mark
223 */
224 return (uq->uq_ne - (uq->uq_lowat >> 1));
225 }
226 if (uq->uq_flags & UQ_WAIT) {
227 uq->uq_flags &= ~UQ_WAIT;
228 cv_broadcast(&uq->uq_cv);
229 }
230 CALLB_CPR_SAFE_BEGIN(cprinfop);
231 cv_wait(&uq->uq_cv, &uq->uq_mutex);
232 CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex);
233 goto again;
234 }
235
236 /*
237 * DELETE INODE
238 * The following routines implement the protocol for freeing the resources
239 * held by an idle and deleted inode.
240 */
241 void
ufs_delete(struct ufsvfs * ufsvfsp,struct inode * ip,int dolockfs)242 ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs)
243 {
244 ushort_t mode;
245 struct vnode *vp = ITOV(ip);
246 struct ulockfs *ulp;
247 int trans_size;
248 int dorwlock = ((ip->i_mode & IFMT) == IFREG);
249 int issync;
250 int err;
251 struct inode *dp;
252 struct ufs_q *delq = &ufsvfsp->vfs_delete;
253 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
254
255 /*
256 * Ignore if deletes are not allowed (wlock/hlock)
257 */
258 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
259 mutex_enter(&delq->uq_mutex);
260 delq_info->delq_unreclaimed_blocks -= ip->i_blocks;
261 delq_info->delq_unreclaimed_files--;
262 mutex_exit(&delq->uq_mutex);
263 VN_RELE(vp);
264 return;
265 }
266
267 if ((vp->v_count > 1) || (ip->i_mode == 0)) {
268 mutex_enter(&delq->uq_mutex);
269 delq_info->delq_unreclaimed_blocks -= ip->i_blocks;
270 delq_info->delq_unreclaimed_files--;
271 mutex_exit(&delq->uq_mutex);
272 VN_RELE(vp);
273 return;
274 }
275 /*
276 * If we are called as part of setting a fs lock, then only
277 * do part of the lockfs protocol. In other words, don't hang.
278 */
279 if (dolockfs) {
280 if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK))
281 return;
282 } else {
283 /*
284 * check for recursive VOP call
285 */
286 if (curthread->t_flag & T_DONTBLOCK) {
287 ulp = NULL;
288 } else {
289 ulp = &ufsvfsp->vfs_ulockfs;
290 curthread->t_flag |= T_DONTBLOCK;
291 }
292 }
293
294 /*
295 * Hold rwlock to synchronize with (nfs) writes
296 */
297 if (dorwlock)
298 rw_enter(&ip->i_rwlock, RW_WRITER);
299
300 /*
301 * Delete the attribute directory.
302 */
303 if (ip->i_oeftflag != 0) {
304 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
305 trans_size = (int)TOP_REMOVE_SIZE(ip));
306 rw_enter(&ip->i_contents, RW_WRITER);
307 err = ufs_iget(ip->i_vfs, ip->i_oeftflag,
308 &dp, CRED());
309 if (err == 0) {
310 rw_enter(&dp->i_rwlock, RW_WRITER);
311 rw_enter(&dp->i_contents, RW_WRITER);
312 dp->i_flag |= IUPD|ICHG;
313 dp->i_seq++;
314 TRANS_INODE(dp->i_ufsvfs, dp);
315 dp->i_nlink -= 2;
316 ufs_setreclaim(dp);
317 /*
318 * Should get rid of any negative cache entries that
319 * might be lingering, as well as ``.'' and
320 * ``..''. If we don't, the VN_RELE() below
321 * won't actually put dp on the delete queue
322 * and it'll hang out until someone forces it
323 * (lockfs -f, umount, ...). The only reliable
324 * way of doing this at the moment is to call
325 * dnlc_purge_vp(ITOV(dp)), which is unacceptably
326 * slow, so we'll just note the problem in this
327 * comment for now.
328 */
329 dnlc_remove(ITOV(dp), ".");
330 dnlc_remove(ITOV(dp), "..");
331 ITIMES_NOLOCK(dp);
332 if (!TRANS_ISTRANS(ufsvfsp)) {
333 ufs_iupdat(dp, I_SYNC);
334 }
335 rw_exit(&dp->i_contents);
336 rw_exit(&dp->i_rwlock);
337 VN_RELE(ITOV(dp));
338 }
339 /*
340 * Clear out attribute pointer
341 */
342 ip->i_oeftflag = 0;
343 rw_exit(&ip->i_contents);
344 TRANS_END_CSYNC(ufsvfsp, err, issync,
345 TOP_REMOVE, trans_size);
346 dnlc_remove(ITOV(ip), XATTR_DIR_NAME);
347 }
348
349 if ((ip->i_mode & IFMT) == IFATTRDIR) {
350 ufs_attr_purge(ip);
351 }
352
353 (void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED());
354
355 /*
356 * the inode's space has been freed; now free the inode
357 */
358 if (ulp) {
359 trans_size = TOP_IFREE_SIZE(ip);
360 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
361 }
362 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
363 rw_enter(&ip->i_contents, RW_WRITER);
364 TRANS_INODE(ufsvfsp, ip);
365 mode = ip->i_mode;
366 ip->i_mode = 0;
367 ip->i_rdev = 0;
368 ip->i_ordev = 0;
369 ip->i_flag |= IMOD;
370 if (ip->i_ufs_acl) {
371 (void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED());
372 ip->i_ufs_acl = NULL;
373 ip->i_shadow = 0;
374 }
375
376 /*
377 * This inode is torn down but still retains it's identity
378 * (inode number). It could get recycled soon so it's best
379 * to clean up the vnode just in case.
380 */
381 mutex_enter(&vp->v_lock);
382 vn_recycle(vp);
383 mutex_exit(&vp->v_lock);
384
385 /*
386 * free the inode
387 */
388 ufs_ifree(ip, ip->i_number, mode);
389 /*
390 * release quota resources; can't fail
391 */
392 (void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data,
393 /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(),
394 (char **)NULL, (size_t *)NULL);
395 dqrele(ip->i_dquot);
396 ip->i_dquot = NULL;
397 ip->i_flag &= ~(IDEL | IDIRECTIO);
398 ip->i_cflags = 0;
399 if (!TRANS_ISTRANS(ufsvfsp)) {
400 ufs_iupdat(ip, I_SYNC);
401 } else {
402 mutex_enter(&delq->uq_mutex);
403 delq_info->delq_unreclaimed_files--;
404 mutex_exit(&delq->uq_mutex);
405 }
406 rw_exit(&ip->i_contents);
407 rw_exit(&ufsvfsp->vfs_dqrwlock);
408 if (dorwlock)
409 rw_exit(&ip->i_rwlock);
410 VN_RELE(vp);
411
412 /*
413 * End of transaction
414 */
415 if (ulp) {
416 TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
417 if (dolockfs)
418 ufs_lockfs_end(ulp);
419 else
420 curthread->t_flag &= ~T_DONTBLOCK;
421 }
422 }
423
424 /*
425 * Create the delete thread and init the delq_info for this fs
426 */
427 void
ufs_delete_init(struct ufsvfs * ufsvfsp,int lowat)428 ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat)
429 {
430 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
431
432 ufs_thread_init(&ufsvfsp->vfs_delete, lowat);
433 (void) memset((void *)delq_info, 0, sizeof (*delq_info));
434 }
435
436 /*
437 * thread that frees up deleted inodes
438 */
439 void
ufs_thread_delete(struct vfs * vfsp)440 ufs_thread_delete(struct vfs *vfsp)
441 {
442 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
443 struct ufs_q *uq = &ufsvfsp->vfs_delete;
444 struct inode *ip;
445 long ne;
446 callb_cpr_t cprinfo;
447
448 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
449 "ufsdelete");
450
451 mutex_enter(&uq->uq_mutex);
452 again:
453 /*
454 * Sleep until there is work to do. Only do one entry at
455 * a time, to reduce the wait time for checking for a suspend
456 * request. The ?: is for pedantic portability.
457 */
458 ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0;
459
460 /*
461 * process an entry, if there are any
462 */
463 if (ne && (ip = uq->uq_ihead)) {
464 /*
465 * process first entry on queue. Assumed conditions are:
466 * ip is held (v_count >= 1)
467 * ip is referenced (i_flag & IREF)
468 * ip is free (i_nlink <= 0)
469 */
470 if ((uq->uq_ihead = ip->i_freef) == ip)
471 uq->uq_ihead = NULL;
472 ip->i_freef->i_freeb = ip->i_freeb;
473 ip->i_freeb->i_freef = ip->i_freef;
474 ip->i_freef = ip;
475 ip->i_freeb = ip;
476 uq->uq_ne--;
477 mutex_exit(&uq->uq_mutex);
478 ufs_delete(ufsvfsp, ip, 1);
479 mutex_enter(&uq->uq_mutex);
480 }
481 goto again;
482 }
483
484 /*
485 * drain ne entries off the delete queue. As new queue entries may
486 * be added while we're working, ne is interpreted as follows:
487 *
488 * ne > 0 => remove up to ne entries
489 * ne == 0 => remove all entries currently on the queue
490 * ne == -1 => remove entries until the queue is empty
491 */
492 void
ufs_delete_drain(struct vfs * vfsp,int ne,int dolockfs)493 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs)
494 {
495 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
496 struct ufs_q *uq;
497 struct inode *ip;
498 int drain_cnt = 0;
499 int done;
500
501 /*
502 * if forcibly unmounted; ignore
503 */
504 if (ufsvfsp == NULL)
505 return;
506
507 uq = &ufsvfsp->vfs_delete;
508 mutex_enter(&uq->uq_mutex);
509 if (ne == 0)
510 drain_cnt = uq->uq_ne;
511 else if (ne > 0)
512 drain_cnt = ne;
513
514 /*
515 * process up to ne entries
516 */
517
518 done = 0;
519 while (!done && (ip = uq->uq_ihead)) {
520 if (ne != -1)
521 drain_cnt--;
522 if (ne != -1 && drain_cnt == 0)
523 done = 1;
524 if ((uq->uq_ihead = ip->i_freef) == ip)
525 uq->uq_ihead = NULL;
526 ip->i_freef->i_freeb = ip->i_freeb;
527 ip->i_freeb->i_freef = ip->i_freef;
528 ip->i_freef = ip;
529 ip->i_freeb = ip;
530 uq->uq_ne--;
531 mutex_exit(&uq->uq_mutex);
532 ufs_delete(ufsvfsp, ip, dolockfs);
533 mutex_enter(&uq->uq_mutex);
534 }
535 mutex_exit(&uq->uq_mutex);
536 }
537
538 void
ufs_sync_with_thread(struct ufs_q * uq)539 ufs_sync_with_thread(struct ufs_q *uq)
540 {
541 mutex_enter(&uq->uq_mutex);
542
543 /*
544 * Wake up delete thread to free up space.
545 */
546 if ((uq->uq_flags & UQ_WAIT) == 0) {
547 uq->uq_flags |= UQ_WAIT;
548 cv_broadcast(&uq->uq_cv);
549 }
550
551 while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) {
552 cv_wait(&uq->uq_cv, &uq->uq_mutex);
553 }
554
555 mutex_exit(&uq->uq_mutex);
556 }
557
558 /*
559 * Get rid of everything that's currently in the delete queue,
560 * plus whatever the delete thread is working on at the moment.
561 *
562 * This ability is required for providing true POSIX semantics
563 * regarding close(2), unlink(2), etc, even when logging is enabled.
564 * The standard requires that the released space be immediately
565 * observable (statvfs(2)) and allocatable (e.g., write(2)).
566 */
567 void
ufs_delete_drain_wait(struct ufsvfs * ufsvfsp,int dolockfs)568 ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs)
569 {
570 struct ufs_q *uq = &ufsvfsp->vfs_delete;
571 int error;
572 struct ufs_q *delq = &ufsvfsp->vfs_delete;
573 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
574
575 /*
576 * If there is something on delq or delete thread
577 * working on delq.
578 */
579 mutex_enter(&delq->uq_mutex);
580 if (delq_info->delq_unreclaimed_files > 0) {
581 mutex_exit(&delq->uq_mutex);
582 (void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs);
583 ufs_sync_with_thread(uq);
584 } else {
585 ASSERT(delq_info->delq_unreclaimed_files == 0);
586 mutex_exit(&delq->uq_mutex);
587 return;
588 }
589
590 /*
591 * Commit any outstanding transactions to make sure
592 * any canceled freed blocks are available for allocation.
593 */
594 curthread->t_flag |= T_DONTBLOCK;
595 TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error);
596 if (!error) {
597 TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE,
598 TOP_COMMIT_SIZE);
599 }
600 curthread->t_flag &= ~T_DONTBLOCK;
601 }
602
603 /*
604 * Adjust the resource usage in a struct statvfs based on
605 * what's in the delete queue.
606 *
607 * We do not consider the impact of ACLs or extended attributes
608 * that may be deleted as a side-effect of deleting a file.
609 * Those are metadata, and their sizes aren't reflected in the
610 * sizes returned by stat(), so this is not a problem.
611 */
612 void
ufs_delete_adjust_stats(struct ufsvfs * ufsvfsp,struct statvfs64 * sp)613 ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp)
614 {
615 struct ufs_q *uq = &ufsvfsp->vfs_delete;
616 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
617
618 mutex_enter(&uq->uq_mutex);
619 /*
620 * The blocks accounted for in the delete queue info are
621 * counted in DEV_BSIZE chunks, but ufs_statvfs counts in
622 * filesystem fragments, so a conversion is required here.
623 */
624 sp->f_bfree += dbtofsb(ufsvfsp->vfs_fs,
625 delq_info->delq_unreclaimed_blocks);
626 sp->f_ffree += delq_info->delq_unreclaimed_files;
627 mutex_exit(&uq->uq_mutex);
628 }
629
630 /*
631 * IDLE INODE
632 * The following routines implement the protocol for maintaining an
633 * LRU list of idle inodes and for moving the idle inodes to the
634 * reuse list when the number of allocated inodes exceeds the user
635 * tunable high-water mark (ufs_ninode).
636 */
637
638 /*
639 * clean an idle inode and move it to the reuse list
640 */
641 static void
ufs_idle_free(struct inode * ip)642 ufs_idle_free(struct inode *ip)
643 {
644 int pages;
645 int hno;
646 kmutex_t *ihm;
647 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
648 struct vnode *vp = ITOV(ip);
649 int vn_has_data, vn_modified;
650
651 /*
652 * inode is held
653 */
654
655 /*
656 * remember `pages' for stats below
657 */
658 pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR);
659
660 /*
661 * start the dirty pages to disk and then invalidate them
662 * unless the inode is invalid (ISTALE)
663 */
664 if ((ip->i_flag & ISTALE) == 0) {
665 (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE);
666 (void) TRANS_SYNCIP(ip,
667 (TRANS_ISERROR(ufsvfsp)) ? B_INVAL | B_FORCE : B_INVAL,
668 I_ASYNC, TOP_SYNCIP_FREE);
669 }
670
671 /*
672 * wait for any current ufs_iget to finish and block future ufs_igets
673 */
674 ASSERT(ip->i_number != 0);
675 hno = INOHASH(ip->i_number);
676 ihm = &ih_lock[hno];
677 mutex_enter(ihm);
678
679 /*
680 * It must be guaranteed that v_count >= 2, otherwise
681 * something must be wrong with this vnode already.
682 * That is why we use v_count-- instead of VN_RELE().
683 * Acquire the vnode lock in case another thread is in
684 * VN_RELE().
685 */
686 mutex_enter(&vp->v_lock);
687
688 if (vp->v_count < 2)
689 cmn_err(CE_PANIC,
690 "ufs_idle_free: vnode ref count is less than 2");
691
692 vp->v_count--;
693
694 vn_has_data = (vp->v_type != VCHR && vn_has_cached_data(vp));
695 vn_modified = (ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG));
696
697 if (vp->v_count != 1 ||
698 ((vn_has_data || vn_modified) &&
699 ((ip->i_flag & ISTALE) == 0))) {
700 /*
701 * Another thread has referenced this inode while
702 * we are trying to free it. Call VN_RELE() to
703 * release our reference, if v_count > 1 data is
704 * present or one of the modified etc. flags was
705 * set, whereby ISTALE wasn't set.
706 * If we'd proceed with ISTALE set here, we might
707 * get ourselves into a deadlock situation.
708 */
709 mutex_exit(&vp->v_lock);
710 mutex_exit(ihm);
711 VN_RELE(vp);
712 } else {
713 /*
714 * The inode is currently unreferenced and can not
715 * acquire further references because it has no pages
716 * and the hash is locked. Inodes acquire references
717 * via the hash list or via their pages.
718 */
719
720 mutex_exit(&vp->v_lock);
721
722 /*
723 * remove it from the cache
724 */
725 remque(ip);
726 mutex_exit(ihm);
727 /*
728 * Stale inodes have no valid ufsvfs
729 */
730 if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) {
731 TRANS_DQRELE(ufsvfsp, ip->i_dquot);
732 ip->i_dquot = NULL;
733 }
734 if ((ip->i_flag & ISTALE) &&
735 vn_has_data) {
736 /*
737 * ISTALE inodes may have data
738 * and this data needs to be
739 * cleaned up.
740 */
741 (void) pvn_vplist_dirty(vp, (u_offset_t)0,
742 ufs_putapage, B_INVAL | B_TRUNC,
743 (struct cred *)NULL);
744 }
745 ufs_si_del(ip);
746 if (pages) {
747 CPU_STATS_ADDQ(CPU, sys, ufsipage, 1);
748 } else {
749 CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1);
750 }
751 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
752
753 /*
754 * We had better not have a vnode reference count > 1
755 * at this point, if we do then something is broken as
756 * this inode/vnode acquired a reference underneath of us.
757 */
758 ASSERT(vp->v_count == 1);
759
760 ufs_free_inode(ip);
761 }
762 }
763
764 /*
765 * this thread processes the global idle queue
766 */
767 iqhead_t *ufs_junk_iq;
768 iqhead_t *ufs_useful_iq;
769 int ufs_njunk_iq = 0;
770 int ufs_nuseful_iq = 0;
771 int ufs_niqhash;
772 int ufs_iqhashmask;
773 struct ufs_q ufs_idle_q;
774
775 void
ufs_thread_idle(void)776 ufs_thread_idle(void)
777 {
778 callb_cpr_t cprinfo;
779 int i;
780 int ne;
781
782 ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN;
783 ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */
784 ufs_iqhashmask = ufs_niqhash - 1;
785 ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq),
786 KM_SLEEP);
787 ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq),
788 KM_SLEEP);
789
790 /* Initialize hash queue headers */
791 for (i = 0; i < ufs_niqhash; i++) {
792 ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i];
793 ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i];
794 ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i];
795 ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i];
796 }
797
798 CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr,
799 "ufsidle");
800 again:
801 /*
802 * Whenever the idle thread is awakened, it repeatedly gives
803 * back half of the idle queue until the idle queue falls
804 * below lowat.
805 */
806 mutex_enter(&ufs_idle_q.uq_mutex);
807 if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) {
808 CALLB_CPR_SAFE_BEGIN(&cprinfo);
809 cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex);
810 CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex);
811 }
812 mutex_exit(&ufs_idle_q.uq_mutex);
813
814 /*
815 * Give back 1/2 of the idle queue
816 */
817 ne = ufs_idle_q.uq_ne >> 1;
818 ins.in_tidles.value.ul += ne;
819 ufs_idle_some(ne);
820 goto again;
821 }
822
823 /*
824 * Reclaim callback for ufs inode cache.
825 * Invoked by the kernel memory allocator when memory gets tight.
826 */
827 /*ARGSUSED*/
828 void
ufs_inode_cache_reclaim(void * cdrarg)829 ufs_inode_cache_reclaim(void *cdrarg)
830 {
831 /*
832 * If we are low on memory and the idle queue is over its
833 * halfway mark, then free 50% of the idle q
834 *
835 * We don't free all of the idle inodes because the inodes
836 * for popular NFS files may have been kicked from the dnlc.
837 * The inodes for these files will end up on the idle queue
838 * after every NFS access.
839 *
840 * If we repeatedly push them from the idle queue then
841 * NFS users may be unhappy as an extra buf cache operation
842 * is incurred for every NFS operation to these files.
843 *
844 * It's not common, but I have seen it happen.
845 *
846 */
847 if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1))
848 return;
849 mutex_enter(&ufs_idle_q.uq_mutex);
850 cv_broadcast(&ufs_idle_q.uq_cv);
851 mutex_exit(&ufs_idle_q.uq_mutex);
852 }
853
854 /*
855 * Free up some idle inodes
856 */
857 void
ufs_idle_some(int ne)858 ufs_idle_some(int ne)
859 {
860 int i;
861 struct inode *ip;
862 struct vnode *vp;
863 static int junk_rotor = 0;
864 static int useful_rotor = 0;
865
866 for (i = 0; i < ne; ++i) {
867 mutex_enter(&ufs_idle_q.uq_mutex);
868
869 if (ufs_njunk_iq) {
870 while (ufs_junk_iq[junk_rotor].i_freef ==
871 (inode_t *)&ufs_junk_iq[junk_rotor]) {
872 junk_rotor = IQNEXT(junk_rotor);
873 }
874 ip = ufs_junk_iq[junk_rotor].i_freef;
875 ASSERT(ip->i_flag & IJUNKIQ);
876 } else if (ufs_nuseful_iq) {
877 while (ufs_useful_iq[useful_rotor].i_freef ==
878 (inode_t *)&ufs_useful_iq[useful_rotor]) {
879 useful_rotor = IQNEXT(useful_rotor);
880 }
881 ip = ufs_useful_iq[useful_rotor].i_freef;
882 ASSERT(!(ip->i_flag & IJUNKIQ));
883 } else {
884 mutex_exit(&ufs_idle_q.uq_mutex);
885 return;
886 }
887
888 /*
889 * emulate ufs_iget
890 */
891 vp = ITOV(ip);
892 VN_HOLD(vp);
893 mutex_exit(&ufs_idle_q.uq_mutex);
894 rw_enter(&ip->i_contents, RW_WRITER);
895 /*
896 * VN_RELE should not be called if
897 * ufs_rmidle returns true, as it will
898 * effectively be done in ufs_idle_free.
899 */
900 if (ufs_rmidle(ip)) {
901 rw_exit(&ip->i_contents);
902 ufs_idle_free(ip);
903 } else {
904 rw_exit(&ip->i_contents);
905 VN_RELE(vp);
906 }
907 }
908 }
909
910 /*
911 * drain entries for vfsp from the idle queue
912 * vfsp == NULL means drain the entire thing
913 */
914 void
ufs_idle_drain(struct vfs * vfsp)915 ufs_idle_drain(struct vfs *vfsp)
916 {
917 struct inode *ip, *nip;
918 struct inode *ianchor = NULL;
919 int i;
920
921 mutex_enter(&ufs_idle_q.uq_mutex);
922 if (ufs_njunk_iq) {
923 /* for each hash q */
924 for (i = 0; i < ufs_niqhash; i++) {
925 /* search down the hash q */
926 for (ip = ufs_junk_iq[i].i_freef;
927 ip != (inode_t *)&ufs_junk_iq[i];
928 ip = ip->i_freef) {
929 if (ip->i_vfs == vfsp || vfsp == NULL) {
930 /* found a matching entry */
931 VN_HOLD(ITOV(ip));
932 mutex_exit(&ufs_idle_q.uq_mutex);
933 rw_enter(&ip->i_contents, RW_WRITER);
934 /*
935 * See comments in ufs_idle_some()
936 * as we will call ufs_idle_free()
937 * after scanning both queues.
938 */
939 if (ufs_rmidle(ip)) {
940 rw_exit(&ip->i_contents);
941 ip->i_freef = ianchor;
942 ianchor = ip;
943 } else {
944 rw_exit(&ip->i_contents);
945 VN_RELE(ITOV(ip));
946 }
947 /* restart this hash q */
948 ip = (inode_t *)&ufs_junk_iq[i];
949 mutex_enter(&ufs_idle_q.uq_mutex);
950 }
951 }
952 }
953 }
954 if (ufs_nuseful_iq) {
955 /* for each hash q */
956 for (i = 0; i < ufs_niqhash; i++) {
957 /* search down the hash q */
958 for (ip = ufs_useful_iq[i].i_freef;
959 ip != (inode_t *)&ufs_useful_iq[i];
960 ip = ip->i_freef) {
961 if (ip->i_vfs == vfsp || vfsp == NULL) {
962 /* found a matching entry */
963 VN_HOLD(ITOV(ip));
964 mutex_exit(&ufs_idle_q.uq_mutex);
965 rw_enter(&ip->i_contents, RW_WRITER);
966 /*
967 * See comments in ufs_idle_some()
968 * as we will call ufs_idle_free()
969 * after scanning both queues.
970 */
971 if (ufs_rmidle(ip)) {
972 rw_exit(&ip->i_contents);
973 ip->i_freef = ianchor;
974 ianchor = ip;
975 } else {
976 rw_exit(&ip->i_contents);
977 VN_RELE(ITOV(ip));
978 }
979 /* restart this hash q */
980 ip = (inode_t *)&ufs_useful_iq[i];
981 mutex_enter(&ufs_idle_q.uq_mutex);
982 }
983 }
984 }
985 }
986
987 mutex_exit(&ufs_idle_q.uq_mutex);
988 /* no more matching entries, release those we have found (if any) */
989 for (ip = ianchor; ip; ip = nip) {
990 nip = ip->i_freef;
991 ip->i_freef = ip;
992 ufs_idle_free(ip);
993 }
994 }
995
996 /*
997 * RECLAIM DELETED INODES
998 * The following thread scans the file system once looking for deleted files
999 */
1000 void
ufs_thread_reclaim(struct vfs * vfsp)1001 ufs_thread_reclaim(struct vfs *vfsp)
1002 {
1003 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
1004 struct ufs_q *uq = &ufsvfsp->vfs_reclaim;
1005 struct fs *fs = ufsvfsp->vfs_fs;
1006 struct buf *bp = 0;
1007 int err = 0;
1008 daddr_t bno;
1009 ino_t ino;
1010 struct dinode *dp;
1011 struct inode *ip;
1012 callb_cpr_t cprinfo;
1013
1014 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
1015 "ufsreclaim");
1016
1017 /*
1018 * mount decided that we don't need a reclaim thread
1019 */
1020 if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
1021 err++;
1022
1023 /*
1024 * don't reclaim if readonly
1025 */
1026 if (fs->fs_ronly)
1027 err++;
1028
1029 for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) {
1030
1031 /*
1032 * Check whether we are the target of another
1033 * thread having called ufs_thread_exit() or
1034 * ufs_thread_suspend().
1035 */
1036 mutex_enter(&uq->uq_mutex);
1037 again:
1038 if (uq->uq_flags & UQ_EXIT) {
1039 err++;
1040 mutex_exit(&uq->uq_mutex);
1041 break;
1042 } else if (uq->uq_flags & UQ_SUSPEND) {
1043 uq->uq_flags |= UQ_SUSPENDED;
1044 /*
1045 * Release the buf before we cv_wait()
1046 * otherwise we may deadlock with the
1047 * thread that called ufs_thread_suspend().
1048 */
1049 if (bp) {
1050 brelse(bp);
1051 bp = 0;
1052 }
1053 if (uq->uq_flags & UQ_WAIT) {
1054 uq->uq_flags &= ~UQ_WAIT;
1055 cv_broadcast(&uq->uq_cv);
1056 }
1057 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1058 cv_wait(&uq->uq_cv, &uq->uq_mutex);
1059 CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex);
1060 goto again;
1061 }
1062 mutex_exit(&uq->uq_mutex);
1063
1064 /*
1065 * if we don't already have the buf; get it
1066 */
1067 bno = fsbtodb(fs, itod(fs, ino));
1068 if ((bp == 0) || (bp->b_blkno != bno)) {
1069 if (bp)
1070 brelse(bp);
1071 bp = UFS_BREAD(ufsvfsp,
1072 ufsvfsp->vfs_dev, bno, fs->fs_bsize);
1073 bp->b_flags |= B_AGE;
1074 }
1075 if (bp->b_flags & B_ERROR) {
1076 err++;
1077 continue;
1078 }
1079 /*
1080 * nlink <= 0 and mode != 0 means deleted
1081 */
1082 dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino);
1083 if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) {
1084 /*
1085 * can't hold the buf (deadlock)
1086 */
1087 brelse(bp);
1088 bp = 0;
1089 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1090 /*
1091 * iget/iput sequence will put inode on ifree
1092 * thread queue if it is idle. This is a nop
1093 * for busy (open, deleted) inodes
1094 */
1095 if (ufs_iget(vfsp, ino, &ip, CRED()))
1096 err++;
1097 else
1098 VN_RELE(ITOV(ip));
1099 rw_exit(&ufsvfsp->vfs_dqrwlock);
1100 }
1101 }
1102
1103 if (bp)
1104 brelse(bp);
1105 if (!err) {
1106 /*
1107 * reset the reclaiming-bit
1108 */
1109 mutex_enter(&ufsvfsp->vfs_lock);
1110 fs->fs_reclaim &= ~FS_RECLAIMING;
1111 mutex_exit(&ufsvfsp->vfs_lock);
1112 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM);
1113 }
1114
1115 /*
1116 * exit the reclaim thread
1117 */
1118 mutex_enter(&uq->uq_mutex);
1119 uq->uq_threadp = NULL;
1120 uq->uq_flags &= ~UQ_WAIT;
1121 cv_broadcast(&uq->uq_cv);
1122 CALLB_CPR_EXIT(&cprinfo);
1123 thread_exit();
1124 }
1125 /*
1126 * HLOCK FILE SYSTEM
1127 * hlock the file system's whose logs have device errors
1128 */
1129 struct ufs_q ufs_hlock;
1130 /*ARGSUSED*/
1131 void
ufs_thread_hlock(void * ignore)1132 ufs_thread_hlock(void *ignore)
1133 {
1134 int retry;
1135 callb_cpr_t cprinfo;
1136
1137 CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr,
1138 "ufshlock");
1139
1140 for (;;) {
1141 /*
1142 * sleep until there is work to do
1143 */
1144 mutex_enter(&ufs_hlock.uq_mutex);
1145 (void) ufs_thread_run(&ufs_hlock, &cprinfo);
1146 ufs_hlock.uq_ne = 0;
1147 mutex_exit(&ufs_hlock.uq_mutex);
1148 /*
1149 * hlock the error'ed fs's
1150 * retry after a bit if another app is doing lockfs stuff
1151 */
1152 do {
1153 retry = ufs_trans_hlock();
1154 if (retry) {
1155 mutex_enter(&ufs_hlock.uq_mutex);
1156 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1157 (void) cv_reltimedwait(&ufs_hlock.uq_cv,
1158 &ufs_hlock.uq_mutex, hz, TR_CLOCK_TICK);
1159 CALLB_CPR_SAFE_END(&cprinfo,
1160 &ufs_hlock.uq_mutex);
1161 mutex_exit(&ufs_hlock.uq_mutex);
1162 }
1163 } while (retry);
1164 }
1165 }
1166
1167 static void
ufs_attr_purge(struct inode * dp)1168 ufs_attr_purge(struct inode *dp)
1169 {
1170 int err;
1171 int error;
1172 off_t dirsize; /* size of the directory */
1173 off_t offset; /* offset in the directory */
1174 int entryoffsetinblk; /* offset of ep in fbp's buffer */
1175 struct inode *tp;
1176 struct fbuf *fbp; /* pointer to directory block */
1177 struct direct *ep; /* directory entry */
1178 int trans_size;
1179 int issync;
1180 struct ufsvfs *ufsvfsp = dp->i_ufsvfs;
1181
1182 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1183
1184 fbp = NULL;
1185 dirsize = roundup(dp->i_size, DIRBLKSIZ);
1186 offset = 0;
1187 entryoffsetinblk = 0;
1188
1189 /*
1190 * Purge directory cache
1191 */
1192
1193 dnlc_dir_purge(&dp->i_danchor);
1194
1195 while (offset < dirsize) {
1196 /*
1197 * If offset is on a block boundary,
1198 * read the next directory block.
1199 * Release previous if it exists.
1200 */
1201 if (blkoff(dp->i_fs, offset) == 0) {
1202 if (fbp != NULL) {
1203 fbrelse(fbp, S_OTHER);
1204 }
1205
1206 err = blkatoff(dp, offset, (char **)0, &fbp);
1207 if (err) {
1208 goto out;
1209 }
1210 entryoffsetinblk = 0;
1211 }
1212 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1213 if (ep->d_ino == 0 || (ep->d_name[0] == '.' &&
1214 ep->d_name[1] == '\0') ||
1215 (ep->d_name[0] == '.' && ep->d_name[1] == '.' &&
1216 ep->d_name[2] == '\0')) {
1217
1218 entryoffsetinblk += ep->d_reclen;
1219
1220 } else {
1221
1222 if ((err = ufs_iget(dp->i_vfs, ep->d_ino,
1223 &tp, CRED())) != 0) {
1224 goto out;
1225 }
1226
1227 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
1228 trans_size = (int)TOP_REMOVE_SIZE(tp));
1229
1230 /*
1231 * Delete inode.
1232 */
1233
1234 dnlc_remove(ITOV(dp), ep->d_name);
1235
1236 rw_enter(&tp->i_contents, RW_WRITER);
1237 tp->i_flag |= ICHG;
1238 tp->i_seq++;
1239 TRANS_INODE(tp->i_ufsvfs, tp);
1240 tp->i_nlink--;
1241 ufs_setreclaim(tp);
1242 ITIMES_NOLOCK(tp);
1243 rw_exit(&tp->i_contents);
1244
1245 VN_RELE(ITOV(tp));
1246 entryoffsetinblk += ep->d_reclen;
1247 TRANS_END_CSYNC(ufsvfsp, error,
1248 issync, TOP_REMOVE, trans_size);
1249
1250 }
1251 offset += ep->d_reclen;
1252 }
1253
1254 if (fbp) {
1255 fbrelse(fbp, S_OTHER);
1256 }
1257
1258 out:
1259 rw_exit(&ufsvfsp->vfs_dqrwlock);
1260 }
1261