xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs4_db.c (revision 70025d765b044c6d8594bb965a2247a61e991a99)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/systm.h>
30 #include <sys/cmn_err.h>
31 #include <sys/kmem.h>
32 #include <sys/disp.h>
33 #include <sys/id_space.h>
34 #include <sys/atomic.h>
35 #include <rpc/rpc.h>
36 #include <nfs/nfs4.h>
37 #include <nfs/nfs4_db_impl.h>
38 
39 static int rfs4_reap_interval = RFS4_REAP_INTERVAL;
40 
41 static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
42 static void rfs4_dbe_destroy(rfs4_dbe_t *);
43 static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, rfs4_entry_t);
44 static void rfs4_start_reaper(rfs4_table_t *);
45 
46 id_t
47 rfs4_dbe_getid(rfs4_dbe_t *e)
48 {
49 	return (e->id);
50 }
51 
52 void
53 rfs4_dbe_hold(rfs4_dbe_t *e)
54 {
55 	atomic_add_32(&e->refcnt, 1);
56 }
57 
58 /*
59  * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
60  */
61 void
62 rfs4_dbe_rele_nolock(rfs4_dbe_t *e)
63 {
64 	atomic_add_32(&e->refcnt, -1);
65 }
66 
67 
68 uint32_t
69 rfs4_dbe_refcnt(rfs4_dbe_t *e)
70 {
71 	return (e->refcnt);
72 }
73 
74 /*
75  * Mark an entry such that the dbsearch will skip it.
76  * Caller does not want this entry to be found any longer
77  */
78 void
79 rfs4_dbe_invalidate(rfs4_dbe_t *e)
80 {
81 	e->invalid = TRUE;
82 	e->skipsearch = TRUE;
83 }
84 
85 /*
86  * Is this entry invalid?
87  */
88 bool_t
89 rfs4_dbe_is_invalid(rfs4_dbe_t *e)
90 {
91 	return (e->invalid);
92 }
93 
94 time_t
95 rfs4_dbe_get_timerele(rfs4_dbe_t *e)
96 {
97 	return (e->time_rele);
98 }
99 
100 /*
101  * Use these to temporarily hide/unhide a db entry.
102  */
103 void
104 rfs4_dbe_hide(rfs4_dbe_t *e)
105 {
106 	rfs4_dbe_lock(e);
107 	e->skipsearch = TRUE;
108 	rfs4_dbe_unlock(e);
109 }
110 
111 void
112 rfs4_dbe_unhide(rfs4_dbe_t *e)
113 {
114 	rfs4_dbe_lock(e);
115 	e->skipsearch = FALSE;
116 	rfs4_dbe_unlock(e);
117 }
118 
119 void
120 rfs4_dbe_rele(rfs4_dbe_t *e)
121 {
122 	mutex_enter(e->lock);
123 	ASSERT(e->refcnt > 1);
124 	atomic_add_32(&e->refcnt, -1);
125 	e->time_rele = gethrestime_sec();
126 	mutex_exit(e->lock);
127 }
128 
129 void
130 rfs4_dbe_lock(rfs4_dbe_t *e)
131 {
132 	mutex_enter(e->lock);
133 }
134 
135 void
136 rfs4_dbe_unlock(rfs4_dbe_t *e)
137 {
138 	mutex_exit(e->lock);
139 }
140 
141 bool_t
142 rfs4_dbe_islocked(rfs4_dbe_t *e)
143 {
144 	return (mutex_owned(e->lock));
145 }
146 
147 clock_t
148 rfs4_dbe_twait(rfs4_dbe_t *e, clock_t timeout)
149 {
150 	return (cv_timedwait(e->cv, e->lock, timeout));
151 }
152 
153 void
154 rfs4_dbe_cv_broadcast(rfs4_dbe_t *e)
155 {
156 	cv_broadcast(e->cv);
157 }
158 
159 /* ARGSUSED */
160 static int
161 rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
162 {
163 	rfs4_dbe_t *entry = obj;
164 
165 	mutex_init(entry->lock, NULL, MUTEX_DEFAULT, NULL);
166 	cv_init(entry->cv, NULL, CV_DEFAULT, NULL);
167 
168 	return (0);
169 }
170 
171 static void
172 rfs4_dbe_kmem_destructor(void *obj, void *private)
173 {
174 	rfs4_dbe_t *entry = obj;
175 	/*LINTED*/
176 	rfs4_table_t *table = private;
177 
178 	mutex_destroy(entry->lock);
179 	cv_destroy(entry->cv);
180 }
181 
182 rfs4_database_t *
183 rfs4_database_create(uint32_t flags)
184 {
185 	rfs4_database_t *db;
186 
187 	db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
188 	mutex_init(db->lock, NULL, MUTEX_DEFAULT, NULL);
189 	db->tables = NULL;
190 	db->debug_flags = flags;
191 	db->shutdown_count = 0;
192 	cv_init(&db->shutdown_wait, NULL, CV_DEFAULT, NULL);
193 	return (db);
194 }
195 
196 
197 /*
198  * The reaper threads that have been created for the tables in this
199  * database must be stopped and the entries in the tables released.
200  * Each table will be marked as "shutdown" and the reaper threads
201  * poked and they will see that a shutdown is in progress and cleanup
202  * and exit.  This function waits for all reaper threads to stop
203  * before returning to the caller.
204  */
205 void
206 rfs4_database_shutdown(rfs4_database_t *db)
207 {
208 	rfs4_table_t *table;
209 
210 	mutex_enter(db->lock);
211 	for (table = db->tables; table; table = table->tnext) {
212 		table->reaper_shutdown = TRUE;
213 		mutex_enter(&table->reaper_cv_lock);
214 		cv_broadcast(&table->reaper_wait);
215 		db->shutdown_count++;
216 		mutex_exit(&table->reaper_cv_lock);
217 	}
218 	while (db->shutdown_count > 0) {
219 		cv_wait(&db->shutdown_wait, db->lock);
220 	}
221 	mutex_exit(db->lock);
222 }
223 
224 /*
225  * Given a database that has been "shutdown" by the function above all
226  * of the table tables are destroyed and then the database itself
227  * freed.
228  */
229 void
230 rfs4_database_destroy(rfs4_database_t *db)
231 {
232 	rfs4_table_t *next, *tmp;
233 
234 	for (next = db->tables; next; ) {
235 		tmp = next;
236 		next = tmp->tnext;
237 		rfs4_table_destroy(db, tmp);
238 	}
239 
240 	mutex_destroy(db->lock);
241 	kmem_free(db, sizeof (rfs4_database_t));
242 }
243 
244 rfs4_table_t *
245 rfs4_table_create(rfs4_database_t *dbp, char *tabname, time_t max_cache_time,
246 		uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
247 		void (*destroy)(rfs4_entry_t),
248 		bool_t (*expiry)(rfs4_entry_t),
249 		uint32_t size, uint32_t hashsize,
250 		uint32_t maxentries, id_t start)
251 {
252 	rfs4_table_t *table;
253 	int len;
254 	char *cache_name;
255 	char *id_name;
256 
257 	table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
258 	table->dbp = dbp;
259 	rw_init(table->t_lock, NULL, RW_DEFAULT, NULL);
260 	mutex_init(table->lock, NULL, MUTEX_DEFAULT, NULL);
261 	mutex_init(&table->reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
262 	cv_init(&table->reaper_wait, NULL, CV_DEFAULT, NULL);
263 
264 	len = strlen(tabname);
265 	table->name = kmem_alloc(len+1, KM_SLEEP);
266 	cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
267 	(void) strcpy(table->name, tabname);
268 	(void) sprintf(cache_name, "%s_entry_cache", table->name);
269 	table->max_cache_time = max_cache_time;
270 	table->usize = size;
271 	table->len = hashsize;
272 	table->count = 0;
273 	table->idxcnt = 0;
274 	table->ccnt = 0;
275 	table->maxcnt = idxcnt;
276 	table->indices = NULL;
277 	table->id_space = NULL;
278 	table->reaper_shutdown = FALSE;
279 
280 	if (start >= 0) {
281 		if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
282 			maxentries = INT32_MAX - start;
283 		id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
284 		(void) sprintf(id_name, "%s_id_space", table->name);
285 		table->id_space = id_space_create(id_name, start,
286 						maxentries + start);
287 		kmem_free(id_name, len + 10);
288 	}
289 	table->maxentries = maxentries;
290 	table->create = create;
291 	table->destroy = destroy;
292 	table->expiry = expiry;
293 
294 	table->mem_cache = kmem_cache_create(cache_name,
295 					    sizeof (rfs4_dbe_t) +
296 					    idxcnt * sizeof (rfs4_link) +
297 					    size,
298 					    0,
299 					    rfs4_dbe_kmem_constructor,
300 					    rfs4_dbe_kmem_destructor,
301 					    NULL,
302 					    table,
303 					    NULL,
304 					    0);
305 	kmem_free(cache_name, len+13);
306 
307 	table->debug = dbp->debug_flags;
308 
309 	mutex_enter(dbp->lock);
310 	table->tnext = dbp->tables;
311 	dbp->tables = table;
312 	mutex_exit(dbp->lock);
313 
314 	rfs4_start_reaper(table);
315 
316 	return (table);
317 }
318 
319 void
320 rfs4_table_destroy(rfs4_database_t *dbp, rfs4_table_t *table)
321 {
322 	rfs4_table_t *p;
323 	rfs4_index_t *t;
324 
325 	ASSERT(table->count == 0);
326 
327 	mutex_enter(dbp->lock);
328 	if (table == dbp->tables)
329 		dbp->tables = table->tnext;
330 	else {
331 		for (p = dbp->tables; p; p = p->tnext)
332 			if (p->tnext == table) {
333 				p->tnext = table->tnext;
334 				table->tnext = NULL;
335 				break;
336 			}
337 		ASSERT(p != NULL);
338 	}
339 	mutex_exit(dbp->lock);
340 
341 	/* Destroy indices */
342 	while (table->indices) {
343 		t = table->indices;
344 		table->indices = t->inext;
345 		rfs4_index_destroy(t);
346 	}
347 
348 	rw_destroy(table->t_lock);
349 	mutex_destroy(table->lock);
350 	mutex_destroy(&table->reaper_cv_lock);
351 	cv_destroy(&table->reaper_wait);
352 
353 	kmem_free(table->name, strlen(table->name) + 1);
354 	if (table->id_space)
355 		id_space_destroy(table->id_space);
356 	kmem_cache_destroy(table->mem_cache);
357 	kmem_free(table, sizeof (rfs4_table_t));
358 }
359 
360 rfs4_index_t *
361 rfs4_index_create(rfs4_table_t *table, char *keyname,
362 	uint32_t (*hash)(void *),
363 	bool_t (compare)(rfs4_entry_t, void *),
364 	void *(*mkkey)(rfs4_entry_t),
365 	bool_t createable)
366 {
367 	rfs4_index_t *idx;
368 
369 	ASSERT(table->idxcnt < table->maxcnt);
370 
371 	idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
372 
373 	idx->table = table;
374 	idx->keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
375 	(void) strcpy(idx->keyname, keyname);
376 	idx->hash = hash;
377 	idx->compare = compare;
378 	idx->mkkey = mkkey;
379 	idx->tblidx = table->idxcnt;
380 	table->idxcnt++;
381 	if (createable) {
382 		table->ccnt++;
383 		if (table->ccnt > 1)
384 			panic("Table %s currently can have only have one "
385 			    "index that will allow creation of entries",
386 			    table->name);
387 		idx->createable = TRUE;
388 	} else {
389 		idx->createable = FALSE;
390 	}
391 
392 	idx->inext = table->indices;
393 	table->indices = idx;
394 	idx->buckets = kmem_zalloc(sizeof (rfs4_bucket) * table->len, KM_SLEEP);
395 
396 	return (idx);
397 }
398 
399 void
400 rfs4_index_destroy(rfs4_index_t *idx)
401 {
402 	kmem_free(idx->keyname, strlen(idx->keyname) + 1);
403 	kmem_free(idx->buckets, sizeof (rfs4_bucket) * idx->table->len);
404 	kmem_free(idx, sizeof (rfs4_index_t));
405 }
406 
407 static void
408 rfs4_dbe_destroy(rfs4_dbe_t *entry)
409 {
410 	rfs4_index_t *ip;
411 	void *key;
412 	int i;
413 	rfs4_bucket *bp;
414 	rfs4_table_t *table = entry->table;
415 	rfs4_link *l;
416 
417 	NFS4_DEBUG(table->debug & DESTROY_DEBUG,
418 		(CE_NOTE, "Destroying entry %p from %s",
419 		(void*)entry, table->name));
420 
421 	mutex_enter(entry->lock);
422 	ASSERT(entry->refcnt == 0);
423 	mutex_exit(entry->lock);
424 
425 	/* Unlink from all indices */
426 	for (ip = table->indices; ip; ip = ip->inext) {
427 		l = &entry->indices[ip->tblidx];
428 		/* check and see if we were ever linked in to the index */
429 		if (INVALID_LINK(l)) {
430 			ASSERT(l->next == NULL && l->prev == NULL);
431 			continue;
432 		}
433 		key = ip->mkkey(entry->data);
434 		i = HASH(ip, key);
435 		bp = &ip->buckets[i];
436 		ASSERT(bp->head != NULL);
437 		DEQUEUE_IDX(bp, &entry->indices[ip->tblidx]);
438 	}
439 
440 	/* Destroy user data */
441 	if (table->destroy)
442 		(*table->destroy)(entry->data);
443 
444 	if (table->id_space)
445 		id_free(table->id_space, entry->id);
446 
447 	mutex_enter(table->lock);
448 	table->count--;
449 	mutex_exit(table->lock);
450 
451 	/* Destroy the entry itself */
452 	kmem_cache_free(table->mem_cache, entry);
453 }
454 
455 
456 static rfs4_dbe_t *
457 rfs4_dbe_create(rfs4_table_t *table, rfs4_entry_t data)
458 {
459 	rfs4_dbe_t *entry;
460 	int i;
461 
462 
463 	NFS4_DEBUG(table->debug & CREATE_DEBUG,
464 		(CE_NOTE, "Creating entry in table %s", table->name));
465 
466 	entry = kmem_cache_alloc(table->mem_cache, KM_SLEEP);
467 
468 	entry->refcnt = 1;
469 	entry->invalid = FALSE;
470 	entry->skipsearch = FALSE;
471 	entry->time_rele = 0;
472 	entry->id = 0;
473 
474 	if (table->id_space)
475 		entry->id = id_alloc(table->id_space);
476 	entry->table = table;
477 
478 	for (i = 0; i < table->maxcnt; i++) {
479 		entry->indices[i].next = entry->indices[i].prev = NULL;
480 		entry->indices[i].entry = entry;
481 		/*
482 		 * We mark the entry as not indexed by setting the low
483 		 * order bit, since address are word aligned. This has
484 		 * the advantage of causeing a trap if the address is
485 		 * used. After the entry is linked in to the
486 		 * corresponding index the bit will be cleared.
487 		 */
488 		INVALIDATE_ADDR(entry->indices[i].entry);
489 	}
490 
491 	entry->data = (rfs4_entry_t)&entry->indices[table->maxcnt];
492 	bzero(entry->data, table->usize);
493 	entry->data->dbe = entry;
494 
495 	if (!(*table->create)(entry->data, data)) {
496 		kmem_cache_free(table->mem_cache, entry);
497 		return (NULL);
498 	}
499 
500 	mutex_enter(table->lock);
501 	table->count++;
502 	mutex_exit(table->lock);
503 
504 	return (entry);
505 }
506 
507 rfs4_entry_t
508 rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
509 		rfs4_dbsearch_type_t dbsearch_type)
510 {
511 	int already_done;
512 	uint32_t i;
513 	rfs4_table_t *table = idx->table;
514 	rfs4_index_t *ip;
515 	rfs4_bucket *bp;
516 	rfs4_link *l;
517 	rfs4_dbe_t *entry = NULL;
518 
519 	i = HASH(idx, key);
520 	bp = &idx->buckets[i];
521 
522 	NFS4_DEBUG(table->debug & SEARCH_DEBUG,
523 		(CE_NOTE, "Searching for key %p in table %s by %s",
524 		key, table->name, idx->keyname));
525 
526 	rw_enter(bp->lock, RW_READER);
527 retry:
528 	for (l = bp->head; l; l = l->next) {
529 		if (l->entry->refcnt > 0 &&
530 			(l->entry->skipsearch == FALSE ||
531 			(l->entry->skipsearch == TRUE &&
532 				dbsearch_type == RFS4_DBS_INVALID)) &&
533 			(*idx->compare)(l->entry->data, key)) {
534 			mutex_enter(l->entry->lock);
535 			if (l->entry->refcnt == 0) {
536 				mutex_exit(l->entry->lock);
537 				continue;
538 			}
539 
540 			/* place an additional hold since we are returning */
541 			rfs4_dbe_hold(l->entry);
542 
543 			mutex_exit(l->entry->lock);
544 			rw_exit(bp->lock);
545 
546 			if (entry) {
547 				/*
548 				 * The entry has not been placed in a
549 				 * table so go ahead and drop the ref
550 				 * count and destroy the entry.
551 				 */
552 				entry->refcnt--;
553 				rfs4_dbe_destroy(entry);
554 			}
555 			*create = FALSE;
556 
557 			NFS4_DEBUG((table->debug & SEARCH_DEBUG),
558 				(CE_NOTE, "Found entry %p for %p in table %s",
559 					(void *)l->entry, key, table->name));
560 
561 			return (l->entry->data);
562 		}
563 	}
564 
565 	if (!*create || table->create == NULL || !idx->createable ||
566 		table->maxentries == table->count) {
567 		*create = FALSE;
568 
569 		NFS4_DEBUG(table->debug & SEARCH_DEBUG,
570 			(CE_NOTE, "Entry for %p in %s not found",
571 			key, table->name));
572 
573 		rw_exit(bp->lock);
574 
575 		return (NULL);
576 	}
577 
578 	/* Create data before grabing an exclusive lock if needed */
579 	if (entry == NULL) {
580 		entry = rfs4_dbe_create(table, arg);
581 		if (entry == NULL) {
582 			rw_exit(bp->lock);
583 
584 			NFS4_DEBUG(table->debug & CREATE_DEBUG,
585 				(CE_NOTE, "Constructor for table %s failed",
586 				table->name));
587 			return (NULL);
588 		}
589 	}
590 
591 	/* Now that we've allocated  */
592 	if (rw_read_locked(bp->lock) && !rw_tryupgrade(bp->lock)) {
593 
594 		NFS4_DEBUG(table->debug & OTHER_DEBUG,
595 			(CE_NOTE, "Trying to upgrade lock for entry %p on "
596 			"hash chain %d (%p) for  %s by %s",
597 			(void*)entry, i, (void*)bp,
598 			table->name, idx->keyname));
599 
600 		rw_exit(bp->lock);
601 		rw_enter(bp->lock, RW_WRITER);
602 
603 		goto retry;
604 	}
605 
606 	/*
607 	 * Add one ref for entry into table's hash - only one
608 	 * reference added evn though there may be multiple indices
609 	 */
610 	rfs4_dbe_hold(entry);
611 	ENQUEUE(bp->head, &entry->indices[idx->tblidx]);
612 	VALIDATE_ADDR(entry->indices[idx->tblidx].entry);
613 
614 	already_done = idx->tblidx;
615 	rw_exit(bp->lock);
616 
617 	for (ip = table->indices; ip; ip = ip->inext) {
618 		if (ip->tblidx == already_done)
619 			continue;
620 		l = &entry->indices[ip->tblidx];
621 		i = HASH(ip, ip->mkkey(entry->data));
622 		ASSERT(i < ip->table->len);
623 		bp = &ip->buckets[i];
624 		ENQUEUE_IDX(bp, l);
625 	}
626 
627 	NFS4_DEBUG(table->debug & SEARCH_DEBUG || table->debug & CREATE_DEBUG,
628 		(CE_NOTE, "Entry %p created for %s = %p in table %s",
629 		(void*)entry, idx->keyname, (void*)key, table->name));
630 
631 	return (entry->data);
632 }
633 
634 /*ARGSUSED*/
635 boolean_t
636 rfs4_cpr_callb(void *arg, int code)
637 {
638 	rfs4_table_t *tbl = rfs4_client_tab;
639 	rfs4_bucket *buckets, *bp;
640 	rfs4_link *l;
641 	rfs4_client_t *cl;
642 	int i;
643 
644 	/*
645 	 * We get called for Suspend and Resume events.
646 	 * For the suspend case we simply don't care!  Nor do we care if
647 	 * there are no clients.
648 	 */
649 	if (code == CB_CODE_CPR_CHKPT || tbl == NULL) {
650 		return (B_TRUE);
651 	}
652 
653 	buckets = tbl->indices->buckets;
654 
655 	/*
656 	 * When we get this far we are in the process of
657 	 * resuming the system from a previous suspend.
658 	 *
659 	 * We are going to blast through and update the
660 	 * last_access time for all the clients and in
661 	 * doing so extend them by one lease period.
662 	 */
663 	for (i = 0; i < tbl->len; i++) {
664 		bp = &buckets[i];
665 		for (l = bp->head; l; l = l->next) {
666 			cl = (rfs4_client_t *)l->entry->data;
667 			cl->last_access = gethrestime_sec();
668 		}
669 	}
670 
671 	return (B_TRUE);
672 }
673 
674 /*
675  * Given a table, lock each of the buckets and walk all entries (in
676  * turn locking those) and calling the provided "callout" function
677  * with the provided parameter.  Obviously used to iterate across all
678  * entries in a particular table via the database locking hierarchy.
679  * Obviously the caller must not hold locks on any of the entries in
680  * the specified table.
681  */
682 void
683 rfs4_dbe_walk(rfs4_table_t *table,
684 		void (*callout)(rfs4_entry_t, void *),
685 		void *data)
686 {
687 	rfs4_bucket *buckets = table->indices->buckets, *bp;
688 	rfs4_link *l;
689 	rfs4_dbe_t *e;
690 	int i;
691 
692 	NFS4_DEBUG(table->debug & WALK_DEBUG,
693 		(CE_NOTE, "Walking entries in %s", table->name));
694 
695 	/* Walk the buckets looking for entries to release/destroy */
696 	for (i = 0; i < table->len; i++) {
697 		bp = &buckets[i];
698 		rw_enter(bp->lock, RW_READER);
699 		for (l = bp->head; l; l = l->next) {
700 			e = l->entry;
701 			mutex_enter(e->lock);
702 			(*callout)(e->data, data);
703 			mutex_exit(e->lock);
704 		}
705 		rw_exit(bp->lock);
706 	}
707 
708 	NFS4_DEBUG(table->debug & WALK_DEBUG,
709 		(CE_NOTE, "Walking entries complete %s", table->name));
710 }
711 
712 
713 static void
714 rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
715 {
716 	rfs4_index_t *ip = table->indices;
717 	rfs4_bucket *buckets = ip->buckets, *bp;
718 	rfs4_link *l, *t;
719 	rfs4_dbe_t *e;
720 	bool_t found;
721 	int i;
722 	int count = 0;
723 
724 	NFS4_DEBUG(table->debug & REAP_DEBUG,
725 		(CE_NOTE,
726 		"Reaping %d entries older than %ld seconds in table %s",
727 		desired, cache_time, table->name));
728 
729 	/* Walk the buckets looking for entries to release/destroy */
730 	for (i = 0; i < table->len; i++) {
731 		bp = &buckets[i];
732 		do {
733 			found = FALSE;
734 			rw_enter(bp->lock, RW_READER);
735 			for (l = bp->head; l; l = l->next) {
736 				e = l->entry;
737 				/*
738 				 * Examine an entry.  Ref count of 1 means
739 				 * that the only reference is for the hash
740 				 * table reference.
741 				 */
742 				if (e->refcnt == 1) {
743 					mutex_enter(e->lock);
744 					if (e->refcnt == 1) {
745 						if (table->reaper_shutdown ||
746 						    table->expiry == NULL ||
747 						    (*table->expiry)(e->data)) {
748 							e->refcnt--;
749 							count++;
750 							found = TRUE;
751 						}
752 					}
753 					mutex_exit(e->lock);
754 				}
755 			}
756 			if (found) {
757 				if (!rw_tryupgrade(bp->lock)) {
758 					rw_exit(bp->lock);
759 					rw_enter(bp->lock, RW_WRITER);
760 				}
761 
762 				l = bp->head;
763 				while (l) {
764 					t = l;
765 					e = t->entry;
766 					l = l->next;
767 					if (e->refcnt == 0) {
768 						DEQUEUE(bp->head, t);
769 						t->next = NULL;
770 						t->prev = NULL;
771 						INVALIDATE_ADDR(t->entry);
772 						rfs4_dbe_destroy(e);
773 					}
774 				}
775 			}
776 			rw_exit(bp->lock);
777 			/*
778 			 * delay slightly if there is more work to do
779 			 * with the expectation that other reaper
780 			 * threads are freeing data structures as well
781 			 * and in turn will reduce ref counts on
782 			 * entries in this table allowing them to be
783 			 * released.  This is only done in the
784 			 * instance that the tables are being shut down.
785 			 */
786 			if (table->reaper_shutdown && bp->head != NULL)
787 				delay(hz/100);
788 		/*
789 		 * If this is a table shutdown, keep going until
790 		 * everything is gone
791 		 */
792 		} while (table->reaper_shutdown && bp->head != NULL);
793 
794 		if (!table->reaper_shutdown && desired && count >= desired)
795 			break;
796 	}
797 
798 	NFS4_DEBUG(table->debug & REAP_DEBUG,
799 		(CE_NOTE,
800 		"Reaped %d entries older than %ld seconds in table %s",
801 		count, cache_time, table->name));
802 }
803 
804 
805 static void
806 reaper_thread(caddr_t *arg)
807 {
808 	rfs4_table_t *table = (rfs4_table_t *)arg;
809 	clock_t rc, time;
810 
811 	NFS4_DEBUG(table->debug,
812 		(CE_NOTE, "rfs4_reaper_thread starting for %s", table->name));
813 
814 	CALLB_CPR_INIT(&table->reaper_cpr_info, &table->reaper_cv_lock,
815 		callb_generic_cpr, "nfsv4Reaper");
816 
817 	time = MIN(rfs4_reap_interval, table->max_cache_time);
818 	mutex_enter(&table->reaper_cv_lock);
819 	do {
820 		CALLB_CPR_SAFE_BEGIN(&table->reaper_cpr_info);
821 		rc = cv_timedwait_sig(&table->reaper_wait,
822 					&table->reaper_cv_lock,
823 					lbolt + SEC_TO_TICK(time));
824 		CALLB_CPR_SAFE_END(&table->reaper_cpr_info,
825 					&table->reaper_cv_lock);
826 		rfs4_dbe_reap(table, table->max_cache_time, 0);
827 	} while (rc != 0 && table->reaper_shutdown == FALSE);
828 
829 	CALLB_CPR_EXIT(&table->reaper_cpr_info);
830 
831 	NFS4_DEBUG(table->debug,
832 		(CE_NOTE, "rfs4_reaper_thread exiting for %s", table->name));
833 
834 	/* Notify the database shutdown processing that the table is shutdown */
835 	mutex_enter(table->dbp->lock);
836 	table->dbp->shutdown_count--;
837 	cv_signal(&table->dbp->shutdown_wait);
838 	mutex_exit(table->dbp->lock);
839 }
840 
841 static void
842 rfs4_start_reaper(rfs4_table_t *table)
843 {
844 	if (table->max_cache_time == 0)
845 		return;
846 
847 	(void) thread_create(NULL, 0, reaper_thread, table, 0, &p0, TS_RUN,
848 			    minclsyspri);
849 }
850 
851 #ifdef DEBUG
852 void
853 rfs4_dbe_debug(rfs4_dbe_t *e)
854 {
855 	cmn_err(CE_NOTE, "Entry %p from table %s", (void *)e, e->table->name);
856 	cmn_err(CE_CONT, "\trefcnt = %d id = %d", e->refcnt, e->id);
857 }
858 #endif
859