xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_db.c (revision aedf2b3bb56b025fcaf87b49ec6c8aeea07f16d7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/systm.h>
27 #include <sys/cmn_err.h>
28 #include <sys/kmem.h>
29 #include <sys/disp.h>
30 #include <sys/id_space.h>
31 #include <sys/atomic.h>
32 #include <rpc/rpc.h>
33 #include <nfs/nfs4.h>
34 #include <nfs/nfs4_db_impl.h>
35 
/*
 * Upper bound, in seconds, on how long a reaper thread sleeps between
 * passes.  reaper_thread() uses the smaller of this value and the
 * table's dbt_max_cache_time.
 */
static int rfs4_reap_interval = RFS4_REAP_INTERVAL;

/* Forward declarations for the file-local helpers defined below. */
static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
static void rfs4_dbe_destroy(rfs4_dbe_t *);
static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
static void rfs4_start_reaper(rfs4_table_t *);
42 
43 id_t
44 rfs4_dbe_getid(rfs4_dbe_t *entry)
45 {
46 	return (entry->dbe_id);
47 }
48 
49 void
50 rfs4_dbe_hold(rfs4_dbe_t *entry)
51 {
52 	atomic_add_32(&entry->dbe_refcnt, 1);
53 }
54 
55 /*
56  * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
57  */
58 void
59 rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
60 {
61 	atomic_add_32(&entry->dbe_refcnt, -1);
62 }
63 
64 
65 uint32_t
66 rfs4_dbe_refcnt(rfs4_dbe_t *entry)
67 {
68 	return (entry->dbe_refcnt);
69 }
70 
71 /*
72  * Mark an entry such that the dbsearch will skip it.
73  * Caller does not want this entry to be found any longer
74  */
75 void
76 rfs4_dbe_invalidate(rfs4_dbe_t *entry)
77 {
78 	entry->dbe_invalid = TRUE;
79 	entry->dbe_skipsearch = TRUE;
80 }
81 
82 /*
83  * Is this entry invalid?
84  */
85 bool_t
86 rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
87 {
88 	return (entry->dbe_invalid);
89 }
90 
91 time_t
92 rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
93 {
94 	return (entry->dbe_time_rele);
95 }
96 
97 /*
98  * Use these to temporarily hide/unhide a db entry.
99  */
100 void
101 rfs4_dbe_hide(rfs4_dbe_t *entry)
102 {
103 	rfs4_dbe_lock(entry);
104 	entry->dbe_skipsearch = TRUE;
105 	rfs4_dbe_unlock(entry);
106 }
107 
108 void
109 rfs4_dbe_unhide(rfs4_dbe_t *entry)
110 {
111 	rfs4_dbe_lock(entry);
112 	entry->dbe_skipsearch = FALSE;
113 	rfs4_dbe_unlock(entry);
114 }
115 
116 void
117 rfs4_dbe_rele(rfs4_dbe_t *entry)
118 {
119 	mutex_enter(entry->dbe_lock);
120 	ASSERT(entry->dbe_refcnt > 1);
121 	atomic_add_32(&entry->dbe_refcnt, -1);
122 	entry->dbe_time_rele = gethrestime_sec();
123 	mutex_exit(entry->dbe_lock);
124 }
125 
126 void
127 rfs4_dbe_lock(rfs4_dbe_t *entry)
128 {
129 	mutex_enter(entry->dbe_lock);
130 }
131 
132 void
133 rfs4_dbe_unlock(rfs4_dbe_t *entry)
134 {
135 	mutex_exit(entry->dbe_lock);
136 }
137 
138 bool_t
139 rfs4_dbe_islocked(rfs4_dbe_t *entry)
140 {
141 	return (mutex_owned(entry->dbe_lock));
142 }
143 
144 clock_t
145 rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
146 {
147 	return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
148 }
149 
150 void
151 rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
152 {
153 	cv_broadcast(entry->dbe_cv);
154 }
155 
156 /* ARGSUSED */
157 static int
158 rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
159 {
160 	rfs4_dbe_t *entry = obj;
161 
162 	mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
163 	cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
164 
165 	return (0);
166 }
167 
168 static void
169 rfs4_dbe_kmem_destructor(void *obj, void *private)
170 {
171 	rfs4_dbe_t *entry = obj;
172 	/*LINTED*/
173 	rfs4_table_t *table = private;
174 
175 	mutex_destroy(entry->dbe_lock);
176 	cv_destroy(entry->dbe_cv);
177 }
178 
179 rfs4_database_t *
180 rfs4_database_create(uint32_t flags)
181 {
182 	rfs4_database_t *db;
183 
184 	db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
185 	mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
186 	db->db_tables = NULL;
187 	db->db_debug_flags = flags;
188 	db->db_shutdown_count = 0;
189 	cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
190 	return (db);
191 }
192 
193 
194 /*
195  * The reaper threads that have been created for the tables in this
196  * database must be stopped and the entries in the tables released.
197  * Each table will be marked as "shutdown" and the reaper threads
198  * poked and they will see that a shutdown is in progress and cleanup
199  * and exit.  This function waits for all reaper threads to stop
200  * before returning to the caller.
201  */
202 void
203 rfs4_database_shutdown(rfs4_database_t *db)
204 {
205 	rfs4_table_t *table;
206 
207 	mutex_enter(db->db_lock);
208 	for (table = db->db_tables; table; table = table->dbt_tnext) {
209 		table->dbt_reaper_shutdown = TRUE;
210 		mutex_enter(&table->dbt_reaper_cv_lock);
211 		cv_broadcast(&table->dbt_reaper_wait);
212 		db->db_shutdown_count++;
213 		mutex_exit(&table->dbt_reaper_cv_lock);
214 	}
215 	while (db->db_shutdown_count > 0) {
216 		cv_wait(&db->db_shutdown_wait, db->db_lock);
217 	}
218 	mutex_exit(db->db_lock);
219 }
220 
221 /*
222  * Given a database that has been "shutdown" by the function above all
223  * of the table tables are destroyed and then the database itself
224  * freed.
225  */
226 void
227 rfs4_database_destroy(rfs4_database_t *db)
228 {
229 	rfs4_table_t *next, *tmp;
230 
231 	for (next = db->db_tables; next; ) {
232 		tmp = next;
233 		next = tmp->dbt_tnext;
234 		rfs4_table_destroy(db, tmp);
235 	}
236 
237 	mutex_destroy(db->db_lock);
238 	kmem_free(db, sizeof (rfs4_database_t));
239 }
240 
241 rfs4_table_t *
242 rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
243     uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
244     void (*destroy)(rfs4_entry_t),
245     bool_t (*expiry)(rfs4_entry_t),
246     uint32_t size, uint32_t hashsize,
247     uint32_t maxentries, id_t start)
248 {
249 	rfs4_table_t *table;
250 	int len;
251 	char *cache_name;
252 	char *id_name;
253 
254 	table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
255 	table->dbt_db = db;
256 	rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
257 	mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
258 	mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
259 	cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
260 
261 	len = strlen(tabname);
262 	table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
263 	cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
264 	(void) strcpy(table->dbt_name, tabname);
265 	(void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
266 	table->dbt_max_cache_time = max_cache_time;
267 	table->dbt_usize = size;
268 	table->dbt_len = hashsize;
269 	table->dbt_count = 0;
270 	table->dbt_idxcnt = 0;
271 	table->dbt_ccnt = 0;
272 	table->dbt_maxcnt = idxcnt;
273 	table->dbt_indices = NULL;
274 	table->dbt_id_space = NULL;
275 	table->dbt_reaper_shutdown = FALSE;
276 
277 	if (start >= 0) {
278 		if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
279 			maxentries = INT32_MAX - start;
280 		id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
281 		(void) sprintf(id_name, "%s_id_space", table->dbt_name);
282 		table->dbt_id_space = id_space_create(id_name, start,
283 		    maxentries + start);
284 		kmem_free(id_name, len + 10);
285 	}
286 	table->dbt_maxentries = maxentries;
287 	table->dbt_create = create;
288 	table->dbt_destroy = destroy;
289 	table->dbt_expiry = expiry;
290 
291 	table->dbt_mem_cache = kmem_cache_create(cache_name,
292 	    sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
293 	    0,
294 	    rfs4_dbe_kmem_constructor,
295 	    rfs4_dbe_kmem_destructor,
296 	    NULL,
297 	    table,
298 	    NULL,
299 	    0);
300 	kmem_free(cache_name, len+13);
301 
302 	table->dbt_debug = db->db_debug_flags;
303 
304 	mutex_enter(db->db_lock);
305 	table->dbt_tnext = db->db_tables;
306 	db->db_tables = table;
307 	mutex_exit(db->db_lock);
308 
309 	rfs4_start_reaper(table);
310 
311 	return (table);
312 }
313 
314 void
315 rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
316 {
317 	rfs4_table_t *p;
318 	rfs4_index_t *idx;
319 
320 	ASSERT(table->dbt_count == 0);
321 
322 	mutex_enter(db->db_lock);
323 	if (table == db->db_tables)
324 		db->db_tables = table->dbt_tnext;
325 	else {
326 		for (p = db->db_tables; p; p = p->dbt_tnext)
327 			if (p->dbt_tnext == table) {
328 				p->dbt_tnext = table->dbt_tnext;
329 				table->dbt_tnext = NULL;
330 				break;
331 			}
332 		ASSERT(p != NULL);
333 	}
334 	mutex_exit(db->db_lock);
335 
336 	/* Destroy indices */
337 	while (table->dbt_indices) {
338 		idx = table->dbt_indices;
339 		table->dbt_indices = idx->dbi_inext;
340 		rfs4_index_destroy(idx);
341 	}
342 
343 	rw_destroy(table->dbt_t_lock);
344 	mutex_destroy(table->dbt_lock);
345 	mutex_destroy(&table->dbt_reaper_cv_lock);
346 	cv_destroy(&table->dbt_reaper_wait);
347 
348 	kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
349 	if (table->dbt_id_space)
350 		id_space_destroy(table->dbt_id_space);
351 	kmem_cache_destroy(table->dbt_mem_cache);
352 	kmem_free(table, sizeof (rfs4_table_t));
353 }
354 
355 rfs4_index_t *
356 rfs4_index_create(rfs4_table_t *table, char *keyname,
357     uint32_t (*hash)(void *),
358     bool_t (compare)(rfs4_entry_t, void *),
359     void *(*mkkey)(rfs4_entry_t),
360     bool_t createable)
361 {
362 	rfs4_index_t *idx;
363 
364 	ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
365 
366 	idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
367 
368 	idx->dbi_table = table;
369 	idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
370 	(void) strcpy(idx->dbi_keyname, keyname);
371 	idx->dbi_hash = hash;
372 	idx->dbi_compare = compare;
373 	idx->dbi_mkkey = mkkey;
374 	idx->dbi_tblidx = table->dbt_idxcnt;
375 	table->dbt_idxcnt++;
376 	if (createable) {
377 		table->dbt_ccnt++;
378 		if (table->dbt_ccnt > 1)
379 			panic("Table %s currently can have only have one "
380 			    "index that will allow creation of entries",
381 			    table->dbt_name);
382 		idx->dbi_createable = TRUE;
383 	} else {
384 		idx->dbi_createable = FALSE;
385 	}
386 
387 	idx->dbi_inext = table->dbt_indices;
388 	table->dbt_indices = idx;
389 	idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
390 	    KM_SLEEP);
391 
392 	return (idx);
393 }
394 
395 void
396 rfs4_index_destroy(rfs4_index_t *idx)
397 {
398 	kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
399 	kmem_free(idx->dbi_buckets,
400 	    sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
401 	kmem_free(idx, sizeof (rfs4_index_t));
402 }
403 
404 static void
405 rfs4_dbe_destroy(rfs4_dbe_t *entry)
406 {
407 	rfs4_index_t *idx;
408 	void *key;
409 	int i;
410 	rfs4_bucket_t *bp;
411 	rfs4_table_t *table = entry->dbe_table;
412 	rfs4_link_t *l;
413 
414 	NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
415 	    (CE_NOTE, "Destroying entry %p from %s",
416 	    (void*)entry, table->dbt_name));
417 
418 	mutex_enter(entry->dbe_lock);
419 	ASSERT(entry->dbe_refcnt == 0);
420 	mutex_exit(entry->dbe_lock);
421 
422 	/* Unlink from all indices */
423 	for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
424 		l = &entry->dbe_indices[idx->dbi_tblidx];
425 		/* check and see if we were ever linked in to the index */
426 		if (INVALID_LINK(l)) {
427 			ASSERT(l->next == NULL && l->prev == NULL);
428 			continue;
429 		}
430 		key = idx->dbi_mkkey(entry->dbe_data);
431 		i = HASH(idx, key);
432 		bp = &idx->dbi_buckets[i];
433 		ASSERT(bp->dbk_head != NULL);
434 		DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
435 	}
436 
437 	/* Destroy user data */
438 	if (table->dbt_destroy)
439 		(*table->dbt_destroy)(entry->dbe_data);
440 
441 	if (table->dbt_id_space)
442 		id_free(table->dbt_id_space, entry->dbe_id);
443 
444 	mutex_enter(table->dbt_lock);
445 	table->dbt_count--;
446 	mutex_exit(table->dbt_lock);
447 
448 	/* Destroy the entry itself */
449 	kmem_cache_free(table->dbt_mem_cache, entry);
450 }
451 
452 
453 static rfs4_dbe_t *
454 rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
455 {
456 	rfs4_dbe_t *entry;
457 	int i;
458 
459 	NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
460 	    (CE_NOTE, "Creating entry in table %s", table->dbt_name));
461 
462 	entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
463 
464 	entry->dbe_refcnt = 1;
465 	entry->dbe_invalid = FALSE;
466 	entry->dbe_skipsearch = FALSE;
467 	entry->dbe_time_rele = 0;
468 	entry->dbe_id = 0;
469 
470 	if (table->dbt_id_space)
471 		entry->dbe_id = id;
472 	entry->dbe_table = table;
473 
474 	for (i = 0; i < table->dbt_maxcnt; i++) {
475 		entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
476 		entry->dbe_indices[i].entry = entry;
477 		/*
478 		 * We mark the entry as not indexed by setting the low
479 		 * order bit, since address are word aligned. This has
480 		 * the advantage of causeing a trap if the address is
481 		 * used. After the entry is linked in to the
482 		 * corresponding index the bit will be cleared.
483 		 */
484 		INVALIDATE_ADDR(entry->dbe_indices[i].entry);
485 	}
486 
487 	entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
488 	bzero(entry->dbe_data, table->dbt_usize);
489 	entry->dbe_data->dbe = entry;
490 
491 	if (!(*table->dbt_create)(entry->dbe_data, data)) {
492 		kmem_cache_free(table->dbt_mem_cache, entry);
493 		return (NULL);
494 	}
495 
496 	mutex_enter(table->dbt_lock);
497 	table->dbt_count++;
498 	mutex_exit(table->dbt_lock);
499 
500 	return (entry);
501 }
502 
/*
 * Look up the entry matching "key" through index "idx", optionally
 * creating one when no match exists.
 *
 * The hash chain for the key is scanned under the bucket lock.  An
 * entry matches when it has a non-zero reference count, satisfies the
 * index's compare callback, and is either not marked skipsearch or the
 * caller passed RFS4_DBS_INVALID.  On a match an additional hold is
 * placed on the entry, *create is set to FALSE and the entry's data is
 * returned.
 *
 * If nothing matches, NULL is returned unless *create is set, the table
 * has a create callback, the index is createable and the table is not
 * already at dbt_maxentries.  In that case a new entry is built with
 * rfs4_dbe_create() (which passes "arg" to the table's create
 * callback), given a second hold on behalf of the hash table, and
 * linked first into this index and then into every other index of the
 * table.  NULL is also returned when the create callback fails.
 *
 * Note that *create is only written on the "found" path.
 */
rfs4_entry_t
rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
    rfs4_dbsearch_type_t dbsearch_type)
{
	int already_done;
	uint32_t i;
	rfs4_table_t *table = idx->dbi_table;
	rfs4_index_t *ip;
	rfs4_bucket_t *bp;
	rfs4_link_t *l;
	rfs4_dbe_t *entry;
	id_t id = -1;	/* -1 means "no id allocated yet" */

	i = HASH(idx, key);
	bp = &idx->dbi_buckets[i];

	NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
	    (CE_NOTE, "Searching for key %p in table %s by %s",
	    key, table->dbt_name, idx->dbi_keyname));

	rw_enter(bp->dbk_lock, RW_READER);
	/*
	 * Every path that jumps back here has dropped and re-taken the
	 * bucket lock, so the chain is rescanned in case a matching
	 * entry was added in the meantime.
	 */
retry:
	for (l = bp->dbk_head; l; l = l->next) {
		if (l->entry->dbe_refcnt > 0 &&
		    (l->entry->dbe_skipsearch == FALSE ||
		    (l->entry->dbe_skipsearch == TRUE &&
		    dbsearch_type == RFS4_DBS_INVALID)) &&
		    (*idx->dbi_compare)(l->entry->dbe_data, key)) {
			mutex_enter(l->entry->dbe_lock);
			/*
			 * Re-check under the entry lock: the count may have
			 * dropped to zero since the unlocked test above.
			 */
			if (l->entry->dbe_refcnt == 0) {
				mutex_exit(l->entry->dbe_lock);
				continue;
			}

			/* place an additional hold since we are returning */
			rfs4_dbe_hold(l->entry);

			mutex_exit(l->entry->dbe_lock);
			rw_exit(bp->dbk_lock);

			*create = FALSE;

			NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
			    (CE_NOTE, "Found entry %p for %p in table %s",
			    (void *)l->entry, key, table->dbt_name));

			/* an id reserved on an earlier pass is not needed */
			if (id != -1)
				id_free(table->dbt_id_space, id);
			return (l->entry->dbe_data);
		}
	}

	/* Not found: bail out unless creation is both requested and allowed */
	if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
	    table->dbt_maxentries == table->dbt_count) {
		NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
		    (CE_NOTE, "Entry for %p in %s not found",
		    key, table->dbt_name));

		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);
		return (NULL);
	}

	if (table->dbt_id_space && id == -1) {
		/* get an id but don't sleep for it */
		id = id_alloc_nosleep(table->dbt_id_space);
		if (id == -1) {
			/* drop the bucket lock before a sleeping allocation */
			rw_exit(bp->dbk_lock);

			/* get an id, ok to sleep for it here */
			id = id_alloc(table->dbt_id_space);

			rw_enter(bp->dbk_lock, RW_WRITER);
			goto retry;
		}
	}

	/* get an exclusive lock on the bucket */
	if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
		NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
		    (CE_NOTE, "Trying to upgrade lock on "
		    "hash chain %d (%p) for  %s by %s",
		    i, (void*)bp, table->dbt_name, idx->dbi_keyname));

		/* upgrade failed: re-acquire as writer and search again */
		rw_exit(bp->dbk_lock);
		rw_enter(bp->dbk_lock, RW_WRITER);
		goto retry;
	}

	/* create entry */
	entry = rfs4_dbe_create(table, id, arg);
	if (entry == NULL) {
		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);

		NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
		    (CE_NOTE, "Constructor for table %s failed",
		    table->dbt_name));
		return (NULL);
	}

	/*
	 * Add one ref for entry into table's hash - only one
	 * reference added even though there may be multiple indices
	 */
	rfs4_dbe_hold(entry);
	ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
	VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);

	already_done = idx->dbi_tblidx;
	rw_exit(bp->dbk_lock);

	/* Link the new entry into each of the table's remaining indices */
	for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
		if (ip->dbi_tblidx == already_done)
			continue;
		l = &entry->dbe_indices[ip->dbi_tblidx];
		i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
		ASSERT(i < ip->dbi_table->dbt_len);
		bp = &ip->dbi_buckets[i];
		ENQUEUE_IDX(bp, l);
	}

	NFS4_DEBUG(
	    table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
	    (CE_NOTE, "Entry %p created for %s = %p in table %s",
	    (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));

	return (entry->dbe_data);
}
634 
635 /*ARGSUSED*/
636 boolean_t
637 rfs4_cpr_callb(void *arg, int code)
638 {
639 	rfs4_table_t *table = rfs4_client_tab;
640 	rfs4_bucket_t *buckets, *bp;
641 	rfs4_link_t *l;
642 	rfs4_client_t *cp;
643 	int i;
644 
645 	/*
646 	 * We get called for Suspend and Resume events.
647 	 * For the suspend case we simply don't care!  Nor do we care if
648 	 * there are no clients.
649 	 */
650 	if (code == CB_CODE_CPR_CHKPT || table == NULL) {
651 		return (B_TRUE);
652 	}
653 
654 	buckets = table->dbt_indices->dbi_buckets;
655 
656 	/*
657 	 * When we get this far we are in the process of
658 	 * resuming the system from a previous suspend.
659 	 *
660 	 * We are going to blast through and update the
661 	 * last_access time for all the clients and in
662 	 * doing so extend them by one lease period.
663 	 */
664 	for (i = 0; i < table->dbt_len; i++) {
665 		bp = &buckets[i];
666 		for (l = bp->dbk_head; l; l = l->next) {
667 			cp = (rfs4_client_t *)l->entry->dbe_data;
668 			cp->rc_last_access = gethrestime_sec();
669 		}
670 	}
671 
672 	return (B_TRUE);
673 }
674 
675 /*
676  * Given a table, lock each of the buckets and walk all entries (in
677  * turn locking those) and calling the provided "callout" function
678  * with the provided parameter.  Obviously used to iterate across all
679  * entries in a particular table via the database locking hierarchy.
680  * Obviously the caller must not hold locks on any of the entries in
681  * the specified table.
682  */
683 void
684 rfs4_dbe_walk(rfs4_table_t *table,
685     void (*callout)(rfs4_entry_t, void *),
686     void *data)
687 {
688 	rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
689 	rfs4_link_t *l;
690 	rfs4_dbe_t *entry;
691 	int i;
692 
693 	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
694 	    (CE_NOTE, "Walking entries in %s", table->dbt_name));
695 
696 	/* Walk the buckets looking for entries to release/destroy */
697 	for (i = 0; i < table->dbt_len; i++) {
698 		bp = &buckets[i];
699 		rw_enter(bp->dbk_lock, RW_READER);
700 		for (l = bp->dbk_head; l; l = l->next) {
701 			entry = l->entry;
702 			mutex_enter(entry->dbe_lock);
703 			(*callout)(entry->dbe_data, data);
704 			mutex_exit(entry->dbe_lock);
705 		}
706 		rw_exit(bp->dbk_lock);
707 	}
708 
709 	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
710 	    (CE_NOTE, "Walking entries complete %s", table->dbt_name));
711 }
712 
713 
714 static void
715 rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
716 {
717 	rfs4_index_t *idx = table->dbt_indices;
718 	rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
719 	rfs4_link_t *l, *t;
720 	rfs4_dbe_t *entry;
721 	bool_t found;
722 	int i;
723 	int count = 0;
724 
725 	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
726 	    (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
727 	    desired, cache_time, table->dbt_name));
728 
729 	/* Walk the buckets looking for entries to release/destroy */
730 	for (i = 0; i < table->dbt_len; i++) {
731 		bp = &buckets[i];
732 		do {
733 			found = FALSE;
734 			rw_enter(bp->dbk_lock, RW_READER);
735 			for (l = bp->dbk_head; l; l = l->next) {
736 				entry = l->entry;
737 				/*
738 				 * Examine an entry.  Ref count of 1 means
739 				 * that the only reference is for the hash
740 				 * table reference.
741 				 */
742 				if (entry->dbe_refcnt != 1)
743 					continue;
744 				mutex_enter(entry->dbe_lock);
745 				if ((entry->dbe_refcnt == 1) &&
746 				    (table->dbt_reaper_shutdown ||
747 				    table->dbt_expiry == NULL ||
748 				    (*table->dbt_expiry)(entry->dbe_data))) {
749 					entry->dbe_refcnt--;
750 					count++;
751 					found = TRUE;
752 				}
753 				mutex_exit(entry->dbe_lock);
754 			}
755 			if (found) {
756 				if (!rw_tryupgrade(bp->dbk_lock)) {
757 					rw_exit(bp->dbk_lock);
758 					rw_enter(bp->dbk_lock, RW_WRITER);
759 				}
760 
761 				l = bp->dbk_head;
762 				while (l) {
763 					t = l;
764 					entry = t->entry;
765 					l = l->next;
766 					if (entry->dbe_refcnt == 0) {
767 						DEQUEUE(bp->dbk_head, t);
768 						t->next = NULL;
769 						t->prev = NULL;
770 						INVALIDATE_ADDR(t->entry);
771 						rfs4_dbe_destroy(entry);
772 					}
773 				}
774 			}
775 			rw_exit(bp->dbk_lock);
776 			/*
777 			 * delay slightly if there is more work to do
778 			 * with the expectation that other reaper
779 			 * threads are freeing data structures as well
780 			 * and in turn will reduce ref counts on
781 			 * entries in this table allowing them to be
782 			 * released.  This is only done in the
783 			 * instance that the tables are being shut down.
784 			 */
785 			if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
786 				delay(hz/100);
787 		/*
788 		 * If this is a table shutdown, keep going until
789 		 * everything is gone
790 		 */
791 		} while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);
792 
793 		if (!table->dbt_reaper_shutdown && desired && count >= desired)
794 			break;
795 	}
796 
797 	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
798 	    (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
799 	    count, cache_time, table->dbt_name));
800 }
801 
802 
803 static void
804 reaper_thread(caddr_t *arg)
805 {
806 	rfs4_table_t *table = (rfs4_table_t *)arg;
807 	clock_t rc, time, wakeup;
808 
809 	NFS4_DEBUG(table->dbt_debug,
810 	    (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));
811 
812 	CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
813 	    callb_generic_cpr, "nfsv4Reaper");
814 
815 	time = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
816 	wakeup = SEC_TO_TICK(time);
817 
818 	mutex_enter(&table->dbt_reaper_cv_lock);
819 	do {
820 		CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
821 		rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
822 		    &table->dbt_reaper_cv_lock, wakeup, TR_CLOCK_TICK);
823 		CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
824 		    &table->dbt_reaper_cv_lock);
825 		rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
826 	} while (rc != 0 && table->dbt_reaper_shutdown == FALSE);
827 
828 	CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);
829 
830 	NFS4_DEBUG(table->dbt_debug,
831 	    (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));
832 
833 	/* Notify the database shutdown processing that the table is shutdown */
834 	mutex_enter(table->dbt_db->db_lock);
835 	table->dbt_db->db_shutdown_count--;
836 	cv_signal(&table->dbt_db->db_shutdown_wait);
837 	mutex_exit(table->dbt_db->db_lock);
838 }
839 
840 static void
841 rfs4_start_reaper(rfs4_table_t *table)
842 {
843 	if (table->dbt_max_cache_time == 0)
844 		return;
845 
846 	(void) thread_create(NULL, 0, reaper_thread, table, 0, &p0, TS_RUN,
847 	    minclsyspri);
848 }
849 
850 #ifdef DEBUG
851 void
852 rfs4_dbe_debug(rfs4_dbe_t *entry)
853 {
854 	cmn_err(CE_NOTE, "Entry %p from table %s",
855 	    (void *)entry, entry->dbe_table->dbt_name);
856 	cmn_err(CE_CONT, "\trefcnt = %d id = %d",
857 	    entry->dbe_refcnt, entry->dbe_id);
858 }
859 #endif
860