/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 */

#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/disp.h>
#include <sys/id_space.h>
#include <sys/atomic.h>
#include <rpc/rpc.h>
#include <nfs/nfs4.h>
#include <nfs/nfs4_db_impl.h>
#include <sys/sdt.h>

static int rfs4_reap_interval = RFS4_REAP_INTERVAL;

static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
static void rfs4_dbe_destroy(rfs4_dbe_t *);
static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
static void rfs4_start_reaper(rfs4_table_t *);

/*
 * t_lowat - integer percentage of table entries	/etc/system only
 * t_hiwat - integer percentage of table entries	/etc/system only
 * t_lreap - integer percentage of table reap time	mdb or /etc/system
 * t_hreap - integer percentage of table reap time	mdb or /etc/system
 */
uint32_t	t_lowat = 50;	/* reap at t_lreap when id's in use hit 50% */
uint32_t	t_hiwat = 75;	/* reap at t_hreap when id's in use hit 75% */
time_t		t_lreap = 50;	/* default to 50% of table's reap interval */
time_t		t_hreap = 10;	/* default to 10% of table's reap interval */

id_t
rfs4_dbe_getid(rfs4_dbe_t *entry)
{
	return (entry->dbe_id);
}

void
rfs4_dbe_hold(rfs4_dbe_t *entry)
{
	atomic_inc_32(&entry->dbe_refcnt);
}

/*
 * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
 */
void
rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
{
	atomic_dec_32(&entry->dbe_refcnt);
}


uint32_t
rfs4_dbe_refcnt(rfs4_dbe_t *entry)
{
	return (entry->dbe_refcnt);
}

/*
 * Mark an entry such that the dbsearch will skip it.
 * Caller does not want this entry to be found any longer
 */
void
rfs4_dbe_invalidate(rfs4_dbe_t *entry)
{
	entry->dbe_invalid = TRUE;
	entry->dbe_skipsearch = TRUE;
}

/*
 * Is this entry invalid?
 */
bool_t
rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
{
	return (entry->dbe_invalid);
}

time_t
rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
{
	return (entry->dbe_time_rele);
}

/*
 * Use these to temporarily hide/unhide a db entry.
 */
void
rfs4_dbe_hide(rfs4_dbe_t *entry)
{
	rfs4_dbe_lock(entry);
	entry->dbe_skipsearch = TRUE;
	rfs4_dbe_unlock(entry);
}

void
rfs4_dbe_unhide(rfs4_dbe_t *entry)
{
	rfs4_dbe_lock(entry);
	entry->dbe_skipsearch = FALSE;
	rfs4_dbe_unlock(entry);
}

void
rfs4_dbe_rele(rfs4_dbe_t *entry)
{
	mutex_enter(entry->dbe_lock);
	ASSERT(entry->dbe_refcnt > 1);
	atomic_dec_32(&entry->dbe_refcnt);
	entry->dbe_time_rele = gethrestime_sec();
	mutex_exit(entry->dbe_lock);
}

void
rfs4_dbe_lock(rfs4_dbe_t *entry)
{
	mutex_enter(entry->dbe_lock);
}

void
rfs4_dbe_unlock(rfs4_dbe_t *entry)
{
	mutex_exit(entry->dbe_lock);
}

bool_t
rfs4_dbe_islocked(rfs4_dbe_t *entry)
{
	return (mutex_owned(entry->dbe_lock));
}

clock_t
rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
{
	return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
}

void
rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
{
	cv_broadcast(entry->dbe_cv);
}

/* ARGSUSED */
static int
rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
{
	rfs4_dbe_t *entry = obj;

	mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);

	return (0);
}

static void
rfs4_dbe_kmem_destructor(void *obj, void *private)
{
	rfs4_dbe_t *entry = obj;
	/*LINTED*/
	rfs4_table_t *table = private;

	mutex_destroy(entry->dbe_lock);
	cv_destroy(entry->dbe_cv);
}

rfs4_database_t *
rfs4_database_create(uint32_t flags)
{
	rfs4_database_t *db;

	db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
	mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
	db->db_tables = NULL;
	db->db_debug_flags = flags;
	db->db_shutdown_count = 0;
	cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
	return (db);
}


/*
 * The reaper threads that have been created for the tables in this
 * database must be stopped and the entries in the tables released.
 * Each table will be marked as "shutdown" and the reaper threads
 * poked and they will see that a shutdown is in progress and cleanup
 * and exit.  This function waits for all reaper threads to stop
 * before returning to the caller.
 */
void
rfs4_database_shutdown(rfs4_database_t *db)
{
	rfs4_table_t *table;

	mutex_enter(db->db_lock);
	for (table = db->db_tables; table; table = table->dbt_tnext) {
		mutex_enter(&table->dbt_reaper_cv_lock);
		table->dbt_reaper_shutdown = TRUE;
		cv_broadcast(&table->dbt_reaper_wait);
		db->db_shutdown_count++;
		mutex_exit(&table->dbt_reaper_cv_lock);
	}
	while (db->db_shutdown_count > 0) {
		cv_wait(&db->db_shutdown_wait, db->db_lock);
	}
	mutex_exit(db->db_lock);
}

/*
 * Given a database that has been "shutdown" by the function above all
 * of the table tables are destroyed and then the database itself
 * freed.
 */
void
rfs4_database_destroy(rfs4_database_t *db)
{
	rfs4_table_t *next, *tmp;

	for (next = db->db_tables; next; ) {
		tmp = next;
		next = tmp->dbt_tnext;
		rfs4_table_destroy(db, tmp);
	}

	mutex_destroy(db->db_lock);
	kmem_free(db, sizeof (rfs4_database_t));
}

/*
 * Used to get the correct kmem_cache database for the state table being
 * created.
 * Helper function for rfs4_table_create
 */
static kmem_cache_t *
get_db_mem_cache(char *name)
{
	int i;

	for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
		if (strcmp(name, rfs4_db_mem_cache_table[i].r_db_name) == 0)
			return (rfs4_db_mem_cache_table[i].r_db_mem_cache);
	}
	/*
	 * There is no associated kmem cache for this NFS4 server state
	 * table name
	 */
	return (NULL);
}

/*
 * Used to initialize the global NFSv4 server state database.
 * Helper funtion for rfs4_state_g_init and called when module is loaded.
 */
kmem_cache_t *
/* CSTYLED */
nfs4_init_mem_cache(char *cache_name, uint32_t idxcnt, uint32_t size, uint32_t idx)
{
	kmem_cache_t *mem_cache = kmem_cache_create(cache_name,
	    sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
	    0,
	    rfs4_dbe_kmem_constructor,
	    rfs4_dbe_kmem_destructor,
	    NULL,
	    NULL,
	    NULL,
	    0);
	(void) strlcpy(rfs4_db_mem_cache_table[idx].r_db_name, cache_name,
	    strlen(cache_name) + 1);
	rfs4_db_mem_cache_table[idx].r_db_mem_cache = mem_cache;
	return (mem_cache);
}

rfs4_table_t *
rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
    uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
    void (*destroy)(rfs4_entry_t),
    bool_t (*expiry)(rfs4_entry_t),
    uint32_t size, uint32_t hashsize,
    uint32_t maxentries, id_t start)
{
	rfs4_table_t	*table;
	int		 len;
	char		*cache_name;
	char		*id_name;

	table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
	table->dbt_db = db;
	rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);

	len = strlen(tabname);
	table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
	cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
	(void) strcpy(table->dbt_name, tabname);
	(void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
	table->dbt_max_cache_time = max_cache_time;
	table->dbt_usize = size;
	table->dbt_len = hashsize;
	table->dbt_count = 0;
	table->dbt_idxcnt = 0;
	table->dbt_ccnt = 0;
	table->dbt_maxcnt = idxcnt;
	table->dbt_indices = NULL;
	table->dbt_id_space = NULL;
	table->dbt_reaper_shutdown = FALSE;

	if (start >= 0) {
		if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
			maxentries = INT32_MAX - start;
		id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
		(void) sprintf(id_name, "%s_id_space", table->dbt_name);
		table->dbt_id_space = id_space_create(id_name, start,
		    maxentries + start);
		kmem_free(id_name, len + 10);
	}
	ASSERT(t_lowat != 0);
	table->dbt_id_lwat = (maxentries * t_lowat) / 100;
	ASSERT(t_hiwat != 0);
	table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
	table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
	table->dbt_maxentries = maxentries;
	table->dbt_create = create;
	table->dbt_destroy = destroy;
	table->dbt_expiry = expiry;

	/*
	 * get the correct kmem_cache for this table type based on the name.
	 */
	table->dbt_mem_cache = get_db_mem_cache(cache_name);

	kmem_free(cache_name, len+13);

	table->dbt_debug = db->db_debug_flags;

	mutex_enter(db->db_lock);
	table->dbt_tnext = db->db_tables;
	db->db_tables = table;
	mutex_exit(db->db_lock);

	rfs4_start_reaper(table);

	return (table);
}

void
rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
{
	rfs4_table_t *p;
	rfs4_index_t *idx;

	ASSERT(table->dbt_count == 0);

	mutex_enter(db->db_lock);
	if (table == db->db_tables)
		db->db_tables = table->dbt_tnext;
	else {
		for (p = db->db_tables; p; p = p->dbt_tnext)
			if (p->dbt_tnext == table) {
				p->dbt_tnext = table->dbt_tnext;
				table->dbt_tnext = NULL;
				break;
			}
		ASSERT(p != NULL);
	}
	mutex_exit(db->db_lock);

	/* Destroy indices */
	while (table->dbt_indices) {
		idx = table->dbt_indices;
		table->dbt_indices = idx->dbi_inext;
		rfs4_index_destroy(idx);
	}

	rw_destroy(table->dbt_t_lock);
	mutex_destroy(table->dbt_lock);
	mutex_destroy(&table->dbt_reaper_cv_lock);
	cv_destroy(&table->dbt_reaper_wait);

	kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
	if (table->dbt_id_space)
		id_space_destroy(table->dbt_id_space);
	table->dbt_mem_cache = NULL;
	kmem_free(table, sizeof (rfs4_table_t));
}

rfs4_index_t *
rfs4_index_create(rfs4_table_t *table, char *keyname,
    uint32_t (*hash)(void *),
    bool_t (compare)(rfs4_entry_t, void *),
    void *(*mkkey)(rfs4_entry_t),
    bool_t createable)
{
	rfs4_index_t *idx;

	ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);

	idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);

	idx->dbi_table = table;
	idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
	(void) strcpy(idx->dbi_keyname, keyname);
	idx->dbi_hash = hash;
	idx->dbi_compare = compare;
	idx->dbi_mkkey = mkkey;
	idx->dbi_tblidx = table->dbt_idxcnt;
	table->dbt_idxcnt++;
	if (createable) {
		table->dbt_ccnt++;
		if (table->dbt_ccnt > 1)
			panic("Table %s currently can have only have one "
			    "index that will allow creation of entries",
			    table->dbt_name);
		idx->dbi_createable = TRUE;
	} else {
		idx->dbi_createable = FALSE;
	}

	idx->dbi_inext = table->dbt_indices;
	table->dbt_indices = idx;
	idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
	    KM_SLEEP);

	return (idx);
}

void
rfs4_index_destroy(rfs4_index_t *idx)
{
	kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
	kmem_free(idx->dbi_buckets,
	    sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
	kmem_free(idx, sizeof (rfs4_index_t));
}

static void
rfs4_dbe_destroy(rfs4_dbe_t *entry)
{
	rfs4_index_t *idx;
	void *key;
	int i;
	rfs4_bucket_t *bp;
	rfs4_table_t *table = entry->dbe_table;
	rfs4_link_t *l;

	NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
	    (CE_NOTE, "Destroying entry %p from %s",
	    (void*)entry, table->dbt_name));

	mutex_enter(entry->dbe_lock);
	ASSERT(entry->dbe_refcnt == 0);
	mutex_exit(entry->dbe_lock);

	/* Unlink from all indices */
	for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
		l = &entry->dbe_indices[idx->dbi_tblidx];
		/* check and see if we were ever linked in to the index */
		if (INVALID_LINK(l)) {
			ASSERT(l->next == NULL && l->prev == NULL);
			continue;
		}
		key = idx->dbi_mkkey(entry->dbe_data);
		i = HASH(idx, key);
		bp = &idx->dbi_buckets[i];
		ASSERT(bp->dbk_head != NULL);
		DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
	}

	/* Destroy user data */
	if (table->dbt_destroy)
		(*table->dbt_destroy)(entry->dbe_data);

	if (table->dbt_id_space)
		id_free(table->dbt_id_space, entry->dbe_id);

	mutex_enter(table->dbt_lock);
	table->dbt_count--;
	mutex_exit(table->dbt_lock);

	/* Destroy the entry itself */
	kmem_cache_free(table->dbt_mem_cache, entry);
}


static rfs4_dbe_t *
rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
{
	rfs4_dbe_t *entry;
	int i;

	NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
	    (CE_NOTE, "Creating entry in table %s", table->dbt_name));

	entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);

	entry->dbe_refcnt = 1;
	entry->dbe_invalid = FALSE;
	entry->dbe_skipsearch = FALSE;
	entry->dbe_time_rele = 0;
	entry->dbe_id = 0;

	if (table->dbt_id_space)
		entry->dbe_id = id;
	entry->dbe_table = table;

	for (i = 0; i < table->dbt_maxcnt; i++) {
		entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
		entry->dbe_indices[i].entry = entry;
		/*
		 * We mark the entry as not indexed by setting the low
		 * order bit, since address are word aligned. This has
		 * the advantage of causeing a trap if the address is
		 * used. After the entry is linked in to the
		 * corresponding index the bit will be cleared.
		 */
		INVALIDATE_ADDR(entry->dbe_indices[i].entry);
	}

	entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
	bzero(entry->dbe_data, table->dbt_usize);
	entry->dbe_data->dbe = entry;

	if (!(*table->dbt_create)(entry->dbe_data, data)) {
		kmem_cache_free(table->dbt_mem_cache, entry);
		return (NULL);
	}

	mutex_enter(table->dbt_lock);
	table->dbt_count++;
	mutex_exit(table->dbt_lock);

	return (entry);
}

static void
rfs4_dbe_tabreap_adjust(rfs4_table_t *table)
{
	clock_t		tabreap;
	clock_t		reap_int;
	uint32_t	in_use;

	/*
	 * Adjust the table's reap interval based on the
	 * number of id's currently in use. Each table's
	 * default remains the same if id usage subsides.
	 */
	ASSERT(MUTEX_HELD(&table->dbt_reaper_cv_lock));
	tabreap = MIN(rfs4_reap_interval, table->dbt_max_cache_time);

	in_use = table->dbt_count + 1;	/* see rfs4_dbe_create */
	if (in_use >= table->dbt_id_hwat) {
		ASSERT(t_hreap != 0);
		reap_int = (tabreap * t_hreap) / 100;
	} else if (in_use >= table->dbt_id_lwat) {
		ASSERT(t_lreap != 0);
		reap_int = (tabreap * t_lreap) / 100;
	} else {
		reap_int = tabreap;
	}
	table->dbt_id_reap = reap_int;
	DTRACE_PROBE2(table__reap__interval, char *,
	    table->dbt_name, time_t, table->dbt_id_reap);
}

rfs4_entry_t
rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
    rfs4_dbsearch_type_t dbsearch_type)
{
	int		 already_done;
	uint32_t	 i;
	rfs4_table_t	*table = idx->dbi_table;
	rfs4_index_t	*ip;
	rfs4_bucket_t	*bp;
	rfs4_link_t	*l;
	rfs4_dbe_t	*entry;
	id_t		 id = -1;

	i = HASH(idx, key);
	bp = &idx->dbi_buckets[i];

	NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
	    (CE_NOTE, "Searching for key %p in table %s by %s",
	    key, table->dbt_name, idx->dbi_keyname));

	rw_enter(bp->dbk_lock, RW_READER);
retry:
	for (l = bp->dbk_head; l; l = l->next) {
		if (l->entry->dbe_refcnt > 0 &&
		    (l->entry->dbe_skipsearch == FALSE ||
		    (l->entry->dbe_skipsearch == TRUE &&
		    dbsearch_type == RFS4_DBS_INVALID)) &&
		    (*idx->dbi_compare)(l->entry->dbe_data, key)) {
			mutex_enter(l->entry->dbe_lock);
			if (l->entry->dbe_refcnt == 0) {
				mutex_exit(l->entry->dbe_lock);
				continue;
			}

			/* place an additional hold since we are returning */
			rfs4_dbe_hold(l->entry);

			mutex_exit(l->entry->dbe_lock);
			rw_exit(bp->dbk_lock);

			*create = FALSE;

			NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
			    (CE_NOTE, "Found entry %p for %p in table %s",
			    (void *)l->entry, key, table->dbt_name));

			if (id != -1)
				id_free(table->dbt_id_space, id);
			return (l->entry->dbe_data);
		}
	}

	if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
	    table->dbt_maxentries == table->dbt_count) {
		NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
		    (CE_NOTE, "Entry for %p in %s not found",
		    key, table->dbt_name));

		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);
		return (NULL);
	}

	if (table->dbt_id_space && id == -1) {
		rw_exit(bp->dbk_lock);

		/* get an id, ok to sleep for it here */
		id = id_alloc(table->dbt_id_space);
		ASSERT(id != -1);

		mutex_enter(&table->dbt_reaper_cv_lock);
		rfs4_dbe_tabreap_adjust(table);
		mutex_exit(&table->dbt_reaper_cv_lock);

		rw_enter(bp->dbk_lock, RW_WRITER);
		goto retry;
	}

	/* get an exclusive lock on the bucket */
	if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
		NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
		    (CE_NOTE, "Trying to upgrade lock on "
		    "hash chain %d (%p) for  %s by %s",
		    i, (void*)bp, table->dbt_name, idx->dbi_keyname));

		rw_exit(bp->dbk_lock);
		rw_enter(bp->dbk_lock, RW_WRITER);
		goto retry;
	}

	/* create entry */
	entry = rfs4_dbe_create(table, id, arg);
	if (entry == NULL) {
		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);

		NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
		    (CE_NOTE, "Constructor for table %s failed",
		    table->dbt_name));
		return (NULL);
	}

	/*
	 * Add one ref for entry into table's hash - only one
	 * reference added even though there may be multiple indices
	 */
	rfs4_dbe_hold(entry);
	ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
	VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);

	already_done = idx->dbi_tblidx;
	rw_exit(bp->dbk_lock);

	for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
		if (ip->dbi_tblidx == already_done)
			continue;
		l = &entry->dbe_indices[ip->dbi_tblidx];
		i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
		ASSERT(i < ip->dbi_table->dbt_len);
		bp = &ip->dbi_buckets[i];
		ENQUEUE_IDX(bp, l);
	}

	NFS4_DEBUG(
	    table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
	    (CE_NOTE, "Entry %p created for %s = %p in table %s",
	    (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));

	return (entry->dbe_data);
}

/*ARGSUSED*/
boolean_t
rfs4_cpr_callb(void *arg, int code)
{
	rfs4_bucket_t *buckets, *bp;
	rfs4_link_t *l;
	rfs4_client_t *cp;
	int i;

	nfs4_srv_t *nsrv4 = nfs4_get_srv();
	rfs4_table_t *table = nsrv4->rfs4_client_tab;

	/*
	 * We get called for Suspend and Resume events.
	 * For the suspend case we simply don't care!  Nor do we care if
	 * there are no clients.
	 */
	if (code == CB_CODE_CPR_CHKPT || table == NULL) {
		return (B_TRUE);
	}

	buckets = table->dbt_indices->dbi_buckets;

	/*
	 * When we get this far we are in the process of
	 * resuming the system from a previous suspend.
	 *
	 * We are going to blast through and update the
	 * last_access time for all the clients and in
	 * doing so extend them by one lease period.
	 */
	for (i = 0; i < table->dbt_len; i++) {
		bp = &buckets[i];
		for (l = bp->dbk_head; l; l = l->next) {
			cp = (rfs4_client_t *)l->entry->dbe_data;
			cp->rc_last_access = gethrestime_sec();
		}
	}

	return (B_TRUE);
}

/*
 * Given a table, lock each of the buckets and walk all entries (in
 * turn locking those) and calling the provided "callout" function
 * with the provided parameter.  Obviously used to iterate across all
 * entries in a particular table via the database locking hierarchy.
 * Obviously the caller must not hold locks on any of the entries in
 * the specified table.
 */
void
rfs4_dbe_walk(rfs4_table_t *table,
    void (*callout)(rfs4_entry_t, void *),
    void *data)
{
	rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
	rfs4_link_t *l;
	rfs4_dbe_t *entry;
	int i;

	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
	    (CE_NOTE, "Walking entries in %s", table->dbt_name));

	/* Walk the buckets looking for entries to release/destroy */
	for (i = 0; i < table->dbt_len; i++) {
		bp = &buckets[i];
		rw_enter(bp->dbk_lock, RW_READER);
		for (l = bp->dbk_head; l; l = l->next) {
			entry = l->entry;
			mutex_enter(entry->dbe_lock);
			(*callout)(entry->dbe_data, data);
			mutex_exit(entry->dbe_lock);
		}
		rw_exit(bp->dbk_lock);
	}

	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
	    (CE_NOTE, "Walking entries complete %s", table->dbt_name));
}


static void
rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
{
	rfs4_index_t *idx = table->dbt_indices;
	rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
	rfs4_link_t *l, *t;
	rfs4_dbe_t *entry;
	bool_t found;
	int i;
	int count = 0;

	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
	    (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
	    desired, cache_time, table->dbt_name));

	/* Walk the buckets looking for entries to release/destroy */
	for (i = 0; i < table->dbt_len; i++) {
		bp = &buckets[i];
		do {
			found = FALSE;
			rw_enter(bp->dbk_lock, RW_READER);
			for (l = bp->dbk_head; l; l = l->next) {
				entry = l->entry;
				/*
				 * Examine an entry.  Ref count of 1 means
				 * that the only reference is for the hash
				 * table reference.
				 */
				if (entry->dbe_refcnt != 1)
					continue;
				mutex_enter(entry->dbe_lock);
				if ((entry->dbe_refcnt == 1) &&
				    (table->dbt_reaper_shutdown ||
				    table->dbt_expiry == NULL ||
				    (*table->dbt_expiry)(entry->dbe_data))) {
					entry->dbe_refcnt--;
					count++;
					found = TRUE;
				}
				mutex_exit(entry->dbe_lock);
			}
			if (found) {
				if (!rw_tryupgrade(bp->dbk_lock)) {
					rw_exit(bp->dbk_lock);
					rw_enter(bp->dbk_lock, RW_WRITER);
				}

				l = bp->dbk_head;
				while (l) {
					t = l;
					entry = t->entry;
					l = l->next;
					if (entry->dbe_refcnt == 0) {
						DEQUEUE(bp->dbk_head, t);
						t->next = NULL;
						t->prev = NULL;
						INVALIDATE_ADDR(t->entry);
						rfs4_dbe_destroy(entry);
					}
				}
			}
			rw_exit(bp->dbk_lock);
			/*
			 * delay slightly if there is more work to do
			 * with the expectation that other reaper
			 * threads are freeing data structures as well
			 * and in turn will reduce ref counts on
			 * entries in this table allowing them to be
			 * released.  This is only done in the
			 * instance that the tables are being shut down.
			 */
			if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
				delay(hz/100);
		/*
		 * If this is a table shutdown, keep going until
		 * everything is gone
		 */
		} while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);

		if (!table->dbt_reaper_shutdown && desired && count >= desired)
			break;
	}

	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
	    (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
	    count, cache_time, table->dbt_name));
}

static void
reaper_thread(caddr_t *arg)
{
	rfs4_table_t	*table = (rfs4_table_t *)arg;
	clock_t		 rc;

	NFS4_DEBUG(table->dbt_debug,
	    (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));

	CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
	    callb_generic_cpr, "nfsv4Reaper");

	mutex_enter(&table->dbt_reaper_cv_lock);
	do {
		CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
		rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
		    &table->dbt_reaper_cv_lock,
		    SEC_TO_TICK(table->dbt_id_reap), TR_CLOCK_TICK);
		CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
		    &table->dbt_reaper_cv_lock);
		rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
	} while (rc != 0 && table->dbt_reaper_shutdown == FALSE);

	CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);

	NFS4_DEBUG(table->dbt_debug,
	    (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));

	/* Notify the database shutdown processing that the table is shutdown */
	mutex_enter(table->dbt_db->db_lock);
	table->dbt_db->db_shutdown_count--;
	cv_signal(&table->dbt_db->db_shutdown_wait);
	mutex_exit(table->dbt_db->db_lock);
	zthread_exit();
}

static void
rfs4_start_reaper(rfs4_table_t *table)
{
	if (table->dbt_max_cache_time == 0)
		return;

	(void) zthread_create(NULL, 0, reaper_thread, table, 0,
	    minclsyspri);
}

#ifdef DEBUG
void
rfs4_dbe_debug(rfs4_dbe_t *entry)
{
	cmn_err(CE_NOTE, "Entry %p from table %s",
	    (void *)entry, entry->dbe_table->dbt_name);
	cmn_err(CE_CONT, "\trefcnt = %d id = %d",
	    entry->dbe_refcnt, entry->dbe_id);
}
#endif