common/klm/nlm_impl.c

/*
 * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
 * Authors: Doug Rabson <dfr@rabson.org>
 * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * NFS LockManager, start/stop, support functions, etc.
 * Most of the interesting code is here.
 *
 * Source code derived from FreeBSD nlm_prot_impl.c
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/share.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/class.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/queue.h>
#include <sys/bitmap.h>
#include <sys/sdt.h>
#include <netinet/in.h>

#include <rpc/rpc.h>
#include <rpc/xdr.h>
#include <rpc/pmap_prot.h>
#include <rpc/pmap_clnt.h>
#include <rpc/rpcb_prot.h>

#include <rpcsvc/nlm_prot.h>
#include <rpcsvc/sm_inter.h>
#include <rpcsvc/nsm_addr.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/export.h>
#include <nfs/rnode.h>
#include <nfs/lm.h>

#include "nlm_impl.h"

struct nlm_knc {
	struct knetconfig	n_knc;
	const char		*n_netid;
};

/*
 * Number of attempts NLM tries to obtain RPC binding
 * of local statd.
 */
#define	NLM_NSM_RPCBIND_RETRIES 10

/*
 * Timeout (in seconds) NLM waits before making another
 * attempt to obtain RPC binding of local statd.
 */
#define	NLM_NSM_RPCBIND_TIMEOUT 5

/*
 * Total number of sysids in NLM sysid bitmap
 */
#define	NLM_BMAP_NITEMS	(LM_SYSID_MAX + 1)

/*
 * Number of ulong_t words in bitmap that is used
 * for allocation of sysid numbers.
 */
#define	NLM_BMAP_WORDS  (NLM_BMAP_NITEMS / BT_NBIPUL)

/*
 * Given an integer x, the macro returns
 * -1 if x is negative,
 *  0 if x is zero
 *  1 if x is positive
 */
#define	SIGN(x) (((x) > 0) - ((x) < 0))

#define	ARRSIZE(arr)	(sizeof (arr) / sizeof ((arr)[0]))
#define	NLM_KNCS	ARRSIZE(nlm_netconfigs)

krwlock_t lm_lck;

/*
 * Zero timeout for asynchronous NLM RPC operations
 */
static const struct timeval nlm_rpctv_zero = { 0,  0 };

/*
 * List of all Zone globals nlm_globals instences
 * linked together.
 */
static struct nlm_globals_list nlm_zones_list; /* (g) */

/*
 * NLM kmem caches
 */
static struct kmem_cache *nlm_hosts_cache = NULL;
static struct kmem_cache *nlm_vhold_cache = NULL;

/*
 * A bitmap for allocation of new sysids.
 * Sysid is a unique number between LM_SYSID
 * and LM_SYSID_MAX. Sysid represents unique remote
 * host that does file locks on the given host.
 */
static ulong_t	nlm_sysid_bmap[NLM_BMAP_WORDS];	/* (g) */
static int	nlm_sysid_nidx;			/* (g) */

/*
 * RPC service registration for all transports
 */
static SVC_CALLOUT nlm_svcs[] = {
	{ NLM_PROG, 4, 4, nlm_prog_4 },	/* NLM4_VERS */
	{ NLM_PROG, 1, 3, nlm_prog_3 }	/* NLM_VERS - NLM_VERSX */
};

static SVC_CALLOUT_TABLE nlm_sct = {
	ARRSIZE(nlm_svcs),
	FALSE,
	nlm_svcs
};

/*
 * Static table of all netid/knetconfig network
 * lock manager can work with. nlm_netconfigs table
 * is used when we need to get valid knetconfig by
 * netid and vice versa.
 *
 * Knetconfigs are activated either by the call from
 * user-space lockd daemon (server side) or by taking
 * knetconfig from NFS mountinfo (client side)
 */
static struct nlm_knc nlm_netconfigs[] = { /* (g) */
	/* UDP */
	{
		{ NC_TPI_CLTS, NC_INET, NC_UDP, NODEV },
		"udp",
	},
	/* TCP */
	{
		{ NC_TPI_COTS_ORD, NC_INET, NC_TCP, NODEV },
		"tcp",
	},
	/* UDP over IPv6 */
	{
		{ NC_TPI_CLTS, NC_INET6, NC_UDP, NODEV },
		"udp6",
	},
	/* TCP over IPv6 */
	{
		{ NC_TPI_COTS_ORD, NC_INET6, NC_TCP, NODEV },
		"tcp6",
	},
	/* ticlts (loopback over UDP) */
	{
		{ NC_TPI_CLTS, NC_LOOPBACK, NC_NOPROTO, NODEV },
		"ticlts",
	},
	/* ticotsord (loopback over TCP) */
	{
		{ NC_TPI_COTS_ORD, NC_LOOPBACK, NC_NOPROTO, NODEV },
		"ticotsord",
	},
};

/*
 * NLM misc. function
 */
static void nlm_copy_netbuf(struct netbuf *, struct netbuf *);
static int nlm_netbuf_addrs_cmp(struct netbuf *, struct netbuf *);
static void nlm_kmem_reclaim(void *);
static void nlm_pool_shutdown(void);
static void nlm_suspend_zone(struct nlm_globals *);
static void nlm_resume_zone(struct nlm_globals *);
static void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *);
static void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *);

/*
 * NLM thread functions
 */
static void nlm_gc(struct nlm_globals *);
static void nlm_reclaimer(struct nlm_host *);

/*
 * NLM NSM functions
 */
static int nlm_init_local_knc(struct knetconfig *);
static int nlm_nsm_init_local(struct nlm_nsm *);
static int nlm_nsm_init(struct nlm_nsm *, struct knetconfig *, struct netbuf *);
static void nlm_nsm_fini(struct nlm_nsm *);
static enum clnt_stat nlm_nsm_simu_crash(struct nlm_nsm *);
static enum clnt_stat nlm_nsm_stat(struct nlm_nsm *, int32_t *);
static enum clnt_stat nlm_nsm_mon(struct nlm_nsm *, char *, uint16_t);
static enum clnt_stat nlm_nsm_unmon(struct nlm_nsm *, char *);

/*
 * NLM host functions
 */
static int nlm_host_ctor(void *, void *, int);
static void nlm_host_dtor(void *, void *);
static void nlm_host_destroy(struct nlm_host *);
static struct nlm_host *nlm_host_create(char *, const char *,
    struct knetconfig *, struct netbuf *);
static struct nlm_host *nlm_host_find_locked(struct nlm_globals *,
    const char *, struct netbuf *, avl_index_t *);
static void nlm_host_unregister(struct nlm_globals *, struct nlm_host *);
static void nlm_host_gc_vholds(struct nlm_host *);
static bool_t nlm_host_has_srv_locks(struct nlm_host *);
static bool_t nlm_host_has_cli_locks(struct nlm_host *);
static bool_t nlm_host_has_locks(struct nlm_host *);

/*
 * NLM vhold functions
 */
static int nlm_vhold_ctor(void *, void *, int);
static void nlm_vhold_dtor(void *, void *);
static void nlm_vhold_destroy(struct nlm_host *,
    struct nlm_vhold *);
static bool_t nlm_vhold_busy(struct nlm_host *, struct nlm_vhold *);
static void nlm_vhold_clean(struct nlm_vhold *, int);

/*
 * NLM client/server sleeping locks/share reservation functions
 */
struct nlm_slreq *nlm_slreq_find_locked(struct nlm_host *,
    struct nlm_vhold *, struct flock64 *);
static struct nlm_shres *nlm_shres_create_item(struct shrlock *, vnode_t *);
static void nlm_shres_destroy_item(struct nlm_shres *);
static bool_t nlm_shres_equal(struct shrlock *, struct shrlock *);

/*
 * NLM initialization functions.
 */
void
nlm_init(void)
{
	nlm_hosts_cache = kmem_cache_create("nlm_host_cache",
	    sizeof (struct nlm_host), 0, nlm_host_ctor, nlm_host_dtor,
	    nlm_kmem_reclaim, NULL, NULL, 0);

	nlm_vhold_cache = kmem_cache_create("nlm_vhold_cache",
	    sizeof (struct nlm_vhold), 0, nlm_vhold_ctor, nlm_vhold_dtor,
	    NULL, NULL, NULL, 0);

	nlm_rpc_init();
	TAILQ_INIT(&nlm_zones_list);

	/* initialize sysids bitmap */
	bzero(nlm_sysid_bmap, sizeof (nlm_sysid_bmap));
	nlm_sysid_nidx = 1;

	/*
	 * Reserv the sysid #0, because it's associated
	 * with local locks only. Don't let to allocate
	 * it for remote locks.
	 */
	BT_SET(nlm_sysid_bmap, 0);
}

void
nlm_globals_register(struct nlm_globals *g)
{
	rw_enter(&lm_lck, RW_WRITER);
	TAILQ_INSERT_TAIL(&nlm_zones_list, g, nlm_link);
	rw_exit(&lm_lck);
}

void
nlm_globals_unregister(struct nlm_globals *g)
{
	rw_enter(&lm_lck, RW_WRITER);
	TAILQ_REMOVE(&nlm_zones_list, g, nlm_link);
	rw_exit(&lm_lck);
}

/* ARGSUSED */
static void
nlm_kmem_reclaim(void *cdrarg)
{
	struct nlm_globals *g;

	rw_enter(&lm_lck, RW_READER);
	TAILQ_FOREACH(g, &nlm_zones_list, nlm_link)
		cv_broadcast(&g->nlm_gc_sched_cv);

	rw_exit(&lm_lck);
}

/*
 * NLM garbage collector thread (GC).
 *
 * NLM GC periodically checks whether there're any host objects
 * that can be cleaned up. It also releases stale vnodes that
 * live on the server side (under protection of vhold objects).
 *
 * NLM host objects are cleaned up from GC thread because
 * operations helping us to determine whether given host has
 * any locks can be quite expensive and it's not good to call
 * them every time the very last reference to the host is dropped.
 * Thus we use "lazy" approach for hosts cleanup.
 *
 * The work of GC is to release stale vnodes on the server side
 * and destroy hosts that haven't any locks and any activity for
 * some time (i.e. idle hosts).
 */
static void
nlm_gc(struct nlm_globals *g)
{
	struct nlm_host *hostp;
	clock_t now, idle_period;

	idle_period = SEC_TO_TICK(g->cn_idle_tmo);
	mutex_enter(&g->lock);
	for (;;) {
		/*
		 * GC thread can be explicitly scheduled from
		 * memory reclamation function.
		 */
		(void) cv_timedwait(&g->nlm_gc_sched_cv, &g->lock,
		    ddi_get_lbolt() + idle_period);

		/*
		 * NLM is shutting down, time to die.
		 */
		if (g->run_status == NLM_ST_STOPPING)
			break;

		now = ddi_get_lbolt();
		DTRACE_PROBE2(gc__start, struct nlm_globals *, g,
		    clock_t, now);

		/*
		 * Find all obviously unused vholds and destroy them.
		 */
		for (hostp = avl_first(&g->nlm_hosts_tree); hostp != NULL;
		    hostp = AVL_NEXT(&g->nlm_hosts_tree, hostp)) {
			struct nlm_vhold *nvp;

			mutex_enter(&hostp->nh_lock);

			nvp = TAILQ_FIRST(&hostp->nh_vholds_list);
			while (nvp != NULL) {
				struct nlm_vhold *new_nvp;

				new_nvp = TAILQ_NEXT(nvp, nv_link);

				/*
				 * If these conditions are met, the vhold is
				 * obviously unused and we will destroy it.  In
				 * a case either v_filocks and/or v_shrlocks is
				 * non-NULL the vhold might still be unused by
				 * the host, but it is expensive to check that.
				 * We defer such check until the host is idle.
				 * The expensive check is done below without
				 * the global lock held.
				 */
				if (nvp->nv_refcnt == 0 &&
				    nvp->nv_vp->v_filocks == NULL &&
				    nvp->nv_vp->v_shrlocks == NULL) {
					nlm_vhold_destroy(hostp, nvp);
				}

				nvp = new_nvp;
			}

			mutex_exit(&hostp->nh_lock);
		}

		/*
		 * Handle all hosts that are unused at the moment
		 * until we meet one with idle timeout in future.
		 */
		while ((hostp = TAILQ_FIRST(&g->nlm_idle_hosts)) != NULL) {
			bool_t has_locks;

			if (hostp->nh_idle_timeout > now)
				break;

			/*
			 * Drop global lock while doing expensive work
			 * on this host. We'll re-check any conditions
			 * that might change after retaking the global
			 * lock.
			 */
			mutex_exit(&g->lock);
			mutex_enter(&hostp->nh_lock);

			/*
			 * nlm_globals lock was dropped earlier because
			 * garbage collecting of vholds and checking whether
			 * host has any locks/shares are expensive operations.
			 */
			nlm_host_gc_vholds(hostp);
			has_locks = nlm_host_has_locks(hostp);

			mutex_exit(&hostp->nh_lock);
			mutex_enter(&g->lock);

			/*
			 * While we were doing expensive operations
			 * outside of nlm_globals critical section,
			 * somebody could take the host and remove it
			 * from the idle list.  Whether its been
			 * reinserted or not, our information about
			 * the host is outdated, and we should take no
			 * further action.
			 */
			if ((hostp->nh_flags & NLM_NH_INIDLE) == 0 ||
			    hostp->nh_idle_timeout > now)
				continue;

			/*
			 * If the host has locks we have to renew the
			 * host's timeout and put it at the end of LRU
			 * list.
			 */
			if (has_locks) {
				TAILQ_REMOVE(&g->nlm_idle_hosts,
				    hostp, nh_link);
				hostp->nh_idle_timeout = now + idle_period;
				TAILQ_INSERT_TAIL(&g->nlm_idle_hosts,
				    hostp, nh_link);
				continue;
			}

			/*
			 * We're here if all the following conditions hold:
			 * 1) Host hasn't any locks or share reservations
			 * 2) Host is unused
			 * 3) Host wasn't touched by anyone at least for
			 *    g->cn_idle_tmo seconds.
			 *
			 * So, now we can destroy it.
			 */
			nlm_host_unregister(g, hostp);
			mutex_exit(&g->lock);

			nlm_host_unmonitor(g, hostp);
			nlm_host_destroy(hostp);
			mutex_enter(&g->lock);
			if (g->run_status == NLM_ST_STOPPING)
				break;

		}

		DTRACE_PROBE(gc__end);
	}

	DTRACE_PROBE1(gc__exit, struct nlm_globals *, g);

	/* Let others know that GC has died */
	g->nlm_gc_thread = NULL;
	mutex_exit(&g->lock);

	cv_broadcast(&g->nlm_gc_finish_cv);
	zthread_exit();
}

/*
 * Thread reclaim locks/shares acquired by the client side
 * on the given server represented by hostp.
 */
static void
nlm_reclaimer(struct nlm_host *hostp)
{
	struct nlm_globals *g;

	mutex_enter(&hostp->nh_lock);
	hostp->nh_reclaimer = curthread;
	mutex_exit(&hostp->nh_lock);

	g = zone_getspecific(nlm_zone_key, curzone);
	nlm_reclaim_client(g, hostp);

	mutex_enter(&hostp->nh_lock);
	hostp->nh_flags &= ~NLM_NH_RECLAIM;
	hostp->nh_reclaimer = NULL;
	cv_broadcast(&hostp->nh_recl_cv);
	mutex_exit(&hostp->nh_lock);

	/*
	 * Host was explicitly referenced before
	 * nlm_reclaim() was called, release it
	 * here.
	 */
	nlm_host_release(g, hostp);
	zthread_exit();
}

/*
 * Copy a struct netobj.  (see xdr.h)
 */
void
nlm_copy_netobj(struct netobj *dst, struct netobj *src)
{
	dst->n_len = src->n_len;
	dst->n_bytes = kmem_alloc(src->n_len, KM_SLEEP);
	bcopy(src->n_bytes, dst->n_bytes, src->n_len);
}

/*
 * An NLM specificw replacement for clnt_call().
 * nlm_clnt_call() is used by all RPC functions generated
 * from nlm_prot.x specification. The function is aware
 * about some pitfalls of NLM RPC procedures and has a logic
 * that handles them properly.
 */
enum clnt_stat
nlm_clnt_call(CLIENT *clnt, rpcproc_t procnum, xdrproc_t xdr_args,
    caddr_t argsp, xdrproc_t xdr_result, caddr_t resultp, struct timeval wait)
{
	k_sigset_t oldmask;
	enum clnt_stat stat;
	bool_t sig_blocked = FALSE;

	/*
	 * If NLM RPC procnum is one of the NLM _RES procedures
	 * that are used to reply to asynchronous NLM RPC
	 * (MSG calls), explicitly set RPC timeout to zero.
	 * Client doesn't send a reply to RES procedures, so
	 * we don't need to wait anything.
	 *
	 * NOTE: we ignore NLM4_*_RES procnums because they are
	 * equal to NLM_*_RES numbers.
	 */
	if (procnum >= NLM_TEST_RES && procnum <= NLM_GRANTED_RES)
		wait = nlm_rpctv_zero;

	/*
	 * We need to block signals in case of NLM_CANCEL RPC
	 * in order to prevent interruption of network RPC
	 * calls.
	 */
	if (procnum == NLM_CANCEL) {
		k_sigset_t newmask;

		sigfillset(&newmask);
		sigreplace(&newmask, &oldmask);
		sig_blocked = TRUE;
	}

	stat = clnt_call(clnt, procnum, xdr_args,
	    argsp, xdr_result, resultp, wait);

	/*
	 * Restore signal mask back if signals were blocked
	 */
	if (sig_blocked)
		sigreplace(&oldmask, (k_sigset_t *)NULL);

	return (stat);
}

/*
 * Suspend NLM client/server in the given zone.
 *
 * During suspend operation we mark those hosts
 * that have any locks with NLM_NH_SUSPEND flags,
 * so that they can be checked later, when resume
 * operation occurs.
 */
static void
nlm_suspend_zone(struct nlm_globals *g)
{
	struct nlm_host *hostp;
	struct nlm_host_list all_hosts;

	/*
	 * Note that while we're doing suspend, GC thread is active
	 * and it can destroy some hosts while we're walking through
	 * the hosts tree. To prevent that and make suspend logic
	 * a bit more simple we put all hosts to local "all_hosts"
	 * list and increment reference counter of each host.
	 * This guaranties that no hosts will be released while
	 * we're doing suspend.
	 * NOTE: reference of each host must be dropped during
	 * resume operation.
	 */
	TAILQ_INIT(&all_hosts);
	mutex_enter(&g->lock);
	for (hostp = avl_first(&g->nlm_hosts_tree); hostp != NULL;
	    hostp = AVL_NEXT(&g->nlm_hosts_tree, hostp)) {
		/*
		 * If host is idle, remove it from idle list and
		 * clear idle flag. That is done to prevent GC
		 * from touching this host.
		 */
		if (hostp->nh_flags & NLM_NH_INIDLE) {
			TAILQ_REMOVE(&g->nlm_idle_hosts, hostp, nh_link);
			hostp->nh_flags &= ~NLM_NH_INIDLE;
		}

		hostp->nh_refs++;
		TAILQ_INSERT_TAIL(&all_hosts, hostp, nh_link);
	}

	/*
	 * Now we can walk through all hosts on the system
	 * with zone globals lock released. The fact the
	 * we have taken a reference to each host guaranties
	 * that no hosts can be destroyed during that process.
	 */
	mutex_exit(&g->lock);
	while ((hostp = TAILQ_FIRST(&all_hosts)) != NULL) {
		mutex_enter(&hostp->nh_lock);
		if (nlm_host_has_locks(hostp))
			hostp->nh_flags |= NLM_NH_SUSPEND;

		mutex_exit(&hostp->nh_lock);
		TAILQ_REMOVE(&all_hosts, hostp, nh_link);
	}
}

/*
 * Resume NLM hosts for the given zone.
 *
 * nlm_resume_zone() is called after hosts were suspended
 * (see nlm_suspend_zone) and its main purpose to check
 * whether remote locks owned by hosts are still in consistent
 * state. If they aren't, resume function tries to reclaim
 * locks (for client side hosts) and clean locks (for
 * server side hosts).
 */
static void
nlm_resume_zone(struct nlm_globals *g)
{
	struct nlm_host *hostp, *h_next;

	mutex_enter(&g->lock);
	hostp = avl_first(&g->nlm_hosts_tree);

	/*
	 * In nlm_suspend_zone() the reference counter of each
	 * host was incremented, so we can safely iterate through
	 * all hosts without worrying that any host we touch will
	 * be removed at the moment.
	 */
	while (hostp != NULL) {
		struct nlm_nsm nsm;
		enum clnt_stat stat;
		int32_t sm_state;
		int error;
		bool_t resume_failed = FALSE;

		h_next = AVL_NEXT(&g->nlm_hosts_tree, hostp);
		mutex_exit(&g->lock);

		DTRACE_PROBE1(resume__host, struct nlm_host *, hostp);

		/*
		 * Suspend operation marked that the host doesn't
		 * have any locks. Skip it.
		 */
		if (!(hostp->nh_flags & NLM_NH_SUSPEND))
			goto cycle_end;

		error = nlm_nsm_init(&nsm, &hostp->nh_knc, &hostp->nh_addr);
		if (error != 0) {
			NLM_ERR("Resume: Failed to contact to NSM of host %s "
			    "[error=%d]\n", hostp->nh_name, error);
			resume_failed = TRUE;
			goto cycle_end;
		}

		stat = nlm_nsm_stat(&nsm, &sm_state);
		if (stat != RPC_SUCCESS) {
			NLM_ERR("Resume: Failed to call SM_STAT operation for "
			    "host %s [stat=%d]\n", hostp->nh_name, stat);
			resume_failed = TRUE;
			nlm_nsm_fini(&nsm);
			goto cycle_end;
		}

		if (sm_state != hostp->nh_state) {
			/*
			 * Current SM state of the host isn't equal
			 * to the one host had when it was suspended.
			 * Probably it was rebooted. Try to reclaim
			 * locks if the host has any on its client side.
			 * Also try to clean up its server side locks
			 * (if the host has any).
			 */
			nlm_host_notify_client(hostp, sm_state);
			nlm_host_notify_server(hostp, sm_state);
		}

		nlm_nsm_fini(&nsm);

cycle_end:
		if (resume_failed) {
			/*
			 * Resume failed for the given host.
			 * Just clean up all resources it owns.
			 */
			nlm_host_notify_server(hostp, 0);
			nlm_client_cancel_all(g, hostp);
		}

		hostp->nh_flags &= ~NLM_NH_SUSPEND;
		nlm_host_release(g, hostp);
		hostp = h_next;
		mutex_enter(&g->lock);
	}

	mutex_exit(&g->lock);
}

/*
 * NLM functions responsible for operations on NSM handle.
 */

/*
 * Initialize knetconfig that is used for communication
 * with local statd via loopback interface.
 */
static int
nlm_init_local_knc(struct knetconfig *knc)
{
	int error;
	vnode_t *vp;

	bzero(knc, sizeof (*knc));
	error = lookupname("/dev/tcp", UIO_SYSSPACE,
	    FOLLOW, NULLVPP, &vp);
	if (error != 0)
		return (error);

	knc->knc_semantics = NC_TPI_COTS;
	knc->knc_protofmly = NC_INET;
	knc->knc_proto = NC_TCP;
	knc->knc_rdev = vp->v_rdev;
	VN_RELE(vp);


	return (0);
}

/*
 * Initialize NSM handle that will be used to talk
 * to local statd via loopback interface.
 */
static int
nlm_nsm_init_local(struct nlm_nsm *nsm)
{
	int error;
	struct knetconfig knc;
	struct sockaddr_in sin;
	struct netbuf nb;

	error = nlm_init_local_knc(&knc);
	if (error != 0)
		return (error);

	bzero(&sin, sizeof (sin));
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	sin.sin_family = AF_INET;

	nb.buf = (char *)&sin;
	nb.len = nb.maxlen = sizeof (sin);

	return (nlm_nsm_init(nsm, &knc, &nb));
}

/*
 * Initialize NSM handle used for talking to statd
 */
static int
nlm_nsm_init(struct nlm_nsm *nsm, struct knetconfig *knc, struct netbuf *nb)
{
	enum clnt_stat stat;
	int error, retries;

	bzero(nsm, sizeof (*nsm));
	nsm->ns_knc = *knc;
	nlm_copy_netbuf(&nsm->ns_addr, nb);

	/*
	 * Try several times to get the port of statd service,
	 * If rpcbind_getaddr returns  RPC_PROGNOTREGISTERED,
	 * retry an attempt, but wait for NLM_NSM_RPCBIND_TIMEOUT
	 * seconds berofore.
	 */
	for (retries = 0; retries < NLM_NSM_RPCBIND_RETRIES; retries++) {
		stat = rpcbind_getaddr(&nsm->ns_knc, SM_PROG,
		    SM_VERS, &nsm->ns_addr);
		if (stat != RPC_SUCCESS) {
			if (stat == RPC_PROGNOTREGISTERED) {
				delay(SEC_TO_TICK(NLM_NSM_RPCBIND_TIMEOUT));
				continue;
			}
		}

		break;
	}

	if (stat != RPC_SUCCESS) {
		DTRACE_PROBE2(rpcbind__error, enum clnt_stat, stat,
		    int, retries);
		error = ENOENT;
		goto error;
	}

	/*
	 * Create an RPC handle that'll be used for communication with local
	 * statd using the status monitor protocol.
	 */
	error = clnt_tli_kcreate(&nsm->ns_knc, &nsm->ns_addr, SM_PROG, SM_VERS,
	    0, NLM_RPC_RETRIES, zone_kcred(), &nsm->ns_handle);
	if (error != 0)
		goto error;

	/*
	 * Create an RPC handle that'll be used for communication with the
	 * local statd using the address registration protocol.
	 */
	error = clnt_tli_kcreate(&nsm->ns_knc, &nsm->ns_addr, NSM_ADDR_PROGRAM,
	    NSM_ADDR_V1, 0, NLM_RPC_RETRIES, zone_kcred(),
	    &nsm->ns_addr_handle);
	if (error != 0)
		goto error;

	sema_init(&nsm->ns_sem, 1, NULL, SEMA_DEFAULT, NULL);
	return (0);

error:
	kmem_free(nsm->ns_addr.buf, nsm->ns_addr.maxlen);
	if (nsm->ns_handle) {
		ASSERT(nsm->ns_handle->cl_auth != NULL);
		auth_destroy(nsm->ns_handle->cl_auth);
		CLNT_DESTROY(nsm->ns_handle);
	}

	return (error);
}

static void
nlm_nsm_fini(struct nlm_nsm *nsm)
{
	kmem_free(nsm->ns_addr.buf, nsm->ns_addr.maxlen);
	if (nsm->ns_addr_handle->cl_auth != NULL)
		auth_destroy(nsm->ns_addr_handle->cl_auth);
	CLNT_DESTROY(nsm->ns_addr_handle);
	nsm->ns_addr_handle = NULL;
	if (nsm->ns_handle->cl_auth != NULL)
		auth_destroy(nsm->ns_handle->cl_auth);
	CLNT_DESTROY(nsm->ns_handle);
	nsm->ns_handle = NULL;
	sema_destroy(&nsm->ns_sem);
}

static enum clnt_stat
nlm_nsm_simu_crash(struct nlm_nsm *nsm)
{
	enum clnt_stat stat;

	sema_p(&nsm->ns_sem);
	nlm_nsm_clnt_init(nsm->ns_handle, nsm);
	stat = sm_simu_crash_1(NULL, NULL, nsm->ns_handle);
	sema_v(&nsm->ns_sem);

	return (stat);
}

static enum clnt_stat
nlm_nsm_stat(struct nlm_nsm *nsm, int32_t *out_stat)
{
	struct sm_name args;
	struct sm_stat_res res;
	enum clnt_stat stat;

	args.mon_name = uts_nodename();
	bzero(&res, sizeof (res));

	sema_p(&nsm->ns_sem);
	nlm_nsm_clnt_init(nsm->ns_handle, nsm);
	stat = sm_stat_1(&args, &res, nsm->ns_handle);
	sema_v(&nsm->ns_sem);

	if (stat == RPC_SUCCESS)
		*out_stat = res.state;

	return (stat);
}

static enum clnt_stat
nlm_nsm_mon(struct nlm_nsm *nsm, char *hostname, uint16_t priv)
{
	struct mon args;
	struct sm_stat_res res;
	enum clnt_stat stat;

	bzero(&args, sizeof (args));
	bzero(&res, sizeof (res));

	args.mon_id.mon_name = hostname;
	args.mon_id.my_id.my_name = uts_nodename();
	args.mon_id.my_id.my_prog = NLM_PROG;
	args.mon_id.my_id.my_vers = NLM_SM;
	args.mon_id.my_id.my_proc = NLM_SM_NOTIFY1;
	bcopy(&priv, args.priv, sizeof (priv));

	sema_p(&nsm->ns_sem);
	nlm_nsm_clnt_init(nsm->ns_handle, nsm);
	stat = sm_mon_1(&args, &res, nsm->ns_handle);
	sema_v(&nsm->ns_sem);

	return (stat);
}

static enum clnt_stat
nlm_nsm_unmon(struct nlm_nsm *nsm, char *hostname)
{
	struct mon_id args;
	struct sm_stat res;
	enum clnt_stat stat;

	bzero(&args, sizeof (args));
	bzero(&res, sizeof (res));

	args.mon_name = hostname;
	args.my_id.my_name = uts_nodename();
	args.my_id.my_prog = NLM_PROG;
	args.my_id.my_vers = NLM_SM;
	args.my_id.my_proc = NLM_SM_NOTIFY1;

	sema_p(&nsm->ns_sem);
	nlm_nsm_clnt_init(nsm->ns_handle, nsm);
	stat = sm_unmon_1(&args, &res, nsm->ns_handle);
	sema_v(&nsm->ns_sem);

	return (stat);
}

static enum clnt_stat
nlm_nsmaddr_reg(struct nlm_nsm *nsm, char *name, int family, netobj *address)
{
	struct reg1args args = { 0 };
	struct reg1res res = { 0 };
	enum clnt_stat stat;

	args.family = family;
	args.name = name;
	args.address = *address;

	sema_p(&nsm->ns_sem);
	nlm_nsm_clnt_init(nsm->ns_addr_handle, nsm);
	stat = nsmaddrproc1_reg_1(&args, &res, nsm->ns_addr_handle);
	sema_v(&nsm->ns_sem);

	return (stat);
}

/*
 * Get NLM vhold object corresponding to vnode "vp".
 * If no such object was found, create a new one.
 *
 * The purpose of this function is to associate vhold
 * object with given vnode, so that:
 * 1) vnode is hold (VN_HOLD) while vhold object is alive.
 * 2) host has a track of all vnodes it touched by lock
 *    or share operations. These vnodes are accessible
 *    via collection of vhold objects.
 */
struct nlm_vhold *
nlm_vhold_get(struct nlm_host *hostp, vnode_t *vp)
{
	struct nlm_vhold *nvp, *new_nvp = NULL;

	mutex_enter(&hostp->nh_lock);
	nvp = nlm_vhold_find_locked(hostp, vp);
	if (nvp != NULL)
		goto out;

	/* nlm_vhold wasn't found, then create a new one */
	mutex_exit(&hostp->nh_lock);
	new_nvp = kmem_cache_alloc(nlm_vhold_cache, KM_SLEEP);

	/*
	 * Check if another thread has already
	 * created the same nlm_vhold.
	 */
	mutex_enter(&hostp->nh_lock);
	nvp = nlm_vhold_find_locked(hostp, vp);
	if (nvp == NULL) {
		nvp = new_nvp;
		new_nvp = NULL;

		TAILQ_INIT(&nvp->nv_slreqs);
		nvp->nv_vp = vp;
		nvp->nv_refcnt = 1;
		VN_HOLD(nvp->nv_vp);

		VERIFY(mod_hash_insert(hostp->nh_vholds_by_vp,
		    (mod_hash_key_t)vp, (mod_hash_val_t)nvp) == 0);
		TAILQ_INSERT_TAIL(&hostp->nh_vholds_list, nvp, nv_link);
	}

out:
	mutex_exit(&hostp->nh_lock);
	if (new_nvp != NULL)
		kmem_cache_free(nlm_vhold_cache, new_nvp);

	return (nvp);
}

/*
 * Drop a reference to vhold object nvp.
 */
void
nlm_vhold_release(struct nlm_host *hostp, struct nlm_vhold *nvp)
{
	if (nvp == NULL)
		return;

	mutex_enter(&hostp->nh_lock);
	ASSERT(nvp->nv_refcnt > 0);
	nvp->nv_refcnt--;

	/*
	 * If these conditions are met, the vhold is obviously unused and we
	 * will destroy it.  In a case either v_filocks and/or v_shrlocks is
	 * non-NULL the vhold might still be unused by the host, but it is
	 * expensive to check that.  We defer such check until the host is
	 * idle.  The expensive check is done in the NLM garbage collector.
	 */
	if (nvp->nv_refcnt == 0 &&
	    nvp->nv_vp->v_filocks == NULL &&
	    nvp->nv_vp->v_shrlocks == NULL) {
		nlm_vhold_destroy(hostp, nvp);
	}

	mutex_exit(&hostp->nh_lock);
}

/*
 * Clean all locks and share reservations on the
 * given vhold object that were acquired by the
 * given sysid
 */
static void
nlm_vhold_clean(struct nlm_vhold *nvp, int sysid)
{
	cleanlocks(nvp->nv_vp, IGN_PID, sysid);
	cleanshares_by_sysid(nvp->nv_vp, sysid);
}

static void
nlm_vhold_destroy(struct nlm_host *hostp, struct nlm_vhold *nvp)
{
	ASSERT(MUTEX_HELD(&hostp->nh_lock));

	ASSERT(nvp->nv_refcnt == 0);
	ASSERT(TAILQ_EMPTY(&nvp->nv_slreqs));

	VERIFY(mod_hash_remove(hostp->nh_vholds_by_vp,
	    (mod_hash_key_t)nvp->nv_vp,
	    (mod_hash_val_t)&nvp) == 0);

	TAILQ_REMOVE(&hostp->nh_vholds_list, nvp, nv_link);
	VN_RELE(nvp->nv_vp);
	nvp->nv_vp = NULL;

	kmem_cache_free(nlm_vhold_cache, nvp);
}

/*
 * Return TRUE if the given vhold is busy.
 * Vhold object is considered to be "busy" when
 * all the following conditions hold:
 * 1) No one uses it at the moment;
 * 2) It hasn't any locks;
 * 3) It hasn't any share reservations;
 */
static bool_t
nlm_vhold_busy(struct nlm_host *hostp, struct nlm_vhold *nvp)
{
	vnode_t *vp;
	int sysid;

	ASSERT(MUTEX_HELD(&hostp->nh_lock));

	if (nvp->nv_refcnt > 0)
		return (TRUE);

	vp = nvp->nv_vp;
	sysid = hostp->nh_sysid;
	if (flk_has_remote_locks_for_sysid(vp, sysid) ||
	    shr_has_remote_shares(vp, sysid))
		return (TRUE);

	return (FALSE);
}

/* ARGSUSED */
static int
nlm_vhold_ctor(void *datap, void *cdrarg, int kmflags)
{
	struct nlm_vhold *nvp = (struct nlm_vhold *)datap;

	bzero(nvp, sizeof (*nvp));
	return (0);
}

/* ARGSUSED */
static void
nlm_vhold_dtor(void *datap, void *cdrarg)
{
	struct nlm_vhold *nvp = (struct nlm_vhold *)datap;

	ASSERT(nvp->nv_refcnt == 0);
	ASSERT(TAILQ_EMPTY(&nvp->nv_slreqs));
	ASSERT(nvp->nv_vp == NULL);
}

struct nlm_vhold *
nlm_vhold_find_locked(struct nlm_host *hostp, const vnode_t *vp)
{
	struct nlm_vhold *nvp = NULL;

	ASSERT(MUTEX_HELD(&hostp->nh_lock));
	(void) mod_hash_find(hostp->nh_vholds_by_vp,
	    (mod_hash_key_t)vp,
	    (mod_hash_val_t)&nvp);

	if (nvp != NULL)
		nvp->nv_refcnt++;

	return (nvp);
}

/*
 * NLM host functions
 */
static void
nlm_copy_netbuf(struct netbuf *dst, struct netbuf *src)
{
	ASSERT(src->len <= src->maxlen);

	dst->maxlen = src->maxlen;
	dst->len = src->len;
	dst->buf = kmem_zalloc(src->maxlen, KM_SLEEP);
	bcopy(src->buf, dst->buf, src->len);
}

/* ARGSUSED */
static int
nlm_host_ctor(void *datap, void *cdrarg, int kmflags)
{
	struct nlm_host *hostp = (struct nlm_host *)datap;

	bzero(hostp, sizeof (*hostp));
	return (0);
}

/* ARGSUSED */
static void
nlm_host_dtor(void *datap, void *cdrarg)
{
	struct nlm_host *hostp = (struct nlm_host *)datap;
	ASSERT(hostp->nh_refs == 0);
}

static void
nlm_host_unregister(struct nlm_globals *g, struct nlm_host *hostp)
{
	ASSERT(hostp->nh_refs == 0);
	ASSERT(hostp->nh_flags & NLM_NH_INIDLE);

	avl_remove(&g->nlm_hosts_tree, hostp);
	VERIFY(mod_hash_remove(g->nlm_hosts_hash,
	    (mod_hash_key_t)(uintptr_t)hostp->nh_sysid,
	    (mod_hash_val_t)&hostp) == 0);
	TAILQ_REMOVE(&g->nlm_idle_hosts, hostp, nh_link);
	hostp->nh_flags &= ~NLM_NH_INIDLE;
}

/*
 * Free resources used by a host. This is called after the reference
 * count has reached zero so it doesn't need to worry about locks.
 */
static void
nlm_host_destroy(struct nlm_host *hostp)
{
	ASSERT(hostp->nh_name != NULL);
	ASSERT(hostp->nh_netid != NULL);
	ASSERT(TAILQ_EMPTY(&hostp->nh_vholds_list));

	strfree(hostp->nh_name);
	strfree(hostp->nh_netid);
	kmem_free(hostp->nh_addr.buf, hostp->nh_addr.maxlen);

	if (hostp->nh_sysid != LM_NOSYSID)
		nlm_sysid_free(hostp->nh_sysid);

	nlm_rpc_cache_destroy(hostp);

	ASSERT(TAILQ_EMPTY(&hostp->nh_vholds_list));
	mod_hash_destroy_ptrhash(hostp->nh_vholds_by_vp);

	mutex_destroy(&hostp->nh_lock);
	cv_destroy(&hostp->nh_rpcb_cv);
	cv_destroy(&hostp->nh_recl_cv);

	kmem_cache_free(nlm_hosts_cache, hostp);
}

/*
 * Cleanup SERVER-side state after a client restarts,
 * or becomes unresponsive, or whatever.
 *
 * We unlock any active locks owned by the host.
 * When rpc.lockd is shutting down,
 * this function is called with newstate set to zero
 * which allows us to cancel any pending async locks
 * and clear the locking state.
 *
 * When "state" is 0, we don't update host's state,
 * but cleanup all remote locks on the host.
 * It's useful to call this function for resources
 * cleanup.
 */
void
nlm_host_notify_server(struct nlm_host *hostp, int32_t state)
{
	struct nlm_vhold *nvp;
	struct nlm_slreq *slr;
	struct nlm_slreq_list slreqs2free;

	TAILQ_INIT(&slreqs2free);
	mutex_enter(&hostp->nh_lock);
	if (state != 0)
		hostp->nh_state = state;

	TAILQ_FOREACH(nvp, &hostp->nh_vholds_list, nv_link) {

		/* cleanup sleeping requests at first */
		while ((slr = TAILQ_FIRST(&nvp->nv_slreqs)) != NULL) {
			TAILQ_REMOVE(&nvp->nv_slreqs, slr, nsr_link);

			/*
			 * Instead of freeing cancelled sleeping request
			 * here, we add it to the linked list created
			 * on the stack in order to do all frees outside
			 * the critical section.
			 */
			TAILQ_INSERT_TAIL(&slreqs2free, slr, nsr_link);
		}

		nvp->nv_refcnt++;
		mutex_exit(&hostp->nh_lock);

		nlm_vhold_clean(nvp, hostp->nh_sysid);

		mutex_enter(&hostp->nh_lock);
		nvp->nv_refcnt--;
	}

	mutex_exit(&hostp->nh_lock);
	while ((slr = TAILQ_FIRST(&slreqs2free)) != NULL) {
		TAILQ_REMOVE(&slreqs2free, slr, nsr_link);
		kmem_free(slr, sizeof (*slr));
	}
}

/*
 * Cleanup CLIENT-side state after a server restarts,
 * or becomes unresponsive, or whatever.
 *
 * This is called by the local NFS statd when we receive a
 * host state change notification.  (also nlm_svc_stopping)
 *
 * Deal with a server restart.  If we are stopping the
 * NLM service, we'll have newstate == 0, and will just
 * cancel all our client-side lock requests.  Otherwise,
 * start the "recovery" process to reclaim any locks
 * we hold on this server.
 */
void
nlm_host_notify_client(struct nlm_host *hostp, int32_t state)
{
	mutex_enter(&hostp->nh_lock);
	hostp->nh_state = state;
	if (hostp->nh_flags & NLM_NH_RECLAIM) {
		/*
		 * Either host's state is up to date or
		 * host is already in recovery.
		 */
		mutex_exit(&hostp->nh_lock);
		return;
	}

	hostp->nh_flags |= NLM_NH_RECLAIM;

	/*
	 * Host will be released by the recovery thread,
	 * thus we need to increment refcount.
	 */
	hostp->nh_refs++;
	mutex_exit(&hostp->nh_lock);

	(void) zthread_create(NULL, 0, nlm_reclaimer,
	    hostp, 0, minclsyspri);
}

/*
 * The function is called when NLM client detects that
 * server has entered in grace period and client needs
 * to wait until reclamation process (if any) does
 * its job.
 */
int
nlm_host_wait_grace(struct nlm_host *hostp)
{
	struct nlm_globals *g;
	int error = 0;

	g = zone_getspecific(nlm_zone_key, curzone);
	mutex_enter(&hostp->nh_lock);

	do {
		int rc;

		rc = cv_timedwait_sig(&hostp->nh_recl_cv,
		    &hostp->nh_lock, ddi_get_lbolt() +
		    SEC_TO_TICK(g->retrans_tmo));

		if (rc == 0) {
			error = EINTR;
			break;
		}
	} while (hostp->nh_flags & NLM_NH_RECLAIM);

	mutex_exit(&hostp->nh_lock);
	return (error);
}

/*
 * Create a new NLM host.
 *
 * NOTE: The in-kernel RPC (kRPC) subsystem uses TLI/XTI,
 * which needs both a knetconfig and an address when creating
 * endpoints. Thus host object stores both knetconfig and
 * netid.
 */
static struct nlm_host *
nlm_host_create(char *name, const char *netid,
    struct knetconfig *knc, struct netbuf *naddr)
{
	struct nlm_host *host;

	host = kmem_cache_alloc(nlm_hosts_cache, KM_SLEEP);

	mutex_init(&host->nh_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&host->nh_rpcb_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&host->nh_recl_cv, NULL, CV_DEFAULT, NULL);

	host->nh_sysid = LM_NOSYSID;
	host->nh_refs = 1;
	host->nh_name = strdup(name);
	host->nh_netid = strdup(netid);
	host->nh_knc = *knc;
	nlm_copy_netbuf(&host->nh_addr, naddr);

	host->nh_state = 0;
	host->nh_rpcb_state = NRPCB_NEED_UPDATE;
	host->nh_flags = 0;

	host->nh_vholds_by_vp = mod_hash_create_ptrhash("nlm vholds hash",
	    32, mod_hash_null_valdtor, sizeof (vnode_t));

	TAILQ_INIT(&host->nh_vholds_list);
	TAILQ_INIT(&host->nh_rpchc);

	return (host);
}

/*
 * Cancel all client side sleeping locks owned by given host.
 */
void
nlm_host_cancel_slocks(struct nlm_globals *g, struct nlm_host *hostp)
{
	struct nlm_slock *nslp;

	mutex_enter(&g->lock);
	TAILQ_FOREACH(nslp, &g->nlm_slocks, nsl_link) {
		if (nslp->nsl_host == hostp) {
			nslp->nsl_state = NLM_SL_CANCELLED;
			cv_broadcast(&nslp->nsl_cond);
		}
	}

	mutex_exit(&g->lock);
}

/*
 * Garbage collect stale vhold objects.
 *
 * In other words check whether vnodes that are
 * held by vhold objects still have any locks
 * or shares or still in use. If they aren't,
 * just destroy them.
 */
static void
nlm_host_gc_vholds(struct nlm_host *hostp)
{
	struct nlm_vhold *nvp;

	ASSERT(MUTEX_HELD(&hostp->nh_lock));

	nvp = TAILQ_FIRST(&hostp->nh_vholds_list);
	while (nvp != NULL) {
		struct nlm_vhold *nvp_tmp;

		if (nlm_vhold_busy(hostp, nvp)) {
			nvp = TAILQ_NEXT(nvp, nv_link);
			continue;
		}

		nvp_tmp = TAILQ_NEXT(nvp, nv_link);
		nlm_vhold_destroy(hostp, nvp);
		nvp = nvp_tmp;
	}
}

/*
 * Check whether the given host has any
 * server side locks or share reservations.
 */
static bool_t
nlm_host_has_srv_locks(struct nlm_host *hostp)
{
	/*
	 * It's cheap and simple: if server has
	 * any locks/shares there must be vhold
	 * object storing the affected vnode.
	 *
	 * NOTE: We don't need to check sleeping
	 * locks on the server side, because if
	 * server side sleeping lock is alive,
	 * there must be a vhold object corresponding
	 * to target vnode.
	 */
	ASSERT(MUTEX_HELD(&hostp->nh_lock));
	if (!TAILQ_EMPTY(&hostp->nh_vholds_list))
		return (TRUE);

	return (FALSE);
}

/*
 * Check whether the given host has any client side
 * locks or share reservations.
 */
static bool_t
nlm_host_has_cli_locks(struct nlm_host *hostp)
{
	ASSERT(MUTEX_HELD(&hostp->nh_lock));

	/*
	 * XXX: It's not the way I'd like to do the check,
	 * because flk_sysid_has_locks() can be very
	 * expensive by design. Unfortunatelly it iterates
	 * through all locks on the system, doesn't matter
	 * were they made on remote system via NLM or
	 * on local system via reclock. To understand the
	 * problem, consider that there're dozens of thousands
	 * of locks that are made on some ZFS dataset. And there's
	 * another dataset shared by NFS where NLM client had locks
	 * some time ago, but doesn't have them now.
	 * In this case flk_sysid_has_locks() will iterate
	 * thrught dozens of thousands locks until it returns us
	 * FALSE.
	 * Oh, I hope that in shiny future somebody will make
	 * local lock manager (os/flock.c) better, so that
	 * it'd be more friedly to remote locks and
	 * flk_sysid_has_locks() wouldn't be so expensive.
	 */
	if (flk_sysid_has_locks(hostp->nh_sysid |
	    LM_SYSID_CLIENT, FLK_QUERY_ACTIVE))
		return (TRUE);

	/*
	 * Check whether host has any share reservations
	 * registered on the client side.
	 */
	if (hostp->nh_shrlist != NULL)
		return (TRUE);

	return (FALSE);
}

/*
 * Determine whether the given host owns any
 * locks or share reservations.
 */
static bool_t
nlm_host_has_locks(struct nlm_host *hostp)
{
	if (nlm_host_has_srv_locks(hostp))
		return (TRUE);

	return (nlm_host_has_cli_locks(hostp));
}

/*
 * This function compares only addresses of two netbufs
 * that belong to NC_TCP[6] or NC_UDP[6] protofamily.
 * Port part of netbuf is ignored.
 *
 * Return values:
 *  -1: nb1's address is "smaller" than nb2's
 *   0: addresses are equal
 *   1: nb1's address is "greater" than nb2's
 */
static int
nlm_netbuf_addrs_cmp(struct netbuf *nb1, struct netbuf *nb2)
{
	union nlm_addr {
		struct sockaddr sa;
		struct sockaddr_in sin;
		struct sockaddr_in6 sin6;
	} *na1, *na2;
	int res;

	/* LINTED E_BAD_PTR_CAST_ALIGN */
	na1 = (union nlm_addr *)nb1->buf;
	/* LINTED E_BAD_PTR_CAST_ALIGN */
	na2 = (union nlm_addr *)nb2->buf;

	if (na1->sa.sa_family < na2->sa.sa_family)
		return (-1);
	if (na1->sa.sa_family > na2->sa.sa_family)
		return (1);

	switch (na1->sa.sa_family) {
	case AF_INET:
		res = memcmp(&na1->sin.sin_addr, &na2->sin.sin_addr,
		    sizeof (na1->sin.sin_addr));
		break;
	case AF_INET6:
		res = memcmp(&na1->sin6.sin6_addr, &na2->sin6.sin6_addr,
		    sizeof (na1->sin6.sin6_addr));
		break;
	default:
		VERIFY(0);
		return (0);
	}

	return (SIGN(res));
}

/*
 * Compare two nlm hosts.
 * Return values:
 * -1: host1 is "smaller" than host2
 *  0: host1 is equal to host2
 *  1: host1 is "greater" than host2
 */
int
nlm_host_cmp(const void *p1, const void *p2)
{
	struct nlm_host *h1 = (struct nlm_host *)p1;
	struct nlm_host *h2 = (struct nlm_host *)p2;
	int res;

	res = strcmp(h1->nh_netid, h2->nh_netid);
	if (res != 0)
		return (SIGN(res));

	res = nlm_netbuf_addrs_cmp(&h1->nh_addr, &h2->nh_addr);
	return (res);
}

/*
 * Find the host specified by...  (see below)
 * If found, increment the ref count.
 */
static struct nlm_host *
nlm_host_find_locked(struct nlm_globals *g, const char *netid,
    struct netbuf *naddr, avl_index_t *wherep)
{
	struct nlm_host *hostp, key;
	avl_index_t pos;

	ASSERT(MUTEX_HELD(&g->lock));

	key.nh_netid = (char *)netid;
	key.nh_addr.buf = naddr->buf;
	key.nh_addr.len = naddr->len;
	key.nh_addr.maxlen = naddr->maxlen;

	hostp = avl_find(&g->nlm_hosts_tree, &key, &pos);

	if (hostp != NULL) {
		/*
		 * Host is inuse now. Remove it from idle
		 * hosts list if needed.
		 */
		if (hostp->nh_flags & NLM_NH_INIDLE) {
			TAILQ_REMOVE(&g->nlm_idle_hosts, hostp, nh_link);
			hostp->nh_flags &= ~NLM_NH_INIDLE;
		}

		hostp->nh_refs++;
	}
	if (wherep != NULL)
		*wherep = pos;

	return (hostp);
}

/*
 * Find NLM host for the given name and address.
 */
struct nlm_host *
nlm_host_find(struct nlm_globals *g, const char *netid,
    struct netbuf *addr)
{
	struct nlm_host *hostp = NULL;

	mutex_enter(&g->lock);
	if (g->run_status != NLM_ST_UP)
		goto out;

	hostp = nlm_host_find_locked(g, netid, addr, NULL);

out:
	mutex_exit(&g->lock);
	return (hostp);
}


/*
 * Find or create an NLM host for the given name and address.
 *
 * The remote host is determined by all of: name, netid, address.
 * Note that the netid is whatever nlm_svc_add_ep() gave to
 * svc_tli_kcreate() for the service binding.  If any of these
 * are different, allocate a new host (new sysid).
 */
struct nlm_host *
nlm_host_findcreate(struct nlm_globals *g, char *name,
    const char *netid, struct netbuf *addr)
{
	int err;
	struct nlm_host *host, *newhost = NULL;
	struct knetconfig knc;
	avl_index_t where;

	mutex_enter(&g->lock);
	if (g->run_status != NLM_ST_UP) {
		mutex_exit(&g->lock);
		return (NULL);
	}

	host = nlm_host_find_locked(g, netid, addr, NULL);
	mutex_exit(&g->lock);
	if (host != NULL)
		return (host);

	err = nlm_knc_from_netid(netid, &knc);
	if (err != 0)
		return (NULL);
	/*
	 * Do allocations (etc.) outside of mutex,
	 * and then check again before inserting.
	 */
	newhost = nlm_host_create(name, netid, &knc, addr);
	newhost->nh_sysid = nlm_sysid_alloc();
	if (newhost->nh_sysid == LM_NOSYSID)
		goto out;

	mutex_enter(&g->lock);
	host = nlm_host_find_locked(g, netid, addr, &where);
	if (host == NULL) {
		host = newhost;
		newhost = NULL;

		/*
		 * Insert host to the hosts AVL tree that is
		 * used to lookup by <netid, address> pair.
		 */
		avl_insert(&g->nlm_hosts_tree, host, where);

		/*
		 * Insert host to the hosts hash table that is
		 * used to lookup host by sysid.
		 */
		VERIFY(mod_hash_insert(g->nlm_hosts_hash,
		    (mod_hash_key_t)(uintptr_t)host->nh_sysid,
		    (mod_hash_val_t)host) == 0);
	}

	mutex_exit(&g->lock);

out:
	if (newhost != NULL) {
		/*
		 * We do not need the preallocated nlm_host
		 * so decrement the reference counter
		 * and destroy it.
		 */
		newhost->nh_refs--;
		nlm_host_destroy(newhost);
	}

	return (host);
}

/*
 * Find the NLM host that matches the value of 'sysid'.
 * If found, return it with a new ref,
 * else return NULL.
 */
struct nlm_host *
nlm_host_find_by_sysid(struct nlm_globals *g, sysid_t sysid)
{
	struct nlm_host *hostp = NULL;

	mutex_enter(&g->lock);
	if (g->run_status != NLM_ST_UP)
		goto out;

	(void) mod_hash_find(g->nlm_hosts_hash,
	    (mod_hash_key_t)(uintptr_t)sysid,
	    (mod_hash_val_t)&hostp);

	if (hostp == NULL)
		goto out;

	/*
	 * Host is inuse now. Remove it
	 * from idle hosts list if needed.
	 */
	if (hostp->nh_flags & NLM_NH_INIDLE) {
		TAILQ_REMOVE(&g->nlm_idle_hosts, hostp, nh_link);
		hostp->nh_flags &= ~NLM_NH_INIDLE;
	}

	hostp->nh_refs++;

out:
	mutex_exit(&g->lock);
	return (hostp);
}

/*
 * Release the given host.
 * I.e. drop a reference that was taken earlier by one of
 * the following functions: nlm_host_findcreate(), nlm_host_find(),
 * nlm_host_find_by_sysid().
 *
 * When the very last reference is dropped, host is moved to
 * so-called "idle state". All hosts that are in idle state
 * have an idle timeout. If timeout is expired, GC thread
 * checks whether hosts have any locks and if they heven't
 * any, it removes them.
 * NOTE: only unused hosts can be in idle state.
 */
static void
nlm_host_release_locked(struct nlm_globals *g, struct nlm_host *hostp)
{
	if (hostp == NULL)
		return;

	ASSERT(MUTEX_HELD(&g->lock));
	ASSERT(hostp->nh_refs > 0);

	hostp->nh_refs--;
	if (hostp->nh_refs != 0)
		return;

	/*
	 * The very last reference to the host was dropped,
	 * thus host is unused now. Set its idle timeout
	 * and move it to the idle hosts LRU list.
	 */
	hostp->nh_idle_timeout = ddi_get_lbolt() +
	    SEC_TO_TICK(g->cn_idle_tmo);

	ASSERT((hostp->nh_flags & NLM_NH_INIDLE) == 0);
	TAILQ_INSERT_TAIL(&g->nlm_idle_hosts, hostp, nh_link);
	hostp->nh_flags |= NLM_NH_INIDLE;
}

void
nlm_host_release(struct nlm_globals *g, struct nlm_host *hostp)
{
	if (hostp == NULL)
		return;

	mutex_enter(&g->lock);
	nlm_host_release_locked(g, hostp);
	mutex_exit(&g->lock);
}

/*
 * Unregister this NLM host (NFS client) with the local statd
 * due to idleness (no locks held for a while).
 */
void
nlm_host_unmonitor(struct nlm_globals *g, struct nlm_host *host)
{
	enum clnt_stat stat;

	VERIFY(host->nh_refs == 0);
	if (!(host->nh_flags & NLM_NH_MONITORED))
		return;

	host->nh_flags &= ~NLM_NH_MONITORED;
	stat = nlm_nsm_unmon(&g->nlm_nsm, host->nh_name);
	if (stat != RPC_SUCCESS) {
		NLM_WARN("NLM: Failed to contact statd, stat=%d\n", stat);
		return;
	}
}

/*
 * Ask the local NFS statd to begin monitoring this host.
 * It will call us back when that host restarts, using the
 * prog,vers,proc specified below, i.e. NLM_SM_NOTIFY1,
 * which is handled in nlm_do_notify1().
 */
void
nlm_host_monitor(struct nlm_globals *g, struct nlm_host *host, int state)
{
	int family;
	netobj obj;
	enum clnt_stat stat;

	if (state != 0 && host->nh_state == 0) {
		/*
		 * This is the first time we have seen an NSM state
		 * Value for this host. We record it here to help
		 * detect host reboots.
		 */
		host->nh_state = state;
	}

	mutex_enter(&host->nh_lock);
	if (host->nh_flags & NLM_NH_MONITORED) {
		mutex_exit(&host->nh_lock);
		return;
	}

	host->nh_flags |= NLM_NH_MONITORED;
	mutex_exit(&host->nh_lock);

	/*
	 * Before we begin monitoring the host register the network address
	 * associated with this hostname.
	 */
	nlm_netbuf_to_netobj(&host->nh_addr, &family, &obj);
	stat = nlm_nsmaddr_reg(&g->nlm_nsm, host->nh_name, family, &obj);
	if (stat != RPC_SUCCESS) {
		NLM_WARN("Failed to register address, stat=%d\n", stat);
		mutex_enter(&g->lock);
		host->nh_flags &= ~NLM_NH_MONITORED;
		mutex_exit(&g->lock);

		return;
	}

	/*
	 * Tell statd how to call us with status updates for
	 * this host. Updates arrive via nlm_do_notify1().
	 *
	 * We put our assigned system ID value in the priv field to
	 * make it simpler to find the host if we are notified of a
	 * host restart.
	 */
	stat = nlm_nsm_mon(&g->nlm_nsm, host->nh_name, host->nh_sysid);
	if (stat != RPC_SUCCESS) {
		NLM_WARN("Failed to contact local NSM, stat=%d\n", stat);
		mutex_enter(&g->lock);
		host->nh_flags &= ~NLM_NH_MONITORED;
		mutex_exit(&g->lock);

		return;
	}
}

int
nlm_host_get_state(struct nlm_host *hostp)
{

	return (hostp->nh_state);
}

/*
 * NLM client/server sleeping locks
 */

/*
 * Register client side sleeping lock.
 *
 * Our client code calls this to keep information
 * about sleeping lock somewhere. When it receives
 * grant callback from server or when it just
 * needs to remove all sleeping locks from vnode,
 * it uses this information for remove/apply lock
 * properly.
 */
struct nlm_slock *
nlm_slock_register(
	struct nlm_globals *g,
	struct nlm_host *host,
	struct nlm4_lock *lock,
	struct vnode *vp)
{
	struct nlm_slock *nslp;

	nslp = kmem_zalloc(sizeof (*nslp), KM_SLEEP);
	cv_init(&nslp->nsl_cond, NULL, CV_DEFAULT, NULL);
	nslp->nsl_lock = *lock;
	nlm_copy_netobj(&nslp->nsl_fh, &nslp->nsl_lock.fh);
	nslp->nsl_state = NLM_SL_BLOCKED;
	nslp->nsl_host = host;
	nslp->nsl_vp = vp;

	mutex_enter(&g->lock);
	TAILQ_INSERT_TAIL(&g->nlm_slocks, nslp, nsl_link);
	mutex_exit(&g->lock);

	return (nslp);
}

/*
 * Remove this lock from the wait list and destroy it.
 */
void
nlm_slock_unregister(struct nlm_globals *g, struct nlm_slock *nslp)
{
	mutex_enter(&g->lock);
	TAILQ_REMOVE(&g->nlm_slocks, nslp, nsl_link);
	mutex_exit(&g->lock);

	kmem_free(nslp->nsl_fh.n_bytes, nslp->nsl_fh.n_len);
	cv_destroy(&nslp->nsl_cond);
	kmem_free(nslp, sizeof (*nslp));
}

/*
 * Wait for a granted callback or cancellation event
 * for a sleeping lock.
 *
 * If a signal interrupted the wait or if the lock
 * was cancelled, return EINTR - the caller must arrange to send
 * a cancellation to the server.
 *
 * If timeout occurred, return ETIMEDOUT - the caller must
 * resend the lock request to the server.
 *
 * On success return 0.
 */
int
nlm_slock_wait(struct nlm_globals *g,
    struct nlm_slock *nslp, uint_t timeo_secs)
{
	clock_t timeo_ticks;
	int cv_res, error;

	/*
	 * If the granted message arrived before we got here,
	 * nslp->nsl_state will be NLM_SL_GRANTED - in that case don't sleep.
	 */
	cv_res = 1;
	timeo_ticks = ddi_get_lbolt() + SEC_TO_TICK(timeo_secs);

	mutex_enter(&g->lock);
	while (nslp->nsl_state == NLM_SL_BLOCKED && cv_res > 0) {
		cv_res = cv_timedwait_sig(&nslp->nsl_cond,
		    &g->lock, timeo_ticks);
	}

	/*
	 * No matter why we wake up, if the lock was
	 * cancelled, let the function caller to know
	 * about it by returning EINTR.
	 */
	if (nslp->nsl_state == NLM_SL_CANCELLED) {
		error = EINTR;
		goto out;
	}

	if (cv_res <= 0) {
		/* We were woken up either by timeout or by interrupt */
		error = (cv_res < 0) ? ETIMEDOUT : EINTR;

		/*
		 * The granted message may arrive after the
		 * interrupt/timeout but before we manage to lock the
		 * mutex. Detect this by examining nslp.
		 */
		if (nslp->nsl_state == NLM_SL_GRANTED)
			error = 0;
	} else { /* Awaken via cv_signal()/cv_broadcast() or didn't block */
		error = 0;
		VERIFY(nslp->nsl_state == NLM_SL_GRANTED);
	}

out:
	mutex_exit(&g->lock);
	return (error);
}

/*
 * Mark client side sleeping lock as granted
 * and wake up a process blocked on the lock.
 * Called from server side NLM_GRANT handler.
 *
 * If sleeping lock is found return 0, otherwise
 * return ENOENT.
 */
int
nlm_slock_grant(struct nlm_globals *g,
    struct nlm_host *hostp, struct nlm4_lock *alock)
{
	struct nlm_slock *nslp;
	int error = ENOENT;

	mutex_enter(&g->lock);
	TAILQ_FOREACH(nslp, &g->nlm_slocks, nsl_link) {
		if ((nslp->nsl_state != NLM_SL_BLOCKED) ||
		    (nslp->nsl_host != hostp))
			continue;

		if (alock->svid		== nslp->nsl_lock.svid &&
		    alock->l_offset	== nslp->nsl_lock.l_offset &&
		    alock->l_len	== nslp->nsl_lock.l_len &&
		    alock->fh.n_len	== nslp->nsl_lock.fh.n_len &&
		    bcmp(alock->fh.n_bytes, nslp->nsl_lock.fh.n_bytes,
		    nslp->nsl_lock.fh.n_len) == 0) {
			nslp->nsl_state = NLM_SL_GRANTED;
			cv_broadcast(&nslp->nsl_cond);
			error = 0;
			break;
		}
	}

	mutex_exit(&g->lock);
	return (error);
}

/*
 * Register sleeping lock request corresponding to
 * flp on the given vhold object.
 * On success function returns 0, otherwise (if
 * lock request with the same flp is already
 * registered) function returns EEXIST.
 */
int
nlm_slreq_register(struct nlm_host *hostp, struct nlm_vhold *nvp,
    struct flock64 *flp)
{
	struct nlm_slreq *slr, *new_slr = NULL;
	int ret = EEXIST;

	mutex_enter(&hostp->nh_lock);
	slr = nlm_slreq_find_locked(hostp, nvp, flp);
	if (slr != NULL)
		goto out;

	mutex_exit(&hostp->nh_lock);
	new_slr = kmem_zalloc(sizeof (*slr), KM_SLEEP);
	bcopy(flp, &new_slr->nsr_fl, sizeof (*flp));

	mutex_enter(&hostp->nh_lock);
	slr = nlm_slreq_find_locked(hostp, nvp, flp);
	if (slr == NULL) {
		slr = new_slr;
		new_slr = NULL;
		ret = 0;

		TAILQ_INSERT_TAIL(&nvp->nv_slreqs, slr, nsr_link);
	}

out:
	mutex_exit(&hostp->nh_lock);
	if (new_slr != NULL)
		kmem_free(new_slr, sizeof (*new_slr));

	return (ret);
}

/*
 * Unregister sleeping lock request corresponding
 * to flp from the given vhold object.
 * On success function returns 0, otherwise (if
 * lock request corresponding to flp isn't found
 * on the given vhold) function returns ENOENT.
 */
int
nlm_slreq_unregister(struct nlm_host *hostp, struct nlm_vhold *nvp,
    struct flock64 *flp)
{
	struct nlm_slreq *slr;

	mutex_enter(&hostp->nh_lock);
	slr = nlm_slreq_find_locked(hostp, nvp, flp);
	if (slr == NULL) {
		mutex_exit(&hostp->nh_lock);
		return (ENOENT);
	}

	TAILQ_REMOVE(&nvp->nv_slreqs, slr, nsr_link);
	mutex_exit(&hostp->nh_lock);

	kmem_free(slr, sizeof (*slr));
	return (0);
}

/*
 * Find sleeping lock request on the given vhold object by flp.
 */
struct nlm_slreq *
nlm_slreq_find_locked(struct nlm_host *hostp, struct nlm_vhold *nvp,
    struct flock64 *flp)
{
	struct nlm_slreq *slr = NULL;

	ASSERT(MUTEX_HELD(&hostp->nh_lock));
	TAILQ_FOREACH(slr, &nvp->nv_slreqs, nsr_link) {
		if (slr->nsr_fl.l_start		== flp->l_start	&&
		    slr->nsr_fl.l_len		== flp->l_len	&&
		    slr->nsr_fl.l_pid		== flp->l_pid	&&
		    slr->nsr_fl.l_type		== flp->l_type)
			break;
	}

	return (slr);
}

/*
 * NLM tracks active share reservations made on the client side.
 * It needs to have a track of share reservations for two purposes
 * 1) to determine if nlm_host is busy (if it has active locks and/or
 *    share reservations, it is)
 * 2) to recover active share reservations when NLM server reports
 *    that it has rebooted.
 *
 * Unfortunately Illumos local share reservations manager (see os/share.c)
 * doesn't have an ability to lookup all reservations on the system
 * by sysid (like local lock manager) or get all reservations by sysid.
 * It tracks reservations per vnode and is able to get/looup them
 * on particular vnode. It's not what NLM needs. Thus it has that ugly
 * share reservations tracking scheme.
 */

void
nlm_shres_track(struct nlm_host *hostp, vnode_t *vp, struct shrlock *shrp)
{
	struct nlm_shres *nsp, *nsp_new;

	/*
	 * NFS code must fill the s_owner, so that
	 * s_own_len is never 0.
	 */
	ASSERT(shrp->s_own_len > 0);
	nsp_new = nlm_shres_create_item(shrp, vp);

	mutex_enter(&hostp->nh_lock);
	for (nsp = hostp->nh_shrlist; nsp != NULL; nsp = nsp->ns_next)
		if (nsp->ns_vp == vp && nlm_shres_equal(shrp, nsp->ns_shr))
			break;

	if (nsp != NULL) {
		/*
		 * Found a duplicate. Do nothing.
		 */

		goto out;
	}

	nsp = nsp_new;
	nsp_new = NULL;
	nsp->ns_next = hostp->nh_shrlist;
	hostp->nh_shrlist = nsp;

out:
	mutex_exit(&hostp->nh_lock);
	if (nsp_new != NULL)
		nlm_shres_destroy_item(nsp_new);
}

void
nlm_shres_untrack(struct nlm_host *hostp, vnode_t *vp, struct shrlock *shrp)
{
	struct nlm_shres *nsp, *nsp_prev = NULL;

	mutex_enter(&hostp->nh_lock);
	nsp = hostp->nh_shrlist;
	while (nsp != NULL) {
		if (nsp->ns_vp == vp && nlm_shres_equal(shrp, nsp->ns_shr)) {
			struct nlm_shres *nsp_del;

			nsp_del = nsp;
			nsp = nsp->ns_next;
			if (nsp_prev != NULL)
				nsp_prev->ns_next = nsp;
			else
				hostp->nh_shrlist = nsp;

			nlm_shres_destroy_item(nsp_del);
			continue;
		}

		nsp_prev = nsp;
		nsp = nsp->ns_next;
	}

	mutex_exit(&hostp->nh_lock);
}

/*
 * Get a _copy_ of the list of all active share reservations
 * made by the given host.
 * NOTE: the list function returns _must_ be released using
 *       nlm_free_shrlist().
 */
struct nlm_shres *
nlm_get_active_shres(struct nlm_host *hostp)
{
	struct nlm_shres *nsp, *nslist = NULL;

	mutex_enter(&hostp->nh_lock);
	for (nsp = hostp->nh_shrlist; nsp != NULL; nsp = nsp->ns_next) {
		struct nlm_shres *nsp_new;

		nsp_new = nlm_shres_create_item(nsp->ns_shr, nsp->ns_vp);
		nsp_new->ns_next = nslist;
		nslist = nsp_new;
	}

	mutex_exit(&hostp->nh_lock);
	return (nslist);
}

/*
 * Free memory allocated for the active share reservations
 * list created by nlm_get_active_shres() function.
 */
void
nlm_free_shrlist(struct nlm_shres *nslist)
{
	struct nlm_shres *nsp;

	while (nslist != NULL) {
		nsp =  nslist;
		nslist = nslist->ns_next;

		nlm_shres_destroy_item(nsp);
	}
}

static bool_t
nlm_shres_equal(struct shrlock *shrp1, struct shrlock *shrp2)
{
	if (shrp1->s_sysid	== shrp2->s_sysid	&&
	    shrp1->s_pid	== shrp2->s_pid		&&
	    shrp1->s_own_len	== shrp2->s_own_len	&&
	    bcmp(shrp1->s_owner, shrp2->s_owner,
	    shrp1->s_own_len) == 0)
		return (TRUE);

	return (FALSE);
}

static struct nlm_shres *
nlm_shres_create_item(struct shrlock *shrp, vnode_t *vp)
{
	struct nlm_shres *nsp;

	nsp = kmem_alloc(sizeof (*nsp), KM_SLEEP);
	nsp->ns_shr = kmem_alloc(sizeof (*shrp), KM_SLEEP);
	bcopy(shrp, nsp->ns_shr, sizeof (*shrp));
	nsp->ns_shr->s_owner = kmem_alloc(shrp->s_own_len, KM_SLEEP);
	bcopy(shrp->s_owner, nsp->ns_shr->s_owner, shrp->s_own_len);
	nsp->ns_vp = vp;

	return (nsp);
}

static void
nlm_shres_destroy_item(struct nlm_shres *nsp)
{
	kmem_free(nsp->ns_shr->s_owner,
	    nsp->ns_shr->s_own_len);
	kmem_free(nsp->ns_shr, sizeof (struct shrlock));
	kmem_free(nsp, sizeof (*nsp));
}

/*
 * Called by klmmod.c when lockd adds a network endpoint
 * on which we should begin RPC services.
 */
int
nlm_svc_add_ep(struct file *fp, const char *netid, struct knetconfig *knc)
{
	SVCMASTERXPRT *xprt = NULL;
	int error;

	error = svc_tli_kcreate(fp, 0, (char *)netid, NULL, &xprt,
	    &nlm_sct, NULL, NLM_SVCPOOL_ID, FALSE);
	if (error != 0)
		return (error);

	(void) nlm_knc_to_netid(knc);
	return (0);
}

/*
 * Start NLM service.
 */
int
nlm_svc_starting(struct nlm_globals *g, struct file *fp,
    const char *netid, struct knetconfig *knc)
{
	int error;
	enum clnt_stat stat;

	VERIFY(g->run_status == NLM_ST_STARTING);
	VERIFY(g->nlm_gc_thread == NULL);

	error = nlm_nsm_init_local(&g->nlm_nsm);
	if (error != 0) {
		NLM_ERR("Failed to initialize NSM handler "
		    "(error=%d)\n", error);
		g->run_status = NLM_ST_DOWN;
		return (error);
	}

	error = EIO;

	/*
	 * Create an NLM garbage collector thread that will
	 * clean up stale vholds and hosts objects.
	 */
	g->nlm_gc_thread = zthread_create(NULL, 0, nlm_gc,
	    g, 0, minclsyspri);

	/*
	 * Send SIMU_CRASH to local statd to report that
	 * NLM started, so that statd can report other hosts
	 * about NLM state change.
	 */

	stat = nlm_nsm_simu_crash(&g->nlm_nsm);
	if (stat != RPC_SUCCESS) {
		NLM_ERR("Failed to connect to local statd "
		    "(rpcerr=%d)\n", stat);
		goto shutdown_lm;
	}

	stat = nlm_nsm_stat(&g->nlm_nsm, &g->nsm_state);
	if (stat != RPC_SUCCESS) {
		NLM_ERR("Failed to get the status of local statd "
		    "(rpcerr=%d)\n", stat);
		goto shutdown_lm;
	}

	g->grace_threshold = ddi_get_lbolt() +
	    SEC_TO_TICK(g->grace_period);

	/* Register endpoint used for communications with local NLM */
	error = nlm_svc_add_ep(fp, netid, knc);
	if (error != 0)
		goto shutdown_lm;

	(void) svc_pool_control(NLM_SVCPOOL_ID,
	    SVCPSET_SHUTDOWN_PROC, (void *)nlm_pool_shutdown);
	g->run_status = NLM_ST_UP;
	return (0);

shutdown_lm:
	mutex_enter(&g->lock);
	g->run_status = NLM_ST_STOPPING;
	mutex_exit(&g->lock);

	nlm_svc_stopping(g);
	return (error);
}

/*
 * Called when the server pool is destroyed, so that
 * all transports are closed and no any server threads
 * exist.
 *
 * Just call lm_shutdown() to shut NLM down properly.
 */
static void
nlm_pool_shutdown(void)
{
	(void) lm_shutdown();
}

/*
 * Stop NLM service, cleanup all resources
 * NLM owns at the moment.
 *
 * NOTE: NFS code can call NLM while it's
 * stopping or even if it's shut down. Any attempt
 * to lock file either on client or on the server
 * will fail if NLM isn't in NLM_ST_UP state.
 */
void
nlm_svc_stopping(struct nlm_globals *g)
{
	mutex_enter(&g->lock);
	ASSERT(g->run_status == NLM_ST_STOPPING);

	/*
	 * Ask NLM GC thread to exit and wait until it dies.
	 */
	cv_signal(&g->nlm_gc_sched_cv);
	while (g->nlm_gc_thread != NULL)
		cv_wait(&g->nlm_gc_finish_cv, &g->lock);

	mutex_exit(&g->lock);

	/*
	 * Cleanup locks owned by NLM hosts.
	 * NOTE: New hosts won't be created while
	 * NLM is stopping.
	 */
	while (!avl_is_empty(&g->nlm_hosts_tree)) {
		struct nlm_host *hostp;
		int busy_hosts = 0;

		/*
		 * Iterate through all NLM hosts in the system
		 * and drop the locks they own by force.
		 */
		hostp = avl_first(&g->nlm_hosts_tree);
		while (hostp != NULL) {
			/* Cleanup all client and server side locks */
			nlm_client_cancel_all(g, hostp);
			nlm_host_notify_server(hostp, 0);

			mutex_enter(&hostp->nh_lock);
			nlm_host_gc_vholds(hostp);
			if (hostp->nh_refs > 0 || nlm_host_has_locks(hostp)) {
				/*
				 * Oh, it seems the host is still busy, let
				 * it some time to release and go to the
				 * next one.
				 */

				mutex_exit(&hostp->nh_lock);
				hostp = AVL_NEXT(&g->nlm_hosts_tree, hostp);
				busy_hosts++;
				continue;
			}

			mutex_exit(&hostp->nh_lock);
			hostp = AVL_NEXT(&g->nlm_hosts_tree, hostp);
		}

		/*
		 * All hosts go to nlm_idle_hosts list after
		 * all locks they own are cleaned up and last refereces
		 * were dropped. Just destroy all hosts in nlm_idle_hosts
		 * list, they can not be removed from there while we're
		 * in stopping state.
		 */
		while ((hostp = TAILQ_FIRST(&g->nlm_idle_hosts)) != NULL) {
			nlm_host_unregister(g, hostp);
			nlm_host_destroy(hostp);
		}

		if (busy_hosts > 0) {
			/*
			 * There're some hosts that weren't cleaned
			 * up. Probably they're in resource cleanup
			 * process. Give them some time to do drop
			 * references.
			 */
			delay(MSEC_TO_TICK(500));
		}
	}

	ASSERT(TAILQ_EMPTY(&g->nlm_slocks));

	nlm_nsm_fini(&g->nlm_nsm);
	g->lockd_pid = 0;
	g->run_status = NLM_ST_DOWN;
}

/*
 * Returns TRUE if the given vnode has
 * any active or sleeping locks.
 */
int
nlm_vp_active(const vnode_t *vp)
{
	struct nlm_globals *g;
	struct nlm_host *hostp;
	struct nlm_vhold *nvp;
	int active = 0;

	g = zone_getspecific(nlm_zone_key, curzone);

	/*
	 * Server side NLM has locks on the given vnode
	 * if there exist a vhold object that holds
	 * the given vnode "vp" in one of NLM hosts.
	 */
	mutex_enter(&g->lock);
	hostp = avl_first(&g->nlm_hosts_tree);
	while (hostp != NULL) {
		mutex_enter(&hostp->nh_lock);
		nvp = nlm_vhold_find_locked(hostp, vp);
		mutex_exit(&hostp->nh_lock);
		if (nvp != NULL) {
			active = 1;
			break;
		}

		hostp = AVL_NEXT(&g->nlm_hosts_tree, hostp);
	}

	mutex_exit(&g->lock);
	return (active);
}

/*
 * Called right before NFS export is going to
 * dissapear. The function finds all vnodes
 * belonging to the given export and cleans
 * all remote locks and share reservations
 * on them.
 */
void
nlm_zone_unexport(struct nlm_globals *g, struct exportinfo *exi)
{
	struct nlm_host *hostp;

	mutex_enter(&g->lock);
	if (g->run_status != NLM_ST_UP) {
		/* nothing to do */
		mutex_exit(&g->lock);
		return;
	}

	hostp = avl_first(&g->nlm_hosts_tree);
	while (hostp != NULL) {
		struct nlm_vhold *nvp;

		if (hostp->nh_flags & NLM_NH_INIDLE) {
			TAILQ_REMOVE(&g->nlm_idle_hosts, hostp, nh_link);
			hostp->nh_flags &= ~NLM_NH_INIDLE;
		}
		hostp->nh_refs++;

		mutex_exit(&g->lock);

		mutex_enter(&hostp->nh_lock);
		TAILQ_FOREACH(nvp, &hostp->nh_vholds_list, nv_link) {
			vnode_t *vp;

			nvp->nv_refcnt++;
			mutex_exit(&hostp->nh_lock);

			vp = nvp->nv_vp;

			if (!EQFSID(&exi->exi_fsid, &vp->v_vfsp->vfs_fsid))
				goto next_iter;

			/*
			 * Ok, it we found out that vnode vp is under
			 * control by the exportinfo exi, now we need
			 * to drop all locks from this vnode, let's
			 * do it.
			 */
			nlm_vhold_clean(nvp, hostp->nh_sysid);

		next_iter:
			mutex_enter(&hostp->nh_lock);
			nvp->nv_refcnt--;
		}
		mutex_exit(&hostp->nh_lock);

		mutex_enter(&g->lock);
		nlm_host_release_locked(g, hostp);

		hostp = AVL_NEXT(&g->nlm_hosts_tree, hostp);
	}

	mutex_exit(&g->lock);
}

void
nlm_unexport(struct exportinfo *exi)
{
	struct nlm_globals *g;

	rw_enter(&lm_lck, RW_READER);
	TAILQ_FOREACH(g, &nlm_zones_list, nlm_link) {
		if (g->nlm_zoneid == exi->exi_zoneid) {
			/*
			 * NOTE: If we want to drop lm_lock before
			 * calling nlm_zone_unexport(), we should break,
			 * and have a post-rw_exit() snippit like:
			 *	if (g != NULL)
			 *		nlm_zone_unexport(g, exi);
			 */
			nlm_zone_unexport(g, exi);
			break; /* Only going to match once! */
		}
	}
	rw_exit(&lm_lck);
}

/*
 * Allocate new unique sysid.
 * In case of failure (no available sysids)
 * return LM_NOSYSID.
 */
sysid_t
nlm_sysid_alloc(void)
{
	sysid_t ret_sysid = LM_NOSYSID;

	rw_enter(&lm_lck, RW_WRITER);
	if (nlm_sysid_nidx > LM_SYSID_MAX)
		nlm_sysid_nidx = LM_SYSID;

	if (!BT_TEST(nlm_sysid_bmap, nlm_sysid_nidx)) {
		BT_SET(nlm_sysid_bmap, nlm_sysid_nidx);
		ret_sysid = nlm_sysid_nidx++;
	} else {
		index_t id;

		id = bt_availbit(nlm_sysid_bmap, NLM_BMAP_NITEMS);
		if (id > 0) {
			nlm_sysid_nidx = id + 1;
			ret_sysid = id;
			BT_SET(nlm_sysid_bmap, id);
		}
	}

	rw_exit(&lm_lck);
	return (ret_sysid);
}

void
nlm_sysid_free(sysid_t sysid)
{
	ASSERT(sysid >= LM_SYSID && sysid <= LM_SYSID_MAX);

	rw_enter(&lm_lck, RW_WRITER);
	ASSERT(BT_TEST(nlm_sysid_bmap, sysid));
	BT_CLEAR(nlm_sysid_bmap, sysid);
	rw_exit(&lm_lck);
}

/*
 * Return true if the request came from a local caller.
 * By necessity, this "knows" the netid names invented
 * in lm_svc() and nlm_netid_from_knetconfig().
 */
bool_t
nlm_caller_is_local(SVCXPRT *transp)
{
	char *netid;
	struct netbuf *rtaddr;

	netid = svc_getnetid(transp);
	rtaddr = svc_getrpccaller(transp);

	if (netid == NULL)
		return (FALSE);

	if (strcmp(netid, "ticlts") == 0 ||
	    strcmp(netid, "ticotsord") == 0)
		return (TRUE);

	if (strcmp(netid, "tcp") == 0 || strcmp(netid, "udp") == 0) {
		struct sockaddr_in *sin = (void *)rtaddr->buf;
		if (sin->sin_addr.s_addr == htonl(INADDR_LOOPBACK))
			return (TRUE);
	}
	if (strcmp(netid, "tcp6") == 0 || strcmp(netid, "udp6") == 0) {
		struct sockaddr_in6 *sin6 = (void *)rtaddr->buf;
		if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))
			return (TRUE);
	}

	return (FALSE); /* unknown transport */
}

/*
 * Get netid string correspondig to the given knetconfig.
 * If not done already, save knc->knc_rdev in our table.
 */
const char *
nlm_knc_to_netid(struct knetconfig *knc)
{
	int i;
	dev_t rdev;
	struct nlm_knc *nc;
	const char *netid = NULL;

	rw_enter(&lm_lck, RW_READER);
	for (i = 0; i < NLM_KNCS; i++) {
		nc = &nlm_netconfigs[i];

		if (nc->n_knc.knc_semantics == knc->knc_semantics &&
		    strcmp(nc->n_knc.knc_protofmly,
		    knc->knc_protofmly) == 0) {
			netid = nc->n_netid;
			rdev = nc->n_knc.knc_rdev;
			break;
		}
	}
	rw_exit(&lm_lck);

	if (netid != NULL && rdev == NODEV) {
		rw_enter(&lm_lck, RW_WRITER);
		if (nc->n_knc.knc_rdev == NODEV)
			nc->n_knc.knc_rdev = knc->knc_rdev;
		rw_exit(&lm_lck);
	}

	return (netid);
}

/*
 * Get a knetconfig corresponding to the given netid.
 * If there's no knetconfig for this netid, ENOENT
 * is returned.
 */
int
nlm_knc_from_netid(const char *netid, struct knetconfig *knc)
{
	int i, ret;

	ret = ENOENT;
	for (i = 0; i < NLM_KNCS; i++) {
		struct nlm_knc *nknc;

		nknc = &nlm_netconfigs[i];
		if (strcmp(netid, nknc->n_netid) == 0 &&
		    nknc->n_knc.knc_rdev != NODEV) {
			*knc = nknc->n_knc;
			ret = 0;
			break;
		}
	}

	return (ret);
}

void
nlm_cprsuspend(void)
{
	struct nlm_globals *g;

	rw_enter(&lm_lck, RW_READER);
	TAILQ_FOREACH(g, &nlm_zones_list, nlm_link)
		nlm_suspend_zone(g);

	rw_exit(&lm_lck);
}

void
nlm_cprresume(void)
{
	struct nlm_globals *g;

	rw_enter(&lm_lck, RW_READER);
	TAILQ_FOREACH(g, &nlm_zones_list, nlm_link)
		nlm_resume_zone(g);

	rw_exit(&lm_lck);
}

static void
nlm_nsm_clnt_init(CLIENT *clnt, struct nlm_nsm *nsm)
{
	(void) clnt_tli_kinit(clnt, &nsm->ns_knc, &nsm->ns_addr, 0,
	    NLM_RPC_RETRIES, zone_kcred());
}

static void
nlm_netbuf_to_netobj(struct netbuf *addr, int *family, netobj *obj)
{
	/* LINTED pointer alignment */
	struct sockaddr *sa = (struct sockaddr *)addr->buf;

	*family = sa->sa_family;

	switch (sa->sa_family) {
	case AF_INET: {
		/* LINTED pointer alignment */
		struct sockaddr_in *sin = (struct sockaddr_in *)sa;

		obj->n_len = sizeof (sin->sin_addr);
		obj->n_bytes = (char *)&sin->sin_addr;
		break;
	}

	case AF_INET6: {
		/* LINTED pointer alignment */
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;

		obj->n_len = sizeof (sin6->sin6_addr);
		obj->n_bytes = (char *)&sin6->sin6_addr;
		break;
	}

	default:
		VERIFY(0);
		break;
	}
}