/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2012 Milan Jurik. All rights reserved.
 */
/* Copyright (c) 1990 Mentat Inc. */
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Kernel RPC filtering module
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/tihdr.h>
#include <sys/timod.h>
#include <sys/tiuser.h>
#include <sys/debug.h>
#include <sys/signal.h>
#include <sys/pcb.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/inline.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/vtrace.h>
#include <sys/callb.h>
#include <sys/strsun.h>
#include <sys/zone.h>

#include <sys/strlog.h>
#include <rpc/rpc_com.h>
#include <inet/common.h>
#include <rpc/types.h>
#include <sys/time.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <rpc/rpc_msg.h>
#include <rpc/svc.h>
#include <rpc/rpcsys.h>
#include <rpc/rpc_rdma.h>

/*
 * This is the loadable module wrapper.
 */
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/syscall.h>

extern struct streamtab rpcinfo;

static struct fmodsw fsw = {
	"rpcmod",
	&rpcinfo,
	D_NEW|D_MP,
};

/*
 * Module linkage information for the kernel.
 */

static struct modlstrmod modlstrmod = {
	&mod_strmodops, "rpc interface str mod", &fsw
};

/*
 * For the RPC system call.
 */
static struct sysent rpcsysent = {
	2,
	SE_32RVAL1 | SE_ARGC | SE_NOUNLOAD,
	rpcsys
};

static struct modlsys modlsys = {
	&mod_syscallops,
	"RPC syscall",
	&rpcsysent
};

#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit RPC syscall",
	&rpcsysent
};
#endif /* _SYSCALL32_IMPL */

static struct modlinkage modlinkage = {
	MODREV_1,
	{
		&modlsys,
#ifdef _SYSCALL32_IMPL
		&modlsys32,
#endif
		&modlstrmod,
		NULL
	}
};

int
_init(void)
{
	int error = 0;
	callb_id_t cid;
	int status;

	svc_init();
	clnt_init();
	cid = callb_add(connmgr_cpr_reset, 0, CB_CL_CPR_RPC, "rpc");

	if (error = mod_install(&modlinkage)) {
		/*
		 * Could not install module, cleanup previous
		 * initialization work.
		 */
		clnt_fini();
		if (cid != NULL)
			(void) callb_delete(cid);

		return (error);
	}

	/*
	 * Load up the RDMA plugins and initialize the stats.  Even if the
	 * plugins fail to load, the counters still get initialized as long
	 * as rpcmod itself was successfully installed.
	 */
	rw_init(&rdma_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rdma_modload_lock, NULL, MUTEX_DEFAULT, NULL);

	cv_init(&rdma_wait.svc_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&rdma_wait.svc_lock, NULL, MUTEX_DEFAULT, NULL);

	mt_kstat_init();

	/*
	 * Get our identification into ldi.  This is used for loading
	 * other modules, e.g. rpcib.
	 */
	status = ldi_ident_from_mod(&modlinkage, &rpcmod_li);
	if (status != 0) {
		cmn_err(CE_WARN, "ldi_ident_from_mod fails with %d", status);
		rpcmod_li = NULL;
	}

	return (error);
}

/*
 * The unload entry point fails, because we advertise entry points into
 * rpcmod from the rest of kRPC: rpcmod_release().
 */
int
_fini(void)
{
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

extern int nulldev();

#define	RPCMOD_ID	2049

int rmm_open(queue_t *, dev_t *, int, int, cred_t *);
int rmm_close(queue_t *, int, cred_t *);

/*
 * To save instructions, since STREAMS ignores the return value
 * from these functions, they are defined as void here. Kind of icky, but...
 */
void rmm_rput(queue_t *, mblk_t *);
void rmm_wput(queue_t *, mblk_t *);
void rmm_rsrv(queue_t *);
void rmm_wsrv(queue_t *);

int rpcmodopen(queue_t *, dev_t *, int, int, cred_t *);
int rpcmodclose(queue_t *, int, cred_t *);
void rpcmodrput(queue_t *, mblk_t *);
void rpcmodwput(queue_t *, mblk_t *);
void rpcmodrsrv();
void rpcmodwsrv(queue_t *);

static void rpcmodwput_other(queue_t *, mblk_t *);
static int mir_close(queue_t *q);
static int mir_open(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static void mir_rput(queue_t *q, mblk_t *mp);
static void mir_rsrv(queue_t *q);
static void mir_wput(queue_t *q, mblk_t *mp);
static void mir_wsrv(queue_t *q);

static struct module_info rpcmod_info =
	{RPCMOD_ID, "rpcmod", 0, INFPSZ, 256*1024, 1024};

/*
 * Read side has no service procedure.
 */
static struct qinit rpcmodrinit = {
	(int (*)())rmm_rput,
	(int (*)())rmm_rsrv,
	rmm_open,
	rmm_close,
	nulldev,
	&rpcmod_info,
	NULL
};

/*
 * The write put procedure is simply putnext to conserve stack space.
 * The write service procedure is not used to queue data, but instead to
 * synchronize with flow control.
 */
static struct qinit rpcmodwinit = {
	(int (*)())rmm_wput,
	(int (*)())rmm_wsrv,
	rmm_open,
	rmm_close,
	nulldev,
	&rpcmod_info,
	NULL
};
struct streamtab rpcinfo = { &rpcmodrinit, &rpcmodwinit, NULL, NULL };

struct xprt_style_ops {
	int (*xo_open)();
	int (*xo_close)();
	void (*xo_wput)();
	void (*xo_wsrv)();
	void (*xo_rput)();
	void (*xo_rsrv)();
};

static struct xprt_style_ops xprt_clts_ops = {
	rpcmodopen,
	rpcmodclose,
	rpcmodwput,
	rpcmodwsrv,
	rpcmodrput,
	NULL
};

static struct xprt_style_ops xprt_cots_ops = {
	mir_open,
	mir_close,
	mir_wput,
	mir_wsrv,
	mir_rput,
	mir_rsrv
};

/*
 * Per rpcmod "slot" data structure. q->q_ptr points to one of these.
 */
struct rpcm {
	void		*rm_krpc_cell;	/* Reserved for use by KRPC */
	struct		xprt_style_ops	*rm_ops;
	int		rm_type;	/* Client or server side stream */
#define	RM_CLOSING	0x1		/* somebody is trying to close slot */
	uint_t		rm_state;	/* state of the slot. see above */
	uint_t		rm_ref;		/* cnt of external references to slot */
	kmutex_t	rm_lock;	/* mutex protecting above fields */
	kcondvar_t	rm_cwait;	/* condition for closing */
	zoneid_t	rm_zoneid;	/* zone which pushed rpcmod */
};

struct temp_slot {
	void *cell;
	struct xprt_style_ops *ops;
	int type;
	mblk_t *info_ack;
	kmutex_t lock;
	kcondvar_t wait;
};

typedef struct mir_s {
	void	*mir_krpc_cell;	/* Reserved for KRPC use. This field */
					/* must be first in the structure. */
	struct xprt_style_ops	*rm_ops;
	int	mir_type;		/* Client or server side stream */

	mblk_t	*mir_head_mp;		/* RPC msg in progress */
		/*
		 * mir_head_mp points to the first mblk being collected in
		 * the current RPC message.  Record headers are removed
		 * before data is linked into mir_head_mp.
		 */
	mblk_t	*mir_tail_mp;		/* Last mblk in mir_head_mp */
		/*
		 * mir_tail_mp points to the last mblk in the message
		 * chain starting at mir_head_mp.  It is only valid
		 * if mir_head_mp is non-NULL and is used to add new
		 * data blocks to the end of chain quickly.
		 */

	int32_t	mir_frag_len;		/* Bytes seen in the current frag */
		/*
		 * mir_frag_len starts at -4 for beginning of each fragment.
		 * When this length is negative, it indicates the number of
		 * bytes that rpcmod needs to complete the record marker
		 * header.  When it is positive or zero, it holds the number
		 * of bytes that have arrived for the current fragment and
		 * are held in mir_head_mp.
		 */
	int32_t	mir_frag_header;
		/*
		 * Fragment header as collected for the current fragment.
		 * It holds the last-fragment indicator and the number
		 * of bytes in the fragment.
		 */
	unsigned int
		mir_ordrel_pending : 1,	/* Sent T_ORDREL_REQ */
		mir_hold_inbound : 1,	/* Hold inbound messages on server */
					/* side until outbound flow control */
					/* is relieved. */
		mir_closing : 1,	/* The stream is being closed */
		mir_inrservice : 1,	/* data queued or rd srv proc running */
		mir_inwservice : 1,	/* data queued or wr srv proc running */
		mir_inwflushdata : 1,	/* flush M_DATAs when srv runs */
		/*
		 * On client streams, mir_clntreq is 0 or 1; it is set
		 * to 1 whenever a new request is sent out (mir_wput)
		 * and cleared when the timer fires (mir_timer).  If
		 * the timer fires with this value equal to 0, then the
		 * stream is considered idle and KRPC is notified.
		 */
		mir_clntreq : 1,
		/*
		 * On server streams, stop accepting messages
		 */
		mir_svc_no_more_msgs : 1,
		mir_listen_stream : 1,	/* listen end point */
		mir_unused : 1,		/* no longer used */
		mir_timer_call : 1,
		mir_junk_fill_thru_bit_31 : 21;

	int	mir_setup_complete;	/* server has initialized everything */
	timeout_id_t mir_timer_id;	/* Timer for idle checks */
	clock_t	mir_idle_timeout;	/* Allowed idle time before shutdown */
		/*
		 * This value is copied from clnt_idle_timeout or
		 * svc_idle_timeout during the appropriate ioctl.
		 * Kept in milliseconds
		 */
	clock_t	mir_use_timestamp;	/* updated on client with each use */
		/*
		 * This value is set to lbolt
		 * every time a client stream sends or receives data.
		 * Even if the timer message arrives, we don't shutdown
		 * client unless:
		 *    lbolt >= MSEC_TO_TICK(mir_idle_timeout)+mir_use_timestamp
		 * This value is kept in HZ.
		 */

	uint_t	*mir_max_msg_sizep;	/* Reference to sanity check size */
		/*
		 * This pointer is set to &clnt_max_msg_size or
		 * &svc_max_msg_size during the appropriate ioctl.
		 */
	zoneid_t mir_zoneid;	/* zone which pushed rpcmod */
	/* Server-side fields. */
	int	mir_ref_cnt;		/* Reference count: server side only */
					/* counts the number of references */
					/* that a kernel RPC server thread */
					/* (see svc_run()) has on this rpcmod */
					/* slot. Effectively, it is the */
					/* number of unprocessed messages */
					/* that have been passed up to the */
					/* KRPC layer */

	mblk_t	*mir_svc_pend_mp;	/* Pending T_ORDREL_IND or */
					/* T_DISCON_IND */

	/*
	 * these fields are for both client and server, but for debugging,
	 * it is easier to have these last in the structure.
	 */
	kmutex_t	mir_mutex;	/* Mutex and condvar for close */
	kcondvar_t	mir_condvar;	/* synchronization. */
	kcondvar_t	mir_timer_cv;	/* Timer routine sync. */
} mir_t;

void tmp_rput(queue_t *q, mblk_t *mp);

struct xprt_style_ops tmpops = {
	NULL,
	NULL,
	putnext,
	NULL,
	tmp_rput,
	NULL
};

void
tmp_rput(queue_t *q, mblk_t *mp)
{
	struct temp_slot *t = (struct temp_slot *)(q->q_ptr);
	struct T_info_ack *pptr;

	switch (mp->b_datap->db_type) {
	case M_PCPROTO:
		pptr = (struct T_info_ack *)mp->b_rptr;
		switch (pptr->PRIM_type) {
		case T_INFO_ACK:
			mutex_enter(&t->lock);
			t->info_ack = mp;
			cv_signal(&t->wait);
			mutex_exit(&t->lock);
			return;
		default:
			break;
		}
	default:
		break;
	}

	/*
	 * Not an info-ack, so free it. This is ok because we should
	 * not be receiving data until the open finishes: rpcmod
	 * is pushed well before the end-point is bound to an address.
	 */
	freemsg(mp);
}
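/*
 * Illustrative sketch (not part of this module): the 4-byte word that
 * mir_frag_header accumulates is the standard RPC record mark used on
 * stream transports -- the high bit (MIR_LASTFRAG) flags the last fragment
 * and the low 31 bits carry the fragment length.  A mark of 0x80000038,
 * for example, announces a final fragment of 0x38 = 56 bytes.  A simple
 * decoder might do:
 *
 *	uint32_t rm = ntohl(*(uint32_t *)buf);
 *	boolean_t last = (rm & 0x80000000) != 0;
 *	uint32_t flen = rm & 0x7fffffff;
 */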
int
rmm_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *crp)
{
	mblk_t *bp;
	struct temp_slot ts, *t;
	struct T_info_ack *pptr;
	int error = 0;

	ASSERT(q != NULL);
	/*
	 * Check for re-opens.
	 */
	if (q->q_ptr) {
		TRACE_1(TR_FAC_KRPC, TR_RPCMODOPEN_END,
		    "rpcmodopen_end:(%s)", "q->qptr");
		return (0);
	}

	t = &ts;
	bzero(t, sizeof (*t));
	q->q_ptr = (void *)t;
	WR(q)->q_ptr = (void *)t;

	/*
	 * Allocate the required messages upfront.
	 */
	if ((bp = allocb_cred(sizeof (struct T_info_req) +
	    sizeof (struct T_info_ack), crp, curproc->p_pid)) == NULL) {
		return (ENOBUFS);
	}

	mutex_init(&t->lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&t->wait, NULL, CV_DEFAULT, NULL);

	t->ops = &tmpops;

	qprocson(q);
	bp->b_datap->db_type = M_PCPROTO;
	*(int32_t *)bp->b_wptr = (int32_t)T_INFO_REQ;
	bp->b_wptr += sizeof (struct T_info_req);
	putnext(WR(q), bp);

	mutex_enter(&t->lock);
	while (t->info_ack == NULL) {
		if (cv_wait_sig(&t->wait, &t->lock) == 0) {
			error = EINTR;
			break;
		}
	}
	mutex_exit(&t->lock);

	if (error)
		goto out;

	pptr = (struct T_info_ack *)t->info_ack->b_rptr;

	if (pptr->SERV_type == T_CLTS) {
		if ((error = rpcmodopen(q, devp, flag, sflag, crp)) == 0)
			((struct rpcm *)q->q_ptr)->rm_ops = &xprt_clts_ops;
	} else {
		if ((error = mir_open(q, devp, flag, sflag, crp)) == 0)
			((mir_t *)q->q_ptr)->rm_ops = &xprt_cots_ops;
	}

out:
	if (error)
		qprocsoff(q);

	freemsg(t->info_ack);
	mutex_destroy(&t->lock);
	cv_destroy(&t->wait);

	return (error);
}

void
rmm_rput(queue_t *q, mblk_t *mp)
{
	(*((struct temp_slot *)q->q_ptr)->ops->xo_rput)(q, mp);
}

void
rmm_rsrv(queue_t *q)
{
	(*((struct temp_slot *)q->q_ptr)->ops->xo_rsrv)(q);
}

void
rmm_wput(queue_t *q, mblk_t *mp)
{
	(*((struct temp_slot *)q->q_ptr)->ops->xo_wput)(q, mp);
}

void
rmm_wsrv(queue_t *q)
{
	(*((struct temp_slot *)q->q_ptr)->ops->xo_wsrv)(q);
}

int
rmm_close(queue_t *q, int flag, cred_t *crp)
{
	return ((*((struct temp_slot *)q->q_ptr)->ops->xo_close)(q, flag, crp));
}

static void rpcmod_release(queue_t *, mblk_t *);

/*
 * rpcmodopen -	open routine gets called when the module gets pushed
 *		onto the stream.
 */
/*ARGSUSED*/
int
rpcmodopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *crp)
{
	struct rpcm *rmp;

	extern void (*rpc_rele)(queue_t *, mblk_t *);

	TRACE_0(TR_FAC_KRPC, TR_RPCMODOPEN_START, "rpcmodopen_start:");

	/*
	 * Initialize entry points to release a rpcmod slot (and an input
	 * message if supplied) and to send an output message to the module
	 * below rpcmod.
	 */
	if (rpc_rele == NULL)
		rpc_rele = rpcmod_release;

	/*
	 * Only sufficiently privileged users can use this module, and it
	 * is assumed that they will use this module properly, and NOT send
	 * bulk data from downstream.
	 */
	if (secpolicy_rpcmod_open(crp) != 0)
		return (EPERM);

	/*
	 * Allocate slot data structure.
	 */
	rmp = kmem_zalloc(sizeof (*rmp), KM_SLEEP);

	mutex_init(&rmp->rm_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rmp->rm_cwait, NULL, CV_DEFAULT, NULL);
	rmp->rm_zoneid = rpc_zoneid();
	/*
	 * slot type will be set by kRPC client and server ioctl's
	 */
	rmp->rm_type = 0;

	q->q_ptr = (void *)rmp;
	WR(q)->q_ptr = (void *)rmp;

	TRACE_1(TR_FAC_KRPC, TR_RPCMODOPEN_END, "rpcmodopen_end:(%s)", "end");
	return (0);
}
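/*
 * Illustrative note (not part of this module): rmm_open() above picks the
 * ops vector from the transport's T_INFO_ACK.  Connectionless transports
 * (SERV_type == T_CLTS, e.g. UDP) get xprt_clts_ops; connection-oriented
 * ones (T_COTS/T_COTS_ORD, e.g. TCP) get the record-marking xprt_cots_ops.
 * In effect:
 *
 *	ops = (serv_type == T_CLTS) ? &xprt_clts_ops : &xprt_cots_ops;
 */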
/*
 * rpcmodclose - This routine gets called when the module gets popped
 * off of the stream.
 */
/*ARGSUSED*/
int
rpcmodclose(queue_t *q, int flag, cred_t *crp)
{
	struct rpcm *rmp;

	ASSERT(q != NULL);
	rmp = (struct rpcm *)q->q_ptr;

	/*
	 * Mark our state as closing.
	 */
	mutex_enter(&rmp->rm_lock);
	rmp->rm_state |= RM_CLOSING;

	/*
	 * Check and see if there are any messages on the queue.  If so, send
	 * the messages, regardless of whether the downstream module is ready
	 * to accept data.
	 */
	if (rmp->rm_type == RPC_SERVER) {
		flushq(q, FLUSHDATA);

		qenable(WR(q));

		if (rmp->rm_ref) {
			mutex_exit(&rmp->rm_lock);
			/*
			 * call into SVC to clean the queue
			 */
			svc_queueclean(q);
			mutex_enter(&rmp->rm_lock);

			/*
			 * Block while there are kRPC threads with a reference
			 * to this message.
			 */
			while (rmp->rm_ref)
				cv_wait(&rmp->rm_cwait, &rmp->rm_lock);
		}

		mutex_exit(&rmp->rm_lock);

		/*
		 * It is now safe to remove this queue from the stream. No kRPC
		 * threads have a reference to the stream, and none ever will,
		 * because RM_CLOSING is set.
		 */
		qprocsoff(q);

		/* Notify kRPC that this stream is going away. */
		svc_queueclose(q);
	} else {
		mutex_exit(&rmp->rm_lock);
		qprocsoff(q);
	}

	q->q_ptr = NULL;
	WR(q)->q_ptr = NULL;
	mutex_destroy(&rmp->rm_lock);
	cv_destroy(&rmp->rm_cwait);
	kmem_free(rmp, sizeof (*rmp));
	return (0);
}

#ifdef DEBUG
int	rpcmod_send_msg_up = 0;
int	rpcmod_send_uderr = 0;
int	rpcmod_send_dup = 0;
int	rpcmod_send_dup_cnt = 0;
#endif

/*
 * rpcmodrput -	Module read put procedure.  This is called from
 *		the module, driver, or stream head downstream.
 */
void
rpcmodrput(queue_t *q, mblk_t *mp)
{
	struct rpcm *rmp;
	union T_primitives *pptr;
	int hdrsz;

	TRACE_0(TR_FAC_KRPC, TR_RPCMODRPUT_START, "rpcmodrput_start:");

	ASSERT(q != NULL);
	rmp = (struct rpcm *)q->q_ptr;

	if (rmp->rm_type == 0) {
		freemsg(mp);
		return;
	}

#ifdef DEBUG
	if (rpcmod_send_msg_up > 0) {
		mblk_t *nmp = copymsg(mp);
		if (nmp) {
			putnext(q, nmp);
			rpcmod_send_msg_up--;
		}
	}
	if ((rpcmod_send_uderr > 0) && mp->b_datap->db_type == M_PROTO) {
		mblk_t *nmp;
		struct T_unitdata_ind *data;
		struct T_uderror_ind *ud;
		int d;

		data = (struct T_unitdata_ind *)mp->b_rptr;
		if (data->PRIM_type == T_UNITDATA_IND) {
			d = sizeof (*ud) - sizeof (*data);
			nmp = allocb(mp->b_wptr - mp->b_rptr + d, BPRI_HI);
			if (nmp) {
				ud = (struct T_uderror_ind *)nmp->b_rptr;
				ud->PRIM_type = T_UDERROR_IND;
				ud->DEST_length = data->SRC_length;
				ud->DEST_offset = data->SRC_offset + d;
				ud->OPT_length = data->OPT_length;
				ud->OPT_offset = data->OPT_offset + d;
				ud->ERROR_type = ENETDOWN;
				if (data->SRC_length) {
					bcopy(mp->b_rptr + data->SRC_offset,
					    nmp->b_rptr + ud->DEST_offset,
					    data->SRC_length);
				}
				if (data->OPT_length) {
					bcopy(mp->b_rptr + data->OPT_offset,
					    nmp->b_rptr + ud->OPT_offset,
					    data->OPT_length);
				}
				nmp->b_wptr += d;
				nmp->b_wptr += (mp->b_wptr - mp->b_rptr);
				nmp->b_datap->db_type = M_PROTO;
				putnext(q, nmp);
				rpcmod_send_uderr--;
			}
		}
	}
#endif
	switch (mp->b_datap->db_type) {
	default:
		putnext(q, mp);
		break;

	case M_PROTO:
	case M_PCPROTO:
		ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (int32_t));
		pptr = (union T_primitives *)mp->b_rptr;

		/*
		 * Forward this message to kRPC if it is data.
		 */
		if (pptr->type == T_UNITDATA_IND) {
			mblk_t *nmp;

			/*
			 * Check if the module is being popped.
			 */
			mutex_enter(&rmp->rm_lock);
			if (rmp->rm_state & RM_CLOSING) {
				mutex_exit(&rmp->rm_lock);
				putnext(q, mp);
				break;
			}

			switch (rmp->rm_type) {
			case RPC_CLIENT:
				mutex_exit(&rmp->rm_lock);
				hdrsz = mp->b_wptr - mp->b_rptr;

				/*
				 * Make sure the header is sane.
				 */
				if (hdrsz < TUNITDATAINDSZ ||
				    hdrsz < (pptr->unitdata_ind.OPT_length +
				    pptr->unitdata_ind.OPT_offset) ||
				    hdrsz < (pptr->unitdata_ind.SRC_length +
				    pptr->unitdata_ind.SRC_offset)) {
					freemsg(mp);
					return;
				}

				/*
				 * Call clnt_clts_dispatch_notify, so that it
				 * can pass the message to the proper caller.
				 * Don't discard the header just yet since the
				 * client may need the sender's address.
				 */
				clnt_clts_dispatch_notify(mp, hdrsz,
				    rmp->rm_zoneid);
				return;
			case RPC_SERVER:
				/*
				 * rm_krpc_cell is exclusively used by the kRPC
				 * CLTS server.
				 */
				if (rmp->rm_krpc_cell) {
#ifdef DEBUG
					/*
					 * Test duplicate request cache and
					 * rm_ref count handling by sending a
					 * duplicate every so often, if
					 * desired.
					 */
					if (rpcmod_send_dup &&
					    rpcmod_send_dup_cnt++ %
					    rpcmod_send_dup)
						nmp = copymsg(mp);
					else
						nmp = NULL;
#endif
					/*
					 * Raise the reference count on this
					 * module to prevent it from being
					 * popped before kRPC generates the
					 * reply.
					 */
					rmp->rm_ref++;
					mutex_exit(&rmp->rm_lock);

					/*
					 * Submit the message to kRPC.
					 */
					svc_queuereq(q, mp);
#ifdef DEBUG
					/*
					 * Send duplicate if we created one.
					 */
					if (nmp) {
						mutex_enter(&rmp->rm_lock);
						rmp->rm_ref++;
						mutex_exit(&rmp->rm_lock);
						svc_queuereq(q, nmp);
					}
#endif
				} else {
					mutex_exit(&rmp->rm_lock);
					freemsg(mp);
				}
				return;
			default:
				mutex_exit(&rmp->rm_lock);
				freemsg(mp);
				return;
			} /* end switch(rmp->rm_type) */
		} else if (pptr->type == T_UDERROR_IND) {
			mutex_enter(&rmp->rm_lock);
			hdrsz = mp->b_wptr - mp->b_rptr;

			/*
			 * Make sure the header is sane
			 */
			if (hdrsz < TUDERRORINDSZ ||
			    hdrsz < (pptr->uderror_ind.OPT_length +
			    pptr->uderror_ind.OPT_offset) ||
			    hdrsz < (pptr->uderror_ind.DEST_length +
			    pptr->uderror_ind.DEST_offset)) {
				mutex_exit(&rmp->rm_lock);
				freemsg(mp);
				return;
			}

			/*
			 * In the case where a unit data error has been
			 * received, all we need to do is clear the message from
			 * the queue.
			 */
			mutex_exit(&rmp->rm_lock);
			freemsg(mp);
			RPCLOG(32, "rpcmodrput: unitdata error received at "
			    "%ld\n", gethrestime_sec());
			return;
		} /* end else if (pptr->type == T_UDERROR_IND) */

		putnext(q, mp);
		break;
	} /* end switch (mp->b_datap->db_type) */

	TRACE_0(TR_FAC_KRPC, TR_RPCMODRPUT_END, "rpcmodrput_end:");
	/*
	 * Return codes are not looked at by the STREAMS framework.
	 */
}

/*
 * write put procedure
 */
void
rpcmodwput(queue_t *q, mblk_t *mp)
{
	struct rpcm	*rmp;

	ASSERT(q != NULL);

	switch (mp->b_datap->db_type) {
	case M_PROTO:
	case M_PCPROTO:
		break;
	default:
		rpcmodwput_other(q, mp);
		return;
	}

	/*
	 * Check to see if we can send the message downstream.
	 */
	if (canputnext(q)) {
		putnext(q, mp);
		return;
	}

	rmp = (struct rpcm *)q->q_ptr;
	ASSERT(rmp != NULL);

	/*
	 * The first canputnext failed.  Try again except this time with the
	 * lock held, so that we can check the state of the stream to see if
	 * it is closing.  If either of these conditions evaluates to true,
	 * then send the message.
	 */
	mutex_enter(&rmp->rm_lock);
	if (canputnext(q) || (rmp->rm_state & RM_CLOSING)) {
		mutex_exit(&rmp->rm_lock);
		putnext(q, mp);
	} else {
		/*
		 * canputnext failed again and the stream is not closing.
		 * Place the message on the queue and let the service
		 * procedure handle the message.
		 */
		mutex_exit(&rmp->rm_lock);
		(void) putq(q, mp);
	}
}

static void
rpcmodwput_other(queue_t *q, mblk_t *mp)
{
	struct rpcm	*rmp;
	struct iocblk	*iocp;

	rmp = (struct rpcm *)q->q_ptr;
	ASSERT(rmp != NULL);

	switch (mp->b_datap->db_type) {
	case M_IOCTL:
		iocp = (struct iocblk *)mp->b_rptr;
		ASSERT(iocp != NULL);
		switch (iocp->ioc_cmd) {
		case RPC_CLIENT:
		case RPC_SERVER:
			mutex_enter(&rmp->rm_lock);
			rmp->rm_type = iocp->ioc_cmd;
			mutex_exit(&rmp->rm_lock);
			mp->b_datap->db_type = M_IOCACK;
			qreply(q, mp);
			return;
		default:
			/*
			 * pass the ioctl downstream and hope someone
			 * down there knows how to handle it.
			 */
			putnext(q, mp);
			return;
		}
	default:
		break;
	}
	/*
	 * This is something we definitely do not know how to handle, just
	 * pass the message downstream
	 */
	putnext(q, mp);
}

/*
 * Module write service procedure.
 * This is called by downstream modules for back enabling during flow
 * control.
 */
void
rpcmodwsrv(queue_t *q)
{
	struct rpcm	*rmp;
	mblk_t		*mp = NULL;

	rmp = (struct rpcm *)q->q_ptr;
	ASSERT(rmp != NULL);

	/*
	 * Get messages that may be queued and send them downstream.
	 */
	while ((mp = getq(q)) != NULL) {
		/*
		 * Optimize the service procedure for the server-side, by
		 * avoiding a call to canputnext().
		 */
		if (rmp->rm_type == RPC_SERVER || canputnext(q)) {
			putnext(q, mp);
			continue;
		}
		(void) putbq(q, mp);
		return;
	}
}

static void
rpcmod_release(queue_t *q, mblk_t *bp)
{
	struct rpcm *rmp;

	/*
	 * For now, just free the message.
	 */
	if (bp)
		freemsg(bp);
	rmp = (struct rpcm *)q->q_ptr;

	mutex_enter(&rmp->rm_lock);
	rmp->rm_ref--;

	if (rmp->rm_ref == 0 && (rmp->rm_state & RM_CLOSING)) {
		cv_broadcast(&rmp->rm_cwait);
	}

	mutex_exit(&rmp->rm_lock);
}

/*
 * This part of rpcmod is pushed on a connection-oriented transport for use
 * by RPC.  It serves to bypass the Stream head, implements
 * the record marking protocol, and dispatches incoming RPC messages.
 */

/* Default idle timer values */
#define	MIR_CLNT_IDLE_TIMEOUT	(5 * (60 * 1000L))	/* 5 minutes */
#define	MIR_SVC_IDLE_TIMEOUT	(6 * (60 * 1000L))	/* 6 minutes */
#define	MIR_SVC_ORDREL_TIMEOUT	(10 * (60 * 1000L))	/* 10 minutes */
#define	MIR_LASTFRAG	0x80000000	/* Record marker */

#define	MIR_SVC_QUIESCED(mir)	\
	(mir->mir_ref_cnt == 0 && mir->mir_inrservice == 0)

#define	MIR_CLEAR_INRSRV(mir_ptr)	{	\
	(mir_ptr)->mir_inrservice = 0;	\
	if ((mir_ptr)->mir_type == RPC_SERVER &&	\
		(mir_ptr)->mir_closing)	\
		cv_signal(&(mir_ptr)->mir_condvar);	\
}

/*
 * Don't block service procedure (and mir_close) if
 * we are in the process of closing.
 */
#define	MIR_WCANPUTNEXT(mir_ptr, write_q)	\
	(canputnext(write_q) || ((mir_ptr)->mir_svc_no_more_msgs == 1))

static int	mir_clnt_dup_request(queue_t *q, mblk_t *mp);
static void	mir_rput_proto(queue_t *q, mblk_t *mp);
static int	mir_svc_policy_notify(queue_t *q, int event);
static void	mir_svc_release(queue_t *wq, mblk_t *mp);
static void	mir_svc_start(queue_t *wq);
static void	mir_svc_idle_start(queue_t *, mir_t *);
static void	mir_svc_idle_stop(queue_t *, mir_t *);
static void	mir_svc_start_close(queue_t *, mir_t *);
static void	mir_clnt_idle_do_stop(queue_t *);
static void	mir_clnt_idle_stop(queue_t *, mir_t *);
static void	mir_clnt_idle_start(queue_t *, mir_t *);
static void	mir_wput(queue_t *q, mblk_t *mp);
static void	mir_wput_other(queue_t *q, mblk_t *mp);
static void	mir_wsrv(queue_t *q);
static void	mir_disconnect(queue_t *, mir_t *ir);
static int	mir_check_len(queue_t *, int32_t, mblk_t *);
static void	mir_timer(void *);

extern void	(*mir_rele)(queue_t *, mblk_t *);
extern void	(*mir_start)(queue_t *);
extern void	(*clnt_stop_idle)(queue_t *);

clock_t	clnt_idle_timeout = MIR_CLNT_IDLE_TIMEOUT;
clock_t	svc_idle_timeout = MIR_SVC_IDLE_TIMEOUT;

/*
 * Timeout for subsequent notifications of idle connection.  This is
 * typically used to clean up after a wedged orderly release.
 */
clock_t	svc_ordrel_timeout = MIR_SVC_ORDREL_TIMEOUT;	/* milliseconds */

extern	uint_t	*clnt_max_msg_sizep;
extern	uint_t	*svc_max_msg_sizep;
uint_t	clnt_max_msg_size = RPC_MAXDATASIZE;
uint_t	svc_max_msg_size = RPC_MAXDATASIZE;
uint_t	mir_krpc_cell_null;
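/*
 * Illustrative note (not part of this module): the idle timeouts above are
 * kept in milliseconds, so MIR_CLNT_IDLE_TIMEOUT works out to
 * 5 * 60 * 1000 = 300000 ms.  They are converted to clock ticks only when
 * a timeout is actually armed, as mir_timer_start() below does:
 *
 *	mir->mir_timer_id = timeout(mir_timer, q,
 *	    MSEC_TO_TICK(mir->mir_idle_timeout));
 */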
static void
mir_timer_stop(mir_t *mir)
{
	timeout_id_t tid;

	ASSERT(MUTEX_HELD(&mir->mir_mutex));

	/*
	 * Since the mir_mutex lock needs to be released to call
	 * untimeout(), we need to make sure that no other thread
	 * can start/stop the timer (changing mir_timer_id) during
	 * that time.  The mir_timer_call bit and the mir_timer_cv
	 * condition variable are used to synchronize this.  Setting
	 * mir_timer_call also tells mir_timer() (refer to the comments
	 * in mir_timer()) that it does not need to do anything.
	 */
	while (mir->mir_timer_call)
		cv_wait(&mir->mir_timer_cv, &mir->mir_mutex);
	mir->mir_timer_call = B_TRUE;

	if ((tid = mir->mir_timer_id) != 0) {
		mir->mir_timer_id = 0;
		mutex_exit(&mir->mir_mutex);
		(void) untimeout(tid);
		mutex_enter(&mir->mir_mutex);
	}
	mir->mir_timer_call = B_FALSE;
	cv_broadcast(&mir->mir_timer_cv);
}

static void
mir_timer_start(queue_t *q, mir_t *mir, clock_t intrvl)
{
	timeout_id_t tid;

	ASSERT(MUTEX_HELD(&mir->mir_mutex));

	while (mir->mir_timer_call)
		cv_wait(&mir->mir_timer_cv, &mir->mir_mutex);
	mir->mir_timer_call = B_TRUE;

	if ((tid = mir->mir_timer_id) != 0) {
		mutex_exit(&mir->mir_mutex);
		(void) untimeout(tid);
		mutex_enter(&mir->mir_mutex);
	}
	/* Only start the timer when it is not closing. */
	if (!mir->mir_closing) {
		mir->mir_timer_id = timeout(mir_timer, q,
		    MSEC_TO_TICK(intrvl));
	}
	mir->mir_timer_call = B_FALSE;
	cv_broadcast(&mir->mir_timer_cv);
}

static int
mir_clnt_dup_request(queue_t *q, mblk_t *mp)
{
	mblk_t	*mp1;
	uint32_t	new_xid;
	uint32_t	old_xid;

	ASSERT(MUTEX_HELD(&((mir_t *)q->q_ptr)->mir_mutex));
	new_xid = BE32_TO_U32(&mp->b_rptr[4]);
	/*
	 * This loop is a bit tacky -- it walks the STREAMS list of
	 * flow-controlled messages.
	 */
	if ((mp1 = q->q_first) != NULL) {
		do {
			old_xid = BE32_TO_U32(&mp1->b_rptr[4]);
			if (new_xid == old_xid)
				return (1);
		} while ((mp1 = mp1->b_next) != NULL);
	}
	return (0);
}

static int
mir_close(queue_t *q)
{
	mir_t	*mir = q->q_ptr;
	mblk_t	*mp;
	bool_t queue_cleaned = FALSE;

	RPCLOG(32, "rpcmod: mir_close of q 0x%p\n", (void *)q);
	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
	mutex_enter(&mir->mir_mutex);
	if ((mp = mir->mir_head_mp) != NULL) {
		mir->mir_head_mp = NULL;
		mir->mir_tail_mp = NULL;
		freemsg(mp);
	}
	/*
	 * Set mir_closing so we get notified when MIR_SVC_QUIESCED()
	 * is TRUE.  And mir_timer_start() won't start the timer again.
	 */
	mir->mir_closing = B_TRUE;
	mir_timer_stop(mir);

	if (mir->mir_type == RPC_SERVER) {
		flushq(q, FLUSHDATA);	/* Ditch anything waiting on read q */

		/*
		 * This will prevent more requests from arriving and
		 * will force rpcmod to ignore flow control.
		 */
		mir_svc_start_close(WR(q), mir);

		while ((!MIR_SVC_QUIESCED(mir)) || mir->mir_inwservice == 1) {

			if (mir->mir_ref_cnt && !mir->mir_inrservice &&
			    (queue_cleaned == FALSE)) {
				/*
				 * call into SVC to clean the queue
				 */
				mutex_exit(&mir->mir_mutex);
				svc_queueclean(q);
				queue_cleaned = TRUE;
				mutex_enter(&mir->mir_mutex);
				continue;
			}

			/*
			 * Bugid 1253810 - Force the write service
			 * procedure to send its messages, regardless
			 * of whether the downstream module is ready
			 * to accept data.
			 */
			if (mir->mir_inwservice == 1)
				qenable(WR(q));

			cv_wait(&mir->mir_condvar, &mir->mir_mutex);
		}

		mutex_exit(&mir->mir_mutex);
		qprocsoff(q);

		/* Notify KRPC that this stream is going away. */
		svc_queueclose(q);
	} else {
		mutex_exit(&mir->mir_mutex);
		qprocsoff(q);
	}

	mutex_destroy(&mir->mir_mutex);
	cv_destroy(&mir->mir_condvar);
	cv_destroy(&mir->mir_timer_cv);
	kmem_free(mir, sizeof (mir_t));
	return (0);
}

/*
 * This is server side only (RPC_SERVER).
 *
 * Exit idle mode.
 */
static void
mir_svc_idle_stop(queue_t *q, mir_t *mir)
{
	ASSERT(MUTEX_HELD(&mir->mir_mutex));
	ASSERT((q->q_flag & QREADR) == 0);
	ASSERT(mir->mir_type == RPC_SERVER);

	RPCLOG(16, "rpcmod: mir_svc_idle_stop of q 0x%p\n", (void *)q);

	mir_timer_stop(mir);
}
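/*
 * Illustrative sketch (not part of this module): mir_timer_stop() and
 * mir_timer_start() above use the classic untimeout() pattern -- the lock
 * must be dropped around untimeout(), since the timer handler may itself
 * be blocked on that lock, while a guard flag keeps concurrent start/stop
 * callers out:
 *
 *	flag = B_TRUE;			-- set under the mutex
 *	mutex_exit(&lock);
 *	(void) untimeout(tid);		-- may wait for the handler
 *	mutex_enter(&lock);
 *	flag = B_FALSE;
 *	cv_broadcast(&cv);		-- release waiting callers
 */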
/*
 * This is server side only (RPC_SERVER).
 *
 * Start idle processing, which will include setting idle timer if the
 * stream is not being closed.
 */
static void
mir_svc_idle_start(queue_t *q, mir_t *mir)
{
	ASSERT(MUTEX_HELD(&mir->mir_mutex));
	ASSERT((q->q_flag & QREADR) == 0);
	ASSERT(mir->mir_type == RPC_SERVER);

	RPCLOG(16, "rpcmod: mir_svc_idle_start q 0x%p\n", (void *)q);

	/*
	 * Don't re-start idle timer if we are closing queues.
	 */
	if (mir->mir_closing) {
		RPCLOG(16, "mir_svc_idle_start - closing: 0x%p\n",
		    (void *)q);

		/*
		 * We will call mir_svc_idle_start() whenever MIR_SVC_QUIESCED()
		 * is true.  When it is true, and we are in the process of
		 * closing the stream, signal any thread waiting in
		 * mir_close().
		 */
		if (mir->mir_inwservice == 0)
			cv_signal(&mir->mir_condvar);

	} else {
		RPCLOG(16, "mir_svc_idle_start - reset %s timer\n",
		    mir->mir_ordrel_pending ? "ordrel" : "normal");
		/*
		 * Normal condition, start the idle timer.  If an orderly
		 * release has been sent, set the timeout to wait for the
		 * client to close its side of the connection.  Otherwise,
		 * use the normal idle timeout.
		 */
		mir_timer_start(q, mir, mir->mir_ordrel_pending ?
		    svc_ordrel_timeout : mir->mir_idle_timeout);
	}
}

/* ARGSUSED */
static int
mir_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	mir_t	*mir;

	RPCLOG(32, "rpcmod: mir_open of q 0x%p\n", (void *)q);

	/* Set variables used directly by KRPC. */
	if (!mir_rele)
		mir_rele = mir_svc_release;
	if (!mir_start)
		mir_start = mir_svc_start;
	if (!clnt_stop_idle)
		clnt_stop_idle = mir_clnt_idle_do_stop;
	if (!clnt_max_msg_sizep)
		clnt_max_msg_sizep = &clnt_max_msg_size;
	if (!svc_max_msg_sizep)
		svc_max_msg_sizep = &svc_max_msg_size;

	/* Allocate a zero'ed out mir structure for this stream. */
	mir = kmem_zalloc(sizeof (mir_t), KM_SLEEP);

	/*
	 * We set hold inbound here so that incoming messages will
	 * be held on the read-side queue until the stream is completely
	 * initialized with a RPC_CLIENT or RPC_SERVER ioctl.  During
	 * the ioctl processing, the flag is cleared and any messages that
	 * arrived between the open and the ioctl are delivered to KRPC.
	 *
	 * Early data should never arrive on a client stream since
	 * servers only respond to our requests and we do not send any
	 * until after the stream is initialized.  Early data is
	 * very common on a server stream where the client will start
	 * sending data as soon as the connection is made (and this
	 * is especially true with TCP where the protocol accepts the
	 * connection before nfsd or KRPC is notified about it).
	 */
	mir->mir_hold_inbound = 1;

	/*
	 * Start the record marker looking for a 4-byte header.  When
	 * this length is negative, it indicates that rpcmod is looking
	 * for bytes to consume for the record marker header.  When it
	 * is positive, it holds the number of bytes that have arrived
	 * for the current fragment and are being held in mir_head_mp.
	 */
	mir->mir_frag_len = -(int32_t)sizeof (uint32_t);

	mir->mir_zoneid = rpc_zoneid();
	mutex_init(&mir->mir_mutex, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mir->mir_condvar, NULL, CV_DRIVER, NULL);
	cv_init(&mir->mir_timer_cv, NULL, CV_DRIVER, NULL);

	q->q_ptr = (char *)mir;
	WR(q)->q_ptr = (char *)mir;

	/*
	 * We noenable the read-side queue because we don't want it
	 * automatically enabled by putq.  We enable it explicitly
	 * in mir_wsrv when appropriate.  (See additional comments on
	 * flow control at the beginning of mir_rsrv.)
	 */
	noenable(q);

	qprocson(q);
	return (0);
}
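/*
 * Illustrative trace (not part of this module): starting from
 * mir_frag_len == -4, mir_rput() below consumes record-mark bytes one at
 * a time, shifting each into frag_header.  With mark bytes
 * 0x80 0x00 0x00 0x38 arriving, the state advances as:
 *
 *	byte consumed:	0x80	0x00	0x00		0x38
 *	frag_len:	-3	-2	-1		0
 *	frag_header:	0x80	0x8000	0x800000	0x80000038
 *
 * leaving frag_header announcing a last fragment (high bit set) of
 * 56 bytes.
 */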
/*
 * Read-side put routine for both the client and server side.  Does the
 * record marking for incoming RPC messages, and when complete, dispatches
 * the message to either the client or server.
 */
static void
mir_rput(queue_t *q, mblk_t *mp)
{
	int	excess;
	int32_t	frag_len, frag_header;
	mblk_t	*cont_mp, *head_mp, *tail_mp, *mp1;
	mir_t	*mir = q->q_ptr;
	boolean_t stop_timer = B_FALSE;

	ASSERT(mir != NULL);

	/*
	 * If the stream has not been set up as a RPC_CLIENT or RPC_SERVER
	 * with the corresponding ioctl, then don't accept
	 * any inbound data.  This should never happen for streams
	 * created by nfsd or client-side KRPC because they are careful
	 * to set the mode of the stream before doing anything else.
	 */
	if (mir->mir_type == 0) {
		freemsg(mp);
		return;
	}

	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));

	switch (mp->b_datap->db_type) {
	case M_DATA:
		break;
	case M_PROTO:
	case M_PCPROTO:
		if (MBLKL(mp) < sizeof (t_scalar_t)) {
			RPCLOG(1, "mir_rput: runt TPI message (%d bytes)\n",
			    (int)MBLKL(mp));
			freemsg(mp);
			return;
		}
		if (((union T_primitives *)mp->b_rptr)->type != T_DATA_IND) {
			mir_rput_proto(q, mp);
			return;
		}

		/* Throw away the T_DATA_IND block and continue with data. */
		mp1 = mp;
		mp = mp->b_cont;
		freeb(mp1);
		break;
	case M_SETOPTS:
		/*
		 * If a module on the stream is trying to set the Stream head's
		 * high water mark, then set our hiwater to the requested
		 * value.  We are the "stream head" for all inbound
		 * data messages since messages are passed directly to KRPC.
		 */
		if (MBLKL(mp) >= sizeof (struct stroptions)) {
			struct stroptions	*stropts;

			stropts = (struct stroptions *)mp->b_rptr;
			if ((stropts->so_flags & SO_HIWAT) &&
			    !(stropts->so_flags & SO_BAND)) {
				(void) strqset(q, QHIWAT, 0, stropts->so_hiwat);
			}
		}
		putnext(q, mp);
		return;
	case M_FLUSH:
		RPCLOG(32, "mir_rput: ignoring M_FLUSH %x ", *mp->b_rptr);
		RPCLOG(32, "on q 0x%p\n", (void *)q);
		putnext(q, mp);
		return;
	default:
		putnext(q, mp);
		return;
	}

	mutex_enter(&mir->mir_mutex);

	/*
	 * If this connection is closing, don't accept any new messages.
	 */
	if (mir->mir_svc_no_more_msgs) {
		ASSERT(mir->mir_type == RPC_SERVER);
		mutex_exit(&mir->mir_mutex);
		freemsg(mp);
		return;
	}

	/* Get local copies for quicker access. */
	frag_len = mir->mir_frag_len;
	frag_header = mir->mir_frag_header;
	head_mp = mir->mir_head_mp;
	tail_mp = mir->mir_tail_mp;

	/* Loop, processing each message block in the mp chain separately. */
	do {
		cont_mp = mp->b_cont;
		mp->b_cont = NULL;

		/*
		 * Drop zero-length mblks to prevent unbounded kernel memory
		 * consumption.
		 */
		if (MBLKL(mp) == 0) {
			freeb(mp);
			continue;
		}

		/*
		 * If frag_len is negative, we're still in the process of
		 * building frag_header -- try to complete it with this mblk.
		 */
		while (frag_len < 0 && mp->b_rptr < mp->b_wptr) {
			frag_len++;
			frag_header <<= 8;
			frag_header += *mp->b_rptr++;
		}

		if (MBLKL(mp) == 0 && frag_len < 0) {
			/*
			 * We consumed this mblk while trying to complete the
			 * fragment header.  Free it and move on.
			 */
			freeb(mp);
			continue;
		}

		ASSERT(frag_len >= 0);

		/*
		 * Now frag_header has the number of bytes in this fragment
		 * and we're just waiting to collect them all.  Chain our
		 * latest mblk onto the list and see if we now have enough
		 * bytes to complete the fragment.
		 */
		if (head_mp == NULL) {
			ASSERT(tail_mp == NULL);
			head_mp = tail_mp = mp;
		} else {
			tail_mp->b_cont = mp;
			tail_mp = mp;
		}

		frag_len += MBLKL(mp);
		excess = frag_len - (frag_header & ~MIR_LASTFRAG);
		if (excess < 0) {
			/*
			 * We still haven't received enough data to complete
			 * the fragment, so continue on to the next mblk.
			 */
			continue;
		}

		/*
		 * We've got a complete fragment.
		 * If there are excess bytes, then they're part of the next
		 * fragment's header (of either this RPC message or the next
		 * RPC message).  Split that part into its own mblk so that
		 * we can safely freeb() it when building frag_header above.
		 */
		if (excess > 0) {
			if ((mp1 = dupb(mp)) == NULL &&
			    (mp1 = copyb(mp)) == NULL) {
				freemsg(head_mp);
				freemsg(cont_mp);
				RPCLOG0(1, "mir_rput: dupb/copyb failed\n");
				mir->mir_frag_header = 0;
				mir->mir_frag_len =
				    -(int32_t)sizeof (uint32_t);
				mir->mir_head_mp = NULL;
				mir->mir_tail_mp = NULL;
				mir_disconnect(q, mir);	/* drops mir_mutex */
				return;
			}

			/*
			 * Relink the message chain so that the next mblk is
			 * the next fragment header, followed by the rest of
			 * the message chain.
			 */
			mp1->b_cont = cont_mp;
			cont_mp = mp1;

			/*
			 * Data in the new mblk begins at the next fragment,
			 * and data in the old mblk ends at the next fragment.
			 */
			mp1->b_rptr = mp1->b_wptr - excess;
			mp->b_wptr -= excess;
		}

		/*
		 * Reset frag_len and frag_header for the next fragment.
		 */
		frag_len = -(int32_t)sizeof (uint32_t);
		if (!(frag_header & MIR_LASTFRAG)) {
			/*
			 * The current fragment is complete, but more
			 * fragments need to be processed before we can
			 * pass along the RPC message headed at head_mp.
			 */
			frag_header = 0;
			continue;
		}
		frag_header = 0;

		/*
		 * We've got a complete RPC message; pass it to the
		 * appropriate consumer.
		 */
		switch (mir->mir_type) {
		case RPC_CLIENT:
			if (clnt_dispatch_notify(head_mp, mir->mir_zoneid)) {
				/*
				 * Mark this stream as active.  This marker
				 * is used in mir_timer().
				 */
				mir->mir_clntreq = 1;
				mir->mir_use_timestamp = ddi_get_lbolt();
			} else {
				freemsg(head_mp);
			}
			break;

		case RPC_SERVER:
			/*
			 * Check for flow control before passing the
			 * message to KRPC.
			 */
			if (!mir->mir_hold_inbound) {
				if (mir->mir_krpc_cell) {
					/*
					 * Test duplicate request cache and
					 * rm_ref count handling by sending a
					 * duplicate every so often, if
					 * desired.
					 */
					if (mir->mir_ref_cnt++ == 0)
						stop_timer = B_TRUE;
					if (mir_check_len(q,
					    (int32_t)msgdsize(mp), mp))
						return;
					svc_queuereq(q, head_mp); /* to KRPC */
				} else {
					/*
					 * Count # of times this happens. Should
					 * be never, but experience shows
					 * otherwise.
					 */
					mir_krpc_cell_null++;
					freemsg(head_mp);
				}
			} else {
				/*
				 * If the outbound side of the stream is
				 * flow controlled, then hold this message
				 * until client catches up. mir_hold_inbound
				 * is set in mir_wput and cleared in mir_wsrv.
				 */
				(void) putq(q, head_mp);
				mir->mir_inrservice = B_TRUE;
			}
			break;
		default:
			RPCLOG(1, "mir_rput: unknown mir_type %d\n",
			    mir->mir_type);
			freemsg(head_mp);
			break;
		}

		/*
		 * Reset the chain since we're starting on a new RPC message.
		 */
		head_mp = tail_mp = NULL;
	} while ((mp = cont_mp) != NULL);

	/*
	 * Sanity check the message length; if it's too large mir_check_len()
	 * will shutdown the connection, drop mir_mutex, and return non-zero.
	 */
	if (head_mp != NULL && mir->mir_setup_complete &&
	    mir_check_len(q, frag_len, head_mp))
		return;

	/* Save our local copies back in the mir structure. */
	mir->mir_frag_header = frag_header;
	mir->mir_frag_len = frag_len;
	mir->mir_head_mp = head_mp;
	mir->mir_tail_mp = tail_mp;
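/*
 * Illustrative example (not part of this module): suppose frag_header
 * announces a 56-byte fragment (0x80000038) but the mblk just chained in
 * brings frag_len to 60.  Then excess = 60 - 56 = 4, so the mblk is
 * duplicated and split: the original keeps the fragment's final bytes
 * (mp->b_wptr -= 4) while the duplicate keeps only the 4 excess bytes
 * (mp1->b_rptr = mp1->b_wptr - 4), which are the start of the next
 * record mark.
 */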
	/*
	 * The timer is stopped after the whole message chain is processed.
	 * The reason is that stopping the timer releases the mir_mutex
	 * lock temporarily.  This means that the request can be serviced
	 * while we are still processing the message chain.  This is not
	 * good.  So we stop the timer here instead.
	 *
	 * Note that if the timer fires before we stop it, it will not
	 * do any harm as MIR_SVC_QUIESCED() is false and mir_timer()
	 * will just return.
	 */
	if (stop_timer) {
		RPCLOG(16, "mir_rput: stopping idle timer on 0x%p because "
		    "ref cnt going to non zero\n", (void *)WR(q));
		mir_svc_idle_stop(WR(q), mir);
	}
	mutex_exit(&mir->mir_mutex);
}

static void
mir_rput_proto(queue_t *q, mblk_t *mp)
{
	mir_t	*mir = (mir_t *)q->q_ptr;
	uint32_t	type;
	uint32_t reason = 0;

	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));

	type = ((union T_primitives *)mp->b_rptr)->type;
	switch (mir->mir_type) {
	case RPC_CLIENT:
		switch (type) {
		case T_DISCON_IND:
			reason = ((struct T_discon_ind *)
			    (mp->b_rptr))->DISCON_reason;
			/*FALLTHROUGH*/
		case T_ORDREL_IND:
			mutex_enter(&mir->mir_mutex);
			if (mir->mir_head_mp) {
				freemsg(mir->mir_head_mp);
				mir->mir_head_mp = (mblk_t *)0;
				mir->mir_tail_mp = (mblk_t *)0;
			}
			/*
			 * We are disconnecting, but not necessarily
			 * closing. By not closing, we will fail to
			 * pick up a possibly changed global timeout value,
			 * unless we store it now.
			 */
			mir->mir_idle_timeout = clnt_idle_timeout;
			mir_clnt_idle_stop(WR(q), mir);

			/*
			 * Even though we are unconnected, we still
			 * leave the idle timer going on the client. The
			 * reason for this is that if we've disconnected due
			 * to a server-side disconnect, reset, or connection
			 * timeout, there is a possibility the client may
			 * retry the RPC request. This retry needs to be done
			 * on the same bound address for the server to
			 * interpret it as such. However, we don't want
			 * to wait forever for that possibility. If the
			 * end-point stays unconnected for mir_idle_timeout
			 * units of time, then that is a signal to the
			 * connection manager to give up waiting for the
			 * application (e.g. NFS) to send a retry.
			 */
			mir_clnt_idle_start(WR(q), mir);
			mutex_exit(&mir->mir_mutex);
			clnt_dispatch_notifyall(WR(q), type, reason);
			freemsg(mp);
			return;
		case T_ERROR_ACK:
		{
			struct T_error_ack	*terror;

			terror = (struct T_error_ack *)mp->b_rptr;
			RPCLOG(1, "mir_rput_proto T_ERROR_ACK for queue 0x%p",
			    (void *)q);
			RPCLOG(1, " ERROR_prim: %s,",
			    rpc_tpiprim2name(terror->ERROR_prim));
			RPCLOG(1, " TLI_error: %s,",
			    rpc_tpierr2name(terror->TLI_error));
			RPCLOG(1, " UNIX_error: %d\n", terror->UNIX_error);

			if (terror->ERROR_prim == T_DISCON_REQ) {
				clnt_dispatch_notifyall(WR(q), type, reason);
				freemsg(mp);
				return;
			} else {
				if (clnt_dispatch_notifyconn(WR(q), mp))
					return;
			}
			break;
		}
		case T_OK_ACK:
		{
			struct T_ok_ack	*tok = (struct T_ok_ack *)mp->b_rptr;

			if (tok->CORRECT_prim == T_DISCON_REQ) {
				clnt_dispatch_notifyall(WR(q), type, reason);
				freemsg(mp);
				return;
			} else {
				if (clnt_dispatch_notifyconn(WR(q), mp))
					return;
			}
			break;
		}
		case T_CONN_CON:
		case T_INFO_ACK:
		case T_OPTMGMT_ACK:
			if (clnt_dispatch_notifyconn(WR(q), mp))
				return;
			break;
		case T_BIND_ACK:
			break;
		default:
			RPCLOG(1, "mir_rput: unexpected message %d "
			    "for KRPC client\n",
			    ((union T_primitives *)mp->b_rptr)->type);
			break;
		}
		break;

	case RPC_SERVER:
		switch (type) {
		case T_BIND_ACK:
		{
			struct T_bind_ack	*tbind;

			/*
			 * If this is a listening stream, then shut
			 * off the idle timer.
			 */
			tbind = (struct T_bind_ack *)mp->b_rptr;
			if (tbind->CONIND_number > 0) {
				mutex_enter(&mir->mir_mutex);
				mir_svc_idle_stop(WR(q), mir);

				/*
				 * mark this as a listen endpoint
				 * for special handling.
				 */
				mir->mir_listen_stream = 1;
				mutex_exit(&mir->mir_mutex);
			}
			break;
		}
		case T_DISCON_IND:
		case T_ORDREL_IND:
			RPCLOG(16, "mir_rput_proto: got %s indication\n",
			    type == T_DISCON_IND ? "disconnect"
			    : "orderly release");

			/*
			 * For listen endpoint just pass
			 * on the message.
			 */
			if (mir->mir_listen_stream)
				break;

			mutex_enter(&mir->mir_mutex);

			/*
			 * If client wants to break off connection, record
			 * that fact.
			 */
			mir_svc_start_close(WR(q), mir);

			/*
			 * If we are idle, then send the orderly release
			 * or disconnect indication to nfsd.
			 */
			if (MIR_SVC_QUIESCED(mir)) {
				mutex_exit(&mir->mir_mutex);
				break;
			}

			RPCLOG(16, "mir_rput_proto: not idle, so "
			    "disconnect/ord rel indication not passed "
			    "upstream on 0x%p\n", (void *)q);

			/*
			 * Hold the indication until we get idle.
			 * If there already is an indication stored,
			 * replace it if the new one is a disconnect. The
			 * reasoning is that disconnection takes less time
			 * to process, and once a client decides to
			 * disconnect, we should do that.
			 */
			if (mir->mir_svc_pend_mp) {
				if (type == T_DISCON_IND) {
					RPCLOG(16, "mir_rput_proto: replacing"
					    " held disconnect/ord rel"
					    " indication with disconnect on"
					    " 0x%p\n", (void *)q);

					freemsg(mir->mir_svc_pend_mp);
					mir->mir_svc_pend_mp = mp;
				} else {
					RPCLOG(16, "mir_rput_proto: already "
					    "held a disconnect/ord rel "
					    "indication. freeing ord rel "
					    "ind on 0x%p\n", (void *)q);
					freemsg(mp);
				}
			} else
				mir->mir_svc_pend_mp = mp;

			mutex_exit(&mir->mir_mutex);
			return;

		default:
			/* nfsd handles server-side non-data messages. */
			break;
		}
		break;

	default:
		break;
	}

	putnext(q, mp);
}

/*
 * The server-side read queues are used to hold inbound messages while
 * outbound flow control is exerted.  When outbound flow control is
 * relieved, mir_wsrv qenables the read-side queue.  Read-side queues
 * are not enabled by STREAMS and are explicitly noenable'ed in mir_open.
 *
 * For the server side, we have two types of messages queued. The first type
 * are messages that are ready to be XDR decoded and then sent to the
 * RPC program's dispatch routine. The second type are "raw" messages that
 * haven't been processed, i.e. assembled from rpc record fragments into
 * full requests. The only time we will see the second type of message
 * queued is if we have a memory allocation failure while processing a
 * raw message. The field mir_first_non_processed_mblk will mark the
 * first such raw message. So the flow for server side is:
 *
 *	- send processed queued messages to kRPC until we run out or find
 *	  one that needs additional processing because we were short on memory
 *	  earlier
 *	- process a message that was deferred because of lack of memory
 *	- continue processing messages until the queue empties or we
 *	  have to stop because of lack of memory
 *	- during each of the above phases, if the queue is empty and
 *	  there are no pending messages that were passed to the RPC
 *	  layer, send upstream the pending disconnect/ordrel indication if
 *	  there is one
 *
 * The read-side queue is also enabled by a bufcall callback if dupmsg
 * fails in mir_rput.
 */
static void
mir_rsrv(queue_t *q)
{
	mir_t	*mir;
	mblk_t	*mp;
	mblk_t	*cmp = NULL;
	boolean_t stop_timer = B_FALSE;

	mir = (mir_t *)q->q_ptr;
	mutex_enter(&mir->mir_mutex);

	mp = NULL;
	switch (mir->mir_type) {
	case RPC_SERVER:
		if (mir->mir_ref_cnt == 0)
			mir->mir_hold_inbound = 0;
		if (mir->mir_hold_inbound) {

			ASSERT(cmp == NULL);
			if (q->q_first == NULL) {

				MIR_CLEAR_INRSRV(mir);

				if (MIR_SVC_QUIESCED(mir)) {
					cmp = mir->mir_svc_pend_mp;
					mir->mir_svc_pend_mp = NULL;
				}
			}

			mutex_exit(&mir->mir_mutex);

			if (cmp != NULL) {
				RPCLOG(16, "mir_rsrv: line %d: sending a held "
				    "disconnect/ord rel indication upstream\n",
				    __LINE__);
				putnext(q, cmp);
			}

			return;
		}
		while (mp = getq(q)) {
			if (mir->mir_krpc_cell &&
			    (mir->mir_svc_no_more_msgs == 0)) {
				/*
				 * If we were idle, turn off idle timer since
				 * we aren't idle any more.
				 */
				if (mir->mir_ref_cnt++ == 0)
					stop_timer = B_TRUE;
				if (mir_check_len(q,
				    (int32_t)msgdsize(mp), mp))
					return;
				svc_queuereq(q, mp);
			} else {
				/*
				 * Count # of times this happens. Should be
				 * never, but experience shows otherwise.
				 */
				if (mir->mir_krpc_cell == NULL)
					mir_krpc_cell_null++;
				freemsg(mp);
			}
		}
		break;
	case RPC_CLIENT:
		break;
	default:
		RPCLOG(1, "mir_rsrv: unexpected mir_type %d\n", mir->mir_type);

		if (q->q_first == NULL)
			MIR_CLEAR_INRSRV(mir);

		mutex_exit(&mir->mir_mutex);

		return;
	}

	/*
	 * The timer is stopped after all the messages are processed.
	 * The reason is that stopping the timer releases the mir_mutex
	 * lock temporarily.  This means that the request can be serviced
	 * while we are still processing the message queue.  This is not
	 * good.  So we stop the timer here instead.
	 */
	if (stop_timer) {
		RPCLOG(16, "mir_rsrv stopping idle timer on 0x%p because ref "
		    "cnt going to non zero\n", (void *)WR(q));
		mir_svc_idle_stop(WR(q), mir);
	}

	if (q->q_first == NULL) {

		MIR_CLEAR_INRSRV(mir);

		ASSERT(cmp == NULL);
		if (mir->mir_type == RPC_SERVER && MIR_SVC_QUIESCED(mir)) {
			cmp = mir->mir_svc_pend_mp;
			mir->mir_svc_pend_mp = NULL;
		}

		mutex_exit(&mir->mir_mutex);

		if (cmp != NULL) {
			RPCLOG(16, "mir_rsrv: line %d: sending a held "
			    "disconnect/ord rel indication upstream\n",
			    __LINE__);
			putnext(q, cmp);
		}

		return;
	}
	mutex_exit(&mir->mir_mutex);
}

static int mir_svc_policy_fails;

/*
 * Called to send an event code to nfsd/lockd so that it initiates
 * connection close.
 */
static int
mir_svc_policy_notify(queue_t *q, int event)
{
	mblk_t	*mp;
#ifdef DEBUG
	mir_t *mir = (mir_t *)q->q_ptr;
	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
#endif
	ASSERT(q->q_flag & QREADR);

	/*
	 * Create an M_DATA message with the event code and pass it to the
	 * Stream head (nfsd or whoever created the stream will consume it).
	 */
	mp = allocb(sizeof (int), BPRI_HI);

	if (!mp) {

		mir_svc_policy_fails++;
		RPCLOG(16, "mir_svc_policy_notify: could not allocate event "
		    "%d\n", event);
		return (ENOMEM);
	}

	U32_TO_BE32(event, mp->b_rptr);
	mp->b_wptr = mp->b_rptr + sizeof (int);
	putnext(q, mp);
	return (0);
}

/*
 * Server side: start the close phase.  We want to get this rpcmod slot in an
 * idle state before mir_close() is called.
 */
static void
mir_svc_start_close(queue_t *wq, mir_t *mir)
{
	ASSERT(MUTEX_HELD(&mir->mir_mutex));
	ASSERT((wq->q_flag & QREADR) == 0);
	ASSERT(mir->mir_type == RPC_SERVER);

	/*
	 * Do not accept any more messages.
	 */
	mir->mir_svc_no_more_msgs = 1;

	/*
	 * Next two statements will make the read service procedure invoke
	 * svc_queuereq() on everything stuck in the streams read queue.
	 * It's not necessary because enabling the write queue will
	 * have the same effect, but why not speed the process along?
	 */
	mir->mir_hold_inbound = 0;
	qenable(RD(wq));

	/*
	 * Meanwhile force the write service procedure to send the
	 * responses downstream, regardless of flow control.
	 */
	qenable(wq);
}
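/*
 * Illustrative sketch (not part of this module): the event sent by
 * mir_svc_policy_notify() arrives at the stream head as a plain M_DATA
 * message carrying one big-endian 32-bit word.  A hypothetical user-level
 * daemon that created the stream could consume it along these lines
 * (reader shown only for orientation, not an actual nfsd/lockd interface):
 *
 *	uint32_t code;
 *	if (read(fd, &code, sizeof (code)) == sizeof (code))
 *		handle_event(ntohl(code));
 */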
/*
 * This routine is called directly by KRPC after a request is completed,
 * whether a reply was sent or the request was dropped.
 */
static void
mir_svc_release(queue_t *wq, mblk_t *mp)
{
	mir_t	*mir = (mir_t *)wq->q_ptr;
	mblk_t	*cmp = NULL;

	ASSERT((wq->q_flag & QREADR) == 0);
	if (mp)
		freemsg(mp);

	mutex_enter(&mir->mir_mutex);

	/*
	 * Grab any held disconnect/ord rel indication if this is the last
	 * reference and the read side is quiesced.
	 */
	if ((mir->mir_ref_cnt == 1) && (mir->mir_inrservice == 0)) {
		cmp = mir->mir_svc_pend_mp;
		mir->mir_svc_pend_mp = NULL;
	}

	if (cmp) {
		RPCLOG(16, "mir_svc_release: sending a held "
		    "disconnect/ord rel indication upstream on queue 0x%p\n",
		    (void *)RD(wq));

		mutex_exit(&mir->mir_mutex);

		putnext(RD(wq), cmp);

		mutex_enter(&mir->mir_mutex);
	}

	/*
	 * Start idle processing if this is the last reference.
	 */
	if (mir->mir_ref_cnt == 1 && mir->mir_inrservice == 0) {
		RPCLOG(16, "mir_svc_release starting idle timer on 0x%p "
		    "because ref cnt is zero\n", (void *) wq);
		mir_svc_idle_start(wq, mir);
	}

	mir->mir_ref_cnt--;
	ASSERT(mir->mir_ref_cnt >= 0);

	/*
	 * Wake up the thread waiting to close.
	 */
	if ((mir->mir_ref_cnt == 0) && mir->mir_closing)
		cv_signal(&mir->mir_condvar);

	mutex_exit(&mir->mir_mutex);
}

/*
 * This routine is called by server-side KRPC when it is ready to
 * handle inbound messages on the stream.
 */
static void
mir_svc_start(queue_t *wq)
{
	mir_t	*mir = (mir_t *)wq->q_ptr;

	/*
	 * no longer need to take the mir_mutex because the
	 * mir_setup_complete field has been moved out of
	 * the binary field protected by the mir_mutex.
	 */

	mir->mir_setup_complete = 1;
	qenable(RD(wq));
}

/*
 * client side wrapper for stopping timer with normal idle timeout.
 */
static void
mir_clnt_idle_stop(queue_t *wq, mir_t *mir)
{
	ASSERT(MUTEX_HELD(&mir->mir_mutex));
	ASSERT((wq->q_flag & QREADR) == 0);
	ASSERT(mir->mir_type == RPC_CLIENT);

	mir_timer_stop(mir);
}

/*
 * client side wrapper for starting timer with normal idle timeout.
 */
static void
mir_clnt_idle_start(queue_t *wq, mir_t *mir)
{
	ASSERT(MUTEX_HELD(&mir->mir_mutex));
	ASSERT((wq->q_flag & QREADR) == 0);
	ASSERT(mir->mir_type == RPC_CLIENT);

	mir_timer_start(wq, mir, mir->mir_idle_timeout);
}

/*
 * client side only. Forces rpcmod to stop sending T_ORDREL_REQs on
 * end-points that aren't connected.
 */
static void
mir_clnt_idle_do_stop(queue_t *wq)
{
	mir_t	*mir = (mir_t *)wq->q_ptr;

	RPCLOG(1, "mir_clnt_idle_do_stop: wq 0x%p\n", (void *)wq);
	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
	mutex_enter(&mir->mir_mutex);
	mir_clnt_idle_stop(wq, mir);
	mutex_exit(&mir->mir_mutex);
}

/*
 * Timer handler.  It handles idle timeout and memory shortage problem.
 */
static void
mir_timer(void *arg)
{
	queue_t *wq = (queue_t *)arg;
	mir_t *mir = (mir_t *)wq->q_ptr;
	boolean_t notify;
	clock_t now;

	mutex_enter(&mir->mir_mutex);

	/*
	 * mir_timer_call is set only when either mir_timer_[start|stop]
	 * is progressing.  And mir_timer() can only be run while they
	 * are progressing if the timer is being stopped.  So just
	 * return.
	 */
	if (mir->mir_timer_call) {
		mutex_exit(&mir->mir_mutex);
		return;
	}
	mir->mir_timer_id = 0;

	switch (mir->mir_type) {
	case RPC_CLIENT:

		/*
		 * For clients, the timer fires at clnt_idle_timeout
		 * intervals.  If the activity marker (mir_clntreq) is
		 * zero, then the stream has been idle since the last
		 * timer event and we notify KRPC.  If mir_clntreq is
		 * non-zero, then the stream is active and we just
		 * restart the timer for another interval.  mir_clntreq
		 * is set to 1 in mir_wput for every request passed
		 * downstream.
		 *
		 * If this was a memory shortage timer reset the idle
		 * timeout regardless; the mir_clntreq will not be a
		 * valid indicator.
		 *
		 * The timer is initially started in mir_wput during
		 * RPC_CLIENT ioctl processing.
		 *
		 * The timer interval can be changed for individual
		 * streams with the ND variable "mir_idle_timeout".
		 */
		now = ddi_get_lbolt();
		if (mir->mir_clntreq > 0 && mir->mir_use_timestamp +
		    MSEC_TO_TICK(mir->mir_idle_timeout) - now >= 0) {
			clock_t tout;

			tout = mir->mir_idle_timeout -
			    TICK_TO_MSEC(now - mir->mir_use_timestamp);
			if (tout < 0)
				tout = 1000;
#if 0
			printf("mir_timer[%d < %d + %d]: reset client timer "
			    "to %d (ms)\n", TICK_TO_MSEC(now),
			    TICK_TO_MSEC(mir->mir_use_timestamp),
			    mir->mir_idle_timeout, tout);
#endif
			mir->mir_clntreq = 0;
			mir_timer_start(wq, mir, tout);
			mutex_exit(&mir->mir_mutex);
			return;
		}
#if 0
		printf("mir_timer[%d]: doing client timeout\n", now / hz);
#endif
		/*
		 * We are disconnecting, but not necessarily
		 * closing. By not closing, we will fail to
		 * pick up a possibly changed global timeout value,
		 * unless we store it now.
		 */
		mir->mir_idle_timeout = clnt_idle_timeout;
		mir_clnt_idle_start(wq, mir);

		mutex_exit(&mir->mir_mutex);
		/*
		 * We pass T_ORDREL_REQ as an integer value
		 * to KRPC as the indication that the stream
		 * is idle.  This is not a T_ORDREL_REQ message,
		 * it is just a convenient value since we call
		 * the same KRPC routine for T_ORDREL_INDs and
		 * T_DISCON_INDs.
		 */
		clnt_dispatch_notifyall(wq, T_ORDREL_REQ, 0);
		return;

	case RPC_SERVER:

		/*
		 * For servers, the timer is only running when the stream
		 * is really idle or memory is short.  The timer is started
		 * by mir_wput when mir_type is set to RPC_SERVER and
		 * by mir_svc_idle_start whenever the stream goes idle
		 * (mir_ref_cnt == 0).  The timer is cancelled in
		 * mir_rput whenever a new inbound request is passed to KRPC
		 * and the stream was previously idle.
		 *
		 * The timer interval can be changed for individual
		 * streams with the ND variable "mir_idle_timeout".
		 *
		 * If the stream is not idle do nothing.
		 */
		if (!MIR_SVC_QUIESCED(mir)) {
			mutex_exit(&mir->mir_mutex);
			return;
		}

		notify = !mir->mir_inrservice;
		mutex_exit(&mir->mir_mutex);

		/*
		 * If there is no packet queued up in read queue, the stream
		 * is really idle so notify nfsd to close it.
		 */
		if (notify) {
			RPCLOG(16, "mir_timer: telling stream head listener "
			    "to close stream (0x%p)\n", (void *) RD(wq));
			(void) mir_svc_policy_notify(RD(wq), 1);
		}
		return;

	default:
		RPCLOG(1, "mir_timer: unexpected mir_type %d\n",
		    mir->mir_type);
		mutex_exit(&mir->mir_mutex);
		return;
	}
}
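/*
 * Illustrative example (not part of this module): if mir_idle_timeout is
 * 300000 ms and the stream was last used 120 seconds ago, the client arm
 * above computes
 *
 *	tout = 300000 - TICK_TO_MSEC(now - mir_use_timestamp)
 *	     = 300000 - 120000 = 180000 ms
 *
 * and re-arms the timer for the remaining 3 minutes instead of declaring
 * the stream idle.
 */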
/*
 * Called by the RPC package to send either a call or a return, or a
 * transport connection request.  Adds the record marking header.
 */
static void
mir_wput(queue_t *q, mblk_t *mp)
{
	uint_t	frag_header;
	mir_t	*mir = (mir_t *)q->q_ptr;
	uchar_t	*rptr = mp->b_rptr;

	if (!mir) {
		freemsg(mp);
		return;
	}

	if (mp->b_datap->db_type != M_DATA) {
		mir_wput_other(q, mp);
		return;
	}

	if (mir->mir_ordrel_pending == 1) {
		freemsg(mp);
		RPCLOG(16, "mir_wput wq 0x%p: got data after T_ORDREL_REQ\n",
		    (void *)q);
		return;
	}

	frag_header = (uint_t)DLEN(mp);
	frag_header |= MIR_LASTFRAG;

	/* Stick in the 4 byte record marking header. */
	if ((rptr - mp->b_datap->db_base) < sizeof (uint32_t) ||
	    !IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) {
		/*
		 * Since we know that M_DATA messages are created exclusively
		 * by KRPC, we expect that KRPC will leave room for our header
		 * and 4 byte align which is normal for XDR.
		 * If KRPC (or someone else) does not cooperate, then we
		 * just throw away the message.
		 */
		RPCLOG(1, "mir_wput: KRPC did not leave space for record "
		    "fragment header (%d bytes left)\n",
		    (int)(rptr - mp->b_datap->db_base));
		freemsg(mp);
		return;
	}
	rptr -= sizeof (uint32_t);
	*(uint32_t *)rptr = htonl(frag_header);
	mp->b_rptr = rptr;

	mutex_enter(&mir->mir_mutex);
	if (mir->mir_type == RPC_CLIENT) {
		/*
		 * For the client, set mir_clntreq to indicate that the
		 * connection is active.
		 */
		mir->mir_clntreq = 1;
		mir->mir_use_timestamp = ddi_get_lbolt();
	}

	/*
	 * If we haven't already queued some data and the downstream module
	 * can accept more data, send it on, otherwise we queue the message
	 * and take other actions depending on mir_type.
	 */
	if (!mir->mir_inwservice && MIR_WCANPUTNEXT(mir, q)) {
		mutex_exit(&mir->mir_mutex);

		/*
		 * Now we pass the RPC message downstream.
		 */
		putnext(q, mp);
		return;
	}

	switch (mir->mir_type) {
	case RPC_CLIENT:
		/*
		 * Check for a previous duplicate request on the
		 * queue.  If there is one, then we throw away
		 * the current message and let the previous one
		 * go through.  If we can't find a duplicate, then
		 * send this one.  This tap dance is an effort
		 * to reduce traffic and processing requirements
		 * under load conditions.
		 */
		if (mir_clnt_dup_request(q, mp)) {
			mutex_exit(&mir->mir_mutex);
			freemsg(mp);
			return;
		}
		break;
	case RPC_SERVER:
		/*
		 * Set mir_hold_inbound so that new inbound RPC
		 * messages will be held until the client catches
		 * up on the earlier replies.  This flag is cleared
		 * in mir_wsrv after flow control is relieved;
		 * the read-side queue is also enabled at that time.
		 */
		mir->mir_hold_inbound = 1;
		break;
	default:
		RPCLOG(1, "mir_wput: unexpected mir_type %d\n", mir->mir_type);
		break;
	}
	mir->mir_inwservice = 1;
	(void) putq(q, mp);
	mutex_exit(&mir->mir_mutex);
}
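/*
 * Illustrative example (not part of this module): for a 56-byte RPC
 * message, mir_wput() above computes
 * frag_header = 56 | MIR_LASTFRAG = 0x80000038 and writes it, in network
 * byte order, into the 4 bytes KRPC left in front of the data:
 *
 *	rptr -= sizeof (uint32_t);
 *	*(uint32_t *)rptr = htonl(0x80000038);
 */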
static void
mir_wput_other(queue_t *q, mblk_t *mp)
{
	mir_t	*mir = (mir_t *)q->q_ptr;
	struct iocblk	*iocp;
	uchar_t	*rptr = mp->b_rptr;
	bool_t	flush_in_svc = FALSE;

	ASSERT(MUTEX_NOT_HELD(&mir->mir_mutex));
	switch (mp->b_datap->db_type) {
	case M_IOCTL:
		iocp = (struct iocblk *)rptr;
		switch (iocp->ioc_cmd) {
		case RPC_CLIENT:
			mutex_enter(&mir->mir_mutex);
			if (mir->mir_type != 0 &&
			    mir->mir_type != iocp->ioc_cmd) {
			ioc_eperm:
				mutex_exit(&mir->mir_mutex);
				iocp->ioc_error = EPERM;
				iocp->ioc_count = 0;
				mp->b_datap->db_type = M_IOCACK;
				qreply(q, mp);
				return;
			}

			mir->mir_type = iocp->ioc_cmd;

			/*
			 * Clear mir_hold_inbound which was set to 1 by
			 * mir_open.  This flag is not used on client
			 * streams.
			 */
			mir->mir_hold_inbound = 0;
			mir->mir_max_msg_sizep = &clnt_max_msg_size;

			/*
			 * Start the idle timer.  See mir_timer() for more
			 * information on how client timers work.
			 */
			mir->mir_idle_timeout = clnt_idle_timeout;
			mir_clnt_idle_start(q, mir);
			mutex_exit(&mir->mir_mutex);

			mp->b_datap->db_type = M_IOCACK;
			qreply(q, mp);
			return;
		case RPC_SERVER:
			mutex_enter(&mir->mir_mutex);
			if (mir->mir_type != 0 &&
			    mir->mir_type != iocp->ioc_cmd)
				goto ioc_eperm;

			/*
			 * We don't clear mir_hold_inbound here because
			 * mir_hold_inbound is used in the flow control
			 * model.  If we cleared it here, then we'd commit
			 * a small violation of the model where the
			 * transport might immediately block downstream flow.
			 */
			mir->mir_type = iocp->ioc_cmd;
			mir->mir_max_msg_sizep = &svc_max_msg_size;

			/*
			 * Start the idle timer.  See mir_timer() for more
			 * information on how server timers work.
			 *
			 * Note that it is important to start the idle timer
			 * here so that connections time out even if we
			 * never receive any data on them.
			 */
			mir->mir_idle_timeout = svc_idle_timeout;
			RPCLOG(16, "mir_wput_other starting idle timer "
			    "on 0x%p because we got RPC_SERVER ioctl\n",
			    (void *)q);
			mir_svc_idle_start(q, mir);
			mutex_exit(&mir->mir_mutex);

			mp->b_datap->db_type = M_IOCACK;
			qreply(q, mp);
			return;
		default:
			break;
		}
		break;

	case M_PROTO:
		if (mir->mir_type == RPC_CLIENT) {
			/*
			 * We are likely being called from the context of a
			 * service procedure, so we need to enqueue.  However,
			 * enqueuing may put our message behind data
			 * messages, so flush the data first.
			 */
			flush_in_svc = TRUE;
		}
		if ((mp->b_wptr - rptr) < sizeof (uint32_t) ||
		    !IS_P2ALIGNED(rptr, sizeof (uint32_t)))
			break;

		switch (((union T_primitives *)rptr)->type) {
		case T_DATA_REQ:
			/* Don't pass T_DATA_REQ messages downstream. */
			freemsg(mp);
			return;
		case T_ORDREL_REQ:
			RPCLOG(8, "mir_wput_other wq 0x%p: got T_ORDREL_REQ\n",
			    (void *)q);
			mutex_enter(&mir->mir_mutex);
			if (mir->mir_type != RPC_SERVER) {
				/*
				 * We are likely being called from
				 * clnt_dispatch_notifyall().  Sending a
				 * T_ORDREL_REQ will result in some kind of
				 * _IND message being sent back up, which
				 * will trigger another call to
				 * clnt_dispatch_notifyall().  To keep the
				 * stack lean, queue this message.
				 */
				mir->mir_inwservice = 1;
				(void) putq(q, mp);
				mutex_exit(&mir->mir_mutex);
				return;
			}

			/*
			 * Mark the structure such that we don't accept any
			 * more requests from the client.  We could defer
			 * this until we actually send the orderly release
			 * request downstream, but all that does is delay
			 * the closing of this stream.
			 */
			RPCLOG(16, "mir_wput_other wq 0x%p: got T_ORDREL_REQ "
			    "so calling mir_svc_start_close\n", (void *)q);
			mir_svc_start_close(q, mir);

			/*
			 * If we have sent down a T_ORDREL_REQ, don't send
			 * any more.
			 */
			if (mir->mir_ordrel_pending) {
				freemsg(mp);
				mutex_exit(&mir->mir_mutex);
				return;
			}

			/*
			 * If the stream is not idle, then we hold the
			 * orderly release until it becomes idle.  This
			 * ensures that KRPC will be able to reply to
			 * all requests that we have passed to it.
			 *
			 * We also queue the request if there is data already
			 * queued, because we cannot allow the T_ORDREL_REQ
			 * to go before data.  When we had a separate reply
			 * count, this was not a problem, because the
			 * reply count was reconciled when mir_wsrv()
			 * completed.
			 */
			if (!MIR_SVC_QUIESCED(mir) ||
			    mir->mir_inwservice == 1) {
				mir->mir_inwservice = 1;
				(void) putq(q, mp);

				RPCLOG(16, "mir_wput_other: queuing "
				    "T_ORDREL_REQ on 0x%p\n", (void *)q);

				mutex_exit(&mir->mir_mutex);
				return;
			}

			/*
			 * Mark the structure so that we know we sent
			 * an orderly release request, and reset the idle
			 * timer.
			 */
			mir->mir_ordrel_pending = 1;

			RPCLOG(16, "mir_wput_other: calling "
			    "mir_svc_idle_start on 0x%p because we got "
			    "T_ORDREL_REQ\n", (void *)q);

			mir_svc_idle_start(q, mir);
			mutex_exit(&mir->mir_mutex);

			/*
			 * When we break, we will putnext the T_ORDREL_REQ.
			 */
			break;
		case T_CONN_REQ:
			mutex_enter(&mir->mir_mutex);
			if (mir->mir_head_mp != NULL) {
				freemsg(mir->mir_head_mp);
				mir->mir_head_mp = NULL;
				mir->mir_tail_mp = NULL;
			}
			mir->mir_frag_len = -(int32_t)sizeof (uint32_t);
			/*
			 * Restart the timer in case mir_clnt_idle_do_stop()
			 * was called.
			 */
			mir->mir_idle_timeout = clnt_idle_timeout;
			mir_clnt_idle_stop(q, mir);
			mir_clnt_idle_start(q, mir);
			mutex_exit(&mir->mir_mutex);
			break;
		default:
			/*
			 * T_DISCON_REQ is one of the interesting default
			 * cases here.  Ideally, an M_FLUSH is done before
			 * T_DISCON_REQ is done.  However, that is somewhat
			 * cumbersome for clnt_cots.c to do.  So we queue
			 * T_DISCON_REQ, and let the service procedure
			 * flush all M_DATA.
			 */
			break;
		}
		/* FALLTHROUGH */
	default:
		if (mp->b_datap->db_type >= QPCTL) {
			if (mp->b_datap->db_type == M_FLUSH) {
				if (mir->mir_type == RPC_CLIENT &&
				    *mp->b_rptr & FLUSHW) {
					RPCLOG(32, "mir_wput_other: "
					    "flushing wq 0x%p\n", (void *)q);
					if (*mp->b_rptr & FLUSHBAND) {
						flushband(q,
						    *(mp->b_rptr + 1),
						    FLUSHDATA);
					} else {
						flushq(q, FLUSHDATA);
					}
				} else {
					RPCLOG(32, "mir_wput_other: "
					    "ignoring M_FLUSH on wq 0x%p\n",
					    (void *)q);
				}
			}
			break;
		}

		mutex_enter(&mir->mir_mutex);
		if (mir->mir_inwservice == 0 && MIR_WCANPUTNEXT(mir, q)) {
			mutex_exit(&mir->mir_mutex);
			break;
		}
		mir->mir_inwservice = 1;
		mir->mir_inwflushdata = flush_in_svc;
		(void) putq(q, mp);
		mutex_exit(&mir->mir_mutex);
		qenable(q);

		return;
	}
	putnext(q, mp);
}
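/*
 * Illustrative sketch (not compiled): how a stream gets typed via the
 * RPC_CLIENT/RPC_SERVER ioctls handled in mir_wput_other() above.  In
 * practice kRPC issues these from inside the kernel; this user-level
 * rendering with I_PUSH/I_STR is only an approximation, and the fd
 * handling and helper name are hypothetical.
 */
#if 0
#include <stropts.h>

static int
make_server_stream(int fd)
{
	struct strioctl iocb;

	if (ioctl(fd, I_PUSH, "rpcmod") < 0)	/* push this module */
		return (-1);

	iocb.ic_cmd = RPC_SERVER;	/* arrives here as an M_IOCTL */
	iocb.ic_timout = 0;
	iocb.ic_len = 0;
	iocb.ic_dp = NULL;
	return (ioctl(fd, I_STR, &iocb));	/* M_IOCACK on success */
}
#endif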
static void
mir_wsrv(queue_t *q)
{
	mblk_t	*mp;
	mir_t	*mir;
	bool_t	flushdata;

	mir = (mir_t *)q->q_ptr;
	mutex_enter(&mir->mir_mutex);

	flushdata = mir->mir_inwflushdata;
	mir->mir_inwflushdata = 0;

	while ((mp = getq(q)) != NULL) {
		if (mp->b_datap->db_type == M_DATA) {
			/*
			 * Do not send any more data if we have sent
			 * a T_ORDREL_REQ.
			 */
			if (flushdata || mir->mir_ordrel_pending == 1) {
				freemsg(mp);
				continue;
			}

			/*
			 * Make sure that the stream can really handle more
			 * data.
			 */
			if (!MIR_WCANPUTNEXT(mir, q)) {
				(void) putbq(q, mp);
				mutex_exit(&mir->mir_mutex);
				return;
			}

			/*
			 * Now we pass the RPC message downstream.
			 */
			mutex_exit(&mir->mir_mutex);
			putnext(q, mp);
			mutex_enter(&mir->mir_mutex);
			continue;
		}

		/*
		 * This is not an RPC message; pass it downstream
		 * (ignoring flow control) if the server side is not sending
		 * a T_ORDREL_REQ downstream.
		 */
		if (mir->mir_type != RPC_SERVER ||
		    ((union T_primitives *)mp->b_rptr)->type !=
		    T_ORDREL_REQ) {
			mutex_exit(&mir->mir_mutex);
			putnext(q, mp);
			mutex_enter(&mir->mir_mutex);
			continue;
		}

		if (mir->mir_ordrel_pending == 1) {
			/*
			 * Don't send two T_ORDRELs.
			 */
			freemsg(mp);
			continue;
		}

		/*
		 * Mark the structure so that we know we sent an orderly
		 * release request.  We will check to see if the slot is
		 * idle at the end of this routine, and if so, reset the
		 * idle timer to handle orderly release timeouts.
		 */
		mir->mir_ordrel_pending = 1;
		RPCLOG(16, "mir_wsrv: sending ordrel req on q 0x%p\n",
		    (void *)q);
		/*
		 * Send the orderly release downstream.  If there are other
		 * pending replies we won't be able to send them.  However,
		 * the only reason we should send the orderly release is if
		 * we were idle, or if an unusual event occurred.
		 */
		mutex_exit(&mir->mir_mutex);
		putnext(q, mp);
		mutex_enter(&mir->mir_mutex);
	}

	if (q->q_first == NULL) {
		/*
		 * If we call mir_svc_idle_start() below, then
		 * clearing mir_inwservice here will also result in
		 * any thread waiting in mir_close() to be signaled.
		 */
		mir->mir_inwservice = 0;
	}

	if (mir->mir_type != RPC_SERVER) {
		mutex_exit(&mir->mir_mutex);
		return;
	}

	/*
	 * If idle, we call mir_svc_idle_start to start the timer (or wake
	 * up a close).  Also make sure not to start the idle timer on the
	 * listener stream; doing so can cause nfsd to send an orderly
	 * release command on the listener stream.
	 */
	if (MIR_SVC_QUIESCED(mir) && !(mir->mir_listen_stream)) {
		RPCLOG(16, "mir_wsrv: calling mir_svc_idle_start on 0x%p "
		    "because mir slot is idle\n", (void *)q);
		mir_svc_idle_start(q, mir);
	}

	/*
	 * If outbound flow control has been relieved, then allow new
	 * inbound requests to be processed.
	 */
	if (mir->mir_hold_inbound) {
		mir->mir_hold_inbound = 0;
		qenable(RD(q));
	}
	mutex_exit(&mir->mir_mutex);
}
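/*
 * Illustrative sketch (not compiled): the two halves of the server-side
 * flow control handshake implemented by mir_wput() and mir_wsrv() above,
 * reduced to their essentials.  The field names are real; the helper
 * itself is hypothetical and only juxtaposes the two steps.
 */
#if 0
static void
mir_flow_example(queue_t *wq, mir_t *mir, mblk_t *reply)
{
	/*
	 * mir_wput(): the reply cannot go downstream, so park it and
	 * hold new inbound requests until the backlog drains.
	 */
	mir->mir_hold_inbound = 1;
	(void) putq(wq, reply);

	/* mir_wsrv(): write queue drained, so resume inbound traffic. */
	mir->mir_hold_inbound = 0;
	qenable(RD(wq));	/* reschedule the read-side service */
}
#endif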
static void
mir_disconnect(queue_t *q, mir_t *mir)
{
	ASSERT(MUTEX_HELD(&mir->mir_mutex));

	switch (mir->mir_type) {
	case RPC_CLIENT:
		/*
		 * We are disconnecting, but not necessarily
		 * closing.  By not closing, we will fail to
		 * pick up a possibly changed global timeout value,
		 * unless we store it now.
		 */
		mir->mir_idle_timeout = clnt_idle_timeout;
		mir_clnt_idle_start(WR(q), mir);
		mutex_exit(&mir->mir_mutex);

		/*
		 * T_DISCON_REQ is passed to KRPC as an integer value
		 * (this is not a TPI message).  It is used as a
		 * convenient value to indicate a sanity check
		 * failure -- the same KRPC routine is also called
		 * for T_DISCON_INDs and T_ORDREL_INDs.
		 */
		clnt_dispatch_notifyall(WR(q), T_DISCON_REQ, 0);
		break;

	case RPC_SERVER:
		mir->mir_svc_no_more_msgs = 1;
		mir_svc_idle_stop(WR(q), mir);
		mutex_exit(&mir->mir_mutex);
		RPCLOG(16, "mir_disconnect: telling "
		    "stream head listener to disconnect stream "
		    "(0x%p)\n", (void *)q);
		(void) mir_svc_policy_notify(q, 2);
		break;

	default:
		mutex_exit(&mir->mir_mutex);
		break;
	}
}

/*
 * Sanity check the message length, and if it's too large, shut down the
 * connection.  Returns 1 if the connection is shut down; 0 otherwise.
 */
static int
mir_check_len(queue_t *q, int32_t frag_len, mblk_t *head_mp)
{
	mir_t	*mir = q->q_ptr;
	uint_t	maxsize = 0;

	if (mir->mir_max_msg_sizep != NULL)
		maxsize = *mir->mir_max_msg_sizep;

	if (maxsize == 0 || frag_len <= (int)maxsize)
		return (0);

	freemsg(head_mp);
	mir->mir_head_mp = NULL;
	mir->mir_tail_mp = NULL;
	mir->mir_frag_header = 0;
	mir->mir_frag_len = -(int32_t)sizeof (uint32_t);
	if (mir->mir_type != RPC_SERVER || mir->mir_setup_complete) {
		cmn_err(CE_NOTE,
		    "KRPC: record fragment from %s of size (%d) exceeds "
		    "maximum (%u). Disconnecting",
		    (mir->mir_type == RPC_CLIENT) ? "server" :
		    (mir->mir_type == RPC_SERVER) ? "client" : "test tool",
		    frag_len, maxsize);
	}

	mir_disconnect(q, mir);
	return (1);
}
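/*
 * Illustrative sketch (not compiled): the size policy that mir_check_len()
 * enforces, extracted into a standalone predicate.  A maximum of zero
 * means "no limit"; any larger fragment forces a disconnect.
 * frag_too_big() is a hypothetical helper name.
 */
#if 0
static int
frag_too_big(int32_t frag_len, uint_t maxsize)
{
	if (maxsize == 0)	/* zero disables the check entirely */
		return (0);
	return (frag_len > (int)maxsize);	/* 1 => disconnect stream */
}
#endif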