/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * STREAMS Buffering module
 *
 * This streams module collects incoming messages from modules below
 * it on the stream and buffers them up into a smaller number of
 * aggregated messages.  Its main purpose is to reduce overhead by
 * cutting down on the number of read (or getmsg) calls its client
 * user process makes.
 *  - only M_DATA is buffered.
 *  - multithreading assumes configured as D_MTQPAIR
 *  - packets are lost only if flag SB_NO_HEADER is clear and buffer
 *    allocation fails.
 *  - in order message transmission. This is enforced for messages other
 *    than high priority messages.
 *  - zero length messages on the read side are not passed up the
 *    stream but used internally for synchronization.
 * FLAGS:
 * - SB_NO_PROTO_CVT - no conversion of M_PROTO messages to M_DATA.
 *   (conversion is the default for backwards compatibility
 *    hence the negative logic).
 * - SB_NO_HEADER - no headers in buffered data.
 *   (adding headers is the default for backwards compatibility
 *    hence the negative logic).
 * - SB_DEFER_CHUNK - provides improved response time in question-answer
 *   applications. Buffering is not enabled until the second message
 *   is received on the read side within the sb_ticks interval.
 *   This option will often be used in combination with flag SB_SEND_ON_WRITE.
 * - SB_SEND_ON_WRITE - a write message results in any pending buffered read
 *   data being immediately sent upstream.
 * - SB_NO_DROPS - bufmod behaves transparently in flow control and propagates
 *   the blocked flow condition downstream. If this flag is clear (default)
 *   messages will be dropped if the upstream flow is blocked.
 */


#include	<sys/types.h>
#include	<sys/errno.h>
#include	<sys/debug.h>
#include	<sys/stropts.h>
#include	<sys/time.h>
#include	<sys/stream.h>
#include	<sys/conf.h>
#include	<sys/ddi.h>
#include	<sys/sunddi.h>
#include	<sys/kmem.h>
#include	<sys/strsun.h>
#include	<sys/bufmod.h>
#include	<sys/modctl.h>
#include	<sys/isa_defs.h>

/*
 * Per-Stream state information.
 *
 * If sb_ticks is negative, we don't deliver chunks until they're
 * full.  If it's zero, we deliver every packet as it arrives.  (In
 * this case we force sb_chunk to zero, to make the implementation
 * easier.)  Otherwise, sb_ticks gives the number of ticks in a
 * buffering interval.  The interval begins when a read-side data
 * message is received and a timeout is not active.  If sb_snap is
 * zero, no truncation of the msg is done.
 *
 * One of these is allocated per open in sbopen(), hung off q_ptr of
 * both queues of the pair, and freed again in sbclose().
 */
struct sb {
	queue_t	*sb_rq;		/* our rq */
	mblk_t	*sb_mp;		/* partial chunk */
	mblk_t  *sb_head;	/* pre-allocated space for the next header */
	mblk_t	*sb_tail;	/* first mblk of last message appended */
	uint_t	sb_mlen;	/* sb_mp length */
	uint_t	sb_mcount;	/* input msg count in sb_mp */
	uint_t	sb_chunk;	/* max chunk size */
	clock_t	sb_ticks;	/* timeout interval */
	timeout_id_t sb_timeoutid; /* qtimeout() id, 0 when none pending */
	uint_t	sb_drops;	/* cumulative # discarded msgs */
	uint_t	sb_snap;	/* snapshot length */
	uint_t	sb_flags;	/* SB_* option flags (set by SBIOCSFLAGS) */
	uint_t	sb_state;	/* state variable (SB_FRCVD bit) */
};

/*
 * Function prototypes.
 *
 * Every file-local routine is declared here so that the qinit tables
 * and the routines themselves may reference one another regardless of
 * definition order.
 */
static	int	sbopen(queue_t *, dev_t *, int, int, cred_t *);
static	int	sbclose(queue_t *, int, cred_t *);
static	void	sbwput(queue_t *, mblk_t *);
static	void	sbrput(queue_t *, mblk_t *);
static	void	sbrsrv(queue_t *);
static	void	sbioc(queue_t *, mblk_t *);
static	void	sbioctl(queue_t *, mblk_t *);
static	void	sbaddmsg(queue_t *, mblk_t *);
static	void	sbtick(void *);
static	void	sbclosechunk(struct sb *);
static	void	sbsendit(queue_t *, mblk_t *);

/*
 * Module information.  The same structure is referenced by both the
 * read-side (sb_rinit) and write-side (sb_winit) qinit tables.
 */
static struct module_info	sb_minfo = {
	21,		/* mi_idnum */
	"bufmod",	/* mi_idname */
	0,		/* mi_minpsz */
	INFPSZ,		/* mi_maxpsz */
	1,		/* mi_hiwat */
	0		/* mi_lowat */
};

/*
 * Read-side queue initialization: sbrput() is the put procedure,
 * sbrsrv() the service procedure, and the module open and close
 * entry points are registered here.
 */
static struct qinit	sb_rinit = {
	(int (*)())sbrput,	/* qi_putp */
	(int (*)())sbrsrv,	/* qi_srvp */
	sbopen,			/* qi_qopen */
	sbclose,		/* qi_qclose */
	NULL,			/* qi_qadmin */
	&sb_minfo,		/* qi_minfo */
	NULL			/* qi_mstat */
};

/*
 * Write-side queue initialization: only a put procedure (sbwput()) is
 * supplied; there is no write-side service procedure and no open or
 * close entry here.
 */
static struct qinit	sb_winit = {
	(int (*)())sbwput,	/* qi_putp */
	NULL,			/* qi_srvp */
	NULL,			/* qi_qopen */
	NULL,			/* qi_qclose */
	NULL,			/* qi_qadmin */
	&sb_minfo,		/* qi_minfo */
	NULL			/* qi_mstat */
};

/*
 * Stream table tying the read and write qinit tables together.
 * The two multiplexor slots are left NULL.
 */
static struct streamtab	sb_info = {
	&sb_rinit,	/* st_rdinit */
	&sb_winit,	/* st_wrinit */
	NULL,		/* st_muxrinit */
	NULL		/* st_muxwinit */
};


/*
 * This is the loadable module wrapper.
 *
 * It names the module "bufmod", points at its stream table and declares
 * the D_MTQPAIR | D_MP flags; D_MTQPAIR is the configuration the
 * multithreading notes at the top of this file assume.
 */

static struct fmodsw fsw = {
	"bufmod",
	&sb_info,
	D_MTQPAIR | D_MP
};

/*
 * Module linkage information for the kernel.
 *
 * Describes this module as a STREAMS module (mod_strmodops) with the
 * description string "streams buffer mod" and the fmodsw entry above.
 */

static struct modlstrmod modlstrmod = {
	&mod_strmodops, "streams buffer mod", &fsw
};

/*
 * Linkage list handed to mod_install(), mod_remove() and mod_info()
 * by _init(), _fini() and _info(): a single modlstrmod entry followed
 * by the NULL terminator.
 */
static struct modlinkage modlinkage = {
	MODREV_1, &modlstrmod, NULL
};


/*
 * Module load entry point.  Hands the linkage structure to
 * mod_install() and returns whatever it reports.
 */
int
_init(void)
{
	int	rv;

	rv = mod_install(&modlinkage);
	return (rv);
}

/*
 * Module unload entry point.  Hands the linkage structure to
 * mod_remove() and returns whatever it reports.
 */
int
_fini(void)
{
	int	rv;

	rv = mod_remove(&modlinkage);
	return (rv);
}

/*
 * Module information entry point.  Passes the caller's modinfo buffer
 * and the linkage structure to mod_info() and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	int	rv;

	rv = mod_info(&modlinkage, modinfop);
	return (rv);
}


/*
 * Module open routine.
 *
 * Only module opens (sflag == MODOPEN) are accepted; anything else
 * fails with EINVAL.  If the queue already carries per-Stream state the
 * open succeeds without doing anything.  Otherwise a struct sb is
 * allocated (sleeping if necessary), set to its defaults -- no timeout
 * interval (sb_ticks == -1), the default chunk size, nothing buffered,
 * no flags -- attached to both queues of the pair, and the queue
 * procedures are enabled.
 *
 * Returns 0 on success or EINVAL as described above.
 */
/* ARGSUSED */
static int
sbopen(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
{
	struct sb	*state;

	ASSERT(rq);

	if (sflag != MODOPEN)
		return (EINVAL);

	/* Re-open of an already initialized stream: nothing to do. */
	if (rq->q_ptr != NULL)
		return (0);

	state = kmem_alloc(sizeof (struct sb), KM_SLEEP);

	/* Identity and tunables. */
	state->sb_rq = rq;
	state->sb_chunk = SB_DFLT_CHUNK;
	state->sb_ticks = -1;
	state->sb_snap = 0;
	state->sb_flags = 0;
	state->sb_state = 0;

	/* No chunk is being accumulated yet. */
	state->sb_head = NULL;
	state->sb_mp = NULL;
	state->sb_tail = NULL;
	state->sb_mlen = 0;
	state->sb_mcount = 0;

	/* No timeout pending, nothing dropped so far. */
	state->sb_timeoutid = 0;
	state->sb_drops = 0;

	WR(rq)->q_ptr = state;
	rq->q_ptr = state;

	qprocson(rq);

	return (0);
}

/*
 * Module close routine.
 *
 * Turns the queue procedures off, cancels a pending chunk timeout,
 * discards (does not deliver) any partially built chunk, releases the
 * per-Stream structure and clears q_ptr on both queues.  Always
 * returns 0.
 */
/* ARGSUSED1 */
static int
sbclose(queue_t *rq, int flag, cred_t *credp)
{
	struct sb	*state = (struct sb *)rq->q_ptr;
	mblk_t		*chunk;

	ASSERT(state);

	qprocsoff(rq);

	/* Make sure sbtick() cannot fire after the state is gone. */
	if (state->sb_timeoutid != 0) {
		(void) quntimeout(rq, state->sb_timeoutid);
		state->sb_timeoutid = 0;
	}

	/* Throw away whatever was still being accumulated. */
	chunk = state->sb_mp;
	if (chunk != NULL) {
		freemsg(chunk);
		state->sb_head = NULL;
		state->sb_mp = NULL;
		state->sb_tail = NULL;
		state->sb_mlen = 0;
	}

	/* Release the per-Stream structure and detach it from the queues. */
	kmem_free((caddr_t)state, sizeof (struct sb));
	WR(rq)->q_ptr = NULL;
	rq->q_ptr = NULL;

	return (0);
}

/*
 * Stream head high and low water marks derived from a message size.
 *
 * the correction factor is introduced to compensate for
 * whatever assumptions the modules below have made about
 * how much traffic is flowing through the stream and the fact
 * that bufmod may be snipping messages with the sb_snap length.
 *
 * Both arguments are parenthesized in the expansion so that expression
 * arguments (e.g. "len + hdr") scale as a unit.
 */
#define	SNIT_HIWAT(msgsize, fudge)	((4 * (msgsize) * (fudge)) + 512)
#define	SNIT_LOWAT(msgsize, fudge)	((2 * (msgsize) * (fudge)) + 256)


/*
 * Complete a transparent SBIOC* ioctl when its M_IOCDATA message
 * arrives on the write side (see sbwput()).
 *
 * For the "get" commands there is nothing left to do but acknowledge.
 * For the "set" commands the copied-in argument is in mp->b_cont and is
 * applied to the per-Stream state in the same way sbioctl() applies the
 * non-transparent form: a negative timeval is refused with EINVAL, a
 * new chunk size or timeout closes the chunk in progress, and chunk /
 * snapshot changes try to send an M_SETOPTS upstream to adjust the
 * stream head water marks (skipped silently if allocb() fails).
 */
static void
sbioc(queue_t *wq, mblk_t *mp)
{
	struct sb	*state = (struct sb *)wq->q_ptr;
	struct iocblk	*ioc = (struct iocblk *)mp->b_rptr;
	struct stroptions *opts;
	clock_t		nticks;
	mblk_t		*optmp;
	uint_t		value;
	int		fudge;

	switch (ioc->ioc_cmd) {
	case SBIOCGCHUNK:
	case SBIOCGSNAP:
	case SBIOCGFLAGS:
	case SBIOCGTIME:
		/* The copyout has completed; just finish the ioctl. */
		miocack(wq, mp, 0, 0);
		return;

	case SBIOCSTIME:
#ifdef _SYSCALL32_IMPL
		if ((ioc->ioc_flag & IOC_MODELS) != IOC_NATIVE) {
			struct timeval32 *tv32;

			tv32 = (struct timeval32 *)mp->b_cont->b_rptr;
			if (tv32->tv_sec < 0 || tv32->tv_usec < 0) {
				miocnak(wq, mp, 0, EINVAL);
				return;
			}
			nticks = TIMEVAL_TO_TICK(tv32);
		} else
#endif /* _SYSCALL32_IMPL */
		{
			struct timeval *tv;

			tv = (struct timeval *)mp->b_cont->b_rptr;
			if (tv->tv_sec < 0 || tv->tv_usec < 0) {
				miocnak(wq, mp, 0, EINVAL);
				return;
			}
			nticks = TIMEVAL_TO_TICK(tv);
		}

		state->sb_ticks = nticks;
		/* A zero interval means "no buffering": force chunk to 0. */
		if (nticks == 0)
			state->sb_chunk = 0;
		miocack(wq, mp, 0, 0);
		sbclosechunk(state);
		return;

	case SBIOCSCHUNK:
		value = *(uint_t *)mp->b_cont->b_rptr;

		/*
		 * Adjust the hi/lo water marks on the stream head read
		 * queue.  Allocation failure is unlikely and is ignored.
		 */
		optmp = allocb(sizeof (struct stroptions), BPRI_MED);
		if (optmp != NULL) {
			optmp->b_datap->db_type = M_SETOPTS;
			optmp->b_wptr += sizeof (struct stroptions);
			opts = (struct stroptions *)optmp->b_rptr;
			opts->so_flags = SO_HIWAT | SO_LOWAT;
			opts->so_hiwat = SNIT_HIWAT(value, 1);
			opts->so_lowat = SNIT_LOWAT(value, 1);
			qreply(wq, optmp);
		}

		state->sb_chunk = value;
		miocack(wq, mp, 0, 0);
		sbclosechunk(state);
		return;

	case SBIOCSFLAGS:
		state->sb_flags = *(uint_t *)mp->b_cont->b_rptr;
		miocack(wq, mp, 0, 0);
		return;

	case SBIOCSSNAP:
		value = *(uint_t *)mp->b_cont->b_rptr;

		/*
		 * When chunking, the snapshot length has little bearing on
		 * the data rate onto the stream head, so the water marks
		 * are only adjusted when chunking is off.
		 */
		if (state->sb_chunk == 0) {
			optmp = allocb(sizeof (struct stroptions), BPRI_MED);
			if (optmp != NULL) {
				if (value <= 100)
					fudge = 4;
				else if (value <= 400)
					fudge = 2;
				else
					fudge = 1;

				optmp->b_datap->db_type = M_SETOPTS;
				optmp->b_wptr += sizeof (struct stroptions);
				opts = (struct stroptions *)optmp->b_rptr;
				opts->so_flags = SO_HIWAT | SO_LOWAT;
				opts->so_hiwat = SNIT_HIWAT(value, fudge);
				opts->so_lowat = SNIT_LOWAT(value, fudge);
				qreply(wq, optmp);
			}
		}

		state->sb_snap = value;
		miocack(wq, mp, 0, 0);
		return;

	default:
		/* sbwput() only passes the commands handled above. */
		ASSERT(0);
		return;
	}
}

/*
 * Write-side put procedure.  Its main task is to detect ioctls
 * for manipulating the buffering state and hand them to sbioctl
 * (M_IOCTL) or sbioc (M_IOCDATA completing a transparent ioctl).
 * Other message types are passed on through.
 *
 * If SB_SEND_ON_WRITE is set, any write-side message first causes the
 * read-side chunk in progress to be closed and sent upstream.
 *
 * M_IOCDATA is claimed only when cp_cmd is one of our SBIOC* commands:
 * a failed copy (cp_rval != 0) for one of them is simply freed, a
 * successful one is completed by sbioc().  M_IOCDATA for any other
 * command -- success or failure -- belongs to a module or driver below
 * and is passed downstream untouched so its owner can finish or clean
 * up its own ioctl.
 */
static void
sbwput(queue_t *wq, mblk_t *mp)
{
	struct	sb	*sbp = (struct sb *)wq->q_ptr;
	struct copyresp *resp;

	if (sbp->sb_flags & SB_SEND_ON_WRITE)
		sbclosechunk(sbp);

	switch (mp->b_datap->db_type) {
	case M_IOCTL:
		sbioctl(wq, mp);
		break;

	case M_IOCDATA:
		resp = (struct copyresp *)mp->b_rptr;

		switch (resp->cp_cmd) {
		case SBIOCSTIME:
		case SBIOCSCHUNK:
		case SBIOCSFLAGS:
		case SBIOCSSNAP:
		case SBIOCGTIME:
		case SBIOCGCHUNK:
		case SBIOCGSNAP:
		case SBIOCGFLAGS:
			if (resp->cp_rval) {
				/*
				 * Our own copy request failed:
				 * just free message on failure.
				 */
				freemsg(mp);
			} else {
				sbioc(wq, mp);
			}
			break;

		default:
			/* Not ours, whatever the outcome of the copy. */
			putnext(wq, mp);
			break;
		}
		break;

	default:
		putnext(wq, mp);
		break;
	}
}

/*
 * Read-side put procedure.  Groups incoming data messages into
 * aggregates according to the current buffering parameters and keeps
 * other ordinary messages in order relative to that data.
 *
 *  - M_PROTO is converted to M_DATA and buffered, unless
 *    SB_NO_PROTO_CVT is set, in which case the chunk in progress is
 *    closed and the message is sent up unchanged.
 *  - M_DATA is buffered via sbaddmsg().  With SB_DEFER_CHUNK set, the
 *    first message (SB_FRCVD clear) bypasses buffering and is sent up
 *    at once.  The chunk timeout is armed if an interval is configured
 *    and no timeout is pending.
 *  - M_FLUSH with FLUSHR cancels the timeout, discards the chunk in
 *    progress and flushes the queue; the message is always passed on.
 *  - A zero-length M_CTL is the timeout notification from sbtick() and
 *    closes the chunk; any other M_CTL is forwarded after the chunk.
 *  - Remaining types up to QPCTL are forwarded after closing the chunk;
 *    anything above QPCTL is passed straight through.
 */
static void
sbrput(queue_t *rq, mblk_t *mp)
{
	struct sb	*state = (struct sb *)rq->q_ptr;

	ASSERT(state);

	if (mp->b_datap->db_type == M_PROTO) {
		if (state->sb_flags & SB_NO_PROTO_CVT) {
			sbclosechunk(state);
			sbsendit(rq, mp);
			return;
		}
		/* Convert M_PROTO to M_DATA and treat it as data below. */
		mp->b_datap->db_type = M_DATA;
	}

	switch (mp->b_datap->db_type) {
	case M_DATA:
		if (!(state->sb_flags & SB_DEFER_CHUNK) ||
		    (state->sb_state & SB_FRCVD)) {
			sbaddmsg(rq, mp);
		} else {
			/* Deferred chunking: first message goes right up. */
			sbclosechunk(state);
			sbsendit(rq, mp);
			state->sb_state |= SB_FRCVD;
		}

		/* Start a buffering interval if none is running. */
		if (state->sb_ticks > 0 && !state->sb_timeoutid) {
			state->sb_timeoutid = qtimeout(state->sb_rq, sbtick,
			    state, state->sb_ticks);
		}
		break;

	case M_FLUSH:
		if (*mp->b_rptr & FLUSHR) {
			/*
			 * Cancel the timeout, drop the chunk being built
			 * and begin afresh.
			 */
			if (state->sb_timeoutid) {
				(void) quntimeout(state->sb_rq,
				    state->sb_timeoutid);
				state->sb_timeoutid = 0;
			}
			if (state->sb_mp != NULL) {
				freemsg(state->sb_mp);
				state->sb_head = NULL;
				state->sb_mp = NULL;
				state->sb_tail = NULL;
				state->sb_mlen = 0;
				state->sb_mcount = 0;
			}
			flushq(rq, FLUSHALL);
		}
		putnext(rq, mp);
		break;

	case M_CTL:
		if (MBLKL(mp) == 0) {
			/* Zero-length M_CTL: our timeout popped. */
			freemsg(mp);
			sbclosechunk(state);
			break;
		}
		sbclosechunk(state);
		sbsendit(rq, mp);
		break;

	default:
		if (mp->b_datap->db_type > QPCTL) {
			/* Note: out of band */
			putnext(rq, mp);
			break;
		}
		sbclosechunk(state);
		sbsendit(rq, mp);
		break;
	}
}

/*
 * Read service procedure.  Drains messages that sbsendit() queued,
 * passing each one upstream while the next queue can take it.
 */
/* ARGSUSED */
static void
sbrsrv(queue_t *rq)
{
	mblk_t	*next;

	/*
	 * A high priority message should never be on this queue, but if
	 * one turns up it is pushed through regardless of flow control so
	 * that we cannot loop forever putting it back.
	 */
	for (;;) {
		next = getq(rq);
		if (next == NULL)
			break;

		if (!canputnext(rq) && (next->b_datap->db_type <= QPCTL)) {
			/* should only get here if SB_NO_DROPS */
			(void) putbq(rq, next);
			return;
		}
		putnext(rq, next);
	}
}

/*
 * Handle write-side M_IOCTL messages.
 *
 * Each SBIOC* command comes in two forms:
 *  - transparent (ioc_count == TRANSPARENT): for "set" commands a copyin
 *    request is sent back upstream and the work is finished by sbioc()
 *    when the M_IOCDATA arrives; for "get" commands a data block holding
 *    the value is attached to a copyout request.
 *  - non-transparent: the argument is already in mp->b_cont; its length
 *    is verified with miocpullup() and the ioctl is acked or nak'ed here.
 *
 * Setting the timeout or chunk size closes the chunk in progress.
 * Setting the chunk size (or the snapshot length while not chunking)
 * also tries to send an M_SETOPTS upstream to adjust the stream head
 * water marks; failure to allocate that message is ignored.
 *
 * Commands that are not ours are passed downstream.
 */
static void
sbioctl(queue_t *wq, mblk_t *mp)
{
	struct	sb	*sbp = (struct sb *)wq->q_ptr;
	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
	struct	timeval	*t;
	clock_t	ticks;
	mblk_t	*mop;
	int	transparent = iocp->ioc_count;
	mblk_t	*datamp;
	int	error;

	switch (iocp->ioc_cmd) {
	case SBIOCSTIME:
		if (iocp->ioc_count == TRANSPARENT) {
			/* Request the timeval; sbioc() finishes the job. */
#ifdef _SYSCALL32_IMPL
			if ((iocp->ioc_flag & IOC_MODELS) != IOC_NATIVE) {
				mcopyin(mp, NULL, sizeof (struct timeval32),
				    NULL);
			} else
#endif /* _SYSCALL32_IMPL */
			{
				mcopyin(mp, NULL, sizeof (*t), NULL);
			}
			qreply(wq, mp);
		} else {
			/*
			 * Verify argument length.
			 */
#ifdef _SYSCALL32_IMPL
			if ((iocp->ioc_flag & IOC_MODELS) != IOC_NATIVE) {
				struct timeval32 *t32;

				error = miocpullup(mp,
				    sizeof (struct timeval32));
				if (error != 0) {
					miocnak(wq, mp, 0, error);
					break;
				}
				t32 = (struct timeval32 *)mp->b_cont->b_rptr;
				if (t32->tv_sec < 0 || t32->tv_usec < 0) {
					miocnak(wq, mp, 0, EINVAL);
					break;
				}
				ticks = TIMEVAL_TO_TICK(t32);
			} else
#endif /* _SYSCALL32_IMPL */
			{
				error = miocpullup(mp, sizeof (struct timeval));
				if (error != 0) {
					miocnak(wq, mp, 0, error);
					break;
				}

				t = (struct timeval *)mp->b_cont->b_rptr;
				if (t->tv_sec < 0 || t->tv_usec < 0) {
					miocnak(wq, mp, 0, EINVAL);
					break;
				}
				ticks = TIMEVAL_TO_TICK(t);
			}
			sbp->sb_ticks = ticks;
			/* Zero interval: deliver every packet as it arrives. */
			if (ticks == 0)
				sbp->sb_chunk = 0;
			miocack(wq, mp, 0, 0);
			sbclosechunk(sbp);
		}
		break;

	case SBIOCGTIME:
		/*
		 * Verify argument length.  Both data models share the
		 * single error check that follows the pullup.
		 */
		if (transparent != TRANSPARENT) {
#ifdef _SYSCALL32_IMPL
			if ((iocp->ioc_flag & IOC_MODELS) != IOC_NATIVE) {
				error = miocpullup(mp,
				    sizeof (struct timeval32));
			} else
#endif /* _SYSCALL32_IMPL */
			{
				error = miocpullup(mp, sizeof (struct timeval));
			}
			if (error != 0) {
				miocnak(wq, mp, 0, error);
				break;
			}
		}

		/*
		 * If infinite timeout, return range error
		 * for the ioctl.
		 */
		if (sbp->sb_ticks < 0) {
			miocnak(wq, mp, 0, ERANGE);
			break;
		}

#ifdef _SYSCALL32_IMPL
		if ((iocp->ioc_flag & IOC_MODELS) != IOC_NATIVE) {
			struct timeval32 *t32;

			if (transparent == TRANSPARENT) {
				datamp = allocb(sizeof (*t32), BPRI_MED);
				if (datamp == NULL) {
					miocnak(wq, mp, 0, EAGAIN);
					break;
				}
				mcopyout(mp, NULL, sizeof (*t32), NULL, datamp);
			}

			t32 = (struct timeval32 *)mp->b_cont->b_rptr;
			TICK_TO_TIMEVAL32(sbp->sb_ticks, t32);

			if (transparent == TRANSPARENT)
				qreply(wq, mp);
			else
				miocack(wq, mp, sizeof (*t32), 0);
		} else
#endif /* _SYSCALL32_IMPL */
		{
			if (transparent == TRANSPARENT) {
				datamp = allocb(sizeof (*t), BPRI_MED);
				if (datamp == NULL) {
					miocnak(wq, mp, 0, EAGAIN);
					break;
				}
				mcopyout(mp, NULL, sizeof (*t), NULL, datamp);
			}

			t = (struct timeval *)mp->b_cont->b_rptr;
			TICK_TO_TIMEVAL(sbp->sb_ticks, t);

			if (transparent == TRANSPARENT)
				qreply(wq, mp);
			else
				miocack(wq, mp, sizeof (*t), 0);
		}
		break;

	case SBIOCCTIME:
		/* Clear the timeout: chunks are delivered only when full. */
		sbp->sb_ticks = -1;
		miocack(wq, mp, 0, 0);
		break;

	case SBIOCSCHUNK:
		if (iocp->ioc_count == TRANSPARENT) {
			mcopyin(mp, NULL, sizeof (uint_t), NULL);
			qreply(wq, mp);
		} else {
			/*
			 * Verify argument length.
			 */
			error = miocpullup(mp, sizeof (uint_t));
			if (error != 0) {
				miocnak(wq, mp, 0, error);
				break;
			}

			/*
			 * set up hi/lo water marks on stream head read queue.
			 * unlikely to run out of resources. Fix at later date.
			 */
			if ((mop = allocb(sizeof (struct stroptions),
			    BPRI_MED)) != NULL) {
				struct stroptions *sop;
				uint_t chunk;

				chunk = *(uint_t *)mp->b_cont->b_rptr;
				mop->b_datap->db_type = M_SETOPTS;
				mop->b_wptr += sizeof (struct stroptions);
				sop = (struct stroptions *)mop->b_rptr;
				sop->so_flags = SO_HIWAT | SO_LOWAT;
				sop->so_hiwat = SNIT_HIWAT(chunk, 1);
				sop->so_lowat = SNIT_LOWAT(chunk, 1);
				qreply(wq, mop);
			}

			sbp->sb_chunk = *(uint_t *)mp->b_cont->b_rptr;
			miocack(wq, mp, 0, 0);
			sbclosechunk(sbp);
		}
		break;

	case SBIOCGCHUNK:
		/*
		 * Verify argument length.
		 */
		if (transparent != TRANSPARENT) {
			error = miocpullup(mp, sizeof (uint_t));
			if (error != 0) {
				miocnak(wq, mp, 0, error);
				break;
			}
		}

		if (transparent == TRANSPARENT) {
			datamp = allocb(sizeof (uint_t), BPRI_MED);
			if (datamp == NULL) {
				miocnak(wq, mp, 0, EAGAIN);
				break;
			}
			mcopyout(mp, NULL, sizeof (uint_t), NULL, datamp);
		}

		*(uint_t *)mp->b_cont->b_rptr = sbp->sb_chunk;

		if (transparent == TRANSPARENT)
			qreply(wq, mp);
		else
			miocack(wq, mp, sizeof (uint_t), 0);
		break;

	case SBIOCSSNAP:
		if (iocp->ioc_count == TRANSPARENT) {
			mcopyin(mp, NULL, sizeof (uint_t), NULL);
			qreply(wq, mp);
		} else {
			/*
			 * Verify argument length.
			 */
			error = miocpullup(mp, sizeof (uint_t));
			if (error != 0) {
				miocnak(wq, mp, 0, error);
				break;
			}

			/*
			 * if chunking dont worry about effects of
			 * snipping of message size on head flow control
			 * since it has a relatively small bearing on the
			 * data rate onto the stream head.
			 */
			if (!sbp->sb_chunk) {
				/*
				 * set up hi/lo water marks on stream
				 * head read queue.  unlikely to run out
				 * of resources. Fix at later date.
				 */
				if ((mop = allocb(sizeof (struct stroptions),
				    BPRI_MED)) != NULL) {
					struct stroptions *sop;
					uint_t snap;
					int fudge;

					snap = *(uint_t *)mp->b_cont->b_rptr;
					mop->b_datap->db_type = M_SETOPTS;
					mop->b_wptr += sizeof (*sop);
					sop = (struct stroptions *)mop->b_rptr;
					sop->so_flags = SO_HIWAT | SO_LOWAT;
					fudge = (snap <= 100) ? 4 :
					    (snap <= 400) ? 2 : 1;
					sop->so_hiwat = SNIT_HIWAT(snap, fudge);
					sop->so_lowat = SNIT_LOWAT(snap, fudge);
					qreply(wq, mop);
				}
			}

			sbp->sb_snap = *(uint_t *)mp->b_cont->b_rptr;

			miocack(wq, mp, 0, 0);
		}
		break;

	case SBIOCGSNAP:
		/*
		 * Verify argument length
		 */
		if (transparent != TRANSPARENT) {
			error = miocpullup(mp, sizeof (uint_t));
			if (error != 0) {
				miocnak(wq, mp, 0, error);
				break;
			}
		}

		if (transparent == TRANSPARENT) {
			datamp = allocb(sizeof (uint_t), BPRI_MED);
			if (datamp == NULL) {
				miocnak(wq, mp, 0, EAGAIN);
				break;
			}
			mcopyout(mp, NULL, sizeof (uint_t), NULL, datamp);
		}

		*(uint_t *)mp->b_cont->b_rptr = sbp->sb_snap;

		if (transparent == TRANSPARENT)
			qreply(wq, mp);
		else
			miocack(wq, mp, sizeof (uint_t), 0);
		break;

	case SBIOCSFLAGS:
		/*
		 * set the flags.
		 */
		if (iocp->ioc_count == TRANSPARENT) {
			mcopyin(mp, NULL, sizeof (uint_t), NULL);
			qreply(wq, mp);
		} else {
			error = miocpullup(mp, sizeof (uint_t));
			if (error != 0) {
				miocnak(wq, mp, 0, error);
				break;
			}
			sbp->sb_flags = *(uint_t *)mp->b_cont->b_rptr;
			miocack(wq, mp, 0, 0);
		}
		break;

	case SBIOCGFLAGS:
		/*
		 * Verify argument length
		 */
		if (transparent != TRANSPARENT) {
			error = miocpullup(mp, sizeof (uint_t));
			if (error != 0) {
				miocnak(wq, mp, 0, error);
				break;
			}
		}

		if (transparent == TRANSPARENT) {
			datamp = allocb(sizeof (uint_t), BPRI_MED);
			if (datamp == NULL) {
				miocnak(wq, mp, 0, EAGAIN);
				break;
			}
			mcopyout(mp, NULL, sizeof (uint_t), NULL, datamp);
		}

		*(uint_t *)mp->b_cont->b_rptr = sbp->sb_flags;

		if (transparent == TRANSPARENT)
			qreply(wq, mp);
		else
			miocack(wq, mp, sizeof (uint_t), 0);
		break;

	default:
		/* Not one of ours: let the modules below deal with it. */
		putnext(wq, mp);
		break;
	}
}

/*
 * Given a length l, calculate the amount of extra storage
 * required to round it up to the next multiple of the alignment a.
 * Yields 0 when l is already a multiple of a.
 */
#define	RoundUpAmt(l, a)	((l) % (a) ? (a) - ((l) % (a)) : 0)
/*
 * Calculate additional amount of space required for alignment.
 */
#define	Align(l)		RoundUpAmt(l, sizeof (ulong_t))
/*
 * Smallest possible message size when headers are enabled.
 * This is used to calculate whether a chunk is nearly full.
 * The expansion is parenthesized so the sum behaves as a single
 * operand wherever the macro is used.
 */
#define	SMALLEST_MESSAGE	(sizeof (struct sb_hdr) + _POINTER_ALIGNMENT)

/*
 * Process a read-side M_DATA message.
 *
 * If the currently accumulating chunk doesn't have enough room
 * for the message, close off the chunk, pass it upward, and start
 * a new one.  Then add the message to the current chunk, taking
 * account of the possibility that the message's size exceeds the
 * chunk size.
 *
 * If headers are enabled add an sb_hdr header and trailing alignment padding.
 *
 * To optimise performance the total number of msgbs should be kept
 * to a minimum. This is achieved by using any remaining space in message N
 * for both its own padding as well as the header of message N+1 if possible.
 * If there's insufficient space we allocate one message to hold this 'wrapper'.
 * (there's likely to be space beyond message N, since allocb would have
 * rounded up the required size to one of the dblk_sizes).
 *
 * Ownership: mp is always consumed -- it is linked into the chunk, sent
 * upstream via sbsendit(), or freed (and counted in sb_drops) when the
 * header for a new chunk cannot be allocated.
 */
static void
sbaddmsg(queue_t *rq, mblk_t *mp)
{
	struct sb	*sbp;
	struct timeval	t;
	struct sb_hdr	hp;
	mblk_t *wrapper;	/* padding for msg N, header for msg N+1 */
	mblk_t *last;		/* last mblk of current message */
	size_t wrapperlen;	/* length of header + padding */
	size_t origlen;		/* data length before truncation */
	size_t pad;		/* bytes required to align header */

	sbp = (struct sb *)rq->q_ptr;

	origlen = msgdsize(mp);

	/*
	 * Truncate the message.
	 * If snapping is off, the message is already short enough, or
	 * adjmsg() does not report success, the original length is used.
	 */
	if ((sbp->sb_snap > 0) && (origlen > sbp->sb_snap) &&
			(adjmsg(mp, -(origlen - sbp->sb_snap)) == 1))
		hp.sbh_totlen = hp.sbh_msglen = sbp->sb_snap;
	else
		hp.sbh_totlen = hp.sbh_msglen = origlen;

	if (sbp->sb_flags & SB_NO_HEADER) {

		/*
		 * Would the inclusion of this message overflow the current
		 * chunk? If so close the chunk off and start a new one.
		 */
		if ((hp.sbh_totlen + sbp->sb_mlen) > sbp->sb_chunk)
			sbclosechunk(sbp);
		/*
		 * First message too big for chunk - just send it up.
		 * This will always be true when we're not chunking.
		 */
		if (hp.sbh_totlen > sbp->sb_chunk) {
			sbsendit(rq, mp);
			return;
		}

		/*
		 * We now know that the msg will fit in the chunk.
		 * Link it onto the end of the chunk.
		 * Since linkb() walks the entire chain, we keep a pointer to
		 * the first mblk of the last msgb added and call linkb on
		 * that last message, rather than performing the
		 * O(n) linkb() operation on the whole chain.
		 * sb_head isn't needed in this SB_NO_HEADER mode.
		 */
		if (sbp->sb_mp)
			linkb(sbp->sb_tail, mp);
		else
			sbp->sb_mp = mp;

		sbp->sb_tail = mp;
		sbp->sb_mlen += hp.sbh_totlen;
		sbp->sb_mcount++;
	} else {
		/* Timestamp must be done immediately */
		uniqtime(&t);
		TIMEVAL_TO_TIMEVAL32(&hp.sbh_timestamp, &t);

		/*
		 * From here on sbh_totlen is the full footprint of this
		 * message in the chunk: data + header + alignment padding.
		 */
		pad = Align(hp.sbh_totlen);
		hp.sbh_totlen += sizeof (hp);
		hp.sbh_totlen += pad;

		/*
		 * Would the inclusion of this message overflow the current
		 * chunk? If so close the chunk off and start a new one.
		 */
		if ((hp.sbh_totlen + sbp->sb_mlen) > sbp->sb_chunk)
				sbclosechunk(sbp);

		/*
		 * sb_head is NULL whenever no chunk is in progress (it is
		 * cleared by sbclosechunk()); otherwise it points at the
		 * mblk with room reserved for this message's header.
		 */
		if (sbp->sb_head == NULL) {
			/* Allocate leading header of new chunk */
			sbp->sb_head = allocb(sizeof (hp), BPRI_MED);
			if (sbp->sb_head == NULL) {
				/*
				 * Memory allocation failure.
				 * This will need to be revisited
				 * since using certain flag combinations
				 * can result in messages being dropped
				 * silently.
				 */
				freemsg(mp);
				sbp->sb_drops++;
				return;
			}
			sbp->sb_mp = sbp->sb_head;
		}

		/*
		 * Copy header into message
		 */
		hp.sbh_drops = sbp->sb_drops;
		hp.sbh_origlen = origlen;
		(void) memcpy(sbp->sb_head->b_wptr, (char *)&hp, sizeof (hp));
		sbp->sb_head->b_wptr += sizeof (hp);

		ASSERT(sbp->sb_head->b_wptr <= sbp->sb_head->b_datap->db_lim);

		/*
		 * Join message to the chunk
		 */
		linkb(sbp->sb_head, mp);

		sbp->sb_mcount++;
		sbp->sb_mlen += hp.sbh_totlen;

		/*
		 * If the first message alone is too big for the chunk close
		 * the chunk now.
		 * If the next message would immediately cause the chunk to
		 * overflow we may as well close the chunk now. The next
		 * message is certain to be at least SMALLEST_MESSAGE size.
		 *
		 * NOTE(review): the test below uses this message's own
		 * footprint (hp.sbh_totlen), not the accumulated sb_mlen, so
		 * a chunk made nearly full by several smaller messages is
		 * not closed here but by the overflow check on the next
		 * arrival -- confirm whether that is intended.
		 */
		if (hp.sbh_totlen + SMALLEST_MESSAGE > sbp->sb_chunk) {
			sbclosechunk(sbp);
			return;
		}

		/*
		 * Find space for the wrapper. The wrapper consists of:
		 *
		 * 1) Padding for this message (this is to ensure each header
		 * begins on an 8 byte boundary in the userland buffer).
		 *
		 * 2) Space for the next message's header, in case the
		 * next message will fit in this chunk.
		 *
		 * It may be possible to append the wrapper to the last mblk
		 * of the message, but only if we 'own' the data. If the dblk
		 * has been shared through dupmsg() we mustn't alter it.
		 */

		wrapperlen = (sizeof (hp) + pad);

		/* Is there space for the wrapper beyond the message's data ? */
		for (last = mp; last->b_cont; last = last->b_cont)
			;

		if ((wrapperlen <= MBLKTAIL(last)) &&
			(last->b_datap->db_ref == 1)) {
			if (pad > 0) {
				/*
				 * Pad with zeroes to the next pointer boundary
				 * (we don't want to disclose kernel data to
				 * users), then advance wptr.
				 */
				(void) memset(last->b_wptr, 0, pad);
				last->b_wptr += pad;
			}
			/* Remember where to write the header information */
			sbp->sb_head = last;
		} else {
			/* Have to allocate additional space for the wrapper */
			wrapper = allocb(wrapperlen, BPRI_MED);
			if (wrapper == NULL) {
				/* No room for another header: ship it now. */
				sbclosechunk(sbp);
				return;
			}
			if (pad > 0) {
				/*
				 * Pad with zeroes (we don't want to disclose
				 * kernel data to users).
				 */
				(void) memset(wrapper->b_wptr, 0, pad);
				wrapper->b_wptr += pad;
			}
			/* Link the wrapper msg onto the end of the chunk */
			linkb(mp, wrapper);
			/* Remember to write the next header in this wrapper */
			sbp->sb_head = wrapper;
		}
	}
}

/*
 * Chunk timeout handler, scheduled with qtimeout().
 *
 * Rather than touching the chunk directly, it sends a zero-length M_CTL
 * message to the read queue so that the chunk is closed from sbrput(),
 * synchronized with any active module threads (open, close, wput, rput).
 * If the message cannot be sent the timeout is re-armed so the
 * notification is retried after another sb_ticks interval.
 */
static void
sbtick(void *arg)
{
	struct sb	*state = arg;
	queue_t		*readq;

	ASSERT(state);

	readq = state->sb_rq;

	/* This timeout has fired, so it is no longer pending. */
	state->sb_timeoutid = 0;

	if (putctl(readq, M_CTL) != 0)
		return;

	/* putctl() failed: try again later. */
	state->sb_timeoutid = qtimeout(readq, sbtick, state, state->sb_ticks);
}

/*
 * Finish the chunk currently being accumulated and hand it to
 * sbsendit() for delivery upstream, then reset the per-Stream state so
 * that the next data message starts a new chunk.
 *
 * Any pending chunk timeout is cancelled first.  When SB_DEFER_CHUNK is
 * set the SB_FRCVD state bit is cleared as well, so the next data
 * message is again treated as the first of an exchange.
 *
 * This routine is called both directly and as a result of the chunk
 * timeout expiring.  It is harmless to call with no chunk in progress.
 */
static void
sbclosechunk(struct sb *sbp)
{
	mblk_t	*chunk;

	ASSERT(sbp);

	if (sbp->sb_timeoutid) {
		(void) quntimeout(sbp->sb_rq, sbp->sb_timeoutid);
		sbp->sb_timeoutid = 0;
	}

	/* Deliver whatever has been collected so far, if anything. */
	chunk = sbp->sb_mp;
	if (chunk != NULL)
		sbsendit(sbp->sb_rq, chunk);

	/* Forget the old chunk; ready for new messages. */
	sbp->sb_head = NULL;
	sbp->sb_mp = NULL;
	sbp->sb_tail = NULL;
	sbp->sb_mlen = 0;
	sbp->sb_mcount = 0;

	if (sbp->sb_flags & SB_DEFER_CHUNK)
		sbp->sb_state &= ~SB_FRCVD;
}

/*
 * Deliver a message (a finished chunk or a single message) upstream,
 * honouring flow control.
 *
 * If the next queue can accept data the message is passed on, unless
 * earlier messages are still sitting on our own queue, in which case it
 * is queued behind them to preserve ordering.
 *
 * If upstream is flow-controlled the outcome depends on SB_NO_DROPS:
 * with the flag set the message is queued for sbrsrv(); otherwise it is
 * freed and sb_drops is advanced by sb_mcount, the message count of the
 * chunk being accumulated.
 */
static void
sbsendit(queue_t *rq, mblk_t *mp)
{
	struct sb	*state = (struct sb *)rq->q_ptr;

	if (canputnext(rq)) {
		if (qsize(rq) > 0) {
			/*
			 * Messages are already waiting on the queue; keep
			 * queueing so everything is processed in order.
			 * (should only get here if SB_NO_DROPS)
			 */
			(void) putq(rq, mp);
		} else {
			putnext(rq, mp);
		}
		return;
	}

	/* Upstream is blocked. */
	if (state->sb_flags & SB_NO_DROPS) {
		(void) putq(rq, mp);
	} else {
		freemsg(mp);
		state->sb_drops += state->sb_mcount;
	}
}