#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/fs/zev.h>
#include <sys/zev_callbacks.h>
#include <sys/zev_checksums.h>
#include <sys/zfs_znode.h>
#include <sys/time.h>
#include <sys/sa.h>
#include <sys/zap.h>
#include <sys/time.h>

/* byte offset of member m within struct s (pre-offsetof() idiom) */
#define	OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))

/* name of the default queue created at attach time */
#define ZEV_DEFAULT_QUEUE_NAME		"beaver"
/* minor 0 is the control device; event queues use the minors after it */
#define ZEV_CONTROL_DEVICE_MINOR	0
#define ZEV_MINOR_MIN			(ZEV_CONTROL_DEVICE_MINOR + 1)
#define ZEV_MINOR_MAX			(ZEV_MINOR_MIN + ZEV_MAX_QUEUES - 1)

/* state of one event queue; each queue is exposed as its own minor device */
typedef struct zev_queue {
	char			zq_name[ZEV_MAX_QUEUE_NAME_LEN+1];
	minor_t			zq_minor_number;
	dev_info_t		*zq_dip;
	struct pollhead		zq_pollhead;	/* per-queue poll() head */
	uint64_t		zq_bytes_read;
	uint64_t		zq_events_read;
	uint64_t		zq_bytes_discarded;
	uint64_t		zq_events_discarded;
	uint64_t		zq_bytes_total;
	uint64_t		zq_events_total;
	uint64_t		zq_wakeup_threshold;	/* poll wakeup level */
	uint16_t		zq_flags;		/* ZEV_FL_* flags */
	uint16_t		zq_need_wakeup;
	/* protected by zev_mutex */
	int			zq_refcnt;	/* see zev_queue_hold/release */
	uint64_t		zq_queue_len;	/* queued bytes */
	uint64_t		zq_queue_messages;	/* queued messages */
	uint64_t		zq_max_queue_len;
	zev_msg_t		*zq_oldest;	/* oldest unread msg (points into global list) */
	boolean_t		zq_busy;	/* device currently open */
	boolean_t		zq_to_be_removed;
	zev_statistics_t	zq_statistics;
	kcondvar_t		zq_condvar;	/* signalled when messages arrive */
} zev_queue_t;

/* soft state handle; one entry per minor number */
static void		*statep;
struct pollhead		zev_pollhead;

/* protects the global message list and all per-queue state */
kmutex_t		zev_mutex;
/* signalled whenever queue space becomes available */
kcondvar_t		zev_condvar;
/* serializes message producers; see comment in zev_queue_message() */
kmutex_t		zev_queue_msg_mutex;
/* protects the muted-pools list */
krwlock_t		zev_pool_list_rwlock;
static zev_statistics_t	zev_statistics;
static boolean_t	zev_attached;
static kmutex_t		zev_mark_id_mutex;
/* monotonically increasing id handed out by ZEV_IOC_MARK */
static uint64_t		zev_mark_id = 0;

/* sequence number stamped on every queued message */
static uint64_t		zev_msg_sequence_number = 0;
/* queue list, indexed by (minor number - ZEV_MINOR_MIN) */
static zev_queue_t	*zev_queues[ZEV_MAX_QUEUES];
static int		zev_queue_cnt = 0;

/* memory accounting, updated via ZEV_MEM_ADD/ZEV_MEM_SUB */
uint64_t	zev_memory_allocated = 0;
uint64_t	zev_memory_freed = 0;

/*
 * The longest potential message is from zev_zfs_mount() and
 * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
 *
 * Another candidate is zev_znode_rename_cb() and contains three inode
 * numbers and two filenames of up to MAXNAMELEN bytes each.
 */
#define ZEV_MAX_MESSAGE_LEN	4096

/* global message list; per-queue zq_oldest pointers point into it */
static zev_msg_t *zev_queue_head = NULL;
static zev_msg_t *zev_queue_tail = NULL;
static uint64_t zev_queue_len = 0;


/* entry in the singly-linked list of muted pool names */
typedef struct zev_pool_list_entry {
	struct zev_pool_list_entry	*next;
	char				name[MAXPATHLEN];
} zev_pool_list_entry_t;

static zev_pool_list_entry_t *zev_muted_pools_head = NULL;

/* run flag and handle of the periodic poll wakeup thread */
static volatile int zev_wakeup_thread_run = 1;
static kthread_t *zev_poll_wakeup_thread = NULL;

/*
 * Allocate sz bytes of kernel memory (sleeping allocation) and
 * account for it in the module's memory statistics.
 */
void *
zev_alloc(ssize_t sz)
{
	void *p;

	ZEV_MEM_ADD(sz);
	p = kmem_alloc(sz, KM_SLEEP);
	return (p);
}

/*
 * Allocate sz bytes of zeroed kernel memory (sleeping allocation)
 * and account for it in the module's memory statistics.
 */
void *
zev_zalloc(ssize_t sz)
{
	void *p;

	ZEV_MEM_ADD(sz);
	p = kmem_zalloc(sz, KM_SLEEP);
	return (p);
}

/*
 * Free memory obtained from zev_alloc()/zev_zalloc() and update the
 * module's memory accounting.  sz must match the allocation size
 * exactly (kmem_free requirement).
 *
 * Fix: removed a stray '\' line continuation left over from a macro
 * version of this code; it silently spliced the two statements into
 * one source line.
 */
void
zev_free(void *ptr, ssize_t sz)
{
	ZEV_MEM_SUB(sz);
	kmem_free(ptr, sz);
}

/*
 * AVL-style comparator: orders two queues by minor number.
 * Returns -1, 0 or 1.
 */
int
zev_queue_cmp(const void *a, const void *b)
{
	const zev_queue_t *lhs = a;
	const zev_queue_t *rhs = b;

	if (lhs->zq_minor_number == rhs->zq_minor_number)
		return 0;
	return (lhs->zq_minor_number < rhs->zq_minor_number) ? -1 : 1;
}

/*
 * Remove messages from the head of the global list that no
 * per-device queue references anymore.
 *
 * must be called with zev_mutex held
 */
void
zev_queue_trim(void)
{
	zev_msg_t *m;
	uint64_t oldest_message;
	zev_queue_t *q;
	int i;

	if (!zev_queue_tail)
		return;

	/* find the oldest sequence number any queue still needs */
	oldest_message = zev_queue_tail->seq + 1;  /* does not exist, yet. */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		if (!q->zq_oldest)
			continue;
		if (oldest_message > q->zq_oldest->seq)
			oldest_message = q->zq_oldest->seq;
	}

	/* remove msgs between oldest_message and zev_queue_head */
	while(zev_queue_head && (oldest_message > zev_queue_head->seq)) {
		m = zev_queue_head;
		zev_queue_head = m->next;
		if (zev_queue_head == NULL) {
			zev_queue_tail = NULL;
		} else {
			zev_queue_head->prev = NULL;
		}
		/* a message nobody ever read counts as discarded */
		if (m->read == 0) {
			zev_statistics.zev_bytes_discarded += m->size;
			zev_statistics.zev_cnt_discarded_events++;
		}
		zev_statistics.zev_queue_len -= m->size;
		zev_queue_len--;
		zev_free(m, sizeof(*m) + m->size);
	}
}

/* Take a reference on q.  Caller must hold zev_mutex. */
static void
zev_queue_hold(zev_queue_t *q)
{
	q->zq_refcnt += 1;
}

/*
 * Drop a reference on q.  When the last reference goes away the
 * queue is unlinked from the queue list and destroyed, unless it
 * is flagged persistent.
 *
 * must be called with zev_mutex held
 */
static void
zev_queue_release(zev_queue_t *q)
{
	q->zq_refcnt--;
	if (q->zq_refcnt > 0)
		return;

	/* an open device would still hold a reference */
	ASSERT(q->zq_busy == B_FALSE);

	/* persistent queues will not be removed */
	if ((q->zq_flags & ZEV_FL_PERSISTENT) != 0)
		return;

	/* remove queue from queue list */
	zev_queues[q->zq_minor_number - ZEV_MINOR_MIN] = NULL;

	/* discard messages that no queue references anymore */
	zev_queue_trim();

	cv_destroy(&q->zq_condvar);
	ddi_remove_minor_node(q->zq_dip, q->zq_name);
	/* the queue structure itself lives in the soft state */
	ddi_soft_state_free(statep, q->zq_minor_number);
	ZEV_MEM_SUB(sizeof(zev_queue_t));
	zev_queue_cnt--;
}

/*
 * Create a new event queue named "name" with its own minor device
 * node.  On success the queue is returned in *queue with one
 * reference held (the queue list's reference).
 *
 * Returns 0 on success, EINVAL for an invalid name or length,
 * ENOSPC when no minor number (or soft state) is available,
 * EEXIST when the name is already in use and EFAULT when the
 * minor node cannot be created.
 *
 * Fix: both error paths taken after cv_init() (duplicate name,
 * minor node creation failure) freed the soft state without
 * destroying the condvar first; cv_destroy() added.
 */
int
zev_queue_new(zev_queue_t **queue,
              dev_info_t *dip,
              char *name,
              uint64_t max_queue_len,
              uint16_t flags)
{
	zev_queue_t *q;
	zev_queue_t *tmp;
	zev_msg_t *msg;
	int name_exists = 0;
	minor_t minor;
	char *p;
	int i;

	if (max_queue_len > ZEV_MAX_QUEUE_LEN)
		return EINVAL;
	if (max_queue_len == 0)
		max_queue_len = ZEV_MAX_QUEUE_LEN;
	if (!strcmp(name, ZEV_CONTROL_DEVICE_NAME))
		return EINVAL;
	/* queue names may contain only lowercase letters, digits and '.' */
	for (p = name; *p; p++) {
		if (*p >= 'a' && *p <= 'z')
			continue;
		if (*p >= '0' && *p <= '9')
			continue;
		if (*p == '.')
			continue;
		return EINVAL;
	}

	mutex_enter(&zev_mutex);

	/* find free minor number.*/
	/* if this were a frequent operation we'd have a free-minor list */
	for (minor = ZEV_MINOR_MIN; minor <= ZEV_MINOR_MAX; minor++) {
		tmp = zev_queues[minor - ZEV_MINOR_MIN];
		if (tmp == NULL)
			break;
	}
	if (tmp) {
		/* no slot was free */
		mutex_exit(&zev_mutex);
		return ENOSPC;
	}

	if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) {
		mutex_exit(&zev_mutex);
		return ENOSPC;
	}
	ZEV_MEM_ADD(sizeof(zev_queue_t));

	q = ddi_get_soft_state(statep, minor);
	memset(q, 0, sizeof(*q));
	strncpy(q->zq_name, name, ZEV_MAX_QUEUE_NAME_LEN);
	q->zq_name[ZEV_MAX_QUEUE_NAME_LEN] = '\0';
	q->zq_max_queue_len = max_queue_len;
	q->zq_wakeup_threshold = ZEV_DEFAULT_POLL_WAKEUP_QUEUE_LEN;
	q->zq_flags = flags;
	q->zq_refcnt = 1;
	q->zq_dip = dip;
	q->zq_minor_number = minor;
	cv_init(&q->zq_condvar, NULL, CV_DRIVER, NULL);

	/* insert into queue list */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		/* if this were a frequent operation we'd have a name tree */
		if (zev_queues[i - ZEV_MINOR_MIN] == NULL)
			continue;
		if (!strcmp(q->zq_name, zev_queues[i-ZEV_MINOR_MIN]->zq_name)) {
			name_exists = 1;
			break;
		}
	}
	if (name_exists) {
		/* roll back; cv_init() was already done for this queue */
		cv_destroy(&q->zq_condvar);
		ddi_soft_state_free(statep, minor);
		ZEV_MEM_SUB(sizeof(zev_queue_t));
		mutex_exit(&zev_mutex);
		return EEXIST;
	}
	zev_queues[minor - ZEV_MINOR_MIN] = q;
	zev_queue_cnt++;

	/* calculate current queue len and find head and tail */
	q->zq_oldest = zev_queue_tail;
	msg = zev_queue_tail;
	while ((msg != NULL) && (q->zq_queue_len < q->zq_max_queue_len)) {
		q->zq_queue_len += msg->size;
		q->zq_queue_messages++;
		q->zq_oldest = msg;
		msg = msg->prev;
	}

	mutex_exit(&zev_mutex);

	if (ddi_create_minor_node(dip, name,
	    S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		mutex_enter(&zev_mutex);
		zev_queues[minor - ZEV_MINOR_MIN] = NULL;
		zev_queue_cnt--;
		/* roll back; destroy the condvar before freeing the state */
		cv_destroy(&q->zq_condvar);
		ddi_soft_state_free(statep, minor);
		ZEV_MEM_SUB(sizeof(zev_queue_t));
		mutex_exit(&zev_mutex);
		return EFAULT;
	}

	*queue = q;
	return 0;
}

/*
 * poll() wakeup thread.  Used to check periodically whether we have
 * bytes left in the queue that have not yet been made into a
 * pollwakeup() call.  This is meant to ensure a maximum waiting
 * time until an event is presented as a poll wakeup, while at
 * the same time not making every single event into a poll wakeup
 * of its own.
 */

/*
 * Deliver poll wakeups for queues with pending data.  With
 * flush_all set, every busy queue with queued bytes is woken;
 * otherwise only queues above their wakeup threshold.
 */
static void
zev_poll_wakeup(boolean_t flush_all)
{
	zev_queue_t *q;
	int i;

	/*
	 * This loop works with hold() and release() because
	 * pollwakeup() requires us to release our locks before calling it.
	 *
	 * from pollwakeup(9F):
	 *
	 *   "Driver defined locks should not be held across calls
	 *    to this function."
	 */

	/* wake up threads for each individual queue */
	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		if (!q->zq_busy)
			continue;
		if (!q->zq_queue_len)
			continue;
		if ((flush_all) ||
		    (q->zq_queue_len > q->zq_wakeup_threshold)) {
			/* the hold keeps q valid while the mutex is dropped */
			zev_queue_hold(q);
			mutex_exit(&zev_mutex);
			pollwakeup(&q->zq_pollhead, POLLIN);
			mutex_enter(&zev_mutex);
			zev_queue_release(q);
		}
	}
	mutex_exit(&zev_mutex);
}

/*
 * Body of the periodic poll-wakeup kernel thread: sleep 100ms,
 * flush all pending events to poll() waiters, repeat until told
 * to stop via zev_wakeup_thread_run.
 */
static void
zev_poll_wakeup_thread_main(void)
{
	for (;;) {
		if (!zev_wakeup_thread_run)
			break;
		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
		zev_poll_wakeup(B_TRUE);
	}
	thread_exit();
}

static int
zev_ioc_mute_pool(char *poolname)
{
	zev_pool_list_entry_t *pe;
	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	/* pool already muted? */
	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
		if (!strcmp(pe->name, poolname)) {
			rw_exit(&zev_pool_list_rwlock);
			return EEXIST;
		}
	}
	pe = zev_zalloc(sizeof(*pe));
	if (!pe) {
		rw_exit(&zev_pool_list_rwlock);
		return ENOMEM;
	}
	(void) strncpy(pe->name, poolname, sizeof(pe->name));
	pe->next = zev_muted_pools_head;
	zev_muted_pools_head = pe;
	rw_exit(&zev_pool_list_rwlock);
	return (0);
}

/*
 * Remove poolname from the list of muted pools.  Returns ENOENT if
 * the pool is not muted, 0 on success.
 *
 * Fix: the original tested "if (pe)" and returned ENOENT when the
 * pool WAS found — and then dereferenced the NULL "pe" when it was
 * not found.  The condition is now inverted.
 */
static int
zev_ioc_unmute_pool(char *poolname)
{
	zev_pool_list_entry_t *pe, *peprev;

	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	/* find the entry for this pool, remembering its predecessor */
	peprev = NULL;
	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
		if (!strcmp(pe->name, poolname))
			break;
		peprev = pe;
	}
	if (pe == NULL) {
		/* pool is not muted */
		rw_exit(&zev_pool_list_rwlock);
		return ENOENT;
	}

	/* unlink and free the entry */
	if (peprev != NULL) {
		peprev->next = pe->next;
	} else {
		zev_muted_pools_head = pe->next;
	}
	zev_free(pe, sizeof(*pe));
	rw_exit(&zev_pool_list_rwlock);
	return (0);
}

/*
 * Check whether events for the pool owning objset os are muted.
 * Returns 1 if events should be skipped, 0 otherwise.
 */
int
zev_skip_pool(objset_t *os)
{
	zev_pool_list_entry_t *entry;
	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
	int skip = 0;

	rw_enter(&zev_pool_list_rwlock, RW_READER);
	for (entry = zev_muted_pools_head; entry != NULL;
	    entry = entry->next) {
		if (strcmp(entry->name, dp->dp_spa->spa_name) == 0) {
			skip = 1;
			break;
		}
	}
	rw_exit(&zev_pool_list_rwlock);
	return skip;
}

/*
 * Bump the per-operation event counter in *stat for operation op.
 * Unknown op codes are silently ignored.
 */
static void
zev_update_statistics(int op, zev_statistics_t *stat)
{
	uint64_t *counter = NULL;

	switch (op) {
	case ZEV_OP_ERROR:
		counter = &stat->zev_cnt_errors;
		break;
	case ZEV_OP_MARK:
		counter = &stat->zev_cnt_marks;
		break;
	case ZEV_OP_ZFS_MOUNT:
		counter = &stat->zev_cnt_zfs_mount;
		break;
	case ZEV_OP_ZFS_UMOUNT:
		counter = &stat->zev_cnt_zfs_umount;
		break;
	case ZEV_OP_ZVOL_WRITE:
		counter = &stat->zev_cnt_zvol_write;
		break;
	case ZEV_OP_ZVOL_TRUNCATE:
		counter = &stat->zev_cnt_zvol_truncate;
		break;
	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
		counter = &stat->zev_cnt_znode_close_after_update;
		break;
	case ZEV_OP_ZNODE_CREATE:
		counter = &stat->zev_cnt_znode_create;
		break;
	case ZEV_OP_ZNODE_REMOVE:
		counter = &stat->zev_cnt_znode_remove;
		break;
	case ZEV_OP_ZNODE_LINK:
		counter = &stat->zev_cnt_znode_link;
		break;
	case ZEV_OP_ZNODE_SYMLINK:
		counter = &stat->zev_cnt_znode_symlink;
		break;
	case ZEV_OP_ZNODE_RENAME:
		counter = &stat->zev_cnt_znode_rename;
		break;
	case ZEV_OP_ZNODE_WRITE:
		counter = &stat->zev_cnt_znode_write;
		break;
	case ZEV_OP_ZNODE_TRUNCATE:
		counter = &stat->zev_cnt_znode_truncate;
		break;
	case ZEV_OP_ZNODE_SETATTR:
		counter = &stat->zev_cnt_znode_setattr;
		break;
	case ZEV_OP_ZNODE_ACL:
		counter = &stat->zev_cnt_znode_acl;
		break;
	default:
		break;
	}
	if (counter != NULL)
		(*counter)++;
}

/*
 * Append msg to the global event queue and account it to every
 * per-device queue.  Takes ownership of msg: it is either queued
 * or freed here.  May block when a queue is full and flagged
 * ZEV_FL_BLOCK_WHILE_QUEUE_FULL.
 *
 * Fix: when no queues exist (zev_queue_cnt == 0) the message was
 * dropped without being freed, leaking it; it is now freed like on
 * the unknown-op path.
 */
void
zev_queue_message(int op, zev_msg_t *msg)
{
	zev_queue_t *q;
	int wakeup = 0;
	zev_msg_t *m;
	int i;

	msg->next = NULL;
	msg->prev = NULL;
	msg->read = 0;

	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
		zev_queue_error(op, "unknown op id encountered: %d", op);
		zev_free(msg, sizeof(*msg) + msg->size);
		return;
	}

	/*
	 * This mutex protects us against race conditions when several
	 * threads want to queue a message and one or more queues are
	 * full:  we release zev_mutex to wait for the queues to become
	 * less-than-full, but we don't know in which order the waiting
	 * threads will be awoken.  If it's not the same order in which
	 * they went to sleep we might mark different messages as "newest"
	 * in different queues, and so we might have dupes or even
	 * skip messages.
	 */
	mutex_enter(&zev_queue_msg_mutex);

	mutex_enter(&zev_mutex);

	/*
	 * When the module is loaded, the default behavior is to
	 * put all events into a queue and block if the queue is full.
	 * This is done even before the pseudo device is attached.
	 * This way, no events are lost.
	 *
	 * To discard events entirely the "beaver" queue,
	 * which never discards anything, has to be removed.
	 */

	if (zev_queue_cnt == 0) {
		mutex_exit(&zev_mutex);
		mutex_exit(&zev_queue_msg_mutex);
		/* no queues: event is discarded, but must not leak */
		zev_free(msg, sizeof(*msg) + msg->size);
		return;
	}

	/* put message into global queue */
	msg->seq = zev_msg_sequence_number++;
	while (zev_statistics.zev_max_queue_len &&
	    zev_statistics.zev_queue_len >= zev_statistics.zev_max_queue_len) {
		/* queue full.  block until it's been shrunk. */
		cv_wait(&zev_condvar, &zev_mutex);
	}

	if (zev_queue_tail == NULL) {
		zev_queue_head = zev_queue_tail = msg;
	} else {
		zev_queue_tail->next = msg;
		msg->prev = zev_queue_tail;
		zev_queue_tail = msg;
	}
	zev_queue_len++;
	zev_statistics.zev_cnt_total_events++;
	zev_statistics.zev_queue_len += msg->size;

	/* update per-device queues */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;

		zev_queue_hold(q);

		/* make sure queue has enough room */
		while (q->zq_max_queue_len &&
		       q->zq_queue_len > q->zq_max_queue_len) {

			if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
				/* block until queue has been shrunk. */
				cv_wait(&zev_condvar, &zev_mutex);
			} else {
				/* discard msgs until queue is small enough */
				while (q->zq_queue_len > q->zq_max_queue_len) {
					m = q->zq_oldest;
					if (m == NULL)
						break;
					q->zq_events_discarded++;
					q->zq_bytes_discarded += m->size;
					q->zq_oldest = m->next;
					q->zq_queue_len -= m->size;
					q->zq_queue_messages--;
				}
			}
		}

		/* register new message at the end of the queue */
		q->zq_queue_len += msg->size;
		q->zq_queue_messages++;
		q->zq_bytes_total += msg->size;
		q->zq_events_total++;
		if (q->zq_oldest == NULL)
			q->zq_oldest = msg;

		zev_update_statistics(op, &q->zq_statistics);

		if (q->zq_queue_len > q->zq_wakeup_threshold)
			wakeup = 1;
		if (q->zq_queue_len == msg->size)  /* queue was empty */
			cv_broadcast(&q->zq_condvar);

		zev_queue_release(q);
	}

	/* drop messages no queue references anymore */
	zev_queue_trim();

	zev_update_statistics(op, &zev_statistics);
	mutex_exit(&zev_mutex);
	mutex_exit(&zev_queue_msg_mutex);

	/* one or more queues need a pollwakeup() */
	if (op == ZEV_OP_MARK) {
		/* marks flush all pending events to pollers */
		zev_poll_wakeup(B_TRUE);
	} else if (wakeup) {
		zev_poll_wakeup(B_FALSE);
	}

	return;
}

/*
 * Format an error report and queue it as a ZEV_OP_ERROR event.
 * "op" is the operation id whose handling failed.  If the formatted
 * text does not fit into one message the event is dropped with a
 * console warning.
 */
void
zev_queue_error(int op, char *fmt, ...)
{
	char buf[ZEV_MAX_MESSAGE_LEN];
	va_list ap;
	int len;
	zev_msg_t *msg;
	zev_error_t *rec;
	int rec_size;

	va_start(ap, fmt);
	len = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	if (len >= sizeof(buf)) {
		cmn_err(CE_WARN, "zev: can't report error - "
		        "dropping event entirely.");
		return;
	}

	/* build the error record behind the message header */
	rec_size = sizeof(*rec) + len + 1;
	msg = zev_alloc(sizeof(*msg) + rec_size);
	msg->size = rec_size;
	rec = (zev_error_t *)(msg + 1);
	rec->record_len = rec_size;
	rec->op = ZEV_OP_ERROR;
	rec->op_time = ddi_get_time();
	rec->guid = 0;
	rec->failed_op = op;
	rec->errstr_len = len;
	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);

	zev_queue_message(ZEV_OP_ERROR, msg);
}

/*
 * Resolve a queue by name.  An empty name refers to the requesting
 * queue itself (not allowed for the control device).  On success a
 * hold is taken on the queue and it is returned in *out.
 * Returns 0, EINVAL or ENOENT.
 */
static int
zev_find_queue(zev_queue_t **out, zev_queue_t *req_q, zev_queue_name_t *name)
{
	char qname[ZEV_MAX_QUEUE_NAME_LEN+1];
	zev_queue_t *queue;
	int idx;

	*out = NULL;

	if (name->zev_namelen == 0) {
		/* empty name: the caller means its own queue */
		if (req_q->zq_minor_number == ZEV_CONTROL_DEVICE_MINOR)
			return EINVAL;
		zev_queue_hold(req_q);
		*out = req_q;
		return 0;
	}

	if (name->zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
		return EINVAL;
	/* the user-supplied name is not necessarily terminated */
	strncpy(qname, name->zev_name, name->zev_namelen);
	qname[name->zev_namelen] = '\0';

	mutex_enter(&zev_mutex);
	for (idx = ZEV_MINOR_MIN; idx <= ZEV_MINOR_MAX; idx++) {
		queue = zev_queues[idx - ZEV_MINOR_MIN];
		if (queue == NULL)
			continue;
		if (strcmp(queue->zq_name, qname) == 0) {
			zev_queue_hold(queue);
			mutex_exit(&zev_mutex);
			*out = queue;
			return 0;
		}
	}
	mutex_exit(&zev_mutex);
	return ENOENT;
}

/*
 * ZEV_IOC_GET_QUEUE_STATISTICS handler: copy the statistics of the
 * queue named in the request (or of req_q itself) out to userland.
 */
static int
zev_ioc_get_queue_statistics(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_get_queue_statistics_t req;
	zev_queue_t *q;
	int err;

	if (ddi_copyin((void *)arg, &req, sizeof(req), mode) != 0)
		return EFAULT;

	err = zev_find_queue(&q, req_q, &req.zev_queue_name);
	if (err)
		return err;

	/* ddi_copyout() can take a long time.  Better make
	   a copy to be able to release the mutex faster. */
	mutex_enter(&zev_mutex);
	memcpy(&req.zev_statistics, &q->zq_statistics,
	    sizeof(req.zev_statistics));
	req.zev_statistics.zev_queue_len = q->zq_queue_len;
	req.zev_statistics.zev_bytes_read = q->zq_bytes_read;
	req.zev_statistics.zev_bytes_discarded = q->zq_bytes_discarded;
	req.zev_statistics.zev_max_queue_len = q->zq_max_queue_len;
	req.zev_statistics.zev_cnt_discarded_events = q->zq_events_discarded;
	req.zev_statistics.zev_cnt_total_events = q->zq_events_total;
	zev_queue_release(q);
	mutex_exit(&zev_mutex);

	if (ddi_copyout(&req, (void *)arg, sizeof(req), mode) != 0)
		return EFAULT;
	return 0;
}

/*
 * ZEV_IOC_SET_QUEUE_PROPERTIES handler: update flags, maximum queue
 * length and poll wakeup threshold of the queue named in the request.
 *
 * Fixes:
 *  - old_flags was loaded from the NEW flags (qp.zev_flags), so the
 *    "blocking flag was cleared -> wake blocked writers" broadcast
 *    could never fire; it now saves the previous q->zq_flags.
 *  - pollwakeup() was called with zev_mutex held, which pollwakeup(9F)
 *    forbids; the mutex is now dropped around the call (we hold a
 *    reference on q from zev_find_queue(), so q stays valid).
 */
static int
zev_ioc_set_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_set_queue_properties_t qp;
	zev_queue_t *q;
	uint64_t old_max;
	uint64_t old_flags;
	int need_wakeup = 0;
	int ret;

	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
		return EFAULT;
	if (qp.zev_max_queue_len > ZEV_MAX_QUEUE_LEN)
		return EINVAL;
	if (qp.zev_poll_wakeup_threshold > ZEV_MAX_POLL_WAKEUP_QUEUE_LEN)
		return EINVAL;

	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
	if (ret)
		return ret;

	mutex_enter(&zev_mutex);

	/*
	 * Note: if the PERSISTENT flag is cleared, and the queue is not busy,
	 * the queue should be removed by zev_queue_release() in zev_ioctl().
	 */
	old_flags = q->zq_flags;	/* save the *previous* flags */
	q->zq_flags = qp.zev_flags;
	if ((old_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) &&
	   (!(qp.zev_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL))) {
		/* queue is no longer blocking - wake blocked threads */
		cv_broadcast(&zev_condvar);
	}

	old_max = q->zq_max_queue_len;
	q->zq_max_queue_len = qp.zev_max_queue_len;
	if (q->zq_max_queue_len < old_max)
		zev_queue_trim();
	if (q->zq_max_queue_len > old_max)
		cv_broadcast(&zev_condvar);	/* threads may be waiting */

	if ((qp.zev_poll_wakeup_threshold < q->zq_wakeup_threshold) &&
	    (qp.zev_poll_wakeup_threshold <= q->zq_queue_len))
		need_wakeup = 1;
	q->zq_wakeup_threshold = qp.zev_poll_wakeup_threshold;

	if (need_wakeup) {
		/* pollwakeup(9F): no driver locks held across the call */
		mutex_exit(&zev_mutex);
		pollwakeup(&q->zq_pollhead, POLLIN);
		mutex_enter(&zev_mutex);
	}

	zev_queue_release(q);
	mutex_exit(&zev_mutex);
	return 0;
}

/*
 * ZEV_IOC_GET_QUEUE_PROPERTIES handler: report flags, maximum queue
 * length and poll wakeup threshold of the queue named in the request.
 */
static int
zev_ioc_get_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_get_queue_properties_t props;
	zev_queue_t *q;
	int err;

	if (ddi_copyin((void *)arg, &props, sizeof(props), mode) != 0)
		return EFAULT;

	err = zev_find_queue(&q, req_q, &props.zev_queue_name);
	if (err)
		return err;

	mutex_enter(&zev_mutex);
	props.zev_max_queue_len = q->zq_max_queue_len;
	props.zev_flags = q->zq_flags;
	props.zev_poll_wakeup_threshold = q->zq_wakeup_threshold;
	zev_queue_release(q);
	mutex_exit(&zev_mutex);

	if (ddi_copyout(&props, (void *)arg, sizeof(props), mode) != 0)
		return EFAULT;
	return 0;
}

/*
 * ZEV_IOC_ADD_QUEUE handler: create a new queue with the
 * user-supplied name, maximum length and flags.
 */
static int
zev_ioc_add_queue(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_add_queue_t req;
	zev_queue_t *new_q;
	char qname[ZEV_MAX_QUEUE_NAME_LEN+1];

	if (ddi_copyin((void *)arg, &req, sizeof(req), mode) != 0)
		return EFAULT;

	if (req.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
		return EINVAL;
	/* terminate the (possibly unterminated) user-supplied name */
	strncpy(qname, req.zev_name, req.zev_namelen);
	qname[req.zev_namelen] = '\0';

	return zev_queue_new(&new_q, req_q->zq_dip, qname,
	                     req.zev_max_queue_len, req.zev_flags);
}

/*
 * ZEV_IOC_REMOVE_QUEUE handler: mark the named queue for removal.
 * The queue must not be open.  Actual destruction happens when the
 * last reference is dropped (see zev_queue_release()).
 */
static int
zev_ioc_remove_queue(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_remove_queue_t rq;
	zev_queue_t *q;
	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
	int found = 0;
	int i;

	if (ddi_copyin((void *)arg, &rq, sizeof(rq), mode) != 0)
		return EFAULT;

	if (rq.zev_queue_name.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
		return EINVAL;
	/* terminate the (possibly unterminated) user-supplied name */
	strncpy(name, rq.zev_queue_name.zev_name,
	        rq.zev_queue_name.zev_namelen);
	name[rq.zev_queue_name.zev_namelen] = '\0';

	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;
		if (!strcmp(q->zq_name, name)) {
			found = 1;
			break;
		}
	}
	if (!found) {
		mutex_exit(&zev_mutex);
		return ENOENT;
	}

	if (q->zq_busy) {
		mutex_exit(&zev_mutex);
		return EBUSY;
	}
	/*
	 * clear flags, so that persistent queues are removed as well
	 * and the queue becomes non-blocking.
	 */
	q->zq_flags = 0;
	if (q->zq_to_be_removed == B_FALSE) {
		q->zq_to_be_removed = B_TRUE;
		/* drop the queue list's reference */
		zev_queue_release(q);
	}
	/* some threads might be waiting for this queue to become writable */
	cv_broadcast(&zev_condvar);

	mutex_exit(&zev_mutex);
	return 0;
}

/*
 * ZEV_IOC_GET_DEBUG_INFO handler: report checksum cache statistics
 * and the driver's current net memory usage to userland.
 */
static int
zev_ioc_get_debug_info(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_debug_info_t di;
	uint64_t allocated;
	uint64_t freed;

	/* atomic_add_64_nv(.., 0) reads the counters atomically */
	allocated = atomic_add_64_nv(&zev_memory_allocated, 0);
	freed = atomic_add_64_nv(&zev_memory_freed, 0);

	zev_chksum_stats(&di.zev_chksum_cache_size,
	                 &di.zev_chksum_cache_hits,
	                 &di.zev_chksum_cache_misses);
	di.zev_memory_allocated = allocated - freed;
	if (ddi_copyout(&di, (void *)arg, sizeof(di), mode) != 0)
		return EFAULT;
	return 0;
}

/*
 * ZEV_IOC_GET_QUEUE_LIST handler: copy the names of all existing
 * queues out to userland.
 */
static int
zev_ioc_get_queue_list(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_get_queue_list_t list;
	zev_queue_t *q;
	int idx;
	int count = 0;

	memset(&list, 0, sizeof(list));

	mutex_enter(&zev_mutex);
	for (idx = ZEV_MINOR_MIN; idx <= ZEV_MINOR_MAX; idx++) {
		q = zev_queues[idx - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		strncpy(list.zev_queue_name[count].zev_name,
		    q->zq_name, ZEV_MAX_QUEUE_NAME_LEN);
		list.zev_queue_name[count].zev_namelen = strlen(q->zq_name);
		count++;
	}
	list.zev_n_queues = count;
	mutex_exit(&zev_mutex);

	if (ddi_copyout(&list, (void *)arg, sizeof(list), mode) != 0)
		return EFAULT;
	return 0;
}

/* ARGSUSED */
/*
 * ioctl(2) entry point: dispatch all ZEV_IOC_* commands.  A hold is
 * kept on the requesting queue for the duration of the call.
 *
 * Fixes:
 *  - ZEV_IOC_MARK error path freed the message with zev_free(msg,
 *    msg_size) although it was allocated with sizeof(*msg) + msg_size;
 *    the mismatched size corrupts kmem/memory accounting.
 *  - The user-controlled mark.zev_payload_len flowed unchecked into
 *    int-typed size arithmetic; a huge value could overflow msg_size,
 *    causing a short allocation and a heap overflow in the payload
 *    copyin.  It is now bounded by ZEV_MAX_MESSAGE_LEN.
 */
static int
zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	zev_statistics_t zs;
	zev_ioctl_poolarg_t pa;
	zev_ioctl_mark_t mark;
	zev_mark_t *rec;
	int msg_size;
	zev_msg_t *msg;
	uint64_t len;
	uint64_t mark_id;
	minor_t minor;
	zev_queue_t *req_q;
	int ret = 0;

	minor = getminor(dev);
	mutex_enter(&zev_mutex);
	if ((req_q = ddi_get_soft_state(statep, minor)) == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	/* keep the queue alive while the ioctl works on it */
	zev_queue_hold(req_q);
	mutex_exit(&zev_mutex);
	/*
	 * all structures passed between kernel and userspace
	 * are now compatible between 64 and 32 bit.  Model
	 * conversion can be ignored.
	 */
	switch (cmd) {
	case ZEV_IOC_GET_GLOBAL_STATISTICS:
		/* ddi_copyout() can take a long time.  Better make
		   a copy to be able to release the mutex faster. */
		mutex_enter(&zev_mutex);
		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
		mutex_exit(&zev_mutex);
		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
			ret = EFAULT;
		break;
	case ZEV_IOC_GET_QUEUE_STATISTICS:
		ret = zev_ioc_get_queue_statistics(req_q, arg, mode);
		break;
	case ZEV_IOC_MUTE_POOL:
	case ZEV_IOC_UNMUTE_POOL:
		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) {
			ret = EFAULT;
			break;
		}
		if (pa.zev_poolname_len >= MAXPATHLEN) {
			ret = EINVAL;
			break;
		}
		/* the pool name from userland may not be terminated */
		pa.zev_poolname[pa.zev_poolname_len] = '\0';
		if (cmd == ZEV_IOC_MUTE_POOL) {
			ret = zev_ioc_mute_pool(pa.zev_poolname);
		} else {
			ret = zev_ioc_unmute_pool(pa.zev_poolname);
		}
		break;
	case ZEV_IOC_SET_MAX_QUEUE_LEN:
		if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) {
			ret = EFAULT;
			break;
		}
		if (len > ZEV_MAX_QUEUE_LEN) {
			ret = EINVAL;
			break;
		}
		mutex_enter(&zev_mutex);
		zev_statistics.zev_max_queue_len = len;
		/* a larger limit may unblock waiting writers */
		cv_broadcast(&zev_condvar);
		mutex_exit(&zev_mutex);
		break;
	case ZEV_IOC_GET_QUEUE_PROPERTIES:
		ret = zev_ioc_get_queue_properties(req_q, arg, mode);
		break;
	case ZEV_IOC_SET_QUEUE_PROPERTIES:
		ret = zev_ioc_set_queue_properties(req_q, arg, mode);
		break;
	case ZEV_IOC_MARK:
		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) {
			ret = EFAULT;
			break;
		}
		/*
		 * Bound the user-supplied payload length before using
		 * it in size arithmetic (see fix note above).
		 */
		if (mark.zev_payload_len > ZEV_MAX_MESSAGE_LEN) {
			ret = E2BIG;
			break;
		}
		/* prepare message */
		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
		msg = zev_alloc(sizeof(*msg) + msg_size);
		msg->size = msg_size;
		rec = (zev_mark_t *)(msg + 1);
		rec->record_len = msg_size;
		rec->op = ZEV_OP_MARK;
		rec->op_time = ddi_get_time();
		rec->guid = mark.zev_guid;
		rec->payload_len = mark.zev_payload_len;
		/* get payload */
		if (ddi_copyin(((char *)arg) + sizeof(mark),
		               ZEV_PAYLOAD(rec),
		               mark.zev_payload_len, mode) != 0) {
			/* free the whole allocation: header + record */
			zev_free(msg, sizeof(*msg) + msg_size);
			ret = EFAULT;
			break;
		}
		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
		/* get mark id and queue message */
		mutex_enter(&zev_mark_id_mutex);
		mark_id = zev_mark_id++;
		mutex_exit(&zev_mark_id_mutex);
		rec->mark_id = mark_id;
		zev_queue_message(ZEV_OP_MARK, msg);
		/* report mark id to userland, ignore errors */
		mark.zev_mark_id = mark_id;
		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
		break;
	case ZEV_IOC_ADD_QUEUE:
		/* queue management is restricted to the control device */
		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
			ret = EACCES;
			break;
		}
		ret = zev_ioc_add_queue(req_q, arg, mode);
		break;
	case ZEV_IOC_REMOVE_QUEUE:
		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
			ret = EACCES;
			break;
		}
		ret = zev_ioc_remove_queue(req_q, arg, mode);
		break;
	case ZEV_IOC_GET_DEBUG_INFO:
		ret = zev_ioc_get_debug_info(req_q, arg, mode);
		break;
	case ZEV_IOC_GET_QUEUE_LIST:
		ret = zev_ioc_get_queue_list(req_q, arg, mode);
		break;
	case ZEV_IOC_GET_FILE_SIGNATURES:
		ret = zev_ioc_get_signatures(arg, mode);
		break;
	default:
		/* generic "ioctl unknown" error */
		ret = ENOTTY;
	}

	mutex_enter(&zev_mutex);
	zev_queue_release(req_q);
	mutex_exit(&zev_mutex);
	if (ret)
		SET_ERROR(ret);
	return (ret);
}

/*
 * chpoll(9E) entry point: report POLLIN when the queue has unread
 * messages; otherwise register the caller on the queue's pollhead.
 * The control device does not support polling.
 */
static int
zev_chpoll(dev_t dev, short events, int anyyet,
    short *reventsp, struct pollhead **phpp)
{
	int minor;
	short revents = 0;
	zev_queue_t *q;

	/* use the minor-specific queue context and its pollhead */
	minor = getminor(dev);
	if (minor == ZEV_CONTROL_DEVICE_MINOR)
		return (EINVAL);
	mutex_enter(&zev_mutex);
	q = ddi_get_soft_state(statep, minor);
	if (q == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	if ((events & POLLIN) && q->zq_oldest != NULL)
		revents |= POLLIN;
	if (revents == 0 && !anyyet)
		*phpp = &q->zq_pollhead;
	*reventsp = revents;
	mutex_exit(&zev_mutex);
	return (0);
}

/* ARGSUSED */
/*
 * read(2) entry point: copy as many complete messages as fit into
 * the caller's buffer.  Blocks until at least one message is
 * available, unless the calling thread cannot take signals.
 */
static int
zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
{
	minor_t minor;
	offset_t off;
	int ret = 0;
	zev_msg_t *msg;
	char *data;
	zev_queue_t *q;

	minor = getminor(dev);
	/* the control device carries no event data */
	if (minor == ZEV_CONTROL_DEVICE_MINOR)
		return (EINVAL);

	mutex_enter(&zev_mutex);
	q = ddi_get_soft_state(statep, minor);
	if (q == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	off = uio_p->uio_loffset;
	msg = q->zq_oldest;
	/* queue empty: wait for a message to arrive */
	while (msg == NULL) {
		if (!ddi_can_receive_sig()) {
			/*
			 * read() shouldn't block because this thread
			 * can't receive signals. (e.g., it might be
			 * torn down by exit() right now.)
			 */
			mutex_exit(&zev_mutex);
			return 0;
		}
		if (cv_wait_sig(&q->zq_condvar, &zev_mutex) == 0) {
			/* signal received. */
			mutex_exit(&zev_mutex);
			return EINTR;
		}
		msg = q->zq_oldest;
	}
	/* messages are never split: the buffer must fit the first one */
	if (msg->size > uio_p->uio_resid) {
		mutex_exit(&zev_mutex);
		return E2BIG;
	}
	/* copy out whole messages while they fit */
	while (msg && uio_p->uio_resid >= msg->size) {
		data = (char *)(msg + 1);
		ret = uiomove(data, msg->size, UIO_READ, uio_p);
		if (ret != 0) {
			mutex_exit(&zev_mutex);
			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
			uio_p->uio_loffset = off;
			return (ret);
		}
		q->zq_oldest = msg->next;
		q->zq_bytes_read += msg->size;
		q->zq_queue_len -= msg->size;
		q->zq_queue_messages--;
		msg->read++;
		msg = q->zq_oldest;
	}
	/* writers may be waiting for queue space */
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);
	/*
	 * restore the original offset; presumably so the stream device
	 * never appears to advance — NOTE(review): confirm intent
	 */
	uio_p->uio_loffset = off;
	return 0;
}

/* ARGSUSED */
/*
 * close(2) entry point: clear the busy flag and, for non-persistent
 * queues, drop the reference taken by the device list so the queue
 * can be destroyed.
 */
static int
zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
{
	zev_queue_t *q;
	int minor;

	minor = getminor(dev);
	if (otyp != OTYP_CHR)
		return (EINVAL);
	mutex_enter(&zev_mutex);
	q = ddi_get_soft_state(statep, minor);
	if (q == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	if (q->zq_busy != B_TRUE) {
		/* close without a matching open */
		mutex_exit(&zev_mutex);
		return (EINVAL);
	}
	q->zq_busy = B_FALSE;
	if ((q->zq_flags & ZEV_FL_PERSISTENT) == 0)
		zev_queue_release(q);
	mutex_exit(&zev_mutex);
	return (0);
}

/* ARGSUSED */
/*
 * open(2) entry point.  Queue devices are exclusive-open; the
 * control device may be opened by several processes at once.
 * Requires sufficient privileges (drv_priv).
 */
static int
zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	zev_queue_t *q;
	minor_t minor;

	minor = getminor(*devp);
	if (otyp != OTYP_CHR)
		return (EINVAL);
	if (drv_priv(credp) != 0)
		return (EPERM);
	mutex_enter(&zev_mutex);
	q = ddi_get_soft_state(statep, minor);
	if (q == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	if (minor == ZEV_CONTROL_DEVICE_MINOR) {
		/* control device may be used in parallel */
		q->zq_busy = B_TRUE;
		mutex_exit(&zev_mutex);
		return 0;
	}
	if (q->zq_busy == B_TRUE) {
		mutex_exit(&zev_mutex);
		return (EBUSY);
	}
	q->zq_busy = B_TRUE;	/* can only be opened exclusively */
	mutex_exit(&zev_mutex);
	return (0);
}

/* character device entry points for the zev pseudo device */
static struct cb_ops zev_cb_ops = {
	zev_open,		/* open */
	zev_close,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	zev_read,		/* read */
	nodev,			/* write */
	zev_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	zev_chpoll,		/* chpoll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* streamtab */
	D_MP | D_64BIT,		/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* aread */
	nodev,			/* awrite */
};

/*
 * Tear down all per-instance resources: minor nodes, the poll
 * wakeup thread, the "ctrl" dummy queue and all event queues.
 * Called from zev_detach() for instance 0.
 */
static void
zev_free_instance(dev_info_t *dip)
{
	int instance;
	zev_queue_t *q;
	int i;

	instance = ddi_get_instance(dip);
	if (instance != 0) {
		/* this driver only supports instance 0 (see zev.conf) */
		cmn_err(CE_WARN, "zev: tried to free instance != 0 (%d)",
		        instance);
		return;
	}

	ddi_remove_minor_node(dip, NULL);

	/* stop pollwakeup thread */
	zev_wakeup_thread_run = 0;
	if (zev_poll_wakeup_thread != NULL) {
		thread_join(zev_poll_wakeup_thread->t_did);
		zev_poll_wakeup_thread = NULL;
	}

	mutex_enter(&zev_mutex);

	/* remove "ctrl" dummy queue */
	q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
	if (q) {
		ddi_soft_state_free(statep, ZEV_CONTROL_DEVICE_MINOR);
		ZEV_MEM_SUB(sizeof(zev_queue_t));
	}

	/* remove all other queues */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i- ZEV_MINOR_MIN];
		if (!q)
			continue;
		/* only the queue list should still hold a reference */
		ASSERT(q->zq_refcnt == 1);
		zev_queue_release(q);
	}
	/* free any messages left in the global list */
	zev_queue_trim();
	bzero(&zev_queues, sizeof(zev_queues));

	mutex_exit(&zev_mutex);

}

/*
 * detach(9E) entry point.  Only DDI_DETACH of instance 0 is
 * supported; suspend is always refused because ZFS may deliver
 * events at any time.
 */
static int
zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance;
	zev_queue_t *q;

	/* called once per instance with DDI_DETACH,
	   may be called to suspend */
	switch (cmd) {
	case DDI_DETACH:
		/* instance busy? */
		instance = ddi_get_instance(dip);
		if (instance != 0) {	/* hardcoded in zev.conf */
			/* this module only supports one instance. */
			return (DDI_FAILURE);
		}

		mutex_enter(&zev_mutex);
		if (!zev_attached) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}

		/* check "ctrl" queue to see if it is busy */
		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
		if (q == NULL) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		if (q->zq_busy) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		/* are there any queues? */
		if (zev_queue_cnt > 0) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}

		zev_attached = B_FALSE;
		mutex_exit(&zev_mutex);

		/* switch ZFS event callbacks back to default */
		rw_enter(&rz_zev_rwlock, RW_WRITER);
		rz_zev_callbacks = rz_zev_default_callbacks;
		rz_zev_set_active(B_FALSE);
		rw_exit(&rz_zev_rwlock);

		/* no thread is inside of the callbacks anymore. */

		/* free resources allocated for this instance */
		zev_free_instance(dip);
		zev_chksum_fini();
#if 0
		cmn_err(CE_WARN, "zev: allocated memory at detach: %" PRIu64,
			zev_memory_allocated - zev_memory_freed);
#endif
		return (DDI_SUCCESS);
	case DDI_SUSPEND:
		/* kernel must not suspend zev devices while ZFS is running */
		return (DDI_FAILURE);
	default:
		return (DDI_FAILURE);
	}
}

/*
 * attach(9E) entry point.  Builds the single supported driver instance:
 * allocates the "ctrl" management device and its dummy queue, creates
 * the default event queue and the poll wakeup thread, and finally
 * switches the ZFS event callbacks over to this module.
 */
static int
zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	/* called once per instance with DDI_ATTACH,
	   may be called to resume */
	int instance;
	int error;
	zev_queue_t *q;
	switch (cmd) {
	case DDI_ATTACH:
		/* create instance state */
		instance = ddi_get_instance(dip);
		if (instance != 0) {	/* hardcoded in zev.conf */
			/* this module only supports one instance. */
			return (DDI_FAILURE);
		}

		mutex_enter(&zev_mutex);
		if (zev_attached) {
			/* already attached; refuse a second attach */
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		/*
		 * The per-minor soft state doubles as the queue struct;
		 * allocate the one backing the "ctrl" minor here.
		 */
		if (ddi_soft_state_zalloc(statep, ZEV_CONTROL_DEVICE_MINOR) !=
		    DDI_SUCCESS) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		ZEV_MEM_ADD(sizeof(zev_queue_t));
		zev_attached = B_TRUE;

		/* init queue list */
		bzero(&zev_queues, sizeof(zev_queues));
		mutex_exit(&zev_mutex);

		/* create a dummy queue for management of "ctrl" */

		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
		q->zq_dip = dip;
		q->zq_refcnt = 1;
		q->zq_busy = B_FALSE;
		q->zq_minor_number = ZEV_CONTROL_DEVICE_MINOR;
		q->zq_flags = ZEV_FL_PERSISTENT;
		strcpy(q->zq_name, ZEV_CONTROL_DEVICE_NAME);

		/* create device node for "ctrl" */
		if (ddi_create_minor_node(dip, ZEV_CONTROL_DEVICE_NAME,
		    S_IFCHR, ZEV_CONTROL_DEVICE_MINOR,
		    DDI_PSEUDO, 0) == DDI_FAILURE) {
			goto fail;
		}

		/* note: intentionally not adding ctrl queue to queue list. */

		/* default queue */
		error = zev_queue_new(&q, dip,
				      ZEV_DEFAULT_QUEUE_NAME,
				      ZEV_MAX_QUEUE_LEN,
				      ZEV_FL_BLOCK_WHILE_QUEUE_FULL|
		                      ZEV_FL_PERSISTENT);
		if (error)
			goto fail;

		/* start pollwakeup thread */
		zev_wakeup_thread_run = 1;
		zev_poll_wakeup_thread = thread_create(NULL, 0,
		    zev_poll_wakeup_thread_main, NULL, 0, &p0,
		    TS_RUN, minclsyspri);

		ddi_report_dev(dip);

		zev_chksum_init();

		/*
		 * Switch ZFS event callbacks to zev module callbacks last,
		 * so no events can arrive before the queues exist.
		 */
		rw_enter(&rz_zev_rwlock, RW_WRITER);
		rz_zev_callbacks = &zev_callbacks;
		rz_zev_set_active(B_TRUE);
		rw_exit(&rz_zev_rwlock);

		return (DDI_SUCCESS);
	case DDI_RESUME:
		/* suspending zev devices should never happen; nothing to do */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
fail:
	/* partial-attach cleanup: zev_free_instance() releases whatever
	   was allocated above, including the ctrl soft state. */
	cmn_err(CE_WARN, "zev: attach failed");
	zev_free_instance(dip);
	mutex_enter(&zev_mutex);
	zev_attached = B_FALSE;
	mutex_exit(&zev_mutex);
	return (DDI_FAILURE);
}

/* ARGSUSED */
/*
 * getinfo(9E) entry point: translate a dev_t into the owning dev_info
 * or instance number, using the per-minor soft state.
 */
static int
zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
{
	zev_queue_t *q;
	minor_t minor;
	int ret = DDI_FAILURE;

	/* arg is dev_t */
	minor = getminor((dev_t)arg);

	mutex_enter(&zev_mutex);
	q = ddi_get_soft_state(statep, minor);
	if (q == NULL) {
		/* unknown minor */
		*resultp = NULL;
		mutex_exit(&zev_mutex);
		return (DDI_FAILURE);
	}

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*resultp = q->zq_dip;
		ret = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)ddi_get_instance(q->zq_dip);
		ret = DDI_SUCCESS;
		break;
	default:
		/* unsupported query */
		break;
	}
	mutex_exit(&zev_mutex);
	return (ret);
}

/*
 * Autoconfiguration entry points for the zev pseudo-driver.
 * Character/block I/O is dispatched through zev_cb_ops.
 */
static struct dev_ops zev_dev_ops = {
	DEVO_REV,			/* driver build revision */
	0,				/* driver reference count */
	zev_getinfo,			/* getinfo */
	nulldev,			/* identify (obsolete) */
	nulldev,			/* probe (search for devices) */
	zev_attach,			/* attach */
	zev_detach,			/* detach */
	nodev,				/* reset (obsolete, use quiesce) */
	&zev_cb_ops,			/* character and block device ops */
	NULL,				/* bus driver ops */
	NULL,				/* power management, not needed */
	ddi_quiesce_not_needed,		/* quiesce */
};

/* Loadable-driver linkage: ties zev_dev_ops to the module framework. */
static struct modldrv zev_modldrv = {
	&mod_driverops,			/* all loadable modules use this */
	"zev ZFS event provider, v1.0",	/* driver name and version info */
	&zev_dev_ops			/* ops method pointers */
};

/* Module linkage passed to mod_install()/mod_remove() in _init/_fini. */
static struct modlinkage zev_modlinkage = {
	MODREV_1,	/* fixed value */
	{
		&zev_modldrv,	/* driver linkage structure */
		NULL		/* list terminator */
	}
};

int
_init(void)
{
	int error;

	if ((error = ddi_soft_state_init(&statep, sizeof(zev_queue_t), 1)) != 0)
		return (error);
	zev_attached = B_FALSE;

	zev_queue_head = NULL;
	zev_queue_tail = NULL;
	zev_queue_len = 0;
	zev_muted_pools_head = NULL;
	zev_memory_allocated = 0;
	zev_memory_freed = 0;
	zev_queue_cnt = 0;

	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
	zev_mark_id = gethrtime();
	mutex_init(&zev_queue_msg_mutex, NULL, MUTEX_DRIVER, NULL);
	zev_msg_sequence_number = gethrtime();
	bzero(&zev_statistics, sizeof(zev_statistics));
	bzero(&zev_pollhead, sizeof(zev_pollhead));
	bzero(&zev_queues, sizeof(zev_queues));
	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
	if (zev_ioc_mute_pool("zg0")) {
		cmn_err(CE_WARN, "zev: could not init mute list");
		goto FAIL;
	}

	if ((error = mod_install(&zev_modlinkage)) != 0) {
		cmn_err(CE_WARN, "zev: could not install module");
		goto FAIL;
	}

	return (0);
FAIL:
	/* free resources */
	cmn_err(CE_WARN, "zev: _init failed");
	mutex_destroy(&zev_mutex);
	ddi_soft_state_fini(&statep);
	return (error);
}

/*
 * _info(9E): report module information via the standard helper.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&zev_modlinkage, modinfop));
}

/*
 * _fini(9E): module unload hook.  Refuses to unload while the device
 * is attached or queues exist, drains the global event list, resets
 * the ZFS callbacks, removes the module linkage and finally releases
 * all global resources.  The teardown order is deliberate; do not
 * reorder the steps below.
 */
int
_fini(void)
{
	int error = 0;
	zev_msg_t *msg;
	zev_pool_list_entry_t *pe, *npe;

	mutex_enter(&zev_mutex);
	if (zev_attached == B_TRUE) {
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}
	if (zev_queue_cnt != 0) {
		/* should never happen */
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}

	/*
	 * avoid deadlock if event list is full: make sure threads currently
	 * blocking on the event list can append their event and then release
	 * rz_zev_rwlock.  Since there should be no queues left when we
	 * reach this point we can simply empty the event list and then
	 * wake everybody.
	 */
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		zev_free(msg, sizeof(*msg) + msg->size);
	}
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);

	/* switch ZFS event callbacks back to default (again) */
	rw_enter(&rz_zev_rwlock, RW_WRITER);
	rz_zev_callbacks = rz_zev_default_callbacks;
	rz_zev_set_active(B_FALSE);
	rw_exit(&rz_zev_rwlock);

	/* no thread is inside of the callbacks anymore.  Safe to remove. */

	/* unload module callbacks */
	if ((error = mod_remove(&zev_modlinkage)) != 0) {
		cmn_err(CE_WARN, "mod_remove failed: %d", error);
		return (error);
	}

	/* free resources */
	/* drain any messages appended between the first drain and
	   mod_remove() */
	mutex_enter(&zev_mutex);
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		zev_free(msg, sizeof(*msg) + msg->size);
	}
	mutex_exit(&zev_mutex);
	/* release the muted-pool list built via zev_ioc_mute_pool() */
	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	pe = zev_muted_pools_head;
	while (pe) {
		npe = pe;
		pe = pe->next;
		zev_free(npe, sizeof(*npe));
	}
	rw_exit(&zev_pool_list_rwlock);
	ddi_soft_state_fini(&statep);
	rw_destroy(&zev_pool_list_rwlock);
	cv_destroy(&zev_condvar);
	mutex_destroy(&zev_mutex);
	mutex_destroy(&zev_mark_id_mutex);
	mutex_destroy(&zev_queue_msg_mutex);

	return (0);
}