xref: /titanic_41/usr/src/uts/common/fs/zev/zev.c (revision a9b5625ad449e28716053a1fd299e7e3271f750d)
1 #include <sys/modctl.h>
2 #include <sys/ddi.h>
3 #include <sys/sunddi.h>
4 #include <sys/conf.h>
5 #include <sys/devops.h>
6 #include <sys/stat.h>
7 #include <sys/fs/zev.h>
8 #include <sys/zev_callbacks.h>
9 #include <sys/zev_checksums.h>
10 #include <sys/zfs_znode.h>
11 #include <sys/time.h>
12 #include <sys/sa.h>
13 #include <sys/zap.h>
14 #include <sys/time.h>
15 
16 #define	OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))
17 
18 #define ZEV_DEFAULT_QUEUE_NAME		"beaver"
19 #define ZEV_CONTROL_DEVICE_MINOR	0
20 #define ZEV_MINOR_MIN			(ZEV_CONTROL_DEVICE_MINOR + 1)
21 #define ZEV_MINOR_MAX			(ZEV_MINOR_MIN + ZEV_MAX_QUEUES - 1)
22 
/*
 * Per-minor event queue.  Each minor device (except the control device,
 * minor 0) owns one of these.  A queue tracks its own read position in
 * the global message list (zq_oldest) plus per-queue statistics.
 *
 * Fields below the "protected by zev_mutex" marker must only be touched
 * with zev_mutex held.
 */
typedef struct zev_queue {
	char			zq_name[ZEV_MAX_QUEUE_NAME_LEN+1];
	minor_t			zq_minor_number;
	dev_info_t		*zq_dip;
	struct pollhead		zq_pollhead;	/* per-queue poll(2) head */
	uint64_t		zq_bytes_read;
	uint64_t		zq_events_read;
	uint64_t		zq_bytes_discarded;
	uint64_t		zq_events_discarded;
	uint64_t		zq_bytes_total;
	uint64_t		zq_events_total;
	uint64_t		zq_wakeup_threshold;	/* queue len that triggers pollwakeup() */
	uint16_t		zq_flags;	/* ZEV_FL_* behavior flags */
	uint16_t		zq_need_wakeup;
	/* protected by zev_mutex */
	int			zq_refcnt;	/* queue is torn down when this drops to 0 */
	uint64_t		zq_queue_len;	/* bytes currently queued for this minor */
	uint64_t		zq_queue_messages;	/* message count for this minor */
	uint64_t		zq_max_queue_len;	/* per-queue byte limit */
	zev_msg_t		*zq_oldest;	/* oldest not-yet-consumed msg in global list */
	boolean_t		zq_busy;	/* in use; removal returns EBUSY (presumably set while open — not visible in this chunk) */
	boolean_t		zq_to_be_removed;	/* removal requested, release pending */
	zev_statistics_t	zq_statistics;
	kcondvar_t		zq_condvar;	/* signalled when data arrives on an empty queue */
} zev_queue_t;
48 
/* ddi soft-state handle; one zev_queue_t per minor number */
static void		*statep;
struct pollhead		zev_pollhead;

/* protects the global message list, the queue array, and all queue state */
kmutex_t		zev_mutex;
/* signalled when queue space becomes available; blocked writers wait here */
kcondvar_t		zev_condvar;
/* serializes zev_queue_message() callers (see the comment in that function) */
kmutex_t		zev_queue_msg_mutex;
/* protects the list of muted pools */
krwlock_t		zev_pool_list_rwlock;
static zev_statistics_t	zev_statistics;
static boolean_t	zev_attached;
/* protects zev_mark_id, the monotonically increasing mark counter */
static kmutex_t		zev_mark_id_mutex;
static uint64_t		zev_mark_id = 0;

/* sequence number assigned to each queued message, under zev_mutex */
static uint64_t		zev_msg_sequence_number = 0;
/* per-minor queues, indexed by (minor - ZEV_MINOR_MIN) */
static zev_queue_t	*zev_queues[ZEV_MAX_QUEUES];
static int		zev_queue_cnt = 0;
/* nonzero while at least one queue has ZEV_FL_BLOCK_WHILE_QUEUE_FULL set */
static int		zev_have_blocking_queues = 1;

/* memory accounting counters, maintained via ZEV_MEM_ADD()/ZEV_MEM_SUB() */
uint64_t	zev_memory_allocated = 0;
uint64_t	zev_memory_freed = 0;

/*
 * The longest potential message is from zev_zfs_mount() and
 * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
 *
 * Another candidate is zev_znode_rename_cb() and contains three inode
 * numbers and two filenames of up to MAXNAMELEN bytes each.
 */
#define ZEV_MAX_MESSAGE_LEN	4096

/* global FIFO of messages: oldest at head, newest at tail */
static zev_msg_t *zev_queue_head = NULL;
static zev_msg_t *zev_queue_tail = NULL;
static uint64_t zev_queue_len = 0;


/* singly-linked list entry describing one muted (event-suppressed) pool */
typedef struct zev_pool_list_entry {
	struct zev_pool_list_entry	*next;
	char				name[MAXPATHLEN];
} zev_pool_list_entry_t;

static zev_pool_list_entry_t *zev_muted_pools_head = NULL;

/* cleared to stop the poll wakeup thread */
static volatile int zev_wakeup_thread_run = 1;
static kthread_t *zev_poll_wakeup_thread = NULL;
92 
93 void *
94 zev_alloc(ssize_t sz)
95 {
96 	ZEV_MEM_ADD(sz);
97 	return kmem_alloc(sz, KM_SLEEP);
98 }
99 
100 void *
101 zev_zalloc(ssize_t sz)
102 {
103 	ZEV_MEM_ADD(sz);
104 	return kmem_zalloc(sz, KM_SLEEP);
105 }
106 
107 void
108 zev_free(void *ptr, ssize_t sz)
109 {
110 	ZEV_MEM_SUB(sz);						\
111 	kmem_free(ptr, sz);
112 }
113 
/*
 * Recompute zev_have_blocking_queues: it is set iff at least one queue
 * still has ZEV_FL_BLOCK_WHILE_QUEUE_FULL.  Writers blocked on
 * zev_condvar are woken so they can re-evaluate the discard-vs-block
 * decision in zev_queue_message().
 *
 * NOTE(review): the broadcast fires whenever we previously had blocking
 * queues, even if blocking queues remain.  Waiters recheck their
 * condition, so the extra wakeups are spurious but harmless — confirm
 * this was intended rather than "had && !have".
 */
/* must be called with zev_mutex held */
static void
zev_update_blockflag(void)
{
	zev_queue_t *q;
	int had_blocking_queues;
	int i;

	had_blocking_queues = zev_have_blocking_queues;

	/* do we still have blocking queues? */
	zev_have_blocking_queues = 0;
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;
		if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
			zev_have_blocking_queues = 1;
			break;
		}
	}
	/* no blocking queues */
	if (had_blocking_queues)
		cv_broadcast(&zev_condvar);
}
139 
140 int
141 zev_queue_cmp(const void *a, const void *b)
142 {
143 	const zev_queue_t *qa = a;
144 	const zev_queue_t *qb = b;
145 	if (qa->zq_minor_number > qb->zq_minor_number)
146 		return 1;
147 	if (qa->zq_minor_number < qb->zq_minor_number)
148 		return -1;
149 	return 0;
150 }
151 
/*
 * Drop messages from the head of the global list that no per-minor
 * queue references anymore, i.e. everything strictly older than the
 * oldest zq_oldest pointer across all queues.  Messages that were never
 * read by anyone count as discarded in the global statistics.
 */
/* must be called with zev_mutex held */
void
zev_queue_trim(void)
{
	zev_msg_t *m;
	uint64_t oldest_message;
	zev_queue_t *q;
	int i;

	if (!zev_queue_tail)
		return;

	oldest_message = zev_queue_tail->seq + 1;  /* does not exist, yet. */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		if (!q->zq_oldest)
			continue;
		/* track the minimum sequence number still needed */
		if (oldest_message > q->zq_oldest->seq)
			oldest_message = q->zq_oldest->seq;
	}

	/* remove msgs between oldest_message and zev_queue_head */
	while(zev_queue_head && (oldest_message > zev_queue_head->seq)) {
		m = zev_queue_head;
		zev_queue_head = m->next;
		if (zev_queue_head == NULL) {
			/* list is now empty; tail must follow */
			zev_queue_tail = NULL;
		} else {
			zev_queue_head->prev = NULL;
		}
		if (m->read == 0) {
			/* nobody ever read this message */
			zev_statistics.zev_bytes_discarded += m->size;
			zev_statistics.zev_cnt_discarded_events++;
		}
		zev_statistics.zev_queue_len -= m->size;
		zev_queue_len--;
		zev_free(m, sizeof(*m) + m->size);
	}
}
193 
194 /* must be called with zev_mutex held */
195 static void
196 zev_queue_hold(zev_queue_t *q)
197 {
198 	q->zq_refcnt++;
199 }
200 
/*
 * Drop a reference on a queue.  When the last reference goes away (and
 * the queue is not marked persistent) the queue is unlinked from the
 * queue array, its no-longer-referenced messages are trimmed from the
 * global list, and its minor node and soft state are destroyed.
 */
/* must be called with zev_mutex held */
static void
zev_queue_release(zev_queue_t *q)
{
	q->zq_refcnt--;
	if (q->zq_refcnt > 0)
		return;

	ASSERT(q->zq_busy == B_FALSE);

	/* persistent queues will not be removed */
	if ((q->zq_flags & ZEV_FL_PERSISTENT) != 0)
		return;

	/* remove queue from queue list */
	zev_queues[q->zq_minor_number - ZEV_MINOR_MIN] = NULL;

	/* discard messages that no queue references anymore */
	zev_queue_trim();

	cv_destroy(&q->zq_condvar);
	ddi_remove_minor_node(q->zq_dip, q->zq_name);
	/* freeing the soft state frees q itself; do not touch q below */
	ddi_soft_state_free(statep, q->zq_minor_number);
	ZEV_MEM_SUB(sizeof(zev_queue_t));
	zev_queue_cnt--;
	zev_update_blockflag();
}
228 
/*
 * Create a new event queue named "name" with its own minor device node.
 *
 * Returns 0 and stores the new queue in *queue on success, or:
 *   EINVAL - bad max_queue_len, reserved/invalid name (only [a-z0-9.]
 *            is accepted; the control device name is rejected)
 *   ENOSPC - no free minor number or no soft state
 *   EEXIST - a queue with this name already exists
 *   EFAULT - ddi_create_minor_node() failed
 *
 * Unless ZEV_FL_INITIALLY_EMPTY is set, the new queue is pre-loaded
 * with as many of the newest existing messages as fit its limit.
 */
int
zev_queue_new(zev_queue_t **queue,
              dev_info_t *dip,
              char *name,
              uint64_t max_queue_len,
              uint16_t flags)
{
	zev_queue_t *q;
	zev_queue_t *tmp;
	zev_msg_t *msg;
	int name_exists = 0;
	minor_t minor;
	char *p;
	int i;

	if (max_queue_len > ZEV_MAX_QUEUE_LEN)
		return EINVAL;
	if (max_queue_len == 0)
		max_queue_len = ZEV_MAX_QUEUE_LEN;
	if (!strcmp(name, ZEV_CONTROL_DEVICE_NAME))
		return EINVAL;
	/* only lowercase letters, digits and '.' are allowed in names */
	for (p = name; *p; p++) {
		if (*p >= 'a' && *p <= 'z')
			continue;
		if (*p >= '0' && *p <= '9')
			continue;
		if (*p == '.')
			continue;
		return EINVAL;
	}

	mutex_enter(&zev_mutex);

	/* find free minor number.*/
	/* if this were a frequent operation we'd have a free-minor list */
	for (minor = ZEV_MINOR_MIN; minor <= ZEV_MINOR_MAX; minor++) {
		tmp = zev_queues[minor - ZEV_MINOR_MIN];
		if (tmp == NULL)
			break;
	}
	/* tmp != NULL here means the loop ran off the end: all slots taken */
	if (tmp) {
		mutex_exit(&zev_mutex);
		return ENOSPC;
	}

	if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) {
		mutex_exit(&zev_mutex);
		return ENOSPC;
	}
	ZEV_MEM_ADD(sizeof(zev_queue_t));

	q = ddi_get_soft_state(statep, minor);
	memset(q, 0, sizeof(*q));
	strncpy(q->zq_name, name, ZEV_MAX_QUEUE_NAME_LEN);
	q->zq_name[ZEV_MAX_QUEUE_NAME_LEN] = '\0';
	q->zq_max_queue_len = max_queue_len;
	q->zq_wakeup_threshold = ZEV_DEFAULT_POLL_WAKEUP_QUEUE_LEN;
	q->zq_flags = flags;
	q->zq_refcnt = 1;
	q->zq_dip = dip;
	q->zq_minor_number = minor;
	cv_init(&q->zq_condvar, NULL, CV_DRIVER, NULL);

	/* insert into queue list */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		/* if this were a frequent operation we'd have a name tree */
		if (zev_queues[i - ZEV_MINOR_MIN] == NULL)
			continue;
		if (!strcmp(q->zq_name, zev_queues[i-ZEV_MINOR_MIN]->zq_name)) {
			name_exists = 1;
			break;
		}
	}
	if (name_exists) {
		/* roll back the soft state allocation */
		ddi_soft_state_free(statep, minor);
		ZEV_MEM_SUB(sizeof(zev_queue_t));
		mutex_exit(&zev_mutex);
		return EEXIST;
	}
	zev_queues[minor - ZEV_MINOR_MIN] = q;
	zev_queue_cnt++;

	/* calculate current queue len and find head and tail */
	if (!(q->zq_flags & ZEV_FL_INITIALLY_EMPTY)) {
		/* walk backwards from the newest message until full */
		q->zq_oldest = zev_queue_tail;
		msg = zev_queue_tail;
		while ((msg) && (q->zq_queue_len < q->zq_max_queue_len)) {
			q->zq_queue_len += msg->size;
			q->zq_queue_messages++;
			q->zq_oldest = msg;
			msg = msg->prev;
		}
	}

	mutex_exit(&zev_mutex);

	/* NOTE(review): the queue is briefly visible before the minor node
	   exists; presumably benign, but worth confirming. */
	if (ddi_create_minor_node(dip, name,
	    S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		mutex_enter(&zev_mutex);
		zev_queues[minor - ZEV_MINOR_MIN] = NULL;
		zev_queue_cnt--;
		ddi_soft_state_free(statep, minor);
		ZEV_MEM_SUB(sizeof(zev_queue_t));
		mutex_exit(&zev_mutex);
		return EFAULT;
	}

	zev_update_blockflag();

	*queue = q;
	return 0;
}
341 
342 /*
343  * poll() wakeup thread.  Used to check periodically whether we have
344  * bytes left in the queue that have not yet been made into a
345  * pollwakeup() call.  This is meant to insure a maximum waiting
346  * time until an event is presented as a poll wakeup, while at
347  * the same time not making every single event into a poll wakeup
348  * of it's own.
349  */
350 
/*
 * Issue pollwakeup() for every busy queue that has data pending.
 * With flush_all set, every non-empty queue is woken; otherwise only
 * queues whose backlog exceeds their wakeup threshold.
 */
static void
zev_poll_wakeup(boolean_t flush_all)
{
	zev_queue_t *q;
	int i;

	/*
	 * This loop works with hold() and release() because
	 * pollwakeup() requires us to release our locks before calling it.
	 *
	 * from pollwakeup(9F):
	 *
	 *   "Driver defined locks should not be held across calls
	 *    to this function."
	 */

	/* wake up threads for each individual queue */
	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		if (!q->zq_busy)
			continue;
		if (!q->zq_queue_len)
			continue;
		if ((flush_all) ||
		    (q->zq_queue_len > q->zq_wakeup_threshold)) {
			/* the hold keeps q alive while zev_mutex is dropped */
			zev_queue_hold(q);
			mutex_exit(&zev_mutex);
			pollwakeup(&q->zq_pollhead, POLLIN);
			mutex_enter(&zev_mutex);
			zev_queue_release(q);
		}
	}
	mutex_exit(&zev_mutex);
}
388 
389 static void
390 zev_poll_wakeup_thread_main(void)
391 {
392 	while (zev_wakeup_thread_run) {
393 		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
394 
395 		zev_poll_wakeup(B_TRUE);
396 	}
397 	thread_exit();
398 }
399 
400 static int
401 zev_ioc_mute_pool(char *poolname)
402 {
403 	zev_pool_list_entry_t *pe;
404 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
405 	/* pool already muted? */
406 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
407 		if (!strcmp(pe->name, poolname)) {
408 			rw_exit(&zev_pool_list_rwlock);
409 			return EEXIST;
410 		}
411 	}
412 	pe = zev_zalloc(sizeof(*pe));
413 	if (!pe) {
414 		rw_exit(&zev_pool_list_rwlock);
415 		return ENOMEM;
416 	}
417 	(void) strncpy(pe->name, poolname, sizeof(pe->name));
418 	pe->next = zev_muted_pools_head;
419 	zev_muted_pools_head = pe;
420 	rw_exit(&zev_pool_list_rwlock);
421 	return (0);
422 }
423 
424 static int
425 zev_ioc_unmute_pool(char *poolname)
426 {
427 	zev_pool_list_entry_t *pe, *peprev;
428 
429 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
430 	/* pool muted? */
431 	peprev = NULL;
432 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
433 		if (!strcmp(pe->name, poolname))
434 			break;
435 		peprev = pe;
436 	}
437 	if (pe) {
438 		rw_exit(&zev_pool_list_rwlock);
439 		return ENOENT;
440 	}
441 
442 	if (peprev != NULL) {
443 		peprev->next = pe->next;
444 	} else {
445 		zev_muted_pools_head = pe->next;
446 	}
447 	zev_free(pe, sizeof(*pe));
448 	rw_exit(&zev_pool_list_rwlock);
449 	return (0);
450 }
451 
452 int
453 zev_skip_pool(objset_t *os)
454 {
455 	zev_pool_list_entry_t *pe;
456 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
457 	rw_enter(&zev_pool_list_rwlock, RW_READER);
458 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
459 		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
460 			rw_exit(&zev_pool_list_rwlock);
461 			return 1;
462 		}
463 	}
464 	rw_exit(&zev_pool_list_rwlock);
465 	return 0;
466 }
467 
468 int
469 zev_skip_fs(zfsvfs_t *fs)
470 {
471 	dsl_dir_t *d = fs->z_os->os_dsl_dataset->ds_dir;
472 	dsl_dir_t *prev = NULL;
473 
474 	while (d && d != prev) {
475 		if (strstr(d->dd_myname, "_root"))
476 			return 0;
477 		prev = d;
478 		d = d->dd_parent;
479 	}
480 	return 1;
481 }
482 
/*
 * Bump the per-operation event counter in *stat for operation op.
 * Used both for the global statistics and for each queue's private
 * copy.  Unknown op values are silently ignored (callers validate op
 * against ZEV_OP_MIN/ZEV_OP_MAX before queueing).
 */
static void
zev_update_statistics(int op, zev_statistics_t *stat)
{
	switch (op) {
	case ZEV_OP_ERROR:
		stat->zev_cnt_errors++;
		break;
	case ZEV_OP_MARK:
		stat->zev_cnt_marks++;
		break;
	case ZEV_OP_ZFS_MOUNT:
		stat->zev_cnt_zfs_mount++;
		break;
	case ZEV_OP_ZFS_UMOUNT:
		stat->zev_cnt_zfs_umount++;
		break;
	case ZEV_OP_ZVOL_WRITE:
		stat->zev_cnt_zvol_write++;
		break;
	case ZEV_OP_ZVOL_TRUNCATE:
		stat->zev_cnt_zvol_truncate++;
		break;
	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
		stat->zev_cnt_znode_close_after_update++;
		break;
	case ZEV_OP_ZNODE_CREATE:
		stat->zev_cnt_znode_create++;
		break;
	case ZEV_OP_ZNODE_REMOVE:
		stat->zev_cnt_znode_remove++;
		break;
	case ZEV_OP_ZNODE_LINK:
		stat->zev_cnt_znode_link++;
		break;
	case ZEV_OP_ZNODE_SYMLINK:
		stat->zev_cnt_znode_symlink++;
		break;
	case ZEV_OP_ZNODE_RENAME:
		stat->zev_cnt_znode_rename++;
		break;
	case ZEV_OP_ZNODE_WRITE:
		stat->zev_cnt_znode_write++;
		break;
	case ZEV_OP_ZNODE_TRUNCATE:
		stat->zev_cnt_znode_truncate++;
		break;
	case ZEV_OP_ZNODE_SETATTR:
		stat->zev_cnt_znode_setattr++;
		break;
	case ZEV_OP_ZNODE_ACL:
		stat->zev_cnt_znode_acl++;
		break;
	}
}
537 
/*
 * Append msg (op is its ZEV_OP_* type) to the global message list and
 * register it with every queue.  Takes ownership of msg: it is either
 * linked into the list or freed here (invalid op, or no queues exist).
 *
 * Per queue, a full queue either blocks the caller (queues with
 * ZEV_FL_BLOCK_WHILE_QUEUE_FULL) or has its oldest messages discarded
 * until the new message fits.  Readers of a previously-empty queue are
 * woken via zq_condvar; poll() waiters via zev_poll_wakeup().
 */
void
zev_queue_message(int op, zev_msg_t *msg)
{
	zev_queue_t *q;
	int wakeup = 0;
	zev_msg_t *m;
	int i;

	msg->next = NULL;
	msg->prev = NULL;
	msg->read = 0;

	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
		zev_queue_error(op, "unknown op id encountered: %d", op);
		zev_free(msg, sizeof(*msg) + msg->size);
		return;
	}

	/*
	 * This mutex protects us agains race conditions when several
	 * threads want to queue a message and one or more queues are
	 * full:  we release zev_mutex to wait for the queues to become
	 * less-than-full, but we don't know in which order the waiting
	 * threads will be awoken.  If it's not the same order in which
	 * they went to sleep we might mark different messages as "newest"
	 * in different queues, and so we might have dupes or even
	 * skip messages.
	 */
	mutex_enter(&zev_queue_msg_mutex);

	mutex_enter(&zev_mutex);

	/*
	 * When the module is loaded, the default behavior ist to
	 * put all events into a queue and block if the queue is full.
	 * This is done even before the pseudo device is attached.
	 * This way, no events are lost.
	 *
	 * To discard events entirely the "beaver" queue,
	 * which never discards anything, has to be removed.
	 */

	if (zev_queue_cnt == 0) {
		/* no queues at all: drop the event */
		mutex_exit(&zev_mutex);
		mutex_exit(&zev_queue_msg_mutex);
		return;
	}

	/* put message into global queue */
	msg->seq = zev_msg_sequence_number++;

	/* do we need to make room? */
	while (zev_statistics.zev_max_queue_len &&
	    zev_statistics.zev_queue_len > zev_statistics.zev_max_queue_len) {

		if (zev_have_blocking_queues) {
			/* queue full.  block until it's been shrunk. */
			cv_wait(&zev_condvar, &zev_mutex);
			continue;
		}

		/* discard events until this message fits into all queues */

		for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
			q = zev_queues[i - ZEV_MINOR_MIN];
			if (!q)
				continue;
			/* discard msgs until queue is small enough */
			while (q->zq_queue_len &&
			       q->zq_queue_len > q->zq_max_queue_len) {
				m = q->zq_oldest;
				if (m == NULL)
					break;
				q->zq_events_discarded++;
				q->zq_bytes_discarded += m->size;
				q->zq_oldest = m->next;
				q->zq_queue_len -= m->size;
				q->zq_queue_messages--;
			}
		}

		/* actually free messages no queue references anymore */
		zev_queue_trim();
		ASSERT(zev_statistics.zev_queue_len == 0 ||
		       zev_statistics.zev_queue_len <=
				zev_statistics.zev_max_queue_len);
	}

	/* link msg in as the new tail of the global list */
	if (zev_queue_tail == NULL) {
		zev_queue_head = zev_queue_tail = msg;
	} else {
		zev_queue_tail->next = msg;
		msg->prev = zev_queue_tail;
		zev_queue_tail = msg;
	}
	zev_queue_len++;
	zev_statistics.zev_cnt_total_events++;
	zev_statistics.zev_queue_len += msg->size;

	/* update per-device queues */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;

		zev_queue_hold(q);

		/* make sure queue has enough room */
		while (q->zq_max_queue_len &&
		       q->zq_queue_len > q->zq_max_queue_len) {

			if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
				/* block until queue has been shrunk. */
				cv_wait(&zev_condvar, &zev_mutex);
			} else {
				/* discard msgs until queue is small enough */
				while (q->zq_queue_len > q->zq_max_queue_len) {
					m = q->zq_oldest;
					if (m == NULL)
						break;
					q->zq_events_discarded++;
					q->zq_bytes_discarded += m->size;
					q->zq_oldest = m->next;
					q->zq_queue_len -= m->size;
					q->zq_queue_messages--;
				}
			}
		}

		/* register new message at the end of the queue */
		q->zq_queue_len += msg->size;
		q->zq_queue_messages++;
		q->zq_bytes_total += msg->size;
		q->zq_events_total++;
		if (q->zq_oldest == NULL)
			q->zq_oldest = msg;

		zev_update_statistics(op, &q->zq_statistics);

		if (q->zq_queue_len > q->zq_wakeup_threshold)
			wakeup = 1;
		if (q->zq_queue_len == msg->size)  /* queue was empty */
			cv_broadcast(&q->zq_condvar);

		zev_queue_release(q);
	}

	zev_queue_trim();

	zev_update_statistics(op, &zev_statistics);
	mutex_exit(&zev_mutex);
	mutex_exit(&zev_queue_msg_mutex);

	/* one or more queues need a pollwakeup() */
	if (op == ZEV_OP_MARK) {
		zev_poll_wakeup(B_TRUE);
	} else if (wakeup) {
		zev_poll_wakeup(B_FALSE);
	}

	return;
}
699 
/*
 * Format and queue a ZEV_OP_ERROR event.  op is the operation that
 * failed; fmt/... is a printf-style description.  If the formatted
 * message would not fit into ZEV_MAX_MESSAGE_LEN, the event is dropped
 * with a cmn_err() warning.  The queued message (including the zev_msg_t
 * header) is owned and eventually freed by the queue machinery.
 */
void
zev_queue_error(int op, char *fmt, ...)
{
	char buf[ZEV_MAX_MESSAGE_LEN];
	va_list ap;
	int len;
	zev_msg_t *msg = NULL;
	zev_error_t *rec;
	int msg_size;

	va_start(ap, fmt);
	len = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	if (len >= sizeof(buf)) {
		/* would have been truncated: drop rather than garble */
		cmn_err(CE_WARN, "zev: can't report error - "
		        "dropping event entirely.");
		return;
	}

	/* +1 for the terminating NUL copied below */
	msg_size = sizeof(*rec) + len + 1;
	msg = zev_alloc(sizeof(*msg) + msg_size);
	msg->size = msg_size;
	rec = (zev_error_t *)(msg + 1);	/* record lives right after header */
	rec->record_len = msg_size;
	rec->op = ZEV_OP_ERROR;
	rec->op_time = ddi_get_time();
	rec->guid = 0;
	rec->failed_op = op;
	rec->errstr_len = len;
	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);

	zev_queue_message(ZEV_OP_ERROR, msg);
	return;
}
734 
/*
 * Resolve a queue name from an ioctl argument to a queue, returned in
 * *out with a reference held (caller must zev_queue_release() it under
 * zev_mutex).  An empty name means "the queue the ioctl arrived on",
 * which is rejected for the control device.
 *
 * Returns 0, EINVAL (bad name/control device) or ENOENT.
 *
 * NOTE(review): in the empty-name fast path zev_queue_hold() is called
 * without zev_mutex held, although hold/release are documented as
 * requiring it.  The caller (zev_ioctl) already holds a reference on
 * req_q, so this looks benign — confirm.
 */
static int
zev_find_queue(zev_queue_t **out, zev_queue_t *req_q, zev_queue_name_t *name)
{
	char namebuf[ZEV_MAX_QUEUE_NAME_LEN+1];
	zev_queue_t *q;
	int i;

	*out = NULL;

	if (name->zev_namelen == 0) {
		if (req_q->zq_minor_number == ZEV_CONTROL_DEVICE_MINOR)
			return EINVAL;
		zev_queue_hold(req_q);
		*out = req_q;
		return 0;
	}

	if (name->zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
		return EINVAL;
	/* user-supplied name need not be NUL-terminated; terminate here */
	strncpy(namebuf, name->zev_name, name->zev_namelen);
	namebuf[name->zev_namelen] = '\0';

	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;
		if (!strcmp(q->zq_name, namebuf)) {
			zev_queue_hold(q);
			mutex_exit(&zev_mutex);
			*out = q;
			return 0;
		}
	}
	mutex_exit(&zev_mutex);
	return ENOENT;
}
772 
773 static int
774 zev_ioc_get_queue_statistics(zev_queue_t *req_q, intptr_t arg, int mode)
775 {
776 	zev_ioctl_get_queue_statistics_t gs;
777 	zev_queue_t *q;
778 	int ret;
779 
780 	if (ddi_copyin((void *)arg, &gs, sizeof(gs), mode) != 0)
781 		return EFAULT;
782 
783 	ret = zev_find_queue(&q, req_q, &gs.zev_queue_name);
784 	if (ret)
785 		return ret;
786 
787 	/* ddi_copyout() can take a long time.  Better make
788 	   a copy to be able to release the mutex faster. */
789 	mutex_enter(&zev_mutex);
790 	memcpy(&gs.zev_statistics, &q->zq_statistics,sizeof(gs.zev_statistics));
791 	gs.zev_statistics.zev_queue_len = q->zq_queue_len;
792 	gs.zev_statistics.zev_bytes_read = q->zq_bytes_read;
793 	gs.zev_statistics.zev_bytes_discarded = q->zq_bytes_discarded;
794 	gs.zev_statistics.zev_max_queue_len = q->zq_max_queue_len;
795 	gs.zev_statistics.zev_cnt_discarded_events = q->zq_events_discarded;
796 	gs.zev_statistics.zev_cnt_total_events = q->zq_events_total;
797 	zev_queue_release(q);
798 	mutex_exit(&zev_mutex);
799 
800 	if (ddi_copyout(&gs, (void *)arg, sizeof(gs), mode) != 0)
801 		return EFAULT;
802 	return 0;
803 }
804 
805 static int
806 zev_ioc_set_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
807 {
808 	zev_ioctl_set_queue_properties_t qp;
809 	zev_queue_t *q;
810 	uint64_t old_max;
811 	uint64_t old_flags;
812 	int ret;
813 
814 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
815 		return EFAULT;
816 	if (qp.zev_max_queue_len > ZEV_MAX_QUEUE_LEN)
817 		return EINVAL;
818 	if (qp.zev_poll_wakeup_threshold > ZEV_MAX_POLL_WAKEUP_QUEUE_LEN)
819 		return EINVAL;
820 
821 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
822 	if (ret)
823 		return ret;
824 
825 	mutex_enter(&zev_mutex);
826 
827 	/*
828 	 * Note: if the PERSISTENT flag is cleared, and the queue is not busy,
829 	 * the queue should be removed by zev_queue_release() in zev_ioctl().
830 	 */
831 	old_flags = qp.zev_flags;
832 	q->zq_flags = qp.zev_flags;
833 	if ((old_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) &&
834 	   (!(qp.zev_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL))) {
835 		/* queue is no longer blocking - wake blocked threads */
836 		cv_broadcast(&zev_condvar);
837 	}
838 
839 	zev_update_blockflag();
840 
841 	old_max = q->zq_max_queue_len;
842 	q->zq_max_queue_len = qp.zev_max_queue_len;
843 	if (q->zq_max_queue_len < old_max)
844 		zev_queue_trim();
845 	if (q->zq_max_queue_len > old_max)
846 		cv_broadcast(&zev_condvar);	/* threads may be waiting */
847 
848 	if ((qp.zev_poll_wakeup_threshold < q->zq_wakeup_threshold) &&
849 	    (qp.zev_poll_wakeup_threshold <= q->zq_queue_len))
850 		pollwakeup(&q->zq_pollhead, POLLIN);
851 	q->zq_wakeup_threshold = qp.zev_poll_wakeup_threshold;
852 
853 	zev_queue_release(q);
854 	mutex_exit(&zev_mutex);
855 	return 0;
856 }
857 
858 static int
859 zev_ioc_get_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
860 {
861 	zev_ioctl_get_queue_properties_t qp;
862 	zev_queue_t *q;
863 	int ret;
864 
865 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
866 		return EFAULT;
867 
868 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
869 	if (ret)
870 		return ret;
871 
872 	mutex_enter(&zev_mutex);
873 	qp.zev_max_queue_len = q->zq_max_queue_len;
874 	qp.zev_flags = q->zq_flags;
875 	qp.zev_poll_wakeup_threshold = q->zq_wakeup_threshold;
876 	zev_queue_release(q);
877 	mutex_exit(&zev_mutex);
878 
879 	if (ddi_copyout(&qp, (void *)arg, sizeof(qp), mode) != 0)
880 		return EFAULT;
881 	return 0;
882 }
883 
884 static int
885 zev_ioc_add_queue(zev_queue_t *req_q, intptr_t arg, int mode)
886 {
887 	zev_ioctl_add_queue_t aq;
888 	zev_queue_t *new_q;
889 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
890 
891 	if (ddi_copyin((void *)arg, &aq, sizeof(aq), mode) != 0)
892 		return EFAULT;
893 
894 	if (aq.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
895 		return EINVAL;
896 	strncpy(name, aq.zev_name, aq.zev_namelen);
897 	name[aq.zev_namelen] = '\0';
898 
899 	return zev_queue_new(&new_q, req_q->zq_dip, name,
900 	                     aq.zev_max_queue_len, aq.zev_flags);
901 }
902 
/*
 * ZEV_IOC_REMOVE_QUEUE: mark the named queue for removal.  The actual
 * teardown happens when its last reference is dropped in
 * zev_queue_release().  An open (busy) queue cannot be removed.
 * Returns ENOENT if no queue has that name, EBUSY if it is open.
 */
static int
zev_ioc_remove_queue(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_remove_queue_t rq;
	zev_queue_t *q;
	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
	int found = 0;
	int i;

	if (ddi_copyin((void *)arg, &rq, sizeof(rq), mode) != 0)
		return EFAULT;

	if (rq.zev_queue_name.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
		return EINVAL;
	/* user-supplied name need not be NUL-terminated; terminate here */
	strncpy(name, rq.zev_queue_name.zev_name,
	        rq.zev_queue_name.zev_namelen);
	name[rq.zev_queue_name.zev_namelen] = '\0';

	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;
		if (!strcmp(q->zq_name, name)) {
			found = 1;
			break;
		}
	}
	if (!found) {
		mutex_exit(&zev_mutex);
		return ENOENT;
	}

	if (q->zq_busy) {
		mutex_exit(&zev_mutex);
		return EBUSY;
	}
	/*
	 * clear flags, so that persistent queues are removed aswell
	 * and the queue becomes non-blocking.
	 */
	q->zq_flags = 0;
	if (q->zq_to_be_removed == B_FALSE) {
		/* drop the creation reference exactly once */
		q->zq_to_be_removed = B_TRUE;
		zev_queue_release(q);
	}
	/* some threads might be waiting for this queue to become writable */
	cv_broadcast(&zev_condvar);

	mutex_exit(&zev_mutex);
	return 0;
}
955 
956 static int
957 zev_ioc_get_debug_info(zev_queue_t *req_q, intptr_t arg, int mode)
958 {
959 	zev_ioctl_debug_info_t di;
960 	uint64_t mem_allocated = atomic_add_64_nv(&zev_memory_allocated, 0);
961 	uint64_t mem_freed     = atomic_add_64_nv(&zev_memory_freed, 0);
962 
963 	zev_chksum_stats(&di.zev_chksum_cache_size,
964 	                 &di.zev_chksum_cache_hits,
965 	                 &di.zev_chksum_cache_misses);
966 	di.zev_memory_allocated = mem_allocated - mem_freed;
967 	if (ddi_copyout(&di, (void *)arg, sizeof(di), mode) != 0)
968 		return EFAULT;
969 	return 0;
970 }
971 
972 static int
973 zev_ioc_get_queue_list(zev_queue_t *req_q, intptr_t arg, int mode)
974 {
975 	zev_ioctl_get_queue_list_t gql;
976 	zev_queue_t *q;
977 	int i = 0;
978 	int count = 0;
979 
980 	memset(&gql, 0, sizeof(gql));
981 
982 	mutex_enter(&zev_mutex);
983 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
984 		q = zev_queues[i - ZEV_MINOR_MIN];
985 		if (!q)
986 			continue;
987 		strncpy(gql.zev_queue_name[count].zev_name,
988 		    q->zq_name, ZEV_MAX_QUEUE_NAME_LEN);
989 		gql.zev_queue_name[count].zev_namelen = strlen(q->zq_name);
990 		count++;
991 	}
992 	gql.zev_n_queues = count;
993 	mutex_exit(&zev_mutex);
994 
995 	if (ddi_copyout(&gql, (void *)arg, sizeof(gql), mode) != 0)
996 		return EFAULT;
997 	return 0;
998 }
999 
1000 static int
1001 zev_ioc_set_max_queue_len(zev_queue_t *req_q, intptr_t arg, int mode)
1002 {
1003 	uint64_t len;
1004 	int i;
1005 	zev_queue_t *q;
1006 
1007 	if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) {
1008 		return EFAULT;
1009 	}
1010 	if (len > ZEV_MAX_QUEUE_LEN) {
1011 		return EINVAL;
1012 	}
1013 	mutex_enter(&zev_mutex);
1014 	zev_statistics.zev_max_queue_len = len;
1015 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1016 		q = zev_queues[i - ZEV_MINOR_MIN];
1017 		if (!q)
1018 			continue;
1019 		if (q->zq_max_queue_len <=
1020 		    zev_statistics.zev_max_queue_len)
1021 			continue;
1022 		q->zq_max_queue_len = zev_statistics.zev_max_queue_len;
1023 	}
1024 	cv_broadcast(&zev_condvar);
1025 	mutex_exit(&zev_mutex);
1026 	return 0;
1027 }
1028 
1029 /* ARGSUSED */
1030 static int
1031 zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1032 {
1033 	zev_statistics_t zs;
1034 	zev_ioctl_poolarg_t pa;
1035 	zev_ioctl_mark_t mark;
1036 	zev_mark_t *rec;
1037 	int msg_size;
1038 	zev_msg_t *msg;
1039 	uint64_t mark_id;
1040 	minor_t minor;
1041 	zev_queue_t *req_q;
1042 	int ret = 0;
1043 
1044 	minor = getminor(dev);
1045 	mutex_enter(&zev_mutex);
1046 	if ((req_q = ddi_get_soft_state(statep, minor)) == NULL) {
1047 		mutex_exit(&zev_mutex);
1048 		return (ENXIO);
1049 	}
1050 	zev_queue_hold(req_q);
1051 	mutex_exit(&zev_mutex);
1052 	/*
1053 	 * all structures passed between kernel and userspace
1054 	 * are now compatible between 64 and 32 bit.  Model
1055 	 * conversion can be ignored.
1056 	 */
1057 	switch (cmd) {
1058 	case ZEV_IOC_GET_GLOBAL_STATISTICS:
1059 		/* ddi_copyout() can take a long time.  Better make
1060 		   a copy to be able to release the mutex faster. */
1061 		mutex_enter(&zev_mutex);
1062 		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
1063 		mutex_exit(&zev_mutex);
1064 		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
1065 			ret = EFAULT;
1066 		break;
1067 	case ZEV_IOC_GET_QUEUE_STATISTICS:
1068 		ret = zev_ioc_get_queue_statistics(req_q, arg, mode);
1069 		break;
1070 	case ZEV_IOC_MUTE_POOL:
1071 	case ZEV_IOC_UNMUTE_POOL:
1072 		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) {
1073 			ret = EFAULT;
1074 			break;
1075 		}
1076 		if (pa.zev_poolname_len >=MAXPATHLEN) {
1077 			ret = EINVAL;
1078 			break;
1079 		}
1080 		pa.zev_poolname[pa.zev_poolname_len] = '\0';
1081 		if (cmd == ZEV_IOC_MUTE_POOL) {
1082 			ret = zev_ioc_mute_pool(pa.zev_poolname);
1083 		} else {
1084 			ret = zev_ioc_unmute_pool(pa.zev_poolname);
1085 		}
1086 		break;
1087 	case ZEV_IOC_SET_MAX_QUEUE_LEN:
1088 		ret = zev_ioc_set_max_queue_len(req_q, arg, mode);
1089 		break;
1090 	case ZEV_IOC_GET_QUEUE_PROPERTIES:
1091 		ret = zev_ioc_get_queue_properties(req_q, arg, mode);
1092 		break;
1093 	case ZEV_IOC_SET_QUEUE_PROPERTIES:
1094 		ret = zev_ioc_set_queue_properties(req_q, arg, mode);
1095 		break;
1096 	case ZEV_IOC_MARK:
1097 		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) {
1098 			ret = EFAULT;
1099 			break;
1100 		}
1101 		/* prepare message */
1102 		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
1103 		msg = zev_alloc(sizeof(*msg) + msg_size);
1104 		msg->size = msg_size;
1105 		rec = (zev_mark_t *)(msg + 1);
1106 		rec->record_len = msg_size;
1107 		rec->op = ZEV_OP_MARK;
1108 		rec->op_time = ddi_get_time();
1109 		rec->guid = mark.zev_guid;
1110 		rec->payload_len = mark.zev_payload_len;
1111 		/* get payload */
1112 		if (ddi_copyin(((char *)arg) + sizeof(mark),
1113 		               ZEV_PAYLOAD(rec),
1114 		               mark.zev_payload_len, mode) != 0) {
1115 			zev_free(msg, msg_size);
1116 			ret = EFAULT;
1117 			break;
1118 		}
1119 		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
1120 		/* get mark id and queue message */
1121 		mutex_enter(&zev_mark_id_mutex);
1122 		mark_id = zev_mark_id++;
1123 		mutex_exit(&zev_mark_id_mutex);
1124 		rec->mark_id = mark_id;
1125 		zev_queue_message(ZEV_OP_MARK, msg);
1126 		/* report mark id to userland, ignore errors */
1127 		mark.zev_mark_id = mark_id;
1128 		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
1129 		break;
1130 	case ZEV_IOC_ADD_QUEUE:
1131 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1132 			ret = EACCES;
1133 			break;
1134 		}
1135 		ret = zev_ioc_add_queue(req_q, arg, mode);
1136 		break;
1137 	case ZEV_IOC_REMOVE_QUEUE:
1138 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1139 			ret = EACCES;
1140 			break;
1141 		}
1142 		ret = zev_ioc_remove_queue(req_q, arg, mode);
1143 		break;
1144 	case ZEV_IOC_GET_DEBUG_INFO:
1145 		ret = zev_ioc_get_debug_info(req_q, arg, mode);
1146 		break;
1147 	case ZEV_IOC_GET_QUEUE_LIST:
1148 		ret = zev_ioc_get_queue_list(req_q, arg, mode);
1149 		break;
1150 	case ZEV_IOC_GET_FILE_SIGNATURES:
1151 		ret = zev_ioc_get_signatures(arg, mode);
1152 		break;
1153 	default:
1154 		/* generic "ioctl unknown" error */
1155 		ret = ENOTTY;
1156 	}
1157 
1158 	mutex_enter(&zev_mutex);
1159 	zev_queue_release(req_q);
1160 	mutex_exit(&zev_mutex);
1161 	if (ret)
1162 		SET_ERROR(ret);
1163 	return (ret);
1164 }
1165 
1166 static int
1167 zev_chpoll(dev_t dev, short events, int anyyet,
1168     short *reventsp, struct pollhead **phpp)
1169 {
1170 	int minor;
1171 	short revent = 0;
1172 	zev_queue_t *q;
1173 
1174 	/* use minor-specific queue context and it's pollhead */
1175 	minor = getminor(dev);
1176 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1177 		return (EINVAL);
1178 	mutex_enter(&zev_mutex);
1179 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1180 		mutex_exit(&zev_mutex);
1181 		return (ENXIO);
1182 	}
1183 	revent = 0;
1184 	if ((events & POLLIN)) {
1185 		if (q->zq_oldest)
1186 			revent |= POLLIN;
1187 	}
1188 	if (revent == 0) {
1189 		if (!anyyet) {
1190 			*phpp = &q->zq_pollhead;
1191 		}
1192 	}
1193 	*reventsp = revent;
1194 	mutex_exit(&zev_mutex);
1195 	return (0);
1196 }
1197 
/* ARGSUSED */
/*
 * read(2) entry point for queue devices.  Copies out as many complete
 * messages as fit into the caller's buffer.  Blocks (interruptibly) until
 * at least one message is queued; returns E2BIG if the buffer cannot hold
 * even the oldest message.  Messages are never split across reads.
 */
static int
zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
{
	minor_t minor;
	offset_t off;
	int ret = 0;
	zev_msg_t *msg;
	char *data;
	zev_queue_t *q;

	minor = getminor(dev);
	/* the control device carries no event stream */
	if (minor == ZEV_CONTROL_DEVICE_MINOR)
		return (EINVAL);

	mutex_enter(&zev_mutex);
	q = ddi_get_soft_state(statep, minor);
	if (q == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	/* remember the offset so it can be restored after uiomove() */
	off = uio_p->uio_loffset;
	msg = q->zq_oldest;
	while (msg == NULL) {
		if (!ddi_can_receive_sig()) {
			/*
			 * read() shouldn't block because this thread
			 * can't receive signals. (e.g., it might be
			 * torn down by exit() right now.)
			 */
			mutex_exit(&zev_mutex);
			return 0;
		}
		if (cv_wait_sig(&q->zq_condvar, &zev_mutex) == 0) {
			/* signal received. */
			mutex_exit(&zev_mutex);
			return EINTR;
		}
		msg = q->zq_oldest;
	}
	/* caller's buffer can't even hold the oldest message */
	if (msg->size > uio_p->uio_resid) {
		mutex_exit(&zev_mutex);
		return E2BIG;
	}
	/* copy out whole messages while they fit into the remaining buffer */
	while (msg && uio_p->uio_resid >= msg->size) {
		data = (char *)(msg + 1);
		ret = uiomove(data, msg->size, UIO_READ, uio_p);
		if (ret != 0) {
			mutex_exit(&zev_mutex);
			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
			uio_p->uio_loffset = off;
			return (ret);
		}
		/* dequeue under zev_mutex and update accounting */
		q->zq_oldest = msg->next;
		q->zq_bytes_read += msg->size;
		q->zq_queue_len -= msg->size;
		q->zq_queue_messages--;
		msg->read++;
		msg = q->zq_oldest;
	}
	/* drop messages that every queue has now consumed */
	zev_queue_trim();
	/* let waiters on zev_condvar re-check the (now shorter) queue */
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);
	/* undo the offset advance done by uiomove(); device is a stream */
	uio_p->uio_loffset = off;
	return 0;
}
1264 
1265 /* ARGSUSED */
1266 static int
1267 zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
1268 {
1269 	zev_queue_t *q;
1270 	int minor;
1271 
1272 	minor = getminor(dev);
1273 	if (otyp != OTYP_CHR)
1274 		return (EINVAL);
1275 	mutex_enter(&zev_mutex);
1276 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1277 		mutex_exit(&zev_mutex);
1278 		return (ENXIO);
1279 	}
1280 	if (q->zq_busy != B_TRUE) {
1281 		mutex_exit(&zev_mutex);
1282 		return (EINVAL);
1283 	}
1284 	q->zq_busy = B_FALSE;
1285 	if ((q->zq_flags & ZEV_FL_PERSISTENT) == 0)
1286 		zev_queue_release(q);
1287 	mutex_exit(&zev_mutex);
1288 	return (0);
1289 }
1290 
1291 /* ARGSUSED */
1292 static int
1293 zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1294 {
1295 	zev_queue_t *q;
1296 	minor_t minor;
1297 
1298 	minor = getminor(*devp);
1299 	if (otyp != OTYP_CHR)
1300 		return (EINVAL);
1301 	if (drv_priv(credp) != 0)
1302 		return (EPERM);
1303 	mutex_enter(&zev_mutex);
1304 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1305 		mutex_exit(&zev_mutex);
1306 		return (ENXIO);
1307 	}
1308 	if (minor == ZEV_CONTROL_DEVICE_MINOR) {
1309 		/* control device may be used in parallel */
1310 		q->zq_busy = B_TRUE;
1311 		mutex_exit(&zev_mutex);
1312 		return 0;
1313 	}
1314 	if (q->zq_busy == B_TRUE) {
1315 		mutex_exit(&zev_mutex);
1316 		return (EBUSY);
1317 	}
1318 	q->zq_busy = B_TRUE;	/* can only be opened exclusively */
1319 	mutex_exit(&zev_mutex);
1320 	return (0);
1321 }
1322 
/*
 * Character device entry points.  zev is a pure pseudo driver: events are
 * consumed via read(2), ioctl(2) and poll(2); there is no write, mmap or
 * strategy support (nodev).
 */
static struct cb_ops zev_cb_ops = {
	zev_open,		/* open */
	zev_close,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	zev_read,		/* read */
	nodev,			/* write */
	zev_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	zev_chpoll,		/* chpoll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* streamtab */
	D_MP | D_64BIT,		/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* aread */
	nodev,			/* awrite */
};
1343 
/*
 * Release all per-instance resources: minor nodes, the pollwakeup thread,
 * the "ctrl" dummy queue's soft state and every remaining event queue.
 * Called from zev_detach() and from the zev_attach() failure path.
 */
static void
zev_free_instance(dev_info_t *dip)
{
	int instance;
	zev_queue_t *q;
	int i;

	/* only instance 0 exists (hardcoded in zev.conf) */
	instance = ddi_get_instance(dip);
	if (instance != 0) {
		cmn_err(CE_WARN, "zev: tried to free instance != 0 (%d)",
		        instance);
		return;
	}

	ddi_remove_minor_node(dip, NULL);

	/* stop pollwakeup thread before tearing down the queues it scans */
	zev_wakeup_thread_run = 0;
	if (zev_poll_wakeup_thread != NULL) {
		thread_join(zev_poll_wakeup_thread->t_did);
		zev_poll_wakeup_thread = NULL;
	}

	mutex_enter(&zev_mutex);

	/* remove "ctrl" dummy queue */
	q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
	if (q) {
		ddi_soft_state_free(statep, ZEV_CONTROL_DEVICE_MINOR);
		ZEV_MEM_SUB(sizeof(zev_queue_t));
	}

	/* remove all other queues */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i- ZEV_MINOR_MIN];
		if (!q)
			continue;
		/* this instance must hold the only remaining reference */
		ASSERT(q->zq_refcnt == 1);
		zev_queue_release(q);
	}
	/* drop any messages no longer referenced by a queue */
	zev_queue_trim();
	bzero(&zev_queues, sizeof(zev_queues));

	mutex_exit(&zev_mutex);

}
1390 
/*
 * DDI detach entry point.  Refuses to detach while the control device is
 * open or any event queues still exist; otherwise unhooks the zev
 * callbacks from ZFS and frees all per-instance resources.  Suspend is
 * always refused.
 */
static int
zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance;
	zev_queue_t *q;

	/* called once per instance with DDI_DETACH,
	   may be called to suspend */
	switch (cmd) {
	case DDI_DETACH:
		/* instance busy? */
		instance = ddi_get_instance(dip);
		if (instance != 0) {	/* hardcoded in zev.conf */
			/* this module only supports one instance. */
			return (DDI_FAILURE);
		}

		mutex_enter(&zev_mutex);
		if (!zev_attached) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}

		/* check "ctrl" queue to see if it is busy */
		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
		if (q == NULL) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		if (q->zq_busy) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		/* are there any queues? */
		if (zev_queue_cnt > 0) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}

		zev_attached = B_FALSE;
		mutex_exit(&zev_mutex);

		/* switch ZFS event callbacks back to default */
		rw_enter(&rz_zev_rwlock, RW_WRITER);
		rz_zev_callbacks = rz_zev_default_callbacks;
		rz_zev_set_active(B_FALSE);
		rw_exit(&rz_zev_rwlock);

		/* no thread is inside of the callbacks anymore. */

		/* free resources allocated for this instance */
		zev_free_instance(dip);
		zev_chksum_fini();
#if 0
		cmn_err(CE_WARN, "zev: allocated memory at detach: %" PRIu64,
			zev_memory_allocated - zev_memory_freed);
#endif
		return (DDI_SUCCESS);
	case DDI_SUSPEND:
		/* kernel must not suspend zev devices while ZFS is running */
		return (DDI_FAILURE);
	default:
		return (DDI_FAILURE);
	}
}
1456 
/*
 * DDI attach entry point.  Creates the "ctrl" control device and its
 * dummy queue, sets up the default event queue and the pollwakeup
 * thread, then hooks the zev callbacks into ZFS.  On any failure all
 * partially-created state is released via zev_free_instance().
 */
static int
zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	/* called once per instance with DDI_ATTACH,
	   may be called to resume */
	int instance;
	int error;
	zev_queue_t *q;
	switch (cmd) {
	case DDI_ATTACH:
		/* create instance state */
		instance = ddi_get_instance(dip);
		if (instance != 0) {	/* hardcoded in zev.conf */
			/* this module only supports one instance. */
			return (DDI_FAILURE);
		}

		mutex_enter(&zev_mutex);
		if (zev_attached) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		if (ddi_soft_state_zalloc(statep, ZEV_CONTROL_DEVICE_MINOR) !=
		    DDI_SUCCESS) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		ZEV_MEM_ADD(sizeof(zev_queue_t));
		zev_attached = B_TRUE;

		/* init queue list */
		bzero(&zev_queues, sizeof(zev_queues));
		mutex_exit(&zev_mutex);

		/* create a dummy queue for management of "ctrl" */

		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
		q->zq_dip = dip;
		q->zq_refcnt = 1;
		q->zq_busy = B_FALSE;
		q->zq_minor_number = ZEV_CONTROL_DEVICE_MINOR;
		q->zq_flags = ZEV_FL_PERSISTENT;
		strcpy(q->zq_name, ZEV_CONTROL_DEVICE_NAME);

		/* create device node for "ctrl" */
		if (ddi_create_minor_node(dip, ZEV_CONTROL_DEVICE_NAME,
		    S_IFCHR, ZEV_CONTROL_DEVICE_MINOR,
		    DDI_PSEUDO, 0) == DDI_FAILURE) {
			goto fail;
		}

		/* note: intentionally not adding ctrl queue to queue list. */

		/* default queue */
		error = zev_queue_new(&q, dip,
				      ZEV_DEFAULT_QUEUE_NAME,
				      ZEV_MAX_QUEUE_LEN,
				      ZEV_FL_BLOCK_WHILE_QUEUE_FULL|
		                      ZEV_FL_PERSISTENT);
		if (error)
			goto fail;

		/* start pollwakeup thread */
		zev_wakeup_thread_run = 1;
		zev_poll_wakeup_thread = thread_create(NULL, 0,
		    zev_poll_wakeup_thread_main, NULL, 0, &p0,
		    TS_RUN, minclsyspri);

		ddi_report_dev(dip);

		zev_chksum_init();

		/* switch ZFS event callbacks to zev module callbacks */
		rw_enter(&rz_zev_rwlock, RW_WRITER);
		rz_zev_callbacks = &zev_callbacks;
		rz_zev_set_active(B_TRUE);
		rw_exit(&rz_zev_rwlock);

		return (DDI_SUCCESS);
	case DDI_RESUME:
		/* nothing to resume: suspend is never allowed (see detach) */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
fail:
	cmn_err(CE_WARN, "zev: attach failed");
	zev_free_instance(dip);
	mutex_enter(&zev_mutex);
	zev_attached = B_FALSE;
	mutex_exit(&zev_mutex);
	return (DDI_FAILURE);
}
1550 
1551 /* ARGSUSED */
1552 static int
1553 zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
1554 {
1555 	minor_t minor;
1556 	zev_queue_t *q;
1557 
1558 	/* arg is dev_t */
1559 	minor = getminor((dev_t)arg);
1560 	mutex_enter(&zev_mutex);
1561 	q = ddi_get_soft_state(statep, minor);
1562 	if (q == NULL) {
1563 		*resultp = NULL;
1564 		mutex_exit(&zev_mutex);
1565 		return (DDI_FAILURE);
1566 	}
1567 
1568 	switch (infocmd) {
1569 	case DDI_INFO_DEVT2DEVINFO:
1570 		*resultp = q->zq_dip;
1571 		break;
1572 	case DDI_INFO_DEVT2INSTANCE:
1573 		*resultp = (void *)(uintptr_t)ddi_get_instance(q->zq_dip);
1574 		break;
1575 	default:
1576 		mutex_exit(&zev_mutex);
1577 		return (DDI_FAILURE);
1578 	}
1579 	mutex_exit(&zev_mutex);
1580 	return (DDI_SUCCESS);
1581 }
1582 
/*
 * Device operations: attach/detach/getinfo plus the character device
 * entry points in zev_cb_ops.  No bus ops or power management needed
 * for a pseudo device.
 */
static struct dev_ops zev_dev_ops = {
	DEVO_REV,			/* driver build revision */
	0,				/* driver reference count */
	zev_getinfo,			/* getinfo */
	nulldev,			/* identify (obsolete) */
	nulldev,			/* probe (search for devices) */
	zev_attach,			/* attach */
	zev_detach,			/* detach */
	nodev,				/* reset (obsolete, use quiesce) */
	&zev_cb_ops,			/* character and block device ops */
	NULL,				/* bus driver ops */
	NULL,				/* power management, not needed */
	ddi_quiesce_not_needed,		/* quiesce */
};
1597 
/* loadable-module (driver) linkage description */
static struct modldrv zev_modldrv = {
	&mod_driverops,			/* all loadable modules use this */
	"zev ZFS event provider, v1.0",	/* driver name and version info */
	&zev_dev_ops			/* ops method pointers */
};
1603 
/* module linkage passed to mod_install()/mod_remove()/mod_info() */
static struct modlinkage zev_modlinkage = {
	MODREV_1,	/* fixed value */
	{
		&zev_modldrv,	/* driver linkage structure */
		NULL		/* list terminator */
	}
};
1611 
1612 int
1613 _init(void)
1614 {
1615 	int error;
1616 
1617 	if ((error = ddi_soft_state_init(&statep, sizeof(zev_queue_t), 1)) != 0)
1618 		return (error);
1619 	zev_attached = B_FALSE;
1620 
1621 	zev_queue_head = NULL;
1622 	zev_queue_tail = NULL;
1623 	zev_queue_len = 0;
1624 	zev_muted_pools_head = NULL;
1625 	zev_memory_allocated = 0;
1626 	zev_memory_freed = 0;
1627 	zev_queue_cnt = 0;
1628 	zev_have_blocking_queues = 1;
1629 
1630 	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
1631 	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
1632 	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
1633 	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
1634 	zev_mark_id = gethrtime();
1635 	mutex_init(&zev_queue_msg_mutex, NULL, MUTEX_DRIVER, NULL);
1636 	zev_msg_sequence_number = gethrtime();
1637 	bzero(&zev_statistics, sizeof(zev_statistics));
1638 	bzero(&zev_pollhead, sizeof(zev_pollhead));
1639 	bzero(&zev_queues, sizeof(zev_queues));
1640 	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
1641 	if (zev_ioc_mute_pool("zg0")) {
1642 		cmn_err(CE_WARN, "zev: could not init mute list");
1643 		goto FAIL;
1644 	}
1645 
1646 	if ((error = mod_install(&zev_modlinkage)) != 0) {
1647 		cmn_err(CE_WARN, "zev: could not install module");
1648 		goto FAIL;
1649 	}
1650 
1651 	return (0);
1652 FAIL:
1653 	/* free resources */
1654 	cmn_err(CE_WARN, "zev: _init failed");
1655 	mutex_destroy(&zev_mutex);
1656 	ddi_soft_state_fini(&statep);
1657 	return (error);
1658 }
1659 
/* Module information entry point: report name/version via the linkage. */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&zev_modlinkage, modinfop));
}
1665 
/*
 * Module unload entry point.  Refuses to unload while the driver is
 * attached or queues exist, drains the global event list to unblock any
 * writers, detaches the ZFS callbacks, removes the module and finally
 * frees all remaining global state.
 */
int
_fini(void)
{
	int error = 0;
	zev_msg_t *msg;
	zev_pool_list_entry_t *pe, *npe;

	mutex_enter(&zev_mutex);
	if (zev_attached == B_TRUE) {
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}
	if (zev_queue_cnt != 0) {
		/* should never happen */
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}

	/*
	 * avoid deadlock if event list is full: make sure threads currently
	 * blocking on the event list can append their event and then release
	 * rz_zev_rwlock.  Since there should be no queues left when we
	 * reach this point we can simply empty the event list and then
	 * wake everybody.
	 */
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		zev_free(msg, sizeof(*msg) + msg->size);
	}
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);

	/* switch ZFS event callbacks back to default (again) */
	rw_enter(&rz_zev_rwlock, RW_WRITER);
	rz_zev_callbacks = rz_zev_default_callbacks;
	rz_zev_set_active(B_FALSE);
	rw_exit(&rz_zev_rwlock);

	/* no thread is inside of the callbacks anymore.  Safe to remove. */

	/* unload module callbacks */
	if ((error = mod_remove(&zev_modlinkage)) != 0) {
		cmn_err(CE_WARN, "mod_remove failed: %d", error);
		return (error);
	}

	/* free resources */
	/* drain any events appended after the first drain above */
	mutex_enter(&zev_mutex);
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		zev_free(msg, sizeof(*msg) + msg->size);
	}
	mutex_exit(&zev_mutex);
	/* free the muted-pools list */
	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	pe = zev_muted_pools_head;
	while (pe) {
		npe = pe;
		pe = pe->next;
		zev_free(npe, sizeof(*npe));
	}
	rw_exit(&zev_pool_list_rwlock);
	/* destroy soft state and all synchronization primitives */
	ddi_soft_state_fini(&statep);
	rw_destroy(&zev_pool_list_rwlock);
	cv_destroy(&zev_condvar);
	mutex_destroy(&zev_mutex);
	mutex_destroy(&zev_mark_id_mutex);
	mutex_destroy(&zev_queue_msg_mutex);

	return (0);
}
1738 
1739