xref: /titanic_41/usr/src/uts/common/fs/zev/zev.c (revision a8e9a76e00d1b97ce5e2931d804c43c146eb1eed)
1 #include <sys/modctl.h>
2 #include <sys/ddi.h>
3 #include <sys/sunddi.h>
4 #include <sys/conf.h>
5 #include <sys/devops.h>
6 #include <sys/stat.h>
7 #include <sys/fs/zev.h>
8 #include <sys/zev_callbacks.h>
9 #include <sys/zev_checksums.h>
10 #include <sys/zfs_znode.h>
11 #include <sys/time.h>
12 #include <sys/sa.h>
13 #include <sys/zap.h>
14 #include <sys/time.h>
15 
/* Byte offset of member m within structure type s. */
#define	OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))

/* Name of the default queue. */
#define	ZEV_DEFAULT_QUEUE_NAME		"beaver"

/*
 * Minor number layout: the control device uses minor 0 and the queue
 * devices use the ZEV_MAX_QUEUES minors that follow it.  A queue with
 * minor number n is stored in zev_queues[n - ZEV_MINOR_MIN].
 */
#define	ZEV_CONTROL_DEVICE_MINOR	0
#define	ZEV_MINOR_MIN			(ZEV_CONTROL_DEVICE_MINOR + 1)
#define	ZEV_MINOR_MAX			(ZEV_MINOR_MIN + ZEV_MAX_QUEUES - 1)
22 
23 typedef struct zev_queue {
24 	char			zq_name[ZEV_MAX_QUEUE_NAME_LEN+1];
25 	minor_t			zq_minor_number;
26 	dev_info_t		*zq_dip;
27 	struct pollhead		zq_pollhead;
28 	uint64_t		zq_bytes_read;
29 	uint64_t		zq_events_read;
30 	uint64_t		zq_bytes_discarded;
31 	uint64_t		zq_events_discarded;
32 	uint64_t		zq_bytes_total;
33 	uint64_t		zq_events_total;
34 	uint64_t		zq_wakeup_threshold;
35 	uint16_t		zq_flags;
36 	uint16_t		zq_need_wakeup;
37 	/* protected by zev_mutex */
38 	int			zq_refcnt;
39 	uint64_t		zq_queue_len;
40 	uint64_t		zq_queue_messages;
41 	uint64_t		zq_max_queue_len;
42 	zev_msg_t		*zq_oldest;
43 	boolean_t		zq_busy;
44 	boolean_t		zq_to_be_removed;
45 	zev_statistics_t	zq_statistics;
46 	kcondvar_t		zq_condvar;
47 } zev_queue_t;
48 
49 static void		*statep;
50 struct pollhead		zev_pollhead;
51 
52 kmutex_t		zev_mutex;
53 kcondvar_t		zev_condvar;
54 kmutex_t		zev_queue_msg_mutex;
55 krwlock_t		zev_pool_list_rwlock;
56 static zev_statistics_t	zev_statistics;
57 static boolean_t	zev_attached;
58 static kmutex_t		zev_mark_id_mutex;
59 static uint64_t		zev_mark_id = 0;
60 
61 static uint64_t		zev_msg_sequence_number = 0;
62 static zev_queue_t	*zev_queues[ZEV_MAX_QUEUES];
63 static int		zev_queue_cnt = 0;
64 
65 uint64_t	zev_memory_allocated = 0;
66 uint64_t	zev_memory_freed = 0;
67 
68 /*
69  * The longest potential message is from zev_zfs_mount() and
70  * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
71  *
72  * Another candidate is zev_znode_rename_cb() and contains three inode
73  * numbers and two filenames of up to MAXNAMELEN bytes each.
74  */
75 #define ZEV_MAX_MESSAGE_LEN	4096
76 
77 static zev_msg_t *zev_queue_head = NULL;
78 static zev_msg_t *zev_queue_tail = NULL;
79 static uint64_t zev_queue_len = 0;
80 
81 
82 typedef struct zev_pool_list_entry {
83 	struct zev_pool_list_entry	*next;
84 	char				name[MAXPATHLEN];
85 } zev_pool_list_entry_t;
86 
87 static zev_pool_list_entry_t *zev_muted_pools_head = NULL;
88 
89 static volatile int zev_wakeup_thread_run = 1;
90 static kthread_t *zev_poll_wakeup_thread = NULL;
91 
92 void *
93 zev_alloc(ssize_t sz)
94 {
95 	ZEV_MEM_ADD(sz);
96 	return kmem_alloc(sz, KM_SLEEP);
97 }
98 
99 void *
100 zev_zalloc(ssize_t sz)
101 {
102 	ZEV_MEM_ADD(sz);
103 	return kmem_zalloc(sz, KM_SLEEP);
104 }
105 
106 void
107 zev_free(void *ptr, ssize_t sz)
108 {
109 	ZEV_MEM_SUB(sz);						\
110 	kmem_free(ptr, sz);
111 }
112 
113 int
114 zev_queue_cmp(const void *a, const void *b)
115 {
116 	const zev_queue_t *qa = a;
117 	const zev_queue_t *qb = b;
118 	if (qa->zq_minor_number > qb->zq_minor_number)
119 		return 1;
120 	if (qa->zq_minor_number < qb->zq_minor_number)
121 		return -1;
122 	return 0;
123 }
124 
125 /* must be called with zev_mutex held */
126 void
127 zev_queue_trim(void)
128 {
129 	zev_msg_t *m;
130 	uint64_t oldest_message;
131 	zev_queue_t *q;
132 	int i;
133 
134 	if (!zev_queue_tail)
135 		return;
136 
137 	oldest_message = zev_queue_tail->seq + 1;  /* does not exist, yet. */
138 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
139 		q = zev_queues[i - ZEV_MINOR_MIN];
140 		if (q == NULL)
141 			continue;
142 		if (!q->zq_oldest)
143 			continue;
144 		if (oldest_message > q->zq_oldest->seq)
145 			oldest_message = q->zq_oldest->seq;
146 	}
147 
148 	/* remove msgs between oldest_message and zev_queue_head */
149 	while(zev_queue_head && (oldest_message > zev_queue_head->seq)) {
150 		m = zev_queue_head;
151 		zev_queue_head = m->next;
152 		if (zev_queue_head == NULL) {
153 			zev_queue_tail = NULL;
154 		} else {
155 			zev_queue_head->prev = NULL;
156 		}
157 		if (m->read == 0) {
158 			zev_statistics.zev_bytes_discarded += m->size;
159 			zev_statistics.zev_cnt_discarded_events++;
160 		}
161 		zev_statistics.zev_queue_len -= m->size;
162 		zev_queue_len--;
163 		zev_free(m, sizeof(*m) + m->size);
164 	}
165 }
166 
167 /* must be called with zev_mutex held */
168 static void
169 zev_queue_hold(zev_queue_t *q)
170 {
171 	q->zq_refcnt++;
172 }
173 
174 /* must be called with zev_mutex held */
175 static void
176 zev_queue_release(zev_queue_t *q)
177 {
178 	q->zq_refcnt--;
179 	if (q->zq_refcnt > 0)
180 		return;
181 
182 	ASSERT(q->zq_busy == B_FALSE);
183 
184 	/* persistent queues will not be removed */
185 	if ((q->zq_flags & ZEV_FL_PERSISTENT) != 0)
186 		return;
187 
188 	/* remove queue from queue list */
189 	zev_queues[q->zq_minor_number - ZEV_MINOR_MIN] = NULL;
190 
191 	/* discard messages that no queue references anymore */
192 	zev_queue_trim();
193 
194 	cv_destroy(&q->zq_condvar);
195 	ddi_remove_minor_node(q->zq_dip, q->zq_name);
196 	ddi_soft_state_free(statep, q->zq_minor_number);
197 	ZEV_MEM_SUB(sizeof(zev_queue_t));
198 	zev_queue_cnt--;
199 }
200 
201 int
202 zev_queue_new(zev_queue_t **queue,
203               dev_info_t *dip,
204               char *name,
205               uint64_t max_queue_len,
206               uint16_t flags)
207 {
208 	zev_queue_t *q;
209 	zev_queue_t *tmp;
210 	zev_msg_t *msg;
211 	int name_exists = 0;
212 	minor_t minor;
213 	char *p;
214 	int i;
215 
216 	if (max_queue_len > ZEV_MAX_QUEUE_LEN)
217 		return EINVAL;
218 	if (max_queue_len == 0)
219 		max_queue_len = ZEV_MAX_QUEUE_LEN;
220 	if (!strcmp(name, ZEV_CONTROL_DEVICE_NAME))
221 		return EINVAL;
222 	for (p = name; *p; p++) {
223 		if (*p >= 'a' && *p <= 'z')
224 			continue;
225 		if (*p >= '0' && *p <= '9')
226 			continue;
227 		if (*p == '.')
228 			continue;
229 		return EINVAL;
230 	}
231 
232 	mutex_enter(&zev_mutex);
233 
234 	/* find free minor number.*/
235 	/* if this were a frequent operation we'd have a free-minor list */
236 	for (minor = ZEV_MINOR_MIN; minor <= ZEV_MINOR_MAX; minor++) {
237 		tmp = zev_queues[minor - ZEV_MINOR_MIN];
238 		if (tmp == NULL)
239 			break;
240 	}
241 	if (tmp) {
242 		mutex_exit(&zev_mutex);
243 		return ENOSPC;
244 	}
245 
246 	if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) {
247 		mutex_exit(&zev_mutex);
248 		return ENOSPC;
249 	}
250 	ZEV_MEM_ADD(sizeof(zev_queue_t));
251 
252 	q = ddi_get_soft_state(statep, minor);
253 	memset(q, 0, sizeof(*q));
254 	strncpy(q->zq_name, name, ZEV_MAX_QUEUE_NAME_LEN);
255 	q->zq_name[ZEV_MAX_QUEUE_NAME_LEN] = '\0';
256 	q->zq_max_queue_len = max_queue_len;
257 	q->zq_wakeup_threshold = ZEV_DEFAULT_POLL_WAKEUP_QUEUE_LEN;
258 	q->zq_flags = flags;
259 	q->zq_refcnt = 1;
260 	q->zq_dip = dip;
261 	q->zq_minor_number = minor;
262 	cv_init(&q->zq_condvar, NULL, CV_DRIVER, NULL);
263 
264 	/* insert into queue list */
265 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
266 		/* if this were a frequent operation we'd have a name tree */
267 		if (zev_queues[i - ZEV_MINOR_MIN] == NULL)
268 			continue;
269 		if (!strcmp(q->zq_name, zev_queues[i-ZEV_MINOR_MIN]->zq_name)) {
270 			name_exists = 1;
271 			break;
272 		}
273 	}
274 	if (name_exists) {
275 		ddi_soft_state_free(statep, minor);
276 		ZEV_MEM_SUB(sizeof(zev_queue_t));
277 		mutex_exit(&zev_mutex);
278 		return EEXIST;
279 	}
280 	zev_queues[minor - ZEV_MINOR_MIN] = q;
281 	zev_queue_cnt++;
282 
283 	/* calculate current queue len and find head and tail */
284 	q->zq_oldest = zev_queue_tail;
285 	msg = zev_queue_tail;
286 	while ((msg != NULL) && (q->zq_queue_len < q->zq_max_queue_len)) {
287 		q->zq_queue_len += msg->size;
288 		q->zq_queue_messages++;
289 		q->zq_oldest = msg;
290 		msg = msg->prev;
291 	}
292 
293 	mutex_exit(&zev_mutex);
294 
295 	if (ddi_create_minor_node(dip, name,
296 	    S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
297 		mutex_enter(&zev_mutex);
298 		zev_queues[minor - ZEV_MINOR_MIN] = NULL;
299 		zev_queue_cnt--;
300 		ddi_soft_state_free(statep, minor);
301 		ZEV_MEM_SUB(sizeof(zev_queue_t));
302 		mutex_exit(&zev_mutex);
303 		return EFAULT;
304 	}
305 
306 	*queue = q;
307 	return 0;
308 }
309 
/*
 * poll() wakeup thread.  Used to check periodically whether we have
 * bytes left in the queue that have not yet been made into a
 * pollwakeup() call.  This is meant to ensure a maximum waiting
 * time until an event is presented as a poll wakeup, while at
 * the same time not making every single event into a poll wakeup
 * of its own.
 */
318 
319 static void
320 zev_poll_wakeup(boolean_t flush_all)
321 {
322 	zev_queue_t *q;
323 	int i;
324 
325 	/*
326 	 * This loop works with hold() and release() because
327 	 * pollwakeup() requires us to release our locks before calling it.
328 	 *
329 	 * from pollwakeup(9F):
330 	 *
331 	 *   "Driver defined locks should not be held across calls
332 	 *    to this function."
333 	 */
334 
335 	/* wake up threads for each individual queue */
336 	mutex_enter(&zev_mutex);
337 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
338 		q = zev_queues[i - ZEV_MINOR_MIN];
339 		if (q == NULL)
340 			continue;
341 		if (!q->zq_busy)
342 			continue;
343 		if (!q->zq_queue_len)
344 			continue;
345 		if ((flush_all) ||
346 		    (q->zq_queue_len > q->zq_wakeup_threshold)) {
347 			zev_queue_hold(q);
348 			mutex_exit(&zev_mutex);
349 			pollwakeup(&q->zq_pollhead, POLLIN);
350 			mutex_enter(&zev_mutex);
351 			zev_queue_release(q);
352 		}
353 	}
354 	mutex_exit(&zev_mutex);
355 }
356 
357 static void
358 zev_poll_wakeup_thread_main(void)
359 {
360 	while (zev_wakeup_thread_run) {
361 		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
362 
363 		zev_poll_wakeup(B_TRUE);
364 	}
365 	thread_exit();
366 }
367 
368 static int
369 zev_ioc_mute_pool(char *poolname)
370 {
371 	zev_pool_list_entry_t *pe;
372 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
373 	/* pool already muted? */
374 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
375 		if (!strcmp(pe->name, poolname)) {
376 			rw_exit(&zev_pool_list_rwlock);
377 			return EEXIST;
378 		}
379 	}
380 	pe = zev_zalloc(sizeof(*pe));
381 	if (!pe) {
382 		rw_exit(&zev_pool_list_rwlock);
383 		return ENOMEM;
384 	}
385 	(void) strncpy(pe->name, poolname, sizeof(pe->name));
386 	pe->next = zev_muted_pools_head;
387 	zev_muted_pools_head = pe;
388 	rw_exit(&zev_pool_list_rwlock);
389 	return (0);
390 }
391 
392 static int
393 zev_ioc_unmute_pool(char *poolname)
394 {
395 	zev_pool_list_entry_t *pe, *peprev;
396 
397 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
398 	/* pool muted? */
399 	peprev = NULL;
400 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
401 		if (!strcmp(pe->name, poolname))
402 			break;
403 		peprev = pe;
404 	}
405 	if (pe) {
406 		rw_exit(&zev_pool_list_rwlock);
407 		return ENOENT;
408 	}
409 
410 	if (peprev != NULL) {
411 		peprev->next = pe->next;
412 	} else {
413 		zev_muted_pools_head = pe->next;
414 	}
415 	zev_free(pe, sizeof(*pe));
416 	rw_exit(&zev_pool_list_rwlock);
417 	return (0);
418 }
419 
420 int
421 zev_skip_pool(objset_t *os)
422 {
423 	zev_pool_list_entry_t *pe;
424 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
425 	rw_enter(&zev_pool_list_rwlock, RW_READER);
426 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
427 		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
428 			rw_exit(&zev_pool_list_rwlock);
429 			return 1;
430 		}
431 	}
432 	rw_exit(&zev_pool_list_rwlock);
433 	return 0;
434 }
435 
436 static void
437 zev_update_statistics(int op, zev_statistics_t *stat)
438 {
439 	switch (op) {
440 	case ZEV_OP_ERROR:
441 		stat->zev_cnt_errors++;
442 		break;
443 	case ZEV_OP_MARK:
444 		stat->zev_cnt_marks++;
445 		break;
446 	case ZEV_OP_ZFS_MOUNT:
447 		stat->zev_cnt_zfs_mount++;
448 		break;
449 	case ZEV_OP_ZFS_UMOUNT:
450 		stat->zev_cnt_zfs_umount++;
451 		break;
452 	case ZEV_OP_ZVOL_WRITE:
453 		stat->zev_cnt_zvol_write++;
454 		break;
455 	case ZEV_OP_ZVOL_TRUNCATE:
456 		stat->zev_cnt_zvol_truncate++;
457 		break;
458 	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
459 		stat->zev_cnt_znode_close_after_update++;
460 		break;
461 	case ZEV_OP_ZNODE_CREATE:
462 		stat->zev_cnt_znode_create++;
463 		break;
464 	case ZEV_OP_ZNODE_REMOVE:
465 		stat->zev_cnt_znode_remove++;
466 		break;
467 	case ZEV_OP_ZNODE_LINK:
468 		stat->zev_cnt_znode_link++;
469 		break;
470 	case ZEV_OP_ZNODE_SYMLINK:
471 		stat->zev_cnt_znode_symlink++;
472 		break;
473 	case ZEV_OP_ZNODE_RENAME:
474 		stat->zev_cnt_znode_rename++;
475 		break;
476 	case ZEV_OP_ZNODE_WRITE:
477 		stat->zev_cnt_znode_write++;
478 		break;
479 	case ZEV_OP_ZNODE_TRUNCATE:
480 		stat->zev_cnt_znode_truncate++;
481 		break;
482 	case ZEV_OP_ZNODE_SETATTR:
483 		stat->zev_cnt_znode_setattr++;
484 		break;
485 	case ZEV_OP_ZNODE_ACL:
486 		stat->zev_cnt_znode_acl++;
487 		break;
488 	}
489 }
490 
491 void
492 zev_queue_message(int op, zev_msg_t *msg)
493 {
494 	zev_queue_t *q;
495 	int wakeup = 0;
496 	zev_msg_t *m;
497 	int i;
498 
499 	msg->next = NULL;
500 	msg->prev = NULL;
501 	msg->read = 0;
502 
503 	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
504 		zev_queue_error(op, "unknown op id encountered: %d", op);
505 		zev_free(msg, sizeof(*msg) + msg->size);
506 		return;
507 	}
508 
509 	/*
510 	 * This mutex protects us agains race conditions when several
511 	 * threads want to queue a message and one or more queues are
512 	 * full:  we release zev_mutex to wait for the queues to become
513 	 * less-than-full, but we don't know in which order the waiting
514 	 * threads will be awoken.  If it's not the same order in which
515 	 * they went to sleep we might mark different messages as "newest"
516 	 * in different queues, and so we might have dupes or even
517 	 * skip messages.
518 	 */
519 	mutex_enter(&zev_queue_msg_mutex);
520 
521 	mutex_enter(&zev_mutex);
522 
523 	/*
524 	 * When the module is loaded, the default behavior ist to
525 	 * put all events into a queue and block if the queue is full.
526 	 * This is done even before the pseudo device is attached.
527 	 * This way, no events are lost.
528 	 *
529 	 * To discard events entirely the "beaver" queue,
530 	 * which never discards anything, has to be removed.
531 	 */
532 
533 	if (zev_queue_cnt == 0) {
534 		mutex_exit(&zev_mutex);
535 		mutex_exit(&zev_queue_msg_mutex);
536 		return;
537 	}
538 
539 	/* put message into global queue */
540 	msg->seq = zev_msg_sequence_number++;
541 	while (zev_statistics.zev_max_queue_len &&
542 	    zev_statistics.zev_queue_len >= zev_statistics.zev_max_queue_len) {
543 		/* queue full.  block until it's been shrunk. */
544 		cv_wait(&zev_condvar, &zev_mutex);
545 	}
546 
547 	if (zev_queue_tail == NULL) {
548 		zev_queue_head = zev_queue_tail = msg;
549 	} else {
550 		zev_queue_tail->next = msg;
551 		msg->prev = zev_queue_tail;
552 		zev_queue_tail = msg;
553 	}
554 	zev_queue_len++;
555 	zev_statistics.zev_cnt_total_events++;
556 	zev_statistics.zev_queue_len += msg->size;
557 
558 	/* update per-device queues */
559 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
560 		q = zev_queues[i - ZEV_MINOR_MIN];
561 		if (!q)
562 			continue;
563 
564 		zev_queue_hold(q);
565 
566 		/* make sure queue has enough room */
567 		while (q->zq_max_queue_len &&
568 		       q->zq_queue_len > q->zq_max_queue_len) {
569 
570 			if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
571 				/* block until queue has been shrunk. */
572 				cv_wait(&zev_condvar, &zev_mutex);
573 			} else {
574 				/* discard msgs until queue is small enough */
575 				while (q->zq_queue_len > q->zq_max_queue_len) {
576 					m = q->zq_oldest;
577 					if (m == NULL)
578 						break;
579 					q->zq_events_discarded++;
580 					q->zq_bytes_discarded += m->size;
581 					q->zq_oldest = m->next;
582 					q->zq_queue_len -= m->size;
583 					q->zq_queue_messages--;
584 				}
585 			}
586 		}
587 
588 		/* register new message at the end of the queue */
589 		q->zq_queue_len += msg->size;
590 		q->zq_queue_messages++;
591 		q->zq_bytes_total += msg->size;
592 		q->zq_events_total++;
593 		if (q->zq_oldest == NULL)
594 			q->zq_oldest = msg;
595 
596 		zev_update_statistics(op, &q->zq_statistics);
597 
598 		if (q->zq_queue_len > q->zq_wakeup_threshold)
599 			wakeup = 1;
600 		if (q->zq_queue_len == msg->size)  /* queue was empty */
601 			cv_broadcast(&q->zq_condvar);
602 
603 		zev_queue_release(q);
604 	}
605 
606 	zev_queue_trim();
607 
608 	zev_update_statistics(op, &zev_statistics);
609 	mutex_exit(&zev_mutex);
610 	mutex_exit(&zev_queue_msg_mutex);
611 
612 	/* one or more queues need a pollwakeup() */
613 	if (op == ZEV_OP_MARK) {
614 		zev_poll_wakeup(B_TRUE);
615 	} else if (wakeup) {
616 		zev_poll_wakeup(B_FALSE);
617 	}
618 
619 	return;
620 }
621 
622 void
623 zev_queue_error(int op, char *fmt, ...)
624 {
625 	char buf[ZEV_MAX_MESSAGE_LEN];
626 	va_list ap;
627 	int len;
628 	zev_msg_t *msg = NULL;
629 	zev_error_t *rec;
630 	int msg_size;
631 
632 	va_start(ap, fmt);
633 	len = vsnprintf(buf, sizeof(buf), fmt, ap);
634 	va_end(ap);
635 	if (len >= sizeof(buf)) {
636 		cmn_err(CE_WARN, "zev: can't report error - "
637 		        "dropping event entirely.");
638 		return;
639 	}
640 
641 	msg_size = sizeof(*rec) + len + 1;
642 	msg = zev_alloc(sizeof(*msg) + msg_size);
643 	msg->size = msg_size;
644 	rec = (zev_error_t *)(msg + 1);
645 	rec->record_len = msg_size;
646 	rec->op = ZEV_OP_ERROR;
647 	rec->op_time = ddi_get_time();
648 	rec->guid = 0;
649 	rec->failed_op = op;
650 	rec->errstr_len = len;
651 	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);
652 
653 	zev_queue_message(ZEV_OP_ERROR, msg);
654 	return;
655 }
656 
657 static int
658 zev_find_queue(zev_queue_t **out, zev_queue_t *req_q, zev_queue_name_t *name)
659 {
660 	char namebuf[ZEV_MAX_QUEUE_NAME_LEN+1];
661 	zev_queue_t *q;
662 	int i;
663 
664 	*out = NULL;
665 
666 	if (name->zev_namelen == 0) {
667 		if (req_q->zq_minor_number == ZEV_CONTROL_DEVICE_MINOR)
668 			return EINVAL;
669 		zev_queue_hold(req_q);
670 		*out = req_q;
671 		return 0;
672 	}
673 
674 	if (name->zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
675 		return EINVAL;
676 	strncpy(namebuf, name->zev_name, name->zev_namelen);
677 	namebuf[name->zev_namelen] = '\0';
678 
679 	mutex_enter(&zev_mutex);
680 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
681 		q = zev_queues[i - ZEV_MINOR_MIN];
682 		if (!q)
683 			continue;
684 		if (!strcmp(q->zq_name, namebuf)) {
685 			zev_queue_hold(q);
686 			mutex_exit(&zev_mutex);
687 			*out = q;
688 			return 0;
689 		}
690 	}
691 	mutex_exit(&zev_mutex);
692 	return ENOENT;
693 }
694 
695 static int
696 zev_ioc_get_queue_statistics(zev_queue_t *req_q, intptr_t arg, int mode)
697 {
698 	zev_ioctl_get_queue_statistics_t gs;
699 	zev_queue_t *q;
700 	int ret;
701 
702 	if (ddi_copyin((void *)arg, &gs, sizeof(gs), mode) != 0)
703 		return EFAULT;
704 
705 	ret = zev_find_queue(&q, req_q, &gs.zev_queue_name);
706 	if (ret)
707 		return ret;
708 
709 	/* ddi_copyout() can take a long time.  Better make
710 	   a copy to be able to release the mutex faster. */
711 	mutex_enter(&zev_mutex);
712 	memcpy(&gs.zev_statistics, &q->zq_statistics,sizeof(gs.zev_statistics));
713 	gs.zev_statistics.zev_queue_len = q->zq_queue_len;
714 	gs.zev_statistics.zev_bytes_read = q->zq_bytes_read;
715 	gs.zev_statistics.zev_bytes_discarded = q->zq_bytes_discarded;
716 	gs.zev_statistics.zev_max_queue_len = q->zq_max_queue_len;
717 	gs.zev_statistics.zev_cnt_discarded_events = q->zq_events_discarded;
718 	gs.zev_statistics.zev_cnt_total_events = q->zq_events_total;
719 	zev_queue_release(q);
720 	mutex_exit(&zev_mutex);
721 
722 	if (ddi_copyout(&gs, (void *)arg, sizeof(gs), mode) != 0)
723 		return EFAULT;
724 	return 0;
725 }
726 
727 static int
728 zev_ioc_set_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
729 {
730 	zev_ioctl_set_queue_properties_t qp;
731 	zev_queue_t *q;
732 	uint64_t old_max;
733 	uint64_t old_flags;
734 	int ret;
735 
736 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
737 		return EFAULT;
738 	if (qp.zev_max_queue_len > ZEV_MAX_QUEUE_LEN)
739 		return EINVAL;
740 	if (qp.zev_poll_wakeup_threshold > ZEV_MAX_POLL_WAKEUP_QUEUE_LEN)
741 		return EINVAL;
742 
743 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
744 	if (ret)
745 		return ret;
746 
747 	mutex_enter(&zev_mutex);
748 
749 	/*
750 	 * Note: if the PERSISTENT flag is cleared, and the queue is not busy,
751 	 * the queue should be removed by zev_queue_release() in zev_ioctl().
752 	 */
753 	old_flags = qp.zev_flags;
754 	q->zq_flags = qp.zev_flags;
755 	if ((old_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) &&
756 	   (!(qp.zev_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL))) {
757 		/* queue is no longer blocking - wake blocked threads */
758 		cv_broadcast(&zev_condvar);
759 	}
760 
761 	old_max = q->zq_max_queue_len;
762 	q->zq_max_queue_len = qp.zev_max_queue_len;
763 	if (q->zq_max_queue_len < old_max)
764 		zev_queue_trim();
765 	if (q->zq_max_queue_len > old_max)
766 		cv_broadcast(&zev_condvar);	/* threads may be waiting */
767 
768 	if ((qp.zev_poll_wakeup_threshold < q->zq_wakeup_threshold) &&
769 	    (qp.zev_poll_wakeup_threshold <= q->zq_queue_len))
770 		pollwakeup(&q->zq_pollhead, POLLIN);
771 	q->zq_wakeup_threshold = qp.zev_poll_wakeup_threshold;
772 
773 	zev_queue_release(q);
774 	mutex_exit(&zev_mutex);
775 	return 0;
776 }
777 
778 static int
779 zev_ioc_get_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
780 {
781 	zev_ioctl_get_queue_properties_t qp;
782 	zev_queue_t *q;
783 	int ret;
784 
785 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
786 		return EFAULT;
787 
788 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
789 	if (ret)
790 		return ret;
791 
792 	mutex_enter(&zev_mutex);
793 	qp.zev_max_queue_len = q->zq_max_queue_len;
794 	qp.zev_flags = q->zq_flags;
795 	qp.zev_poll_wakeup_threshold = q->zq_wakeup_threshold;
796 	zev_queue_release(q);
797 	mutex_exit(&zev_mutex);
798 
799 	if (ddi_copyout(&qp, (void *)arg, sizeof(qp), mode) != 0)
800 		return EFAULT;
801 	return 0;
802 }
803 
804 static int
805 zev_ioc_add_queue(zev_queue_t *req_q, intptr_t arg, int mode)
806 {
807 	zev_ioctl_add_queue_t aq;
808 	zev_queue_t *new_q;
809 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
810 
811 	if (ddi_copyin((void *)arg, &aq, sizeof(aq), mode) != 0)
812 		return EFAULT;
813 
814 	if (aq.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
815 		return EINVAL;
816 	strncpy(name, aq.zev_name, aq.zev_namelen);
817 	name[aq.zev_namelen] = '\0';
818 
819 	return zev_queue_new(&new_q, req_q->zq_dip, name,
820 	                     aq.zev_max_queue_len, aq.zev_flags);
821 }
822 
823 static int
824 zev_ioc_remove_queue(zev_queue_t *req_q, intptr_t arg, int mode)
825 {
826 	zev_ioctl_remove_queue_t rq;
827 	zev_queue_t *q;
828 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
829 	int found = 0;
830 	int i;
831 
832 	if (ddi_copyin((void *)arg, &rq, sizeof(rq), mode) != 0)
833 		return EFAULT;
834 
835 	if (rq.zev_queue_name.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
836 		return EINVAL;
837 	strncpy(name, rq.zev_queue_name.zev_name,
838 	        rq.zev_queue_name.zev_namelen);
839 	name[rq.zev_queue_name.zev_namelen] = '\0';
840 
841 	mutex_enter(&zev_mutex);
842 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
843 		q = zev_queues[i - ZEV_MINOR_MIN];
844 		if (!q)
845 			continue;
846 		if (!strcmp(q->zq_name, name)) {
847 			found = 1;
848 			break;
849 		}
850 	}
851 	if (!found) {
852 		mutex_exit(&zev_mutex);
853 		return ENOENT;
854 	}
855 
856 	if (q->zq_busy) {
857 		mutex_exit(&zev_mutex);
858 		return EBUSY;
859 	}
860 	/*
861 	 * clear flags, so that persistent queues are removed aswell
862 	 * and the queue becomes non-blocking.
863 	 */
864 	q->zq_flags = 0;
865 	if (q->zq_to_be_removed == B_FALSE) {
866 		q->zq_to_be_removed = B_TRUE;
867 		zev_queue_release(q);
868 	}
869 	/* some threads might be waiting for this queue to become writable */
870 	cv_broadcast(&zev_condvar);
871 
872 	mutex_exit(&zev_mutex);
873 	return 0;
874 }
875 
876 static int
877 zev_ioc_get_debug_info(zev_queue_t *req_q, intptr_t arg, int mode)
878 {
879 	zev_ioctl_debug_info_t di;
880 	uint64_t mem_allocated = atomic_add_64_nv(&zev_memory_allocated, 0);
881 	uint64_t mem_freed     = atomic_add_64_nv(&zev_memory_freed, 0);
882 
883 	zev_chksum_stats(&di.zev_chksum_cache_size,
884 	                 &di.zev_chksum_cache_hits,
885 	                 &di.zev_chksum_cache_misses);
886 	di.zev_memory_allocated = mem_allocated - mem_freed;
887 	if (ddi_copyout(&di, (void *)arg, sizeof(di), mode) != 0)
888 		return EFAULT;
889 	return 0;
890 }
891 
892 static int
893 zev_ioc_get_queue_list(zev_queue_t *req_q, intptr_t arg, int mode)
894 {
895 	zev_ioctl_get_queue_list_t gql;
896 	zev_queue_t *q;
897 	int i = 0;
898 	int count = 0;
899 
900 	memset(&gql, 0, sizeof(gql));
901 
902 	mutex_enter(&zev_mutex);
903 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
904 		q = zev_queues[i - ZEV_MINOR_MIN];
905 		if (!q)
906 			continue;
907 		strncpy(gql.zev_queue_name[count].zev_name,
908 		    q->zq_name, ZEV_MAX_QUEUE_NAME_LEN);
909 		gql.zev_queue_name[count].zev_namelen = strlen(q->zq_name);
910 		count++;
911 	}
912 	gql.zev_n_queues = count;
913 	mutex_exit(&zev_mutex);
914 
915 	if (ddi_copyout(&gql, (void *)arg, sizeof(gql), mode) != 0)
916 		return EFAULT;
917 	return 0;
918 }
919 
920 /* ARGSUSED */
921 static int
922 zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
923 {
924 	zev_statistics_t zs;
925 	zev_ioctl_poolarg_t pa;
926 	zev_ioctl_mark_t mark;
927 	zev_mark_t *rec;
928 	int msg_size;
929 	zev_msg_t *msg;
930 	uint64_t len;
931 	uint64_t mark_id;
932 	minor_t minor;
933 	zev_queue_t *req_q;
934 	int ret = 0;
935 
936 	minor = getminor(dev);
937 	mutex_enter(&zev_mutex);
938 	if ((req_q = ddi_get_soft_state(statep, minor)) == NULL) {
939 		mutex_exit(&zev_mutex);
940 		return (ENXIO);
941 	}
942 	zev_queue_hold(req_q);
943 	mutex_exit(&zev_mutex);
944 	/*
945 	 * all structures passed between kernel and userspace
946 	 * are now compatible between 64 and 32 bit.  Model
947 	 * conversion can be ignored.
948 	 */
949 	switch (cmd) {
950 	case ZEV_IOC_GET_GLOBAL_STATISTICS:
951 		/* ddi_copyout() can take a long time.  Better make
952 		   a copy to be able to release the mutex faster. */
953 		mutex_enter(&zev_mutex);
954 		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
955 		mutex_exit(&zev_mutex);
956 		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
957 			ret = EFAULT;
958 		break;
959 	case ZEV_IOC_GET_QUEUE_STATISTICS:
960 		ret = zev_ioc_get_queue_statistics(req_q, arg, mode);
961 		break;
962 	case ZEV_IOC_MUTE_POOL:
963 	case ZEV_IOC_UNMUTE_POOL:
964 		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) {
965 			ret = EFAULT;
966 			break;
967 		}
968 		if (pa.zev_poolname_len >=MAXPATHLEN) {
969 			ret = EINVAL;
970 			break;
971 		}
972 		pa.zev_poolname[pa.zev_poolname_len] = '\0';
973 		if (cmd == ZEV_IOC_MUTE_POOL) {
974 			ret = zev_ioc_mute_pool(pa.zev_poolname);
975 		} else {
976 			ret = zev_ioc_unmute_pool(pa.zev_poolname);
977 		}
978 		break;
979 	case ZEV_IOC_SET_MAX_QUEUE_LEN:
980 		if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) {
981 			ret = EFAULT;
982 			break;
983 		}
984 		if (len > ZEV_MAX_QUEUE_LEN) {
985 			ret = EINVAL;
986 			break;
987 		}
988 		mutex_enter(&zev_mutex);
989 		zev_statistics.zev_max_queue_len = len;
990 		cv_broadcast(&zev_condvar);
991 		mutex_exit(&zev_mutex);
992 		break;
993 	case ZEV_IOC_GET_QUEUE_PROPERTIES:
994 		ret = zev_ioc_get_queue_properties(req_q, arg, mode);
995 		break;
996 	case ZEV_IOC_SET_QUEUE_PROPERTIES:
997 		ret = zev_ioc_set_queue_properties(req_q, arg, mode);
998 		break;
999 	case ZEV_IOC_MARK:
1000 		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) {
1001 			ret = EFAULT;
1002 			break;
1003 		}
1004 		/* prepare message */
1005 		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
1006 		msg = zev_alloc(sizeof(*msg) + msg_size);
1007 		msg->size = msg_size;
1008 		rec = (zev_mark_t *)(msg + 1);
1009 		rec->record_len = msg_size;
1010 		rec->op = ZEV_OP_MARK;
1011 		rec->op_time = ddi_get_time();
1012 		rec->guid = mark.zev_guid;
1013 		rec->payload_len = mark.zev_payload_len;
1014 		/* get payload */
1015 		if (ddi_copyin(((char *)arg) + sizeof(mark),
1016 		               ZEV_PAYLOAD(rec),
1017 		               mark.zev_payload_len, mode) != 0) {
1018 			zev_free(msg, msg_size);
1019 			ret = EFAULT;
1020 			break;
1021 		}
1022 		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
1023 		/* get mark id and queue message */
1024 		mutex_enter(&zev_mark_id_mutex);
1025 		mark_id = zev_mark_id++;
1026 		mutex_exit(&zev_mark_id_mutex);
1027 		rec->mark_id = mark_id;
1028 		zev_queue_message(ZEV_OP_MARK, msg);
1029 		/* report mark id to userland, ignore errors */
1030 		mark.zev_mark_id = mark_id;
1031 		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
1032 		break;
1033 	case ZEV_IOC_ADD_QUEUE:
1034 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1035 			ret = EACCES;
1036 			break;
1037 		}
1038 		ret = zev_ioc_add_queue(req_q, arg, mode);
1039 		break;
1040 	case ZEV_IOC_REMOVE_QUEUE:
1041 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1042 			ret = EACCES;
1043 			break;
1044 		}
1045 		ret = zev_ioc_remove_queue(req_q, arg, mode);
1046 		break;
1047 	case ZEV_IOC_GET_DEBUG_INFO:
1048 		ret = zev_ioc_get_debug_info(req_q, arg, mode);
1049 		break;
1050 	case ZEV_IOC_GET_QUEUE_LIST:
1051 		ret = zev_ioc_get_queue_list(req_q, arg, mode);
1052 		break;
1053 	case ZEV_IOC_GET_FILE_SIGNATURES:
1054 		ret = zev_ioc_get_signatures(arg, mode);
1055 		break;
1056 	default:
1057 		/* generic "ioctl unknown" error */
1058 		ret = ENOTTY;
1059 	}
1060 
1061 	mutex_enter(&zev_mutex);
1062 	zev_queue_release(req_q);
1063 	mutex_exit(&zev_mutex);
1064 	if (ret)
1065 		SET_ERROR(ret);
1066 	return (ret);
1067 }
1068 
1069 static int
1070 zev_chpoll(dev_t dev, short events, int anyyet,
1071     short *reventsp, struct pollhead **phpp)
1072 {
1073 	int minor;
1074 	short revent = 0;
1075 	zev_queue_t *q;
1076 
1077 	/* use minor-specific queue context and it's pollhead */
1078 	minor = getminor(dev);
1079 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1080 		return (EINVAL);
1081 	mutex_enter(&zev_mutex);
1082 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1083 		mutex_exit(&zev_mutex);
1084 		return (ENXIO);
1085 	}
1086 	revent = 0;
1087 	if ((events & POLLIN)) {
1088 		if (q->zq_oldest)
1089 			revent |= POLLIN;
1090 	}
1091 	if (revent == 0) {
1092 		if (!anyyet) {
1093 			*phpp = &q->zq_pollhead;
1094 		}
1095 	}
1096 	*reventsp = revent;
1097 	mutex_exit(&zev_mutex);
1098 	return (0);
1099 }
1100 
1101 /* ARGSUSED */
1102 static int
1103 zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
1104 {
1105 	minor_t minor;
1106 	offset_t off;
1107 	int ret = 0;
1108 	zev_msg_t *msg;
1109 	char *data;
1110 	zev_queue_t *q;
1111 
1112 	minor = getminor(dev);
1113 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1114 		return (EINVAL);
1115 
1116 	mutex_enter(&zev_mutex);
1117 	q = ddi_get_soft_state(statep, minor);
1118 	if (q == NULL) {
1119 		mutex_exit(&zev_mutex);
1120 		return (ENXIO);
1121 	}
1122 	off = uio_p->uio_loffset;
1123 	msg = q->zq_oldest;
1124 	while (msg == NULL) {
1125 		if (!ddi_can_receive_sig()) {
1126 			/*
1127 			 * read() shouldn't block because this thread
1128 			 * can't receive signals. (e.g., it might be
1129 			 * torn down by exit() right now.)
1130 			 */
1131 			mutex_exit(&zev_mutex);
1132 			return 0;
1133 		}
1134 		if (cv_wait_sig(&q->zq_condvar, &zev_mutex) == 0) {
1135 			/* signal received. */
1136 			mutex_exit(&zev_mutex);
1137 			return EINTR;
1138 		}
1139 		msg = q->zq_oldest;
1140 	}
1141 	if (msg->size > uio_p->uio_resid) {
1142 		mutex_exit(&zev_mutex);
1143 		return E2BIG;
1144 	}
1145 	while (msg && uio_p->uio_resid >= msg->size) {
1146 		data = (char *)(msg + 1);
1147 		ret = uiomove(data, msg->size, UIO_READ, uio_p);
1148 		if (ret != 0) {
1149 			mutex_exit(&zev_mutex);
1150 			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
1151 			uio_p->uio_loffset = off;
1152 			return (ret);
1153 		}
1154 		q->zq_oldest = msg->next;
1155 		q->zq_bytes_read += msg->size;
1156 		q->zq_queue_len -= msg->size;
1157 		q->zq_queue_messages--;
1158 		msg->read++;
1159 		msg = q->zq_oldest;
1160 	}
1161 	cv_broadcast(&zev_condvar);
1162 	mutex_exit(&zev_mutex);
1163 	uio_p->uio_loffset = off;
1164 	return 0;
1165 }
1166 
1167 /* ARGSUSED */
1168 static int
1169 zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
1170 {
1171 	zev_queue_t *q;
1172 	int minor;
1173 
1174 	minor = getminor(dev);
1175 	if (otyp != OTYP_CHR)
1176 		return (EINVAL);
1177 	mutex_enter(&zev_mutex);
1178 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1179 		mutex_exit(&zev_mutex);
1180 		return (ENXIO);
1181 	}
1182 	if (q->zq_busy != B_TRUE) {
1183 		mutex_exit(&zev_mutex);
1184 		return (EINVAL);
1185 	}
1186 	q->zq_busy = B_FALSE;
1187 	if ((q->zq_flags & ZEV_FL_PERSISTENT) == 0)
1188 		zev_queue_release(q);
1189 	mutex_exit(&zev_mutex);
1190 	return (0);
1191 }
1192 
1193 /* ARGSUSED */
1194 static int
1195 zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1196 {
1197 	zev_queue_t *q;
1198 	minor_t minor;
1199 
1200 	minor = getminor(*devp);
1201 	if (otyp != OTYP_CHR)
1202 		return (EINVAL);
1203 	if (drv_priv(credp) != 0)
1204 		return (EPERM);
1205 	mutex_enter(&zev_mutex);
1206 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1207 		mutex_exit(&zev_mutex);
1208 		return (ENXIO);
1209 	}
1210 	if (minor == ZEV_CONTROL_DEVICE_MINOR) {
1211 		/* control device may be used in parallel */
1212 		q->zq_busy = B_TRUE;
1213 		mutex_exit(&zev_mutex);
1214 		return 0;
1215 	}
1216 	if (q->zq_busy == B_TRUE) {
1217 		mutex_exit(&zev_mutex);
1218 		return (EBUSY);
1219 	}
1220 	q->zq_busy = B_TRUE;	/* can only be opened exclusively */
1221 	mutex_exit(&zev_mutex);
1222 	return (0);
1223 }
1224 
1225 static struct cb_ops zev_cb_ops = {
1226 	zev_open,		/* open */
1227 	zev_close,		/* close */
1228 	nodev,			/* strategy */
1229 	nodev,			/* print */
1230 	nodev,			/* dump */
1231 	zev_read,		/* read */
1232 	nodev,			/* write */
1233 	zev_ioctl,		/* ioctl */
1234 	nodev,			/* devmap */
1235 	nodev,			/* mmap */
1236 	nodev,			/* segmap */
1237 	zev_chpoll,		/* chpoll */
1238 	ddi_prop_op,		/* prop_op */
1239 	NULL,			/* streamtab */
1240 	D_MP | D_64BIT,		/* cb_flag */
1241 	CB_REV,			/* cb_rev */
1242 	nodev,			/* aread */
1243 	nodev,			/* awrite */
1244 };
1245 
1246 static void
1247 zev_free_instance(dev_info_t *dip)
1248 {
1249 	int instance;
1250 	zev_queue_t *q;
1251 	int i;
1252 
1253 	instance = ddi_get_instance(dip);
1254 	if (instance != 0) {
1255 		cmn_err(CE_WARN, "zev: tried to free instance != 0 (%d)",
1256 		        instance);
1257 		return;
1258 	}
1259 
1260 	ddi_remove_minor_node(dip, NULL);
1261 
1262 	/* stop pollwakeup thread */
1263 	zev_wakeup_thread_run = 0;
1264 	if (zev_poll_wakeup_thread != NULL) {
1265 		thread_join(zev_poll_wakeup_thread->t_did);
1266 		zev_poll_wakeup_thread = NULL;
1267 	}
1268 
1269 	mutex_enter(&zev_mutex);
1270 
1271 	/* remove "ctrl" dummy queue */
1272 	q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1273 	if (q) {
1274 		ddi_soft_state_free(statep, ZEV_CONTROL_DEVICE_MINOR);
1275 		ZEV_MEM_SUB(sizeof(zev_queue_t));
1276 	}
1277 
1278 	/* remove all other queues */
1279 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1280 		q = zev_queues[i- ZEV_MINOR_MIN];
1281 		if (!q)
1282 			continue;
1283 		ASSERT(q->zq_refcnt == 1);
1284 		zev_queue_release(q);
1285 	}
1286 	zev_queue_trim();
1287 	bzero(&zev_queues, sizeof(zev_queues));
1288 
1289 	mutex_exit(&zev_mutex);
1290 
1291 }
1292 
1293 static int
1294 zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1295 {
1296 	int instance;
1297 	zev_queue_t *q;
1298 
1299 	/* called once per instance with DDI_DETACH,
1300 	   may be called to suspend */
1301 	switch (cmd) {
1302 	case DDI_DETACH:
1303 		/* instance busy? */
1304 		instance = ddi_get_instance(dip);
1305 		if (instance != 0) {	/* hardcoded in zev.conf */
1306 			/* this module only supports one instance. */
1307 			return (DDI_FAILURE);
1308 		}
1309 
1310 		mutex_enter(&zev_mutex);
1311 		if (!zev_attached) {
1312 			mutex_exit(&zev_mutex);
1313 			return (DDI_FAILURE);
1314 		}
1315 
1316 		/* check "ctrl" queue to see if t is busy */
1317 		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1318 		if (q == NULL) {
1319 			mutex_exit(&zev_mutex);
1320 			return (DDI_FAILURE);
1321 		}
1322 		if (q->zq_busy) {
1323 			mutex_exit(&zev_mutex);
1324 			return (DDI_FAILURE);
1325 		}
1326 		/* are there any queues? */
1327 		if (zev_queue_cnt > 0) {
1328 			mutex_exit(&zev_mutex);
1329 			return (DDI_FAILURE);
1330 		}
1331 
1332 		zev_attached = B_FALSE;
1333 		mutex_exit(&zev_mutex);
1334 
1335 		/* switch ZFS event callbacks back to default */
1336 		rw_enter(&rz_zev_rwlock, RW_WRITER);
1337 		rz_zev_callbacks = rz_zev_default_callbacks;
1338 		rz_zev_set_active(B_FALSE);
1339 		rw_exit(&rz_zev_rwlock);
1340 
1341 		/* no thread is inside of the callbacks anymore. */
1342 
1343 		/* free resources allocated for this instance */
1344 		zev_free_instance(dip);
1345 		zev_chksum_fini();
1346 #if 0
1347 		cmn_err(CE_WARN, "zev: allocated memory at detach: %" PRIu64,
1348 			zev_memory_allocated - zev_memory_freed);
1349 #endif
1350 		return (DDI_SUCCESS);
1351 	case DDI_SUSPEND:
1352 		/* kernel must not suspend zev devices while ZFS is running */
1353 		return (DDI_FAILURE);
1354 	default:
1355 		return (DDI_FAILURE);
1356 	}
1357 }
1358 
1359 static int
1360 zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1361 {
1362 	/* called once per instance with DDI_ATTACH,
1363 	   may be called to resume */
1364 	int instance;
1365 	int error;
1366 	zev_queue_t *q;
1367 	switch (cmd) {
1368 	case DDI_ATTACH:
1369 		/* create instance state */
1370 		instance = ddi_get_instance(dip);
1371 		if (instance != 0) {	/* hardcoded in zev.conf */
1372 			/* this module only supports one instance. */
1373 			return (DDI_FAILURE);
1374 		}
1375 
1376 		mutex_enter(&zev_mutex);
1377 		if (zev_attached) {
1378 			mutex_exit(&zev_mutex);
1379 			return (DDI_FAILURE);
1380 		}
1381 		if (ddi_soft_state_zalloc(statep, ZEV_CONTROL_DEVICE_MINOR) !=
1382 		    DDI_SUCCESS) {
1383 			mutex_exit(&zev_mutex);
1384 			return (DDI_FAILURE);
1385 		}
1386 		ZEV_MEM_ADD(sizeof(zev_queue_t));
1387 		zev_attached = B_TRUE;
1388 
1389 		/* init queue list */
1390 		bzero(&zev_queues, sizeof(zev_queues));
1391 		mutex_exit(&zev_mutex);
1392 
1393 		/* create a dummy queue for management of "ctrl" */
1394 
1395 		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1396 		q->zq_dip = dip;
1397 		q->zq_refcnt = 1;
1398 		q->zq_busy = B_FALSE;
1399 		q->zq_minor_number = ZEV_CONTROL_DEVICE_MINOR;
1400 		q->zq_flags = ZEV_FL_PERSISTENT;
1401 		strcpy(q->zq_name, ZEV_CONTROL_DEVICE_NAME);
1402 
1403 		/* create device node for "ctrl" */
1404 		if (ddi_create_minor_node(dip, ZEV_CONTROL_DEVICE_NAME,
1405 		    S_IFCHR, ZEV_CONTROL_DEVICE_MINOR,
1406 		    DDI_PSEUDO, 0) == DDI_FAILURE) {
1407 			goto fail;
1408 		}
1409 
1410 		/* note: intentionally not adding ctrl queue to queue list. */
1411 
1412 		/* default queue */
1413 		error = zev_queue_new(&q, dip,
1414 				      ZEV_DEFAULT_QUEUE_NAME,
1415 				      ZEV_MAX_QUEUE_LEN,
1416 				      ZEV_FL_BLOCK_WHILE_QUEUE_FULL|
1417 		                      ZEV_FL_PERSISTENT);
1418 		if (error)
1419 			goto fail;
1420 
1421 		/* start pollwakeup thread */
1422 		zev_wakeup_thread_run = 1;
1423 		zev_poll_wakeup_thread = thread_create(NULL, 0,
1424 		    zev_poll_wakeup_thread_main, NULL, 0, &p0,
1425 		    TS_RUN, minclsyspri);
1426 
1427 		ddi_report_dev(dip);
1428 
1429 		zev_chksum_init();
1430 
1431 		/* switch ZFS event callbacks to zev module callbacks */
1432 		rw_enter(&rz_zev_rwlock, RW_WRITER);
1433 		rz_zev_callbacks = &zev_callbacks;
1434 		rz_zev_set_active(B_TRUE);
1435 		rw_exit(&rz_zev_rwlock);
1436 
1437 		return (DDI_SUCCESS);
1438 	case DDI_RESUME:
1439 		/* suspendeding zev devices should never happen */
1440 		return (DDI_SUCCESS);
1441 	default:
1442 		return (DDI_FAILURE);
1443 	}
1444 fail:
1445 	cmn_err(CE_WARN, "zev: attach failed");
1446 	zev_free_instance(dip);
1447 	mutex_enter(&zev_mutex);
1448 	zev_attached = B_FALSE;
1449 	mutex_exit(&zev_mutex);
1450 	return (DDI_FAILURE);
1451 }
1452 
1453 /* ARGSUSED */
1454 static int
1455 zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
1456 {
1457 	minor_t minor;
1458 	zev_queue_t *q;
1459 
1460 	/* arg is dev_t */
1461 	minor = getminor((dev_t)arg);
1462 	mutex_enter(&zev_mutex);
1463 	q = ddi_get_soft_state(statep, minor);
1464 	if (q == NULL) {
1465 		*resultp = NULL;
1466 		mutex_exit(&zev_mutex);
1467 		return (DDI_FAILURE);
1468 	}
1469 
1470 	switch (infocmd) {
1471 	case DDI_INFO_DEVT2DEVINFO:
1472 		*resultp = q->zq_dip;
1473 		break;
1474 	case DDI_INFO_DEVT2INSTANCE:
1475 		*resultp = (void *)(uintptr_t)ddi_get_instance(q->zq_dip);
1476 		break;
1477 	default:
1478 		mutex_exit(&zev_mutex);
1479 		return (DDI_FAILURE);
1480 	}
1481 	mutex_exit(&zev_mutex);
1482 	return (DDI_SUCCESS);
1483 }
1484 
1485 static struct dev_ops zev_dev_ops = {
1486 	DEVO_REV,			/* driver build revision */
1487 	0,				/* driver reference count */
1488 	zev_getinfo,			/* getinfo */
1489 	nulldev,			/* identify (obsolete) */
1490 	nulldev,			/* probe (search for devices) */
1491 	zev_attach,			/* attach */
1492 	zev_detach,			/* detach */
1493 	nodev,				/* reset (obsolete, use quiesce) */
1494 	&zev_cb_ops,			/* character and block device ops */
1495 	NULL,				/* bus driver ops */
1496 	NULL,				/* power management, not needed */
1497 	ddi_quiesce_not_needed,		/* quiesce */
1498 };
1499 
1500 static struct modldrv zev_modldrv = {
1501 	&mod_driverops,			/* all loadable modules use this */
1502 	"zev ZFS event provider, v1.0",	/* driver name and version info */
1503 	&zev_dev_ops			/* ops method pointers */
1504 };
1505 
1506 static struct modlinkage zev_modlinkage = {
1507 	MODREV_1,	/* fixed value */
1508 	{
1509 		&zev_modldrv,	/* driver linkage structure */
1510 		NULL		/* list terminator */
1511 	}
1512 };
1513 
1514 int
1515 _init(void)
1516 {
1517 	int error;
1518 
1519 	if ((error = ddi_soft_state_init(&statep, sizeof(zev_queue_t), 1)) != 0)
1520 		return (error);
1521 	zev_attached = B_FALSE;
1522 
1523 	zev_queue_head = NULL;
1524 	zev_queue_tail = NULL;
1525 	zev_queue_len = 0;
1526 	zev_muted_pools_head = NULL;
1527 	zev_memory_allocated = 0;
1528 	zev_memory_freed = 0;
1529 	zev_queue_cnt = 0;
1530 
1531 	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
1532 	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
1533 	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
1534 	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
1535 	zev_mark_id = gethrtime();
1536 	mutex_init(&zev_queue_msg_mutex, NULL, MUTEX_DRIVER, NULL);
1537 	zev_msg_sequence_number = gethrtime();
1538 	bzero(&zev_statistics, sizeof(zev_statistics));
1539 	bzero(&zev_pollhead, sizeof(zev_pollhead));
1540 	bzero(&zev_queues, sizeof(zev_queues));
1541 	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
1542 	if (zev_ioc_mute_pool("zg0")) {
1543 		cmn_err(CE_WARN, "zev: could not init mute list");
1544 		goto FAIL;
1545 	}
1546 
1547 	if ((error = mod_install(&zev_modlinkage)) != 0) {
1548 		cmn_err(CE_WARN, "zev: could not install module");
1549 		goto FAIL;
1550 	}
1551 
1552 	return (0);
1553 FAIL:
1554 	/* free resources */
1555 	cmn_err(CE_WARN, "zev: _init failed");
1556 	mutex_destroy(&zev_mutex);
1557 	ddi_soft_state_fini(&statep);
1558 	return (error);
1559 }
1560 
1561 int
1562 _info(struct modinfo *modinfop)
1563 {
1564 	return (mod_info(&zev_modlinkage, modinfop));
1565 }
1566 
1567 int
1568 _fini(void)
1569 {
1570 	int error = 0;
1571 	zev_msg_t *msg;
1572 	zev_pool_list_entry_t *pe, *npe;
1573 
1574 	mutex_enter(&zev_mutex);
1575 	if (zev_attached == B_TRUE) {
1576 		mutex_exit(&zev_mutex);
1577 		return (SET_ERROR(EBUSY));
1578 	}
1579 	if (zev_queue_cnt != 0) {
1580 		/* should never happen */
1581 		mutex_exit(&zev_mutex);
1582 		return (SET_ERROR(EBUSY));
1583 	}
1584 
1585 	/*
1586 	 * avoid deadlock if event list is full: make sure threads currently
1587 	 * blocking on the event list can append their event and then release
1588 	 * rz_zev_rwlock.  Since there should be no queues left when we
1589 	 * reach this point we can simply empty the event list and then
1590 	 * wake everybody.
1591 	 */
1592 	while (zev_queue_head) {
1593 		msg = zev_queue_head;
1594 		zev_queue_head = msg->next;
1595 		zev_free(msg, sizeof(*msg) + msg->size);
1596 	}
1597 	cv_broadcast(&zev_condvar);
1598 	mutex_exit(&zev_mutex);
1599 
1600 	/* switch ZFS event callbacks back to default (again) */
1601 	rw_enter(&rz_zev_rwlock, RW_WRITER);
1602 	rz_zev_callbacks = rz_zev_default_callbacks;
1603 	rz_zev_set_active(B_FALSE);
1604 	rw_exit(&rz_zev_rwlock);
1605 
1606 	/* no thread is inside of the callbacks anymore.  Safe to remove. */
1607 
1608 	/* unload module callbacks */
1609 	if ((error = mod_remove(&zev_modlinkage)) != 0) {
1610 		cmn_err(CE_WARN, "mod_remove failed: %d", error);
1611 		return (error);
1612 	}
1613 
1614 	/* free resources */
1615 	mutex_enter(&zev_mutex);
1616 	while (zev_queue_head) {
1617 		msg = zev_queue_head;
1618 		zev_queue_head = msg->next;
1619 		zev_free(msg, sizeof(*msg) + msg->size);
1620 	}
1621 	mutex_exit(&zev_mutex);
1622 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
1623 	pe = zev_muted_pools_head;
1624 	while (pe) {
1625 		npe = pe;
1626 		pe = pe->next;
1627 		zev_free(npe, sizeof(*npe));
1628 	}
1629 	rw_exit(&zev_pool_list_rwlock);
1630 	ddi_soft_state_fini(&statep);
1631 	rw_destroy(&zev_pool_list_rwlock);
1632 	cv_destroy(&zev_condvar);
1633 	mutex_destroy(&zev_mutex);
1634 	mutex_destroy(&zev_mark_id_mutex);
1635 	mutex_destroy(&zev_queue_msg_mutex);
1636 
1637 	return (0);
1638 }
1639 
1640