xref: /titanic_51/usr/src/uts/common/fs/zev/zev.c (revision 5e28636192c7a7f235ea1e835192fe00d17757cb)
1 #include <sys/modctl.h>
2 #include <sys/ddi.h>
3 #include <sys/sunddi.h>
4 #include <sys/conf.h>
5 #include <sys/devops.h>
6 #include <sys/stat.h>
7 #include <sys/fs/zev.h>
8 #include <sys/zev_callbacks.h>
9 #include <sys/zev_checksums.h>
10 #include <sys/zfs_znode.h>
11 #include <sys/time.h>
12 #include <sys/sa.h>
13 #include <sys/zap.h>
14 #include <sys/time.h>
15 
/* byte offset of member m within structure type s */
#define	OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))

/* name of the default queue */
#define ZEV_DEFAULT_QUEUE_NAME		"beaver"
/* minor number of the control device; it is not an event queue */
#define ZEV_CONTROL_DEVICE_MINOR	0
/* queue devices use minor numbers ZEV_MINOR_MIN .. ZEV_MINOR_MAX (inclusive) */
#define ZEV_MINOR_MIN			(ZEV_CONTROL_DEVICE_MINOR + 1)
#define ZEV_MINOR_MAX			(ZEV_MINOR_MIN + ZEV_MAX_QUEUES - 1)
22 
/*
 * Per-queue state.  One instance exists per minor device; it is stored
 * as the soft state for that minor and, for queue devices, referenced
 * from zev_queues[minor - ZEV_MINOR_MIN].
 */
typedef struct zev_queue {
	/* queue name; also used as the name of the minor node */
	char			zq_name[ZEV_MAX_QUEUE_NAME_LEN+1];
	minor_t			zq_minor_number;
	dev_info_t		*zq_dip;
	/* pollhead handed to pollwakeup() / returned from chpoll */
	struct pollhead		zq_pollhead;
	uint64_t		zq_bytes_read;
	uint64_t		zq_events_read;
	uint64_t		zq_bytes_discarded;
	uint64_t		zq_events_discarded;
	uint64_t		zq_bytes_total;
	uint64_t		zq_events_total;
	/* queue length (bytes) above which a poll wakeup is triggered */
	uint64_t		zq_wakeup_threshold;
	/* ZEV_FL_* flags (e.g. PERSISTENT, BLOCK_WHILE_QUEUE_FULL) */
	uint16_t		zq_flags;
	uint16_t		zq_need_wakeup;
	/* protected by zev_mutex */
	int			zq_refcnt;
	/* bytes currently pending in this queue */
	uint64_t		zq_queue_len;
	/* number of messages currently pending in this queue */
	uint64_t		zq_queue_messages;
	/* upper bound for zq_queue_len */
	uint64_t		zq_max_queue_len;
	/* oldest unread message of this queue in the global list, or NULL */
	zev_msg_t		*zq_oldest;
	/* B_TRUE while the device is open */
	boolean_t		zq_busy;
	/* set once removal of the queue has been requested */
	boolean_t		zq_to_be_removed;
	/* per-queue event counters */
	zev_statistics_t	zq_statistics;
	/* readers wait here for messages to arrive */
	kcondvar_t		zq_condvar;
} zev_queue_t;
48 
/* soft state handle; per-minor zev_queue_t instances are looked up here */
static void		*statep;
struct pollhead		zev_pollhead;

/* protects the queue table, the global message list and queue state */
kmutex_t		zev_mutex;
/* writers blocked on a full queue wait on this condvar */
kcondvar_t		zev_condvar;
/* serializes zev_queue_message() callers to keep message order stable */
kmutex_t		zev_queue_msg_mutex;
/* protects the list of muted pools */
krwlock_t		zev_pool_list_rwlock;
/* global event counters and global queue length */
static zev_statistics_t	zev_statistics;
static boolean_t	zev_attached;
/* protects zev_mark_id */
static kmutex_t		zev_mark_id_mutex;
/* next id handed out by ZEV_IOC_MARK */
static uint64_t		zev_mark_id = 0;

/* sequence number assigned to the next queued message */
static uint64_t		zev_msg_sequence_number = 0;
/* queue table, indexed by (minor - ZEV_MINOR_MIN); NULL means free slot */
static zev_queue_t	*zev_queues[ZEV_MAX_QUEUES];
/* number of queues currently registered */
static int		zev_queue_cnt = 0;

/*
 * memory accounting; the difference of the two is reported by
 * ZEV_IOC_GET_DEBUG_INFO.  Presumably maintained by ZEV_MEM_ADD() and
 * ZEV_MEM_SUB() - TODO confirm against the macro definitions.
 */
uint64_t	zev_memory_allocated = 0;
uint64_t	zev_memory_freed = 0;
67 
/*
 * The longest potential message is from zev_zfs_mount() and
 * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
 *
 * Another candidate is zev_znode_rename_cb() and contains three inode
 * numbers and two filenames of up to MAXNAMELEN bytes each.
 */
#define ZEV_MAX_MESSAGE_LEN	4096

/*
 * Global, doubly linked message list shared by all queues.  New messages
 * are appended at the tail; zev_queue_trim() frees from the head.
 */
static zev_msg_t *zev_queue_head = NULL;
static zev_msg_t *zev_queue_tail = NULL;
/* number of messages in the global list */
static uint64_t zev_queue_len = 0;
80 
81 
/* singly linked list node naming one muted pool */
typedef struct zev_pool_list_entry {
	struct zev_pool_list_entry	*next;
	char				name[MAXPATHLEN];
} zev_pool_list_entry_t;

/* head of the muted pool list; protected by zev_pool_list_rwlock */
static zev_pool_list_entry_t *zev_muted_pools_head = NULL;

/* the poll wakeup thread keeps looping while this is non-zero */
static volatile int zev_wakeup_thread_run = 1;
/* presumably the handle of the poll wakeup thread - TODO confirm */
static kthread_t *zev_poll_wakeup_thread = NULL;
91 
92 void *
93 zev_alloc(ssize_t sz)
94 {
95 	ZEV_MEM_ADD(sz);
96 	return kmem_alloc(sz, KM_SLEEP);
97 }
98 
99 void *
100 zev_zalloc(ssize_t sz)
101 {
102 	ZEV_MEM_ADD(sz);
103 	return kmem_zalloc(sz, KM_SLEEP);
104 }
105 
106 void
107 zev_free(void *ptr, ssize_t sz)
108 {
109 	ZEV_MEM_SUB(sz);						\
110 	kmem_free(ptr, sz);
111 }
112 
113 int
114 zev_queue_cmp(const void *a, const void *b)
115 {
116 	const zev_queue_t *qa = a;
117 	const zev_queue_t *qb = b;
118 	if (qa->zq_minor_number > qb->zq_minor_number)
119 		return 1;
120 	if (qa->zq_minor_number < qb->zq_minor_number)
121 		return -1;
122 	return 0;
123 }
124 
125 /* must be called with zev_mutex held */
126 void
127 zev_queue_trim(void)
128 {
129 	zev_msg_t *m;
130 	uint64_t oldest_message;
131 	zev_queue_t *q;
132 	int i;
133 
134 	if (!zev_queue_tail)
135 		return;
136 
137 	oldest_message = zev_queue_tail->seq + 1;  /* does not exist, yet. */
138 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
139 		q = zev_queues[i - ZEV_MINOR_MIN];
140 		if (q == NULL)
141 			continue;
142 		if (!q->zq_oldest)
143 			continue;
144 		if (oldest_message > q->zq_oldest->seq)
145 			oldest_message = q->zq_oldest->seq;
146 	}
147 
148 	/* remove msgs between oldest_message and zev_queue_head */
149 	while(zev_queue_head && (oldest_message > zev_queue_head->seq)) {
150 		m = zev_queue_head;
151 		zev_queue_head = m->next;
152 		if (zev_queue_head == NULL) {
153 			zev_queue_tail = NULL;
154 		} else {
155 			zev_queue_head->prev = NULL;
156 		}
157 		if (m->read == 0) {
158 			zev_statistics.zev_bytes_discarded += m->size;
159 			zev_statistics.zev_cnt_discarded_events++;
160 		}
161 		zev_statistics.zev_queue_len -= m->size;
162 		zev_queue_len--;
163 		zev_free(m, sizeof(*m) + m->size);
164 	}
165 }
166 
167 /* must be called with zev_mutex held */
168 static void
169 zev_queue_hold(zev_queue_t *q)
170 {
171 	q->zq_refcnt++;
172 }
173 
174 /* must be called with zev_mutex held */
175 static void
176 zev_queue_release(zev_queue_t *q)
177 {
178 	q->zq_refcnt--;
179 	if (q->zq_refcnt > 0)
180 		return;
181 
182 	ASSERT(q->zq_busy == B_FALSE);
183 
184 	/* persistent queues will not be removed */
185 	if ((q->zq_flags & ZEV_FL_PERSISTENT) != 0)
186 		return;
187 
188 	/* remove queue from queue list */
189 	zev_queues[q->zq_minor_number - ZEV_MINOR_MIN] = NULL;
190 
191 	/* discard messages that no queue references anymore */
192 	zev_queue_trim();
193 
194 	cv_destroy(&q->zq_condvar);
195 	ddi_remove_minor_node(q->zq_dip, q->zq_name);
196 	ddi_soft_state_free(statep, q->zq_minor_number);
197 	ZEV_MEM_SUB(sizeof(zev_queue_t));
198 	zev_queue_cnt--;
199 }
200 
201 int
202 zev_queue_new(zev_queue_t **queue,
203               dev_info_t *dip,
204               char *name,
205               uint64_t max_queue_len,
206               uint16_t flags)
207 {
208 	zev_queue_t *q;
209 	zev_queue_t *tmp;
210 	zev_msg_t *msg;
211 	int name_exists = 0;
212 	minor_t minor;
213 	char *p;
214 	int i;
215 
216 	if (max_queue_len > ZEV_MAX_QUEUE_LEN)
217 		return EINVAL;
218 	if (max_queue_len == 0)
219 		max_queue_len = ZEV_MAX_QUEUE_LEN;
220 	if (!strcmp(name, ZEV_CONTROL_DEVICE_NAME))
221 		return EINVAL;
222 	for (p = name; *p; p++) {
223 		if (*p >= 'a' && *p <= 'z')
224 			continue;
225 		if (*p >= '0' && *p <= '9')
226 			continue;
227 		if (*p == '.')
228 			continue;
229 		return EINVAL;
230 	}
231 
232 	mutex_enter(&zev_mutex);
233 
234 	/* find free minor number.*/
235 	/* if this were a frequent operation we'd have a free-minor list */
236 	for (minor = ZEV_MINOR_MIN; minor <= ZEV_MINOR_MAX; minor++) {
237 		tmp = zev_queues[minor - ZEV_MINOR_MIN];
238 		if (tmp == NULL)
239 			break;
240 	}
241 	if (tmp) {
242 		mutex_exit(&zev_mutex);
243 		return ENOSPC;
244 	}
245 
246 	if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) {
247 		mutex_exit(&zev_mutex);
248 		return ENOSPC;
249 	}
250 	ZEV_MEM_ADD(sizeof(zev_queue_t));
251 
252 	q = ddi_get_soft_state(statep, minor);
253 	memset(q, 0, sizeof(*q));
254 	strncpy(q->zq_name, name, ZEV_MAX_QUEUE_NAME_LEN);
255 	q->zq_name[ZEV_MAX_QUEUE_NAME_LEN] = '\0';
256 	q->zq_max_queue_len = max_queue_len;
257 	q->zq_wakeup_threshold = ZEV_DEFAULT_POLL_WAKEUP_QUEUE_LEN;
258 	q->zq_flags = flags;
259 	q->zq_refcnt = 1;
260 	q->zq_dip = dip;
261 	q->zq_minor_number = minor;
262 	cv_init(&q->zq_condvar, NULL, CV_DRIVER, NULL);
263 
264 	/* insert into queue list */
265 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
266 		/* if this were a frequent operation we'd have a name tree */
267 		if (zev_queues[i - ZEV_MINOR_MIN] == NULL)
268 			continue;
269 		if (!strcmp(q->zq_name, zev_queues[i-ZEV_MINOR_MIN]->zq_name)) {
270 			name_exists = 1;
271 			break;
272 		}
273 	}
274 	if (name_exists) {
275 		ddi_soft_state_free(statep, minor);
276 		ZEV_MEM_SUB(sizeof(zev_queue_t));
277 		mutex_exit(&zev_mutex);
278 		return EEXIST;
279 	}
280 	zev_queues[minor - ZEV_MINOR_MIN] = q;
281 	zev_queue_cnt++;
282 
283 	/* calculate current queue len and find head and tail */
284 	q->zq_oldest = zev_queue_tail;
285 	msg = zev_queue_tail;
286 	while ((msg != NULL) && (q->zq_queue_len < q->zq_max_queue_len)) {
287 		q->zq_queue_len += msg->size;
288 		q->zq_queue_messages++;
289 		q->zq_oldest = msg;
290 		msg = msg->prev;
291 	}
292 
293 	mutex_exit(&zev_mutex);
294 
295 	if (ddi_create_minor_node(dip, name,
296 	    S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
297 		mutex_enter(&zev_mutex);
298 		zev_queues[minor - ZEV_MINOR_MIN] = NULL;
299 		zev_queue_cnt--;
300 		ddi_soft_state_free(statep, minor);
301 		ZEV_MEM_SUB(sizeof(zev_queue_t));
302 		mutex_exit(&zev_mutex);
303 		return EFAULT;
304 	}
305 
306 	*queue = q;
307 	return 0;
308 }
309 
/*
 * poll() wakeup thread.  Used to check periodically whether we have
 * bytes left in the queue that have not yet been made into a
 * pollwakeup() call.  This is meant to ensure a maximum waiting
 * time until an event is presented as a poll wakeup, while at
 * the same time not making every single event into a poll wakeup
 * of its own.
 */
318 
319 static void
320 zev_poll_wakeup(boolean_t flush_all)
321 {
322 	zev_queue_t *q;
323 	int i;
324 
325 	/*
326 	 * This loop works with hold() and release() because
327 	 * pollwakeup() requires us to release our locks before calling it.
328 	 *
329 	 * from pollwakeup(9F):
330 	 *
331 	 *   "Driver defined locks should not be held across calls
332 	 *    to this function."
333 	 */
334 
335 	/* wake up threads for each individual queue */
336 	mutex_enter(&zev_mutex);
337 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
338 		q = zev_queues[i - ZEV_MINOR_MIN];
339 		if (q == NULL)
340 			continue;
341 		if (!q->zq_busy)
342 			continue;
343 		if (!q->zq_queue_len)
344 			continue;
345 		if ((flush_all) ||
346 		    (q->zq_queue_len > q->zq_wakeup_threshold)) {
347 			zev_queue_hold(q);
348 			mutex_exit(&zev_mutex);
349 			pollwakeup(&q->zq_pollhead, POLLIN);
350 			mutex_enter(&zev_mutex);
351 			zev_queue_release(q);
352 		}
353 	}
354 	mutex_exit(&zev_mutex);
355 }
356 
357 static void
358 zev_poll_wakeup_thread_main(void)
359 {
360 	while (zev_wakeup_thread_run) {
361 		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
362 
363 		zev_poll_wakeup(B_TRUE);
364 	}
365 	thread_exit();
366 }
367 
368 static int
369 zev_ioc_mute_pool(char *poolname)
370 {
371 	zev_pool_list_entry_t *pe;
372 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
373 	/* pool already muted? */
374 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
375 		if (!strcmp(pe->name, poolname)) {
376 			rw_exit(&zev_pool_list_rwlock);
377 			return EEXIST;
378 		}
379 	}
380 	pe = zev_zalloc(sizeof(*pe));
381 	if (!pe) {
382 		rw_exit(&zev_pool_list_rwlock);
383 		return ENOMEM;
384 	}
385 	(void) strncpy(pe->name, poolname, sizeof(pe->name));
386 	pe->next = zev_muted_pools_head;
387 	zev_muted_pools_head = pe;
388 	rw_exit(&zev_pool_list_rwlock);
389 	return (0);
390 }
391 
392 static int
393 zev_ioc_unmute_pool(char *poolname)
394 {
395 	zev_pool_list_entry_t *pe, *peprev;
396 
397 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
398 	/* pool muted? */
399 	peprev = NULL;
400 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
401 		if (!strcmp(pe->name, poolname))
402 			break;
403 		peprev = pe;
404 	}
405 	if (pe) {
406 		rw_exit(&zev_pool_list_rwlock);
407 		return ENOENT;
408 	}
409 
410 	if (peprev != NULL) {
411 		peprev->next = pe->next;
412 	} else {
413 		zev_muted_pools_head = pe->next;
414 	}
415 	zev_free(pe, sizeof(*pe));
416 	rw_exit(&zev_pool_list_rwlock);
417 	return (0);
418 }
419 
420 int
421 zev_skip_pool(objset_t *os)
422 {
423 	zev_pool_list_entry_t *pe;
424 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
425 	rw_enter(&zev_pool_list_rwlock, RW_READER);
426 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
427 		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
428 			rw_exit(&zev_pool_list_rwlock);
429 			return 1;
430 		}
431 	}
432 	rw_exit(&zev_pool_list_rwlock);
433 	return 0;
434 }
435 
436 static void
437 zev_update_statistics(int op, zev_statistics_t *stat)
438 {
439 	switch (op) {
440 	case ZEV_OP_ERROR:
441 		stat->zev_cnt_errors++;
442 		break;
443 	case ZEV_OP_MARK:
444 		stat->zev_cnt_marks++;
445 		break;
446 	case ZEV_OP_ZFS_MOUNT:
447 		stat->zev_cnt_zfs_mount++;
448 		break;
449 	case ZEV_OP_ZFS_UMOUNT:
450 		stat->zev_cnt_zfs_umount++;
451 		break;
452 	case ZEV_OP_ZVOL_WRITE:
453 		stat->zev_cnt_zvol_write++;
454 		break;
455 	case ZEV_OP_ZVOL_TRUNCATE:
456 		stat->zev_cnt_zvol_truncate++;
457 		break;
458 	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
459 		stat->zev_cnt_znode_close_after_update++;
460 		break;
461 	case ZEV_OP_ZNODE_CREATE:
462 		stat->zev_cnt_znode_create++;
463 		break;
464 	case ZEV_OP_ZNODE_REMOVE:
465 		stat->zev_cnt_znode_remove++;
466 		break;
467 	case ZEV_OP_ZNODE_LINK:
468 		stat->zev_cnt_znode_link++;
469 		break;
470 	case ZEV_OP_ZNODE_SYMLINK:
471 		stat->zev_cnt_znode_symlink++;
472 		break;
473 	case ZEV_OP_ZNODE_RENAME:
474 		stat->zev_cnt_znode_rename++;
475 		break;
476 	case ZEV_OP_ZNODE_WRITE:
477 		stat->zev_cnt_znode_write++;
478 		break;
479 	case ZEV_OP_ZNODE_TRUNCATE:
480 		stat->zev_cnt_znode_truncate++;
481 		break;
482 	case ZEV_OP_ZNODE_SETATTR:
483 		stat->zev_cnt_znode_setattr++;
484 		break;
485 	case ZEV_OP_ZNODE_ACL:
486 		stat->zev_cnt_znode_acl++;
487 		break;
488 	}
489 }
490 
491 void
492 zev_queue_message(int op, zev_msg_t *msg)
493 {
494 	zev_queue_t *q;
495 	int wakeup = 0;
496 	zev_msg_t *m;
497 	int i;
498 
499 	msg->next = NULL;
500 	msg->prev = NULL;
501 	msg->read = 0;
502 
503 	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
504 		zev_queue_error(op, "unknown op id encountered: %d", op);
505 		zev_free(msg, sizeof(*msg) + msg->size);
506 		return;
507 	}
508 
509 	/*
510 	 * This mutex protects us agains race conditions when several
511 	 * threads want to queue a message and one or more queues are
512 	 * full:  we release zev_mutex to wait for the queues to become
513 	 * less-than-full, but we don't know in which order the waiting
514 	 * threads will be awoken.  If it's not the same order in which
515 	 * they went to sleep we might mark different messages as "newest"
516 	 * in different queues, and so we might have dupes or even
517 	 * skip messages.
518 	 */
519 	mutex_enter(&zev_queue_msg_mutex);
520 
521 	mutex_enter(&zev_mutex);
522 
523 	/*
524 	 * When the module is loaded, the default behavior ist to
525 	 * put all events into a queue and block if the queue is full.
526 	 * This is done even before the pseudo device is attached.
527 	 * This way, no events are lost.
528 	 *
529 	 * To discard events entirely the "beaver" queue,
530 	 * which never discards anything, has to be removed.
531 	 */
532 
533 	if (zev_queue_cnt == 0) {
534 		mutex_exit(&zev_mutex);
535 		mutex_exit(&zev_queue_msg_mutex);
536 		return;
537 	}
538 
539 	/* put message into global queue */
540 	msg->seq = zev_msg_sequence_number++;
541 	while (zev_statistics.zev_max_queue_len &&
542 	    zev_statistics.zev_queue_len >= zev_statistics.zev_max_queue_len) {
543 		/* queue full.  block until it's been shrunk. */
544 		cv_wait(&zev_condvar, &zev_mutex);
545 	}
546 
547 	if (zev_queue_tail == NULL) {
548 		zev_queue_head = zev_queue_tail = msg;
549 	} else {
550 		zev_queue_tail->next = msg;
551 		msg->prev = zev_queue_tail;
552 		zev_queue_tail = msg;
553 	}
554 	zev_queue_len++;
555 	zev_statistics.zev_cnt_total_events++;
556 	zev_statistics.zev_queue_len += msg->size;
557 
558 	/* update per-device queues */
559 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
560 		q = zev_queues[i - ZEV_MINOR_MIN];
561 		if (!q)
562 			continue;
563 
564 		zev_queue_hold(q);
565 
566 		/* make sure queue has enough room */
567 		while (q->zq_max_queue_len &&
568 		       q->zq_queue_len > q->zq_max_queue_len) {
569 
570 			if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
571 				/* block until queue has been shrunk. */
572 				cv_wait(&zev_condvar, &zev_mutex);
573 			} else {
574 				/* discard msgs until queue is small enough */
575 				while (q->zq_queue_len > q->zq_max_queue_len) {
576 					m = q->zq_oldest;
577 					if (m == NULL)
578 						break;
579 					q->zq_events_discarded++;
580 					q->zq_bytes_discarded += m->size;
581 					q->zq_oldest = m->next;
582 					q->zq_queue_len -= m->size;
583 					q->zq_queue_messages--;
584 				}
585 			}
586 		}
587 
588 		/* register new message at the end of the queue */
589 		q->zq_queue_len += msg->size;
590 		q->zq_queue_messages++;
591 		q->zq_bytes_total += msg->size;
592 		q->zq_events_total++;
593 		if (q->zq_oldest == NULL)
594 			q->zq_oldest = msg;
595 
596 		zev_update_statistics(op, &q->zq_statistics);
597 
598 		if (q->zq_queue_len > q->zq_wakeup_threshold)
599 			wakeup = 1;
600 		if (q->zq_queue_len == msg->size)  /* queue was empty */
601 			cv_broadcast(&q->zq_condvar);
602 
603 		zev_queue_release(q);
604 	}
605 
606 	zev_queue_trim();
607 
608 	zev_update_statistics(op, &zev_statistics);
609 	mutex_exit(&zev_mutex);
610 	mutex_exit(&zev_queue_msg_mutex);
611 
612 	/* one or more queues need a pollwakeup() */
613 	if (op == ZEV_OP_MARK) {
614 		zev_poll_wakeup(B_TRUE);
615 	} else if (wakeup) {
616 		zev_poll_wakeup(B_FALSE);
617 	}
618 
619 	return;
620 }
621 
622 void
623 zev_queue_error(int op, char *fmt, ...)
624 {
625 	char buf[ZEV_MAX_MESSAGE_LEN];
626 	va_list ap;
627 	int len;
628 	zev_msg_t *msg = NULL;
629 	zev_error_t *rec;
630 	int msg_size;
631 
632 	va_start(ap, fmt);
633 	len = vsnprintf(buf, sizeof(buf), fmt, ap);
634 	va_end(ap);
635 	if (len >= sizeof(buf)) {
636 		cmn_err(CE_WARN, "zev: can't report error - "
637 		        "dropping event entirely.");
638 		return;
639 	}
640 
641 	msg_size = sizeof(*rec) + len + 1;
642 	msg = zev_alloc(sizeof(*msg) + msg_size);
643 	msg->size = msg_size;
644 	rec = (zev_error_t *)(msg + 1);
645 	rec->record_len = msg_size;
646 	rec->op = ZEV_OP_ERROR;
647 	rec->op_time = ddi_get_time();
648 	rec->guid = 0;
649 	rec->failed_op = op;
650 	rec->errstr_len = len;
651 	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);
652 
653 	zev_queue_message(ZEV_OP_ERROR, msg);
654 	return;
655 }
656 
657 static int
658 zev_find_queue(zev_queue_t **out, zev_queue_t *req_q, zev_queue_name_t *name)
659 {
660 	char namebuf[ZEV_MAX_QUEUE_NAME_LEN+1];
661 	zev_queue_t *q;
662 	int i;
663 
664 	*out = NULL;
665 
666 	if (name->zev_namelen == 0) {
667 		if (req_q->zq_minor_number == ZEV_CONTROL_DEVICE_MINOR)
668 			return EINVAL;
669 		zev_queue_hold(req_q);
670 		*out = req_q;
671 		return 0;
672 	}
673 
674 	if (name->zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
675 		return EINVAL;
676 	strncpy(namebuf, name->zev_name, name->zev_namelen);
677 	namebuf[name->zev_namelen] = '\0';
678 
679 	mutex_enter(&zev_mutex);
680 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
681 		q = zev_queues[i - ZEV_MINOR_MIN];
682 		if (!q)
683 			continue;
684 		if (!strcmp(q->zq_name, namebuf)) {
685 			zev_queue_hold(q);
686 			mutex_exit(&zev_mutex);
687 			*out = q;
688 			return 0;
689 		}
690 	}
691 	mutex_exit(&zev_mutex);
692 	return ENOENT;
693 }
694 
695 static int
696 zev_ioc_get_queue_statistics(zev_queue_t *req_q, intptr_t arg, int mode)
697 {
698 	zev_ioctl_get_queue_statistics_t gs;
699 	zev_queue_t *q;
700 	int ret;
701 
702 	if (ddi_copyin((void *)arg, &gs, sizeof(gs), mode) != 0)
703 		return EFAULT;
704 
705 	ret = zev_find_queue(&q, req_q, &gs.zev_queue_name);
706 	if (ret)
707 		return ret;
708 
709 	/* ddi_copyout() can take a long time.  Better make
710 	   a copy to be able to release the mutex faster. */
711 	mutex_enter(&zev_mutex);
712 	memcpy(&gs.zev_statistics, &q->zq_statistics,sizeof(gs.zev_statistics));
713 	gs.zev_statistics.zev_queue_len = q->zq_queue_len;
714 	gs.zev_statistics.zev_bytes_read = q->zq_bytes_read;
715 	gs.zev_statistics.zev_bytes_discarded = q->zq_bytes_discarded;
716 	gs.zev_statistics.zev_max_queue_len = q->zq_max_queue_len;
717 	gs.zev_statistics.zev_cnt_discarded_events = q->zq_events_discarded;
718 	gs.zev_statistics.zev_cnt_total_events = q->zq_events_total;
719 	zev_queue_release(q);
720 	mutex_exit(&zev_mutex);
721 
722 	if (ddi_copyout(&gs, (void *)arg, sizeof(gs), mode) != 0)
723 		return EFAULT;
724 	return 0;
725 }
726 
727 static int
728 zev_ioc_set_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
729 {
730 	zev_ioctl_set_queue_properties_t qp;
731 	zev_queue_t *q;
732 	uint64_t old_max;
733 	uint64_t old_flags;
734 	int ret;
735 
736 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
737 		return EFAULT;
738 	if (qp.zev_max_queue_len > ZEV_MAX_QUEUE_LEN)
739 		return EINVAL;
740 	if (qp.zev_poll_wakeup_threshold > ZEV_MAX_POLL_WAKEUP_QUEUE_LEN)
741 		return EINVAL;
742 
743 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
744 	if (ret)
745 		return ret;
746 
747 	mutex_enter(&zev_mutex);
748 
749 	/*
750 	 * Note: if the PERSISTENT flag is cleared, and the queue is not busy,
751 	 * the queue should be removed by zev_queue_release() in zev_ioctl().
752 	 */
753 	old_flags = qp.zev_flags;
754 	q->zq_flags = qp.zev_flags;
755 	if ((old_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) &&
756 	   (!(qp.zev_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL))) {
757 		/* queue is no longer blocking - wake blocked threads */
758 		cv_broadcast(&zev_condvar);
759 	}
760 
761 	old_max = q->zq_max_queue_len;
762 	q->zq_max_queue_len = qp.zev_max_queue_len;
763 	if (q->zq_max_queue_len < old_max)
764 		zev_queue_trim();
765 	if (q->zq_max_queue_len > old_max)
766 		cv_broadcast(&zev_condvar);	/* threads may be waiting */
767 
768 	if ((qp.zev_poll_wakeup_threshold < q->zq_wakeup_threshold) &&
769 	    (qp.zev_poll_wakeup_threshold <= q->zq_queue_len))
770 		pollwakeup(&q->zq_pollhead, POLLIN);
771 	q->zq_wakeup_threshold = qp.zev_poll_wakeup_threshold;
772 
773 	zev_queue_release(q);
774 	mutex_exit(&zev_mutex);
775 	return 0;
776 }
777 
778 static int
779 zev_ioc_get_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
780 {
781 	zev_ioctl_get_queue_properties_t qp;
782 	zev_queue_t *q;
783 	int ret;
784 
785 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
786 		return EFAULT;
787 
788 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
789 	if (ret)
790 		return ret;
791 
792 	mutex_enter(&zev_mutex);
793 	qp.zev_max_queue_len = q->zq_max_queue_len;
794 	qp.zev_flags = q->zq_flags;
795 	qp.zev_poll_wakeup_threshold = q->zq_wakeup_threshold;
796 	zev_queue_release(q);
797 	mutex_exit(&zev_mutex);
798 
799 	if (ddi_copyout(&qp, (void *)arg, sizeof(qp), mode) != 0)
800 		return EFAULT;
801 	return 0;
802 }
803 
804 static int
805 zev_ioc_add_queue(zev_queue_t *req_q, intptr_t arg, int mode)
806 {
807 	zev_ioctl_add_queue_t aq;
808 	zev_queue_t *new_q;
809 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
810 
811 	if (ddi_copyin((void *)arg, &aq, sizeof(aq), mode) != 0)
812 		return EFAULT;
813 
814 	if (aq.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
815 		return EINVAL;
816 	strncpy(name, aq.zev_name, aq.zev_namelen);
817 	name[aq.zev_namelen] = '\0';
818 
819 	return zev_queue_new(&new_q, req_q->zq_dip, name,
820 	                     aq.zev_max_queue_len, aq.zev_flags);
821 }
822 
823 static int
824 zev_ioc_remove_queue(zev_queue_t *req_q, intptr_t arg, int mode)
825 {
826 	zev_ioctl_remove_queue_t rq;
827 	zev_queue_t *q;
828 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
829 	int found = 0;
830 	int i;
831 
832 	if (ddi_copyin((void *)arg, &rq, sizeof(rq), mode) != 0)
833 		return EFAULT;
834 
835 	if (rq.zev_queue_name.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
836 		return EINVAL;
837 	strncpy(name, rq.zev_queue_name.zev_name,
838 	        rq.zev_queue_name.zev_namelen);
839 	name[rq.zev_queue_name.zev_namelen] = '\0';
840 
841 	mutex_enter(&zev_mutex);
842 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
843 		q = zev_queues[i - ZEV_MINOR_MIN];
844 		if (!q)
845 			continue;
846 		if (!strcmp(q->zq_name, name)) {
847 			found = 1;
848 			break;
849 		}
850 	}
851 	if (!found) {
852 		mutex_exit(&zev_mutex);
853 		return ENOENT;
854 	}
855 
856 	if (q->zq_busy) {
857 		mutex_exit(&zev_mutex);
858 		return EBUSY;
859 	}
860 	/*
861 	 * clear flags, so that persistent queues are removed aswell
862 	 * and the queue becomes non-blocking.
863 	 */
864 	q->zq_flags = 0;
865 	if (q->zq_to_be_removed == B_FALSE) {
866 		q->zq_to_be_removed = B_TRUE;
867 		zev_queue_release(q);
868 	}
869 	/* some threads might be waiting for this queue to become writable */
870 	cv_broadcast(&zev_condvar);
871 
872 	mutex_exit(&zev_mutex);
873 	return 0;
874 }
875 
876 static int
877 zev_ioc_get_debug_info(zev_queue_t *req_q, intptr_t arg, int mode)
878 {
879 	zev_ioctl_debug_info_t di;
880 	uint64_t mem_allocated = atomic_add_64_nv(&zev_memory_allocated, 0);
881 	uint64_t mem_freed     = atomic_add_64_nv(&zev_memory_freed, 0);
882 
883 	zev_chksum_stats(&di.zev_chksum_cache_size,
884 	                 &di.zev_chksum_cache_hits,
885 	                 &di.zev_chksum_cache_misses);
886 	di.zev_memory_allocated = mem_allocated - mem_freed;
887 	if (ddi_copyout(&di, (void *)arg, sizeof(di), mode) != 0)
888 		return EFAULT;
889 	return 0;
890 }
891 
892 static int
893 zev_ioc_get_queue_list(zev_queue_t *req_q, intptr_t arg, int mode)
894 {
895 	zev_ioctl_get_queue_list_t gql;
896 	zev_queue_t *q;
897 	int i = 0;
898 	int count = 0;
899 
900 	memset(&gql, 0, sizeof(gql));
901 
902 	mutex_enter(&zev_mutex);
903 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
904 		q = zev_queues[i - ZEV_MINOR_MIN];
905 		if (!q)
906 			continue;
907 		strncpy(gql.zev_queue_name[count].zev_name,
908 		    q->zq_name, ZEV_MAX_QUEUE_NAME_LEN);
909 		gql.zev_queue_name[count].zev_namelen = strlen(q->zq_name);
910 		count++;
911 	}
912 	gql.zev_n_queues = count;
913 	mutex_exit(&zev_mutex);
914 
915 	if (ddi_copyout(&gql, (void *)arg, sizeof(gql), mode) != 0)
916 		return EFAULT;
917 	return 0;
918 }
919 
920 /* ARGSUSED */
921 static int
922 zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
923 {
924 	zev_statistics_t zs;
925 	zev_ioctl_poolarg_t pa;
926 	zev_ioctl_mark_t mark;
927 	zev_mark_t *rec;
928 	int msg_size;
929 	zev_msg_t *msg;
930 	uint64_t len;
931 	uint64_t mark_id;
932 	minor_t minor;
933 	zev_queue_t *req_q;
934 	int ret = 0;
935 
936 	minor = getminor(dev);
937 	mutex_enter(&zev_mutex);
938 	if ((req_q = ddi_get_soft_state(statep, minor)) == NULL) {
939 		mutex_exit(&zev_mutex);
940 		return (ENXIO);
941 	}
942 	zev_queue_hold(req_q);
943 	mutex_exit(&zev_mutex);
944 	/*
945 	 * all structures passed between kernel and userspace
946 	 * are now compatible between 64 and 32 bit.  Model
947 	 * conversion can be ignored.
948 	 */
949 	switch (cmd) {
950 	case ZEV_IOC_GET_GLOBAL_STATISTICS:
951 		/* ddi_copyout() can take a long time.  Better make
952 		   a copy to be able to release the mutex faster. */
953 		mutex_enter(&zev_mutex);
954 		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
955 		mutex_exit(&zev_mutex);
956 		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
957 			ret = EFAULT;
958 		break;
959 	case ZEV_IOC_GET_QUEUE_STATISTICS:
960 		ret = zev_ioc_get_queue_statistics(req_q, arg, mode);
961 		break;
962 	case ZEV_IOC_MUTE_POOL:
963 	case ZEV_IOC_UNMUTE_POOL:
964 		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) {
965 			ret = EFAULT;
966 			break;
967 		}
968 		if (pa.zev_poolname_len >=MAXPATHLEN) {
969 			ret = EINVAL;
970 			break;
971 		}
972 		pa.zev_poolname[pa.zev_poolname_len] = '\0';
973 		if (cmd == ZEV_IOC_MUTE_POOL) {
974 			ret = zev_ioc_mute_pool(pa.zev_poolname);
975 		} else {
976 			ret = zev_ioc_unmute_pool(pa.zev_poolname);
977 		}
978 		break;
979 	case ZEV_IOC_SET_MAX_QUEUE_LEN:
980 		if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) {
981 			ret = EFAULT;
982 			break;
983 		}
984 		if (len > ZEV_MAX_QUEUE_LEN) {
985 			ret = EINVAL;
986 			break;
987 		}
988 		mutex_enter(&zev_mutex);
989 		zev_statistics.zev_max_queue_len = len;
990 		cv_broadcast(&zev_condvar);
991 		mutex_exit(&zev_mutex);
992 		break;
993 	case ZEV_IOC_GET_QUEUE_PROPERTIES:
994 		ret = zev_ioc_get_queue_properties(req_q, arg, mode);
995 		break;
996 	case ZEV_IOC_SET_QUEUE_PROPERTIES:
997 		ret = zev_ioc_set_queue_properties(req_q, arg, mode);
998 		break;
999 	case ZEV_IOC_MARK:
1000 		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) {
1001 			ret = EFAULT;
1002 			break;
1003 		}
1004 		/* prepare message */
1005 		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
1006 		msg = zev_alloc(sizeof(*msg) + msg_size);
1007 		msg->size = msg_size;
1008 		rec = (zev_mark_t *)(msg + 1);
1009 		rec->record_len = msg_size;
1010 		rec->op = ZEV_OP_MARK;
1011 		rec->op_time = ddi_get_time();
1012 		rec->guid = mark.zev_guid;
1013 		rec->payload_len = mark.zev_payload_len;
1014 		/* get payload */
1015 		if (ddi_copyin(((char *)arg) + sizeof(mark),
1016 		               ZEV_PAYLOAD(rec),
1017 		               mark.zev_payload_len, mode) != 0) {
1018 			zev_free(msg, msg_size);
1019 			ret = EFAULT;
1020 			break;
1021 		}
1022 		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
1023 		/* get mark id and queue message */
1024 		mutex_enter(&zev_mark_id_mutex);
1025 		mark_id = zev_mark_id++;
1026 		mutex_exit(&zev_mark_id_mutex);
1027 		rec->mark_id = mark_id;
1028 		zev_queue_message(ZEV_OP_MARK, msg);
1029 		/* report mark id to userland, ignore errors */
1030 		mark.zev_mark_id = mark_id;
1031 		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
1032 		break;
1033 	case ZEV_IOC_ADD_QUEUE:
1034 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1035 			ret = EACCES;
1036 			break;
1037 		}
1038 		ret = zev_ioc_add_queue(req_q, arg, mode);
1039 		break;
1040 	case ZEV_IOC_REMOVE_QUEUE:
1041 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1042 			ret = EACCES;
1043 			break;
1044 		}
1045 		ret = zev_ioc_remove_queue(req_q, arg, mode);
1046 		break;
1047 	case ZEV_IOC_GET_DEBUG_INFO:
1048 		ret = zev_ioc_get_debug_info(req_q, arg, mode);
1049 		break;
1050 	case ZEV_IOC_GET_QUEUE_LIST:
1051 		ret = zev_ioc_get_queue_list(req_q, arg, mode);
1052 		break;
1053 	default:
1054 		/* generic "ioctl unknown" error */
1055 		ret = ENOTTY;
1056 	}
1057 
1058 	mutex_enter(&zev_mutex);
1059 	zev_queue_release(req_q);
1060 	mutex_exit(&zev_mutex);
1061 	return (ret);
1062 }
1063 
1064 static int
1065 zev_chpoll(dev_t dev, short events, int anyyet,
1066     short *reventsp, struct pollhead **phpp)
1067 {
1068 	int minor;
1069 	short revent = 0;
1070 	zev_queue_t *q;
1071 
1072 	/* use minor-specific queue context and it's pollhead */
1073 	minor = getminor(dev);
1074 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1075 		return (EINVAL);
1076 	mutex_enter(&zev_mutex);
1077 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1078 		mutex_exit(&zev_mutex);
1079 		return (ENXIO);
1080 	}
1081 	revent = 0;
1082 	if ((events & POLLIN)) {
1083 		if (q->zq_oldest)
1084 			revent |= POLLIN;
1085 	}
1086 	if (revent == 0) {
1087 		if (!anyyet) {
1088 			*phpp = &q->zq_pollhead;
1089 		}
1090 	}
1091 	*reventsp = revent;
1092 	mutex_exit(&zev_mutex);
1093 	return (0);
1094 }
1095 
1096 /* ARGSUSED */
1097 static int
1098 zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
1099 {
1100 	minor_t minor;
1101 	offset_t off;
1102 	int ret = 0;
1103 	zev_msg_t *msg;
1104 	char *data;
1105 	zev_queue_t *q;
1106 
1107 	minor = getminor(dev);
1108 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1109 		return (EINVAL);
1110 
1111 	mutex_enter(&zev_mutex);
1112 	q = ddi_get_soft_state(statep, minor);
1113 	if (q == NULL) {
1114 		mutex_exit(&zev_mutex);
1115 		return (ENXIO);
1116 	}
1117 	off = uio_p->uio_loffset;
1118 	msg = q->zq_oldest;
1119 	while (msg == NULL) {
1120 		if (!ddi_can_receive_sig()) {
1121 			/*
1122 			 * read() shouldn't block because this thread
1123 			 * can't receive signals. (e.g., it might be
1124 			 * torn down by exit() right now.)
1125 			 */
1126 			mutex_exit(&zev_mutex);
1127 			return 0;
1128 		}
1129 		if (cv_wait_sig(&q->zq_condvar, &zev_mutex) == 0) {
1130 			/* signal received. */
1131 			mutex_exit(&zev_mutex);
1132 			return EINTR;
1133 		}
1134 		msg = q->zq_oldest;
1135 	}
1136 	if (msg->size > uio_p->uio_resid) {
1137 		mutex_exit(&zev_mutex);
1138 		return E2BIG;
1139 	}
1140 	while (msg && uio_p->uio_resid >= msg->size) {
1141 		data = (char *)(msg + 1);
1142 		ret = uiomove(data, msg->size, UIO_READ, uio_p);
1143 		if (ret != 0) {
1144 			mutex_exit(&zev_mutex);
1145 			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
1146 			uio_p->uio_loffset = off;
1147 			return (ret);
1148 		}
1149 		q->zq_oldest = msg->next;
1150 		q->zq_bytes_read += msg->size;
1151 		q->zq_queue_len -= msg->size;
1152 		q->zq_queue_messages--;
1153 		msg->read++;
1154 		msg = q->zq_oldest;
1155 	}
1156 	cv_broadcast(&zev_condvar);
1157 	mutex_exit(&zev_mutex);
1158 	uio_p->uio_loffset = off;
1159 	return 0;
1160 }
1161 
1162 /* ARGSUSED */
1163 static int
1164 zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
1165 {
1166 	zev_queue_t *q;
1167 	int minor;
1168 
1169 	minor = getminor(dev);
1170 	if (otyp != OTYP_CHR)
1171 		return (EINVAL);
1172 	mutex_enter(&zev_mutex);
1173 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1174 		mutex_exit(&zev_mutex);
1175 		return (ENXIO);
1176 	}
1177 	if (q->zq_busy != B_TRUE) {
1178 		mutex_exit(&zev_mutex);
1179 		return (EINVAL);
1180 	}
1181 	q->zq_busy = B_FALSE;
1182 	if ((q->zq_flags & ZEV_FL_PERSISTENT) == 0)
1183 		zev_queue_release(q);
1184 	mutex_exit(&zev_mutex);
1185 	return (0);
1186 }
1187 
1188 /* ARGSUSED */
1189 static int
1190 zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1191 {
1192 	zev_queue_t *q;
1193 	minor_t minor;
1194 
1195 	minor = getminor(*devp);
1196 	if (otyp != OTYP_CHR)
1197 		return (EINVAL);
1198 	if (drv_priv(credp) != 0)
1199 		return (EPERM);
1200 	mutex_enter(&zev_mutex);
1201 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1202 		mutex_exit(&zev_mutex);
1203 		return (ENXIO);
1204 	}
1205 	if (minor == ZEV_CONTROL_DEVICE_MINOR) {
1206 		/* control device may be used in parallel */
1207 		q->zq_busy = B_TRUE;
1208 		mutex_exit(&zev_mutex);
1209 		return 0;
1210 	}
1211 	if (q->zq_busy == B_TRUE) {
1212 		mutex_exit(&zev_mutex);
1213 		return (EBUSY);
1214 	}
1215 	q->zq_busy = B_TRUE;	/* can only be opened exclusively */
1216 	mutex_exit(&zev_mutex);
1217 	return (0);
1218 }
1219 
1220 static struct cb_ops zev_cb_ops = {
1221 	zev_open,		/* open */
1222 	zev_close,		/* close */
1223 	nodev,			/* strategy */
1224 	nodev,			/* print */
1225 	nodev,			/* dump */
1226 	zev_read,		/* read */
1227 	nodev,			/* write */
1228 	zev_ioctl,		/* ioctl */
1229 	nodev,			/* devmap */
1230 	nodev,			/* mmap */
1231 	nodev,			/* segmap */
1232 	zev_chpoll,		/* chpoll */
1233 	ddi_prop_op,		/* prop_op */
1234 	NULL,			/* streamtab */
1235 	D_MP | D_64BIT,		/* cb_flag */
1236 	CB_REV,			/* cb_rev */
1237 	nodev,			/* aread */
1238 	nodev,			/* awrite */
1239 };
1240 
1241 static void
1242 zev_free_instance(dev_info_t *dip)
1243 {
1244 	int instance;
1245 	zev_queue_t *q;
1246 	int i;
1247 
1248 	instance = ddi_get_instance(dip);
1249 	if (instance != 0) {
1250 		cmn_err(CE_WARN, "zev: tried to free instance != 0 (%d)",
1251 		        instance);
1252 		return;
1253 	}
1254 
1255 	ddi_remove_minor_node(dip, NULL);
1256 
1257 	/* stop pollwakeup thread */
1258 	zev_wakeup_thread_run = 0;
1259 	if (zev_poll_wakeup_thread != NULL) {
1260 		thread_join(zev_poll_wakeup_thread->t_did);
1261 		zev_poll_wakeup_thread = NULL;
1262 	}
1263 
1264 	mutex_enter(&zev_mutex);
1265 
1266 	/* remove "ctrl" dummy queue */
1267 	q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1268 	if (q) {
1269 		ddi_soft_state_free(statep, ZEV_CONTROL_DEVICE_MINOR);
1270 		ZEV_MEM_SUB(sizeof(zev_queue_t));
1271 	}
1272 
1273 	/* remove all other queues */
1274 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1275 		q = zev_queues[i- ZEV_MINOR_MIN];
1276 		if (!q)
1277 			continue;
1278 		ASSERT(q->zq_refcnt == 1);
1279 		zev_queue_release(q);
1280 	}
1281 	zev_queue_trim();
1282 	bzero(&zev_queues, sizeof(zev_queues));
1283 
1284 	mutex_exit(&zev_mutex);
1285 
1286 }
1287 
1288 static int
1289 zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1290 {
1291 	int instance;
1292 	zev_queue_t *q;
1293 
1294 	/* called once per instance with DDI_DETACH,
1295 	   may be called to suspend */
1296 	switch (cmd) {
1297 	case DDI_DETACH:
1298 		/* instance busy? */
1299 		instance = ddi_get_instance(dip);
1300 		if (instance != 0) {	/* hardcoded in zev.conf */
1301 			/* this module only supports one instance. */
1302 			return (DDI_FAILURE);
1303 		}
1304 
1305 		mutex_enter(&zev_mutex);
1306 		if (!zev_attached) {
1307 			mutex_exit(&zev_mutex);
1308 			return (DDI_FAILURE);
1309 		}
1310 
1311 		/* check "ctrl" queue to see if t is busy */
1312 		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1313 		if (q == NULL) {
1314 			mutex_exit(&zev_mutex);
1315 			return (DDI_FAILURE);
1316 		}
1317 		if (q->zq_busy) {
1318 			mutex_exit(&zev_mutex);
1319 			return (DDI_FAILURE);
1320 		}
1321 		/* are there any queues? */
1322 		if (zev_queue_cnt > 0) {
1323 			mutex_exit(&zev_mutex);
1324 			return (DDI_FAILURE);
1325 		}
1326 
1327 		zev_attached = B_FALSE;
1328 		mutex_exit(&zev_mutex);
1329 
1330 		/* switch ZFS event callbacks back to default */
1331 		rw_enter(&rz_zev_rwlock, RW_WRITER);
1332 		rz_zev_callbacks = rz_zev_default_callbacks;
1333 		rz_zev_set_active(B_FALSE);
1334 		rw_exit(&rz_zev_rwlock);
1335 
1336 		/* no thread is inside of the callbacks anymore. */
1337 
1338 		/* free resources allocated for this instance */
1339 		zev_free_instance(dip);
1340 		zev_chksum_fini();
1341 #if 0
1342 		cmn_err(CE_WARN, "zev: allocated memory at detach: %" PRIu64,
1343 			zev_memory_allocated - zev_memory_freed);
1344 #endif
1345 		return (DDI_SUCCESS);
1346 	case DDI_SUSPEND:
1347 		/* kernel must not suspend zev devices while ZFS is running */
1348 		return (DDI_FAILURE);
1349 	default:
1350 		return (DDI_FAILURE);
1351 	}
1352 }
1353 
1354 static int
1355 zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1356 {
1357 	/* called once per instance with DDI_ATTACH,
1358 	   may be called to resume */
1359 	int instance;
1360 	int error;
1361 	zev_queue_t *q;
1362 	switch (cmd) {
1363 	case DDI_ATTACH:
1364 		/* create instance state */
1365 		instance = ddi_get_instance(dip);
1366 		if (instance != 0) {	/* hardcoded in zev.conf */
1367 			/* this module only supports one instance. */
1368 			return (DDI_FAILURE);
1369 		}
1370 
1371 		mutex_enter(&zev_mutex);
1372 		if (zev_attached) {
1373 			mutex_exit(&zev_mutex);
1374 			return (DDI_FAILURE);
1375 		}
1376 		if (ddi_soft_state_zalloc(statep, ZEV_CONTROL_DEVICE_MINOR) !=
1377 		    DDI_SUCCESS) {
1378 			mutex_exit(&zev_mutex);
1379 			return (DDI_FAILURE);
1380 		}
1381 		ZEV_MEM_ADD(sizeof(zev_queue_t));
1382 		zev_attached = B_TRUE;
1383 
1384 		/* init queue list */
1385 		bzero(&zev_queues, sizeof(zev_queues));
1386 		mutex_exit(&zev_mutex);
1387 
1388 		/* create a dummy queue for management of "ctrl" */
1389 
1390 		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1391 		q->zq_dip = dip;
1392 		q->zq_refcnt = 1;
1393 		q->zq_busy = B_FALSE;
1394 		q->zq_minor_number = ZEV_CONTROL_DEVICE_MINOR;
1395 		q->zq_flags = ZEV_FL_PERSISTENT;
1396 		strcpy(q->zq_name, ZEV_CONTROL_DEVICE_NAME);
1397 
1398 		/* create device node for "ctrl" */
1399 		if (ddi_create_minor_node(dip, ZEV_CONTROL_DEVICE_NAME,
1400 		    S_IFCHR, ZEV_CONTROL_DEVICE_MINOR,
1401 		    DDI_PSEUDO, 0) == DDI_FAILURE) {
1402 			goto fail;
1403 		}
1404 
1405 		/* note: intentionally not adding ctrl queue to queue list. */
1406 
1407 		/* default queue */
1408 		error = zev_queue_new(&q, dip,
1409 				      ZEV_DEFAULT_QUEUE_NAME,
1410 				      ZEV_MAX_QUEUE_LEN,
1411 				      ZEV_FL_BLOCK_WHILE_QUEUE_FULL|
1412 		                      ZEV_FL_PERSISTENT);
1413 		if (error)
1414 			goto fail;
1415 
1416 		/* start pollwakeup thread */
1417 		zev_wakeup_thread_run = 1;
1418 		zev_poll_wakeup_thread = thread_create(NULL, 0,
1419 		    zev_poll_wakeup_thread_main, NULL, 0, &p0,
1420 		    TS_RUN, minclsyspri);
1421 
1422 		ddi_report_dev(dip);
1423 
1424 		zev_chksum_init();
1425 
1426 		/* switch ZFS event callbacks to zev module callbacks */
1427 		rw_enter(&rz_zev_rwlock, RW_WRITER);
1428 		rz_zev_callbacks = &zev_callbacks;
1429 		rz_zev_set_active(B_TRUE);
1430 		rw_exit(&rz_zev_rwlock);
1431 
1432 		return (DDI_SUCCESS);
1433 	case DDI_RESUME:
1434 		/* suspendeding zev devices should never happen */
1435 		return (DDI_SUCCESS);
1436 	default:
1437 		return (DDI_FAILURE);
1438 	}
1439 fail:
1440 	cmn_err(CE_WARN, "zev: attach failed");
1441 	zev_free_instance(dip);
1442 	mutex_enter(&zev_mutex);
1443 	zev_attached = B_FALSE;
1444 	mutex_exit(&zev_mutex);
1445 	return (DDI_FAILURE);
1446 }
1447 
1448 /* ARGSUSED */
1449 static int
1450 zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
1451 {
1452 	minor_t minor;
1453 	zev_queue_t *q;
1454 
1455 	/* arg is dev_t */
1456 	minor = getminor((dev_t)arg);
1457 	mutex_enter(&zev_mutex);
1458 	q = ddi_get_soft_state(statep, minor);
1459 	if (q == NULL) {
1460 		*resultp = NULL;
1461 		mutex_exit(&zev_mutex);
1462 		return (DDI_FAILURE);
1463 	}
1464 
1465 	switch (infocmd) {
1466 	case DDI_INFO_DEVT2DEVINFO:
1467 		*resultp = q->zq_dip;
1468 		break;
1469 	case DDI_INFO_DEVT2INSTANCE:
1470 		*resultp = (void *)(uintptr_t)ddi_get_instance(q->zq_dip);
1471 		break;
1472 	default:
1473 		mutex_exit(&zev_mutex);
1474 		return (DDI_FAILURE);
1475 	}
1476 	mutex_exit(&zev_mutex);
1477 	return (DDI_SUCCESS);
1478 }
1479 
1480 static struct dev_ops zev_dev_ops = {
1481 	DEVO_REV,			/* driver build revision */
1482 	0,				/* driver reference count */
1483 	zev_getinfo,			/* getinfo */
1484 	nulldev,			/* identify (obsolete) */
1485 	nulldev,			/* probe (search for devices) */
1486 	zev_attach,			/* attach */
1487 	zev_detach,			/* detach */
1488 	nodev,				/* reset (obsolete, use quiesce) */
1489 	&zev_cb_ops,			/* character and block device ops */
1490 	NULL,				/* bus driver ops */
1491 	NULL,				/* power management, not needed */
1492 	ddi_quiesce_not_needed,		/* quiesce */
1493 };
1494 
1495 static struct modldrv zev_modldrv = {
1496 	&mod_driverops,			/* all loadable modules use this */
1497 	"zev ZFS event provider, v1.0",	/* driver name and version info */
1498 	&zev_dev_ops			/* ops method pointers */
1499 };
1500 
1501 static struct modlinkage zev_modlinkage = {
1502 	MODREV_1,	/* fixed value */
1503 	{
1504 		&zev_modldrv,	/* driver linkage structure */
1505 		NULL		/* list terminator */
1506 	}
1507 };
1508 
1509 int
1510 _init(void)
1511 {
1512 	int error;
1513 
1514 	if ((error = ddi_soft_state_init(&statep, sizeof(zev_queue_t), 1)) != 0)
1515 		return (error);
1516 	zev_attached = B_FALSE;
1517 
1518 	zev_queue_head = NULL;
1519 	zev_queue_tail = NULL;
1520 	zev_queue_len = 0;
1521 	zev_muted_pools_head = NULL;
1522 	zev_memory_allocated = 0;
1523 	zev_memory_freed = 0;
1524 	zev_queue_cnt = 0;
1525 
1526 	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
1527 	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
1528 	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
1529 	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
1530 	zev_mark_id = gethrtime();
1531 	mutex_init(&zev_queue_msg_mutex, NULL, MUTEX_DRIVER, NULL);
1532 	zev_msg_sequence_number = gethrtime();
1533 	bzero(&zev_statistics, sizeof(zev_statistics));
1534 	bzero(&zev_pollhead, sizeof(zev_pollhead));
1535 	bzero(&zev_queues, sizeof(zev_queues));
1536 	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
1537 	if (zev_ioc_mute_pool("zg0")) {
1538 		cmn_err(CE_WARN, "zev: could not init mute list");
1539 		goto FAIL;
1540 	}
1541 
1542 	if ((error = mod_install(&zev_modlinkage)) != 0) {
1543 		cmn_err(CE_WARN, "zev: could not install module");
1544 		goto FAIL;
1545 	}
1546 
1547 	return (0);
1548 FAIL:
1549 	/* free resources */
1550 	cmn_err(CE_WARN, "zev: _init failed");
1551 	mutex_destroy(&zev_mutex);
1552 	ddi_soft_state_fini(&statep);
1553 	return (error);
1554 }
1555 
1556 int
1557 _info(struct modinfo *modinfop)
1558 {
1559 	return (mod_info(&zev_modlinkage, modinfop));
1560 }
1561 
1562 int
1563 _fini(void)
1564 {
1565 	int error = 0;
1566 	zev_msg_t *msg;
1567 	zev_pool_list_entry_t *pe, *npe;
1568 
1569 	mutex_enter(&zev_mutex);
1570 	if (zev_attached == B_TRUE) {
1571 		mutex_exit(&zev_mutex);
1572 		return (SET_ERROR(EBUSY));
1573 	}
1574 	if (zev_queue_cnt != 0) {
1575 		/* should never happen */
1576 		mutex_exit(&zev_mutex);
1577 		return (SET_ERROR(EBUSY));
1578 	}
1579 
1580 	/*
1581 	 * avoid deadlock if event list is full: make sure threads currently
1582 	 * blocking on the event list can append their event and then release
1583 	 * rz_zev_rwlock.  Since there should be no queues left when we
1584 	 * reach this point we can simply empty the event list and then
1585 	 * wake everybody.
1586 	 */
1587 	while (zev_queue_head) {
1588 		msg = zev_queue_head;
1589 		zev_queue_head = msg->next;
1590 		zev_free(msg, sizeof(*msg) + msg->size);
1591 	}
1592 	cv_broadcast(&zev_condvar);
1593 	mutex_exit(&zev_mutex);
1594 
1595 	/* switch ZFS event callbacks back to default (again) */
1596 	rw_enter(&rz_zev_rwlock, RW_WRITER);
1597 	rz_zev_callbacks = rz_zev_default_callbacks;
1598 	rz_zev_set_active(B_FALSE);
1599 	rw_exit(&rz_zev_rwlock);
1600 
1601 	/* no thread is inside of the callbacks anymore.  Safe to remove. */
1602 
1603 	/* unload module callbacks */
1604 	if ((error = mod_remove(&zev_modlinkage)) != 0) {
1605 		cmn_err(CE_WARN, "mod_remove failed: %d", error);
1606 		return (error);
1607 	}
1608 
1609 	/* free resources */
1610 	mutex_enter(&zev_mutex);
1611 	while (zev_queue_head) {
1612 		msg = zev_queue_head;
1613 		zev_queue_head = msg->next;
1614 		zev_free(msg, sizeof(*msg) + msg->size);
1615 	}
1616 	mutex_exit(&zev_mutex);
1617 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
1618 	pe = zev_muted_pools_head;
1619 	while (pe) {
1620 		npe = pe;
1621 		pe = pe->next;
1622 		zev_free(npe, sizeof(*npe));
1623 	}
1624 	rw_exit(&zev_pool_list_rwlock);
1625 	ddi_soft_state_fini(&statep);
1626 	rw_destroy(&zev_pool_list_rwlock);
1627 	cv_destroy(&zev_condvar);
1628 	mutex_destroy(&zev_mutex);
1629 	mutex_destroy(&zev_mark_id_mutex);
1630 	mutex_destroy(&zev_queue_msg_mutex);
1631 
1632 	return (0);
1633 }
1634 
1635