xref: /titanic_52/usr/src/uts/common/fs/zev/zev.c (revision 6450d95ec3196265c750cc432ec31477e243b0b0)
1 #include <sys/modctl.h>
2 #include <sys/ddi.h>
3 #include <sys/sunddi.h>
4 #include <sys/conf.h>
5 #include <sys/devops.h>
6 #include <sys/stat.h>
7 #include <sys/fs/zev.h>
8 #include <sys/zev_callbacks.h>
9 #include <sys/zev_checksums.h>
10 #include <sys/zfs_znode.h>
11 #include <sys/time.h>
12 #include <sys/sa.h>
13 #include <sys/zap.h>
14 #include <sys/time.h>
15 #include <sys/fs/dv_node.h>
16 
/* Byte offset of member m within struct type s (pre-offsetof idiom). */
#define	OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))

/* Two-level stringification so macro arguments are expanded first. */
#define XSTRING(x)	STRING(x)
#define STRING(x)	#x

/* Name of the default queue that exists from module load onward. */
#define ZEV_DEFAULT_QUEUE_NAME		"beaver"
/* Fixed minor numbers of the control and temporary-queue devices. */
#define ZEV_CONTROL_DEVICE_MINOR	0
#define ZEV_TMPQUEUE_DEVICE_MINOR	1
/* Minor-number range usable for dynamically created event queues. */
#define ZEV_MINOR_MIN			(ZEV_TMPQUEUE_DEVICE_MINOR + 1)
#define ZEV_MINOR_MAX			(ZEV_MINOR_MIN + ZEV_MAX_QUEUES - 1)
27 
/*
 * Per-minor event queue.  Every queue references messages of the single
 * global message list via zq_oldest; per-queue counters track what this
 * consumer has read or discarded.
 */
typedef struct zev_queue {
	char			zq_name[ZEV_MAX_QUEUE_NAME_LEN+1];
	minor_t			zq_minor_number;
	dev_info_t		*zq_dip;
	struct pollhead		zq_pollhead;
	uint64_t		zq_bytes_read;
	uint64_t		zq_events_read;
	uint64_t		zq_bytes_discarded;
	uint64_t		zq_events_discarded;
	uint64_t		zq_bytes_total;
	uint64_t		zq_events_total;
	uint64_t		zq_wakeup_threshold;	/* poll wakeup trigger */
	uint16_t		zq_flags;		/* ZEV_FL_* */
	uint16_t		zq_need_wakeup;
	/* protected by zev_mutex */
	int			zq_refcnt;	/* zev_queue_hold/release */
	uint64_t		zq_queue_len;	/* queued bytes in this queue */
	uint64_t		zq_queue_messages;
	uint64_t		zq_max_queue_len;
	zev_msg_t		*zq_oldest;	/* oldest unconsumed message */
	boolean_t		zq_busy;	/* device currently open */
	boolean_t		zq_to_be_removed;
	zev_statistics_t	zq_statistics;
	kcondvar_t		zq_condvar;
} zev_queue_t;
53 
/* soft-state handle holding one zev_queue_t per minor number */
static void		*statep;
struct pollhead		zev_pollhead;

kmutex_t		zev_mutex;	/* protects queues + global msg list */
kcondvar_t		zev_condvar;	/* broadcast when queue space frees */
kmutex_t		zev_queue_msg_mutex; /* serializes message enqueue */
krwlock_t		zev_pool_list_rwlock; /* protects muted-pool list */
static zev_statistics_t	zev_statistics;
static boolean_t	zev_attached;
static kmutex_t		zev_mark_id_mutex;
static uint64_t		zev_mark_id = 0;

/* sequence number assigned to the next message in the global list */
static uint64_t		zev_msg_sequence_number = 0;
/* all queues, indexed by (minor - ZEV_MINOR_MIN); NULL slots are free */
static zev_queue_t	*zev_queues[ZEV_MAX_QUEUES];
static int		zev_queue_cnt = 0;
/* nonzero while any queue has ZEV_FL_BLOCK_WHILE_QUEUE_FULL set */
static int		zev_have_blocking_queues = 1;
static int		zev_tmpqueue_num = 0;

/* module-wide memory accounting, updated via ZEV_MEM_ADD/ZEV_MEM_SUB */
uint64_t	zev_memory_allocated = 0;
uint64_t	zev_memory_freed = 0;

/*
 * The longest potential message is from zev_zfs_mount() and
 * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
 *
 * Another candidate is zev_znode_rename_cb() and contains three inode
 * numbers and two filenames of up to MAXNAMELEN bytes each.
 */
#define ZEV_MAX_MESSAGE_LEN	4096

/* global doubly-linked message list shared by all queues */
static zev_msg_t *zev_queue_head = NULL;
static zev_msg_t *zev_queue_tail = NULL;
static uint64_t zev_queue_len = 0;


/* singly-linked list entry naming a pool whose events are suppressed */
typedef struct zev_pool_list_entry {
	struct zev_pool_list_entry	*next;
	char				name[MAXPATHLEN];
} zev_pool_list_entry_t;

static zev_pool_list_entry_t *zev_muted_pools_head = NULL;

/* run flag and handle of the periodic poll-wakeup kernel thread */
static volatile int zev_wakeup_thread_run = 1;
static kthread_t *zev_poll_wakeup_thread = NULL;
98 
99 void *
100 zev_alloc(ssize_t sz)
101 {
102 	ZEV_MEM_ADD(sz);
103 	return kmem_alloc(sz, KM_SLEEP);
104 }
105 
106 void *
107 zev_zalloc(ssize_t sz)
108 {
109 	ZEV_MEM_ADD(sz);
110 	return kmem_zalloc(sz, KM_SLEEP);
111 }
112 
113 void
114 zev_free(void *ptr, ssize_t sz)
115 {
116 	ZEV_MEM_SUB(sz);						\
117 	kmem_free(ptr, sz);
118 }
119 
120 /* must be called with zev_mutex held */
121 static void
122 zev_update_blockflag(void)
123 {
124 	zev_queue_t *q;
125 	int had_blocking_queues;
126 	int i;
127 
128 	had_blocking_queues = zev_have_blocking_queues;
129 
130 	/* do we still have blocking queues? */
131 	zev_have_blocking_queues = 0;
132 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
133 		q = zev_queues[i - ZEV_MINOR_MIN];
134 		if (!q)
135 			continue;
136 		if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
137 			zev_have_blocking_queues = 1;
138 			break;
139 		}
140 	}
141 	/* no blocking queues */
142 	if (had_blocking_queues)
143 		cv_broadcast(&zev_condvar);
144 }
145 
146 int
147 zev_queue_cmp(const void *a, const void *b)
148 {
149 	const zev_queue_t *qa = a;
150 	const zev_queue_t *qb = b;
151 	if (qa->zq_minor_number > qb->zq_minor_number)
152 		return 1;
153 	if (qa->zq_minor_number < qb->zq_minor_number)
154 		return -1;
155 	return 0;
156 }
157 
/*
 * Discard messages from the head of the global list that no per-device
 * queue references anymore, i.e. whose sequence number is older than
 * every queue's zq_oldest.  Must be called with zev_mutex held.
 */
void
zev_queue_trim(void)
{
	zev_msg_t *m;
	uint64_t oldest_message;
	zev_queue_t *q;
	int i;

	if (!zev_queue_tail)
		return;

	oldest_message = zev_queue_tail->seq + 1;  /* does not exist, yet. */
	/* find the oldest sequence number still referenced by any queue */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		if (!q->zq_oldest)
			continue;
		if (oldest_message > q->zq_oldest->seq)
			oldest_message = q->zq_oldest->seq;
	}

	/* remove msgs between oldest_message and zev_queue_head */
	while(zev_queue_head && (oldest_message > zev_queue_head->seq)) {
		m = zev_queue_head;
		zev_queue_head = m->next;
		if (zev_queue_head == NULL) {
			zev_queue_tail = NULL;
		} else {
			zev_queue_head->prev = NULL;
		}
		/* a message never delivered to any reader counts as lost */
		if (m->read == 0) {
			zev_statistics.zev_bytes_discarded += m->size;
			zev_statistics.zev_cnt_discarded_events++;
		}
		zev_statistics.zev_queue_len -= m->size;
		zev_queue_len--;
		zev_free(m, sizeof(*m) + m->size);
	}
}
199 
200 /* must be called with zev_mutex held */
201 static void
202 zev_queue_hold(zev_queue_t *q)
203 {
204 	q->zq_refcnt++;
205 }
206 
/*
 * Drop a reference on q.  Must be called with zev_mutex held.  When the
 * last reference is dropped, a non-persistent queue is unlinked from the
 * queue table, messages only it referenced are trimmed, and its minor
 * node and soft state (which holds q itself) are destroyed.
 */
static void
zev_queue_release(zev_queue_t *q)
{
	q->zq_refcnt--;
	if (q->zq_refcnt > 0)
		return;

	ASSERT(q->zq_busy == B_FALSE);

	/* persistent queues will not be removed */
	if ((q->zq_flags & ZEV_FL_PERSISTENT) != 0)
		return;

	/* remove queue from queue list */
	zev_queues[q->zq_minor_number - ZEV_MINOR_MIN] = NULL;

	/* discard messages that no queue references anymore */
	zev_queue_trim();

	cv_destroy(&q->zq_condvar);
	ddi_remove_minor_node(q->zq_dip, q->zq_name);
	devfs_clean(q->zq_dip, NULL, DV_CLEAN_FORCE);
	/* frees q itself: it lives in the soft state for this minor */
	ddi_soft_state_free(statep, q->zq_minor_number);
	ZEV_MEM_SUB(sizeof(zev_queue_t));
	zev_queue_cnt--;
	zev_update_blockflag();
}
235 
236 int
237 zev_queue_new(zev_queue_t **queue,
238               dev_info_t *dip,
239               char *name,
240               uint64_t max_queue_len,
241               uint16_t flags)
242 {
243 	zev_queue_t *q;
244 	zev_queue_t *tmp;
245 	zev_msg_t *msg;
246 	int name_exists = 0;
247 	minor_t minor;
248 	char *p;
249 	int i;
250 
251 	if (max_queue_len > ZEV_MAX_QUEUE_LEN)
252 		return EINVAL;
253 	if (max_queue_len == 0)
254 		max_queue_len = ZEV_MAX_QUEUE_LEN;
255 	if (!strcmp(name, ZEV_CONTROL_DEVICE_NAME))
256 		return EINVAL;
257 	for (p = name; *p; p++) {
258 		if (*p >= 'a' && *p <= 'z')
259 			continue;
260 		if (*p >= '0' && *p <= '9')
261 			continue;
262 		if (*p == '.')
263 			continue;
264 		return EINVAL;
265 	}
266 
267 	mutex_enter(&zev_mutex);
268 
269 	/* find free minor number.*/
270 	/* if this were a frequent operation we'd have a free-minor list */
271 	for (minor = ZEV_MINOR_MIN; minor <= ZEV_MINOR_MAX; minor++) {
272 		tmp = zev_queues[minor - ZEV_MINOR_MIN];
273 		if (tmp == NULL)
274 			break;
275 	}
276 	if (tmp) {
277 		mutex_exit(&zev_mutex);
278 		return ENOSPC;
279 	}
280 
281 	if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) {
282 		mutex_exit(&zev_mutex);
283 		return ENOSPC;
284 	}
285 	ZEV_MEM_ADD(sizeof(zev_queue_t));
286 
287 	q = ddi_get_soft_state(statep, minor);
288 	memset(q, 0, sizeof(*q));
289 	strncpy(q->zq_name, name, ZEV_MAX_QUEUE_NAME_LEN);
290 	q->zq_name[ZEV_MAX_QUEUE_NAME_LEN] = '\0';
291 	q->zq_max_queue_len = max_queue_len;
292 	q->zq_wakeup_threshold = ZEV_DEFAULT_POLL_WAKEUP_QUEUE_LEN;
293 	q->zq_flags = flags;
294 	q->zq_refcnt = 1;
295 	q->zq_dip = dip;
296 	q->zq_minor_number = minor;
297 	cv_init(&q->zq_condvar, NULL, CV_DRIVER, NULL);
298 
299 	/* insert into queue list */
300 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
301 		/* if this were a frequent operation we'd have a name tree */
302 		if (zev_queues[i - ZEV_MINOR_MIN] == NULL)
303 			continue;
304 		if (!strcmp(q->zq_name, zev_queues[i-ZEV_MINOR_MIN]->zq_name)) {
305 			name_exists = 1;
306 			break;
307 		}
308 	}
309 	if (name_exists) {
310 		ddi_soft_state_free(statep, minor);
311 		ZEV_MEM_SUB(sizeof(zev_queue_t));
312 		mutex_exit(&zev_mutex);
313 		return EEXIST;
314 	}
315 	zev_queues[minor - ZEV_MINOR_MIN] = q;
316 	zev_queue_cnt++;
317 
318 	/* calculate current queue len and find head and tail */
319 	if (!(q->zq_flags & ZEV_FL_INITIALLY_EMPTY)) {
320 		q->zq_oldest = zev_queue_tail;
321 		msg = zev_queue_tail;
322 		while ((msg) && (q->zq_queue_len < q->zq_max_queue_len)) {
323 			q->zq_queue_len += msg->size;
324 			q->zq_queue_messages++;
325 			q->zq_oldest = msg;
326 			msg = msg->prev;
327 		}
328 	}
329 
330 	zev_update_blockflag();
331 
332 	mutex_exit(&zev_mutex);
333 
334 	if (ddi_create_minor_node(dip, name,
335 	    S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
336 		mutex_enter(&zev_mutex);
337 		zev_queues[minor - ZEV_MINOR_MIN] = NULL;
338 		zev_queue_cnt--;
339 		ddi_soft_state_free(statep, minor);
340 		ZEV_MEM_SUB(sizeof(zev_queue_t));
341 		zev_update_blockflag();
342 		mutex_exit(&zev_mutex);
343 		return EFAULT;
344 	}
345 
346 	*queue = q;
347 	return 0;
348 }
349 
350 /*
351  * poll() wakeup thread.  Used to check periodically whether we have
352  * bytes left in the queue that have not yet been made into a
353  * pollwakeup() call.  This is meant to insure a maximum waiting
354  * time until an event is presented as a poll wakeup, while at
355  * the same time not making every single event into a poll wakeup
356  * of it's own.
357  */
358 
359 static void
360 zev_poll_wakeup(boolean_t flush_all)
361 {
362 	zev_queue_t *q;
363 	int i;
364 
365 	/*
366 	 * This loop works with hold() and release() because
367 	 * pollwakeup() requires us to release our locks before calling it.
368 	 *
369 	 * from pollwakeup(9F):
370 	 *
371 	 *   "Driver defined locks should not be held across calls
372 	 *    to this function."
373 	 */
374 
375 	/* wake up threads for each individual queue */
376 	mutex_enter(&zev_mutex);
377 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
378 		q = zev_queues[i - ZEV_MINOR_MIN];
379 		if (q == NULL)
380 			continue;
381 		if (!q->zq_busy)
382 			continue;
383 		if (!q->zq_queue_len)
384 			continue;
385 		if ((flush_all) ||
386 		    (q->zq_queue_len > q->zq_wakeup_threshold)) {
387 			zev_queue_hold(q);
388 			mutex_exit(&zev_mutex);
389 			pollwakeup(&q->zq_pollhead, POLLIN);
390 			mutex_enter(&zev_mutex);
391 			zev_queue_release(q);
392 		}
393 	}
394 	mutex_exit(&zev_mutex);
395 }
396 
397 static void
398 zev_poll_wakeup_thread_main(void)
399 {
400 	while (zev_wakeup_thread_run) {
401 		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
402 
403 		zev_poll_wakeup(B_TRUE);
404 	}
405 	thread_exit();
406 }
407 
408 static int
409 zev_ioc_mute_pool(char *poolname)
410 {
411 	zev_pool_list_entry_t *pe;
412 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
413 	/* pool already muted? */
414 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
415 		if (!strcmp(pe->name, poolname)) {
416 			rw_exit(&zev_pool_list_rwlock);
417 			return EEXIST;
418 		}
419 	}
420 	pe = zev_zalloc(sizeof(*pe));
421 	if (!pe) {
422 		rw_exit(&zev_pool_list_rwlock);
423 		return ENOMEM;
424 	}
425 	(void) strncpy(pe->name, poolname, sizeof(pe->name));
426 	pe->next = zev_muted_pools_head;
427 	zev_muted_pools_head = pe;
428 	rw_exit(&zev_pool_list_rwlock);
429 	return (0);
430 }
431 
432 static int
433 zev_ioc_unmute_pool(char *poolname)
434 {
435 	zev_pool_list_entry_t *pe, *peprev;
436 
437 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
438 	/* pool muted? */
439 	peprev = NULL;
440 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
441 		if (!strcmp(pe->name, poolname))
442 			break;
443 		peprev = pe;
444 	}
445 	if (pe) {
446 		rw_exit(&zev_pool_list_rwlock);
447 		return ENOENT;
448 	}
449 
450 	if (peprev != NULL) {
451 		peprev->next = pe->next;
452 	} else {
453 		zev_muted_pools_head = pe->next;
454 	}
455 	zev_free(pe, sizeof(*pe));
456 	rw_exit(&zev_pool_list_rwlock);
457 	return (0);
458 }
459 
460 int
461 zev_skip_pool(objset_t *os)
462 {
463 	zev_pool_list_entry_t *pe;
464 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
465 	rw_enter(&zev_pool_list_rwlock, RW_READER);
466 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
467 		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
468 			rw_exit(&zev_pool_list_rwlock);
469 			return 1;
470 		}
471 	}
472 	rw_exit(&zev_pool_list_rwlock);
473 	return 0;
474 }
475 
476 int
477 zev_skip_fs(zfsvfs_t *fs)
478 {
479 	dsl_dir_t *d = fs->z_os->os_dsl_dataset->ds_dir;
480 	dsl_dir_t *prev = NULL;
481 
482 	while (d && d != prev) {
483 		if (strstr(d->dd_myname, "_root"))
484 			return 0;
485 		prev = d;
486 		d = d->dd_parent;
487 	}
488 	return 1;
489 }
490 
/*
 * Increment the per-operation event counter in *stat matching op.
 * Unknown op values are silently ignored; zev_queue_message() validates
 * the op range before calling this.
 */
static void
zev_update_statistics(int op, zev_statistics_t *stat)
{
	switch (op) {
	case ZEV_OP_ERROR:
		stat->zev_cnt_errors++;
		break;
	case ZEV_OP_MARK:
		stat->zev_cnt_marks++;
		break;
	case ZEV_OP_ZFS_MOUNT:
		stat->zev_cnt_zfs_mount++;
		break;
	case ZEV_OP_ZFS_UMOUNT:
		stat->zev_cnt_zfs_umount++;
		break;
	case ZEV_OP_ZVOL_WRITE:
		stat->zev_cnt_zvol_write++;
		break;
	case ZEV_OP_ZVOL_TRUNCATE:
		stat->zev_cnt_zvol_truncate++;
		break;
	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
		stat->zev_cnt_znode_close_after_update++;
		break;
	case ZEV_OP_ZNODE_CREATE:
		stat->zev_cnt_znode_create++;
		break;
	case ZEV_OP_ZNODE_REMOVE:
		stat->zev_cnt_znode_remove++;
		break;
	case ZEV_OP_ZNODE_LINK:
		stat->zev_cnt_znode_link++;
		break;
	case ZEV_OP_ZNODE_SYMLINK:
		stat->zev_cnt_znode_symlink++;
		break;
	case ZEV_OP_ZNODE_RENAME:
		stat->zev_cnt_znode_rename++;
		break;
	case ZEV_OP_ZNODE_WRITE:
		stat->zev_cnt_znode_write++;
		break;
	case ZEV_OP_ZNODE_TRUNCATE:
		stat->zev_cnt_znode_truncate++;
		break;
	case ZEV_OP_ZNODE_SETATTR:
		stat->zev_cnt_znode_setattr++;
		break;
	case ZEV_OP_ZNODE_ACL:
		stat->zev_cnt_znode_acl++;
		break;
	}
}
545 
/*
 * Append msg (ownership transferred; freed here on error) to the global
 * message list and account it in every per-device queue.  May block
 * when a queue flagged ZEV_FL_BLOCK_WHILE_QUEUE_FULL is full; discards
 * the oldest entries of non-blocking queues instead.  Finishes with a
 * poll wakeup when thresholds are crossed (always for ZEV_OP_MARK).
 */
void
zev_queue_message(int op, zev_msg_t *msg)
{
	zev_queue_t *q;
	int wakeup = 0;
	zev_msg_t *m;
	int i;

	msg->next = NULL;
	msg->prev = NULL;
	msg->read = 0;

	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
		zev_queue_error(op, "unknown op id encountered: %d", op);
		zev_free(msg, sizeof(*msg) + msg->size);
		return;
	}

	/*
	 * This mutex protects us agains race conditions when several
	 * threads want to queue a message and one or more queues are
	 * full:  we release zev_mutex to wait for the queues to become
	 * less-than-full, but we don't know in which order the waiting
	 * threads will be awoken.  If it's not the same order in which
	 * they went to sleep we might mark different messages as "newest"
	 * in different queues, and so we might have dupes or even
	 * skip messages.
	 */
	mutex_enter(&zev_queue_msg_mutex);

	mutex_enter(&zev_mutex);

	/*
	 * When the module is loaded, the default behavior ist to
	 * put all events into a queue and block if the queue is full.
	 * This is done even before the pseudo device is attached.
	 * This way, no events are lost.
	 *
	 * To discard events entirely the "beaver" queue,
	 * which never discards anything, has to be removed.
	 */

	if (zev_queue_cnt == 0) {
		mutex_exit(&zev_mutex);
		mutex_exit(&zev_queue_msg_mutex);
		return;
	}

	/* put message into global queue */
	msg->seq = zev_msg_sequence_number++;

	/* do we need to make room? */
again:
	while (zev_statistics.zev_max_queue_len &&
	    zev_statistics.zev_queue_len > zev_statistics.zev_max_queue_len) {

		if (zev_have_blocking_queues) {
			/* so we have blocking queues.  are they full? */
			for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
				q = zev_queues[i - ZEV_MINOR_MIN];
				if (!q)
					continue;
				if ((q->zq_flags &
				     ZEV_FL_BLOCK_WHILE_QUEUE_FULL) == 0)
					continue;
				if (q->zq_queue_len &&
				    q->zq_queue_len > q->zq_max_queue_len) {
					/* block until queue's been shrunk. */
					cv_wait(&zev_condvar, &zev_mutex);
					goto again;
				}
			}
		}

		/* discard events until this message fits into all queues */

		for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
			q = zev_queues[i - ZEV_MINOR_MIN];
			if (!q)
				continue;
			/* discard msgs until queue is small enough */
			while (q->zq_queue_len &&
			       q->zq_queue_len > q->zq_max_queue_len) {
				m = q->zq_oldest;
				if (m == NULL)
					break;
				q->zq_events_discarded++;
				q->zq_bytes_discarded += m->size;
				q->zq_oldest = m->next;
				q->zq_queue_len -= m->size;
				q->zq_queue_messages--;
			}
		}

		/* free global entries no queue references anymore */
		zev_queue_trim();
		ASSERT(zev_statistics.zev_queue_len == 0 ||
		       zev_statistics.zev_queue_len <=
				zev_statistics.zev_max_queue_len);
	}

	/* link msg at the tail of the global list */
	if (zev_queue_tail == NULL) {
		zev_queue_head = zev_queue_tail = msg;
	} else {
		zev_queue_tail->next = msg;
		msg->prev = zev_queue_tail;
		zev_queue_tail = msg;
	}
	zev_queue_len++;
	zev_statistics.zev_cnt_total_events++;
	zev_statistics.zev_queue_len += msg->size;

	/* update per-device queues */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;

		/* hold q: cv_wait below drops zev_mutex */
		zev_queue_hold(q);

		/* make sure queue has enough room */
		while (q->zq_max_queue_len &&
		       q->zq_queue_len > q->zq_max_queue_len) {

			if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
				/* block until queue has been shrunk. */
				cv_wait(&zev_condvar, &zev_mutex);
			} else {
				/* discard msgs until queue is small enough */
				while (q->zq_queue_len > q->zq_max_queue_len) {
					m = q->zq_oldest;
					if (m == NULL)
						break;
					q->zq_events_discarded++;
					q->zq_bytes_discarded += m->size;
					q->zq_oldest = m->next;
					q->zq_queue_len -= m->size;
					q->zq_queue_messages--;
				}
			}
		}

		/* register new message at the end of the queue */
		q->zq_queue_len += msg->size;
		q->zq_queue_messages++;
		q->zq_bytes_total += msg->size;
		q->zq_events_total++;
		if (q->zq_oldest == NULL)
			q->zq_oldest = msg;

		zev_update_statistics(op, &q->zq_statistics);

		if (q->zq_queue_len > q->zq_wakeup_threshold)
			wakeup = 1;
		if (q->zq_queue_len == msg->size)  /* queue was empty */
			cv_broadcast(&q->zq_condvar);

		zev_queue_release(q);
	}

	/* discarding above may have left unreferenced global entries */
	zev_queue_trim();

	zev_update_statistics(op, &zev_statistics);
	mutex_exit(&zev_mutex);
	mutex_exit(&zev_queue_msg_mutex);

	/* one or more queues need a pollwakeup() */
	if (op == ZEV_OP_MARK) {
		zev_poll_wakeup(B_TRUE);
	} else if (wakeup) {
		zev_poll_wakeup(B_FALSE);
	}

	return;
}
720 
721 void
722 zev_queue_error(int op, char *fmt, ...)
723 {
724 	char buf[ZEV_MAX_MESSAGE_LEN];
725 	va_list ap;
726 	int len;
727 	zev_msg_t *msg = NULL;
728 	zev_error_t *rec;
729 	int msg_size;
730 
731 	va_start(ap, fmt);
732 	len = vsnprintf(buf, sizeof(buf), fmt, ap);
733 	va_end(ap);
734 	if (len >= sizeof(buf)) {
735 		cmn_err(CE_WARN, "zev: can't report error - "
736 		        "dropping event entirely.");
737 		return;
738 	}
739 
740 	msg_size = sizeof(*rec) + len + 1;
741 	msg = zev_alloc(sizeof(*msg) + msg_size);
742 	msg->size = msg_size;
743 	rec = (zev_error_t *)(msg + 1);
744 	rec->record_len = msg_size;
745 	rec->op = ZEV_OP_ERROR;
746 	rec->op_time = ddi_get_time();
747 	rec->guid = 0;
748 	rec->failed_op = op;
749 	rec->errstr_len = len;
750 	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);
751 
752 	zev_queue_message(ZEV_OP_ERROR, msg);
753 	return;
754 }
755 
756 static int
757 zev_find_queue(zev_queue_t **out, zev_queue_t *req_q, zev_queue_name_t *name)
758 {
759 	char namebuf[ZEV_MAX_QUEUE_NAME_LEN+1];
760 	zev_queue_t *q;
761 	int i;
762 
763 	*out = NULL;
764 
765 	if (name->zev_namelen == 0) {
766 		if (req_q->zq_minor_number == ZEV_CONTROL_DEVICE_MINOR)
767 			return EINVAL;
768 		mutex_enter(&zev_mutex);
769 		zev_queue_hold(req_q);
770 		mutex_exit(&zev_mutex);
771 		*out = req_q;
772 		return 0;
773 	}
774 
775 	if (name->zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
776 		return EINVAL;
777 	strncpy(namebuf, name->zev_name, name->zev_namelen);
778 	namebuf[name->zev_namelen] = '\0';
779 
780 	mutex_enter(&zev_mutex);
781 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
782 		q = zev_queues[i - ZEV_MINOR_MIN];
783 		if (!q)
784 			continue;
785 		if (!strcmp(q->zq_name, namebuf)) {
786 			zev_queue_hold(q);
787 			mutex_exit(&zev_mutex);
788 			*out = q;
789 			return 0;
790 		}
791 	}
792 	mutex_exit(&zev_mutex);
793 	return ENOENT;
794 }
795 
796 static int
797 zev_ioc_get_queue_statistics(zev_queue_t *req_q, intptr_t arg, int mode)
798 {
799 	zev_ioctl_get_queue_statistics_t gs;
800 	zev_queue_t *q;
801 	int ret;
802 
803 	if (ddi_copyin((void *)arg, &gs, sizeof(gs), mode) != 0)
804 		return EFAULT;
805 
806 	ret = zev_find_queue(&q, req_q, &gs.zev_queue_name);
807 	if (ret)
808 		return ret;
809 
810 	/* ddi_copyout() can take a long time.  Better make
811 	   a copy to be able to release the mutex faster. */
812 	mutex_enter(&zev_mutex);
813 	memcpy(&gs.zev_statistics, &q->zq_statistics,sizeof(gs.zev_statistics));
814 	gs.zev_statistics.zev_queue_len = q->zq_queue_len;
815 	gs.zev_statistics.zev_bytes_read = q->zq_bytes_read;
816 	gs.zev_statistics.zev_bytes_discarded = q->zq_bytes_discarded;
817 	gs.zev_statistics.zev_max_queue_len = q->zq_max_queue_len;
818 	gs.zev_statistics.zev_cnt_discarded_events = q->zq_events_discarded;
819 	gs.zev_statistics.zev_cnt_total_events = q->zq_events_total;
820 	zev_queue_release(q);
821 	mutex_exit(&zev_mutex);
822 
823 	if (ddi_copyout(&gs, (void *)arg, sizeof(gs), mode) != 0)
824 		return EFAULT;
825 	return 0;
826 }
827 
828 static int
829 zev_ioc_set_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
830 {
831 	zev_ioctl_set_queue_properties_t qp;
832 	zev_queue_t *q;
833 	uint64_t old_max;
834 	uint64_t old_flags;
835 	int ret;
836 
837 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
838 		return EFAULT;
839 	if (qp.zev_max_queue_len > ZEV_MAX_QUEUE_LEN)
840 		return EINVAL;
841 	if (qp.zev_poll_wakeup_threshold > ZEV_MAX_POLL_WAKEUP_QUEUE_LEN)
842 		return EINVAL;
843 
844 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
845 	if (ret)
846 		return ret;
847 
848 	mutex_enter(&zev_mutex);
849 
850 	/*
851 	 * Note: if the PERSISTENT flag is cleared, and the queue is not busy,
852 	 * the queue should be removed by zev_queue_release() in zev_ioctl().
853 	 */
854 	old_flags = qp.zev_flags;
855 	q->zq_flags = qp.zev_flags;
856 	if ((old_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) &&
857 	   (!(qp.zev_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL))) {
858 		/* queue is no longer blocking - wake blocked threads */
859 		cv_broadcast(&zev_condvar);
860 	}
861 
862 	zev_update_blockflag();
863 
864 	old_max = q->zq_max_queue_len;
865 	q->zq_max_queue_len = qp.zev_max_queue_len;
866 	if (q->zq_max_queue_len < old_max)
867 		zev_queue_trim();
868 	if (q->zq_max_queue_len > old_max)
869 		cv_broadcast(&zev_condvar);	/* threads may be waiting */
870 
871 	if ((qp.zev_poll_wakeup_threshold < q->zq_wakeup_threshold) &&
872 	    (qp.zev_poll_wakeup_threshold <= q->zq_queue_len))
873 		pollwakeup(&q->zq_pollhead, POLLIN);
874 	q->zq_wakeup_threshold = qp.zev_poll_wakeup_threshold;
875 
876 	zev_queue_release(q);
877 	mutex_exit(&zev_mutex);
878 	return 0;
879 }
880 
881 static int
882 zev_ioc_get_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
883 {
884 	zev_ioctl_get_queue_properties_t qp;
885 	zev_queue_t *q;
886 	int ret;
887 
888 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
889 		return EFAULT;
890 
891 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
892 	if (ret)
893 		return ret;
894 
895 	mutex_enter(&zev_mutex);
896 	qp.zev_max_queue_len = q->zq_max_queue_len;
897 	qp.zev_flags = q->zq_flags;
898 	qp.zev_poll_wakeup_threshold = q->zq_wakeup_threshold;
899 	zev_queue_release(q);
900 	mutex_exit(&zev_mutex);
901 
902 	if (ddi_copyout(&qp, (void *)arg, sizeof(qp), mode) != 0)
903 		return EFAULT;
904 	return 0;
905 }
906 
907 static int
908 zev_ioc_add_queue(zev_queue_t *req_q, intptr_t arg, int mode)
909 {
910 	zev_ioctl_add_queue_t aq;
911 	zev_queue_t *new_q;
912 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
913 
914 	if (ddi_copyin((void *)arg, &aq, sizeof(aq), mode) != 0)
915 		return EFAULT;
916 
917 	if (aq.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
918 		return EINVAL;
919 	strncpy(name, aq.zev_name, aq.zev_namelen);
920 	name[aq.zev_namelen] = '\0';
921 	if (!strncmp(name, ZEV_TMPQUEUE_DEVICE_NAME,
922 	             strlen(ZEV_TMPQUEUE_DEVICE_NAME)))
923 		return EINVAL;
924 
925 	return zev_queue_new(&new_q, req_q->zq_dip, name,
926 	                     aq.zev_max_queue_len, aq.zev_flags);
927 }
928 
/*
 * ZEV_IOC_REMOVE_QUEUE handler: remove a queue by name.  A queue that
 * is currently open (busy) is refused with EBUSY.  Flags are cleared
 * so even persistent queues are removed and the queue stops blocking
 * writers; the queue's own reference is dropped exactly once.
 * Returns 0, EFAULT, EINVAL, ENOENT or EBUSY.
 */
static int
zev_ioc_remove_queue(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_remove_queue_t rq;
	zev_queue_t *q;
	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
	int found = 0;
	int i;

	if (ddi_copyin((void *)arg, &rq, sizeof(rq), mode) != 0)
		return EFAULT;

	if (rq.zev_queue_name.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
		return EINVAL;
	/* the name from userland is length-counted; terminate it */
	strncpy(name, rq.zev_queue_name.zev_name,
	        rq.zev_queue_name.zev_namelen);
	name[rq.zev_queue_name.zev_namelen] = '\0';

	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;
		if (!strcmp(q->zq_name, name)) {
			found = 1;
			break;
		}
	}
	if (!found) {
		mutex_exit(&zev_mutex);
		return ENOENT;
	}

	if (q->zq_busy) {
		mutex_exit(&zev_mutex);
		return EBUSY;
	}
	/*
	 * clear flags, so that persistent queues are removed aswell
	 * and the queue becomes non-blocking.
	 */
	q->zq_flags = 0;
	/* guard against dropping the queue's base reference twice */
	if (q->zq_to_be_removed == B_FALSE) {
		q->zq_to_be_removed = B_TRUE;
		zev_queue_release(q);
	}
	/* some threads might be waiting for this queue to become writable */
	cv_broadcast(&zev_condvar);

	mutex_exit(&zev_mutex);
	return 0;
}
981 
982 static int
983 zev_ioc_get_debug_info(zev_queue_t *req_q, intptr_t arg, int mode)
984 {
985 	zev_ioctl_debug_info_t di;
986 	uint64_t mem_allocated = atomic_add_64_nv(&zev_memory_allocated, 0);
987 	uint64_t mem_freed     = atomic_add_64_nv(&zev_memory_freed, 0);
988 
989 	zev_chksum_stats(&di.zev_chksum_cache_size,
990 	                 &di.zev_chksum_cache_hits,
991 	                 &di.zev_chksum_cache_misses);
992 	di.zev_memory_allocated = mem_allocated - mem_freed;
993 	if (ddi_copyout(&di, (void *)arg, sizeof(di), mode) != 0)
994 		return EFAULT;
995 	return 0;
996 }
997 
998 static int
999 zev_ioc_get_queue_list(zev_queue_t *req_q, intptr_t arg, int mode)
1000 {
1001 	zev_ioctl_get_queue_list_t gql;
1002 	zev_queue_t *q;
1003 	int i = 0;
1004 	int count = 0;
1005 
1006 	memset(&gql, 0, sizeof(gql));
1007 
1008 	mutex_enter(&zev_mutex);
1009 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1010 		q = zev_queues[i - ZEV_MINOR_MIN];
1011 		if (!q)
1012 			continue;
1013 		strncpy(gql.zev_queue_name[count].zev_name,
1014 		    q->zq_name, ZEV_MAX_QUEUE_NAME_LEN);
1015 		gql.zev_queue_name[count].zev_namelen = strlen(q->zq_name);
1016 		count++;
1017 	}
1018 	gql.zev_n_queues = count;
1019 	mutex_exit(&zev_mutex);
1020 
1021 	if (ddi_copyout(&gql, (void *)arg, sizeof(gql), mode) != 0)
1022 		return EFAULT;
1023 	return 0;
1024 }
1025 
1026 static int
1027 zev_ioc_set_max_queue_len(zev_queue_t *req_q, intptr_t arg, int mode)
1028 {
1029 	uint64_t len;
1030 	int i;
1031 	zev_queue_t *q;
1032 
1033 	if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) {
1034 		return EFAULT;
1035 	}
1036 	if (len > ZEV_MAX_QUEUE_LEN) {
1037 		return EINVAL;
1038 	}
1039 	mutex_enter(&zev_mutex);
1040 	zev_statistics.zev_max_queue_len = len;
1041 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1042 		q = zev_queues[i - ZEV_MINOR_MIN];
1043 		if (!q)
1044 			continue;
1045 		if (q->zq_max_queue_len <=
1046 		    zev_statistics.zev_max_queue_len)
1047 			continue;
1048 		q->zq_max_queue_len = zev_statistics.zev_max_queue_len;
1049 	}
1050 	cv_broadcast(&zev_condvar);
1051 	mutex_exit(&zev_mutex);
1052 	return 0;
1053 }
1054 
1055 static int
1056 zev_ioc_get_zev_version(intptr_t arg, int mode)
1057 {
1058 	zev_ioctl_get_zev_version vi;
1059 	vi.zev_major_version = ZEV_MAJOR_VERSION;
1060 	vi.zev_minor_version = ZEV_MINOR_VERSION;
1061 	if (ddi_copyout(&vi, (void *)arg, sizeof(vi), mode) != 0)
1062 		return EFAULT;
1063 	return 0;
1064 }
1065 
1066 /* ARGSUSED */
1067 static int
1068 zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1069 {
1070 	zev_statistics_t zs;
1071 	zev_ioctl_poolarg_t pa;
1072 	zev_ioctl_mark_t mark;
1073 	zev_mark_t *rec;
1074 	int msg_size;
1075 	zev_msg_t *msg;
1076 	uint64_t mark_id;
1077 	minor_t minor;
1078 	zev_queue_t *req_q;
1079 	int ret = 0;
1080 
1081 	minor = getminor(dev);
1082 	mutex_enter(&zev_mutex);
1083 	if ((req_q = ddi_get_soft_state(statep, minor)) == NULL) {
1084 		mutex_exit(&zev_mutex);
1085 		return (ENXIO);
1086 	}
1087 	zev_queue_hold(req_q);
1088 	mutex_exit(&zev_mutex);
1089 	/*
1090 	 * all structures passed between kernel and userspace
1091 	 * are now compatible between 64 and 32 bit.  Model
1092 	 * conversion can be ignored.
1093 	 */
1094 	switch (cmd) {
1095 	case ZEV_IOC_GET_GLOBAL_STATISTICS:
1096 		/* ddi_copyout() can take a long time.  Better make
1097 		   a copy to be able to release the mutex faster. */
1098 		mutex_enter(&zev_mutex);
1099 		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
1100 		mutex_exit(&zev_mutex);
1101 		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
1102 			ret = EFAULT;
1103 		break;
1104 	case ZEV_IOC_GET_QUEUE_STATISTICS:
1105 		ret = zev_ioc_get_queue_statistics(req_q, arg, mode);
1106 		break;
1107 	case ZEV_IOC_MUTE_POOL:
1108 	case ZEV_IOC_UNMUTE_POOL:
1109 		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) {
1110 			ret = EFAULT;
1111 			break;
1112 		}
1113 		if (pa.zev_poolname_len >=MAXPATHLEN) {
1114 			ret = EINVAL;
1115 			break;
1116 		}
1117 		pa.zev_poolname[pa.zev_poolname_len] = '\0';
1118 		if (cmd == ZEV_IOC_MUTE_POOL) {
1119 			ret = zev_ioc_mute_pool(pa.zev_poolname);
1120 		} else {
1121 			ret = zev_ioc_unmute_pool(pa.zev_poolname);
1122 		}
1123 		break;
1124 	case ZEV_IOC_SET_MAX_QUEUE_LEN:
1125 		ret = zev_ioc_set_max_queue_len(req_q, arg, mode);
1126 		break;
1127 	case ZEV_IOC_GET_QUEUE_PROPERTIES:
1128 		ret = zev_ioc_get_queue_properties(req_q, arg, mode);
1129 		break;
1130 	case ZEV_IOC_SET_QUEUE_PROPERTIES:
1131 		ret = zev_ioc_set_queue_properties(req_q, arg, mode);
1132 		break;
1133 	case ZEV_IOC_MARK:
1134 		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) {
1135 			ret = EFAULT;
1136 			break;
1137 		}
1138 		/* prepare message */
1139 		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
1140 		msg = zev_alloc(sizeof(*msg) + msg_size);
1141 		msg->size = msg_size;
1142 		rec = (zev_mark_t *)(msg + 1);
1143 		rec->record_len = msg_size;
1144 		rec->op = ZEV_OP_MARK;
1145 		rec->op_time = ddi_get_time();
1146 		rec->guid = mark.zev_guid;
1147 		rec->payload_len = mark.zev_payload_len;
1148 		/* get payload */
1149 		if (ddi_copyin(((char *)arg) + sizeof(mark),
1150 		               ZEV_PAYLOAD(rec),
1151 		               mark.zev_payload_len, mode) != 0) {
1152 			zev_free(msg, msg_size);
1153 			ret = EFAULT;
1154 			break;
1155 		}
1156 		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
1157 		/* get mark id and queue message */
1158 		mutex_enter(&zev_mark_id_mutex);
1159 		mark_id = zev_mark_id++;
1160 		mutex_exit(&zev_mark_id_mutex);
1161 		rec->mark_id = mark_id;
1162 		zev_queue_message(ZEV_OP_MARK, msg);
1163 		/* report mark id to userland, ignore errors */
1164 		mark.zev_mark_id = mark_id;
1165 		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
1166 		break;
1167 	case ZEV_IOC_ADD_QUEUE:
1168 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1169 			ret = EACCES;
1170 			break;
1171 		}
1172 		ret = zev_ioc_add_queue(req_q, arg, mode);
1173 		break;
1174 	case ZEV_IOC_REMOVE_QUEUE:
1175 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1176 			ret = EACCES;
1177 			break;
1178 		}
1179 		ret = zev_ioc_remove_queue(req_q, arg, mode);
1180 		break;
1181 	case ZEV_IOC_GET_DEBUG_INFO:
1182 		ret = zev_ioc_get_debug_info(req_q, arg, mode);
1183 		break;
1184 	case ZEV_IOC_GET_QUEUE_LIST:
1185 		ret = zev_ioc_get_queue_list(req_q, arg, mode);
1186 		break;
1187 	case ZEV_IOC_GET_FILE_SIGNATURES:
1188 		ret = zev_ioc_get_signatures(arg, mode);
1189 		break;
1190 	case ZEV_IOC_GET_ZEV_VERSION:
1191 		ret = zev_ioc_get_zev_version(arg, mode);
1192 		break;
1193 	default:
1194 		/* generic "ioctl unknown" error */
1195 		ret = ENOTTY;
1196 	}
1197 
1198 	mutex_enter(&zev_mutex);
1199 	zev_queue_release(req_q);
1200 	mutex_exit(&zev_mutex);
1201 	if (ret)
1202 		SET_ERROR(ret);
1203 	return (ret);
1204 }
1205 
1206 static int
1207 zev_chpoll(dev_t dev, short events, int anyyet,
1208     short *reventsp, struct pollhead **phpp)
1209 {
1210 	int minor;
1211 	short revent = 0;
1212 	zev_queue_t *q;
1213 
1214 	/* use minor-specific queue context and it's pollhead */
1215 	minor = getminor(dev);
1216 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1217 		return (EINVAL);
1218 	mutex_enter(&zev_mutex);
1219 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1220 		mutex_exit(&zev_mutex);
1221 		return (ENXIO);
1222 	}
1223 	revent = 0;
1224 	if ((events & POLLIN)) {
1225 		if (q->zq_oldest)
1226 			revent |= POLLIN;
1227 	}
1228 	if (revent == 0) {
1229 		if (!anyyet) {
1230 			*phpp = &q->zq_pollhead;
1231 		}
1232 	}
1233 	*reventsp = revent;
1234 	mutex_exit(&zev_mutex);
1235 	return (0);
1236 }
1237 
/* ARGSUSED */
/*
 * read(9E) entry point for queue devices.
 *
 * Blocks (interruptibly) until the queue has at least one message, then
 * copies out as many whole messages as fit into the caller's buffer.
 * Messages are never split; if the first pending message is larger than
 * the supplied buffer, E2BIG is returned.  The uio offset is restored
 * afterwards so the device behaves as offset-less.
 */
static int
zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
{
	minor_t minor;
	offset_t off;
	int ret = 0;
	zev_msg_t *msg;
	char *data;
	zev_queue_t *q;

	minor = getminor(dev);
	/* the control device carries no event stream */
	if (minor == ZEV_CONTROL_DEVICE_MINOR)
		return (EINVAL);

	mutex_enter(&zev_mutex);
	q = ddi_get_soft_state(statep, minor);
	if (q == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	/* remember the offset; it is restored before returning data */
	off = uio_p->uio_loffset;
	msg = q->zq_oldest;
	while (msg == NULL) {
		if (!ddi_can_receive_sig()) {
			/*
			 * read() shouldn't block because this thread
			 * can't receive signals. (e.g., it might be
			 * torn down by exit() right now.)
			 */
			mutex_exit(&zev_mutex);
			return 0;
		}
		if (cv_wait_sig(&q->zq_condvar, &zev_mutex) == 0) {
			/* signal received. */
			mutex_exit(&zev_mutex);
			return EINTR;
		}
		msg = q->zq_oldest;
	}
	/* whole messages only: buffer must hold at least the first one */
	if (msg->size > uio_p->uio_resid) {
		mutex_exit(&zev_mutex);
		return E2BIG;
	}
	while (msg && uio_p->uio_resid >= msg->size) {
		data = (char *)(msg + 1);
		ret = uiomove(data, msg->size, UIO_READ, uio_p);
		if (ret != 0) {
			mutex_exit(&zev_mutex);
			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
			uio_p->uio_loffset = off;
			return (ret);
		}
		/* dequeue the message and update per-queue accounting */
		q->zq_oldest = msg->next;
		q->zq_bytes_read += msg->size;
		q->zq_queue_len -= msg->size;
		q->zq_queue_messages--;
		msg->read++;
		msg = q->zq_oldest;
	}
	/* drop messages no queue references anymore, wake writers */
	zev_queue_trim();
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);
	uio_p->uio_loffset = off;
	return 0;
}
1304 
1305 /* ARGSUSED */
1306 static int
1307 zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
1308 {
1309 	zev_queue_t *q;
1310 	int minor;
1311 
1312 	minor = getminor(dev);
1313 	if (otyp != OTYP_CHR)
1314 		return (EINVAL);
1315 	mutex_enter(&zev_mutex);
1316 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1317 		mutex_exit(&zev_mutex);
1318 		return (ENXIO);
1319 	}
1320 	if (q->zq_busy != B_TRUE) {
1321 		mutex_exit(&zev_mutex);
1322 		return (EINVAL);
1323 	}
1324 	q->zq_busy = B_FALSE;
1325 	if ((q->zq_flags & ZEV_FL_PERSISTENT) == 0)
1326 		zev_queue_release(q);
1327 	mutex_exit(&zev_mutex);
1328 	return (0);
1329 }
1330 
1331 /* ARGSUSED */
1332 static int
1333 zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1334 {
1335 	zev_queue_t *q;
1336 	minor_t minor;
1337 	char zq_name[ZEV_MAX_QUEUE_NAME_LEN];
1338 	int ret;
1339 
1340 	minor = getminor(*devp);
1341 	if (otyp != OTYP_CHR)
1342 		return (EINVAL);
1343 	if (drv_priv(credp) != 0)
1344 		return (EPERM);
1345 	if (minor == ZEV_TMPQUEUE_DEVICE_MINOR) {
1346 		/* get control queue soft state to have dip */
1347 		if ((q = ddi_get_soft_state(statep,
1348 		                            ZEV_CONTROL_DEVICE_MINOR)) == NULL){
1349 			mutex_exit(&zev_mutex);
1350 			return (ENXIO);
1351 		}
1352 
1353 		/* create new temporary queue and return it. */
1354 
1355 		snprintf(zq_name, sizeof(zq_name),
1356 		         ZEV_TMPQUEUE_DEVICE_NAME ".%d", zev_tmpqueue_num++);
1357 
1358 		ret = zev_queue_new(&q, q->zq_dip, zq_name, 0, 0);
1359 		if (ret) {
1360 			return ret;
1361 		}
1362 
1363 		q->zq_busy = B_TRUE;
1364 		*devp = makedevice(getmajor(*devp), q->zq_minor_number);
1365 		return 0;
1366 	}
1367 	mutex_enter(&zev_mutex);
1368 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1369 		mutex_exit(&zev_mutex);
1370 		return (ENXIO);
1371 	}
1372 	if (minor == ZEV_CONTROL_DEVICE_MINOR) {
1373 		/* control device may be used in parallel */
1374 		q->zq_busy = B_TRUE;
1375 		mutex_exit(&zev_mutex);
1376 		return 0;
1377 	}
1378 	if (q->zq_busy == B_TRUE) {
1379 		mutex_exit(&zev_mutex);
1380 		return (EBUSY);
1381 	}
1382 	q->zq_busy = B_TRUE;	/* can only be opened exclusively */
1383 	mutex_exit(&zev_mutex);
1384 	return (0);
1385 }
1386 
1387 static struct cb_ops zev_cb_ops = {
1388 	zev_open,		/* open */
1389 	zev_close,		/* close */
1390 	nodev,			/* strategy */
1391 	nodev,			/* print */
1392 	nodev,			/* dump */
1393 	zev_read,		/* read */
1394 	nodev,			/* write */
1395 	zev_ioctl,		/* ioctl */
1396 	nodev,			/* devmap */
1397 	nodev,			/* mmap */
1398 	nodev,			/* segmap */
1399 	zev_chpoll,		/* chpoll */
1400 	ddi_prop_op,		/* prop_op */
1401 	NULL,			/* streamtab */
1402 	D_MP | D_64BIT,		/* cb_flag */
1403 	CB_REV,			/* cb_rev */
1404 	nodev,			/* aread */
1405 	nodev,			/* awrite */
1406 };
1407 
/*
 * Free all per-instance resources: minor nodes, the poll wakeup thread,
 * the "ctrl" dummy queue and every remaining event queue.  Called from
 * zev_detach() and from the zev_attach() failure path.  Only instance 0
 * exists (hardcoded in zev.conf).
 */
static void
zev_free_instance(dev_info_t *dip)
{
	int instance;
	zev_queue_t *q;
	int i;

	instance = ddi_get_instance(dip);
	if (instance != 0) {
		cmn_err(CE_WARN, "zev: tried to free instance != 0 (%d)",
		        instance);
		return;
	}

	/* remove all minor nodes and force-clean their devfs entries */
	ddi_remove_minor_node(dip, NULL);
	devfs_clean(dip, NULL, DV_CLEAN_FORCE);

	/* stop pollwakeup thread */
	zev_wakeup_thread_run = 0;
	if (zev_poll_wakeup_thread != NULL) {
		thread_join(zev_poll_wakeup_thread->t_did);
		zev_poll_wakeup_thread = NULL;
	}

	mutex_enter(&zev_mutex);

	/* remove "ctrl" dummy queue */
	q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
	if (q) {
		ddi_soft_state_free(statep, ZEV_CONTROL_DEVICE_MINOR);
		ZEV_MEM_SUB(sizeof(zev_queue_t));
	}

	/* remove all other queues */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i- ZEV_MINOR_MIN];
		if (!q)
			continue;
		/* at this point the only reference left should be ours */
		ASSERT(q->zq_refcnt == 1);
		zev_queue_release(q);
	}
	/* drop queued messages that no queue references anymore */
	zev_queue_trim();
	bzero(&zev_queues, sizeof(zev_queues));

	mutex_exit(&zev_mutex);

}
1455 
/*
 * detach(9E) entry point.  Refuses to detach while the control device is
 * open or any event queues still exist; otherwise switches the ZFS event
 * callbacks back to the defaults and frees all instance resources.
 */
static int
zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance;
	zev_queue_t *q;

	/* called once per instance with DDI_DETACH,
	   may be called to suspend */
	switch (cmd) {
	case DDI_DETACH:
		/* instance busy? */
		instance = ddi_get_instance(dip);
		if (instance != 0) {	/* hardcoded in zev.conf */
			/* this module only supports one instance. */
			return (DDI_FAILURE);
		}

		mutex_enter(&zev_mutex);
		if (!zev_attached) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}

		/* check "ctrl" queue to see if it is busy */
		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
		if (q == NULL) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		if (q->zq_busy) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		/* are there any queues? */
		if (zev_queue_cnt > 0) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}

		/* commit to detaching before dropping the mutex */
		zev_attached = B_FALSE;
		mutex_exit(&zev_mutex);

		/* switch ZFS event callbacks back to default */
		rw_enter(&rz_zev_rwlock, RW_WRITER);
		rz_zev_callbacks = rz_zev_default_callbacks;
		rz_zev_set_active(B_FALSE);
		rw_exit(&rz_zev_rwlock);

		/* no thread is inside of the callbacks anymore. */

		/* free resources allocated for this instance */
		zev_free_instance(dip);
		zev_chksum_fini();
#if 0
		cmn_err(CE_WARN, "zev: allocated memory at detach: %" PRIu64,
			zev_memory_allocated - zev_memory_freed);
#endif
		return (DDI_SUCCESS);
	case DDI_SUSPEND:
		/* kernel must not suspend zev devices while ZFS is running */
		return (DDI_FAILURE);
	default:
		return (DDI_FAILURE);
	}
}
1521 
/*
 * attach(9E) entry point.  Creates the "ctrl" and "tmpqueue" device nodes,
 * a default event queue, the poll wakeup thread, and finally switches the
 * ZFS event callbacks over to this module.  On any failure the instance
 * is torn down again via zev_free_instance().
 */
static int
zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	/* called once per instance with DDI_ATTACH,
	   may be called to resume */
	int instance;
	int error;
	zev_queue_t *q;
	switch (cmd) {
	case DDI_ATTACH:
		/* create instance state */
		instance = ddi_get_instance(dip);
		if (instance != 0) {	/* hardcoded in zev.conf */
			/* this module only supports one instance. */
			return (DDI_FAILURE);
		}

		mutex_enter(&zev_mutex);
		if (zev_attached) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		if (ddi_soft_state_zalloc(statep, ZEV_CONTROL_DEVICE_MINOR) !=
		    DDI_SUCCESS) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		ZEV_MEM_ADD(sizeof(zev_queue_t));
		zev_attached = B_TRUE;

		/* init queue list */
		bzero(&zev_queues, sizeof(zev_queues));
		mutex_exit(&zev_mutex);

		/* create a dummy queue for management of "ctrl" */

		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
		q->zq_dip = dip;
		q->zq_refcnt = 1;
		q->zq_busy = B_FALSE;
		q->zq_minor_number = ZEV_CONTROL_DEVICE_MINOR;
		q->zq_flags = ZEV_FL_PERSISTENT;
		strcpy(q->zq_name, ZEV_CONTROL_DEVICE_NAME);

		/* create device node for "ctrl" */
		if (ddi_create_minor_node(dip, ZEV_CONTROL_DEVICE_NAME,
		    S_IFCHR, ZEV_CONTROL_DEVICE_MINOR,
		    DDI_PSEUDO, 0) == DDI_FAILURE) {
			goto fail;
		}

		/* note: intentionally not adding ctrl queue to queue list. */

		/* create device node for "tmpqueue" */
		if (ddi_create_minor_node(dip, ZEV_TMPQUEUE_DEVICE_NAME,
		    S_IFCHR, ZEV_TMPQUEUE_DEVICE_MINOR,
		    DDI_PSEUDO, 0) == DDI_FAILURE) {
			goto fail;
		}

		/* default queue */
		error = zev_queue_new(&q, dip,
				      ZEV_DEFAULT_QUEUE_NAME,
				      ZEV_MAX_QUEUE_LEN,
				      ZEV_FL_BLOCK_WHILE_QUEUE_FULL|
		                      ZEV_FL_PERSISTENT);
		if (error)
			goto fail;

		/* start pollwakeup thread */
		zev_wakeup_thread_run = 1;
		zev_poll_wakeup_thread = thread_create(NULL, 0,
		    zev_poll_wakeup_thread_main, NULL, 0, &p0,
		    TS_RUN, minclsyspri);

		ddi_report_dev(dip);

		zev_chksum_init();

		/* switch ZFS event callbacks to zev module callbacks */
		rw_enter(&rz_zev_rwlock, RW_WRITER);
		rz_zev_callbacks = &zev_callbacks;
		rz_zev_set_active(B_TRUE);
		rw_exit(&rz_zev_rwlock);

		return (DDI_SUCCESS);
	case DDI_RESUME:
		/* suspending zev devices should never happen */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
fail:
	cmn_err(CE_WARN, "zev: attach failed");
	zev_free_instance(dip);
	mutex_enter(&zev_mutex);
	zev_attached = B_FALSE;
	mutex_exit(&zev_mutex);
	return (DDI_FAILURE);
}
1622 
1623 /* ARGSUSED */
1624 static int
1625 zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
1626 {
1627 	minor_t minor;
1628 	zev_queue_t *q;
1629 
1630 	/* arg is dev_t */
1631 	minor = getminor((dev_t)arg);
1632 	mutex_enter(&zev_mutex);
1633 	q = ddi_get_soft_state(statep, minor);
1634 	if (q == NULL) {
1635 		*resultp = NULL;
1636 		mutex_exit(&zev_mutex);
1637 		return (DDI_FAILURE);
1638 	}
1639 
1640 	switch (infocmd) {
1641 	case DDI_INFO_DEVT2DEVINFO:
1642 		*resultp = q->zq_dip;
1643 		break;
1644 	case DDI_INFO_DEVT2INSTANCE:
1645 		*resultp = (void *)(uintptr_t)ddi_get_instance(q->zq_dip);
1646 		break;
1647 	default:
1648 		mutex_exit(&zev_mutex);
1649 		return (DDI_FAILURE);
1650 	}
1651 	mutex_exit(&zev_mutex);
1652 	return (DDI_SUCCESS);
1653 }
1654 
1655 static struct dev_ops zev_dev_ops = {
1656 	DEVO_REV,			/* driver build revision */
1657 	0,				/* driver reference count */
1658 	zev_getinfo,			/* getinfo */
1659 	nulldev,			/* identify (obsolete) */
1660 	nulldev,			/* probe (search for devices) */
1661 	zev_attach,			/* attach */
1662 	zev_detach,			/* detach */
1663 	nodev,				/* reset (obsolete, use quiesce) */
1664 	&zev_cb_ops,			/* character and block device ops */
1665 	NULL,				/* bus driver ops */
1666 	NULL,				/* power management, not needed */
1667 	ddi_quiesce_not_needed,		/* quiesce */
1668 };
1669 
1670 static struct modldrv zev_modldrv = {
1671 	&mod_driverops,			/* all loadable modules use this */
1672 	"ZFS event provider, v"
1673 		XSTRING(ZEV_MAJOR_VERSION) "."
1674 		XSTRING(ZEV_MINOR_VERSION),
1675 					/* driver name and version info */
1676 	&zev_dev_ops			/* ops method pointers */
1677 };
1678 
1679 static struct modlinkage zev_modlinkage = {
1680 	MODREV_1,	/* fixed value */
1681 	{
1682 		&zev_modldrv,	/* driver linkage structure */
1683 		NULL		/* list terminator */
1684 	}
1685 };
1686 
1687 int
1688 _init(void)
1689 {
1690 	int error;
1691 
1692 	if ((error = ddi_soft_state_init(&statep, sizeof(zev_queue_t), 1)) != 0)
1693 		return (error);
1694 	zev_attached = B_FALSE;
1695 
1696 	zev_queue_head = NULL;
1697 	zev_queue_tail = NULL;
1698 	zev_queue_len = 0;
1699 	zev_muted_pools_head = NULL;
1700 	zev_memory_allocated = 0;
1701 	zev_memory_freed = 0;
1702 	zev_queue_cnt = 0;
1703 	zev_have_blocking_queues = 1;
1704 
1705 	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
1706 	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
1707 	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
1708 	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
1709 	zev_mark_id = gethrtime();
1710 	mutex_init(&zev_queue_msg_mutex, NULL, MUTEX_DRIVER, NULL);
1711 	zev_msg_sequence_number = gethrtime();
1712 	bzero(&zev_statistics, sizeof(zev_statistics));
1713 	bzero(&zev_pollhead, sizeof(zev_pollhead));
1714 	bzero(&zev_queues, sizeof(zev_queues));
1715 	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
1716 	if (zev_ioc_mute_pool("zg0")) {
1717 		cmn_err(CE_WARN, "zev: could not init mute list");
1718 		goto FAIL;
1719 	}
1720 
1721 	if ((error = mod_install(&zev_modlinkage)) != 0) {
1722 		cmn_err(CE_WARN, "zev: could not install module");
1723 		goto FAIL;
1724 	}
1725 
1726 	return (0);
1727 FAIL:
1728 	/* free resources */
1729 	cmn_err(CE_WARN, "zev: _init failed");
1730 	mutex_destroy(&zev_mutex);
1731 	ddi_soft_state_fini(&statep);
1732 	return (error);
1733 }
1734 
1735 int
1736 _info(struct modinfo *modinfop)
1737 {
1738 	return (mod_info(&zev_modlinkage, modinfop));
1739 }
1740 
/*
 * _fini(9E): unload the module.  Fails with EBUSY while the driver is
 * attached or any queues remain.  Drains the global event list twice:
 * once before mod_remove() to unblock writers waiting on a full list,
 * and once afterwards to free anything queued in the meantime.
 */
int
_fini(void)
{
	int error = 0;
	zev_msg_t *msg;
	zev_pool_list_entry_t *pe, *npe;

	mutex_enter(&zev_mutex);
	if (zev_attached == B_TRUE) {
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}
	if (zev_queue_cnt != 0) {
		/* should never happen */
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}

	/*
	 * avoid deadlock if event list is full: make sure threads currently
	 * blocking on the event list can append their event and then release
	 * rz_zev_rwlock.  Since there should be no queues left when we
	 * reach this point we can simply empty the event list and then
	 * wake everybody.
	 */
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		zev_free(msg, sizeof(*msg) + msg->size);
	}
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);

	/* switch ZFS event callbacks back to default (again) */
	rw_enter(&rz_zev_rwlock, RW_WRITER);
	rz_zev_callbacks = rz_zev_default_callbacks;
	rz_zev_set_active(B_FALSE);
	rw_exit(&rz_zev_rwlock);

	/* no thread is inside of the callbacks anymore.  Safe to remove. */

	/* unload module callbacks */
	if ((error = mod_remove(&zev_modlinkage)) != 0) {
		cmn_err(CE_WARN, "mod_remove failed: %d", error);
		return (error);
	}

	/* free resources */
	/* second drain: events appended while we were still loaded */
	mutex_enter(&zev_mutex);
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		zev_free(msg, sizeof(*msg) + msg->size);
	}
	mutex_exit(&zev_mutex);
	/* free the muted-pools list */
	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	pe = zev_muted_pools_head;
	while (pe) {
		npe = pe;
		pe = pe->next;
		zev_free(npe, sizeof(*npe));
	}
	rw_exit(&zev_pool_list_rwlock);
	ddi_soft_state_fini(&statep);
	/* destroy synchronization primitives last */
	rw_destroy(&zev_pool_list_rwlock);
	cv_destroy(&zev_condvar);
	mutex_destroy(&zev_mutex);
	mutex_destroy(&zev_mark_id_mutex);
	mutex_destroy(&zev_queue_msg_mutex);

	return (0);
}
1813 
1814