xref: /titanic_51/usr/src/uts/common/fs/zev/zev.c (revision 42cc51e07cdbcad3b9aca8d9d991fc09b251feb7)
1 #include <sys/modctl.h>
2 #include <sys/ddi.h>
3 #include <sys/sunddi.h>
4 #include <sys/conf.h>
5 #include <sys/devops.h>
6 #include <sys/stat.h>
7 #include <sys/fs/zev.h>
8 #include <sys/zev_callbacks.h>
9 #include <sys/zev_checksums.h>
10 #include <sys/zfs_znode.h>
11 #include <sys/time.h>
12 #include <sys/sa.h>
13 #include <sys/zap.h>
14 #include <sys/time.h>
15 #include <sys/fs/dv_node.h>
16 
/*
 * Byte offset of member m inside structure type s, obtained by taking the
 * address of the member in a structure notionally located at address 0.
 */
#define	OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))

/*
 * Two-step stringification: XSTRING(x) macro-expands x before turning it
 * into a string literal, STRING(x) stringifies its argument as written.
 */
#define XSTRING(x)	STRING(x)
#define STRING(x)	#x

/* "beaver" is the queue referred to in zev_queue_message() as the default */
#define ZEV_DEFAULT_QUEUE_NAME		"beaver"
/*
 * Minor number layout: 0 is the control device, 1 the tmpqueue device.
 * Queue minors occupy ZEV_MINOR_MIN..ZEV_MINOR_MAX (ZEV_MAX_QUEUES values);
 * a queue's slot in zev_queues[] is (minor - ZEV_MINOR_MIN).
 */
#define ZEV_CONTROL_DEVICE_MINOR	0
#define ZEV_TMPQUEUE_DEVICE_MINOR	1
#define ZEV_MINOR_MIN			(ZEV_TMPQUEUE_DEVICE_MINOR + 1)
#define ZEV_MINOR_MAX			(ZEV_MINOR_MIN + ZEV_MAX_QUEUES - 1)
27 
/*
 * Per-queue state.  One instance exists per queue device; it is stored as
 * the soft state of the queue's minor number and referenced from
 * zev_queues[].  Messages themselves live in one global list; a queue only
 * remembers the oldest message it still references (zq_oldest).
 */
typedef struct zev_queue {
	/* NUL-terminated queue name, also used as the minor node name */
	char			zq_name[ZEV_MAX_QUEUE_NAME_LEN+1];
	/* device minor; slot in zev_queues[] is zq_minor_number - ZEV_MINOR_MIN */
	minor_t			zq_minor_number;
	/* dev_info used to create and remove the queue's minor node */
	dev_info_t		*zq_dip;
	/* pollhead handed to pollwakeup() for this queue */
	struct pollhead		zq_pollhead;
	/* read counters; zq_bytes_read is reported via queue statistics */
	uint64_t		zq_bytes_read;
	uint64_t		zq_events_read;
	/* bytes/events dropped from this queue because it was over its limit */
	uint64_t		zq_bytes_discarded;
	uint64_t		zq_events_discarded;
	/* bytes/events ever registered with this queue */
	uint64_t		zq_bytes_total;
	uint64_t		zq_events_total;
	/* queue length (bytes) above which a pollwakeup() is issued */
	uint64_t		zq_wakeup_threshold;
	/* ZEV_FL_* flags (e.g. PERSISTENT, BLOCK_WHILE_QUEUE_FULL) */
	uint16_t		zq_flags;
	/* NOTE(review): not referenced in the visible part of this file */
	uint16_t		zq_need_wakeup;
	/* protected by zev_mutex */
	/* references taken by zev_queue_hold(), dropped by zev_queue_release() */
	int			zq_refcnt;
	/* sum of msg->size over the messages pending in this queue */
	uint64_t		zq_queue_len;
	/* number of messages pending in this queue */
	uint64_t		zq_queue_messages;
	/* limit for zq_queue_len */
	uint64_t		zq_max_queue_len;
	/* oldest message in the global list this queue still references */
	zev_msg_t		*zq_oldest;
	/* checked before poll wakeups and removal; set outside the visible code */
	boolean_t		zq_busy;
	/* set once by zev_ioc_remove_queue() so the reference is dropped only once */
	boolean_t		zq_to_be_removed;
	/* per-operation event counters for this queue */
	zev_statistics_t	zq_statistics;
	/* broadcast when a message arrives in a previously empty queue */
	kcondvar_t		zq_condvar;
} zev_queue_t;
53 
/* soft-state handle; per-minor zev_queue_t structures are allocated from it */
static void		*statep;
/* NOTE(review): not referenced in the visible part of this file */
struct pollhead		zev_pollhead;

/* protects the queue table, queue reference counts and the message list */
kmutex_t		zev_mutex;
/* waited on by zev_queue_message() while a blocking queue is full */
kcondvar_t		zev_condvar;
/* serializes zev_queue_message() callers, see the comment there */
kmutex_t		zev_queue_msg_mutex;
/* protects the list of muted pools */
krwlock_t		zev_pool_list_rwlock;
/* global statistics, including total queue length and its limit */
static zev_statistics_t	zev_statistics;
static boolean_t	zev_attached;
/* zev_mark_id_mutex protects zev_mark_id, the next id used by ZEV_IOC_MARK */
static kmutex_t		zev_mark_id_mutex;
static uint64_t		zev_mark_id = 0;

/* sequence number assigned to the next queued message */
static uint64_t		zev_msg_sequence_number = 0;
/* queue table, indexed by (minor - ZEV_MINOR_MIN); unused slots are NULL */
static zev_queue_t	*zev_queues[ZEV_MAX_QUEUES];
/* number of non-NULL entries in zev_queues[] */
static int		zev_queue_cnt = 0;
/* non-zero while a queue has ZEV_FL_BLOCK_WHILE_QUEUE_FULL set */
static int		zev_have_blocking_queues = 1;
static int		zev_tmpqueue_num = 0;

/* allocation accounting; their difference is reported as debug info */
uint64_t	zev_memory_allocated = 0;
uint64_t	zev_memory_freed = 0;
74 
/*
 * The longest potential message is from zev_zfs_mount() and
 * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
 *
 * Another candidate is zev_znode_rename_cb() and contains three inode
 * numbers and two filenames of up to MAXNAMELEN bytes each.
 */
#define ZEV_MAX_MESSAGE_LEN	4096

/*
 * Global doubly linked message list shared by all queues.  New messages
 * are appended at zev_queue_tail, zev_queue_trim() removes from
 * zev_queue_head.  zev_queue_len counts messages (not bytes).
 */
static zev_msg_t *zev_queue_head = NULL;
static zev_msg_t *zev_queue_tail = NULL;
static uint64_t zev_queue_len = 0;
87 
88 
/* One muted pool; entries form a singly linked list. */
typedef struct zev_pool_list_entry {
	struct zev_pool_list_entry	*next;
	/* pool name, compared with strcmp() against the spa name */
	char				name[MAXPATHLEN];
} zev_pool_list_entry_t;

/* head of the muted pool list, protected by zev_pool_list_rwlock */
static zev_pool_list_entry_t *zev_muted_pools_head = NULL;

/* loop condition of zev_poll_wakeup_thread_main(); cleared to stop it */
static volatile int zev_wakeup_thread_run = 1;
/* presumably the thread running zev_poll_wakeup_thread_main() - TODO confirm */
static kthread_t *zev_poll_wakeup_thread = NULL;
98 
99 void *
100 zev_alloc(ssize_t sz)
101 {
102 	ZEV_MEM_ADD(sz);
103 	return kmem_alloc(sz, KM_SLEEP);
104 }
105 
106 void *
107 zev_zalloc(ssize_t sz)
108 {
109 	ZEV_MEM_ADD(sz);
110 	return kmem_zalloc(sz, KM_SLEEP);
111 }
112 
113 void
114 zev_free(void *ptr, ssize_t sz)
115 {
116 	ZEV_MEM_SUB(sz);						\
117 	kmem_free(ptr, sz);
118 }
119 
120 /* must be called with zev_mutex held */
121 static void
122 zev_update_blockflag(void)
123 {
124 	zev_queue_t *q;
125 	int had_blocking_queues;
126 	int i;
127 
128 	had_blocking_queues = zev_have_blocking_queues;
129 
130 	/* do we still have blocking queues? */
131 	zev_have_blocking_queues = 0;
132 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
133 		q = zev_queues[i - ZEV_MINOR_MIN];
134 		if (!q)
135 			continue;
136 		if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
137 			zev_have_blocking_queues = 1;
138 			break;
139 		}
140 	}
141 	/* no blocking queues */
142 	if (had_blocking_queues)
143 		cv_broadcast(&zev_condvar);
144 }
145 
146 int
147 zev_queue_cmp(const void *a, const void *b)
148 {
149 	const zev_queue_t *qa = a;
150 	const zev_queue_t *qb = b;
151 	if (qa->zq_minor_number > qb->zq_minor_number)
152 		return 1;
153 	if (qa->zq_minor_number < qb->zq_minor_number)
154 		return -1;
155 	return 0;
156 }
157 
158 /* must be called with zev_mutex held */
159 void
160 zev_queue_trim(void)
161 {
162 	zev_msg_t *m;
163 	uint64_t oldest_message;
164 	zev_queue_t *q;
165 	int i;
166 
167 	if (!zev_queue_tail)
168 		return;
169 
170 	oldest_message = zev_queue_tail->seq + 1;  /* does not exist, yet. */
171 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
172 		q = zev_queues[i - ZEV_MINOR_MIN];
173 		if (q == NULL)
174 			continue;
175 		if (!q->zq_oldest)
176 			continue;
177 		if (oldest_message > q->zq_oldest->seq)
178 			oldest_message = q->zq_oldest->seq;
179 	}
180 
181 	/* remove msgs between oldest_message and zev_queue_head */
182 	while(zev_queue_head && (oldest_message > zev_queue_head->seq)) {
183 		m = zev_queue_head;
184 		zev_queue_head = m->next;
185 		if (zev_queue_head == NULL) {
186 			zev_queue_tail = NULL;
187 		} else {
188 			zev_queue_head->prev = NULL;
189 		}
190 		if (m->read == 0) {
191 			zev_statistics.zev_bytes_discarded += m->size;
192 			zev_statistics.zev_cnt_discarded_events++;
193 		}
194 		zev_statistics.zev_queue_len -= m->size;
195 		zev_queue_len--;
196 		zev_free(m, sizeof(*m) + m->size);
197 	}
198 }
199 
200 /* must be called with zev_mutex held */
201 static void
202 zev_queue_hold(zev_queue_t *q)
203 {
204 	q->zq_refcnt++;
205 }
206 
207 /* must be called with zev_mutex held */
208 static void
209 zev_queue_release(zev_queue_t *q)
210 {
211 	q->zq_refcnt--;
212 	if (q->zq_refcnt > 0)
213 		return;
214 
215 	ASSERT(q->zq_busy == B_FALSE);
216 
217 	/* persistent queues will not be removed */
218 	if ((q->zq_flags & ZEV_FL_PERSISTENT) != 0)
219 		return;
220 
221 	/* remove queue from queue list */
222 	zev_queues[q->zq_minor_number - ZEV_MINOR_MIN] = NULL;
223 
224 	/* discard messages that no queue references anymore */
225 	zev_queue_trim();
226 
227 	cv_destroy(&q->zq_condvar);
228 	ddi_remove_minor_node(q->zq_dip, q->zq_name);
229 	devfs_clean(ddi_root_node() ? ddi_root_node() : q->zq_dip,
230 	            NULL, DV_CLEAN_FORCE);
231 	ddi_soft_state_free(statep, q->zq_minor_number);
232 	ZEV_MEM_SUB(sizeof(zev_queue_t));
233 	zev_queue_cnt--;
234 	zev_update_blockflag();
235 }
236 
237 int
238 zev_queue_new(zev_queue_t **queue,
239               dev_info_t *dip,
240               char *name,
241               uint64_t max_queue_len,
242               uint16_t flags)
243 {
244 	zev_queue_t *q;
245 	zev_queue_t *tmp;
246 	zev_msg_t *msg;
247 	int name_exists = 0;
248 	minor_t minor;
249 	char *p;
250 	int i;
251 
252 	if (max_queue_len > ZEV_MAX_QUEUE_LEN)
253 		return EINVAL;
254 	if (max_queue_len == 0)
255 		max_queue_len = ZEV_MAX_QUEUE_LEN;
256 	if (!strcmp(name, ZEV_CONTROL_DEVICE_NAME))
257 		return EINVAL;
258 	for (p = name; *p; p++) {
259 		if (*p >= 'a' && *p <= 'z')
260 			continue;
261 		if (*p >= '0' && *p <= '9')
262 			continue;
263 		if (*p == '.')
264 			continue;
265 		return EINVAL;
266 	}
267 
268 	mutex_enter(&zev_mutex);
269 
270 	/* find free minor number.*/
271 	/* if this were a frequent operation we'd have a free-minor list */
272 	for (minor = ZEV_MINOR_MIN; minor <= ZEV_MINOR_MAX; minor++) {
273 		tmp = zev_queues[minor - ZEV_MINOR_MIN];
274 		if (tmp == NULL)
275 			break;
276 	}
277 	if (tmp) {
278 		mutex_exit(&zev_mutex);
279 		return ENOSPC;
280 	}
281 
282 	if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) {
283 		mutex_exit(&zev_mutex);
284 		return ENOSPC;
285 	}
286 	ZEV_MEM_ADD(sizeof(zev_queue_t));
287 
288 	q = ddi_get_soft_state(statep, minor);
289 	memset(q, 0, sizeof(*q));
290 	strncpy(q->zq_name, name, ZEV_MAX_QUEUE_NAME_LEN);
291 	q->zq_name[ZEV_MAX_QUEUE_NAME_LEN] = '\0';
292 	q->zq_max_queue_len = max_queue_len;
293 	q->zq_wakeup_threshold = ZEV_DEFAULT_POLL_WAKEUP_QUEUE_LEN;
294 	q->zq_flags = flags;
295 	q->zq_refcnt = 1;
296 	q->zq_dip = dip;
297 	q->zq_minor_number = minor;
298 	cv_init(&q->zq_condvar, NULL, CV_DRIVER, NULL);
299 
300 	/* insert into queue list */
301 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
302 		/* if this were a frequent operation we'd have a name tree */
303 		if (zev_queues[i - ZEV_MINOR_MIN] == NULL)
304 			continue;
305 		if (!strcmp(q->zq_name, zev_queues[i-ZEV_MINOR_MIN]->zq_name)) {
306 			name_exists = 1;
307 			break;
308 		}
309 	}
310 	if (name_exists) {
311 		ddi_soft_state_free(statep, minor);
312 		ZEV_MEM_SUB(sizeof(zev_queue_t));
313 		mutex_exit(&zev_mutex);
314 		return EEXIST;
315 	}
316 	zev_queues[minor - ZEV_MINOR_MIN] = q;
317 	zev_queue_cnt++;
318 
319 	/* calculate current queue len and find head and tail */
320 	if (!(q->zq_flags & ZEV_FL_INITIALLY_EMPTY)) {
321 		q->zq_oldest = zev_queue_tail;
322 		msg = zev_queue_tail;
323 		while ((msg) && (q->zq_queue_len < q->zq_max_queue_len)) {
324 			q->zq_queue_len += msg->size;
325 			q->zq_queue_messages++;
326 			q->zq_oldest = msg;
327 			msg = msg->prev;
328 		}
329 	}
330 
331 	zev_update_blockflag();
332 
333 	mutex_exit(&zev_mutex);
334 
335 	if (ddi_create_minor_node(dip, name,
336 	    S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
337 		mutex_enter(&zev_mutex);
338 		zev_queues[minor - ZEV_MINOR_MIN] = NULL;
339 		zev_queue_cnt--;
340 		ddi_soft_state_free(statep, minor);
341 		ZEV_MEM_SUB(sizeof(zev_queue_t));
342 		zev_update_blockflag();
343 		mutex_exit(&zev_mutex);
344 		return EFAULT;
345 	}
346 
347 	*queue = q;
348 	return 0;
349 }
350 
/*
 * poll() wakeup thread.  Used to check periodically whether we have
 * bytes left in the queue that have not yet been made into a
 * pollwakeup() call.  This is meant to ensure a maximum waiting
 * time until an event is presented as a poll wakeup, while at
 * the same time not making every single event into a poll wakeup
 * of its own.
 */
359 
360 static void
361 zev_poll_wakeup(boolean_t flush_all)
362 {
363 	zev_queue_t *q;
364 	int i;
365 
366 	/*
367 	 * This loop works with hold() and release() because
368 	 * pollwakeup() requires us to release our locks before calling it.
369 	 *
370 	 * from pollwakeup(9F):
371 	 *
372 	 *   "Driver defined locks should not be held across calls
373 	 *    to this function."
374 	 */
375 
376 	/* wake up threads for each individual queue */
377 	mutex_enter(&zev_mutex);
378 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
379 		q = zev_queues[i - ZEV_MINOR_MIN];
380 		if (q == NULL)
381 			continue;
382 		if (!q->zq_busy)
383 			continue;
384 		if (!q->zq_queue_len)
385 			continue;
386 		if ((flush_all) ||
387 		    (q->zq_queue_len > q->zq_wakeup_threshold)) {
388 			zev_queue_hold(q);
389 			mutex_exit(&zev_mutex);
390 			pollwakeup(&q->zq_pollhead, POLLIN);
391 			mutex_enter(&zev_mutex);
392 			zev_queue_release(q);
393 		}
394 	}
395 	mutex_exit(&zev_mutex);
396 }
397 
398 static void
399 zev_poll_wakeup_thread_main(void)
400 {
401 	while (zev_wakeup_thread_run) {
402 		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
403 
404 		zev_poll_wakeup(B_TRUE);
405 	}
406 	thread_exit();
407 }
408 
409 static int
410 zev_ioc_mute_pool(char *poolname)
411 {
412 	zev_pool_list_entry_t *pe;
413 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
414 	/* pool already muted? */
415 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
416 		if (!strcmp(pe->name, poolname)) {
417 			rw_exit(&zev_pool_list_rwlock);
418 			return EEXIST;
419 		}
420 	}
421 	pe = zev_zalloc(sizeof(*pe));
422 	if (!pe) {
423 		rw_exit(&zev_pool_list_rwlock);
424 		return ENOMEM;
425 	}
426 	(void) strncpy(pe->name, poolname, sizeof(pe->name));
427 	pe->next = zev_muted_pools_head;
428 	zev_muted_pools_head = pe;
429 	rw_exit(&zev_pool_list_rwlock);
430 	return (0);
431 }
432 
433 static int
434 zev_ioc_unmute_pool(char *poolname)
435 {
436 	zev_pool_list_entry_t *pe, *peprev;
437 
438 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
439 	/* pool muted? */
440 	peprev = NULL;
441 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
442 		if (!strcmp(pe->name, poolname))
443 			break;
444 		peprev = pe;
445 	}
446 	if (pe) {
447 		rw_exit(&zev_pool_list_rwlock);
448 		return ENOENT;
449 	}
450 
451 	if (peprev != NULL) {
452 		peprev->next = pe->next;
453 	} else {
454 		zev_muted_pools_head = pe->next;
455 	}
456 	zev_free(pe, sizeof(*pe));
457 	rw_exit(&zev_pool_list_rwlock);
458 	return (0);
459 }
460 
461 int
462 zev_skip_pool(objset_t *os)
463 {
464 	zev_pool_list_entry_t *pe;
465 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
466 	rw_enter(&zev_pool_list_rwlock, RW_READER);
467 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
468 		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
469 			rw_exit(&zev_pool_list_rwlock);
470 			return 1;
471 		}
472 	}
473 	rw_exit(&zev_pool_list_rwlock);
474 	return 0;
475 }
476 
477 int
478 zev_skip_fs(zfsvfs_t *fs)
479 {
480 	dsl_dir_t *d = fs->z_os->os_dsl_dataset->ds_dir;
481 	dsl_dir_t *prev = NULL;
482 
483 	while (d && d != prev) {
484 		if (strstr(d->dd_myname, "_root"))
485 			return 0;
486 		prev = d;
487 		d = d->dd_parent;
488 	}
489 	return 1;
490 }
491 
492 static void
493 zev_update_statistics(int op, zev_statistics_t *stat)
494 {
495 	switch (op) {
496 	case ZEV_OP_ERROR:
497 		stat->zev_cnt_errors++;
498 		break;
499 	case ZEV_OP_MARK:
500 		stat->zev_cnt_marks++;
501 		break;
502 	case ZEV_OP_ZFS_MOUNT:
503 		stat->zev_cnt_zfs_mount++;
504 		break;
505 	case ZEV_OP_ZFS_UMOUNT:
506 		stat->zev_cnt_zfs_umount++;
507 		break;
508 	case ZEV_OP_ZVOL_WRITE:
509 		stat->zev_cnt_zvol_write++;
510 		break;
511 	case ZEV_OP_ZVOL_TRUNCATE:
512 		stat->zev_cnt_zvol_truncate++;
513 		break;
514 	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
515 		stat->zev_cnt_znode_close_after_update++;
516 		break;
517 	case ZEV_OP_ZNODE_CREATE:
518 		stat->zev_cnt_znode_create++;
519 		break;
520 	case ZEV_OP_ZNODE_REMOVE:
521 		stat->zev_cnt_znode_remove++;
522 		break;
523 	case ZEV_OP_ZNODE_LINK:
524 		stat->zev_cnt_znode_link++;
525 		break;
526 	case ZEV_OP_ZNODE_SYMLINK:
527 		stat->zev_cnt_znode_symlink++;
528 		break;
529 	case ZEV_OP_ZNODE_RENAME:
530 		stat->zev_cnt_znode_rename++;
531 		break;
532 	case ZEV_OP_ZNODE_WRITE:
533 		stat->zev_cnt_znode_write++;
534 		break;
535 	case ZEV_OP_ZNODE_TRUNCATE:
536 		stat->zev_cnt_znode_truncate++;
537 		break;
538 	case ZEV_OP_ZNODE_SETATTR:
539 		stat->zev_cnt_znode_setattr++;
540 		break;
541 	case ZEV_OP_ZNODE_ACL:
542 		stat->zev_cnt_znode_acl++;
543 		break;
544 	}
545 }
546 
547 void
548 zev_queue_message(int op, zev_msg_t *msg)
549 {
550 	zev_queue_t *q;
551 	int wakeup = 0;
552 	zev_msg_t *m;
553 	int i;
554 
555 	msg->next = NULL;
556 	msg->prev = NULL;
557 	msg->read = 0;
558 
559 	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
560 		zev_queue_error(op, "unknown op id encountered: %d", op);
561 		zev_free(msg, sizeof(*msg) + msg->size);
562 		return;
563 	}
564 
565 	/*
566 	 * This mutex protects us agains race conditions when several
567 	 * threads want to queue a message and one or more queues are
568 	 * full:  we release zev_mutex to wait for the queues to become
569 	 * less-than-full, but we don't know in which order the waiting
570 	 * threads will be awoken.  If it's not the same order in which
571 	 * they went to sleep we might mark different messages as "newest"
572 	 * in different queues, and so we might have dupes or even
573 	 * skip messages.
574 	 */
575 	mutex_enter(&zev_queue_msg_mutex);
576 
577 	mutex_enter(&zev_mutex);
578 
579 	/*
580 	 * When the module is loaded, the default behavior ist to
581 	 * put all events into a queue and block if the queue is full.
582 	 * This is done even before the pseudo device is attached.
583 	 * This way, no events are lost.
584 	 *
585 	 * To discard events entirely the "beaver" queue,
586 	 * which never discards anything, has to be removed.
587 	 */
588 
589 	if (zev_queue_cnt == 0) {
590 		mutex_exit(&zev_mutex);
591 		mutex_exit(&zev_queue_msg_mutex);
592 		return;
593 	}
594 
595 	/* put message into global queue */
596 	msg->seq = zev_msg_sequence_number++;
597 
598 	/* do we need to make room? */
599 again:
600 	while (zev_statistics.zev_max_queue_len &&
601 	    zev_statistics.zev_queue_len > zev_statistics.zev_max_queue_len) {
602 
603 		if (zev_have_blocking_queues) {
604 			/* so we have blocking queues.  are they full? */
605 			for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
606 				q = zev_queues[i - ZEV_MINOR_MIN];
607 				if (!q)
608 					continue;
609 				if ((q->zq_flags &
610 				     ZEV_FL_BLOCK_WHILE_QUEUE_FULL) == 0)
611 					continue;
612 				if (q->zq_queue_len &&
613 				    q->zq_queue_len > q->zq_max_queue_len) {
614 					/* block until queue's been shrunk. */
615 					cv_wait(&zev_condvar, &zev_mutex);
616 					goto again;
617 				}
618 			}
619 		}
620 
621 		/* discard events until this message fits into all queues */
622 
623 		for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
624 			q = zev_queues[i - ZEV_MINOR_MIN];
625 			if (!q)
626 				continue;
627 			/* discard msgs until queue is small enough */
628 			while (q->zq_queue_len &&
629 			       q->zq_queue_len > q->zq_max_queue_len) {
630 				m = q->zq_oldest;
631 				if (m == NULL)
632 					break;
633 				q->zq_events_discarded++;
634 				q->zq_bytes_discarded += m->size;
635 				q->zq_oldest = m->next;
636 				q->zq_queue_len -= m->size;
637 				q->zq_queue_messages--;
638 			}
639 		}
640 
641 		zev_queue_trim();
642 		ASSERT(zev_statistics.zev_queue_len == 0 ||
643 		       zev_statistics.zev_queue_len <=
644 				zev_statistics.zev_max_queue_len);
645 	}
646 
647 	if (zev_queue_tail == NULL) {
648 		zev_queue_head = zev_queue_tail = msg;
649 	} else {
650 		zev_queue_tail->next = msg;
651 		msg->prev = zev_queue_tail;
652 		zev_queue_tail = msg;
653 	}
654 	zev_queue_len++;
655 	zev_statistics.zev_cnt_total_events++;
656 	zev_statistics.zev_queue_len += msg->size;
657 
658 	/* update per-device queues */
659 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
660 		q = zev_queues[i - ZEV_MINOR_MIN];
661 		if (!q)
662 			continue;
663 
664 		zev_queue_hold(q);
665 
666 		/* make sure queue has enough room */
667 		while (q->zq_max_queue_len &&
668 		       q->zq_queue_len > q->zq_max_queue_len) {
669 
670 			if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
671 				/* block until queue has been shrunk. */
672 				cv_wait(&zev_condvar, &zev_mutex);
673 			} else {
674 				/* discard msgs until queue is small enough */
675 				while (q->zq_queue_len > q->zq_max_queue_len) {
676 					m = q->zq_oldest;
677 					if (m == NULL)
678 						break;
679 					q->zq_events_discarded++;
680 					q->zq_bytes_discarded += m->size;
681 					q->zq_oldest = m->next;
682 					q->zq_queue_len -= m->size;
683 					q->zq_queue_messages--;
684 				}
685 			}
686 		}
687 
688 		/* register new message at the end of the queue */
689 		q->zq_queue_len += msg->size;
690 		q->zq_queue_messages++;
691 		q->zq_bytes_total += msg->size;
692 		q->zq_events_total++;
693 		if (q->zq_oldest == NULL)
694 			q->zq_oldest = msg;
695 
696 		zev_update_statistics(op, &q->zq_statistics);
697 
698 		if (q->zq_queue_len > q->zq_wakeup_threshold)
699 			wakeup = 1;
700 		if (q->zq_queue_len == msg->size)  /* queue was empty */
701 			cv_broadcast(&q->zq_condvar);
702 
703 		zev_queue_release(q);
704 	}
705 
706 	zev_queue_trim();
707 
708 	zev_update_statistics(op, &zev_statistics);
709 	mutex_exit(&zev_mutex);
710 	mutex_exit(&zev_queue_msg_mutex);
711 
712 	/* one or more queues need a pollwakeup() */
713 	if (op == ZEV_OP_MARK) {
714 		zev_poll_wakeup(B_TRUE);
715 	} else if (wakeup) {
716 		zev_poll_wakeup(B_FALSE);
717 	}
718 
719 	return;
720 }
721 
722 void
723 zev_queue_error(int op, char *fmt, ...)
724 {
725 	char buf[ZEV_MAX_MESSAGE_LEN];
726 	va_list ap;
727 	int len;
728 	zev_msg_t *msg = NULL;
729 	zev_error_t *rec;
730 	int msg_size;
731 
732 	va_start(ap, fmt);
733 	len = vsnprintf(buf, sizeof(buf), fmt, ap);
734 	va_end(ap);
735 	if (len >= sizeof(buf)) {
736 		cmn_err(CE_WARN, "zev: can't report error - "
737 		        "dropping event entirely.");
738 		return;
739 	}
740 
741 	msg_size = sizeof(*rec) + len + 1;
742 	msg = zev_alloc(sizeof(*msg) + msg_size);
743 	msg->size = msg_size;
744 	rec = (zev_error_t *)(msg + 1);
745 	rec->record_len = msg_size;
746 	rec->op = ZEV_OP_ERROR;
747 	rec->op_time = ddi_get_time();
748 	rec->guid = 0;
749 	rec->failed_op = op;
750 	rec->errstr_len = len;
751 	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);
752 
753 	zev_queue_message(ZEV_OP_ERROR, msg);
754 	return;
755 }
756 
757 static int
758 zev_find_queue(zev_queue_t **out, zev_queue_t *req_q, zev_queue_name_t *name)
759 {
760 	char namebuf[ZEV_MAX_QUEUE_NAME_LEN+1];
761 	zev_queue_t *q;
762 	int i;
763 
764 	*out = NULL;
765 
766 	if (name->zev_namelen == 0) {
767 		if (req_q->zq_minor_number == ZEV_CONTROL_DEVICE_MINOR)
768 			return EINVAL;
769 		mutex_enter(&zev_mutex);
770 		zev_queue_hold(req_q);
771 		mutex_exit(&zev_mutex);
772 		*out = req_q;
773 		return 0;
774 	}
775 
776 	if (name->zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
777 		return EINVAL;
778 	strncpy(namebuf, name->zev_name, name->zev_namelen);
779 	namebuf[name->zev_namelen] = '\0';
780 
781 	mutex_enter(&zev_mutex);
782 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
783 		q = zev_queues[i - ZEV_MINOR_MIN];
784 		if (!q)
785 			continue;
786 		if (!strcmp(q->zq_name, namebuf)) {
787 			zev_queue_hold(q);
788 			mutex_exit(&zev_mutex);
789 			*out = q;
790 			return 0;
791 		}
792 	}
793 	mutex_exit(&zev_mutex);
794 	return ENOENT;
795 }
796 
797 static int
798 zev_ioc_get_queue_statistics(zev_queue_t *req_q, intptr_t arg, int mode)
799 {
800 	zev_ioctl_get_queue_statistics_t gs;
801 	zev_queue_t *q;
802 	int ret;
803 
804 	if (ddi_copyin((void *)arg, &gs, sizeof(gs), mode) != 0)
805 		return EFAULT;
806 
807 	ret = zev_find_queue(&q, req_q, &gs.zev_queue_name);
808 	if (ret)
809 		return ret;
810 
811 	/* ddi_copyout() can take a long time.  Better make
812 	   a copy to be able to release the mutex faster. */
813 	mutex_enter(&zev_mutex);
814 	memcpy(&gs.zev_statistics, &q->zq_statistics,sizeof(gs.zev_statistics));
815 	gs.zev_statistics.zev_queue_len = q->zq_queue_len;
816 	gs.zev_statistics.zev_bytes_read = q->zq_bytes_read;
817 	gs.zev_statistics.zev_bytes_discarded = q->zq_bytes_discarded;
818 	gs.zev_statistics.zev_max_queue_len = q->zq_max_queue_len;
819 	gs.zev_statistics.zev_cnt_discarded_events = q->zq_events_discarded;
820 	gs.zev_statistics.zev_cnt_total_events = q->zq_events_total;
821 	zev_queue_release(q);
822 	mutex_exit(&zev_mutex);
823 
824 	if (ddi_copyout(&gs, (void *)arg, sizeof(gs), mode) != 0)
825 		return EFAULT;
826 	return 0;
827 }
828 
829 static int
830 zev_ioc_set_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
831 {
832 	zev_ioctl_set_queue_properties_t qp;
833 	zev_queue_t *q;
834 	uint64_t old_max;
835 	uint64_t old_flags;
836 	int ret;
837 
838 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
839 		return EFAULT;
840 	if (qp.zev_max_queue_len > ZEV_MAX_QUEUE_LEN)
841 		return EINVAL;
842 	if (qp.zev_poll_wakeup_threshold > ZEV_MAX_POLL_WAKEUP_QUEUE_LEN)
843 		return EINVAL;
844 
845 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
846 	if (ret)
847 		return ret;
848 
849 	mutex_enter(&zev_mutex);
850 
851 	/*
852 	 * Note: if the PERSISTENT flag is cleared, and the queue is not busy,
853 	 * the queue should be removed by zev_queue_release() in zev_ioctl().
854 	 */
855 	old_flags = qp.zev_flags;
856 	q->zq_flags = qp.zev_flags;
857 	if ((old_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) &&
858 	   (!(qp.zev_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL))) {
859 		/* queue is no longer blocking - wake blocked threads */
860 		cv_broadcast(&zev_condvar);
861 	}
862 
863 	zev_update_blockflag();
864 
865 	old_max = q->zq_max_queue_len;
866 	q->zq_max_queue_len = qp.zev_max_queue_len;
867 	if (q->zq_max_queue_len < old_max)
868 		zev_queue_trim();
869 	if (q->zq_max_queue_len > old_max)
870 		cv_broadcast(&zev_condvar);	/* threads may be waiting */
871 
872 	if ((qp.zev_poll_wakeup_threshold < q->zq_wakeup_threshold) &&
873 	    (qp.zev_poll_wakeup_threshold <= q->zq_queue_len))
874 		pollwakeup(&q->zq_pollhead, POLLIN);
875 	q->zq_wakeup_threshold = qp.zev_poll_wakeup_threshold;
876 
877 	zev_queue_release(q);
878 	mutex_exit(&zev_mutex);
879 	return 0;
880 }
881 
882 static int
883 zev_ioc_get_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
884 {
885 	zev_ioctl_get_queue_properties_t qp;
886 	zev_queue_t *q;
887 	int ret;
888 
889 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
890 		return EFAULT;
891 
892 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
893 	if (ret)
894 		return ret;
895 
896 	mutex_enter(&zev_mutex);
897 	qp.zev_max_queue_len = q->zq_max_queue_len;
898 	qp.zev_flags = q->zq_flags;
899 	qp.zev_poll_wakeup_threshold = q->zq_wakeup_threshold;
900 	zev_queue_release(q);
901 	mutex_exit(&zev_mutex);
902 
903 	if (ddi_copyout(&qp, (void *)arg, sizeof(qp), mode) != 0)
904 		return EFAULT;
905 	return 0;
906 }
907 
908 static int
909 zev_ioc_add_queue(zev_queue_t *req_q, intptr_t arg, int mode)
910 {
911 	zev_ioctl_add_queue_t aq;
912 	zev_queue_t *new_q;
913 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
914 
915 	if (ddi_copyin((void *)arg, &aq, sizeof(aq), mode) != 0)
916 		return EFAULT;
917 
918 	if (aq.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
919 		return EINVAL;
920 	strncpy(name, aq.zev_name, aq.zev_namelen);
921 	name[aq.zev_namelen] = '\0';
922 	if (!strncmp(name, ZEV_TMPQUEUE_DEVICE_NAME,
923 	             strlen(ZEV_TMPQUEUE_DEVICE_NAME)))
924 		return EINVAL;
925 
926 	return zev_queue_new(&new_q, req_q->zq_dip, name,
927 	                     aq.zev_max_queue_len, aq.zev_flags);
928 }
929 
930 static int
931 zev_ioc_remove_queue(zev_queue_t *req_q, intptr_t arg, int mode)
932 {
933 	zev_ioctl_remove_queue_t rq;
934 	zev_queue_t *q;
935 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
936 	int found = 0;
937 	int i;
938 
939 	if (ddi_copyin((void *)arg, &rq, sizeof(rq), mode) != 0)
940 		return EFAULT;
941 
942 	if (rq.zev_queue_name.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
943 		return EINVAL;
944 	strncpy(name, rq.zev_queue_name.zev_name,
945 	        rq.zev_queue_name.zev_namelen);
946 	name[rq.zev_queue_name.zev_namelen] = '\0';
947 
948 	mutex_enter(&zev_mutex);
949 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
950 		q = zev_queues[i - ZEV_MINOR_MIN];
951 		if (!q)
952 			continue;
953 		if (!strcmp(q->zq_name, name)) {
954 			found = 1;
955 			break;
956 		}
957 	}
958 	if (!found) {
959 		mutex_exit(&zev_mutex);
960 		return ENOENT;
961 	}
962 
963 	if (q->zq_busy) {
964 		mutex_exit(&zev_mutex);
965 		return EBUSY;
966 	}
967 	/*
968 	 * clear flags, so that persistent queues are removed aswell
969 	 * and the queue becomes non-blocking.
970 	 */
971 	q->zq_flags = 0;
972 	if (q->zq_to_be_removed == B_FALSE) {
973 		q->zq_to_be_removed = B_TRUE;
974 		zev_queue_release(q);
975 	}
976 	/* some threads might be waiting for this queue to become writable */
977 	cv_broadcast(&zev_condvar);
978 
979 	mutex_exit(&zev_mutex);
980 	return 0;
981 }
982 
983 static int
984 zev_ioc_get_debug_info(zev_queue_t *req_q, intptr_t arg, int mode)
985 {
986 	zev_ioctl_debug_info_t di;
987 	uint64_t mem_allocated = atomic_add_64_nv(&zev_memory_allocated, 0);
988 	uint64_t mem_freed     = atomic_add_64_nv(&zev_memory_freed, 0);
989 
990 	zev_chksum_stats(&di.zev_chksum_cache_size,
991 	                 &di.zev_chksum_cache_hits,
992 	                 &di.zev_chksum_cache_misses);
993 	di.zev_memory_allocated = mem_allocated - mem_freed;
994 	if (ddi_copyout(&di, (void *)arg, sizeof(di), mode) != 0)
995 		return EFAULT;
996 	return 0;
997 }
998 
999 static int
1000 zev_ioc_get_queue_list(zev_queue_t *req_q, intptr_t arg, int mode)
1001 {
1002 	zev_ioctl_get_queue_list_t gql;
1003 	zev_queue_t *q;
1004 	int i = 0;
1005 	int count = 0;
1006 
1007 	memset(&gql, 0, sizeof(gql));
1008 
1009 	mutex_enter(&zev_mutex);
1010 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1011 		q = zev_queues[i - ZEV_MINOR_MIN];
1012 		if (!q)
1013 			continue;
1014 		strncpy(gql.zev_queue_name[count].zev_name,
1015 		    q->zq_name, ZEV_MAX_QUEUE_NAME_LEN);
1016 		gql.zev_queue_name[count].zev_namelen = strlen(q->zq_name);
1017 		count++;
1018 	}
1019 	gql.zev_n_queues = count;
1020 	mutex_exit(&zev_mutex);
1021 
1022 	if (ddi_copyout(&gql, (void *)arg, sizeof(gql), mode) != 0)
1023 		return EFAULT;
1024 	return 0;
1025 }
1026 
1027 static int
1028 zev_ioc_set_max_queue_len(zev_queue_t *req_q, intptr_t arg, int mode)
1029 {
1030 	uint64_t len;
1031 	int i;
1032 	zev_queue_t *q;
1033 
1034 	if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) {
1035 		return EFAULT;
1036 	}
1037 	if (len > ZEV_MAX_QUEUE_LEN) {
1038 		return EINVAL;
1039 	}
1040 	mutex_enter(&zev_mutex);
1041 	zev_statistics.zev_max_queue_len = len;
1042 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1043 		q = zev_queues[i - ZEV_MINOR_MIN];
1044 		if (!q)
1045 			continue;
1046 		if (q->zq_max_queue_len <=
1047 		    zev_statistics.zev_max_queue_len)
1048 			continue;
1049 		q->zq_max_queue_len = zev_statistics.zev_max_queue_len;
1050 	}
1051 	cv_broadcast(&zev_condvar);
1052 	mutex_exit(&zev_mutex);
1053 	return 0;
1054 }
1055 
1056 static int
1057 zev_ioc_get_zev_version(intptr_t arg, int mode)
1058 {
1059 	zev_ioctl_get_zev_version vi;
1060 	vi.zev_major_version = ZEV_MAJOR_VERSION;
1061 	vi.zev_minor_version = ZEV_MINOR_VERSION;
1062 	if (ddi_copyout(&vi, (void *)arg, sizeof(vi), mode) != 0)
1063 		return EFAULT;
1064 	return 0;
1065 }
1066 
1067 /* ARGSUSED */
1068 static int
1069 zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1070 {
1071 	zev_statistics_t zs;
1072 	zev_ioctl_poolarg_t pa;
1073 	zev_ioctl_mark_t mark;
1074 	zev_mark_t *rec;
1075 	int msg_size;
1076 	zev_msg_t *msg;
1077 	uint64_t mark_id;
1078 	minor_t minor;
1079 	zev_queue_t *req_q;
1080 	int ret = 0;
1081 
1082 	minor = getminor(dev);
1083 	mutex_enter(&zev_mutex);
1084 	if ((req_q = ddi_get_soft_state(statep, minor)) == NULL) {
1085 		mutex_exit(&zev_mutex);
1086 		return (ENXIO);
1087 	}
1088 	zev_queue_hold(req_q);
1089 	mutex_exit(&zev_mutex);
1090 	/*
1091 	 * all structures passed between kernel and userspace
1092 	 * are now compatible between 64 and 32 bit.  Model
1093 	 * conversion can be ignored.
1094 	 */
1095 	switch (cmd) {
1096 	case ZEV_IOC_GET_GLOBAL_STATISTICS:
1097 		/* ddi_copyout() can take a long time.  Better make
1098 		   a copy to be able to release the mutex faster. */
1099 		mutex_enter(&zev_mutex);
1100 		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
1101 		mutex_exit(&zev_mutex);
1102 		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
1103 			ret = EFAULT;
1104 		break;
1105 	case ZEV_IOC_GET_QUEUE_STATISTICS:
1106 		ret = zev_ioc_get_queue_statistics(req_q, arg, mode);
1107 		break;
1108 	case ZEV_IOC_MUTE_POOL:
1109 	case ZEV_IOC_UNMUTE_POOL:
1110 		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) {
1111 			ret = EFAULT;
1112 			break;
1113 		}
1114 		if (pa.zev_poolname_len >=MAXPATHLEN) {
1115 			ret = EINVAL;
1116 			break;
1117 		}
1118 		pa.zev_poolname[pa.zev_poolname_len] = '\0';
1119 		if (cmd == ZEV_IOC_MUTE_POOL) {
1120 			ret = zev_ioc_mute_pool(pa.zev_poolname);
1121 		} else {
1122 			ret = zev_ioc_unmute_pool(pa.zev_poolname);
1123 		}
1124 		break;
1125 	case ZEV_IOC_SET_MAX_QUEUE_LEN:
1126 		ret = zev_ioc_set_max_queue_len(req_q, arg, mode);
1127 		break;
1128 	case ZEV_IOC_GET_QUEUE_PROPERTIES:
1129 		ret = zev_ioc_get_queue_properties(req_q, arg, mode);
1130 		break;
1131 	case ZEV_IOC_SET_QUEUE_PROPERTIES:
1132 		ret = zev_ioc_set_queue_properties(req_q, arg, mode);
1133 		break;
1134 	case ZEV_IOC_MARK:
1135 		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) {
1136 			ret = EFAULT;
1137 			break;
1138 		}
1139 		/* prepare message */
1140 		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
1141 		msg = zev_alloc(sizeof(*msg) + msg_size);
1142 		msg->size = msg_size;
1143 		rec = (zev_mark_t *)(msg + 1);
1144 		rec->record_len = msg_size;
1145 		rec->op = ZEV_OP_MARK;
1146 		rec->op_time = ddi_get_time();
1147 		rec->guid = mark.zev_guid;
1148 		rec->payload_len = mark.zev_payload_len;
1149 		/* get payload */
1150 		if (ddi_copyin(((char *)arg) + sizeof(mark),
1151 		               ZEV_PAYLOAD(rec),
1152 		               mark.zev_payload_len, mode) != 0) {
1153 			zev_free(msg, msg_size);
1154 			ret = EFAULT;
1155 			break;
1156 		}
1157 		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
1158 		/* get mark id and queue message */
1159 		mutex_enter(&zev_mark_id_mutex);
1160 		mark_id = zev_mark_id++;
1161 		mutex_exit(&zev_mark_id_mutex);
1162 		rec->mark_id = mark_id;
1163 		zev_queue_message(ZEV_OP_MARK, msg);
1164 		/* report mark id to userland, ignore errors */
1165 		mark.zev_mark_id = mark_id;
1166 		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
1167 		break;
1168 	case ZEV_IOC_ADD_QUEUE:
1169 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1170 			ret = EACCES;
1171 			break;
1172 		}
1173 		ret = zev_ioc_add_queue(req_q, arg, mode);
1174 		break;
1175 	case ZEV_IOC_REMOVE_QUEUE:
1176 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1177 			ret = EACCES;
1178 			break;
1179 		}
1180 		ret = zev_ioc_remove_queue(req_q, arg, mode);
1181 		break;
1182 	case ZEV_IOC_GET_DEBUG_INFO:
1183 		ret = zev_ioc_get_debug_info(req_q, arg, mode);
1184 		break;
1185 	case ZEV_IOC_GET_QUEUE_LIST:
1186 		ret = zev_ioc_get_queue_list(req_q, arg, mode);
1187 		break;
1188 	case ZEV_IOC_GET_FILE_SIGNATURES:
1189 		ret = zev_ioc_get_signatures(arg, mode);
1190 		break;
1191 	case ZEV_IOC_GET_ZEV_VERSION:
1192 		ret = zev_ioc_get_zev_version(arg, mode);
1193 		break;
1194 	default:
1195 		/* generic "ioctl unknown" error */
1196 		ret = ENOTTY;
1197 	}
1198 
1199 	mutex_enter(&zev_mutex);
1200 	zev_queue_release(req_q);
1201 	mutex_exit(&zev_mutex);
1202 	if (ret)
1203 		return(SET_ERROR(ret));
1204 	return (ret);
1205 }
1206 
1207 static int
1208 zev_chpoll(dev_t dev, short events, int anyyet,
1209     short *reventsp, struct pollhead **phpp)
1210 {
1211 	int minor;
1212 	short revent = 0;
1213 	zev_queue_t *q;
1214 
1215 	/* use minor-specific queue context and it's pollhead */
1216 	minor = getminor(dev);
1217 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1218 		return (EINVAL);
1219 	mutex_enter(&zev_mutex);
1220 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1221 		mutex_exit(&zev_mutex);
1222 		return (ENXIO);
1223 	}
1224 	revent = 0;
1225 	if ((events & POLLIN)) {
1226 		if (q->zq_oldest)
1227 			revent |= POLLIN;
1228 	}
1229 	if (revent == 0) {
1230 		if (!anyyet) {
1231 			*phpp = &q->zq_pollhead;
1232 		}
1233 	}
1234 	*reventsp = revent;
1235 	mutex_exit(&zev_mutex);
1236 	return (0);
1237 }
1238 
1239 /* ARGSUSED */
1240 static int
1241 zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
1242 {
1243 	minor_t minor;
1244 	offset_t off;
1245 	int ret = 0;
1246 	zev_msg_t *msg;
1247 	char *data;
1248 	zev_queue_t *q;
1249 
1250 	minor = getminor(dev);
1251 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1252 		return (EINVAL);
1253 
1254 	mutex_enter(&zev_mutex);
1255 	q = ddi_get_soft_state(statep, minor);
1256 	if (q == NULL) {
1257 		mutex_exit(&zev_mutex);
1258 		return (ENXIO);
1259 	}
1260 	off = uio_p->uio_loffset;
1261 	msg = q->zq_oldest;
1262 	while (msg == NULL) {
1263 		if (!ddi_can_receive_sig()) {
1264 			/*
1265 			 * read() shouldn't block because this thread
1266 			 * can't receive signals. (e.g., it might be
1267 			 * torn down by exit() right now.)
1268 			 */
1269 			mutex_exit(&zev_mutex);
1270 			return 0;
1271 		}
1272 		if (cv_wait_sig(&q->zq_condvar, &zev_mutex) == 0) {
1273 			/* signal received. */
1274 			mutex_exit(&zev_mutex);
1275 			return EINTR;
1276 		}
1277 		msg = q->zq_oldest;
1278 	}
1279 	if (msg->size > uio_p->uio_resid) {
1280 		mutex_exit(&zev_mutex);
1281 		return E2BIG;
1282 	}
1283 	while (msg && uio_p->uio_resid >= msg->size) {
1284 		data = (char *)(msg + 1);
1285 		ret = uiomove(data, msg->size, UIO_READ, uio_p);
1286 		if (ret != 0) {
1287 			mutex_exit(&zev_mutex);
1288 			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
1289 			uio_p->uio_loffset = off;
1290 			return (ret);
1291 		}
1292 		q->zq_oldest = msg->next;
1293 		q->zq_bytes_read += msg->size;
1294 		q->zq_queue_len -= msg->size;
1295 		q->zq_queue_messages--;
1296 		msg->read++;
1297 		msg = q->zq_oldest;
1298 	}
1299 	zev_queue_trim();
1300 	cv_broadcast(&zev_condvar);
1301 	mutex_exit(&zev_mutex);
1302 	uio_p->uio_loffset = off;
1303 	return 0;
1304 }
1305 
1306 /* ARGSUSED */
1307 static int
1308 zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
1309 {
1310 	zev_queue_t *q;
1311 	int minor;
1312 
1313 	minor = getminor(dev);
1314 	if (otyp != OTYP_CHR)
1315 		return (EINVAL);
1316 	mutex_enter(&zev_mutex);
1317 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1318 		mutex_exit(&zev_mutex);
1319 		return (ENXIO);
1320 	}
1321 	if (q->zq_busy != B_TRUE) {
1322 		mutex_exit(&zev_mutex);
1323 		return (EINVAL);
1324 	}
1325 	q->zq_busy = B_FALSE;
1326 	if ((q->zq_flags & ZEV_FL_PERSISTENT) == 0)
1327 		zev_queue_release(q);
1328 	mutex_exit(&zev_mutex);
1329 	return (0);
1330 }
1331 
1332 /* ARGSUSED */
1333 static int
1334 zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1335 {
1336 	zev_queue_t *q;
1337 	minor_t minor;
1338 	char zq_name[ZEV_MAX_QUEUE_NAME_LEN];
1339 	int ret;
1340 
1341 	minor = getminor(*devp);
1342 	if (otyp != OTYP_CHR)
1343 		return (EINVAL);
1344 	if (drv_priv(credp) != 0)
1345 		return (EPERM);
1346 	if (minor == ZEV_TMPQUEUE_DEVICE_MINOR) {
1347 		/* get control queue soft state to have dip */
1348 		if ((q = ddi_get_soft_state(statep,
1349 		                            ZEV_CONTROL_DEVICE_MINOR)) == NULL){
1350 			mutex_exit(&zev_mutex);
1351 			return (ENXIO);
1352 		}
1353 
1354 		/* create new temporary queue and return it. */
1355 
1356 		snprintf(zq_name, sizeof(zq_name),
1357 		         ZEV_TMPQUEUE_DEVICE_NAME ".%d", zev_tmpqueue_num++);
1358 
1359 		ret = zev_queue_new(&q, q->zq_dip, zq_name, 0,
1360 		                    ZEV_FL_INITIALLY_EMPTY);
1361 		if (ret) {
1362 			return ret;
1363 		}
1364 
1365 		q->zq_busy = B_TRUE;
1366 		*devp = makedevice(getmajor(*devp), q->zq_minor_number);
1367 		return 0;
1368 	}
1369 	mutex_enter(&zev_mutex);
1370 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1371 		mutex_exit(&zev_mutex);
1372 		return (ENXIO);
1373 	}
1374 	if (minor == ZEV_CONTROL_DEVICE_MINOR) {
1375 		/* control device may be used in parallel */
1376 		q->zq_busy = B_TRUE;
1377 		mutex_exit(&zev_mutex);
1378 		return 0;
1379 	}
1380 	if (q->zq_busy == B_TRUE) {
1381 		mutex_exit(&zev_mutex);
1382 		return (EBUSY);
1383 	}
1384 	q->zq_busy = B_TRUE;	/* can only be opened exclusively */
1385 	mutex_exit(&zev_mutex);
1386 	return (0);
1387 }
1388 
1389 static struct cb_ops zev_cb_ops = {
1390 	zev_open,		/* open */
1391 	zev_close,		/* close */
1392 	nodev,			/* strategy */
1393 	nodev,			/* print */
1394 	nodev,			/* dump */
1395 	zev_read,		/* read */
1396 	nodev,			/* write */
1397 	zev_ioctl,		/* ioctl */
1398 	nodev,			/* devmap */
1399 	nodev,			/* mmap */
1400 	nodev,			/* segmap */
1401 	zev_chpoll,		/* chpoll */
1402 	ddi_prop_op,		/* prop_op */
1403 	NULL,			/* streamtab */
1404 	D_MP | D_64BIT,		/* cb_flag */
1405 	CB_REV,			/* cb_rev */
1406 	nodev,			/* aread */
1407 	nodev,			/* awrite */
1408 };
1409 
1410 static void
1411 zev_free_instance(dev_info_t *dip)
1412 {
1413 	int instance;
1414 	zev_queue_t *q;
1415 	int i;
1416 
1417 	instance = ddi_get_instance(dip);
1418 	if (instance != 0) {
1419 		cmn_err(CE_WARN, "zev: tried to free instance != 0 (%d)",
1420 		        instance);
1421 		return;
1422 	}
1423 
1424 	ddi_remove_minor_node(dip, NULL);
1425 	devfs_clean(ddi_root_node() ? ddi_root_node() : dip,
1426 	            NULL, DV_CLEAN_FORCE);
1427 
1428 	/* stop pollwakeup thread */
1429 	zev_wakeup_thread_run = 0;
1430 	if (zev_poll_wakeup_thread != NULL) {
1431 		thread_join(zev_poll_wakeup_thread->t_did);
1432 		zev_poll_wakeup_thread = NULL;
1433 	}
1434 
1435 	mutex_enter(&zev_mutex);
1436 
1437 	/* remove "ctrl" dummy queue */
1438 	q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1439 	if (q) {
1440 		ddi_soft_state_free(statep, ZEV_CONTROL_DEVICE_MINOR);
1441 		ZEV_MEM_SUB(sizeof(zev_queue_t));
1442 	}
1443 
1444 	/* remove all other queues */
1445 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1446 		q = zev_queues[i- ZEV_MINOR_MIN];
1447 		if (!q)
1448 			continue;
1449 		ASSERT(q->zq_refcnt == 1);
1450 		zev_queue_release(q);
1451 	}
1452 	zev_queue_trim();
1453 	bzero(&zev_queues, sizeof(zev_queues));
1454 
1455 	mutex_exit(&zev_mutex);
1456 
1457 }
1458 
1459 static int
1460 zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1461 {
1462 	int instance;
1463 	zev_queue_t *q;
1464 
1465 	/* called once per instance with DDI_DETACH,
1466 	   may be called to suspend */
1467 	switch (cmd) {
1468 	case DDI_DETACH:
1469 		/* instance busy? */
1470 		instance = ddi_get_instance(dip);
1471 		if (instance != 0) {	/* hardcoded in zev.conf */
1472 			/* this module only supports one instance. */
1473 			return (DDI_FAILURE);
1474 		}
1475 
1476 		mutex_enter(&zev_mutex);
1477 		if (!zev_attached) {
1478 			mutex_exit(&zev_mutex);
1479 			return (DDI_FAILURE);
1480 		}
1481 
1482 		/* check "ctrl" queue to see if t is busy */
1483 		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1484 		if (q == NULL) {
1485 			mutex_exit(&zev_mutex);
1486 			return (DDI_FAILURE);
1487 		}
1488 		if (q->zq_busy) {
1489 			mutex_exit(&zev_mutex);
1490 			return (DDI_FAILURE);
1491 		}
1492 		/* are there any queues? */
1493 		if (zev_queue_cnt > 0) {
1494 			mutex_exit(&zev_mutex);
1495 			return (DDI_FAILURE);
1496 		}
1497 
1498 		zev_attached = B_FALSE;
1499 		mutex_exit(&zev_mutex);
1500 
1501 		/* switch ZFS event callbacks back to default */
1502 		rw_enter(&rz_zev_rwlock, RW_WRITER);
1503 		rz_zev_callbacks = rz_zev_default_callbacks;
1504 		rz_zev_set_active(B_FALSE);
1505 		rw_exit(&rz_zev_rwlock);
1506 
1507 		/* no thread is inside of the callbacks anymore. */
1508 
1509 		/* free resources allocated for this instance */
1510 		zev_free_instance(dip);
1511 		zev_chksum_fini();
1512 #if 0
1513 		cmn_err(CE_WARN, "zev: allocated memory at detach: %" PRIu64,
1514 			zev_memory_allocated - zev_memory_freed);
1515 #endif
1516 		return (DDI_SUCCESS);
1517 	case DDI_SUSPEND:
1518 		/* kernel must not suspend zev devices while ZFS is running */
1519 		return (DDI_FAILURE);
1520 	default:
1521 		return (DDI_FAILURE);
1522 	}
1523 }
1524 
1525 static int
1526 zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1527 {
1528 	/* called once per instance with DDI_ATTACH,
1529 	   may be called to resume */
1530 	int instance;
1531 	int error;
1532 	zev_queue_t *q;
1533 	switch (cmd) {
1534 	case DDI_ATTACH:
1535 		/* create instance state */
1536 		instance = ddi_get_instance(dip);
1537 		if (instance != 0) {	/* hardcoded in zev.conf */
1538 			/* this module only supports one instance. */
1539 			return (DDI_FAILURE);
1540 		}
1541 
1542 		mutex_enter(&zev_mutex);
1543 		if (zev_attached) {
1544 			mutex_exit(&zev_mutex);
1545 			return (DDI_FAILURE);
1546 		}
1547 		if (ddi_soft_state_zalloc(statep, ZEV_CONTROL_DEVICE_MINOR) !=
1548 		    DDI_SUCCESS) {
1549 			mutex_exit(&zev_mutex);
1550 			return (DDI_FAILURE);
1551 		}
1552 		ZEV_MEM_ADD(sizeof(zev_queue_t));
1553 		zev_attached = B_TRUE;
1554 
1555 		/* init queue list */
1556 		bzero(&zev_queues, sizeof(zev_queues));
1557 		mutex_exit(&zev_mutex);
1558 
1559 		/* create a dummy queue for management of "ctrl" */
1560 
1561 		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1562 		q->zq_dip = dip;
1563 		q->zq_refcnt = 1;
1564 		q->zq_busy = B_FALSE;
1565 		q->zq_minor_number = ZEV_CONTROL_DEVICE_MINOR;
1566 		q->zq_flags = ZEV_FL_PERSISTENT;
1567 		strcpy(q->zq_name, ZEV_CONTROL_DEVICE_NAME);
1568 
1569 		/* create device node for "ctrl" */
1570 		if (ddi_create_minor_node(dip, ZEV_CONTROL_DEVICE_NAME,
1571 		    S_IFCHR, ZEV_CONTROL_DEVICE_MINOR,
1572 		    DDI_PSEUDO, 0) == DDI_FAILURE) {
1573 			goto fail;
1574 		}
1575 
1576 		/* note: intentionally not adding ctrl queue to queue list. */
1577 
1578 		/* create device node for "tmpqueue" */
1579 		if (ddi_create_minor_node(dip, ZEV_TMPQUEUE_DEVICE_NAME,
1580 		    S_IFCHR, ZEV_TMPQUEUE_DEVICE_MINOR,
1581 		    DDI_PSEUDO, 0) == DDI_FAILURE) {
1582 			goto fail;
1583 		}
1584 
1585 		/* default queue */
1586 		error = zev_queue_new(&q, dip,
1587 				      ZEV_DEFAULT_QUEUE_NAME,
1588 				      ZEV_MAX_QUEUE_LEN,
1589 				      ZEV_FL_BLOCK_WHILE_QUEUE_FULL|
1590 		                      ZEV_FL_PERSISTENT);
1591 		if (error)
1592 			goto fail;
1593 
1594 		/* start pollwakeup thread */
1595 		zev_wakeup_thread_run = 1;
1596 		zev_poll_wakeup_thread = thread_create(NULL, 0,
1597 		    zev_poll_wakeup_thread_main, NULL, 0, &p0,
1598 		    TS_RUN, minclsyspri);
1599 
1600 		ddi_report_dev(dip);
1601 
1602 		zev_chksum_init();
1603 
1604 		/* switch ZFS event callbacks to zev module callbacks */
1605 		rw_enter(&rz_zev_rwlock, RW_WRITER);
1606 		rz_zev_callbacks = &zev_callbacks;
1607 		rz_zev_set_active(B_TRUE);
1608 		rw_exit(&rz_zev_rwlock);
1609 
1610 		return (DDI_SUCCESS);
1611 	case DDI_RESUME:
1612 		/* suspendeding zev devices should never happen */
1613 		return (DDI_SUCCESS);
1614 	default:
1615 		return (DDI_FAILURE);
1616 	}
1617 fail:
1618 	cmn_err(CE_WARN, "zev: attach failed");
1619 	zev_free_instance(dip);
1620 	mutex_enter(&zev_mutex);
1621 	zev_attached = B_FALSE;
1622 	mutex_exit(&zev_mutex);
1623 	return (DDI_FAILURE);
1624 }
1625 
1626 /* ARGSUSED */
1627 static int
1628 zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
1629 {
1630 	minor_t minor;
1631 	zev_queue_t *q;
1632 
1633 	/* arg is dev_t */
1634 	minor = getminor((dev_t)arg);
1635 	mutex_enter(&zev_mutex);
1636 	q = ddi_get_soft_state(statep, minor);
1637 	if (q == NULL) {
1638 		*resultp = NULL;
1639 		mutex_exit(&zev_mutex);
1640 		return (DDI_FAILURE);
1641 	}
1642 
1643 	switch (infocmd) {
1644 	case DDI_INFO_DEVT2DEVINFO:
1645 		*resultp = q->zq_dip;
1646 		break;
1647 	case DDI_INFO_DEVT2INSTANCE:
1648 		*resultp = (void *)(uintptr_t)ddi_get_instance(q->zq_dip);
1649 		break;
1650 	default:
1651 		mutex_exit(&zev_mutex);
1652 		return (DDI_FAILURE);
1653 	}
1654 	mutex_exit(&zev_mutex);
1655 	return (DDI_SUCCESS);
1656 }
1657 
1658 static struct dev_ops zev_dev_ops = {
1659 	DEVO_REV,			/* driver build revision */
1660 	0,				/* driver reference count */
1661 	zev_getinfo,			/* getinfo */
1662 	nulldev,			/* identify (obsolete) */
1663 	nulldev,			/* probe (search for devices) */
1664 	zev_attach,			/* attach */
1665 	zev_detach,			/* detach */
1666 	nodev,				/* reset (obsolete, use quiesce) */
1667 	&zev_cb_ops,			/* character and block device ops */
1668 	NULL,				/* bus driver ops */
1669 	NULL,				/* power management, not needed */
1670 	ddi_quiesce_not_needed,		/* quiesce */
1671 };
1672 
1673 static struct modldrv zev_modldrv = {
1674 	&mod_driverops,			/* all loadable modules use this */
1675 	"ZFS event provider, v"
1676 		XSTRING(ZEV_MAJOR_VERSION) "."
1677 		XSTRING(ZEV_MINOR_VERSION),
1678 					/* driver name and version info */
1679 	&zev_dev_ops			/* ops method pointers */
1680 };
1681 
1682 static struct modlinkage zev_modlinkage = {
1683 	MODREV_1,	/* fixed value */
1684 	{
1685 		&zev_modldrv,	/* driver linkage structure */
1686 		NULL		/* list terminator */
1687 	}
1688 };
1689 
1690 int
1691 _init(void)
1692 {
1693 	int error;
1694 
1695 	if ((error = ddi_soft_state_init(&statep, sizeof(zev_queue_t), 1)) != 0)
1696 		return (error);
1697 	zev_attached = B_FALSE;
1698 
1699 	zev_queue_head = NULL;
1700 	zev_queue_tail = NULL;
1701 	zev_queue_len = 0;
1702 	zev_muted_pools_head = NULL;
1703 	zev_memory_allocated = 0;
1704 	zev_memory_freed = 0;
1705 	zev_queue_cnt = 0;
1706 	zev_have_blocking_queues = 1;
1707 
1708 	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
1709 	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
1710 	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
1711 	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
1712 	zev_mark_id = gethrtime();
1713 	mutex_init(&zev_queue_msg_mutex, NULL, MUTEX_DRIVER, NULL);
1714 	zev_msg_sequence_number = gethrtime();
1715 	bzero(&zev_statistics, sizeof(zev_statistics));
1716 	bzero(&zev_pollhead, sizeof(zev_pollhead));
1717 	bzero(&zev_queues, sizeof(zev_queues));
1718 	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
1719 	if (zev_ioc_mute_pool("zg0")) {
1720 		cmn_err(CE_WARN, "zev: could not init mute list");
1721 		goto FAIL;
1722 	}
1723 
1724 	if ((error = mod_install(&zev_modlinkage)) != 0) {
1725 		cmn_err(CE_WARN, "zev: could not install module");
1726 		goto FAIL;
1727 	}
1728 
1729 	return (0);
1730 FAIL:
1731 	/* free resources */
1732 	cmn_err(CE_WARN, "zev: _init failed");
1733 	mutex_destroy(&zev_mutex);
1734 	ddi_soft_state_fini(&statep);
1735 	return (error);
1736 }
1737 
1738 int
1739 _info(struct modinfo *modinfop)
1740 {
1741 	return (mod_info(&zev_modlinkage, modinfop));
1742 }
1743 
1744 int
1745 _fini(void)
1746 {
1747 	int error = 0;
1748 	zev_msg_t *msg;
1749 	zev_pool_list_entry_t *pe, *npe;
1750 
1751 	mutex_enter(&zev_mutex);
1752 	if (zev_attached == B_TRUE) {
1753 		mutex_exit(&zev_mutex);
1754 		return (SET_ERROR(EBUSY));
1755 	}
1756 	if (zev_queue_cnt != 0) {
1757 		/* should never happen */
1758 		mutex_exit(&zev_mutex);
1759 		return (SET_ERROR(EBUSY));
1760 	}
1761 
1762 	/*
1763 	 * avoid deadlock if event list is full: make sure threads currently
1764 	 * blocking on the event list can append their event and then release
1765 	 * rz_zev_rwlock.  Since there should be no queues left when we
1766 	 * reach this point we can simply empty the event list and then
1767 	 * wake everybody.
1768 	 */
1769 	while (zev_queue_head) {
1770 		msg = zev_queue_head;
1771 		zev_queue_head = msg->next;
1772 		zev_free(msg, sizeof(*msg) + msg->size);
1773 	}
1774 	cv_broadcast(&zev_condvar);
1775 	mutex_exit(&zev_mutex);
1776 
1777 	/* switch ZFS event callbacks back to default (again) */
1778 	rw_enter(&rz_zev_rwlock, RW_WRITER);
1779 	rz_zev_callbacks = rz_zev_default_callbacks;
1780 	rz_zev_set_active(B_FALSE);
1781 	rw_exit(&rz_zev_rwlock);
1782 
1783 	/* no thread is inside of the callbacks anymore.  Safe to remove. */
1784 
1785 	/* unload module callbacks */
1786 	if ((error = mod_remove(&zev_modlinkage)) != 0) {
1787 		cmn_err(CE_WARN, "mod_remove failed: %d", error);
1788 		return (error);
1789 	}
1790 
1791 	/* free resources */
1792 	mutex_enter(&zev_mutex);
1793 	while (zev_queue_head) {
1794 		msg = zev_queue_head;
1795 		zev_queue_head = msg->next;
1796 		zev_free(msg, sizeof(*msg) + msg->size);
1797 	}
1798 	mutex_exit(&zev_mutex);
1799 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
1800 	pe = zev_muted_pools_head;
1801 	while (pe) {
1802 		npe = pe;
1803 		pe = pe->next;
1804 		zev_free(npe, sizeof(*npe));
1805 	}
1806 	rw_exit(&zev_pool_list_rwlock);
1807 	ddi_soft_state_fini(&statep);
1808 	rw_destroy(&zev_pool_list_rwlock);
1809 	cv_destroy(&zev_condvar);
1810 	mutex_destroy(&zev_mutex);
1811 	mutex_destroy(&zev_mark_id_mutex);
1812 	mutex_destroy(&zev_queue_msg_mutex);
1813 
1814 	return (0);
1815 }
1816 
1817