xref: /titanic_51/usr/src/uts/common/fs/zev/zev.c (revision 16a2f0006cd030a3483375ec61eda16d45271453)
1 #include <sys/modctl.h>
2 #include <sys/ddi.h>
3 #include <sys/sunddi.h>
4 #include <sys/conf.h>
5 #include <sys/devops.h>
6 #include <sys/stat.h>
7 #include <sys/fs/zev.h>
8 #include <sys/zev_callbacks.h>
9 #include <sys/zev_checksums.h>
10 #include <sys/zfs_znode.h>
11 #include <sys/time.h>
12 #include <sys/sa.h>
13 #include <sys/zap.h>
14 #include <sys/time.h>
15 
/*
 * Byte offset of member m inside struct type s, taken from the address the
 * member would have in an instance located at address zero.
 */
#define	OFFSETOF(s, m)		((size_t)&((s *)0)->m)
17 
/* Name of the default queue. */
#define ZEV_DEFAULT_QUEUE_NAME		"beaver"

/*
 * Minor number layout: the control device uses minor 0 and the queue
 * devices use the ZEV_MAX_QUEUES minors that follow it.
 */
#define ZEV_CONTROL_DEVICE_MINOR	0
#define ZEV_MINOR_MIN			(ZEV_CONTROL_DEVICE_MINOR + 1)
#define ZEV_MINOR_MAX			(ZEV_MINOR_MIN + ZEV_MAX_QUEUES - 1)
22 
23 typedef struct zev_queue {
24 	char			zq_name[ZEV_MAX_QUEUE_NAME_LEN+1];
25 	minor_t			zq_minor_number;
26 	dev_info_t		*zq_dip;
27 	struct pollhead		zq_pollhead;
28 	uint64_t		zq_bytes_read;
29 	uint64_t		zq_events_read;
30 	uint64_t		zq_bytes_discarded;
31 	uint64_t		zq_events_discarded;
32 	uint64_t		zq_bytes_total;
33 	uint64_t		zq_events_total;
34 	uint64_t		zq_wakeup_threshold;
35 	uint16_t		zq_flags;
36 	uint16_t		zq_need_wakeup;
37 	/* protected by zev_mutex */
38 	int			zq_refcnt;
39 	uint64_t		zq_queue_len;
40 	uint64_t		zq_queue_messages;
41 	uint64_t		zq_max_queue_len;
42 	zev_msg_t		*zq_oldest;
43 	boolean_t		zq_busy;
44 	boolean_t		zq_to_be_removed;
45 	zev_statistics_t	zq_statistics;
46 	kcondvar_t		zq_condvar;
47 } zev_queue_t;
48 
49 static void		*statep;
50 struct pollhead		zev_pollhead;
51 
52 kmutex_t		zev_mutex;
53 kcondvar_t		zev_condvar;
54 kmutex_t		zev_queue_msg_mutex;
55 krwlock_t		zev_pool_list_rwlock;
56 static zev_statistics_t	zev_statistics;
57 static boolean_t	zev_attached;
58 static kmutex_t		zev_mark_id_mutex;
59 static uint64_t		zev_mark_id = 0;
60 
61 static uint64_t		zev_msg_sequence_number = 0;
62 static zev_queue_t	*zev_queues[ZEV_MAX_QUEUES];
63 static int		zev_queue_cnt = 0;
64 static int		zev_have_blocking_queues = 1;
65 
66 uint64_t	zev_memory_allocated = 0;
67 uint64_t	zev_memory_freed = 0;
68 
/*
 * Upper bound for the length of a single message.
 *
 * The longest messages to expect are the one from zev_zfs_mount(), which
 * carries a mountpoint that may be close to MAXPATHLEN bytes long, and
 * the one from zev_znode_rename_cb(), which carries three inode numbers
 * and two filenames of up to MAXNAMELEN bytes each.
 */
#define ZEV_MAX_MESSAGE_LEN	4096
77 
78 static zev_msg_t *zev_queue_head = NULL;
79 static zev_msg_t *zev_queue_tail = NULL;
80 static uint64_t zev_queue_len = 0;
81 
82 
83 typedef struct zev_pool_list_entry {
84 	struct zev_pool_list_entry	*next;
85 	char				name[MAXPATHLEN];
86 } zev_pool_list_entry_t;
87 
88 static zev_pool_list_entry_t *zev_muted_pools_head = NULL;
89 
90 static volatile int zev_wakeup_thread_run = 1;
91 static kthread_t *zev_poll_wakeup_thread = NULL;
92 
93 void *
94 zev_alloc(ssize_t sz)
95 {
96 	ZEV_MEM_ADD(sz);
97 	return kmem_alloc(sz, KM_SLEEP);
98 }
99 
100 void *
101 zev_zalloc(ssize_t sz)
102 {
103 	ZEV_MEM_ADD(sz);
104 	return kmem_zalloc(sz, KM_SLEEP);
105 }
106 
107 void
108 zev_free(void *ptr, ssize_t sz)
109 {
110 	ZEV_MEM_SUB(sz);						\
111 	kmem_free(ptr, sz);
112 }
113 
114 /* must be called with zev_mutex held */
115 static void
116 zev_update_blockflag(void)
117 {
118 	zev_queue_t *q;
119 	int had_blocking_queues;
120 	int i;
121 
122 	had_blocking_queues = zev_have_blocking_queues;
123 
124 	/* do we still have blocking queues? */
125 	zev_have_blocking_queues = 0;
126 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
127 		q = zev_queues[i - ZEV_MINOR_MIN];
128 		if (!q)
129 			continue;
130 		if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
131 			zev_have_blocking_queues = 1;
132 			break;
133 		}
134 	}
135 	/* no blocking queues */
136 	if (had_blocking_queues)
137 		cv_broadcast(&zev_condvar);
138 }
139 
140 int
141 zev_queue_cmp(const void *a, const void *b)
142 {
143 	const zev_queue_t *qa = a;
144 	const zev_queue_t *qb = b;
145 	if (qa->zq_minor_number > qb->zq_minor_number)
146 		return 1;
147 	if (qa->zq_minor_number < qb->zq_minor_number)
148 		return -1;
149 	return 0;
150 }
151 
152 /* must be called with zev_mutex held */
153 void
154 zev_queue_trim(void)
155 {
156 	zev_msg_t *m;
157 	uint64_t oldest_message;
158 	zev_queue_t *q;
159 	int i;
160 
161 	if (!zev_queue_tail)
162 		return;
163 
164 	oldest_message = zev_queue_tail->seq + 1;  /* does not exist, yet. */
165 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
166 		q = zev_queues[i - ZEV_MINOR_MIN];
167 		if (q == NULL)
168 			continue;
169 		if (!q->zq_oldest)
170 			continue;
171 		if (oldest_message > q->zq_oldest->seq)
172 			oldest_message = q->zq_oldest->seq;
173 	}
174 
175 	/* remove msgs between oldest_message and zev_queue_head */
176 	while(zev_queue_head && (oldest_message > zev_queue_head->seq)) {
177 		m = zev_queue_head;
178 		zev_queue_head = m->next;
179 		if (zev_queue_head == NULL) {
180 			zev_queue_tail = NULL;
181 		} else {
182 			zev_queue_head->prev = NULL;
183 		}
184 		if (m->read == 0) {
185 			zev_statistics.zev_bytes_discarded += m->size;
186 			zev_statistics.zev_cnt_discarded_events++;
187 		}
188 		zev_statistics.zev_queue_len -= m->size;
189 		zev_queue_len--;
190 		zev_free(m, sizeof(*m) + m->size);
191 	}
192 }
193 
194 /* must be called with zev_mutex held */
195 static void
196 zev_queue_hold(zev_queue_t *q)
197 {
198 	q->zq_refcnt++;
199 }
200 
201 /* must be called with zev_mutex held */
202 static void
203 zev_queue_release(zev_queue_t *q)
204 {
205 	q->zq_refcnt--;
206 	if (q->zq_refcnt > 0)
207 		return;
208 
209 	ASSERT(q->zq_busy == B_FALSE);
210 
211 	/* persistent queues will not be removed */
212 	if ((q->zq_flags & ZEV_FL_PERSISTENT) != 0)
213 		return;
214 
215 	/* remove queue from queue list */
216 	zev_queues[q->zq_minor_number - ZEV_MINOR_MIN] = NULL;
217 
218 	/* discard messages that no queue references anymore */
219 	zev_queue_trim();
220 
221 	cv_destroy(&q->zq_condvar);
222 	ddi_remove_minor_node(q->zq_dip, q->zq_name);
223 	ddi_soft_state_free(statep, q->zq_minor_number);
224 	ZEV_MEM_SUB(sizeof(zev_queue_t));
225 	zev_queue_cnt--;
226 	zev_update_blockflag();
227 }
228 
229 int
230 zev_queue_new(zev_queue_t **queue,
231               dev_info_t *dip,
232               char *name,
233               uint64_t max_queue_len,
234               uint16_t flags)
235 {
236 	zev_queue_t *q;
237 	zev_queue_t *tmp;
238 	zev_msg_t *msg;
239 	int name_exists = 0;
240 	minor_t minor;
241 	char *p;
242 	int i;
243 
244 	if (max_queue_len > ZEV_MAX_QUEUE_LEN)
245 		return EINVAL;
246 	if (max_queue_len == 0)
247 		max_queue_len = ZEV_MAX_QUEUE_LEN;
248 	if (!strcmp(name, ZEV_CONTROL_DEVICE_NAME))
249 		return EINVAL;
250 	for (p = name; *p; p++) {
251 		if (*p >= 'a' && *p <= 'z')
252 			continue;
253 		if (*p >= '0' && *p <= '9')
254 			continue;
255 		if (*p == '.')
256 			continue;
257 		return EINVAL;
258 	}
259 
260 	mutex_enter(&zev_mutex);
261 
262 	/* find free minor number.*/
263 	/* if this were a frequent operation we'd have a free-minor list */
264 	for (minor = ZEV_MINOR_MIN; minor <= ZEV_MINOR_MAX; minor++) {
265 		tmp = zev_queues[minor - ZEV_MINOR_MIN];
266 		if (tmp == NULL)
267 			break;
268 	}
269 	if (tmp) {
270 		mutex_exit(&zev_mutex);
271 		return ENOSPC;
272 	}
273 
274 	if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) {
275 		mutex_exit(&zev_mutex);
276 		return ENOSPC;
277 	}
278 	ZEV_MEM_ADD(sizeof(zev_queue_t));
279 
280 	q = ddi_get_soft_state(statep, minor);
281 	memset(q, 0, sizeof(*q));
282 	strncpy(q->zq_name, name, ZEV_MAX_QUEUE_NAME_LEN);
283 	q->zq_name[ZEV_MAX_QUEUE_NAME_LEN] = '\0';
284 	q->zq_max_queue_len = max_queue_len;
285 	q->zq_wakeup_threshold = ZEV_DEFAULT_POLL_WAKEUP_QUEUE_LEN;
286 	q->zq_flags = flags;
287 	q->zq_refcnt = 1;
288 	q->zq_dip = dip;
289 	q->zq_minor_number = minor;
290 	cv_init(&q->zq_condvar, NULL, CV_DRIVER, NULL);
291 
292 	/* insert into queue list */
293 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
294 		/* if this were a frequent operation we'd have a name tree */
295 		if (zev_queues[i - ZEV_MINOR_MIN] == NULL)
296 			continue;
297 		if (!strcmp(q->zq_name, zev_queues[i-ZEV_MINOR_MIN]->zq_name)) {
298 			name_exists = 1;
299 			break;
300 		}
301 	}
302 	if (name_exists) {
303 		ddi_soft_state_free(statep, minor);
304 		ZEV_MEM_SUB(sizeof(zev_queue_t));
305 		mutex_exit(&zev_mutex);
306 		return EEXIST;
307 	}
308 	zev_queues[minor - ZEV_MINOR_MIN] = q;
309 	zev_queue_cnt++;
310 
311 	/* calculate current queue len and find head and tail */
312 	if (!(q->zq_flags & ZEV_FL_INITIALLY_EMPTY)) {
313 		q->zq_oldest = zev_queue_tail;
314 		msg = zev_queue_tail;
315 		while ((msg) && (q->zq_queue_len < q->zq_max_queue_len)) {
316 			q->zq_queue_len += msg->size;
317 			q->zq_queue_messages++;
318 			q->zq_oldest = msg;
319 			msg = msg->prev;
320 		}
321 	}
322 
323 	mutex_exit(&zev_mutex);
324 
325 	if (ddi_create_minor_node(dip, name,
326 	    S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
327 		mutex_enter(&zev_mutex);
328 		zev_queues[minor - ZEV_MINOR_MIN] = NULL;
329 		zev_queue_cnt--;
330 		ddi_soft_state_free(statep, minor);
331 		ZEV_MEM_SUB(sizeof(zev_queue_t));
332 		mutex_exit(&zev_mutex);
333 		return EFAULT;
334 	}
335 
336 	zev_update_blockflag();
337 
338 	*queue = q;
339 	return 0;
340 }
341 
342 /*
343  * poll() wakeup thread.  Used to check periodically whether we have
344  * bytes left in the queue that have not yet been made into a
345  * pollwakeup() call.  This is meant to insure a maximum waiting
346  * time until an event is presented as a poll wakeup, while at
347  * the same time not making every single event into a poll wakeup
348  * of it's own.
349  */
350 
351 static void
352 zev_poll_wakeup(boolean_t flush_all)
353 {
354 	zev_queue_t *q;
355 	int i;
356 
357 	/*
358 	 * This loop works with hold() and release() because
359 	 * pollwakeup() requires us to release our locks before calling it.
360 	 *
361 	 * from pollwakeup(9F):
362 	 *
363 	 *   "Driver defined locks should not be held across calls
364 	 *    to this function."
365 	 */
366 
367 	/* wake up threads for each individual queue */
368 	mutex_enter(&zev_mutex);
369 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
370 		q = zev_queues[i - ZEV_MINOR_MIN];
371 		if (q == NULL)
372 			continue;
373 		if (!q->zq_busy)
374 			continue;
375 		if (!q->zq_queue_len)
376 			continue;
377 		if ((flush_all) ||
378 		    (q->zq_queue_len > q->zq_wakeup_threshold)) {
379 			zev_queue_hold(q);
380 			mutex_exit(&zev_mutex);
381 			pollwakeup(&q->zq_pollhead, POLLIN);
382 			mutex_enter(&zev_mutex);
383 			zev_queue_release(q);
384 		}
385 	}
386 	mutex_exit(&zev_mutex);
387 }
388 
389 static void
390 zev_poll_wakeup_thread_main(void)
391 {
392 	while (zev_wakeup_thread_run) {
393 		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
394 
395 		zev_poll_wakeup(B_TRUE);
396 	}
397 	thread_exit();
398 }
399 
400 static int
401 zev_ioc_mute_pool(char *poolname)
402 {
403 	zev_pool_list_entry_t *pe;
404 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
405 	/* pool already muted? */
406 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
407 		if (!strcmp(pe->name, poolname)) {
408 			rw_exit(&zev_pool_list_rwlock);
409 			return EEXIST;
410 		}
411 	}
412 	pe = zev_zalloc(sizeof(*pe));
413 	if (!pe) {
414 		rw_exit(&zev_pool_list_rwlock);
415 		return ENOMEM;
416 	}
417 	(void) strncpy(pe->name, poolname, sizeof(pe->name));
418 	pe->next = zev_muted_pools_head;
419 	zev_muted_pools_head = pe;
420 	rw_exit(&zev_pool_list_rwlock);
421 	return (0);
422 }
423 
424 static int
425 zev_ioc_unmute_pool(char *poolname)
426 {
427 	zev_pool_list_entry_t *pe, *peprev;
428 
429 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
430 	/* pool muted? */
431 	peprev = NULL;
432 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
433 		if (!strcmp(pe->name, poolname))
434 			break;
435 		peprev = pe;
436 	}
437 	if (pe) {
438 		rw_exit(&zev_pool_list_rwlock);
439 		return ENOENT;
440 	}
441 
442 	if (peprev != NULL) {
443 		peprev->next = pe->next;
444 	} else {
445 		zev_muted_pools_head = pe->next;
446 	}
447 	zev_free(pe, sizeof(*pe));
448 	rw_exit(&zev_pool_list_rwlock);
449 	return (0);
450 }
451 
452 int
453 zev_skip_pool(objset_t *os)
454 {
455 	zev_pool_list_entry_t *pe;
456 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
457 	rw_enter(&zev_pool_list_rwlock, RW_READER);
458 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
459 		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
460 			rw_exit(&zev_pool_list_rwlock);
461 			return 1;
462 		}
463 	}
464 	rw_exit(&zev_pool_list_rwlock);
465 	return 0;
466 }
467 
468 static void
469 zev_update_statistics(int op, zev_statistics_t *stat)
470 {
471 	switch (op) {
472 	case ZEV_OP_ERROR:
473 		stat->zev_cnt_errors++;
474 		break;
475 	case ZEV_OP_MARK:
476 		stat->zev_cnt_marks++;
477 		break;
478 	case ZEV_OP_ZFS_MOUNT:
479 		stat->zev_cnt_zfs_mount++;
480 		break;
481 	case ZEV_OP_ZFS_UMOUNT:
482 		stat->zev_cnt_zfs_umount++;
483 		break;
484 	case ZEV_OP_ZVOL_WRITE:
485 		stat->zev_cnt_zvol_write++;
486 		break;
487 	case ZEV_OP_ZVOL_TRUNCATE:
488 		stat->zev_cnt_zvol_truncate++;
489 		break;
490 	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
491 		stat->zev_cnt_znode_close_after_update++;
492 		break;
493 	case ZEV_OP_ZNODE_CREATE:
494 		stat->zev_cnt_znode_create++;
495 		break;
496 	case ZEV_OP_ZNODE_REMOVE:
497 		stat->zev_cnt_znode_remove++;
498 		break;
499 	case ZEV_OP_ZNODE_LINK:
500 		stat->zev_cnt_znode_link++;
501 		break;
502 	case ZEV_OP_ZNODE_SYMLINK:
503 		stat->zev_cnt_znode_symlink++;
504 		break;
505 	case ZEV_OP_ZNODE_RENAME:
506 		stat->zev_cnt_znode_rename++;
507 		break;
508 	case ZEV_OP_ZNODE_WRITE:
509 		stat->zev_cnt_znode_write++;
510 		break;
511 	case ZEV_OP_ZNODE_TRUNCATE:
512 		stat->zev_cnt_znode_truncate++;
513 		break;
514 	case ZEV_OP_ZNODE_SETATTR:
515 		stat->zev_cnt_znode_setattr++;
516 		break;
517 	case ZEV_OP_ZNODE_ACL:
518 		stat->zev_cnt_znode_acl++;
519 		break;
520 	}
521 }
522 
523 void
524 zev_queue_message(int op, zev_msg_t *msg)
525 {
526 	zev_queue_t *q;
527 	int wakeup = 0;
528 	zev_msg_t *m;
529 	int i;
530 
531 	msg->next = NULL;
532 	msg->prev = NULL;
533 	msg->read = 0;
534 
535 	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
536 		zev_queue_error(op, "unknown op id encountered: %d", op);
537 		zev_free(msg, sizeof(*msg) + msg->size);
538 		return;
539 	}
540 
541 	/*
542 	 * This mutex protects us agains race conditions when several
543 	 * threads want to queue a message and one or more queues are
544 	 * full:  we release zev_mutex to wait for the queues to become
545 	 * less-than-full, but we don't know in which order the waiting
546 	 * threads will be awoken.  If it's not the same order in which
547 	 * they went to sleep we might mark different messages as "newest"
548 	 * in different queues, and so we might have dupes or even
549 	 * skip messages.
550 	 */
551 	mutex_enter(&zev_queue_msg_mutex);
552 
553 	mutex_enter(&zev_mutex);
554 
555 	/*
556 	 * When the module is loaded, the default behavior ist to
557 	 * put all events into a queue and block if the queue is full.
558 	 * This is done even before the pseudo device is attached.
559 	 * This way, no events are lost.
560 	 *
561 	 * To discard events entirely the "beaver" queue,
562 	 * which never discards anything, has to be removed.
563 	 */
564 
565 	if (zev_queue_cnt == 0) {
566 		mutex_exit(&zev_mutex);
567 		mutex_exit(&zev_queue_msg_mutex);
568 		return;
569 	}
570 
571 	/* put message into global queue */
572 	msg->seq = zev_msg_sequence_number++;
573 
574 	/* do we need to make room? */
575 	while (zev_statistics.zev_max_queue_len &&
576 	    zev_statistics.zev_queue_len > zev_statistics.zev_max_queue_len) {
577 
578 		if (zev_have_blocking_queues) {
579 			/* queue full.  block until it's been shrunk. */
580 			cv_wait(&zev_condvar, &zev_mutex);
581 			continue;
582 		}
583 
584 		/* discard events until this message fits into all queues */
585 
586 		for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
587 			q = zev_queues[i - ZEV_MINOR_MIN];
588 			if (!q)
589 				continue;
590 			/* discard msgs until queue is small enough */
591 			while (q->zq_queue_len &&
592 			       q->zq_queue_len > q->zq_max_queue_len) {
593 				m = q->zq_oldest;
594 				if (m == NULL)
595 					break;
596 				q->zq_events_discarded++;
597 				q->zq_bytes_discarded += m->size;
598 				q->zq_oldest = m->next;
599 				q->zq_queue_len -= m->size;
600 				q->zq_queue_messages--;
601 			}
602 		}
603 
604 		zev_queue_trim();
605 		ASSERT(zev_statistics.zev_queue_len == 0 ||
606 		       zev_statistics.zev_queue_len <=
607 				zev_statistics.zev_max_queue_len);
608 	}
609 
610 	if (zev_queue_tail == NULL) {
611 		zev_queue_head = zev_queue_tail = msg;
612 	} else {
613 		zev_queue_tail->next = msg;
614 		msg->prev = zev_queue_tail;
615 		zev_queue_tail = msg;
616 	}
617 	zev_queue_len++;
618 	zev_statistics.zev_cnt_total_events++;
619 	zev_statistics.zev_queue_len += msg->size;
620 
621 	/* update per-device queues */
622 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
623 		q = zev_queues[i - ZEV_MINOR_MIN];
624 		if (!q)
625 			continue;
626 
627 		zev_queue_hold(q);
628 
629 		/* make sure queue has enough room */
630 		while (q->zq_max_queue_len &&
631 		       q->zq_queue_len > q->zq_max_queue_len) {
632 
633 			if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
634 				/* block until queue has been shrunk. */
635 				cv_wait(&zev_condvar, &zev_mutex);
636 			} else {
637 				/* discard msgs until queue is small enough */
638 				while (q->zq_queue_len > q->zq_max_queue_len) {
639 					m = q->zq_oldest;
640 					if (m == NULL)
641 						break;
642 					q->zq_events_discarded++;
643 					q->zq_bytes_discarded += m->size;
644 					q->zq_oldest = m->next;
645 					q->zq_queue_len -= m->size;
646 					q->zq_queue_messages--;
647 				}
648 			}
649 		}
650 
651 		/* register new message at the end of the queue */
652 		q->zq_queue_len += msg->size;
653 		q->zq_queue_messages++;
654 		q->zq_bytes_total += msg->size;
655 		q->zq_events_total++;
656 		if (q->zq_oldest == NULL)
657 			q->zq_oldest = msg;
658 
659 		zev_update_statistics(op, &q->zq_statistics);
660 
661 		if (q->zq_queue_len > q->zq_wakeup_threshold)
662 			wakeup = 1;
663 		if (q->zq_queue_len == msg->size)  /* queue was empty */
664 			cv_broadcast(&q->zq_condvar);
665 
666 		zev_queue_release(q);
667 	}
668 
669 	zev_queue_trim();
670 
671 	zev_update_statistics(op, &zev_statistics);
672 	mutex_exit(&zev_mutex);
673 	mutex_exit(&zev_queue_msg_mutex);
674 
675 	/* one or more queues need a pollwakeup() */
676 	if (op == ZEV_OP_MARK) {
677 		zev_poll_wakeup(B_TRUE);
678 	} else if (wakeup) {
679 		zev_poll_wakeup(B_FALSE);
680 	}
681 
682 	return;
683 }
684 
685 void
686 zev_queue_error(int op, char *fmt, ...)
687 {
688 	char buf[ZEV_MAX_MESSAGE_LEN];
689 	va_list ap;
690 	int len;
691 	zev_msg_t *msg = NULL;
692 	zev_error_t *rec;
693 	int msg_size;
694 
695 	va_start(ap, fmt);
696 	len = vsnprintf(buf, sizeof(buf), fmt, ap);
697 	va_end(ap);
698 	if (len >= sizeof(buf)) {
699 		cmn_err(CE_WARN, "zev: can't report error - "
700 		        "dropping event entirely.");
701 		return;
702 	}
703 
704 	msg_size = sizeof(*rec) + len + 1;
705 	msg = zev_alloc(sizeof(*msg) + msg_size);
706 	msg->size = msg_size;
707 	rec = (zev_error_t *)(msg + 1);
708 	rec->record_len = msg_size;
709 	rec->op = ZEV_OP_ERROR;
710 	rec->op_time = ddi_get_time();
711 	rec->guid = 0;
712 	rec->failed_op = op;
713 	rec->errstr_len = len;
714 	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);
715 
716 	zev_queue_message(ZEV_OP_ERROR, msg);
717 	return;
718 }
719 
720 static int
721 zev_find_queue(zev_queue_t **out, zev_queue_t *req_q, zev_queue_name_t *name)
722 {
723 	char namebuf[ZEV_MAX_QUEUE_NAME_LEN+1];
724 	zev_queue_t *q;
725 	int i;
726 
727 	*out = NULL;
728 
729 	if (name->zev_namelen == 0) {
730 		if (req_q->zq_minor_number == ZEV_CONTROL_DEVICE_MINOR)
731 			return EINVAL;
732 		zev_queue_hold(req_q);
733 		*out = req_q;
734 		return 0;
735 	}
736 
737 	if (name->zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
738 		return EINVAL;
739 	strncpy(namebuf, name->zev_name, name->zev_namelen);
740 	namebuf[name->zev_namelen] = '\0';
741 
742 	mutex_enter(&zev_mutex);
743 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
744 		q = zev_queues[i - ZEV_MINOR_MIN];
745 		if (!q)
746 			continue;
747 		if (!strcmp(q->zq_name, namebuf)) {
748 			zev_queue_hold(q);
749 			mutex_exit(&zev_mutex);
750 			*out = q;
751 			return 0;
752 		}
753 	}
754 	mutex_exit(&zev_mutex);
755 	return ENOENT;
756 }
757 
758 static int
759 zev_ioc_get_queue_statistics(zev_queue_t *req_q, intptr_t arg, int mode)
760 {
761 	zev_ioctl_get_queue_statistics_t gs;
762 	zev_queue_t *q;
763 	int ret;
764 
765 	if (ddi_copyin((void *)arg, &gs, sizeof(gs), mode) != 0)
766 		return EFAULT;
767 
768 	ret = zev_find_queue(&q, req_q, &gs.zev_queue_name);
769 	if (ret)
770 		return ret;
771 
772 	/* ddi_copyout() can take a long time.  Better make
773 	   a copy to be able to release the mutex faster. */
774 	mutex_enter(&zev_mutex);
775 	memcpy(&gs.zev_statistics, &q->zq_statistics,sizeof(gs.zev_statistics));
776 	gs.zev_statistics.zev_queue_len = q->zq_queue_len;
777 	gs.zev_statistics.zev_bytes_read = q->zq_bytes_read;
778 	gs.zev_statistics.zev_bytes_discarded = q->zq_bytes_discarded;
779 	gs.zev_statistics.zev_max_queue_len = q->zq_max_queue_len;
780 	gs.zev_statistics.zev_cnt_discarded_events = q->zq_events_discarded;
781 	gs.zev_statistics.zev_cnt_total_events = q->zq_events_total;
782 	zev_queue_release(q);
783 	mutex_exit(&zev_mutex);
784 
785 	if (ddi_copyout(&gs, (void *)arg, sizeof(gs), mode) != 0)
786 		return EFAULT;
787 	return 0;
788 }
789 
790 static int
791 zev_ioc_set_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
792 {
793 	zev_ioctl_set_queue_properties_t qp;
794 	zev_queue_t *q;
795 	uint64_t old_max;
796 	uint64_t old_flags;
797 	int ret;
798 
799 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
800 		return EFAULT;
801 	if (qp.zev_max_queue_len > ZEV_MAX_QUEUE_LEN)
802 		return EINVAL;
803 	if (qp.zev_poll_wakeup_threshold > ZEV_MAX_POLL_WAKEUP_QUEUE_LEN)
804 		return EINVAL;
805 
806 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
807 	if (ret)
808 		return ret;
809 
810 	mutex_enter(&zev_mutex);
811 
812 	/*
813 	 * Note: if the PERSISTENT flag is cleared, and the queue is not busy,
814 	 * the queue should be removed by zev_queue_release() in zev_ioctl().
815 	 */
816 	old_flags = qp.zev_flags;
817 	q->zq_flags = qp.zev_flags;
818 	if ((old_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) &&
819 	   (!(qp.zev_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL))) {
820 		/* queue is no longer blocking - wake blocked threads */
821 		cv_broadcast(&zev_condvar);
822 	}
823 
824 	zev_update_blockflag();
825 
826 	old_max = q->zq_max_queue_len;
827 	q->zq_max_queue_len = qp.zev_max_queue_len;
828 	if (q->zq_max_queue_len < old_max)
829 		zev_queue_trim();
830 	if (q->zq_max_queue_len > old_max)
831 		cv_broadcast(&zev_condvar);	/* threads may be waiting */
832 
833 	if ((qp.zev_poll_wakeup_threshold < q->zq_wakeup_threshold) &&
834 	    (qp.zev_poll_wakeup_threshold <= q->zq_queue_len))
835 		pollwakeup(&q->zq_pollhead, POLLIN);
836 	q->zq_wakeup_threshold = qp.zev_poll_wakeup_threshold;
837 
838 	zev_queue_release(q);
839 	mutex_exit(&zev_mutex);
840 	return 0;
841 }
842 
843 static int
844 zev_ioc_get_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
845 {
846 	zev_ioctl_get_queue_properties_t qp;
847 	zev_queue_t *q;
848 	int ret;
849 
850 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
851 		return EFAULT;
852 
853 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
854 	if (ret)
855 		return ret;
856 
857 	mutex_enter(&zev_mutex);
858 	qp.zev_max_queue_len = q->zq_max_queue_len;
859 	qp.zev_flags = q->zq_flags;
860 	qp.zev_poll_wakeup_threshold = q->zq_wakeup_threshold;
861 	zev_queue_release(q);
862 	mutex_exit(&zev_mutex);
863 
864 	if (ddi_copyout(&qp, (void *)arg, sizeof(qp), mode) != 0)
865 		return EFAULT;
866 	return 0;
867 }
868 
869 static int
870 zev_ioc_add_queue(zev_queue_t *req_q, intptr_t arg, int mode)
871 {
872 	zev_ioctl_add_queue_t aq;
873 	zev_queue_t *new_q;
874 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
875 
876 	if (ddi_copyin((void *)arg, &aq, sizeof(aq), mode) != 0)
877 		return EFAULT;
878 
879 	if (aq.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
880 		return EINVAL;
881 	strncpy(name, aq.zev_name, aq.zev_namelen);
882 	name[aq.zev_namelen] = '\0';
883 
884 	return zev_queue_new(&new_q, req_q->zq_dip, name,
885 	                     aq.zev_max_queue_len, aq.zev_flags);
886 }
887 
888 static int
889 zev_ioc_remove_queue(zev_queue_t *req_q, intptr_t arg, int mode)
890 {
891 	zev_ioctl_remove_queue_t rq;
892 	zev_queue_t *q;
893 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
894 	int found = 0;
895 	int i;
896 
897 	if (ddi_copyin((void *)arg, &rq, sizeof(rq), mode) != 0)
898 		return EFAULT;
899 
900 	if (rq.zev_queue_name.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
901 		return EINVAL;
902 	strncpy(name, rq.zev_queue_name.zev_name,
903 	        rq.zev_queue_name.zev_namelen);
904 	name[rq.zev_queue_name.zev_namelen] = '\0';
905 
906 	mutex_enter(&zev_mutex);
907 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
908 		q = zev_queues[i - ZEV_MINOR_MIN];
909 		if (!q)
910 			continue;
911 		if (!strcmp(q->zq_name, name)) {
912 			found = 1;
913 			break;
914 		}
915 	}
916 	if (!found) {
917 		mutex_exit(&zev_mutex);
918 		return ENOENT;
919 	}
920 
921 	if (q->zq_busy) {
922 		mutex_exit(&zev_mutex);
923 		return EBUSY;
924 	}
925 	/*
926 	 * clear flags, so that persistent queues are removed aswell
927 	 * and the queue becomes non-blocking.
928 	 */
929 	q->zq_flags = 0;
930 	if (q->zq_to_be_removed == B_FALSE) {
931 		q->zq_to_be_removed = B_TRUE;
932 		zev_queue_release(q);
933 	}
934 	/* some threads might be waiting for this queue to become writable */
935 	cv_broadcast(&zev_condvar);
936 
937 	mutex_exit(&zev_mutex);
938 	return 0;
939 }
940 
941 static int
942 zev_ioc_get_debug_info(zev_queue_t *req_q, intptr_t arg, int mode)
943 {
944 	zev_ioctl_debug_info_t di;
945 	uint64_t mem_allocated = atomic_add_64_nv(&zev_memory_allocated, 0);
946 	uint64_t mem_freed     = atomic_add_64_nv(&zev_memory_freed, 0);
947 
948 	zev_chksum_stats(&di.zev_chksum_cache_size,
949 	                 &di.zev_chksum_cache_hits,
950 	                 &di.zev_chksum_cache_misses);
951 	di.zev_memory_allocated = mem_allocated - mem_freed;
952 	if (ddi_copyout(&di, (void *)arg, sizeof(di), mode) != 0)
953 		return EFAULT;
954 	return 0;
955 }
956 
957 static int
958 zev_ioc_get_queue_list(zev_queue_t *req_q, intptr_t arg, int mode)
959 {
960 	zev_ioctl_get_queue_list_t gql;
961 	zev_queue_t *q;
962 	int i = 0;
963 	int count = 0;
964 
965 	memset(&gql, 0, sizeof(gql));
966 
967 	mutex_enter(&zev_mutex);
968 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
969 		q = zev_queues[i - ZEV_MINOR_MIN];
970 		if (!q)
971 			continue;
972 		strncpy(gql.zev_queue_name[count].zev_name,
973 		    q->zq_name, ZEV_MAX_QUEUE_NAME_LEN);
974 		gql.zev_queue_name[count].zev_namelen = strlen(q->zq_name);
975 		count++;
976 	}
977 	gql.zev_n_queues = count;
978 	mutex_exit(&zev_mutex);
979 
980 	if (ddi_copyout(&gql, (void *)arg, sizeof(gql), mode) != 0)
981 		return EFAULT;
982 	return 0;
983 }
984 
985 static int
986 zev_ioc_set_max_queue_len(zev_queue_t *req_q, intptr_t arg, int mode)
987 {
988 	uint64_t len;
989 	int i;
990 	zev_queue_t *q;
991 
992 	if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) {
993 		return EFAULT;
994 	}
995 	if (len > ZEV_MAX_QUEUE_LEN) {
996 		return EINVAL;
997 	}
998 	mutex_enter(&zev_mutex);
999 	zev_statistics.zev_max_queue_len = len;
1000 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1001 		q = zev_queues[i - ZEV_MINOR_MIN];
1002 		if (!q)
1003 			continue;
1004 		if (q->zq_max_queue_len <=
1005 		    zev_statistics.zev_max_queue_len)
1006 			continue;
1007 		q->zq_max_queue_len = zev_statistics.zev_max_queue_len;
1008 	}
1009 	cv_broadcast(&zev_condvar);
1010 	mutex_exit(&zev_mutex);
1011 	return 0;
1012 }
1013 
1014 /* ARGSUSED */
1015 static int
1016 zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1017 {
1018 	zev_statistics_t zs;
1019 	zev_ioctl_poolarg_t pa;
1020 	zev_ioctl_mark_t mark;
1021 	zev_mark_t *rec;
1022 	int msg_size;
1023 	zev_msg_t *msg;
1024 	uint64_t mark_id;
1025 	minor_t minor;
1026 	zev_queue_t *req_q;
1027 	int ret = 0;
1028 
1029 	minor = getminor(dev);
1030 	mutex_enter(&zev_mutex);
1031 	if ((req_q = ddi_get_soft_state(statep, minor)) == NULL) {
1032 		mutex_exit(&zev_mutex);
1033 		return (ENXIO);
1034 	}
1035 	zev_queue_hold(req_q);
1036 	mutex_exit(&zev_mutex);
1037 	/*
1038 	 * all structures passed between kernel and userspace
1039 	 * are now compatible between 64 and 32 bit.  Model
1040 	 * conversion can be ignored.
1041 	 */
1042 	switch (cmd) {
1043 	case ZEV_IOC_GET_GLOBAL_STATISTICS:
1044 		/* ddi_copyout() can take a long time.  Better make
1045 		   a copy to be able to release the mutex faster. */
1046 		mutex_enter(&zev_mutex);
1047 		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
1048 		mutex_exit(&zev_mutex);
1049 		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
1050 			ret = EFAULT;
1051 		break;
1052 	case ZEV_IOC_GET_QUEUE_STATISTICS:
1053 		ret = zev_ioc_get_queue_statistics(req_q, arg, mode);
1054 		break;
1055 	case ZEV_IOC_MUTE_POOL:
1056 	case ZEV_IOC_UNMUTE_POOL:
1057 		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) {
1058 			ret = EFAULT;
1059 			break;
1060 		}
1061 		if (pa.zev_poolname_len >=MAXPATHLEN) {
1062 			ret = EINVAL;
1063 			break;
1064 		}
1065 		pa.zev_poolname[pa.zev_poolname_len] = '\0';
1066 		if (cmd == ZEV_IOC_MUTE_POOL) {
1067 			ret = zev_ioc_mute_pool(pa.zev_poolname);
1068 		} else {
1069 			ret = zev_ioc_unmute_pool(pa.zev_poolname);
1070 		}
1071 		break;
1072 	case ZEV_IOC_SET_MAX_QUEUE_LEN:
1073 		ret = zev_ioc_set_max_queue_len(req_q, arg, mode);
1074 		break;
1075 	case ZEV_IOC_GET_QUEUE_PROPERTIES:
1076 		ret = zev_ioc_get_queue_properties(req_q, arg, mode);
1077 		break;
1078 	case ZEV_IOC_SET_QUEUE_PROPERTIES:
1079 		ret = zev_ioc_set_queue_properties(req_q, arg, mode);
1080 		break;
1081 	case ZEV_IOC_MARK:
1082 		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) {
1083 			ret = EFAULT;
1084 			break;
1085 		}
1086 		/* prepare message */
1087 		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
1088 		msg = zev_alloc(sizeof(*msg) + msg_size);
1089 		msg->size = msg_size;
1090 		rec = (zev_mark_t *)(msg + 1);
1091 		rec->record_len = msg_size;
1092 		rec->op = ZEV_OP_MARK;
1093 		rec->op_time = ddi_get_time();
1094 		rec->guid = mark.zev_guid;
1095 		rec->payload_len = mark.zev_payload_len;
1096 		/* get payload */
1097 		if (ddi_copyin(((char *)arg) + sizeof(mark),
1098 		               ZEV_PAYLOAD(rec),
1099 		               mark.zev_payload_len, mode) != 0) {
1100 			zev_free(msg, msg_size);
1101 			ret = EFAULT;
1102 			break;
1103 		}
1104 		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
1105 		/* get mark id and queue message */
1106 		mutex_enter(&zev_mark_id_mutex);
1107 		mark_id = zev_mark_id++;
1108 		mutex_exit(&zev_mark_id_mutex);
1109 		rec->mark_id = mark_id;
1110 		zev_queue_message(ZEV_OP_MARK, msg);
1111 		/* report mark id to userland, ignore errors */
1112 		mark.zev_mark_id = mark_id;
1113 		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
1114 		break;
1115 	case ZEV_IOC_ADD_QUEUE:
1116 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1117 			ret = EACCES;
1118 			break;
1119 		}
1120 		ret = zev_ioc_add_queue(req_q, arg, mode);
1121 		break;
1122 	case ZEV_IOC_REMOVE_QUEUE:
1123 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1124 			ret = EACCES;
1125 			break;
1126 		}
1127 		ret = zev_ioc_remove_queue(req_q, arg, mode);
1128 		break;
1129 	case ZEV_IOC_GET_DEBUG_INFO:
1130 		ret = zev_ioc_get_debug_info(req_q, arg, mode);
1131 		break;
1132 	case ZEV_IOC_GET_QUEUE_LIST:
1133 		ret = zev_ioc_get_queue_list(req_q, arg, mode);
1134 		break;
1135 	case ZEV_IOC_GET_FILE_SIGNATURES:
1136 		ret = zev_ioc_get_signatures(arg, mode);
1137 		break;
1138 	default:
1139 		/* generic "ioctl unknown" error */
1140 		ret = ENOTTY;
1141 	}
1142 
1143 	mutex_enter(&zev_mutex);
1144 	zev_queue_release(req_q);
1145 	mutex_exit(&zev_mutex);
1146 	if (ret)
1147 		SET_ERROR(ret);
1148 	return (ret);
1149 }
1150 
1151 static int
1152 zev_chpoll(dev_t dev, short events, int anyyet,
1153     short *reventsp, struct pollhead **phpp)
1154 {
1155 	int minor;
1156 	short revent = 0;
1157 	zev_queue_t *q;
1158 
1159 	/* use minor-specific queue context and it's pollhead */
1160 	minor = getminor(dev);
1161 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1162 		return (EINVAL);
1163 	mutex_enter(&zev_mutex);
1164 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1165 		mutex_exit(&zev_mutex);
1166 		return (ENXIO);
1167 	}
1168 	revent = 0;
1169 	if ((events & POLLIN)) {
1170 		if (q->zq_oldest)
1171 			revent |= POLLIN;
1172 	}
1173 	if (revent == 0) {
1174 		if (!anyyet) {
1175 			*phpp = &q->zq_pollhead;
1176 		}
1177 	}
1178 	*reventsp = revent;
1179 	mutex_exit(&zev_mutex);
1180 	return (0);
1181 }
1182 
1183 /* ARGSUSED */
1184 static int
1185 zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
1186 {
1187 	minor_t minor;
1188 	offset_t off;
1189 	int ret = 0;
1190 	zev_msg_t *msg;
1191 	char *data;
1192 	zev_queue_t *q;
1193 
1194 	minor = getminor(dev);
1195 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1196 		return (EINVAL);
1197 
1198 	mutex_enter(&zev_mutex);
1199 	q = ddi_get_soft_state(statep, minor);
1200 	if (q == NULL) {
1201 		mutex_exit(&zev_mutex);
1202 		return (ENXIO);
1203 	}
1204 	off = uio_p->uio_loffset;
1205 	msg = q->zq_oldest;
1206 	while (msg == NULL) {
1207 		if (!ddi_can_receive_sig()) {
1208 			/*
1209 			 * read() shouldn't block because this thread
1210 			 * can't receive signals. (e.g., it might be
1211 			 * torn down by exit() right now.)
1212 			 */
1213 			mutex_exit(&zev_mutex);
1214 			return 0;
1215 		}
1216 		if (cv_wait_sig(&q->zq_condvar, &zev_mutex) == 0) {
1217 			/* signal received. */
1218 			mutex_exit(&zev_mutex);
1219 			return EINTR;
1220 		}
1221 		msg = q->zq_oldest;
1222 	}
1223 	if (msg->size > uio_p->uio_resid) {
1224 		mutex_exit(&zev_mutex);
1225 		return E2BIG;
1226 	}
1227 	while (msg && uio_p->uio_resid >= msg->size) {
1228 		data = (char *)(msg + 1);
1229 		ret = uiomove(data, msg->size, UIO_READ, uio_p);
1230 		if (ret != 0) {
1231 			mutex_exit(&zev_mutex);
1232 			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
1233 			uio_p->uio_loffset = off;
1234 			return (ret);
1235 		}
1236 		q->zq_oldest = msg->next;
1237 		q->zq_bytes_read += msg->size;
1238 		q->zq_queue_len -= msg->size;
1239 		q->zq_queue_messages--;
1240 		msg->read++;
1241 		msg = q->zq_oldest;
1242 	}
1243 	zev_queue_trim();
1244 	cv_broadcast(&zev_condvar);
1245 	mutex_exit(&zev_mutex);
1246 	uio_p->uio_loffset = off;
1247 	return 0;
1248 }
1249 
1250 /* ARGSUSED */
1251 static int
1252 zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
1253 {
1254 	zev_queue_t *q;
1255 	int minor;
1256 
1257 	minor = getminor(dev);
1258 	if (otyp != OTYP_CHR)
1259 		return (EINVAL);
1260 	mutex_enter(&zev_mutex);
1261 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1262 		mutex_exit(&zev_mutex);
1263 		return (ENXIO);
1264 	}
1265 	if (q->zq_busy != B_TRUE) {
1266 		mutex_exit(&zev_mutex);
1267 		return (EINVAL);
1268 	}
1269 	q->zq_busy = B_FALSE;
1270 	if ((q->zq_flags & ZEV_FL_PERSISTENT) == 0)
1271 		zev_queue_release(q);
1272 	mutex_exit(&zev_mutex);
1273 	return (0);
1274 }
1275 
1276 /* ARGSUSED */
1277 static int
1278 zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1279 {
1280 	zev_queue_t *q;
1281 	minor_t minor;
1282 
1283 	minor = getminor(*devp);
1284 	if (otyp != OTYP_CHR)
1285 		return (EINVAL);
1286 	if (drv_priv(credp) != 0)
1287 		return (EPERM);
1288 	mutex_enter(&zev_mutex);
1289 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1290 		mutex_exit(&zev_mutex);
1291 		return (ENXIO);
1292 	}
1293 	if (minor == ZEV_CONTROL_DEVICE_MINOR) {
1294 		/* control device may be used in parallel */
1295 		q->zq_busy = B_TRUE;
1296 		mutex_exit(&zev_mutex);
1297 		return 0;
1298 	}
1299 	if (q->zq_busy == B_TRUE) {
1300 		mutex_exit(&zev_mutex);
1301 		return (EBUSY);
1302 	}
1303 	q->zq_busy = B_TRUE;	/* can only be opened exclusively */
1304 	mutex_exit(&zev_mutex);
1305 	return (0);
1306 }
1307 
1308 static struct cb_ops zev_cb_ops = {
1309 	zev_open,		/* open */
1310 	zev_close,		/* close */
1311 	nodev,			/* strategy */
1312 	nodev,			/* print */
1313 	nodev,			/* dump */
1314 	zev_read,		/* read */
1315 	nodev,			/* write */
1316 	zev_ioctl,		/* ioctl */
1317 	nodev,			/* devmap */
1318 	nodev,			/* mmap */
1319 	nodev,			/* segmap */
1320 	zev_chpoll,		/* chpoll */
1321 	ddi_prop_op,		/* prop_op */
1322 	NULL,			/* streamtab */
1323 	D_MP | D_64BIT,		/* cb_flag */
1324 	CB_REV,			/* cb_rev */
1325 	nodev,			/* aread */
1326 	nodev,			/* awrite */
1327 };
1328 
1329 static void
1330 zev_free_instance(dev_info_t *dip)
1331 {
1332 	int instance;
1333 	zev_queue_t *q;
1334 	int i;
1335 
1336 	instance = ddi_get_instance(dip);
1337 	if (instance != 0) {
1338 		cmn_err(CE_WARN, "zev: tried to free instance != 0 (%d)",
1339 		        instance);
1340 		return;
1341 	}
1342 
1343 	ddi_remove_minor_node(dip, NULL);
1344 
1345 	/* stop pollwakeup thread */
1346 	zev_wakeup_thread_run = 0;
1347 	if (zev_poll_wakeup_thread != NULL) {
1348 		thread_join(zev_poll_wakeup_thread->t_did);
1349 		zev_poll_wakeup_thread = NULL;
1350 	}
1351 
1352 	mutex_enter(&zev_mutex);
1353 
1354 	/* remove "ctrl" dummy queue */
1355 	q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1356 	if (q) {
1357 		ddi_soft_state_free(statep, ZEV_CONTROL_DEVICE_MINOR);
1358 		ZEV_MEM_SUB(sizeof(zev_queue_t));
1359 	}
1360 
1361 	/* remove all other queues */
1362 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1363 		q = zev_queues[i- ZEV_MINOR_MIN];
1364 		if (!q)
1365 			continue;
1366 		ASSERT(q->zq_refcnt == 1);
1367 		zev_queue_release(q);
1368 	}
1369 	zev_queue_trim();
1370 	bzero(&zev_queues, sizeof(zev_queues));
1371 
1372 	mutex_exit(&zev_mutex);
1373 
1374 }
1375 
1376 static int
1377 zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1378 {
1379 	int instance;
1380 	zev_queue_t *q;
1381 
1382 	/* called once per instance with DDI_DETACH,
1383 	   may be called to suspend */
1384 	switch (cmd) {
1385 	case DDI_DETACH:
1386 		/* instance busy? */
1387 		instance = ddi_get_instance(dip);
1388 		if (instance != 0) {	/* hardcoded in zev.conf */
1389 			/* this module only supports one instance. */
1390 			return (DDI_FAILURE);
1391 		}
1392 
1393 		mutex_enter(&zev_mutex);
1394 		if (!zev_attached) {
1395 			mutex_exit(&zev_mutex);
1396 			return (DDI_FAILURE);
1397 		}
1398 
1399 		/* check "ctrl" queue to see if t is busy */
1400 		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1401 		if (q == NULL) {
1402 			mutex_exit(&zev_mutex);
1403 			return (DDI_FAILURE);
1404 		}
1405 		if (q->zq_busy) {
1406 			mutex_exit(&zev_mutex);
1407 			return (DDI_FAILURE);
1408 		}
1409 		/* are there any queues? */
1410 		if (zev_queue_cnt > 0) {
1411 			mutex_exit(&zev_mutex);
1412 			return (DDI_FAILURE);
1413 		}
1414 
1415 		zev_attached = B_FALSE;
1416 		mutex_exit(&zev_mutex);
1417 
1418 		/* switch ZFS event callbacks back to default */
1419 		rw_enter(&rz_zev_rwlock, RW_WRITER);
1420 		rz_zev_callbacks = rz_zev_default_callbacks;
1421 		rz_zev_set_active(B_FALSE);
1422 		rw_exit(&rz_zev_rwlock);
1423 
1424 		/* no thread is inside of the callbacks anymore. */
1425 
1426 		/* free resources allocated for this instance */
1427 		zev_free_instance(dip);
1428 		zev_chksum_fini();
1429 #if 0
1430 		cmn_err(CE_WARN, "zev: allocated memory at detach: %" PRIu64,
1431 			zev_memory_allocated - zev_memory_freed);
1432 #endif
1433 		return (DDI_SUCCESS);
1434 	case DDI_SUSPEND:
1435 		/* kernel must not suspend zev devices while ZFS is running */
1436 		return (DDI_FAILURE);
1437 	default:
1438 		return (DDI_FAILURE);
1439 	}
1440 }
1441 
1442 static int
1443 zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1444 {
1445 	/* called once per instance with DDI_ATTACH,
1446 	   may be called to resume */
1447 	int instance;
1448 	int error;
1449 	zev_queue_t *q;
1450 	switch (cmd) {
1451 	case DDI_ATTACH:
1452 		/* create instance state */
1453 		instance = ddi_get_instance(dip);
1454 		if (instance != 0) {	/* hardcoded in zev.conf */
1455 			/* this module only supports one instance. */
1456 			return (DDI_FAILURE);
1457 		}
1458 
1459 		mutex_enter(&zev_mutex);
1460 		if (zev_attached) {
1461 			mutex_exit(&zev_mutex);
1462 			return (DDI_FAILURE);
1463 		}
1464 		if (ddi_soft_state_zalloc(statep, ZEV_CONTROL_DEVICE_MINOR) !=
1465 		    DDI_SUCCESS) {
1466 			mutex_exit(&zev_mutex);
1467 			return (DDI_FAILURE);
1468 		}
1469 		ZEV_MEM_ADD(sizeof(zev_queue_t));
1470 		zev_attached = B_TRUE;
1471 
1472 		/* init queue list */
1473 		bzero(&zev_queues, sizeof(zev_queues));
1474 		mutex_exit(&zev_mutex);
1475 
1476 		/* create a dummy queue for management of "ctrl" */
1477 
1478 		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1479 		q->zq_dip = dip;
1480 		q->zq_refcnt = 1;
1481 		q->zq_busy = B_FALSE;
1482 		q->zq_minor_number = ZEV_CONTROL_DEVICE_MINOR;
1483 		q->zq_flags = ZEV_FL_PERSISTENT;
1484 		strcpy(q->zq_name, ZEV_CONTROL_DEVICE_NAME);
1485 
1486 		/* create device node for "ctrl" */
1487 		if (ddi_create_minor_node(dip, ZEV_CONTROL_DEVICE_NAME,
1488 		    S_IFCHR, ZEV_CONTROL_DEVICE_MINOR,
1489 		    DDI_PSEUDO, 0) == DDI_FAILURE) {
1490 			goto fail;
1491 		}
1492 
1493 		/* note: intentionally not adding ctrl queue to queue list. */
1494 
1495 		/* default queue */
1496 		error = zev_queue_new(&q, dip,
1497 				      ZEV_DEFAULT_QUEUE_NAME,
1498 				      ZEV_MAX_QUEUE_LEN,
1499 				      ZEV_FL_BLOCK_WHILE_QUEUE_FULL|
1500 		                      ZEV_FL_PERSISTENT);
1501 		if (error)
1502 			goto fail;
1503 
1504 		/* start pollwakeup thread */
1505 		zev_wakeup_thread_run = 1;
1506 		zev_poll_wakeup_thread = thread_create(NULL, 0,
1507 		    zev_poll_wakeup_thread_main, NULL, 0, &p0,
1508 		    TS_RUN, minclsyspri);
1509 
1510 		ddi_report_dev(dip);
1511 
1512 		zev_chksum_init();
1513 
1514 		/* switch ZFS event callbacks to zev module callbacks */
1515 		rw_enter(&rz_zev_rwlock, RW_WRITER);
1516 		rz_zev_callbacks = &zev_callbacks;
1517 		rz_zev_set_active(B_TRUE);
1518 		rw_exit(&rz_zev_rwlock);
1519 
1520 		return (DDI_SUCCESS);
1521 	case DDI_RESUME:
1522 		/* suspendeding zev devices should never happen */
1523 		return (DDI_SUCCESS);
1524 	default:
1525 		return (DDI_FAILURE);
1526 	}
1527 fail:
1528 	cmn_err(CE_WARN, "zev: attach failed");
1529 	zev_free_instance(dip);
1530 	mutex_enter(&zev_mutex);
1531 	zev_attached = B_FALSE;
1532 	mutex_exit(&zev_mutex);
1533 	return (DDI_FAILURE);
1534 }
1535 
1536 /* ARGSUSED */
1537 static int
1538 zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
1539 {
1540 	minor_t minor;
1541 	zev_queue_t *q;
1542 
1543 	/* arg is dev_t */
1544 	minor = getminor((dev_t)arg);
1545 	mutex_enter(&zev_mutex);
1546 	q = ddi_get_soft_state(statep, minor);
1547 	if (q == NULL) {
1548 		*resultp = NULL;
1549 		mutex_exit(&zev_mutex);
1550 		return (DDI_FAILURE);
1551 	}
1552 
1553 	switch (infocmd) {
1554 	case DDI_INFO_DEVT2DEVINFO:
1555 		*resultp = q->zq_dip;
1556 		break;
1557 	case DDI_INFO_DEVT2INSTANCE:
1558 		*resultp = (void *)(uintptr_t)ddi_get_instance(q->zq_dip);
1559 		break;
1560 	default:
1561 		mutex_exit(&zev_mutex);
1562 		return (DDI_FAILURE);
1563 	}
1564 	mutex_exit(&zev_mutex);
1565 	return (DDI_SUCCESS);
1566 }
1567 
1568 static struct dev_ops zev_dev_ops = {
1569 	DEVO_REV,			/* driver build revision */
1570 	0,				/* driver reference count */
1571 	zev_getinfo,			/* getinfo */
1572 	nulldev,			/* identify (obsolete) */
1573 	nulldev,			/* probe (search for devices) */
1574 	zev_attach,			/* attach */
1575 	zev_detach,			/* detach */
1576 	nodev,				/* reset (obsolete, use quiesce) */
1577 	&zev_cb_ops,			/* character and block device ops */
1578 	NULL,				/* bus driver ops */
1579 	NULL,				/* power management, not needed */
1580 	ddi_quiesce_not_needed,		/* quiesce */
1581 };
1582 
1583 static struct modldrv zev_modldrv = {
1584 	&mod_driverops,			/* all loadable modules use this */
1585 	"zev ZFS event provider, v1.0",	/* driver name and version info */
1586 	&zev_dev_ops			/* ops method pointers */
1587 };
1588 
1589 static struct modlinkage zev_modlinkage = {
1590 	MODREV_1,	/* fixed value */
1591 	{
1592 		&zev_modldrv,	/* driver linkage structure */
1593 		NULL		/* list terminator */
1594 	}
1595 };
1596 
1597 int
1598 _init(void)
1599 {
1600 	int error;
1601 
1602 	if ((error = ddi_soft_state_init(&statep, sizeof(zev_queue_t), 1)) != 0)
1603 		return (error);
1604 	zev_attached = B_FALSE;
1605 
1606 	zev_queue_head = NULL;
1607 	zev_queue_tail = NULL;
1608 	zev_queue_len = 0;
1609 	zev_muted_pools_head = NULL;
1610 	zev_memory_allocated = 0;
1611 	zev_memory_freed = 0;
1612 	zev_queue_cnt = 0;
1613 	zev_have_blocking_queues = 1;
1614 
1615 	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
1616 	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
1617 	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
1618 	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
1619 	zev_mark_id = gethrtime();
1620 	mutex_init(&zev_queue_msg_mutex, NULL, MUTEX_DRIVER, NULL);
1621 	zev_msg_sequence_number = gethrtime();
1622 	bzero(&zev_statistics, sizeof(zev_statistics));
1623 	bzero(&zev_pollhead, sizeof(zev_pollhead));
1624 	bzero(&zev_queues, sizeof(zev_queues));
1625 	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
1626 	if (zev_ioc_mute_pool("zg0")) {
1627 		cmn_err(CE_WARN, "zev: could not init mute list");
1628 		goto FAIL;
1629 	}
1630 
1631 	if ((error = mod_install(&zev_modlinkage)) != 0) {
1632 		cmn_err(CE_WARN, "zev: could not install module");
1633 		goto FAIL;
1634 	}
1635 
1636 	return (0);
1637 FAIL:
1638 	/* free resources */
1639 	cmn_err(CE_WARN, "zev: _init failed");
1640 	mutex_destroy(&zev_mutex);
1641 	ddi_soft_state_fini(&statep);
1642 	return (error);
1643 }
1644 
1645 int
1646 _info(struct modinfo *modinfop)
1647 {
1648 	return (mod_info(&zev_modlinkage, modinfop));
1649 }
1650 
1651 int
1652 _fini(void)
1653 {
1654 	int error = 0;
1655 	zev_msg_t *msg;
1656 	zev_pool_list_entry_t *pe, *npe;
1657 
1658 	mutex_enter(&zev_mutex);
1659 	if (zev_attached == B_TRUE) {
1660 		mutex_exit(&zev_mutex);
1661 		return (SET_ERROR(EBUSY));
1662 	}
1663 	if (zev_queue_cnt != 0) {
1664 		/* should never happen */
1665 		mutex_exit(&zev_mutex);
1666 		return (SET_ERROR(EBUSY));
1667 	}
1668 
1669 	/*
1670 	 * avoid deadlock if event list is full: make sure threads currently
1671 	 * blocking on the event list can append their event and then release
1672 	 * rz_zev_rwlock.  Since there should be no queues left when we
1673 	 * reach this point we can simply empty the event list and then
1674 	 * wake everybody.
1675 	 */
1676 	while (zev_queue_head) {
1677 		msg = zev_queue_head;
1678 		zev_queue_head = msg->next;
1679 		zev_free(msg, sizeof(*msg) + msg->size);
1680 	}
1681 	cv_broadcast(&zev_condvar);
1682 	mutex_exit(&zev_mutex);
1683 
1684 	/* switch ZFS event callbacks back to default (again) */
1685 	rw_enter(&rz_zev_rwlock, RW_WRITER);
1686 	rz_zev_callbacks = rz_zev_default_callbacks;
1687 	rz_zev_set_active(B_FALSE);
1688 	rw_exit(&rz_zev_rwlock);
1689 
1690 	/* no thread is inside of the callbacks anymore.  Safe to remove. */
1691 
1692 	/* unload module callbacks */
1693 	if ((error = mod_remove(&zev_modlinkage)) != 0) {
1694 		cmn_err(CE_WARN, "mod_remove failed: %d", error);
1695 		return (error);
1696 	}
1697 
1698 	/* free resources */
1699 	mutex_enter(&zev_mutex);
1700 	while (zev_queue_head) {
1701 		msg = zev_queue_head;
1702 		zev_queue_head = msg->next;
1703 		zev_free(msg, sizeof(*msg) + msg->size);
1704 	}
1705 	mutex_exit(&zev_mutex);
1706 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
1707 	pe = zev_muted_pools_head;
1708 	while (pe) {
1709 		npe = pe;
1710 		pe = pe->next;
1711 		zev_free(npe, sizeof(*npe));
1712 	}
1713 	rw_exit(&zev_pool_list_rwlock);
1714 	ddi_soft_state_fini(&statep);
1715 	rw_destroy(&zev_pool_list_rwlock);
1716 	cv_destroy(&zev_condvar);
1717 	mutex_destroy(&zev_mutex);
1718 	mutex_destroy(&zev_mark_id_mutex);
1719 	mutex_destroy(&zev_queue_msg_mutex);
1720 
1721 	return (0);
1722 }
1723 
1724