xref: /titanic_52/usr/src/uts/common/fs/zev/zev.c (revision 9ae8ebd12419d119edb8f27b367924346dd9f579)
1 #include <sys/modctl.h>
2 #include <sys/ddi.h>
3 #include <sys/sunddi.h>
4 #include <sys/conf.h>
5 #include <sys/devops.h>
6 #include <sys/stat.h>
7 #include <sys/fs/zev.h>
8 #include <sys/zev_callbacks.h>
9 #include <sys/zev_checksums.h>
10 #include <sys/zfs_znode.h>
11 #include <sys/time.h>
12 #include <sys/sa.h>
13 #include <sys/zap.h>
14 #include <sys/time.h>
15 #include <sys/fs/dv_node.h>
16 
17 #define	OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))
18 
19 #define XSTRING(x)	STRING(x)
20 #define STRING(x)	#x
21 
22 #define ZEV_DEFAULT_QUEUE_NAME		"beaver"
23 #define ZEV_CONTROL_DEVICE_MINOR	0
24 #define ZEV_MINOR_MIN			(ZEV_CONTROL_DEVICE_MINOR + 1)
25 #define ZEV_MINOR_MAX			(ZEV_MINOR_MIN + ZEV_MAX_QUEUES - 1)
26 
27 typedef struct zev_queue {
28 	char			zq_name[ZEV_MAX_QUEUE_NAME_LEN+1];
29 	minor_t			zq_minor_number;
30 	dev_info_t		*zq_dip;
31 	struct pollhead		zq_pollhead;
32 	uint64_t		zq_bytes_read;
33 	uint64_t		zq_events_read;
34 	uint64_t		zq_bytes_discarded;
35 	uint64_t		zq_events_discarded;
36 	uint64_t		zq_bytes_total;
37 	uint64_t		zq_events_total;
38 	uint64_t		zq_wakeup_threshold;
39 	uint16_t		zq_flags;
40 	uint16_t		zq_need_wakeup;
41 	/* protected by zev_mutex */
42 	int			zq_refcnt;
43 	uint64_t		zq_queue_len;
44 	uint64_t		zq_queue_messages;
45 	uint64_t		zq_max_queue_len;
46 	zev_msg_t		*zq_oldest;
47 	boolean_t		zq_busy;
48 	boolean_t		zq_to_be_removed;
49 	zev_statistics_t	zq_statistics;
50 	kcondvar_t		zq_condvar;
51 } zev_queue_t;
52 
53 static void		*statep;
54 struct pollhead		zev_pollhead;
55 
56 kmutex_t		zev_mutex;
57 kcondvar_t		zev_condvar;
58 kmutex_t		zev_queue_msg_mutex;
59 krwlock_t		zev_pool_list_rwlock;
60 static zev_statistics_t	zev_statistics;
61 static boolean_t	zev_attached;
62 static kmutex_t		zev_mark_id_mutex;
63 static uint64_t		zev_mark_id = 0;
64 
65 static uint64_t		zev_msg_sequence_number = 0;
66 static zev_queue_t	*zev_queues[ZEV_MAX_QUEUES];
67 static int		zev_queue_cnt = 0;
68 static int		zev_have_blocking_queues = 1;
69 
70 uint64_t	zev_memory_allocated = 0;
71 uint64_t	zev_memory_freed = 0;
72 
73 /*
74  * The longest potential message is from zev_zfs_mount() and
75  * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
76  *
77  * Another candidate is zev_znode_rename_cb() and contains three inode
78  * numbers and two filenames of up to MAXNAMELEN bytes each.
79  */
80 #define ZEV_MAX_MESSAGE_LEN	4096
81 
82 static zev_msg_t *zev_queue_head = NULL;
83 static zev_msg_t *zev_queue_tail = NULL;
84 static uint64_t zev_queue_len = 0;
85 
86 
87 typedef struct zev_pool_list_entry {
88 	struct zev_pool_list_entry	*next;
89 	char				name[MAXPATHLEN];
90 } zev_pool_list_entry_t;
91 
92 static zev_pool_list_entry_t *zev_muted_pools_head = NULL;
93 
94 static volatile int zev_wakeup_thread_run = 1;
95 static kthread_t *zev_poll_wakeup_thread = NULL;
96 
97 void *
98 zev_alloc(ssize_t sz)
99 {
100 	ZEV_MEM_ADD(sz);
101 	return kmem_alloc(sz, KM_SLEEP);
102 }
103 
104 void *
105 zev_zalloc(ssize_t sz)
106 {
107 	ZEV_MEM_ADD(sz);
108 	return kmem_zalloc(sz, KM_SLEEP);
109 }
110 
111 void
112 zev_free(void *ptr, ssize_t sz)
113 {
114 	ZEV_MEM_SUB(sz);						\
115 	kmem_free(ptr, sz);
116 }
117 
118 /* must be called with zev_mutex held */
119 static void
120 zev_update_blockflag(void)
121 {
122 	zev_queue_t *q;
123 	int had_blocking_queues;
124 	int i;
125 
126 	had_blocking_queues = zev_have_blocking_queues;
127 
128 	/* do we still have blocking queues? */
129 	zev_have_blocking_queues = 0;
130 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
131 		q = zev_queues[i - ZEV_MINOR_MIN];
132 		if (!q)
133 			continue;
134 		if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
135 			zev_have_blocking_queues = 1;
136 			break;
137 		}
138 	}
139 	/* no blocking queues */
140 	if (had_blocking_queues)
141 		cv_broadcast(&zev_condvar);
142 }
143 
144 int
145 zev_queue_cmp(const void *a, const void *b)
146 {
147 	const zev_queue_t *qa = a;
148 	const zev_queue_t *qb = b;
149 	if (qa->zq_minor_number > qb->zq_minor_number)
150 		return 1;
151 	if (qa->zq_minor_number < qb->zq_minor_number)
152 		return -1;
153 	return 0;
154 }
155 
/*
 * Discard messages from the head of the global message list that no
 * per-device queue references anymore, releasing their memory and
 * updating the global statistics.
 *
 * must be called with zev_mutex held
 */
void
zev_queue_trim(void)
{
	zev_msg_t *m;
	uint64_t oldest_message;
	zev_queue_t *q;
	int i;

	/* empty global list - nothing to trim */
	if (!zev_queue_tail)
		return;

	/*
	 * Find the smallest sequence number still referenced by any queue.
	 * Start one past the newest message so that if no queue references
	 * anything, the whole list is trimmable.
	 */
	oldest_message = zev_queue_tail->seq + 1;  /* does not exist, yet. */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		if (!q->zq_oldest)
			continue;
		if (oldest_message > q->zq_oldest->seq)
			oldest_message = q->zq_oldest->seq;
	}

	/* remove msgs between oldest_message and zev_queue_head */
	while(zev_queue_head && (oldest_message > zev_queue_head->seq)) {
		m = zev_queue_head;
		zev_queue_head = m->next;
		if (zev_queue_head == NULL) {
			/* list is now empty - tail must follow */
			zev_queue_tail = NULL;
		} else {
			zev_queue_head->prev = NULL;
		}
		/* never-read messages count as discarded in the stats */
		if (m->read == 0) {
			zev_statistics.zev_bytes_discarded += m->size;
			zev_statistics.zev_cnt_discarded_events++;
		}
		zev_statistics.zev_queue_len -= m->size;
		zev_queue_len--;
		zev_free(m, sizeof(*m) + m->size);
	}
}
197 
198 /* must be called with zev_mutex held */
199 static void
200 zev_queue_hold(zev_queue_t *q)
201 {
202 	q->zq_refcnt++;
203 }
204 
/*
 * Drop a reference on a queue.  When the last reference is dropped and
 * the queue is not flagged persistent, tear it down: unlink it from the
 * queue table, trim now-unreferenced global messages, remove its minor
 * device node and free its soft state.
 *
 * must be called with zev_mutex held
 */
static void
zev_queue_release(zev_queue_t *q)
{
	q->zq_refcnt--;
	if (q->zq_refcnt > 0)
		return;

	/* last reference gone - queue must not be open (busy) anymore */
	ASSERT(q->zq_busy == B_FALSE);

	/* persistent queues will not be removed */
	if ((q->zq_flags & ZEV_FL_PERSISTENT) != 0)
		return;

	/* remove queue from queue list */
	zev_queues[q->zq_minor_number - ZEV_MINOR_MIN] = NULL;

	/* discard messages that no queue references anymore */
	zev_queue_trim();

	cv_destroy(&q->zq_condvar);
	ddi_remove_minor_node(q->zq_dip, q->zq_name);
	devfs_clean(q->zq_dip, NULL, 0);
	/* q itself lives in the soft state - this frees it */
	ddi_soft_state_free(statep, q->zq_minor_number);
	ZEV_MEM_SUB(sizeof(zev_queue_t));
	zev_queue_cnt--;
	zev_update_blockflag();
}
233 
234 int
235 zev_queue_new(zev_queue_t **queue,
236               dev_info_t *dip,
237               char *name,
238               uint64_t max_queue_len,
239               uint16_t flags)
240 {
241 	zev_queue_t *q;
242 	zev_queue_t *tmp;
243 	zev_msg_t *msg;
244 	int name_exists = 0;
245 	minor_t minor;
246 	char *p;
247 	int i;
248 
249 	if (max_queue_len > ZEV_MAX_QUEUE_LEN)
250 		return EINVAL;
251 	if (max_queue_len == 0)
252 		max_queue_len = ZEV_MAX_QUEUE_LEN;
253 	if (!strcmp(name, ZEV_CONTROL_DEVICE_NAME))
254 		return EINVAL;
255 	for (p = name; *p; p++) {
256 		if (*p >= 'a' && *p <= 'z')
257 			continue;
258 		if (*p >= '0' && *p <= '9')
259 			continue;
260 		if (*p == '.')
261 			continue;
262 		return EINVAL;
263 	}
264 
265 	mutex_enter(&zev_mutex);
266 
267 	/* find free minor number.*/
268 	/* if this were a frequent operation we'd have a free-minor list */
269 	for (minor = ZEV_MINOR_MIN; minor <= ZEV_MINOR_MAX; minor++) {
270 		tmp = zev_queues[minor - ZEV_MINOR_MIN];
271 		if (tmp == NULL)
272 			break;
273 	}
274 	if (tmp) {
275 		mutex_exit(&zev_mutex);
276 		return ENOSPC;
277 	}
278 
279 	if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) {
280 		mutex_exit(&zev_mutex);
281 		return ENOSPC;
282 	}
283 	ZEV_MEM_ADD(sizeof(zev_queue_t));
284 
285 	q = ddi_get_soft_state(statep, minor);
286 	memset(q, 0, sizeof(*q));
287 	strncpy(q->zq_name, name, ZEV_MAX_QUEUE_NAME_LEN);
288 	q->zq_name[ZEV_MAX_QUEUE_NAME_LEN] = '\0';
289 	q->zq_max_queue_len = max_queue_len;
290 	q->zq_wakeup_threshold = ZEV_DEFAULT_POLL_WAKEUP_QUEUE_LEN;
291 	q->zq_flags = flags;
292 	q->zq_refcnt = 1;
293 	q->zq_dip = dip;
294 	q->zq_minor_number = minor;
295 	cv_init(&q->zq_condvar, NULL, CV_DRIVER, NULL);
296 
297 	/* insert into queue list */
298 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
299 		/* if this were a frequent operation we'd have a name tree */
300 		if (zev_queues[i - ZEV_MINOR_MIN] == NULL)
301 			continue;
302 		if (!strcmp(q->zq_name, zev_queues[i-ZEV_MINOR_MIN]->zq_name)) {
303 			name_exists = 1;
304 			break;
305 		}
306 	}
307 	if (name_exists) {
308 		ddi_soft_state_free(statep, minor);
309 		ZEV_MEM_SUB(sizeof(zev_queue_t));
310 		mutex_exit(&zev_mutex);
311 		return EEXIST;
312 	}
313 	zev_queues[minor - ZEV_MINOR_MIN] = q;
314 	zev_queue_cnt++;
315 
316 	/* calculate current queue len and find head and tail */
317 	if (!(q->zq_flags & ZEV_FL_INITIALLY_EMPTY)) {
318 		q->zq_oldest = zev_queue_tail;
319 		msg = zev_queue_tail;
320 		while ((msg) && (q->zq_queue_len < q->zq_max_queue_len)) {
321 			q->zq_queue_len += msg->size;
322 			q->zq_queue_messages++;
323 			q->zq_oldest = msg;
324 			msg = msg->prev;
325 		}
326 	}
327 
328 	zev_update_blockflag();
329 
330 	mutex_exit(&zev_mutex);
331 
332 	if (ddi_create_minor_node(dip, name,
333 	    S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
334 		mutex_enter(&zev_mutex);
335 		zev_queues[minor - ZEV_MINOR_MIN] = NULL;
336 		zev_queue_cnt--;
337 		ddi_soft_state_free(statep, minor);
338 		ZEV_MEM_SUB(sizeof(zev_queue_t));
339 		zev_update_blockflag();
340 		mutex_exit(&zev_mutex);
341 		return EFAULT;
342 	}
343 
344 	*queue = q;
345 	return 0;
346 }
347 
348 /*
349  * poll() wakeup thread.  Used to check periodically whether we have
350  * bytes left in the queue that have not yet been made into a
351  * pollwakeup() call.  This is meant to insure a maximum waiting
352  * time until an event is presented as a poll wakeup, while at
353  * the same time not making every single event into a poll wakeup
354  * of it's own.
355  */
356 
/*
 * Issue pollwakeup() on every busy queue that has data pending.
 * With flush_all set, every non-empty queue is woken; otherwise only
 * queues whose length exceeds their wakeup threshold.
 */
static void
zev_poll_wakeup(boolean_t flush_all)
{
	zev_queue_t *q;
	int i;

	/*
	 * This loop works with hold() and release() because
	 * pollwakeup() requires us to release our locks before calling it.
	 *
	 * from pollwakeup(9F):
	 *
	 *   "Driver defined locks should not be held across calls
	 *    to this function."
	 */

	/* wake up threads for each individual queue */
	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		if (!q->zq_busy)
			continue;
		if (!q->zq_queue_len)
			continue;
		if ((flush_all) ||
		    (q->zq_queue_len > q->zq_wakeup_threshold)) {
			/* hold keeps q alive while zev_mutex is dropped */
			zev_queue_hold(q);
			mutex_exit(&zev_mutex);
			pollwakeup(&q->zq_pollhead, POLLIN);
			mutex_enter(&zev_mutex);
			zev_queue_release(q);
		}
	}
	mutex_exit(&zev_mutex);
}
394 
395 static void
396 zev_poll_wakeup_thread_main(void)
397 {
398 	while (zev_wakeup_thread_run) {
399 		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
400 
401 		zev_poll_wakeup(B_TRUE);
402 	}
403 	thread_exit();
404 }
405 
406 static int
407 zev_ioc_mute_pool(char *poolname)
408 {
409 	zev_pool_list_entry_t *pe;
410 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
411 	/* pool already muted? */
412 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
413 		if (!strcmp(pe->name, poolname)) {
414 			rw_exit(&zev_pool_list_rwlock);
415 			return EEXIST;
416 		}
417 	}
418 	pe = zev_zalloc(sizeof(*pe));
419 	if (!pe) {
420 		rw_exit(&zev_pool_list_rwlock);
421 		return ENOMEM;
422 	}
423 	(void) strncpy(pe->name, poolname, sizeof(pe->name));
424 	pe->next = zev_muted_pools_head;
425 	zev_muted_pools_head = pe;
426 	rw_exit(&zev_pool_list_rwlock);
427 	return (0);
428 }
429 
430 static int
431 zev_ioc_unmute_pool(char *poolname)
432 {
433 	zev_pool_list_entry_t *pe, *peprev;
434 
435 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
436 	/* pool muted? */
437 	peprev = NULL;
438 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
439 		if (!strcmp(pe->name, poolname))
440 			break;
441 		peprev = pe;
442 	}
443 	if (pe) {
444 		rw_exit(&zev_pool_list_rwlock);
445 		return ENOENT;
446 	}
447 
448 	if (peprev != NULL) {
449 		peprev->next = pe->next;
450 	} else {
451 		zev_muted_pools_head = pe->next;
452 	}
453 	zev_free(pe, sizeof(*pe));
454 	rw_exit(&zev_pool_list_rwlock);
455 	return (0);
456 }
457 
458 int
459 zev_skip_pool(objset_t *os)
460 {
461 	zev_pool_list_entry_t *pe;
462 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
463 	rw_enter(&zev_pool_list_rwlock, RW_READER);
464 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
465 		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
466 			rw_exit(&zev_pool_list_rwlock);
467 			return 1;
468 		}
469 	}
470 	rw_exit(&zev_pool_list_rwlock);
471 	return 0;
472 }
473 
474 int
475 zev_skip_fs(zfsvfs_t *fs)
476 {
477 	dsl_dir_t *d = fs->z_os->os_dsl_dataset->ds_dir;
478 	dsl_dir_t *prev = NULL;
479 
480 	while (d && d != prev) {
481 		if (strstr(d->dd_myname, "_root"))
482 			return 0;
483 		prev = d;
484 		d = d->dd_parent;
485 	}
486 	return 1;
487 }
488 
489 static void
490 zev_update_statistics(int op, zev_statistics_t *stat)
491 {
492 	switch (op) {
493 	case ZEV_OP_ERROR:
494 		stat->zev_cnt_errors++;
495 		break;
496 	case ZEV_OP_MARK:
497 		stat->zev_cnt_marks++;
498 		break;
499 	case ZEV_OP_ZFS_MOUNT:
500 		stat->zev_cnt_zfs_mount++;
501 		break;
502 	case ZEV_OP_ZFS_UMOUNT:
503 		stat->zev_cnt_zfs_umount++;
504 		break;
505 	case ZEV_OP_ZVOL_WRITE:
506 		stat->zev_cnt_zvol_write++;
507 		break;
508 	case ZEV_OP_ZVOL_TRUNCATE:
509 		stat->zev_cnt_zvol_truncate++;
510 		break;
511 	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
512 		stat->zev_cnt_znode_close_after_update++;
513 		break;
514 	case ZEV_OP_ZNODE_CREATE:
515 		stat->zev_cnt_znode_create++;
516 		break;
517 	case ZEV_OP_ZNODE_REMOVE:
518 		stat->zev_cnt_znode_remove++;
519 		break;
520 	case ZEV_OP_ZNODE_LINK:
521 		stat->zev_cnt_znode_link++;
522 		break;
523 	case ZEV_OP_ZNODE_SYMLINK:
524 		stat->zev_cnt_znode_symlink++;
525 		break;
526 	case ZEV_OP_ZNODE_RENAME:
527 		stat->zev_cnt_znode_rename++;
528 		break;
529 	case ZEV_OP_ZNODE_WRITE:
530 		stat->zev_cnt_znode_write++;
531 		break;
532 	case ZEV_OP_ZNODE_TRUNCATE:
533 		stat->zev_cnt_znode_truncate++;
534 		break;
535 	case ZEV_OP_ZNODE_SETATTR:
536 		stat->zev_cnt_znode_setattr++;
537 		break;
538 	case ZEV_OP_ZNODE_ACL:
539 		stat->zev_cnt_znode_acl++;
540 		break;
541 	}
542 }
543 
/*
 * Enqueue a fully-built message into the global message list and account
 * it in every per-device queue.  Takes ownership of msg: it is either
 * linked into the global list or freed (on unknown op).
 *
 * Blocking queues make the caller cv_wait() until there is room;
 * non-blocking queues drop their oldest messages instead.  May trigger
 * pollwakeup() via zev_poll_wakeup() after all locks are dropped.
 */
void
zev_queue_message(int op, zev_msg_t *msg)
{
	zev_queue_t *q;
	int wakeup = 0;
	zev_msg_t *m;
	int i;

	msg->next = NULL;
	msg->prev = NULL;
	msg->read = 0;

	/* reject unknown op codes; report them as an error event instead */
	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
		zev_queue_error(op, "unknown op id encountered: %d", op);
		zev_free(msg, sizeof(*msg) + msg->size);
		return;
	}

	/*
	 * This mutex protects us agains race conditions when several
	 * threads want to queue a message and one or more queues are
	 * full:  we release zev_mutex to wait for the queues to become
	 * less-than-full, but we don't know in which order the waiting
	 * threads will be awoken.  If it's not the same order in which
	 * they went to sleep we might mark different messages as "newest"
	 * in different queues, and so we might have dupes or even
	 * skip messages.
	 */
	mutex_enter(&zev_queue_msg_mutex);

	mutex_enter(&zev_mutex);

	/*
	 * When the module is loaded, the default behavior ist to
	 * put all events into a queue and block if the queue is full.
	 * This is done even before the pseudo device is attached.
	 * This way, no events are lost.
	 *
	 * To discard events entirely the "beaver" queue,
	 * which never discards anything, has to be removed.
	 */

	if (zev_queue_cnt == 0) {
		/* no queues at all - drop the message silently */
		mutex_exit(&zev_mutex);
		mutex_exit(&zev_queue_msg_mutex);
		return;
	}

	/* put message into global queue */
	msg->seq = zev_msg_sequence_number++;

	/* do we need to make room? */
again:
	while (zev_statistics.zev_max_queue_len &&
	    zev_statistics.zev_queue_len > zev_statistics.zev_max_queue_len) {

		if (zev_have_blocking_queues) {
			/* so we have blocking queues.  are they full? */
			for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
				q = zev_queues[i - ZEV_MINOR_MIN];
				if (!q)
					continue;
				if ((q->zq_flags &
				     ZEV_FL_BLOCK_WHILE_QUEUE_FULL) == 0)
					continue;
				if (q->zq_queue_len &&
				    q->zq_queue_len > q->zq_max_queue_len) {
					/* block until queue's been shrunk. */
					cv_wait(&zev_condvar, &zev_mutex);
					goto again;
				}
			}
		}

		/* discard events until this message fits into all queues */

		for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
			q = zev_queues[i - ZEV_MINOR_MIN];
			if (!q)
				continue;
			/* discard msgs until queue is small enough */
			while (q->zq_queue_len &&
			       q->zq_queue_len > q->zq_max_queue_len) {
				m = q->zq_oldest;
				if (m == NULL)
					break;
				q->zq_events_discarded++;
				q->zq_bytes_discarded += m->size;
				q->zq_oldest = m->next;
				q->zq_queue_len -= m->size;
				q->zq_queue_messages--;
			}
		}

		/* free global messages no queue references anymore */
		zev_queue_trim();
		ASSERT(zev_statistics.zev_queue_len == 0 ||
		       zev_statistics.zev_queue_len <=
				zev_statistics.zev_max_queue_len);
	}

	/* append to the global doubly-linked message list */
	if (zev_queue_tail == NULL) {
		zev_queue_head = zev_queue_tail = msg;
	} else {
		zev_queue_tail->next = msg;
		msg->prev = zev_queue_tail;
		zev_queue_tail = msg;
	}
	zev_queue_len++;
	zev_statistics.zev_cnt_total_events++;
	zev_statistics.zev_queue_len += msg->size;

	/* update per-device queues */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;

		/* hold keeps q valid across the cv_wait() below */
		zev_queue_hold(q);

		/* make sure queue has enough room */
		while (q->zq_max_queue_len &&
		       q->zq_queue_len > q->zq_max_queue_len) {

			if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
				/* block until queue has been shrunk. */
				cv_wait(&zev_condvar, &zev_mutex);
			} else {
				/* discard msgs until queue is small enough */
				while (q->zq_queue_len > q->zq_max_queue_len) {
					m = q->zq_oldest;
					if (m == NULL)
						break;
					q->zq_events_discarded++;
					q->zq_bytes_discarded += m->size;
					q->zq_oldest = m->next;
					q->zq_queue_len -= m->size;
					q->zq_queue_messages--;
				}
			}
		}

		/* register new message at the end of the queue */
		q->zq_queue_len += msg->size;
		q->zq_queue_messages++;
		q->zq_bytes_total += msg->size;
		q->zq_events_total++;
		if (q->zq_oldest == NULL)
			q->zq_oldest = msg;

		zev_update_statistics(op, &q->zq_statistics);

		if (q->zq_queue_len > q->zq_wakeup_threshold)
			wakeup = 1;
		if (q->zq_queue_len == msg->size)  /* queue was empty */
			cv_broadcast(&q->zq_condvar);

		zev_queue_release(q);
	}

	/* drop global messages that got discarded from every queue above */
	zev_queue_trim();

	zev_update_statistics(op, &zev_statistics);
	mutex_exit(&zev_mutex);
	mutex_exit(&zev_queue_msg_mutex);

	/* one or more queues need a pollwakeup() */
	if (op == ZEV_OP_MARK) {
		zev_poll_wakeup(B_TRUE);
	} else if (wakeup) {
		zev_poll_wakeup(B_FALSE);
	}

	return;
}
718 
719 void
720 zev_queue_error(int op, char *fmt, ...)
721 {
722 	char buf[ZEV_MAX_MESSAGE_LEN];
723 	va_list ap;
724 	int len;
725 	zev_msg_t *msg = NULL;
726 	zev_error_t *rec;
727 	int msg_size;
728 
729 	va_start(ap, fmt);
730 	len = vsnprintf(buf, sizeof(buf), fmt, ap);
731 	va_end(ap);
732 	if (len >= sizeof(buf)) {
733 		cmn_err(CE_WARN, "zev: can't report error - "
734 		        "dropping event entirely.");
735 		return;
736 	}
737 
738 	msg_size = sizeof(*rec) + len + 1;
739 	msg = zev_alloc(sizeof(*msg) + msg_size);
740 	msg->size = msg_size;
741 	rec = (zev_error_t *)(msg + 1);
742 	rec->record_len = msg_size;
743 	rec->op = ZEV_OP_ERROR;
744 	rec->op_time = ddi_get_time();
745 	rec->guid = 0;
746 	rec->failed_op = op;
747 	rec->errstr_len = len;
748 	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);
749 
750 	zev_queue_message(ZEV_OP_ERROR, msg);
751 	return;
752 }
753 
754 static int
755 zev_find_queue(zev_queue_t **out, zev_queue_t *req_q, zev_queue_name_t *name)
756 {
757 	char namebuf[ZEV_MAX_QUEUE_NAME_LEN+1];
758 	zev_queue_t *q;
759 	int i;
760 
761 	*out = NULL;
762 
763 	if (name->zev_namelen == 0) {
764 		if (req_q->zq_minor_number == ZEV_CONTROL_DEVICE_MINOR)
765 			return EINVAL;
766 		mutex_enter(&zev_mutex);
767 		zev_queue_hold(req_q);
768 		mutex_exit(&zev_mutex);
769 		*out = req_q;
770 		return 0;
771 	}
772 
773 	if (name->zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
774 		return EINVAL;
775 	strncpy(namebuf, name->zev_name, name->zev_namelen);
776 	namebuf[name->zev_namelen] = '\0';
777 
778 	mutex_enter(&zev_mutex);
779 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
780 		q = zev_queues[i - ZEV_MINOR_MIN];
781 		if (!q)
782 			continue;
783 		if (!strcmp(q->zq_name, namebuf)) {
784 			zev_queue_hold(q);
785 			mutex_exit(&zev_mutex);
786 			*out = q;
787 			return 0;
788 		}
789 	}
790 	mutex_exit(&zev_mutex);
791 	return ENOENT;
792 }
793 
794 static int
795 zev_ioc_get_queue_statistics(zev_queue_t *req_q, intptr_t arg, int mode)
796 {
797 	zev_ioctl_get_queue_statistics_t gs;
798 	zev_queue_t *q;
799 	int ret;
800 
801 	if (ddi_copyin((void *)arg, &gs, sizeof(gs), mode) != 0)
802 		return EFAULT;
803 
804 	ret = zev_find_queue(&q, req_q, &gs.zev_queue_name);
805 	if (ret)
806 		return ret;
807 
808 	/* ddi_copyout() can take a long time.  Better make
809 	   a copy to be able to release the mutex faster. */
810 	mutex_enter(&zev_mutex);
811 	memcpy(&gs.zev_statistics, &q->zq_statistics,sizeof(gs.zev_statistics));
812 	gs.zev_statistics.zev_queue_len = q->zq_queue_len;
813 	gs.zev_statistics.zev_bytes_read = q->zq_bytes_read;
814 	gs.zev_statistics.zev_bytes_discarded = q->zq_bytes_discarded;
815 	gs.zev_statistics.zev_max_queue_len = q->zq_max_queue_len;
816 	gs.zev_statistics.zev_cnt_discarded_events = q->zq_events_discarded;
817 	gs.zev_statistics.zev_cnt_total_events = q->zq_events_total;
818 	zev_queue_release(q);
819 	mutex_exit(&zev_mutex);
820 
821 	if (ddi_copyout(&gs, (void *)arg, sizeof(gs), mode) != 0)
822 		return EFAULT;
823 	return 0;
824 }
825 
826 static int
827 zev_ioc_set_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
828 {
829 	zev_ioctl_set_queue_properties_t qp;
830 	zev_queue_t *q;
831 	uint64_t old_max;
832 	uint64_t old_flags;
833 	int ret;
834 
835 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
836 		return EFAULT;
837 	if (qp.zev_max_queue_len > ZEV_MAX_QUEUE_LEN)
838 		return EINVAL;
839 	if (qp.zev_poll_wakeup_threshold > ZEV_MAX_POLL_WAKEUP_QUEUE_LEN)
840 		return EINVAL;
841 
842 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
843 	if (ret)
844 		return ret;
845 
846 	mutex_enter(&zev_mutex);
847 
848 	/*
849 	 * Note: if the PERSISTENT flag is cleared, and the queue is not busy,
850 	 * the queue should be removed by zev_queue_release() in zev_ioctl().
851 	 */
852 	old_flags = qp.zev_flags;
853 	q->zq_flags = qp.zev_flags;
854 	if ((old_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) &&
855 	   (!(qp.zev_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL))) {
856 		/* queue is no longer blocking - wake blocked threads */
857 		cv_broadcast(&zev_condvar);
858 	}
859 
860 	zev_update_blockflag();
861 
862 	old_max = q->zq_max_queue_len;
863 	q->zq_max_queue_len = qp.zev_max_queue_len;
864 	if (q->zq_max_queue_len < old_max)
865 		zev_queue_trim();
866 	if (q->zq_max_queue_len > old_max)
867 		cv_broadcast(&zev_condvar);	/* threads may be waiting */
868 
869 	if ((qp.zev_poll_wakeup_threshold < q->zq_wakeup_threshold) &&
870 	    (qp.zev_poll_wakeup_threshold <= q->zq_queue_len))
871 		pollwakeup(&q->zq_pollhead, POLLIN);
872 	q->zq_wakeup_threshold = qp.zev_poll_wakeup_threshold;
873 
874 	zev_queue_release(q);
875 	mutex_exit(&zev_mutex);
876 	return 0;
877 }
878 
879 static int
880 zev_ioc_get_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
881 {
882 	zev_ioctl_get_queue_properties_t qp;
883 	zev_queue_t *q;
884 	int ret;
885 
886 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
887 		return EFAULT;
888 
889 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
890 	if (ret)
891 		return ret;
892 
893 	mutex_enter(&zev_mutex);
894 	qp.zev_max_queue_len = q->zq_max_queue_len;
895 	qp.zev_flags = q->zq_flags;
896 	qp.zev_poll_wakeup_threshold = q->zq_wakeup_threshold;
897 	zev_queue_release(q);
898 	mutex_exit(&zev_mutex);
899 
900 	if (ddi_copyout(&qp, (void *)arg, sizeof(qp), mode) != 0)
901 		return EFAULT;
902 	return 0;
903 }
904 
905 static int
906 zev_ioc_add_queue(zev_queue_t *req_q, intptr_t arg, int mode)
907 {
908 	zev_ioctl_add_queue_t aq;
909 	zev_queue_t *new_q;
910 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
911 
912 	if (ddi_copyin((void *)arg, &aq, sizeof(aq), mode) != 0)
913 		return EFAULT;
914 
915 	if (aq.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
916 		return EINVAL;
917 	strncpy(name, aq.zev_name, aq.zev_namelen);
918 	name[aq.zev_namelen] = '\0';
919 
920 	return zev_queue_new(&new_q, req_q->zq_dip, name,
921 	                     aq.zev_max_queue_len, aq.zev_flags);
922 }
923 
/*
 * ZEV_IOC_REMOVE_QUEUE: remove a queue by name.  The queue must not be
 * open (busy).  The queue's flags are cleared so zev_queue_release()
 * will really tear it down (persistent flag gone) and writers stop
 * blocking on it.
 * Returns 0, EFAULT, EINVAL, ENOENT or EBUSY.
 */
static int
zev_ioc_remove_queue(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_remove_queue_t rq;
	zev_queue_t *q;
	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
	int found = 0;
	int i;

	if (ddi_copyin((void *)arg, &rq, sizeof(rq), mode) != 0)
		return EFAULT;

	if (rq.zev_queue_name.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
		return EINVAL;
	/* name from userland is not NUL-terminated - terminate our copy */
	strncpy(name, rq.zev_queue_name.zev_name,
	        rq.zev_queue_name.zev_namelen);
	name[rq.zev_queue_name.zev_namelen] = '\0';

	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;
		if (!strcmp(q->zq_name, name)) {
			found = 1;
			break;
		}
	}
	if (!found) {
		mutex_exit(&zev_mutex);
		return ENOENT;
	}

	if (q->zq_busy) {
		mutex_exit(&zev_mutex);
		return EBUSY;
	}
	/*
	 * clear flags, so that persistent queues are removed aswell
	 * and the queue becomes non-blocking.
	 */
	q->zq_flags = 0;
	/* drop the creation reference exactly once */
	if (q->zq_to_be_removed == B_FALSE) {
		q->zq_to_be_removed = B_TRUE;
		zev_queue_release(q);
	}
	/* some threads might be waiting for this queue to become writable */
	cv_broadcast(&zev_condvar);

	mutex_exit(&zev_mutex);
	return 0;
}
976 
977 static int
978 zev_ioc_get_debug_info(zev_queue_t *req_q, intptr_t arg, int mode)
979 {
980 	zev_ioctl_debug_info_t di;
981 	uint64_t mem_allocated = atomic_add_64_nv(&zev_memory_allocated, 0);
982 	uint64_t mem_freed     = atomic_add_64_nv(&zev_memory_freed, 0);
983 
984 	zev_chksum_stats(&di.zev_chksum_cache_size,
985 	                 &di.zev_chksum_cache_hits,
986 	                 &di.zev_chksum_cache_misses);
987 	di.zev_memory_allocated = mem_allocated - mem_freed;
988 	if (ddi_copyout(&di, (void *)arg, sizeof(di), mode) != 0)
989 		return EFAULT;
990 	return 0;
991 }
992 
993 static int
994 zev_ioc_get_queue_list(zev_queue_t *req_q, intptr_t arg, int mode)
995 {
996 	zev_ioctl_get_queue_list_t gql;
997 	zev_queue_t *q;
998 	int i = 0;
999 	int count = 0;
1000 
1001 	memset(&gql, 0, sizeof(gql));
1002 
1003 	mutex_enter(&zev_mutex);
1004 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1005 		q = zev_queues[i - ZEV_MINOR_MIN];
1006 		if (!q)
1007 			continue;
1008 		strncpy(gql.zev_queue_name[count].zev_name,
1009 		    q->zq_name, ZEV_MAX_QUEUE_NAME_LEN);
1010 		gql.zev_queue_name[count].zev_namelen = strlen(q->zq_name);
1011 		count++;
1012 	}
1013 	gql.zev_n_queues = count;
1014 	mutex_exit(&zev_mutex);
1015 
1016 	if (ddi_copyout(&gql, (void *)arg, sizeof(gql), mode) != 0)
1017 		return EFAULT;
1018 	return 0;
1019 }
1020 
1021 static int
1022 zev_ioc_set_max_queue_len(zev_queue_t *req_q, intptr_t arg, int mode)
1023 {
1024 	uint64_t len;
1025 	int i;
1026 	zev_queue_t *q;
1027 
1028 	if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) {
1029 		return EFAULT;
1030 	}
1031 	if (len > ZEV_MAX_QUEUE_LEN) {
1032 		return EINVAL;
1033 	}
1034 	mutex_enter(&zev_mutex);
1035 	zev_statistics.zev_max_queue_len = len;
1036 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1037 		q = zev_queues[i - ZEV_MINOR_MIN];
1038 		if (!q)
1039 			continue;
1040 		if (q->zq_max_queue_len <=
1041 		    zev_statistics.zev_max_queue_len)
1042 			continue;
1043 		q->zq_max_queue_len = zev_statistics.zev_max_queue_len;
1044 	}
1045 	cv_broadcast(&zev_condvar);
1046 	mutex_exit(&zev_mutex);
1047 	return 0;
1048 }
1049 
1050 static int
1051 zev_ioc_get_zev_version(intptr_t arg, int mode)
1052 {
1053 	zev_ioctl_get_zev_version vi;
1054 	vi.zev_major_version = ZEV_MAJOR_VERSION;
1055 	vi.zev_minor_version = ZEV_MINOR_VERSION;
1056 	if (ddi_copyout(&vi, (void *)arg, sizeof(vi), mode) != 0)
1057 		return EFAULT;
1058 	return 0;
1059 }
1060 
1061 /* ARGSUSED */
1062 static int
1063 zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1064 {
1065 	zev_statistics_t zs;
1066 	zev_ioctl_poolarg_t pa;
1067 	zev_ioctl_mark_t mark;
1068 	zev_mark_t *rec;
1069 	int msg_size;
1070 	zev_msg_t *msg;
1071 	uint64_t mark_id;
1072 	minor_t minor;
1073 	zev_queue_t *req_q;
1074 	int ret = 0;
1075 
1076 	minor = getminor(dev);
1077 	mutex_enter(&zev_mutex);
1078 	if ((req_q = ddi_get_soft_state(statep, minor)) == NULL) {
1079 		mutex_exit(&zev_mutex);
1080 		return (ENXIO);
1081 	}
1082 	zev_queue_hold(req_q);
1083 	mutex_exit(&zev_mutex);
1084 	/*
1085 	 * all structures passed between kernel and userspace
1086 	 * are now compatible between 64 and 32 bit.  Model
1087 	 * conversion can be ignored.
1088 	 */
1089 	switch (cmd) {
1090 	case ZEV_IOC_GET_GLOBAL_STATISTICS:
1091 		/* ddi_copyout() can take a long time.  Better make
1092 		   a copy to be able to release the mutex faster. */
1093 		mutex_enter(&zev_mutex);
1094 		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
1095 		mutex_exit(&zev_mutex);
1096 		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
1097 			ret = EFAULT;
1098 		break;
1099 	case ZEV_IOC_GET_QUEUE_STATISTICS:
1100 		ret = zev_ioc_get_queue_statistics(req_q, arg, mode);
1101 		break;
1102 	case ZEV_IOC_MUTE_POOL:
1103 	case ZEV_IOC_UNMUTE_POOL:
1104 		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) {
1105 			ret = EFAULT;
1106 			break;
1107 		}
1108 		if (pa.zev_poolname_len >=MAXPATHLEN) {
1109 			ret = EINVAL;
1110 			break;
1111 		}
1112 		pa.zev_poolname[pa.zev_poolname_len] = '\0';
1113 		if (cmd == ZEV_IOC_MUTE_POOL) {
1114 			ret = zev_ioc_mute_pool(pa.zev_poolname);
1115 		} else {
1116 			ret = zev_ioc_unmute_pool(pa.zev_poolname);
1117 		}
1118 		break;
1119 	case ZEV_IOC_SET_MAX_QUEUE_LEN:
1120 		ret = zev_ioc_set_max_queue_len(req_q, arg, mode);
1121 		break;
1122 	case ZEV_IOC_GET_QUEUE_PROPERTIES:
1123 		ret = zev_ioc_get_queue_properties(req_q, arg, mode);
1124 		break;
1125 	case ZEV_IOC_SET_QUEUE_PROPERTIES:
1126 		ret = zev_ioc_set_queue_properties(req_q, arg, mode);
1127 		break;
1128 	case ZEV_IOC_MARK:
1129 		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) {
1130 			ret = EFAULT;
1131 			break;
1132 		}
1133 		/* prepare message */
1134 		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
1135 		msg = zev_alloc(sizeof(*msg) + msg_size);
1136 		msg->size = msg_size;
1137 		rec = (zev_mark_t *)(msg + 1);
1138 		rec->record_len = msg_size;
1139 		rec->op = ZEV_OP_MARK;
1140 		rec->op_time = ddi_get_time();
1141 		rec->guid = mark.zev_guid;
1142 		rec->payload_len = mark.zev_payload_len;
1143 		/* get payload */
1144 		if (ddi_copyin(((char *)arg) + sizeof(mark),
1145 		               ZEV_PAYLOAD(rec),
1146 		               mark.zev_payload_len, mode) != 0) {
1147 			zev_free(msg, msg_size);
1148 			ret = EFAULT;
1149 			break;
1150 		}
1151 		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
1152 		/* get mark id and queue message */
1153 		mutex_enter(&zev_mark_id_mutex);
1154 		mark_id = zev_mark_id++;
1155 		mutex_exit(&zev_mark_id_mutex);
1156 		rec->mark_id = mark_id;
1157 		zev_queue_message(ZEV_OP_MARK, msg);
1158 		/* report mark id to userland, ignore errors */
1159 		mark.zev_mark_id = mark_id;
1160 		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
1161 		break;
1162 	case ZEV_IOC_ADD_QUEUE:
1163 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1164 			ret = EACCES;
1165 			break;
1166 		}
1167 		ret = zev_ioc_add_queue(req_q, arg, mode);
1168 		break;
1169 	case ZEV_IOC_REMOVE_QUEUE:
1170 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1171 			ret = EACCES;
1172 			break;
1173 		}
1174 		ret = zev_ioc_remove_queue(req_q, arg, mode);
1175 		break;
1176 	case ZEV_IOC_GET_DEBUG_INFO:
1177 		ret = zev_ioc_get_debug_info(req_q, arg, mode);
1178 		break;
1179 	case ZEV_IOC_GET_QUEUE_LIST:
1180 		ret = zev_ioc_get_queue_list(req_q, arg, mode);
1181 		break;
1182 	case ZEV_IOC_GET_FILE_SIGNATURES:
1183 		ret = zev_ioc_get_signatures(arg, mode);
1184 		break;
1185 	case ZEV_IOC_GET_ZEV_VERSION:
1186 		ret = zev_ioc_get_zev_version(arg, mode);
1187 		break;
1188 	default:
1189 		/* generic "ioctl unknown" error */
1190 		ret = ENOTTY;
1191 	}
1192 
1193 	mutex_enter(&zev_mutex);
1194 	zev_queue_release(req_q);
1195 	mutex_exit(&zev_mutex);
1196 	if (ret)
1197 		SET_ERROR(ret);
1198 	return (ret);
1199 }
1200 
1201 static int
1202 zev_chpoll(dev_t dev, short events, int anyyet,
1203     short *reventsp, struct pollhead **phpp)
1204 {
1205 	int minor;
1206 	short revent = 0;
1207 	zev_queue_t *q;
1208 
1209 	/* use minor-specific queue context and it's pollhead */
1210 	minor = getminor(dev);
1211 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1212 		return (EINVAL);
1213 	mutex_enter(&zev_mutex);
1214 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1215 		mutex_exit(&zev_mutex);
1216 		return (ENXIO);
1217 	}
1218 	revent = 0;
1219 	if ((events & POLLIN)) {
1220 		if (q->zq_oldest)
1221 			revent |= POLLIN;
1222 	}
1223 	if (revent == 0) {
1224 		if (!anyyet) {
1225 			*phpp = &q->zq_pollhead;
1226 		}
1227 	}
1228 	*reventsp = revent;
1229 	mutex_exit(&zev_mutex);
1230 	return (0);
1231 }
1232 
/* ARGSUSED */
/*
 * read(2) entry point: copy queued event messages to userland.
 *
 * Only whole messages are delivered.  Blocks until at least one message
 * is available, unless this thread cannot take signals (e.g. it is
 * being torn down by exit()), in which case 0 is returned immediately.
 * Returns E2BIG if the caller's buffer cannot hold even the oldest
 * message.  The file offset is meaningless for this device and is
 * restored before returning.
 */
static int
zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
{
	minor_t minor;
	offset_t off;
	int ret = 0;
	zev_msg_t *msg;
	char *data;
	zev_queue_t *q;

	minor = getminor(dev);
	/* the control device carries no event stream */
	if (minor == ZEV_CONTROL_DEVICE_MINOR)
		return (EINVAL);

	mutex_enter(&zev_mutex);
	q = ddi_get_soft_state(statep, minor);
	if (q == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	off = uio_p->uio_loffset;
	msg = q->zq_oldest;
	while (msg == NULL) {
		if (!ddi_can_receive_sig()) {
			/*
			 * read() shouldn't block because this thread
			 * can't receive signals. (e.g., it might be
			 * torn down by exit() right now.)
			 */
			mutex_exit(&zev_mutex);
			return 0;
		}
		if (cv_wait_sig(&q->zq_condvar, &zev_mutex) == 0) {
			/* signal received. */
			mutex_exit(&zev_mutex);
			return EINTR;
		}
		msg = q->zq_oldest;
	}
	/* the buffer must fit at least the oldest message as a whole */
	if (msg->size > uio_p->uio_resid) {
		mutex_exit(&zev_mutex);
		return E2BIG;
	}
	/* drain as many whole messages as the buffer can take */
	while (msg && uio_p->uio_resid >= msg->size) {
		data = (char *)(msg + 1);
		/* NOTE(review): uiomove() is called with zev_mutex held */
		ret = uiomove(data, msg->size, UIO_READ, uio_p);
		if (ret != 0) {
			mutex_exit(&zev_mutex);
			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
			uio_p->uio_loffset = off;
			return (ret);
		}
		/* message delivered: unlink from this queue, update stats */
		q->zq_oldest = msg->next;
		q->zq_bytes_read += msg->size;
		q->zq_queue_len -= msg->size;
		q->zq_queue_messages--;
		msg->read++;
		msg = q->zq_oldest;
	}
	/*
	 * Trim the global message list (presumably frees messages that
	 * every queue has read — confirm in zev_queue_trim()), then wake
	 * writers that block while the queue is full.
	 */
	zev_queue_trim();
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);
	/* offset has no meaning for this device; restore it */
	uio_p->uio_loffset = off;
	return 0;
}
1299 
1300 /* ARGSUSED */
1301 static int
1302 zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
1303 {
1304 	zev_queue_t *q;
1305 	int minor;
1306 
1307 	minor = getminor(dev);
1308 	if (otyp != OTYP_CHR)
1309 		return (EINVAL);
1310 	mutex_enter(&zev_mutex);
1311 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1312 		mutex_exit(&zev_mutex);
1313 		return (ENXIO);
1314 	}
1315 	if (q->zq_busy != B_TRUE) {
1316 		mutex_exit(&zev_mutex);
1317 		return (EINVAL);
1318 	}
1319 	q->zq_busy = B_FALSE;
1320 	if ((q->zq_flags & ZEV_FL_PERSISTENT) == 0)
1321 		zev_queue_release(q);
1322 	mutex_exit(&zev_mutex);
1323 	return (0);
1324 }
1325 
1326 /* ARGSUSED */
1327 static int
1328 zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1329 {
1330 	zev_queue_t *q;
1331 	minor_t minor;
1332 
1333 	minor = getminor(*devp);
1334 	if (otyp != OTYP_CHR)
1335 		return (EINVAL);
1336 	if (drv_priv(credp) != 0)
1337 		return (EPERM);
1338 	mutex_enter(&zev_mutex);
1339 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1340 		mutex_exit(&zev_mutex);
1341 		return (ENXIO);
1342 	}
1343 	if (minor == ZEV_CONTROL_DEVICE_MINOR) {
1344 		/* control device may be used in parallel */
1345 		q->zq_busy = B_TRUE;
1346 		mutex_exit(&zev_mutex);
1347 		return 0;
1348 	}
1349 	if (q->zq_busy == B_TRUE) {
1350 		mutex_exit(&zev_mutex);
1351 		return (EBUSY);
1352 	}
1353 	q->zq_busy = B_TRUE;	/* can only be opened exclusively */
1354 	mutex_exit(&zev_mutex);
1355 	return (0);
1356 }
1357 
/*
 * Character device entry points for the /dev/zev nodes.  Write,
 * strategy, devmap/mmap/segmap and async I/O are not supported;
 * D_MP declares the driver safe for concurrent entry.
 */
static struct cb_ops zev_cb_ops = {
	zev_open,		/* open */
	zev_close,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	zev_read,		/* read */
	nodev,			/* write */
	zev_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	zev_chpoll,		/* chpoll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* streamtab */
	D_MP | D_64BIT,		/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* aread */
	nodev,			/* awrite */
};
1378 
1379 static void
1380 zev_free_instance(dev_info_t *dip)
1381 {
1382 	int instance;
1383 	zev_queue_t *q;
1384 	int i;
1385 
1386 	instance = ddi_get_instance(dip);
1387 	if (instance != 0) {
1388 		cmn_err(CE_WARN, "zev: tried to free instance != 0 (%d)",
1389 		        instance);
1390 		return;
1391 	}
1392 
1393 	ddi_remove_minor_node(dip, NULL);
1394 	devfs_clean(q->zq_dip, NULL, 0);
1395 
1396 	/* stop pollwakeup thread */
1397 	zev_wakeup_thread_run = 0;
1398 	if (zev_poll_wakeup_thread != NULL) {
1399 		thread_join(zev_poll_wakeup_thread->t_did);
1400 		zev_poll_wakeup_thread = NULL;
1401 	}
1402 
1403 	mutex_enter(&zev_mutex);
1404 
1405 	/* remove "ctrl" dummy queue */
1406 	q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1407 	if (q) {
1408 		ddi_soft_state_free(statep, ZEV_CONTROL_DEVICE_MINOR);
1409 		ZEV_MEM_SUB(sizeof(zev_queue_t));
1410 	}
1411 
1412 	/* remove all other queues */
1413 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1414 		q = zev_queues[i- ZEV_MINOR_MIN];
1415 		if (!q)
1416 			continue;
1417 		ASSERT(q->zq_refcnt == 1);
1418 		zev_queue_release(q);
1419 	}
1420 	zev_queue_trim();
1421 	bzero(&zev_queues, sizeof(zev_queues));
1422 
1423 	mutex_exit(&zev_mutex);
1424 
1425 }
1426 
1427 static int
1428 zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1429 {
1430 	int instance;
1431 	zev_queue_t *q;
1432 
1433 	/* called once per instance with DDI_DETACH,
1434 	   may be called to suspend */
1435 	switch (cmd) {
1436 	case DDI_DETACH:
1437 		/* instance busy? */
1438 		instance = ddi_get_instance(dip);
1439 		if (instance != 0) {	/* hardcoded in zev.conf */
1440 			/* this module only supports one instance. */
1441 			return (DDI_FAILURE);
1442 		}
1443 
1444 		mutex_enter(&zev_mutex);
1445 		if (!zev_attached) {
1446 			mutex_exit(&zev_mutex);
1447 			return (DDI_FAILURE);
1448 		}
1449 
1450 		/* check "ctrl" queue to see if t is busy */
1451 		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1452 		if (q == NULL) {
1453 			mutex_exit(&zev_mutex);
1454 			return (DDI_FAILURE);
1455 		}
1456 		if (q->zq_busy) {
1457 			mutex_exit(&zev_mutex);
1458 			return (DDI_FAILURE);
1459 		}
1460 		/* are there any queues? */
1461 		if (zev_queue_cnt > 0) {
1462 			mutex_exit(&zev_mutex);
1463 			return (DDI_FAILURE);
1464 		}
1465 
1466 		zev_attached = B_FALSE;
1467 		mutex_exit(&zev_mutex);
1468 
1469 		/* switch ZFS event callbacks back to default */
1470 		rw_enter(&rz_zev_rwlock, RW_WRITER);
1471 		rz_zev_callbacks = rz_zev_default_callbacks;
1472 		rz_zev_set_active(B_FALSE);
1473 		rw_exit(&rz_zev_rwlock);
1474 
1475 		/* no thread is inside of the callbacks anymore. */
1476 
1477 		/* free resources allocated for this instance */
1478 		zev_free_instance(dip);
1479 		zev_chksum_fini();
1480 #if 0
1481 		cmn_err(CE_WARN, "zev: allocated memory at detach: %" PRIu64,
1482 			zev_memory_allocated - zev_memory_freed);
1483 #endif
1484 		return (DDI_SUCCESS);
1485 	case DDI_SUSPEND:
1486 		/* kernel must not suspend zev devices while ZFS is running */
1487 		return (DDI_FAILURE);
1488 	default:
1489 		return (DDI_FAILURE);
1490 	}
1491 }
1492 
1493 static int
1494 zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1495 {
1496 	/* called once per instance with DDI_ATTACH,
1497 	   may be called to resume */
1498 	int instance;
1499 	int error;
1500 	zev_queue_t *q;
1501 	switch (cmd) {
1502 	case DDI_ATTACH:
1503 		/* create instance state */
1504 		instance = ddi_get_instance(dip);
1505 		if (instance != 0) {	/* hardcoded in zev.conf */
1506 			/* this module only supports one instance. */
1507 			return (DDI_FAILURE);
1508 		}
1509 
1510 		mutex_enter(&zev_mutex);
1511 		if (zev_attached) {
1512 			mutex_exit(&zev_mutex);
1513 			return (DDI_FAILURE);
1514 		}
1515 		if (ddi_soft_state_zalloc(statep, ZEV_CONTROL_DEVICE_MINOR) !=
1516 		    DDI_SUCCESS) {
1517 			mutex_exit(&zev_mutex);
1518 			return (DDI_FAILURE);
1519 		}
1520 		ZEV_MEM_ADD(sizeof(zev_queue_t));
1521 		zev_attached = B_TRUE;
1522 
1523 		/* init queue list */
1524 		bzero(&zev_queues, sizeof(zev_queues));
1525 		mutex_exit(&zev_mutex);
1526 
1527 		/* create a dummy queue for management of "ctrl" */
1528 
1529 		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1530 		q->zq_dip = dip;
1531 		q->zq_refcnt = 1;
1532 		q->zq_busy = B_FALSE;
1533 		q->zq_minor_number = ZEV_CONTROL_DEVICE_MINOR;
1534 		q->zq_flags = ZEV_FL_PERSISTENT;
1535 		strcpy(q->zq_name, ZEV_CONTROL_DEVICE_NAME);
1536 
1537 		/* create device node for "ctrl" */
1538 		if (ddi_create_minor_node(dip, ZEV_CONTROL_DEVICE_NAME,
1539 		    S_IFCHR, ZEV_CONTROL_DEVICE_MINOR,
1540 		    DDI_PSEUDO, 0) == DDI_FAILURE) {
1541 			goto fail;
1542 		}
1543 
1544 		/* note: intentionally not adding ctrl queue to queue list. */
1545 
1546 		/* default queue */
1547 		error = zev_queue_new(&q, dip,
1548 				      ZEV_DEFAULT_QUEUE_NAME,
1549 				      ZEV_MAX_QUEUE_LEN,
1550 				      ZEV_FL_BLOCK_WHILE_QUEUE_FULL|
1551 		                      ZEV_FL_PERSISTENT);
1552 		if (error)
1553 			goto fail;
1554 
1555 		/* start pollwakeup thread */
1556 		zev_wakeup_thread_run = 1;
1557 		zev_poll_wakeup_thread = thread_create(NULL, 0,
1558 		    zev_poll_wakeup_thread_main, NULL, 0, &p0,
1559 		    TS_RUN, minclsyspri);
1560 
1561 		ddi_report_dev(dip);
1562 
1563 		zev_chksum_init();
1564 
1565 		/* switch ZFS event callbacks to zev module callbacks */
1566 		rw_enter(&rz_zev_rwlock, RW_WRITER);
1567 		rz_zev_callbacks = &zev_callbacks;
1568 		rz_zev_set_active(B_TRUE);
1569 		rw_exit(&rz_zev_rwlock);
1570 
1571 		return (DDI_SUCCESS);
1572 	case DDI_RESUME:
1573 		/* suspendeding zev devices should never happen */
1574 		return (DDI_SUCCESS);
1575 	default:
1576 		return (DDI_FAILURE);
1577 	}
1578 fail:
1579 	cmn_err(CE_WARN, "zev: attach failed");
1580 	zev_free_instance(dip);
1581 	mutex_enter(&zev_mutex);
1582 	zev_attached = B_FALSE;
1583 	mutex_exit(&zev_mutex);
1584 	return (DDI_FAILURE);
1585 }
1586 
1587 /* ARGSUSED */
1588 static int
1589 zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
1590 {
1591 	minor_t minor;
1592 	zev_queue_t *q;
1593 
1594 	/* arg is dev_t */
1595 	minor = getminor((dev_t)arg);
1596 	mutex_enter(&zev_mutex);
1597 	q = ddi_get_soft_state(statep, minor);
1598 	if (q == NULL) {
1599 		*resultp = NULL;
1600 		mutex_exit(&zev_mutex);
1601 		return (DDI_FAILURE);
1602 	}
1603 
1604 	switch (infocmd) {
1605 	case DDI_INFO_DEVT2DEVINFO:
1606 		*resultp = q->zq_dip;
1607 		break;
1608 	case DDI_INFO_DEVT2INSTANCE:
1609 		*resultp = (void *)(uintptr_t)ddi_get_instance(q->zq_dip);
1610 		break;
1611 	default:
1612 		mutex_exit(&zev_mutex);
1613 		return (DDI_FAILURE);
1614 	}
1615 	mutex_exit(&zev_mutex);
1616 	return (DDI_SUCCESS);
1617 }
1618 
/* autoconfiguration entry points for this pseudo driver */
static struct dev_ops zev_dev_ops = {
	DEVO_REV,			/* driver build revision */
	0,				/* driver reference count */
	zev_getinfo,			/* getinfo */
	nulldev,			/* identify (obsolete) */
	nulldev,			/* probe (search for devices) */
	zev_attach,			/* attach */
	zev_detach,			/* detach */
	nodev,				/* reset (obsolete, use quiesce) */
	&zev_cb_ops,			/* character and block device ops */
	NULL,				/* bus driver ops */
	NULL,				/* power management, not needed */
	ddi_quiesce_not_needed,		/* quiesce */
};

/* loadable-module linkage: this is a device driver module */
static struct modldrv zev_modldrv = {
	&mod_driverops,			/* all loadable modules use this */
	"ZFS event provider, v"
		XSTRING(ZEV_MAJOR_VERSION) "."
		XSTRING(ZEV_MINOR_VERSION),
					/* driver name and version info */
	&zev_dev_ops			/* ops method pointers */
};

/* module linkage passed to mod_install()/mod_remove() */
static struct modlinkage zev_modlinkage = {
	MODREV_1,	/* fixed value */
	{
		&zev_modldrv,	/* driver linkage structure */
		NULL		/* list terminator */
	}
};
1650 
1651 int
1652 _init(void)
1653 {
1654 	int error;
1655 
1656 	if ((error = ddi_soft_state_init(&statep, sizeof(zev_queue_t), 1)) != 0)
1657 		return (error);
1658 	zev_attached = B_FALSE;
1659 
1660 	zev_queue_head = NULL;
1661 	zev_queue_tail = NULL;
1662 	zev_queue_len = 0;
1663 	zev_muted_pools_head = NULL;
1664 	zev_memory_allocated = 0;
1665 	zev_memory_freed = 0;
1666 	zev_queue_cnt = 0;
1667 	zev_have_blocking_queues = 1;
1668 
1669 	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
1670 	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
1671 	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
1672 	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
1673 	zev_mark_id = gethrtime();
1674 	mutex_init(&zev_queue_msg_mutex, NULL, MUTEX_DRIVER, NULL);
1675 	zev_msg_sequence_number = gethrtime();
1676 	bzero(&zev_statistics, sizeof(zev_statistics));
1677 	bzero(&zev_pollhead, sizeof(zev_pollhead));
1678 	bzero(&zev_queues, sizeof(zev_queues));
1679 	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
1680 	if (zev_ioc_mute_pool("zg0")) {
1681 		cmn_err(CE_WARN, "zev: could not init mute list");
1682 		goto FAIL;
1683 	}
1684 
1685 	if ((error = mod_install(&zev_modlinkage)) != 0) {
1686 		cmn_err(CE_WARN, "zev: could not install module");
1687 		goto FAIL;
1688 	}
1689 
1690 	return (0);
1691 FAIL:
1692 	/* free resources */
1693 	cmn_err(CE_WARN, "zev: _init failed");
1694 	mutex_destroy(&zev_mutex);
1695 	ddi_soft_state_fini(&statep);
1696 	return (error);
1697 }
1698 
/*
 * Module info hook: report module details to modinfo(1M).
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&zev_modlinkage, modinfop));
}
1704 
/*
 * Module unload hook.  Fails with EBUSY while the driver instance is
 * still attached or while event queues exist.  Otherwise drains the
 * global event list (to unblock writers), restores the default ZFS
 * callbacks, removes the module and destroys all global state.
 */
int
_fini(void)
{
	int error = 0;
	zev_msg_t *msg;
	zev_pool_list_entry_t *pe, *npe;

	mutex_enter(&zev_mutex);
	if (zev_attached == B_TRUE) {
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}
	if (zev_queue_cnt != 0) {
		/* should never happen */
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}

	/*
	 * avoid deadlock if event list is full: make sure threads currently
	 * blocking on the event list can append their event and then release
	 * rz_zev_rwlock.  Since there should be no queues left when we
	 * reach this point we can simply empty the event list and then
	 * wake everybody.
	 */
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		zev_free(msg, sizeof(*msg) + msg->size);
	}
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);

	/* switch ZFS event callbacks back to default (again) */
	rw_enter(&rz_zev_rwlock, RW_WRITER);
	rz_zev_callbacks = rz_zev_default_callbacks;
	rz_zev_set_active(B_FALSE);
	rw_exit(&rz_zev_rwlock);

	/* no thread is inside of the callbacks anymore.  Safe to remove. */

	/* unload module callbacks */
	if ((error = mod_remove(&zev_modlinkage)) != 0) {
		cmn_err(CE_WARN, "mod_remove failed: %d", error);
		return (error);
	}

	/* free resources */
	/* drain again: events may have been queued before mod_remove() */
	mutex_enter(&zev_mutex);
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		zev_free(msg, sizeof(*msg) + msg->size);
	}
	mutex_exit(&zev_mutex);
	/* free the muted-pool list built by zev_ioc_mute_pool() */
	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	pe = zev_muted_pools_head;
	while (pe) {
		npe = pe;
		pe = pe->next;
		zev_free(npe, sizeof(*npe));
	}
	rw_exit(&zev_pool_list_rwlock);
	ddi_soft_state_fini(&statep);
	rw_destroy(&zev_pool_list_rwlock);
	cv_destroy(&zev_condvar);
	mutex_destroy(&zev_mutex);
	mutex_destroy(&zev_mark_id_mutex);
	mutex_destroy(&zev_queue_msg_mutex);

	return (0);
}
1777 
1778