xref: /titanic_52/usr/src/uts/common/fs/zev/zev.c (revision dc1d6f1ab5062f454c33cd32797a70ed46a514c5)
1 #include <sys/modctl.h>
2 #include <sys/ddi.h>
3 #include <sys/sunddi.h>
4 #include <sys/conf.h>
5 #include <sys/devops.h>
6 #include <sys/stat.h>
7 #include <sys/fs/zev.h>
8 #include <sys/zev_callbacks.h>
9 #include <sys/zev_checksums.h>
10 #include <sys/zfs_znode.h>
11 #include <sys/time.h>
12 #include <sys/sa.h>
13 #include <sys/zap.h>
14 #include <sys/time.h>
15 
16 #define	OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))
17 
18 #define XSTRING(x)	STRING(x)
19 #define STRING(x)	#x
20 
21 #define ZEV_DEFAULT_QUEUE_NAME		"beaver"
22 #define ZEV_CONTROL_DEVICE_MINOR	0
23 #define ZEV_MINOR_MIN			(ZEV_CONTROL_DEVICE_MINOR + 1)
24 #define ZEV_MINOR_MAX			(ZEV_MINOR_MIN + ZEV_MAX_QUEUES - 1)
25 
26 typedef struct zev_queue {
27 	char			zq_name[ZEV_MAX_QUEUE_NAME_LEN+1];
28 	minor_t			zq_minor_number;
29 	dev_info_t		*zq_dip;
30 	struct pollhead		zq_pollhead;
31 	uint64_t		zq_bytes_read;
32 	uint64_t		zq_events_read;
33 	uint64_t		zq_bytes_discarded;
34 	uint64_t		zq_events_discarded;
35 	uint64_t		zq_bytes_total;
36 	uint64_t		zq_events_total;
37 	uint64_t		zq_wakeup_threshold;
38 	uint16_t		zq_flags;
39 	uint16_t		zq_need_wakeup;
40 	/* protected by zev_mutex */
41 	int			zq_refcnt;
42 	uint64_t		zq_queue_len;
43 	uint64_t		zq_queue_messages;
44 	uint64_t		zq_max_queue_len;
45 	zev_msg_t		*zq_oldest;
46 	boolean_t		zq_busy;
47 	boolean_t		zq_to_be_removed;
48 	zev_statistics_t	zq_statistics;
49 	kcondvar_t		zq_condvar;
50 } zev_queue_t;
51 
52 static void		*statep;
53 struct pollhead		zev_pollhead;
54 
55 kmutex_t		zev_mutex;
56 kcondvar_t		zev_condvar;
57 kmutex_t		zev_queue_msg_mutex;
58 krwlock_t		zev_pool_list_rwlock;
59 static zev_statistics_t	zev_statistics;
60 static boolean_t	zev_attached;
61 static kmutex_t		zev_mark_id_mutex;
62 static uint64_t		zev_mark_id = 0;
63 
64 static uint64_t		zev_msg_sequence_number = 0;
65 static zev_queue_t	*zev_queues[ZEV_MAX_QUEUES];
66 static int		zev_queue_cnt = 0;
67 static int		zev_have_blocking_queues = 1;
68 
69 uint64_t	zev_memory_allocated = 0;
70 uint64_t	zev_memory_freed = 0;
71 
72 /*
73  * The longest potential message is from zev_zfs_mount() and
74  * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
75  *
76  * Another candidate is zev_znode_rename_cb() and contains three inode
77  * numbers and two filenames of up to MAXNAMELEN bytes each.
78  */
79 #define ZEV_MAX_MESSAGE_LEN	4096
80 
81 static zev_msg_t *zev_queue_head = NULL;
82 static zev_msg_t *zev_queue_tail = NULL;
83 static uint64_t zev_queue_len = 0;
84 
85 
86 typedef struct zev_pool_list_entry {
87 	struct zev_pool_list_entry	*next;
88 	char				name[MAXPATHLEN];
89 } zev_pool_list_entry_t;
90 
91 static zev_pool_list_entry_t *zev_muted_pools_head = NULL;
92 
93 static volatile int zev_wakeup_thread_run = 1;
94 static kthread_t *zev_poll_wakeup_thread = NULL;
95 
96 void *
97 zev_alloc(ssize_t sz)
98 {
99 	ZEV_MEM_ADD(sz);
100 	return kmem_alloc(sz, KM_SLEEP);
101 }
102 
103 void *
104 zev_zalloc(ssize_t sz)
105 {
106 	ZEV_MEM_ADD(sz);
107 	return kmem_zalloc(sz, KM_SLEEP);
108 }
109 
110 void
111 zev_free(void *ptr, ssize_t sz)
112 {
113 	ZEV_MEM_SUB(sz);						\
114 	kmem_free(ptr, sz);
115 }
116 
117 /* must be called with zev_mutex held */
118 static void
119 zev_update_blockflag(void)
120 {
121 	zev_queue_t *q;
122 	int had_blocking_queues;
123 	int i;
124 
125 	had_blocking_queues = zev_have_blocking_queues;
126 
127 	/* do we still have blocking queues? */
128 	zev_have_blocking_queues = 0;
129 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
130 		q = zev_queues[i - ZEV_MINOR_MIN];
131 		if (!q)
132 			continue;
133 		if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
134 			zev_have_blocking_queues = 1;
135 			break;
136 		}
137 	}
138 	/* no blocking queues */
139 	if (had_blocking_queues)
140 		cv_broadcast(&zev_condvar);
141 }
142 
143 int
144 zev_queue_cmp(const void *a, const void *b)
145 {
146 	const zev_queue_t *qa = a;
147 	const zev_queue_t *qb = b;
148 	if (qa->zq_minor_number > qb->zq_minor_number)
149 		return 1;
150 	if (qa->zq_minor_number < qb->zq_minor_number)
151 		return -1;
152 	return 0;
153 }
154 
/* must be called with zev_mutex held */
/*
 * Discard messages from the global message list that no per-device
 * queue references anymore, i.e. everything strictly older than the
 * oldest message still reachable via some queue's zq_oldest pointer.
 */
void
zev_queue_trim(void)
{
	zev_msg_t *m;
	uint64_t oldest_message;
	zev_queue_t *q;
	int i;

	if (!zev_queue_tail)
		return;

	/* start one past the newest seq; queues can only lower this */
	oldest_message = zev_queue_tail->seq + 1;  /* does not exist, yet. */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		if (!q->zq_oldest)
			continue;
		if (oldest_message > q->zq_oldest->seq)
			oldest_message = q->zq_oldest->seq;
	}

	/* remove msgs between oldest_message and zev_queue_head */
	while(zev_queue_head && (oldest_message > zev_queue_head->seq)) {
		m = zev_queue_head;
		zev_queue_head = m->next;
		if (zev_queue_head == NULL) {
			zev_queue_tail = NULL;
		} else {
			zev_queue_head->prev = NULL;
		}
		/* messages nobody ever read count as discarded */
		if (m->read == 0) {
			zev_statistics.zev_bytes_discarded += m->size;
			zev_statistics.zev_cnt_discarded_events++;
		}
		zev_statistics.zev_queue_len -= m->size;
		zev_queue_len--;
		zev_free(m, sizeof(*m) + m->size);
	}
}
196 
/* must be called with zev_mutex held */
/* Take a reference on q, preventing it from being torn down. */
static void
zev_queue_hold(zev_queue_t *q)
{
	q->zq_refcnt++;
}
203 
/* must be called with zev_mutex held */
/*
 * Drop a reference on q.  When the last reference goes away the
 * queue is torn down: removed from the queue list, its unreferenced
 * messages trimmed, its minor node and soft state freed.  Persistent
 * queues survive even at refcount zero.
 */
static void
zev_queue_release(zev_queue_t *q)
{
	q->zq_refcnt--;
	if (q->zq_refcnt > 0)
		return;

	ASSERT(q->zq_busy == B_FALSE);

	/* persistent queues will not be removed */
	if ((q->zq_flags & ZEV_FL_PERSISTENT) != 0)
		return;

	/* remove queue from queue list */
	zev_queues[q->zq_minor_number - ZEV_MINOR_MIN] = NULL;

	/* discard messages that no queue references anymore */
	zev_queue_trim();

	cv_destroy(&q->zq_condvar);
	ddi_remove_minor_node(q->zq_dip, q->zq_name);
	/* the queue struct itself lives in the soft state slot */
	ddi_soft_state_free(statep, q->zq_minor_number);
	ZEV_MEM_SUB(sizeof(zev_queue_t));
	zev_queue_cnt--;
	/* fewer queues may mean no blocking queues remain */
	zev_update_blockflag();
}
231 
232 int
233 zev_queue_new(zev_queue_t **queue,
234               dev_info_t *dip,
235               char *name,
236               uint64_t max_queue_len,
237               uint16_t flags)
238 {
239 	zev_queue_t *q;
240 	zev_queue_t *tmp;
241 	zev_msg_t *msg;
242 	int name_exists = 0;
243 	minor_t minor;
244 	char *p;
245 	int i;
246 
247 	if (max_queue_len > ZEV_MAX_QUEUE_LEN)
248 		return EINVAL;
249 	if (max_queue_len == 0)
250 		max_queue_len = ZEV_MAX_QUEUE_LEN;
251 	if (!strcmp(name, ZEV_CONTROL_DEVICE_NAME))
252 		return EINVAL;
253 	for (p = name; *p; p++) {
254 		if (*p >= 'a' && *p <= 'z')
255 			continue;
256 		if (*p >= '0' && *p <= '9')
257 			continue;
258 		if (*p == '.')
259 			continue;
260 		return EINVAL;
261 	}
262 
263 	mutex_enter(&zev_mutex);
264 
265 	/* find free minor number.*/
266 	/* if this were a frequent operation we'd have a free-minor list */
267 	for (minor = ZEV_MINOR_MIN; minor <= ZEV_MINOR_MAX; minor++) {
268 		tmp = zev_queues[minor - ZEV_MINOR_MIN];
269 		if (tmp == NULL)
270 			break;
271 	}
272 	if (tmp) {
273 		mutex_exit(&zev_mutex);
274 		return ENOSPC;
275 	}
276 
277 	if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) {
278 		mutex_exit(&zev_mutex);
279 		return ENOSPC;
280 	}
281 	ZEV_MEM_ADD(sizeof(zev_queue_t));
282 
283 	q = ddi_get_soft_state(statep, minor);
284 	memset(q, 0, sizeof(*q));
285 	strncpy(q->zq_name, name, ZEV_MAX_QUEUE_NAME_LEN);
286 	q->zq_name[ZEV_MAX_QUEUE_NAME_LEN] = '\0';
287 	q->zq_max_queue_len = max_queue_len;
288 	q->zq_wakeup_threshold = ZEV_DEFAULT_POLL_WAKEUP_QUEUE_LEN;
289 	q->zq_flags = flags;
290 	q->zq_refcnt = 1;
291 	q->zq_dip = dip;
292 	q->zq_minor_number = minor;
293 	cv_init(&q->zq_condvar, NULL, CV_DRIVER, NULL);
294 
295 	/* insert into queue list */
296 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
297 		/* if this were a frequent operation we'd have a name tree */
298 		if (zev_queues[i - ZEV_MINOR_MIN] == NULL)
299 			continue;
300 		if (!strcmp(q->zq_name, zev_queues[i-ZEV_MINOR_MIN]->zq_name)) {
301 			name_exists = 1;
302 			break;
303 		}
304 	}
305 	if (name_exists) {
306 		ddi_soft_state_free(statep, minor);
307 		ZEV_MEM_SUB(sizeof(zev_queue_t));
308 		mutex_exit(&zev_mutex);
309 		return EEXIST;
310 	}
311 	zev_queues[minor - ZEV_MINOR_MIN] = q;
312 	zev_queue_cnt++;
313 
314 	/* calculate current queue len and find head and tail */
315 	if (!(q->zq_flags & ZEV_FL_INITIALLY_EMPTY)) {
316 		q->zq_oldest = zev_queue_tail;
317 		msg = zev_queue_tail;
318 		while ((msg) && (q->zq_queue_len < q->zq_max_queue_len)) {
319 			q->zq_queue_len += msg->size;
320 			q->zq_queue_messages++;
321 			q->zq_oldest = msg;
322 			msg = msg->prev;
323 		}
324 	}
325 
326 	zev_update_blockflag();
327 
328 	mutex_exit(&zev_mutex);
329 
330 	if (ddi_create_minor_node(dip, name,
331 	    S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
332 		mutex_enter(&zev_mutex);
333 		zev_queues[minor - ZEV_MINOR_MIN] = NULL;
334 		zev_queue_cnt--;
335 		ddi_soft_state_free(statep, minor);
336 		ZEV_MEM_SUB(sizeof(zev_queue_t));
337 		zev_update_blockflag();
338 		mutex_exit(&zev_mutex);
339 		return EFAULT;
340 	}
341 
342 	*queue = q;
343 	return 0;
344 }
345 
/*
 * poll() wakeup thread.  Used to check periodically whether we have
 * bytes left in the queue that have not yet been made into a
 * pollwakeup() call.  This is meant to insure a maximum waiting
 * time until an event is presented as a poll wakeup, while at
 * the same time not making every single event into a poll wakeup
 * of it's own.
 */

/*
 * Wake poll() waiters on busy queues.  With flush_all set, every
 * queue with pending data gets a wakeup; otherwise only queues whose
 * pending bytes exceed their configured wakeup threshold.
 */
static void
zev_poll_wakeup(boolean_t flush_all)
{
	zev_queue_t *q;
	int i;

	/*
	 * This loop works with hold() and release() because
	 * pollwakeup() requires us to release our locks before calling it.
	 *
	 * from pollwakeup(9F):
	 *
	 *   "Driver defined locks should not be held across calls
	 *    to this function."
	 */

	/* wake up threads for each individual queue */
	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		if (!q->zq_busy)
			continue;
		if (!q->zq_queue_len)
			continue;
		if ((flush_all) ||
		    (q->zq_queue_len > q->zq_wakeup_threshold)) {
			/* hold keeps q alive while zev_mutex is dropped */
			zev_queue_hold(q);
			mutex_exit(&zev_mutex);
			pollwakeup(&q->zq_pollhead, POLLIN);
			mutex_enter(&zev_mutex);
			zev_queue_release(q);
		}
	}
	mutex_exit(&zev_mutex);
}
392 
393 static void
394 zev_poll_wakeup_thread_main(void)
395 {
396 	while (zev_wakeup_thread_run) {
397 		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
398 
399 		zev_poll_wakeup(B_TRUE);
400 	}
401 	thread_exit();
402 }
403 
404 static int
405 zev_ioc_mute_pool(char *poolname)
406 {
407 	zev_pool_list_entry_t *pe;
408 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
409 	/* pool already muted? */
410 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
411 		if (!strcmp(pe->name, poolname)) {
412 			rw_exit(&zev_pool_list_rwlock);
413 			return EEXIST;
414 		}
415 	}
416 	pe = zev_zalloc(sizeof(*pe));
417 	if (!pe) {
418 		rw_exit(&zev_pool_list_rwlock);
419 		return ENOMEM;
420 	}
421 	(void) strncpy(pe->name, poolname, sizeof(pe->name));
422 	pe->next = zev_muted_pools_head;
423 	zev_muted_pools_head = pe;
424 	rw_exit(&zev_pool_list_rwlock);
425 	return (0);
426 }
427 
428 static int
429 zev_ioc_unmute_pool(char *poolname)
430 {
431 	zev_pool_list_entry_t *pe, *peprev;
432 
433 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
434 	/* pool muted? */
435 	peprev = NULL;
436 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
437 		if (!strcmp(pe->name, poolname))
438 			break;
439 		peprev = pe;
440 	}
441 	if (pe) {
442 		rw_exit(&zev_pool_list_rwlock);
443 		return ENOENT;
444 	}
445 
446 	if (peprev != NULL) {
447 		peprev->next = pe->next;
448 	} else {
449 		zev_muted_pools_head = pe->next;
450 	}
451 	zev_free(pe, sizeof(*pe));
452 	rw_exit(&zev_pool_list_rwlock);
453 	return (0);
454 }
455 
456 int
457 zev_skip_pool(objset_t *os)
458 {
459 	zev_pool_list_entry_t *pe;
460 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
461 	rw_enter(&zev_pool_list_rwlock, RW_READER);
462 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
463 		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
464 			rw_exit(&zev_pool_list_rwlock);
465 			return 1;
466 		}
467 	}
468 	rw_exit(&zev_pool_list_rwlock);
469 	return 0;
470 }
471 
/*
 * Decide whether events for filesystem fs should be skipped.  Walks
 * up the dsl_dir hierarchy; if any ancestor's name contains "_root"
 * the fs is reported (return 0), otherwise it is skipped (return 1).
 *
 * NOTE(review): the "_root" substring looks like a site-specific
 * dataset naming policy -- confirm against the deployment's layout.
 */
int
zev_skip_fs(zfsvfs_t *fs)
{
	dsl_dir_t *d = fs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_t *prev = NULL;

	/* "d != prev" stops the walk if a dir is its own parent */
	while (d && d != prev) {
		if (strstr(d->dd_myname, "_root"))
			return 0;
		prev = d;
		d = d->dd_parent;
	}
	return 1;
}
486 
487 static void
488 zev_update_statistics(int op, zev_statistics_t *stat)
489 {
490 	switch (op) {
491 	case ZEV_OP_ERROR:
492 		stat->zev_cnt_errors++;
493 		break;
494 	case ZEV_OP_MARK:
495 		stat->zev_cnt_marks++;
496 		break;
497 	case ZEV_OP_ZFS_MOUNT:
498 		stat->zev_cnt_zfs_mount++;
499 		break;
500 	case ZEV_OP_ZFS_UMOUNT:
501 		stat->zev_cnt_zfs_umount++;
502 		break;
503 	case ZEV_OP_ZVOL_WRITE:
504 		stat->zev_cnt_zvol_write++;
505 		break;
506 	case ZEV_OP_ZVOL_TRUNCATE:
507 		stat->zev_cnt_zvol_truncate++;
508 		break;
509 	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
510 		stat->zev_cnt_znode_close_after_update++;
511 		break;
512 	case ZEV_OP_ZNODE_CREATE:
513 		stat->zev_cnt_znode_create++;
514 		break;
515 	case ZEV_OP_ZNODE_REMOVE:
516 		stat->zev_cnt_znode_remove++;
517 		break;
518 	case ZEV_OP_ZNODE_LINK:
519 		stat->zev_cnt_znode_link++;
520 		break;
521 	case ZEV_OP_ZNODE_SYMLINK:
522 		stat->zev_cnt_znode_symlink++;
523 		break;
524 	case ZEV_OP_ZNODE_RENAME:
525 		stat->zev_cnt_znode_rename++;
526 		break;
527 	case ZEV_OP_ZNODE_WRITE:
528 		stat->zev_cnt_znode_write++;
529 		break;
530 	case ZEV_OP_ZNODE_TRUNCATE:
531 		stat->zev_cnt_znode_truncate++;
532 		break;
533 	case ZEV_OP_ZNODE_SETATTR:
534 		stat->zev_cnt_znode_setattr++;
535 		break;
536 	case ZEV_OP_ZNODE_ACL:
537 		stat->zev_cnt_znode_acl++;
538 		break;
539 	}
540 }
541 
/*
 * Append msg (ownership transfers to this function) to the global
 * event list and register it with every per-device queue, blocking
 * or discarding old events as the queue limits require.  Finally
 * triggers poll wakeups where thresholds were exceeded.
 */
void
zev_queue_message(int op, zev_msg_t *msg)
{
	zev_queue_t *q;
	int wakeup = 0;
	zev_msg_t *m;
	int i;

	msg->next = NULL;
	msg->prev = NULL;
	msg->read = 0;

	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
		zev_queue_error(op, "unknown op id encountered: %d", op);
		zev_free(msg, sizeof(*msg) + msg->size);
		return;
	}

	/*
	 * This mutex protects us agains race conditions when several
	 * threads want to queue a message and one or more queues are
	 * full:  we release zev_mutex to wait for the queues to become
	 * less-than-full, but we don't know in which order the waiting
	 * threads will be awoken.  If it's not the same order in which
	 * they went to sleep we might mark different messages as "newest"
	 * in different queues, and so we might have dupes or even
	 * skip messages.
	 */
	mutex_enter(&zev_queue_msg_mutex);

	mutex_enter(&zev_mutex);

	/*
	 * When the module is loaded, the default behavior ist to
	 * put all events into a queue and block if the queue is full.
	 * This is done even before the pseudo device is attached.
	 * This way, no events are lost.
	 *
	 * To discard events entirely the "beaver" queue,
	 * which never discards anything, has to be removed.
	 */

	if (zev_queue_cnt == 0) {
		/*
		 * NOTE(review): msg is not freed on this path, unlike the
		 * invalid-op path above -- looks like a leak; confirm.
		 */
		mutex_exit(&zev_mutex);
		mutex_exit(&zev_queue_msg_mutex);
		return;
	}

	/* put message into global queue */
	msg->seq = zev_msg_sequence_number++;

	/* do we need to make room? */
	while (zev_statistics.zev_max_queue_len &&
	    zev_statistics.zev_queue_len > zev_statistics.zev_max_queue_len) {

		if (zev_have_blocking_queues) {
			/* queue full.  block until it's been shrunk. */
			cv_wait(&zev_condvar, &zev_mutex);
			continue;
		}

		/* discard events until this message fits into all queues */

		for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
			q = zev_queues[i - ZEV_MINOR_MIN];
			if (!q)
				continue;
			/* discard msgs until queue is small enough */
			while (q->zq_queue_len &&
			       q->zq_queue_len > q->zq_max_queue_len) {
				m = q->zq_oldest;
				if (m == NULL)
					break;
				q->zq_events_discarded++;
				q->zq_bytes_discarded += m->size;
				q->zq_oldest = m->next;
				q->zq_queue_len -= m->size;
				q->zq_queue_messages--;
			}
		}

		/* actually free the now-unreferenced messages */
		zev_queue_trim();
		ASSERT(zev_statistics.zev_queue_len == 0 ||
		       zev_statistics.zev_queue_len <=
				zev_statistics.zev_max_queue_len);
	}

	/* append msg to the doubly-linked global list */
	if (zev_queue_tail == NULL) {
		zev_queue_head = zev_queue_tail = msg;
	} else {
		zev_queue_tail->next = msg;
		msg->prev = zev_queue_tail;
		zev_queue_tail = msg;
	}
	zev_queue_len++;
	zev_statistics.zev_cnt_total_events++;
	zev_statistics.zev_queue_len += msg->size;

	/* update per-device queues */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;

		/* keep q alive across the cv_wait() below */
		zev_queue_hold(q);

		/* make sure queue has enough room */
		while (q->zq_max_queue_len &&
		       q->zq_queue_len > q->zq_max_queue_len) {

			if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) {
				/* block until queue has been shrunk. */
				cv_wait(&zev_condvar, &zev_mutex);
			} else {
				/* discard msgs until queue is small enough */
				while (q->zq_queue_len > q->zq_max_queue_len) {
					m = q->zq_oldest;
					if (m == NULL)
						break;
					q->zq_events_discarded++;
					q->zq_bytes_discarded += m->size;
					q->zq_oldest = m->next;
					q->zq_queue_len -= m->size;
					q->zq_queue_messages--;
				}
			}
		}

		/* register new message at the end of the queue */
		q->zq_queue_len += msg->size;
		q->zq_queue_messages++;
		q->zq_bytes_total += msg->size;
		q->zq_events_total++;
		if (q->zq_oldest == NULL)
			q->zq_oldest = msg;

		zev_update_statistics(op, &q->zq_statistics);

		if (q->zq_queue_len > q->zq_wakeup_threshold)
			wakeup = 1;
		if (q->zq_queue_len == msg->size)  /* queue was empty */
			cv_broadcast(&q->zq_condvar);

		zev_queue_release(q);
	}

	zev_queue_trim();

	zev_update_statistics(op, &zev_statistics);
	mutex_exit(&zev_mutex);
	mutex_exit(&zev_queue_msg_mutex);

	/* one or more queues need a pollwakeup() */
	if (op == ZEV_OP_MARK) {
		zev_poll_wakeup(B_TRUE);
	} else if (wakeup) {
		zev_poll_wakeup(B_FALSE);
	}

	return;
}
703 
/*
 * Build a ZEV_OP_ERROR event from a printf-style message and queue it.
 * op is the operation that failed.  If the formatted message does not
 * fit into ZEV_MAX_MESSAGE_LEN bytes the event is dropped with a
 * cmn_err() warning instead.
 */
void
zev_queue_error(int op, char *fmt, ...)
{
	char buf[ZEV_MAX_MESSAGE_LEN];
	va_list ap;
	int len;
	zev_msg_t *msg = NULL;
	zev_error_t *rec;
	int msg_size;

	va_start(ap, fmt);
	len = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	if (len >= sizeof(buf)) {
		cmn_err(CE_WARN, "zev: can't report error - "
		        "dropping event entirely.");
		return;
	}

	/* record layout: zev_error_t header + string + NUL */
	msg_size = sizeof(*rec) + len + 1;
	msg = zev_alloc(sizeof(*msg) + msg_size);
	msg->size = msg_size;
	rec = (zev_error_t *)(msg + 1);
	rec->record_len = msg_size;
	rec->op = ZEV_OP_ERROR;
	rec->op_time = ddi_get_time();
	rec->guid = 0;
	rec->failed_op = op;
	rec->errstr_len = len;
	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);

	/* zev_queue_message() takes ownership of msg */
	zev_queue_message(ZEV_OP_ERROR, msg);
	return;
}
738 
739 static int
740 zev_find_queue(zev_queue_t **out, zev_queue_t *req_q, zev_queue_name_t *name)
741 {
742 	char namebuf[ZEV_MAX_QUEUE_NAME_LEN+1];
743 	zev_queue_t *q;
744 	int i;
745 
746 	*out = NULL;
747 
748 	if (name->zev_namelen == 0) {
749 		if (req_q->zq_minor_number == ZEV_CONTROL_DEVICE_MINOR)
750 			return EINVAL;
751 		zev_queue_hold(req_q);
752 		*out = req_q;
753 		return 0;
754 	}
755 
756 	if (name->zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
757 		return EINVAL;
758 	strncpy(namebuf, name->zev_name, name->zev_namelen);
759 	namebuf[name->zev_namelen] = '\0';
760 
761 	mutex_enter(&zev_mutex);
762 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
763 		q = zev_queues[i - ZEV_MINOR_MIN];
764 		if (!q)
765 			continue;
766 		if (!strcmp(q->zq_name, namebuf)) {
767 			zev_queue_hold(q);
768 			mutex_exit(&zev_mutex);
769 			*out = q;
770 			return 0;
771 		}
772 	}
773 	mutex_exit(&zev_mutex);
774 	return ENOENT;
775 }
776 
/*
 * ZEV_IOC_GET_QUEUE_STATISTICS: copy a snapshot of a queue's event
 * statistics out to userland.  Returns 0 or an errno value.
 */
static int
zev_ioc_get_queue_statistics(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_get_queue_statistics_t gs;
	zev_queue_t *q;
	int ret;

	if (ddi_copyin((void *)arg, &gs, sizeof(gs), mode) != 0)
		return EFAULT;

	/* zev_find_queue() returns q with a reference held */
	ret = zev_find_queue(&q, req_q, &gs.zev_queue_name);
	if (ret)
		return ret;

	/* ddi_copyout() can take a long time.  Better make
	   a copy to be able to release the mutex faster. */
	mutex_enter(&zev_mutex);
	memcpy(&gs.zev_statistics, &q->zq_statistics,sizeof(gs.zev_statistics));
	/* overlay the live queue counters onto the snapshot */
	gs.zev_statistics.zev_queue_len = q->zq_queue_len;
	gs.zev_statistics.zev_bytes_read = q->zq_bytes_read;
	gs.zev_statistics.zev_bytes_discarded = q->zq_bytes_discarded;
	gs.zev_statistics.zev_max_queue_len = q->zq_max_queue_len;
	gs.zev_statistics.zev_cnt_discarded_events = q->zq_events_discarded;
	gs.zev_statistics.zev_cnt_total_events = q->zq_events_total;
	zev_queue_release(q);
	mutex_exit(&zev_mutex);

	if (ddi_copyout(&gs, (void *)arg, sizeof(gs), mode) != 0)
		return EFAULT;
	return 0;
}
808 
809 static int
810 zev_ioc_set_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
811 {
812 	zev_ioctl_set_queue_properties_t qp;
813 	zev_queue_t *q;
814 	uint64_t old_max;
815 	uint64_t old_flags;
816 	int ret;
817 
818 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
819 		return EFAULT;
820 	if (qp.zev_max_queue_len > ZEV_MAX_QUEUE_LEN)
821 		return EINVAL;
822 	if (qp.zev_poll_wakeup_threshold > ZEV_MAX_POLL_WAKEUP_QUEUE_LEN)
823 		return EINVAL;
824 
825 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
826 	if (ret)
827 		return ret;
828 
829 	mutex_enter(&zev_mutex);
830 
831 	/*
832 	 * Note: if the PERSISTENT flag is cleared, and the queue is not busy,
833 	 * the queue should be removed by zev_queue_release() in zev_ioctl().
834 	 */
835 	old_flags = qp.zev_flags;
836 	q->zq_flags = qp.zev_flags;
837 	if ((old_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) &&
838 	   (!(qp.zev_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL))) {
839 		/* queue is no longer blocking - wake blocked threads */
840 		cv_broadcast(&zev_condvar);
841 	}
842 
843 	zev_update_blockflag();
844 
845 	old_max = q->zq_max_queue_len;
846 	q->zq_max_queue_len = qp.zev_max_queue_len;
847 	if (q->zq_max_queue_len < old_max)
848 		zev_queue_trim();
849 	if (q->zq_max_queue_len > old_max)
850 		cv_broadcast(&zev_condvar);	/* threads may be waiting */
851 
852 	if ((qp.zev_poll_wakeup_threshold < q->zq_wakeup_threshold) &&
853 	    (qp.zev_poll_wakeup_threshold <= q->zq_queue_len))
854 		pollwakeup(&q->zq_pollhead, POLLIN);
855 	q->zq_wakeup_threshold = qp.zev_poll_wakeup_threshold;
856 
857 	zev_queue_release(q);
858 	mutex_exit(&zev_mutex);
859 	return 0;
860 }
861 
862 static int
863 zev_ioc_get_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
864 {
865 	zev_ioctl_get_queue_properties_t qp;
866 	zev_queue_t *q;
867 	int ret;
868 
869 	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
870 		return EFAULT;
871 
872 	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
873 	if (ret)
874 		return ret;
875 
876 	mutex_enter(&zev_mutex);
877 	qp.zev_max_queue_len = q->zq_max_queue_len;
878 	qp.zev_flags = q->zq_flags;
879 	qp.zev_poll_wakeup_threshold = q->zq_wakeup_threshold;
880 	zev_queue_release(q);
881 	mutex_exit(&zev_mutex);
882 
883 	if (ddi_copyout(&qp, (void *)arg, sizeof(qp), mode) != 0)
884 		return EFAULT;
885 	return 0;
886 }
887 
888 static int
889 zev_ioc_add_queue(zev_queue_t *req_q, intptr_t arg, int mode)
890 {
891 	zev_ioctl_add_queue_t aq;
892 	zev_queue_t *new_q;
893 	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
894 
895 	if (ddi_copyin((void *)arg, &aq, sizeof(aq), mode) != 0)
896 		return EFAULT;
897 
898 	if (aq.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
899 		return EINVAL;
900 	strncpy(name, aq.zev_name, aq.zev_namelen);
901 	name[aq.zev_namelen] = '\0';
902 
903 	return zev_queue_new(&new_q, req_q->zq_dip, name,
904 	                     aq.zev_max_queue_len, aq.zev_flags);
905 }
906 
/*
 * ZEV_IOC_REMOVE_QUEUE: schedule a queue for removal by name.  The
 * queue's flags are cleared so that even persistent/blocking queues
 * can go away, and the creation reference is dropped.  Returns
 * ENOENT if no such queue, EBUSY if it is currently open.
 */
static int
zev_ioc_remove_queue(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_remove_queue_t rq;
	zev_queue_t *q;
	char name[ZEV_MAX_QUEUE_NAME_LEN+1];
	int found = 0;
	int i;

	if (ddi_copyin((void *)arg, &rq, sizeof(rq), mode) != 0)
		return EFAULT;

	if (rq.zev_queue_name.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN)
		return EINVAL;
	/* the userland name is not necessarily NUL-terminated */
	strncpy(name, rq.zev_queue_name.zev_name,
	        rq.zev_queue_name.zev_namelen);
	name[rq.zev_queue_name.zev_namelen] = '\0';

	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (!q)
			continue;
		if (!strcmp(q->zq_name, name)) {
			found = 1;
			break;
		}
	}
	if (!found) {
		mutex_exit(&zev_mutex);
		return ENOENT;
	}

	if (q->zq_busy) {
		mutex_exit(&zev_mutex);
		return EBUSY;
	}
	/*
	 * clear flags, so that persistent queues are removed aswell
	 * and the queue becomes non-blocking.
	 */
	q->zq_flags = 0;
	/* drop the creation reference exactly once */
	if (q->zq_to_be_removed == B_FALSE) {
		q->zq_to_be_removed = B_TRUE;
		zev_queue_release(q);
	}
	/* some threads might be waiting for this queue to become writable */
	cv_broadcast(&zev_condvar);

	mutex_exit(&zev_mutex);
	return 0;
}
959 
960 static int
961 zev_ioc_get_debug_info(zev_queue_t *req_q, intptr_t arg, int mode)
962 {
963 	zev_ioctl_debug_info_t di;
964 	uint64_t mem_allocated = atomic_add_64_nv(&zev_memory_allocated, 0);
965 	uint64_t mem_freed     = atomic_add_64_nv(&zev_memory_freed, 0);
966 
967 	zev_chksum_stats(&di.zev_chksum_cache_size,
968 	                 &di.zev_chksum_cache_hits,
969 	                 &di.zev_chksum_cache_misses);
970 	di.zev_memory_allocated = mem_allocated - mem_freed;
971 	if (ddi_copyout(&di, (void *)arg, sizeof(di), mode) != 0)
972 		return EFAULT;
973 	return 0;
974 }
975 
976 static int
977 zev_ioc_get_queue_list(zev_queue_t *req_q, intptr_t arg, int mode)
978 {
979 	zev_ioctl_get_queue_list_t gql;
980 	zev_queue_t *q;
981 	int i = 0;
982 	int count = 0;
983 
984 	memset(&gql, 0, sizeof(gql));
985 
986 	mutex_enter(&zev_mutex);
987 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
988 		q = zev_queues[i - ZEV_MINOR_MIN];
989 		if (!q)
990 			continue;
991 		strncpy(gql.zev_queue_name[count].zev_name,
992 		    q->zq_name, ZEV_MAX_QUEUE_NAME_LEN);
993 		gql.zev_queue_name[count].zev_namelen = strlen(q->zq_name);
994 		count++;
995 	}
996 	gql.zev_n_queues = count;
997 	mutex_exit(&zev_mutex);
998 
999 	if (ddi_copyout(&gql, (void *)arg, sizeof(gql), mode) != 0)
1000 		return EFAULT;
1001 	return 0;
1002 }
1003 
1004 static int
1005 zev_ioc_set_max_queue_len(zev_queue_t *req_q, intptr_t arg, int mode)
1006 {
1007 	uint64_t len;
1008 	int i;
1009 	zev_queue_t *q;
1010 
1011 	if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) {
1012 		return EFAULT;
1013 	}
1014 	if (len > ZEV_MAX_QUEUE_LEN) {
1015 		return EINVAL;
1016 	}
1017 	mutex_enter(&zev_mutex);
1018 	zev_statistics.zev_max_queue_len = len;
1019 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1020 		q = zev_queues[i - ZEV_MINOR_MIN];
1021 		if (!q)
1022 			continue;
1023 		if (q->zq_max_queue_len <=
1024 		    zev_statistics.zev_max_queue_len)
1025 			continue;
1026 		q->zq_max_queue_len = zev_statistics.zev_max_queue_len;
1027 	}
1028 	cv_broadcast(&zev_condvar);
1029 	mutex_exit(&zev_mutex);
1030 	return 0;
1031 }
1032 
1033 static int
1034 zev_ioc_get_zev_version(intptr_t arg, int mode)
1035 {
1036 	zev_ioctl_get_zev_version vi;
1037 	vi.zev_major_version = ZEV_MAJOR_VERSION;
1038 	vi.zev_minor_version = ZEV_MINOR_VERSION;
1039 	if (ddi_copyout(&vi, (void *)arg, sizeof(vi), mode) != 0)
1040 		return EFAULT;
1041 	return 0;
1042 }
1043 
1044 /* ARGSUSED */
1045 static int
1046 zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1047 {
1048 	zev_statistics_t zs;
1049 	zev_ioctl_poolarg_t pa;
1050 	zev_ioctl_mark_t mark;
1051 	zev_mark_t *rec;
1052 	int msg_size;
1053 	zev_msg_t *msg;
1054 	uint64_t mark_id;
1055 	minor_t minor;
1056 	zev_queue_t *req_q;
1057 	int ret = 0;
1058 
1059 	minor = getminor(dev);
1060 	mutex_enter(&zev_mutex);
1061 	if ((req_q = ddi_get_soft_state(statep, minor)) == NULL) {
1062 		mutex_exit(&zev_mutex);
1063 		return (ENXIO);
1064 	}
1065 	zev_queue_hold(req_q);
1066 	mutex_exit(&zev_mutex);
1067 	/*
1068 	 * all structures passed between kernel and userspace
1069 	 * are now compatible between 64 and 32 bit.  Model
1070 	 * conversion can be ignored.
1071 	 */
1072 	switch (cmd) {
1073 	case ZEV_IOC_GET_GLOBAL_STATISTICS:
1074 		/* ddi_copyout() can take a long time.  Better make
1075 		   a copy to be able to release the mutex faster. */
1076 		mutex_enter(&zev_mutex);
1077 		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
1078 		mutex_exit(&zev_mutex);
1079 		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
1080 			ret = EFAULT;
1081 		break;
1082 	case ZEV_IOC_GET_QUEUE_STATISTICS:
1083 		ret = zev_ioc_get_queue_statistics(req_q, arg, mode);
1084 		break;
1085 	case ZEV_IOC_MUTE_POOL:
1086 	case ZEV_IOC_UNMUTE_POOL:
1087 		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) {
1088 			ret = EFAULT;
1089 			break;
1090 		}
1091 		if (pa.zev_poolname_len >=MAXPATHLEN) {
1092 			ret = EINVAL;
1093 			break;
1094 		}
1095 		pa.zev_poolname[pa.zev_poolname_len] = '\0';
1096 		if (cmd == ZEV_IOC_MUTE_POOL) {
1097 			ret = zev_ioc_mute_pool(pa.zev_poolname);
1098 		} else {
1099 			ret = zev_ioc_unmute_pool(pa.zev_poolname);
1100 		}
1101 		break;
1102 	case ZEV_IOC_SET_MAX_QUEUE_LEN:
1103 		ret = zev_ioc_set_max_queue_len(req_q, arg, mode);
1104 		break;
1105 	case ZEV_IOC_GET_QUEUE_PROPERTIES:
1106 		ret = zev_ioc_get_queue_properties(req_q, arg, mode);
1107 		break;
1108 	case ZEV_IOC_SET_QUEUE_PROPERTIES:
1109 		ret = zev_ioc_set_queue_properties(req_q, arg, mode);
1110 		break;
1111 	case ZEV_IOC_MARK:
1112 		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) {
1113 			ret = EFAULT;
1114 			break;
1115 		}
1116 		/* prepare message */
1117 		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
1118 		msg = zev_alloc(sizeof(*msg) + msg_size);
1119 		msg->size = msg_size;
1120 		rec = (zev_mark_t *)(msg + 1);
1121 		rec->record_len = msg_size;
1122 		rec->op = ZEV_OP_MARK;
1123 		rec->op_time = ddi_get_time();
1124 		rec->guid = mark.zev_guid;
1125 		rec->payload_len = mark.zev_payload_len;
1126 		/* get payload */
1127 		if (ddi_copyin(((char *)arg) + sizeof(mark),
1128 		               ZEV_PAYLOAD(rec),
1129 		               mark.zev_payload_len, mode) != 0) {
1130 			zev_free(msg, msg_size);
1131 			ret = EFAULT;
1132 			break;
1133 		}
1134 		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
1135 		/* get mark id and queue message */
1136 		mutex_enter(&zev_mark_id_mutex);
1137 		mark_id = zev_mark_id++;
1138 		mutex_exit(&zev_mark_id_mutex);
1139 		rec->mark_id = mark_id;
1140 		zev_queue_message(ZEV_OP_MARK, msg);
1141 		/* report mark id to userland, ignore errors */
1142 		mark.zev_mark_id = mark_id;
1143 		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
1144 		break;
1145 	case ZEV_IOC_ADD_QUEUE:
1146 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1147 			ret = EACCES;
1148 			break;
1149 		}
1150 		ret = zev_ioc_add_queue(req_q, arg, mode);
1151 		break;
1152 	case ZEV_IOC_REMOVE_QUEUE:
1153 		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
1154 			ret = EACCES;
1155 			break;
1156 		}
1157 		ret = zev_ioc_remove_queue(req_q, arg, mode);
1158 		break;
1159 	case ZEV_IOC_GET_DEBUG_INFO:
1160 		ret = zev_ioc_get_debug_info(req_q, arg, mode);
1161 		break;
1162 	case ZEV_IOC_GET_QUEUE_LIST:
1163 		ret = zev_ioc_get_queue_list(req_q, arg, mode);
1164 		break;
1165 	case ZEV_IOC_GET_FILE_SIGNATURES:
1166 		ret = zev_ioc_get_signatures(arg, mode);
1167 		break;
1168 	case ZEV_IOC_GET_ZEV_VERSION:
1169 		ret = zev_ioc_get_zev_version(arg, mode);
1170 		break;
1171 	default:
1172 		/* generic "ioctl unknown" error */
1173 		ret = ENOTTY;
1174 	}
1175 
1176 	mutex_enter(&zev_mutex);
1177 	zev_queue_release(req_q);
1178 	mutex_exit(&zev_mutex);
1179 	if (ret)
1180 		SET_ERROR(ret);
1181 	return (ret);
1182 }
1183 
1184 static int
1185 zev_chpoll(dev_t dev, short events, int anyyet,
1186     short *reventsp, struct pollhead **phpp)
1187 {
1188 	int minor;
1189 	short revent = 0;
1190 	zev_queue_t *q;
1191 
1192 	/* use minor-specific queue context and it's pollhead */
1193 	minor = getminor(dev);
1194 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1195 		return (EINVAL);
1196 	mutex_enter(&zev_mutex);
1197 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1198 		mutex_exit(&zev_mutex);
1199 		return (ENXIO);
1200 	}
1201 	revent = 0;
1202 	if ((events & POLLIN)) {
1203 		if (q->zq_oldest)
1204 			revent |= POLLIN;
1205 	}
1206 	if (revent == 0) {
1207 		if (!anyyet) {
1208 			*phpp = &q->zq_pollhead;
1209 		}
1210 	}
1211 	*reventsp = revent;
1212 	mutex_exit(&zev_mutex);
1213 	return (0);
1214 }
1215 
1216 /* ARGSUSED */
1217 static int
1218 zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
1219 {
1220 	minor_t minor;
1221 	offset_t off;
1222 	int ret = 0;
1223 	zev_msg_t *msg;
1224 	char *data;
1225 	zev_queue_t *q;
1226 
1227 	minor = getminor(dev);
1228 	if (minor == ZEV_CONTROL_DEVICE_MINOR)
1229 		return (EINVAL);
1230 
1231 	mutex_enter(&zev_mutex);
1232 	q = ddi_get_soft_state(statep, minor);
1233 	if (q == NULL) {
1234 		mutex_exit(&zev_mutex);
1235 		return (ENXIO);
1236 	}
1237 	off = uio_p->uio_loffset;
1238 	msg = q->zq_oldest;
1239 	while (msg == NULL) {
1240 		if (!ddi_can_receive_sig()) {
1241 			/*
1242 			 * read() shouldn't block because this thread
1243 			 * can't receive signals. (e.g., it might be
1244 			 * torn down by exit() right now.)
1245 			 */
1246 			mutex_exit(&zev_mutex);
1247 			return 0;
1248 		}
1249 		if (cv_wait_sig(&q->zq_condvar, &zev_mutex) == 0) {
1250 			/* signal received. */
1251 			mutex_exit(&zev_mutex);
1252 			return EINTR;
1253 		}
1254 		msg = q->zq_oldest;
1255 	}
1256 	if (msg->size > uio_p->uio_resid) {
1257 		mutex_exit(&zev_mutex);
1258 		return E2BIG;
1259 	}
1260 	while (msg && uio_p->uio_resid >= msg->size) {
1261 		data = (char *)(msg + 1);
1262 		ret = uiomove(data, msg->size, UIO_READ, uio_p);
1263 		if (ret != 0) {
1264 			mutex_exit(&zev_mutex);
1265 			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
1266 			uio_p->uio_loffset = off;
1267 			return (ret);
1268 		}
1269 		q->zq_oldest = msg->next;
1270 		q->zq_bytes_read += msg->size;
1271 		q->zq_queue_len -= msg->size;
1272 		q->zq_queue_messages--;
1273 		msg->read++;
1274 		msg = q->zq_oldest;
1275 	}
1276 	zev_queue_trim();
1277 	cv_broadcast(&zev_condvar);
1278 	mutex_exit(&zev_mutex);
1279 	uio_p->uio_loffset = off;
1280 	return 0;
1281 }
1282 
1283 /* ARGSUSED */
1284 static int
1285 zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
1286 {
1287 	zev_queue_t *q;
1288 	int minor;
1289 
1290 	minor = getminor(dev);
1291 	if (otyp != OTYP_CHR)
1292 		return (EINVAL);
1293 	mutex_enter(&zev_mutex);
1294 	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
1295 		mutex_exit(&zev_mutex);
1296 		return (ENXIO);
1297 	}
1298 	if (q->zq_busy != B_TRUE) {
1299 		mutex_exit(&zev_mutex);
1300 		return (EINVAL);
1301 	}
1302 	q->zq_busy = B_FALSE;
1303 	if ((q->zq_flags & ZEV_FL_PERSISTENT) == 0)
1304 		zev_queue_release(q);
1305 	mutex_exit(&zev_mutex);
1306 	return (0);
1307 }
1308 
/*
 * open(9E) entry point.
 *
 * Requires driver privileges (drv_priv()).  The control device may be
 * opened by several processes in parallel; queue devices can only be
 * opened exclusively and return EBUSY while already open.
 */
/* ARGSUSED */
static int
zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	zev_queue_t *q;
	minor_t minor;

	minor = getminor(*devp);
	if (otyp != OTYP_CHR)
		return (EINVAL);
	if (drv_priv(credp) != 0)
		return (EPERM);
	mutex_enter(&zev_mutex);
	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	if (minor == ZEV_CONTROL_DEVICE_MINOR) {
		/*
		 * control device may be used in parallel
		 *
		 * NOTE(review): zq_busy is a plain flag, not a counter;
		 * with parallel opens the first close clears it while
		 * other opens remain, and detach uses this flag to
		 * decide whether "ctrl" is busy -- confirm intended.
		 */
		q->zq_busy = B_TRUE;
		mutex_exit(&zev_mutex);
		return 0;
	}
	if (q->zq_busy == B_TRUE) {
		mutex_exit(&zev_mutex);
		return (EBUSY);
	}
	q->zq_busy = B_TRUE;	/* can only be opened exclusively */
	mutex_exit(&zev_mutex);
	return (0);
}
1340 
/*
 * Character device entry points.  The driver has no write(), devmap,
 * mmap or STREAMS interface; D_MP declares it safe for concurrent
 * entry, D_64BIT for 64-bit offsets.
 */
static struct cb_ops zev_cb_ops = {
	zev_open,		/* open */
	zev_close,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	zev_read,		/* read */
	nodev,			/* write */
	zev_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	zev_chpoll,		/* chpoll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* streamtab */
	D_MP | D_64BIT,		/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* aread */
	nodev,			/* awrite */
};
1361 
1362 static void
1363 zev_free_instance(dev_info_t *dip)
1364 {
1365 	int instance;
1366 	zev_queue_t *q;
1367 	int i;
1368 
1369 	instance = ddi_get_instance(dip);
1370 	if (instance != 0) {
1371 		cmn_err(CE_WARN, "zev: tried to free instance != 0 (%d)",
1372 		        instance);
1373 		return;
1374 	}
1375 
1376 	ddi_remove_minor_node(dip, NULL);
1377 
1378 	/* stop pollwakeup thread */
1379 	zev_wakeup_thread_run = 0;
1380 	if (zev_poll_wakeup_thread != NULL) {
1381 		thread_join(zev_poll_wakeup_thread->t_did);
1382 		zev_poll_wakeup_thread = NULL;
1383 	}
1384 
1385 	mutex_enter(&zev_mutex);
1386 
1387 	/* remove "ctrl" dummy queue */
1388 	q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
1389 	if (q) {
1390 		ddi_soft_state_free(statep, ZEV_CONTROL_DEVICE_MINOR);
1391 		ZEV_MEM_SUB(sizeof(zev_queue_t));
1392 	}
1393 
1394 	/* remove all other queues */
1395 	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
1396 		q = zev_queues[i- ZEV_MINOR_MIN];
1397 		if (!q)
1398 			continue;
1399 		ASSERT(q->zq_refcnt == 1);
1400 		zev_queue_release(q);
1401 	}
1402 	zev_queue_trim();
1403 	bzero(&zev_queues, sizeof(zev_queues));
1404 
1405 	mutex_exit(&zev_mutex);
1406 
1407 }
1408 
/*
 * detach(9E) entry point.
 *
 * Fails while the "ctrl" device is open or any event queues still
 * exist.  On success the ZFS event callbacks are switched back to
 * the defaults before per-instance resources are freed, so no thread
 * can still be inside the zev callbacks when teardown starts.
 * DDI_SUSPEND is always refused.
 */
static int
zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance;
	zev_queue_t *q;

	/* called once per instance with DDI_DETACH,
	   may be called to suspend */
	switch (cmd) {
	case DDI_DETACH:
		/* instance busy? */
		instance = ddi_get_instance(dip);
		if (instance != 0) {	/* hardcoded in zev.conf */
			/* this module only supports one instance. */
			return (DDI_FAILURE);
		}

		mutex_enter(&zev_mutex);
		if (!zev_attached) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}

		/* check "ctrl" queue to see if it is busy */
		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
		if (q == NULL) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		if (q->zq_busy) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		/* are there any queues? */
		if (zev_queue_cnt > 0) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}

		zev_attached = B_FALSE;
		mutex_exit(&zev_mutex);

		/* switch ZFS event callbacks back to default */
		rw_enter(&rz_zev_rwlock, RW_WRITER);
		rz_zev_callbacks = rz_zev_default_callbacks;
		rz_zev_set_active(B_FALSE);
		rw_exit(&rz_zev_rwlock);

		/* no thread is inside of the callbacks anymore. */

		/* free resources allocated for this instance */
		zev_free_instance(dip);
		zev_chksum_fini();
#if 0
		cmn_err(CE_WARN, "zev: allocated memory at detach: %" PRIu64,
			zev_memory_allocated - zev_memory_freed);
#endif
		return (DDI_SUCCESS);
	case DDI_SUSPEND:
		/* kernel must not suspend zev devices while ZFS is running */
		return (DDI_FAILURE);
	default:
		return (DDI_FAILURE);
	}
}
1474 
/*
 * attach(9E) entry point for the single zev instance (instance 0,
 * fixed in zev.conf).
 *
 * Creates the "ctrl" control device with a dummy queue, a default
 * event queue, starts the pollwakeup thread and finally switches the
 * ZFS event callbacks over to this module.  On failure everything
 * created so far is released via zev_free_instance().
 */
static int
zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	/* called once per instance with DDI_ATTACH,
	   may be called to resume */
	int instance;
	int error;
	zev_queue_t *q;
	switch (cmd) {
	case DDI_ATTACH:
		/* create instance state */
		instance = ddi_get_instance(dip);
		if (instance != 0) {	/* hardcoded in zev.conf */
			/* this module only supports one instance. */
			return (DDI_FAILURE);
		}

		mutex_enter(&zev_mutex);
		if (zev_attached) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		if (ddi_soft_state_zalloc(statep, ZEV_CONTROL_DEVICE_MINOR) !=
		    DDI_SUCCESS) {
			mutex_exit(&zev_mutex);
			return (DDI_FAILURE);
		}
		ZEV_MEM_ADD(sizeof(zev_queue_t));
		zev_attached = B_TRUE;

		/* init queue list */
		bzero(&zev_queues, sizeof(zev_queues));
		mutex_exit(&zev_mutex);

		/* create a dummy queue for management of "ctrl" */

		q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR);
		q->zq_dip = dip;
		q->zq_refcnt = 1;
		q->zq_busy = B_FALSE;
		q->zq_minor_number = ZEV_CONTROL_DEVICE_MINOR;
		q->zq_flags = ZEV_FL_PERSISTENT;
		/* assumes ZEV_CONTROL_DEVICE_NAME fits zq_name -- confirm */
		strcpy(q->zq_name, ZEV_CONTROL_DEVICE_NAME);

		/* create device node for "ctrl" */
		if (ddi_create_minor_node(dip, ZEV_CONTROL_DEVICE_NAME,
		    S_IFCHR, ZEV_CONTROL_DEVICE_MINOR,
		    DDI_PSEUDO, 0) == DDI_FAILURE) {
			goto fail;
		}

		/* note: intentionally not adding ctrl queue to queue list. */

		/* default queue */
		error = zev_queue_new(&q, dip,
				      ZEV_DEFAULT_QUEUE_NAME,
				      ZEV_MAX_QUEUE_LEN,
				      ZEV_FL_BLOCK_WHILE_QUEUE_FULL|
		                      ZEV_FL_PERSISTENT);
		if (error)
			goto fail;

		/* start pollwakeup thread */
		zev_wakeup_thread_run = 1;
		zev_poll_wakeup_thread = thread_create(NULL, 0,
		    zev_poll_wakeup_thread_main, NULL, 0, &p0,
		    TS_RUN, minclsyspri);

		ddi_report_dev(dip);

		zev_chksum_init();

		/* switch ZFS event callbacks to zev module callbacks */
		rw_enter(&rz_zev_rwlock, RW_WRITER);
		rz_zev_callbacks = &zev_callbacks;
		rz_zev_set_active(B_TRUE);
		rw_exit(&rz_zev_rwlock);

		return (DDI_SUCCESS);
	case DDI_RESUME:
		/* suspending zev devices should never happen */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
fail:
	cmn_err(CE_WARN, "zev: attach failed");
	zev_free_instance(dip);
	mutex_enter(&zev_mutex);
	zev_attached = B_FALSE;
	mutex_exit(&zev_mutex);
	return (DDI_FAILURE);
}
1568 
1569 /* ARGSUSED */
1570 static int
1571 zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
1572 {
1573 	minor_t minor;
1574 	zev_queue_t *q;
1575 
1576 	/* arg is dev_t */
1577 	minor = getminor((dev_t)arg);
1578 	mutex_enter(&zev_mutex);
1579 	q = ddi_get_soft_state(statep, minor);
1580 	if (q == NULL) {
1581 		*resultp = NULL;
1582 		mutex_exit(&zev_mutex);
1583 		return (DDI_FAILURE);
1584 	}
1585 
1586 	switch (infocmd) {
1587 	case DDI_INFO_DEVT2DEVINFO:
1588 		*resultp = q->zq_dip;
1589 		break;
1590 	case DDI_INFO_DEVT2INSTANCE:
1591 		*resultp = (void *)(uintptr_t)ddi_get_instance(q->zq_dip);
1592 		break;
1593 	default:
1594 		mutex_exit(&zev_mutex);
1595 		return (DDI_FAILURE);
1596 	}
1597 	mutex_exit(&zev_mutex);
1598 	return (DDI_SUCCESS);
1599 }
1600 
/*
 * Autoconfiguration entry points; no bus ops or power management.
 * quiesce is not needed because the driver refuses DDI_SUSPEND.
 */
static struct dev_ops zev_dev_ops = {
	DEVO_REV,			/* driver build revision */
	0,				/* driver reference count */
	zev_getinfo,			/* getinfo */
	nulldev,			/* identify (obsolete) */
	nulldev,			/* probe (search for devices) */
	zev_attach,			/* attach */
	zev_detach,			/* detach */
	nodev,				/* reset (obsolete, use quiesce) */
	&zev_cb_ops,			/* character and block device ops */
	NULL,				/* bus driver ops */
	NULL,				/* power management, not needed */
	ddi_quiesce_not_needed,		/* quiesce */
};

/* loadable-module linkage: a single driver module */
static struct modldrv zev_modldrv = {
	&mod_driverops,			/* all loadable modules use this */
	"ZFS event provider, v"
		XSTRING(ZEV_MAJOR_VERSION) "."
		XSTRING(ZEV_MINOR_VERSION),
					/* driver name and version info */
	&zev_dev_ops			/* ops method pointers */
};

static struct modlinkage zev_modlinkage = {
	MODREV_1,	/* fixed value */
	{
		&zev_modldrv,	/* driver linkage structure */
		NULL		/* list terminator */
	}
};
1632 
1633 int
1634 _init(void)
1635 {
1636 	int error;
1637 
1638 	if ((error = ddi_soft_state_init(&statep, sizeof(zev_queue_t), 1)) != 0)
1639 		return (error);
1640 	zev_attached = B_FALSE;
1641 
1642 	zev_queue_head = NULL;
1643 	zev_queue_tail = NULL;
1644 	zev_queue_len = 0;
1645 	zev_muted_pools_head = NULL;
1646 	zev_memory_allocated = 0;
1647 	zev_memory_freed = 0;
1648 	zev_queue_cnt = 0;
1649 	zev_have_blocking_queues = 1;
1650 
1651 	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
1652 	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
1653 	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
1654 	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
1655 	zev_mark_id = gethrtime();
1656 	mutex_init(&zev_queue_msg_mutex, NULL, MUTEX_DRIVER, NULL);
1657 	zev_msg_sequence_number = gethrtime();
1658 	bzero(&zev_statistics, sizeof(zev_statistics));
1659 	bzero(&zev_pollhead, sizeof(zev_pollhead));
1660 	bzero(&zev_queues, sizeof(zev_queues));
1661 	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
1662 	if (zev_ioc_mute_pool("zg0")) {
1663 		cmn_err(CE_WARN, "zev: could not init mute list");
1664 		goto FAIL;
1665 	}
1666 
1667 	if ((error = mod_install(&zev_modlinkage)) != 0) {
1668 		cmn_err(CE_WARN, "zev: could not install module");
1669 		goto FAIL;
1670 	}
1671 
1672 	return (0);
1673 FAIL:
1674 	/* free resources */
1675 	cmn_err(CE_WARN, "zev: _init failed");
1676 	mutex_destroy(&zev_mutex);
1677 	ddi_soft_state_fini(&statep);
1678 	return (error);
1679 }
1680 
/*
 * Module information entry point: report linkage data to modinfo(1M).
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&zev_modlinkage, modinfop));
}
1686 
/*
 * Module unload entry point.
 *
 * Refuses to unload while the driver instance is still attached or
 * queues exist.  The global event list is drained twice: once before
 * mod_remove() so threads blocked on a full list can append their
 * event and drop rz_zev_rwlock, and once afterwards to free anything
 * appended in between.  Finally all global locks are destroyed.
 */
int
_fini(void)
{
	int error = 0;
	zev_msg_t *msg;
	zev_pool_list_entry_t *pe, *npe;

	mutex_enter(&zev_mutex);
	if (zev_attached == B_TRUE) {
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}
	if (zev_queue_cnt != 0) {
		/* should never happen */
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}

	/*
	 * avoid deadlock if event list is full: make sure threads currently
	 * blocking on the event list can append their event and then release
	 * rz_zev_rwlock.  Since there should be no queues left when we
	 * reach this point we can simply empty the event list and then
	 * wake everybody.
	 */
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		zev_free(msg, sizeof(*msg) + msg->size);
	}
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);

	/* switch ZFS event callbacks back to default (again) */
	rw_enter(&rz_zev_rwlock, RW_WRITER);
	rz_zev_callbacks = rz_zev_default_callbacks;
	rz_zev_set_active(B_FALSE);
	rw_exit(&rz_zev_rwlock);

	/* no thread is inside of the callbacks anymore.  Safe to remove. */

	/* unload module callbacks */
	if ((error = mod_remove(&zev_modlinkage)) != 0) {
		cmn_err(CE_WARN, "mod_remove failed: %d", error);
		return (error);
	}

	/* free resources */
	mutex_enter(&zev_mutex);
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		zev_free(msg, sizeof(*msg) + msg->size);
	}
	mutex_exit(&zev_mutex);
	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	pe = zev_muted_pools_head;
	while (pe) {
		npe = pe;
		pe = pe->next;
		zev_free(npe, sizeof(*npe));
	}
	rw_exit(&zev_pool_list_rwlock);
	ddi_soft_state_fini(&statep);
	rw_destroy(&zev_pool_list_rwlock);
	cv_destroy(&zev_condvar);
	mutex_destroy(&zev_mutex);
	mutex_destroy(&zev_mark_id_mutex);
	mutex_destroy(&zev_queue_msg_mutex);

	return (0);
}
1759 
1760