xref: /titanic_41/usr/src/uts/common/fs/zev/zev.c (revision d2eb38229d10d0077809edfbdc66da785bc33379)
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/time.h>	/* gethrtime(); the userland <time.h> is not available in the kernel */
#include <sys/fs/zev.h>
#include <sys/zev_callbacks.h>

typedef struct zev_state {
	kmutex_t	mutex;
	dev_info_t	*dip;
	boolean_t	busy;
} zev_state_t;

static void		*statep;
struct pollhead		zev_pollhead;

kmutex_t		zev_mutex;
kcondvar_t		zev_condvar;
krwlock_t		zev_pool_list_rwlock;
static zev_statistics_t	zev_statistics;
static boolean_t	zev_busy;
static kmutex_t		zev_mark_id_mutex;
static uint64_t		zev_mark_id = 0;

/*
 * The longest potential message comes from zev_zfs_mount() and
 * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
 *
 * Another candidate comes from zev_znode_rename_cb(); its message
 * contains three inode numbers and two filenames of up to MAXNAMELEN
 * bytes each.
 */
#define ZEV_MAX_MESSAGE_LEN	4096

/* If the queue size reaches 1 GB, stop ZFS ops and block the threads. */
#define ZEV_MAX_QUEUE_LEN		(1 * 1024 * 1024 * 1024)

/* Don't wake up poll()ing processes for every single message. */
#define ZEV_MIN_POLL_WAKEUP_QUEUE_LEN	8192
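
/*
 * Tuning sketch (illustrative, not part of this driver; the /dev/zev
 * device path is an assumption): both limits above are only defaults
 * and can be changed at runtime through the ioctls implemented in
 * zev_ioctl() below, each of which copies in a single uint64_t:
 *
 *	uint64_t len = 64 * 1024 * 1024;
 *	(void) ioctl(fd, ZEV_IOC_SET_MAX_QUEUE_LEN, &len);
 *	len = 4096;
 *	(void) ioctl(fd, ZEV_IOC_SET_POLL_WAKEUP_QUEUE_LEN, &len);
 */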

static zev_msg_t *zev_queue_head = NULL;
static zev_msg_t *zev_queue_tail = NULL;
static uint64_t zev_queue_len = 0;


typedef struct zev_pool_list_entry {
	struct zev_pool_list_entry	*next;
	char				name[MAXPATHLEN];
} zev_pool_list_entry_t;

static zev_pool_list_entry_t *zev_muted_pools_head = NULL;

/*
 * poll() wakeup thread.  Used to check periodically whether we have
 * bytes left in the queue that have not yet been made into a
 * pollwakeup() call.  This is meant to ensure a maximum waiting
 * time until an event is presented as a poll wakeup, while at
 * the same time not making every single event into a poll wakeup
 * of its own.
 */

static volatile int zev_wakeup_thread_run = 1;
static kthread_t *zev_poll_wakeup_thread = NULL;

static void
zev_poll_wakeup_thread_main(void)
{
	int wakeup;
	while (zev_wakeup_thread_run) {
		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
		/* check message queue */
		mutex_enter(&zev_mutex);
		wakeup = 0;
		if (zev_queue_head)
			wakeup = 1;
		mutex_exit(&zev_mutex);
		if (wakeup)
			pollwakeup(&zev_pollhead, POLLIN);
	}
	thread_exit();
}

static int
zev_ioc_mute_pool(char *poolname)
{
	zev_pool_list_entry_t *pe;
	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	/* pool already muted? */
	for (pe = zev_muted_pools_head; pe; pe = pe->next) {
		if (!strcmp(pe->name, poolname)) {
			rw_exit(&zev_pool_list_rwlock);
			return (EEXIST);
		}
	}
	/* KM_SLEEP allocations never fail, so no NULL check is needed */
	pe = kmem_zalloc(sizeof (*pe), KM_SLEEP);
	(void) strlcpy(pe->name, poolname, sizeof (pe->name));
	pe->next = zev_muted_pools_head;
	zev_muted_pools_head = pe;
	rw_exit(&zev_pool_list_rwlock);
	return (0);
}

static int
zev_ioc_unmute_pool(char *poolname)
{
	zev_pool_list_entry_t *pe, *peprev;
	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	/* pool muted? */
	peprev = NULL;
	for (pe = zev_muted_pools_head; pe; pe = pe->next) {
		if (!strcmp(pe->name, poolname)) {
			goto found;
		}
		peprev = pe;
	}
	rw_exit(&zev_pool_list_rwlock);
	return (ENOENT);
found:
	if (peprev != NULL) {
		peprev->next = pe->next;
	} else {
		zev_muted_pools_head = pe->next;
	}
	kmem_free(pe, sizeof (*pe));
	rw_exit(&zev_pool_list_rwlock);
	return (0);
}

int
zev_skip_pool(objset_t *os)
{
	zev_pool_list_entry_t *pe;
	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
	rw_enter(&zev_pool_list_rwlock, RW_READER);
	for (pe = zev_muted_pools_head; pe; pe = pe->next) {
		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
			rw_exit(&zev_pool_list_rwlock);
			return (1);
		}
	}
	rw_exit(&zev_pool_list_rwlock);
	return (0);
}
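
/*
 * Usage sketch (illustrative; zev_example_cb and its signature are
 * hypothetical, not taken from this file): the ZFS-side callbacks
 * wired up via zev_callbacks are expected to consult zev_skip_pool()
 * before doing any work, so muted pools generate no events at all.
 *
 *	static void
 *	zev_example_cb(objset_t *os)
 *	{
 *		if (zev_skip_pool(os))
 *			return;		(pool is muted: drop the event)
 *		(build a zev_msg_t and hand it to zev_queue_message())
 *	}
 */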

void
zev_queue_message(int op, zev_msg_t *msg)
{
	int wakeup = 0;

	msg->next = NULL;

	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
		zev_queue_error(op, "unknown op id encountered: %d", op);
		kmem_free(msg, sizeof (*msg) + msg->size);
		return;
	}

	mutex_enter(&zev_mutex);
	while (zev_statistics.zev_max_queue_len &&
	    zev_statistics.zev_queue_len >= zev_statistics.zev_max_queue_len) {
		/* queue full.  block until it has been shrunk. */
		cv_wait(&zev_condvar, &zev_mutex);
	}

	if (zev_queue_tail == NULL) {
		zev_queue_head = zev_queue_tail = msg;
	} else {
		zev_queue_tail->next = msg;
		zev_queue_tail = msg;
	}
	zev_queue_len++;

	/* update statistics */
	zev_statistics.zev_cnt_total_events++;
	zev_statistics.zev_queue_len += msg->size;
	if (zev_statistics.zev_queue_len >
	    zev_statistics.zev_poll_wakeup_queue_len)
		wakeup = 1;
	switch (op) {
	case ZEV_OP_ERROR:
		zev_statistics.zev_cnt_errors++;
		break;
	case ZEV_OP_MARK:
		zev_statistics.zev_cnt_marks++;
		break;
	case ZEV_OP_ZFS_MOUNT:
		zev_statistics.zev_cnt_zfs_mount++;
		break;
	case ZEV_OP_ZFS_UMOUNT:
		zev_statistics.zev_cnt_zfs_umount++;
		break;
	case ZEV_OP_ZVOL_WRITE:
		zev_statistics.zev_cnt_zvol_write++;
		break;
	case ZEV_OP_ZVOL_TRUNCATE:
		zev_statistics.zev_cnt_zvol_truncate++;
		break;
	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
		zev_statistics.zev_cnt_znode_close_after_update++;
		break;
	case ZEV_OP_ZNODE_CREATE:
		zev_statistics.zev_cnt_znode_create++;
		break;
	case ZEV_OP_ZNODE_REMOVE:
		zev_statistics.zev_cnt_znode_remove++;
		break;
	case ZEV_OP_ZNODE_LINK:
		zev_statistics.zev_cnt_znode_link++;
		break;
	case ZEV_OP_ZNODE_SYMLINK:
		zev_statistics.zev_cnt_znode_symlink++;
		break;
	case ZEV_OP_ZNODE_RENAME:
		zev_statistics.zev_cnt_znode_rename++;
		break;
	case ZEV_OP_ZNODE_WRITE:
		zev_statistics.zev_cnt_znode_write++;
		break;
	case ZEV_OP_ZNODE_TRUNCATE:
		zev_statistics.zev_cnt_znode_truncate++;
		break;
	case ZEV_OP_ZNODE_SETATTR:
		zev_statistics.zev_cnt_znode_setattr++;
		break;
	case ZEV_OP_ZNODE_ACL:
		zev_statistics.zev_cnt_znode_acl++;
		break;
	}
	mutex_exit(&zev_mutex);

	/* chpoll event, if necessary. */
	if (wakeup)
		pollwakeup(&zev_pollhead, POLLIN);
}
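
/*
 * Message ownership sketch (the record layout follows the macros in
 * <sys/fs/zev.h>; the surrounding code is illustrative, not a second
 * implementation).  Producers allocate the zev_msg_t header and the
 * record in a single kmem_alloc() block, fill in the record directly
 * behind the header, and pass ownership to zev_queue_message(); the
 * block is freed by zev_read() after delivery, or above on an
 * invalid op.  zev_queue_error() below follows exactly this pattern:
 *
 *	int rec_size = sizeof (zev_error_t) + errstr_len + 1;
 *	zev_msg_t *msg = kmem_alloc(sizeof (*msg) + rec_size, KM_SLEEP);
 *	msg->size = rec_size;
 *	zev_error_t *rec = (zev_error_t *)(msg + 1);
 *	(fill in rec->op, rec->op_time, ...)
 *	zev_queue_message(ZEV_OP_ERROR, msg);	(queue owns msg now)
 */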

void
zev_queue_error(int op, char *fmt, ...)
{
	char buf[ZEV_MAX_MESSAGE_LEN];
	va_list ap;
	int len;
	zev_msg_t *msg = NULL;
	zev_error_t *rec;
	int msg_size;

	va_start(ap, fmt);
	len = vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);
	if (len >= sizeof (buf)) {
		cmn_err(CE_WARN, "zev: can't report error - "
		    "dropping event entirely.");
		return;
	}

	msg_size = sizeof (*rec) + len + 1;
	msg = kmem_alloc(sizeof (*msg) + msg_size, KM_SLEEP);
	msg->size = msg_size;
	rec = (zev_error_t *)(msg + 1);
	rec->op = ZEV_OP_ERROR;
	rec->op_time = ddi_get_time();
	rec->guid = 0;
	rec->failed_op = op;
	rec->errstr_len = len;
	memcpy(ZEV_ERRSTR(rec), buf, len + 1);

	zev_queue_message(ZEV_OP_ERROR, msg);
}

static int
zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	int instance;
	zev_state_t *sp;
	zev_statistics_t zs;
	zev_ioctl_poolarg_t pa;
	zev_ioctl_mark_t mark;
	zev_mark_t *rec;
	int msg_size;
	zev_msg_t *msg;
	uint64_t len;
	uint64_t mark_id;

	instance = getminor(dev);
	if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
		return (ENXIO);
	/*
	 * All structures passed between kernel and userspace are
	 * compatible between the 64 and 32 bit data models, so the
	 * usual ddi_model_convert_from() adjustments can be skipped
	 * here.  (See "Writing Device Drivers" on 32/64-bit mode
	 * adjustments.)
	 */
	switch (cmd) {
	case ZEV_IOC_GET_STATISTICS:
		/* ddi_copyout() can take a long time.  Better make
		   a copy to be able to release the mutex faster. */
		mutex_enter(&zev_mutex);
		memcpy(&zs, &zev_statistics, sizeof (zs));
		mutex_exit(&zev_mutex);
		if (ddi_copyout(&zs, (void *)arg, sizeof (zs), mode) != 0)
			return (EFAULT);
		break;
	case ZEV_IOC_MUTE_POOL:
	case ZEV_IOC_UNMUTE_POOL:
		if (ddi_copyin((void *)arg, &pa, sizeof (pa), mode) != 0)
			return (EFAULT);
		if (pa.zev_poolname_len >= MAXPATHLEN)
			return (EINVAL);
		pa.zev_poolname[pa.zev_poolname_len] = '\0';
		if (cmd == ZEV_IOC_MUTE_POOL)
			return (zev_ioc_mute_pool(pa.zev_poolname));
		return (zev_ioc_unmute_pool(pa.zev_poolname));
	case ZEV_IOC_SET_MAX_QUEUE_LEN:
		if (ddi_copyin((void *)arg, &len, sizeof (len), mode) != 0)
			return (EFAULT);
		if (len > ZEV_MAX_QUEUE_LEN)
			return (EINVAL);
		mutex_enter(&zev_mutex);
		zev_statistics.zev_max_queue_len = len;
		cv_broadcast(&zev_condvar);
		mutex_exit(&zev_mutex);
		break;
	case ZEV_IOC_SET_POLL_WAKEUP_QUEUE_LEN:
		if (ddi_copyin((void *)arg, &len, sizeof (len), mode) != 0)
			return (EFAULT);
		mutex_enter(&zev_mutex);
		zev_statistics.zev_poll_wakeup_queue_len = len;
		mutex_exit(&zev_mutex);
		break;
	case ZEV_IOC_MARK:
		if (ddi_copyin((void *)arg, &mark, sizeof (mark), mode) != 0)
			return (EFAULT);
		/* bound the user-controlled allocation size */
		if (mark.zev_payload_len < 0 ||
		    mark.zev_payload_len > ZEV_MAX_MESSAGE_LEN)
			return (EINVAL);
		cmn_err(CE_WARN, "mark: guid=%llu payload_len=%d",
		    (u_longlong_t)mark.zev_guid, mark.zev_payload_len);
		/* prepare message */
		msg_size = sizeof (*rec) + mark.zev_payload_len + 1;
		msg = kmem_alloc(sizeof (*msg) + msg_size, KM_SLEEP);
		msg->size = msg_size;
		rec = (zev_mark_t *)(msg + 1);
		rec->record_len = msg_size;
		rec->op = ZEV_OP_MARK;
		rec->op_time = ddi_get_time();
		rec->guid = mark.zev_guid;
		rec->payload_len = mark.zev_payload_len;
		/* get payload */
		if (ddi_copyin(((char *)arg) + sizeof (mark),
		    ZEV_PAYLOAD(rec), mark.zev_payload_len, mode) != 0) {
			/* free the whole allocation, header included */
			kmem_free(msg, sizeof (*msg) + msg_size);
			return (EFAULT);
		}
		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
		/* get mark id and queue message */
		mutex_enter(&zev_mark_id_mutex);
		mark_id = zev_mark_id++;
		mutex_exit(&zev_mark_id_mutex);
		rec->mark_id = mark_id;
		zev_queue_message(ZEV_OP_MARK, msg);
		/* report mark id to userland, ignore errors */
		mark.zev_mark_id = mark_id;
		(void) ddi_copyout(&mark, (void *)arg, sizeof (mark), mode);
		break;
	default:
		/* generic "ioctl unknown" error */
		return (ENOTTY);
	}
	return (0);
}
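
/*
 * Userland sketch for ZEV_IOC_MARK (illustrative; the /dev/zev device
 * path is an assumption, and the inline payload layout mirrors the
 * copyin above, which reads the payload from directly behind the
 * zev_ioctl_mark_t argument):
 *
 *	int fd = open("/dev/zev", O_RDONLY);
 *	struct {
 *		zev_ioctl_mark_t	mark;
 *		char			payload[8];
 *	} m;
 *	m.mark.zev_guid = 0;
 *	m.mark.zev_payload_len = sizeof (m.payload);
 *	(void) memcpy(m.payload, "example", 8);
 *	if (ioctl(fd, ZEV_IOC_MARK, &m) == 0)
 *		printf("mark id: %llu\n",
 *		    (unsigned long long)m.mark.zev_mark_id);
 */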

static int
zev_chpoll(dev_t dev, short events, int anyyet,
    short *reventsp, struct pollhead **phpp)
{
	int instance;
	zev_state_t *sp;
	short revent = 0;

	instance = getminor(dev);
	if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
		return (ENXIO);
	if (events & POLLIN) {
		mutex_enter(&zev_mutex);
		if (zev_queue_head)
			revent |= POLLIN;
		mutex_exit(&zev_mutex);
	}
	if (revent == 0 && !anyyet)
		*phpp = &zev_pollhead;
	*reventsp = revent;
	return (0);
}

static int
zev_read(dev_t dev, struct uio *uio_p, cred_t *cred_p)
{
	zev_state_t *sp;
	int instance;
	offset_t off;
	int ret = 0;
	zev_msg_t *msg;
	char *data;

	instance = getminor(dev);
	if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
		return (ENXIO);
	off = uio_p->uio_loffset;
	mutex_enter(&zev_mutex);
	msg = zev_queue_head;
	if (msg == NULL) {
		mutex_exit(&zev_mutex);
		return (0);
	}
	if (msg->size > uio_p->uio_resid) {
		mutex_exit(&zev_mutex);
		return (E2BIG);
	}
	while (msg && uio_p->uio_resid >= msg->size) {
		data = (char *)(msg + 1);
		ret = uiomove(data, msg->size, UIO_READ, uio_p);
		if (ret != 0) {
			mutex_exit(&zev_mutex);
			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
			uio_p->uio_loffset = off;
			return (ret);
		}
		zev_queue_head = msg->next;
		if (zev_queue_head == NULL)
			zev_queue_tail = NULL;
		zev_statistics.zev_bytes_read += msg->size;
		zev_statistics.zev_queue_len -= msg->size;
		zev_queue_len--;
		kmem_free(msg, sizeof (*msg) + msg->size);
		msg = zev_queue_head;
	}
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);
	uio_p->uio_loffset = off;
	return (0);
}
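
/*
 * Consumer sketch (illustrative; the /dev/zev device path and the
 * buffer size are assumptions): a reader poll()s for POLLIN, then
 * drains as many whole messages as fit into its buffer with a single
 * read().  Per the code above, read() returns E2BIG if the buffer
 * cannot hold even the first queued message, and 0 if the queue is
 * empty.
 *
 *	struct pollfd pfd = { fd, POLLIN, 0 };
 *	char buf[ZEV_MAX_MESSAGE_LEN];
 *	for (;;) {
 *		if (poll(&pfd, 1, -1) < 0)
 *			break;
 *		ssize_t n = read(fd, buf, sizeof (buf));
 *		if (n <= 0)
 *			continue;
 *		(walk the records in buf using each record's length field)
 *	}
 */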

static int
zev_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	zev_state_t *sp;
	int instance;

	instance = getminor(dev);
	if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
		return (ENXIO);
	if (otyp != OTYP_CHR)
		return (EINVAL);
	mutex_enter(&sp->mutex);
	if (sp->busy != B_TRUE) {
		mutex_exit(&sp->mutex);
		return (EINVAL);
	}
	sp->busy = B_FALSE;
	mutex_exit(&sp->mutex);
	return (0);
}

static int
zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	zev_state_t *sp;
	int instance;

	instance = getminor(*devp);
	if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
		return (ENXIO);
	if (otyp != OTYP_CHR)
		return (EINVAL);
	if (drv_priv(credp) != 0)
		return (EPERM);
	mutex_enter(&sp->mutex);
	if (sp->busy == B_TRUE) {
		/* XXX: wait for the instance to become available? */
		/* XXX: if we wait, the wait should be signal-interruptible. */
		mutex_exit(&sp->mutex);
		return (EBUSY);
	}
	sp->busy = B_TRUE;	/* can only be opened exclusively */
	mutex_exit(&sp->mutex);
	return (0);
}

static struct cb_ops zev_cb_ops = {
	zev_open,		/* open */
	zev_close,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	zev_read,		/* read */
	nodev,			/* write */
	zev_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	zev_chpoll,		/* chpoll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* streamtab */
	D_MP | D_64BIT,		/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* aread */
	nodev,			/* awrite */
};

static void
zev_free_instance(dev_info_t *dip)
{
	int instance;
	zev_state_t *sp;
	instance = ddi_get_instance(dip);
	ddi_remove_minor_node(dip, NULL);
	sp = ddi_get_soft_state(statep, instance);
	if (sp) {
		mutex_destroy(&sp->mutex);
		ddi_soft_state_free(statep, instance);
	}
}

static int
zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance;
	zev_state_t *sp;
	/* called once per instance with DDI_DETACH,
	   may be called to suspend */
	switch (cmd) {
	case DDI_DETACH:
		/* instance busy? */
		instance = ddi_get_instance(dip);
		if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
			return (DDI_FAILURE);
		mutex_enter(&sp->mutex);
		if (sp->busy == B_TRUE) {
			mutex_exit(&sp->mutex);
			return (DDI_FAILURE);
		}
		mutex_exit(&sp->mutex);
		/* free resources allocated for this instance */
		zev_free_instance(dip);
		return (DDI_SUCCESS);
	case DDI_SUSPEND:
		/* kernel must not suspend zev devices while ZFS is running */
		return (DDI_FAILURE);
	default:
		return (DDI_FAILURE);
	}
}

static int
zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	/* called once per instance with DDI_ATTACH,
	   may be called to resume */
	int instance;
	zev_state_t *sp;
	switch (cmd) {
	case DDI_ATTACH:
		instance = ddi_get_instance(dip);
		if (ddi_soft_state_zalloc(statep, instance) != DDI_SUCCESS) {
			return (DDI_FAILURE);
		}
		sp = ddi_get_soft_state(statep, instance);
		ddi_set_driver_private(dip, sp);
		sp->dip = dip;
		sp->busy = B_FALSE;
		mutex_init(&sp->mutex, NULL, MUTEX_DRIVER, NULL);
		if (ddi_create_minor_node(dip, ddi_get_name(dip),
		    S_IFCHR, instance, DDI_PSEUDO, 0) == DDI_FAILURE) {
			zev_free_instance(dip);
			return (DDI_FAILURE);
		}
		ddi_report_dev(dip);
		return (DDI_SUCCESS);
	case DDI_RESUME:
		/* suspending zev devices is not supported, so there
		   is nothing to do on resume */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static int
zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
{
	int instance;
	zev_state_t *sp;
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		/* arg is dev_t */
		instance = getminor((dev_t)arg);
		if ((sp = ddi_get_soft_state(statep, instance)) != NULL) {
			*resultp = sp->dip;
			return (DDI_SUCCESS);
		}
		*resultp = NULL;
		return (DDI_FAILURE);
	case DDI_INFO_DEVT2INSTANCE:
		/* arg is dev_t; instance lookup always succeeds */
		instance = getminor((dev_t)arg);
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	}
	return (DDI_FAILURE);
}

static struct dev_ops zev_dev_ops = {
	DEVO_REV,			/* driver build revision */
	0,				/* driver reference count */
	zev_getinfo,			/* getinfo */
	nulldev,			/* identify (obsolete) */
	nulldev,			/* probe (search for devices) */
	zev_attach,			/* attach */
	zev_detach,			/* detach */
	nodev,				/* reset (obsolete, use quiesce) */
	&zev_cb_ops,			/* character and block device ops */
	NULL,				/* bus driver ops */
	NULL,				/* power management, not needed */
	ddi_quiesce_not_needed,		/* quiesce */
};

static struct modldrv zev_modldrv = {
	&mod_driverops,			/* all loadable modules use this */
	"zev ZFS event provider, v1.0",	/* driver name and version info */
	&zev_dev_ops			/* ops method pointers */
};

static struct modlinkage zev_modlinkage = {
	MODREV_1,	/* fixed value */
	{
		&zev_modldrv,	/* driver linkage structure */
		NULL		/* list terminator */
	}
};

int
_init(void)
{
	int error;
	boolean_t module_installed = B_FALSE;

	error = ddi_soft_state_init(&statep, sizeof (zev_state_t), 1);
	if (error != 0)
		return (error);
	zev_busy = B_FALSE;

	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
	zev_mark_id = gethrtime();
	bzero(&zev_statistics, sizeof (zev_statistics));
	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
	zev_statistics.zev_poll_wakeup_queue_len =
	    ZEV_MIN_POLL_WAKEUP_QUEUE_LEN;
	if (zev_ioc_mute_pool("zg0")) {
		cmn_err(CE_WARN, "zev: could not init mute list");
		goto FAIL;
	}

	if ((error = mod_install(&zev_modlinkage)) != 0) {
		cmn_err(CE_WARN, "zev: could not install module");
		goto FAIL;
	}
	module_installed = B_TRUE;

	/*
	 * Note: _init() seems to be a bad place to access other modules'
	 * device files, as it can cause a kernel panic.
	 *
	 * For example, our _init() is called if our module isn't loaded
	 * when someone causes a readdir() in "/devices/pseudo".  For that,
	 * devfs_readdir() is used, which obtains an rwlock for the
	 * directory.
	 *
	 * Then, if we open a device file here, we will indirectly call
	 * devfs_lookup(), which tries to obtain the same rwlock
	 * again, which this thread already has.  That will result in
	 * a kernel panic. ("recursive entry")
	 *
	 * Therefore, we have switched from a zfs ioctl() to directly
	 * accessing symbols in the zfs module.
	 */

	/* switch ZFS event callbacks to zev module callback functions */
	rw_enter(&rz_zev_rwlock, RW_WRITER);
	rz_zev_callbacks = &zev_callbacks;
	rw_exit(&rz_zev_rwlock);

	zev_poll_wakeup_thread = thread_create(NULL, 0,
	    zev_poll_wakeup_thread_main, NULL, 0, &p0, TS_RUN, minclsyspri);
	return (0);
FAIL:
	/* free all resources allocated above */
	if (module_installed == B_TRUE)
		(void) mod_remove(&zev_modlinkage);
	(void) zev_ioc_unmute_pool("zg0");	/* free mute list entry */
	mutex_destroy(&zev_mark_id_mutex);
	rw_destroy(&zev_pool_list_rwlock);
	cv_destroy(&zev_condvar);
	mutex_destroy(&zev_mutex);
	ddi_soft_state_fini(&statep);
	return (error);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&zev_modlinkage, modinfop));
}

int
_fini(void)
{
	int error = 0;
	zev_msg_t *msg;
	zev_pool_list_entry_t *pe, *npe;

	mutex_enter(&zev_mutex);
	if (zev_busy == B_TRUE) {
		mutex_exit(&zev_mutex);
		return (SET_ERROR(EBUSY));
	}
	mutex_exit(&zev_mutex);

	/* switch ZFS event callbacks back to default */
	rw_enter(&rz_zev_rwlock, RW_WRITER);
	rz_zev_callbacks = rz_zev_default_callbacks;
	rw_exit(&rz_zev_rwlock);

	/* no thread is inside of the callbacks anymore.  Safe to remove. */
	zev_wakeup_thread_run = 0;
	if (zev_poll_wakeup_thread != NULL) {
		thread_join(zev_poll_wakeup_thread->t_did);
		zev_poll_wakeup_thread = NULL;
	}
	if ((error = mod_remove(&zev_modlinkage)) != 0) {
		cmn_err(CE_WARN, "mod_remove failed: %d", error);
		return (error);
	}

	/* free resources */
	mutex_enter(&zev_mutex);
	while (zev_queue_head) {
		msg = zev_queue_head;
		zev_queue_head = msg->next;
		kmem_free(msg, sizeof (*msg) + msg->size);
	}
	zev_queue_tail = NULL;
	mutex_exit(&zev_mutex);
	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	pe = zev_muted_pools_head;
	while (pe) {
		npe = pe;
		pe = pe->next;
		kmem_free(npe, sizeof (*npe));
	}
	zev_muted_pools_head = NULL;
	rw_exit(&zev_pool_list_rwlock);
	ddi_soft_state_fini(&statep);
	rw_destroy(&zev_pool_list_rwlock);
	cv_destroy(&zev_condvar);
	mutex_destroy(&zev_mutex);
	mutex_destroy(&zev_mark_id_mutex);

	return (0);
}