xref: /titanic_41/usr/src/uts/common/fs/zev/zev.c (revision c463c4d9e1af14a90c14bd090bbdce01ae058d6f)
1 #include <sys/modctl.h>
2 #include <sys/ddi.h>
3 #include <sys/sunddi.h>
4 #include <sys/conf.h>
5 #include <sys/devops.h>
6 #include <sys/stat.h>
7 #include <sys/fs/zev.h>
8 #include <sys/zev_callbacks.h>
9 #include <sys/time.h>
10 
/*
 * Per-instance soft state, allocated in zev_attach() and released in
 * zev_free_instance().
 */
typedef struct zev_state {
	kmutex_t	mutex;	/* held while reading or changing 'busy' */
	dev_info_t	*dip;	/* devinfo node recorded at attach time */
	boolean_t	busy;	/* B_TRUE between zev_open() and zev_close() */
} zev_state_t;
16 
/* soft state handle used with the ddi_soft_state_*() functions */
static void		*statep;
/* pollhead handed out by zev_chpoll() and signalled via pollwakeup() */
struct pollhead		zev_pollhead;

/* zev_mutex is held around message queue and zev_statistics accesses */
kmutex_t		zev_mutex;
/* producers wait here while the queue is full; readers broadcast */
kcondvar_t		zev_condvar;
/* held around accesses to the list of muted pools */
krwlock_t		zev_pool_list_rwlock;
static zev_statistics_t	zev_statistics;
/*
 * Cleared in _init() and checked in _fini(); nothing in this file sets
 * it to B_TRUE.
 */
static boolean_t	zev_busy;
/* zev_mark_id_mutex is held while a new mark id is taken */
static kmutex_t		zev_mark_id_mutex;
/* next mark id; seeded from gethrtime() in _init() */
static uint64_t		zev_mark_id = 0;
27 
/*
 * The longest potential message is from zev_zfs_mount() and
 * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
 *
 * Another candidate is zev_znode_rename_cb() and contains three inode
 * numbers and two filenames of up to MAXNAMELEN bytes each.
 *
 * In this file the constant sizes the formatting buffer of
 * zev_queue_error().
 */
#define ZEV_MAX_MESSAGE_LEN	4096

/*
 * If the queue size reaches 1GB, stop ZFS ops and block the threads.
 * This is the initial limit and also the largest value accepted by
 * ZEV_IOC_SET_MAX_QUEUE_LEN.
 */
#define ZEV_MAX_QUEUE_LEN		(1 * 1024 * 1024 * 1024)

/*
 * Don't wake up poll()ing processes for every single message.
 * Used as the initial poll wakeup threshold (in queued bytes);
 * ZEV_IOC_SET_POLL_WAKEUP_QUEUE_LEN stores any value without checking
 * it against this constant.
 */
#define ZEV_MIN_POLL_WAKEUP_QUEUE_LEN	8192

/* singly linked message queue: read from the head, appended at the tail */
static zev_msg_t *zev_queue_head = NULL;
static zev_msg_t *zev_queue_tail = NULL;
/*
 * Number of queued messages.  The queued byte count is kept separately
 * in zev_statistics.zev_queue_len.
 */
static uint64_t zev_queue_len = 0;
46 
47 
/* One element of the singly linked list of muted pools. */
typedef struct zev_pool_list_entry {
	struct zev_pool_list_entry	*next;	/* next entry or NULL */
	char				name[MAXPATHLEN]; /* pool name */
} zev_pool_list_entry_t;

/* list head; accessed with zev_pool_list_rwlock held */
static zev_pool_list_entry_t *zev_muted_pools_head = NULL;
54 
/*
 * poll() wakeup thread.  Used to check periodically whether there are
 * messages left in the queue that have not yet been turned into a
 * pollwakeup() call.  This is meant to ensure a maximum waiting
 * time until an event is presented as a poll wakeup, while at
 * the same time not turning every single event into a poll wakeup
 * of its own.
 */

/* loop condition of the wakeup thread; cleared by _fini() to stop it */
static volatile int zev_wakeup_thread_run = 1;
/* thread created in _init() and joined in _fini() */
static kthread_t *zev_poll_wakeup_thread = NULL;
66 
67 static void
68 zev_poll_wakeup_thread_main(void)
69 {
70 	int wakeup;
71 	while (zev_wakeup_thread_run) {
72 		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
73 		/* check message queue */
74 		mutex_enter(&zev_mutex);
75 		wakeup = 0;
76 		if (zev_queue_head)
77 			wakeup = 1;
78 		mutex_exit(&zev_mutex);
79 		if (wakeup)
80 			pollwakeup(&zev_pollhead, POLLIN);
81 	}
82 	thread_exit();
83 }
84 
85 static int
86 zev_ioc_mute_pool(char *poolname)
87 {
88 	zev_pool_list_entry_t *pe;
89 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
90 	/* pool already muted? */
91 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
92 		if (!strcmp(pe->name, poolname)) {
93 			rw_exit(&zev_pool_list_rwlock);
94 			return EEXIST;
95 		}
96 	}
97 	pe = kmem_zalloc(sizeof(*pe), KM_SLEEP);
98 	if (!pe) {
99 		rw_exit(&zev_pool_list_rwlock);
100 		return ENOMEM;
101 	}
102 	(void) strncpy(pe->name, poolname, sizeof(pe->name));
103 	pe->next = zev_muted_pools_head;
104 	zev_muted_pools_head = pe;
105 	rw_exit(&zev_pool_list_rwlock);
106 	return (0);
107 }
108 
109 static int
110 zev_ioc_unmute_pool(char *poolname)
111 {
112 	zev_pool_list_entry_t *pe, *peprev;
113 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
114 	/* pool muted? */
115 	peprev = NULL;
116 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
117 		if (!strcmp(pe->name, poolname)) {
118 			goto found;
119 		}
120 		peprev = pe;
121 	}
122 	rw_exit(&zev_pool_list_rwlock);
123 	return ENOENT;
124 found:
125 	if (peprev != NULL) {
126 		peprev->next = pe->next;
127 	} else {
128 		zev_muted_pools_head = pe->next;
129 	}
130 	kmem_free(pe, sizeof(*pe));
131 	rw_exit(&zev_pool_list_rwlock);
132 	return (0);
133 }
134 
135 int
136 zev_skip_pool(objset_t *os)
137 {
138 	zev_pool_list_entry_t *pe;
139 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
140 	rw_enter(&zev_pool_list_rwlock, RW_READER);
141 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
142 		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
143 			rw_exit(&zev_pool_list_rwlock);
144 			return 1;
145 		}
146 	}
147 	rw_exit(&zev_pool_list_rwlock);
148 	return 0;
149 }
150 
151 void
152 zev_queue_message(int op, zev_msg_t *msg)
153 {
154 	int wakeup = 0;
155 
156 	msg->next = NULL;
157 
158 	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
159 		zev_queue_error(op, "unknown op id encountered: %d", op);
160 		kmem_free(msg, sizeof(*msg) + msg->size);
161 		return;
162 	}
163 
164 	mutex_enter(&zev_mutex);
165 	while (zev_statistics.zev_max_queue_len &&
166 	    zev_statistics.zev_queue_len >= zev_statistics.zev_max_queue_len) {
167 		/* queue full.  block until it's been shrunk. */
168 		cv_wait(&zev_condvar, &zev_mutex);
169 	}
170 
171 	if (zev_queue_tail == NULL) {
172 		zev_queue_head = zev_queue_tail = msg;
173 	} else {
174 		zev_queue_tail->next = msg;
175 		zev_queue_tail = msg;
176 	}
177 	zev_queue_len++;
178 
179 	/* update statistics */
180 	zev_statistics.zev_cnt_total_events++;
181 	zev_statistics.zev_queue_len += msg->size;
182 	if (zev_statistics.zev_queue_len >
183 	    zev_statistics.zev_poll_wakeup_queue_len)
184 		wakeup = 1;
185 	switch (op) {
186 	case ZEV_OP_ERROR:
187 		zev_statistics.zev_cnt_errors++;
188 		break;
189 	case ZEV_OP_MARK:
190 		zev_statistics.zev_cnt_marks++;
191 		break;
192 	case ZEV_OP_ZFS_MOUNT:
193 		zev_statistics.zev_cnt_zfs_mount++;
194 		break;
195 	case ZEV_OP_ZFS_UMOUNT:
196 		zev_statistics.zev_cnt_zfs_umount++;
197 		break;
198 	case ZEV_OP_ZVOL_WRITE:
199 		zev_statistics.zev_cnt_zvol_write++;
200 		break;
201 	case ZEV_OP_ZVOL_TRUNCATE:
202 		zev_statistics.zev_cnt_zvol_truncate++;
203 		break;
204 	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
205 		zev_statistics.zev_cnt_znode_close_after_update++;
206 		break;
207 	case ZEV_OP_ZNODE_CREATE:
208 		zev_statistics.zev_cnt_znode_create++;
209 		break;
210 	case ZEV_OP_ZNODE_REMOVE:
211 		zev_statistics.zev_cnt_znode_remove++;
212 		break;
213 	case ZEV_OP_ZNODE_LINK:
214 		zev_statistics.zev_cnt_znode_link++;
215 		break;
216 	case ZEV_OP_ZNODE_SYMLINK:
217 		zev_statistics.zev_cnt_znode_symlink++;
218 		break;
219 	case ZEV_OP_ZNODE_RENAME:
220 		zev_statistics.zev_cnt_znode_rename++;
221 		break;
222 	case ZEV_OP_ZNODE_WRITE:
223 		zev_statistics.zev_cnt_znode_write++;
224 		break;
225 	case ZEV_OP_ZNODE_TRUNCATE:
226 		zev_statistics.zev_cnt_znode_truncate++;
227 		break;
228 	case ZEV_OP_ZNODE_SETATTR:
229 		zev_statistics.zev_cnt_znode_setattr++;
230 		break;
231 	case ZEV_OP_ZNODE_ACL:
232 		zev_statistics.zev_cnt_znode_acl++;
233 		break;
234 	}
235 	mutex_exit(&zev_mutex);
236 
237 	/* chpoll event, if necessary.  */
238 	if (wakeup)
239 		pollwakeup(&zev_pollhead, POLLIN);
240 
241 	return;
242 }
243 
244 void
245 zev_queue_error(int op, char *fmt, ...)
246 {
247 	char buf[ZEV_MAX_MESSAGE_LEN];
248 	va_list ap;
249 	int len;
250 	zev_msg_t *msg = NULL;
251 	zev_error_t *rec;
252 	int msg_size;
253 
254 	va_start(ap, fmt);
255 	len = vsnprintf(buf, sizeof(buf), fmt, ap);
256 	va_end(ap);
257 	if (len >= sizeof(buf)) {
258 		cmn_err(CE_WARN, "zev: can't report error - "
259 		        "dropping event entirely.");
260 		return;
261 	}
262 
263 	msg_size = sizeof(*rec) + len + 1;
264 	msg = kmem_alloc(sizeof(*msg) + msg_size, KM_SLEEP);
265 	msg->size = msg_size;
266 	rec = (zev_error_t *)(msg + 1);
267 	rec->record_len = msg_size;
268 	rec->op = ZEV_OP_ERROR;
269 	rec->op_time = ddi_get_time();
270 	rec->guid = 0;
271 	rec->failed_op = op;
272 	rec->errstr_len = len;
273 	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);
274 
275 	zev_queue_message(ZEV_OP_ERROR, msg);
276 	return;
277 }
278 
279 /* ARGSUSED */
280 static int
281 zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
282 {
283 	int instance;
284 	zev_statistics_t zs;
285 	zev_ioctl_poolarg_t pa;
286 	zev_ioctl_mark_t mark;
287 	zev_mark_t *rec;
288 	int msg_size;
289 	zev_msg_t *msg;
290 	uint64_t len;
291 	uint64_t mark_id;
292 
293 	instance = getminor(dev);
294 	if (ddi_get_soft_state(statep, instance) == NULL)
295 		return (ENXIO);
296 	/*
297 	 * all structures passed between kernel and userspace
298 	 * are now compatible between 64 and 32 bit.  Model
299 	 * conversion can be ignore.
300 	 */
301 #if 0
302 	/* Remember to do 32/64 bit mode adjustments if
303 	   necessary.  See "Writing Device Drivers", 280pp */
304 	if (ddi_model_convert_from(mode) != DDI_MODEL_NONE) {
305 		/* userland has another data model.  (most
306 		   likely 32-bit) -> not supported. */
307 		return (EINVAL);
308 	}
309 #endif
310 	switch (cmd) {
311 	case ZEV_IOC_GET_STATISTICS:
312 		/* ddi_copyout() can take a long time.  Better make
313 		   a copy to be able to release the mutex faster. */
314 		mutex_enter(&zev_mutex);
315 		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
316 		mutex_exit(&zev_mutex);
317 		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
318 			return EFAULT;
319 		break;
320 	case ZEV_IOC_MUTE_POOL:
321 	case ZEV_IOC_UNMUTE_POOL:
322 		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0)
323 			return EFAULT;
324 		if (pa.zev_poolname_len >=MAXPATHLEN)
325 			return EINVAL;
326 		pa.zev_poolname[pa.zev_poolname_len] = '\0';
327 		if (cmd == ZEV_IOC_MUTE_POOL) {
328 			return zev_ioc_mute_pool(pa.zev_poolname);
329 		} else {
330 			return zev_ioc_unmute_pool(pa.zev_poolname);
331 		}
332 	case ZEV_IOC_SET_MAX_QUEUE_LEN:
333 		if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0)
334 			return EFAULT;
335 		if (len > ZEV_MAX_QUEUE_LEN)
336 			return EINVAL;
337 		mutex_enter(&zev_mutex);
338 		zev_statistics.zev_max_queue_len = len;
339 		cv_broadcast(&zev_condvar);
340 		mutex_exit(&zev_mutex);
341 		break;
342 	case ZEV_IOC_SET_POLL_WAKEUP_QUEUE_LEN:
343 		if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0)
344 			return EFAULT;
345 		mutex_enter(&zev_mutex);
346 		zev_statistics.zev_poll_wakeup_queue_len = len;
347 		mutex_exit(&zev_mutex);
348 		break;
349 	case ZEV_IOC_MARK:
350 		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0)
351 			return EFAULT;
352 		cmn_err(CE_WARN, "mark: guid=%lu payload_len=%d", (long unsigned int)mark.zev_guid, mark.zev_payload_len);
353 		/* prepare message */
354 		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
355 		msg = kmem_alloc(sizeof(*msg) + msg_size, KM_SLEEP);
356 		msg->size = msg_size;
357 		rec = (zev_mark_t *)(msg + 1);
358 		rec->record_len = msg_size;
359 		rec->op = ZEV_OP_MARK;
360 		rec->op_time = ddi_get_time();
361 		rec->guid = mark.zev_guid;
362 		rec->payload_len = mark.zev_payload_len;
363 		/* get payload */
364 		if (ddi_copyin(((char *)arg) + sizeof(mark),
365 		               ZEV_PAYLOAD(rec),
366 		               mark.zev_payload_len, mode) != 0) {
367 			kmem_free(msg, msg_size);
368 			return EFAULT;
369 		}
370 		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
371 		/* get mark id and queue message */
372 		mutex_enter(&zev_mark_id_mutex);
373 		mark_id = zev_mark_id++;
374 		mutex_exit(&zev_mark_id_mutex);
375 		rec->mark_id = mark_id;
376 		zev_queue_message(ZEV_OP_MARK, msg);
377 		/* report mark id to userland, ignore errors */
378 		mark.zev_mark_id = mark_id;
379 		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
380 		break;
381 	default:
382 		/* generic "ioctl unknown" error */
383 		return (ENOTTY);
384 	}
385 	return (0);
386 }
387 
388 static int
389 zev_chpoll(dev_t dev, short events, int anyyet,
390     short *reventsp, struct pollhead **phpp)
391 {
392 	int instance;
393 	short revent = 0;
394 
395 	instance = getminor(dev);
396 	if (ddi_get_soft_state(statep, instance) == NULL)
397 		return (ENXIO);
398 	revent = 0;
399 	if ((events & POLLIN)) {
400 		mutex_enter(&zev_mutex);
401 		if (zev_queue_head)
402 			revent |= POLLIN;
403 		mutex_exit(&zev_mutex);
404 	}
405 	if (revent == 0) {
406 		if (!anyyet) {
407 			*phpp = &zev_pollhead;
408 		}
409 	}
410 	*reventsp = revent;
411 	return (0);
412 }
413 
414 /* ARGSUSED */
415 static int
416 zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
417 {
418 	int instance;
419 	offset_t off;
420 	int ret = 0;
421 	zev_msg_t *msg;
422 	char *data;
423 
424 	instance = getminor(dev);
425 	if (ddi_get_soft_state(statep, instance) == NULL)
426 		return (ENXIO);
427 	off = uio_p->uio_loffset;
428 	mutex_enter(&zev_mutex);
429 	msg = zev_queue_head;
430 	if (msg == NULL) {
431 		mutex_exit(&zev_mutex);
432 		return 0;
433 	}
434 	if (msg->size > uio_p->uio_resid) {
435 		mutex_exit(&zev_mutex);
436 		return E2BIG;
437 	}
438 	while (msg && uio_p->uio_resid >= msg->size) {
439 		data = (char *)(msg + 1);
440 		ret = uiomove(data, msg->size, UIO_READ, uio_p);
441 		if (ret != 0) {
442 			mutex_exit(&zev_mutex);
443 			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
444 			uio_p->uio_loffset = off;
445 			return (ret);
446 		}
447 		zev_queue_head = msg->next;
448 		if (zev_queue_head == NULL)
449 			zev_queue_tail = NULL;
450 		zev_statistics.zev_bytes_read += msg->size;
451 		zev_statistics.zev_queue_len -= msg->size;
452 		zev_queue_len--;
453 		kmem_free(msg, sizeof(*msg) + msg->size);
454 		msg = zev_queue_head;
455 	}
456 	cv_broadcast(&zev_condvar);
457 	mutex_exit(&zev_mutex);
458 	uio_p->uio_loffset = off;
459 	return 0;
460 }
461 
462 /* ARGSUSED */
463 static int
464 zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
465 {
466 	zev_state_t *sp;
467 	int instance;
468 
469 	instance = getminor(dev);
470 	if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
471 		return (ENXIO);
472 	if (otyp != OTYP_CHR)
473 		return (EINVAL);
474 	mutex_enter(&sp->mutex);
475 	if (sp->busy != B_TRUE) {
476 		mutex_exit(&sp->mutex);
477 		return (EINVAL);
478 	}
479 	sp->busy = B_FALSE;
480 	mutex_exit(&sp->mutex);
481 	return (0);
482 }
483 
484 /* ARGSUSED */
485 static int
486 zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
487 {
488 	zev_state_t *sp;
489 	int instance;
490 
491 	instance = getminor(*devp);
492 	if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
493 		return (ENXIO);
494 	if (otyp != OTYP_CHR)
495 		return (EINVAL);
496 	if (drv_priv(credp) != 0)
497 		return (EPERM);
498 	mutex_enter(&sp->mutex);
499 	if (sp->busy == B_TRUE) {
500 		/* XXX: wait for the instance to become available? */
501 		/* XXX: if we wait, the wait should be signal-interruptable. */
502 		mutex_exit(&sp->mutex);
503 		return (EBUSY);
504 	}
505 	sp->busy = B_TRUE;	/* can only be opened exclusively */
506 	mutex_exit(&sp->mutex);
507 	return (0);
508 }
509 
/*
 * Character device entry points.  Only open, close, read, ioctl and
 * chpoll are implemented by this driver; the other function slots are
 * filled with nodev.
 */
static struct cb_ops zev_cb_ops = {
	zev_open,		/* open */
	zev_close,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	zev_read,		/* read */
	nodev,			/* write */
	zev_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	zev_chpoll,		/* chpoll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* streamtab */
	D_MP | D_64BIT,		/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* aread */
	nodev,			/* awrite */
};
530 
531 static void
532 zev_free_instance(dev_info_t *dip)
533 {
534 	int instance;
535 	zev_state_t *sp;
536 	instance = ddi_get_instance(dip);
537 	//ddi_remove_minor_node(dip, ddi_get_name(dip));
538 	ddi_remove_minor_node(dip, NULL);
539 	sp = ddi_get_soft_state(statep, instance);
540 	if (sp) {
541 		mutex_destroy(&sp->mutex);
542 		ddi_soft_state_free(statep, instance);
543 	}
544 }
545 
546 static int
547 zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
548 {
549 	int instance;
550 	zev_state_t *sp;
551 	/* called once per instance with DDI_DETACH,
552 	   may be called to suspend */
553 	switch (cmd) {
554 	case DDI_DETACH:
555 		/* instance busy? */
556 		instance = ddi_get_instance(dip);
557 		if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
558 			return (DDI_FAILURE);
559 		mutex_enter(&sp->mutex);
560 		if (sp->busy == B_TRUE) {
561 			mutex_exit(&sp->mutex);
562 			return (DDI_FAILURE);
563 		}
564 		mutex_exit(&sp->mutex);
565 		/* free resources allocated for this instance */
566 		zev_free_instance(dip);
567 		return (DDI_SUCCESS);
568 	case DDI_SUSPEND:
569 		/* kernel must not suspend zev devices while ZFS is running */
570 		return (DDI_FAILURE);
571 	default:
572 		return (DDI_FAILURE);
573 	}
574 }
575 
576 static int
577 zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
578 {
579 	/* called once per instance with DDI_ATTACH,
580 	   may be called to resume */
581 	int instance;
582 	zev_state_t *sp;
583 	switch (cmd) {
584 	case DDI_ATTACH:
585 		instance = ddi_get_instance(dip);
586 		if (ddi_soft_state_zalloc(statep, instance) != DDI_SUCCESS) {
587 			return (DDI_FAILURE);
588 		}
589 		sp = ddi_get_soft_state(statep, instance);
590 		ddi_set_driver_private(dip, sp);
591 		sp->dip = dip;
592 		sp->busy = B_FALSE;
593 		mutex_init(&sp->mutex, NULL, MUTEX_DRIVER, NULL);
594 		if (ddi_create_minor_node(dip, ddi_get_name(dip),
595 		    S_IFCHR, instance, DDI_PSEUDO, 0) == DDI_FAILURE) {
596 			zev_free_instance(dip);
597 			return (DDI_FAILURE);
598 		}
599 		ddi_report_dev(dip);
600 		return (DDI_SUCCESS);
601 	case DDI_RESUME:
602 		/* suspendeding zev devices should never happen */
603 		return (DDI_SUCCESS);
604 	default:
605 		return (DDI_FAILURE);
606 	}
607 }
608 
609 /* ARGSUSED */
610 static int
611 zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
612 {
613 	int instance;
614 	zev_state_t *sp;
615 	switch (infocmd) {
616 	case DDI_INFO_DEVT2DEVINFO:
617 		/* arg is dev_t */
618 		instance = getminor((dev_t)arg);
619 		if ((sp = ddi_get_soft_state(statep, instance)) != NULL) {
620 			*resultp = sp->dip;
621 			return (DDI_SUCCESS);
622 		}
623 		*resultp = NULL;
624 		return (DDI_FAILURE);
625 	case DDI_INFO_DEVT2INSTANCE:
626 		/* arg is dev_t */
627 		instance = getminor((dev_t)arg);
628 		*resultp = (void *)(uintptr_t)instance;
629 		return (DDI_FAILURE);
630 	}
631 	return (DDI_FAILURE);
632 }
633 
/*
 * Driver entry points: getinfo, attach and detach are implemented by
 * this driver, and zev_cb_ops supplies the character device operations.
 */
static struct dev_ops zev_dev_ops = {
	DEVO_REV,			/* driver build revision */
	0,				/* driver reference count */
	zev_getinfo,			/* getinfo */
	nulldev,			/* identify (obsolete) */
	nulldev,			/* probe (search for devices) */
	zev_attach,			/* attach */
	zev_detach,			/* detach */
	nodev,				/* reset (obsolete, use quiesce) */
	&zev_cb_ops,			/* character and block device ops */
	NULL,				/* bus driver ops */
	NULL,				/* power management, not needed */
	ddi_quiesce_not_needed,		/* quiesce */
};
648 
/* Module linkage record describing this module as a device driver. */
static struct modldrv zev_modldrv = {
	&mod_driverops,			/* all loadable modules use this */
	"zev ZFS event provider, v1.0",	/* driver name and version info */
	&zev_dev_ops			/* ops method pointers */
};
654 
/* Passed to mod_install(), mod_info() and mod_remove() below. */
static struct modlinkage zev_modlinkage = {
	MODREV_1,	/* fixed value */
	{
		&zev_modldrv,	/* driver linkage structure */
		NULL		/* list terminator */
	}
};
662 
663 int
664 _init(void)
665 {
666 	int error;
667 	boolean_t module_installed = B_FALSE;
668 
669 	if ((error = ddi_soft_state_init(&statep, sizeof(zev_state_t), 1)) != 0)
670 		return (error);
671 	zev_busy = B_FALSE;
672 
673 	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
674 	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
675 	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
676 	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
677 	zev_mark_id = gethrtime();
678 	bzero(&zev_statistics, sizeof(zev_statistics));
679 	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
680 	zev_statistics.zev_poll_wakeup_queue_len =
681 	    ZEV_MIN_POLL_WAKEUP_QUEUE_LEN;
682 	if (zev_ioc_mute_pool("zg0")) {
683 		cmn_err(CE_WARN, "zev: could not init mute list");
684 		goto FAIL;
685 	}
686 
687 	if ((error = mod_install(&zev_modlinkage)) != 0) {
688 		cmn_err(CE_WARN, "zev: could not install module");
689 		goto FAIL;
690 	}
691 	module_installed = B_TRUE;
692 
693 	/*
694 	 * Note: _init() seems to be a bad place to access other modules'
695 	 * device files, as it can cause a kernel panic.
696 	 *
697 	 * For example, our _init() is called if our module isn't loaded
698 	 * when someone causes a readdir() in "/devices/pseudo".  For that,
699 	 * devfs_readdir() is used, which obtains an rwlock for the
700 	 * directory.
701 	 *
702 	 * Then, if we open a device file here, we will indirectly call
703 	 * devfs_lookup(), which tries to obtain the same rwlock
704 	 * again, which this thread already has.  That will result in
705 	 * a kernel panic. ("recursive entry")
706 	 *
707 	 * Therefor, we have switched from a zfs ioctl() to directly
708 	 * accessing symbols in the zfs module.
709 	 */
710 
711 	/* switch ZFS event callbacks to zev module callback functions */
712 	rw_enter(&rz_zev_rwlock, RW_WRITER);
713 	rz_zev_callbacks = &zev_callbacks;
714 	rw_exit(&rz_zev_rwlock);
715 
716 	zev_poll_wakeup_thread = thread_create(NULL, 0,
717 	    zev_poll_wakeup_thread_main, NULL, 0, &p0, TS_RUN, minclsyspri);
718 	return (0);
719 FAIL:
720 	/* free resources */
721 	if (module_installed == B_TRUE)
722 		(void) mod_remove(&zev_modlinkage);
723 	mutex_destroy(&zev_mutex);
724 	ddi_soft_state_fini(&statep);
725 	return (error);
726 }
727 
728 int
729 _info(struct modinfo *modinfop)
730 {
731 	return (mod_info(&zev_modlinkage, modinfop));
732 }
733 
734 int
735 _fini(void)
736 {
737 	int error = 0;
738 	zev_msg_t *msg;
739 	zev_pool_list_entry_t *pe, *npe;
740 
741 	mutex_enter(&zev_mutex);
742 	if (zev_busy == B_TRUE) {
743 		mutex_exit(&zev_mutex);
744 		return (SET_ERROR(EBUSY));
745 	}
746 	mutex_exit(&zev_mutex);
747 
748 	/* switch ZFS event callbacks back to default */
749 	rw_enter(&rz_zev_rwlock, RW_WRITER);
750 	rz_zev_callbacks = rz_zev_default_callbacks;
751 	rw_exit(&rz_zev_rwlock);
752 
753 	/* no thread is inside of the callbacks anymore.  Safe to remove. */
754 	zev_wakeup_thread_run = 0;
755 	if (zev_poll_wakeup_thread != 0) {
756 		thread_join(zev_poll_wakeup_thread->t_did);
757 		zev_poll_wakeup_thread = 0;
758 	}
759 	if ((error = mod_remove(&zev_modlinkage)) != 0) {
760 		cmn_err(CE_WARN, "mod_remove failed: %d", error);
761 		return (error);
762 	}
763 
764 	/* free resources */
765 	mutex_enter(&zev_mutex);
766 	while (zev_queue_head) {
767 		msg = zev_queue_head;
768 		zev_queue_head = msg->next;
769 		if (msg)
770 			kmem_free(msg, sizeof(*msg) + msg->size);
771 	}
772 	mutex_exit(&zev_mutex);
773 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
774 	pe = zev_muted_pools_head;
775 	while (pe) {
776 		npe = pe;
777 		pe = pe->next;
778 		kmem_free(npe, sizeof(*npe));
779 	}
780 	rw_exit(&zev_pool_list_rwlock);
781 	ddi_soft_state_fini(&statep);
782 	rw_destroy(&zev_pool_list_rwlock);
783 	cv_destroy(&zev_condvar);
784 	mutex_destroy(&zev_mutex);
785 	mutex_destroy(&zev_mark_id_mutex);
786 
787 	return (0);
788 }
789 
790