xref: /titanic_52/usr/src/uts/common/fs/zev/zev.c (revision 6db5d4ec7067f6ac90788b6d5f24002d82d0f21e)
1 #include <sys/modctl.h>
2 #include <sys/ddi.h>
3 #include <sys/sunddi.h>
4 #include <sys/conf.h>
5 #include <sys/devops.h>
6 #include <sys/stat.h>
7 #include <sys/fs/zev.h>
8 #include <sys/zev_callbacks.h>
9 #include <sys/zfs_znode.h>
10 #include <sys/time.h>
11 #include <sys/sa.h>
12 #include <sys/zap.h>
13 #include <sys/time.h>
14 
15 typedef struct zev_state {
16 	kmutex_t	mutex;
17 	dev_info_t	*dip;
18 	boolean_t	busy;
19 } zev_state_t;
20 
21 static void		*statep;
22 struct pollhead		zev_pollhead;
23 
24 kmutex_t		zev_mutex;
25 kcondvar_t		zev_condvar;
26 krwlock_t		zev_pool_list_rwlock;
27 static zev_statistics_t	zev_statistics;
28 static boolean_t	zev_busy;
29 static kmutex_t		zev_mark_id_mutex;
30 static uint64_t		zev_mark_id = 0;
31 
32 /*
33  * The longest potential message is from zev_zfs_mount() and
34  * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
35  *
36  * Another candidate is zev_znode_rename_cb() and contains three inode
37  * numbers and two filenames of up to MAXNAMELEN bytes each.
38  */
39 #define ZEV_MAX_MESSAGE_LEN	4096
40 
41 /* If the queue size reaches 1GB, stop ZFS ops and block the threads.  */
42 #define ZEV_MAX_QUEUE_LEN		(1 * 1024 * 1024 * 1024)
43 
44 /* Don't wake up poll()ing processes for every single message. */
45 #define ZEV_MIN_POLL_WAKEUP_QUEUE_LEN	8192
46 
47 static zev_msg_t *zev_queue_head = NULL;
48 static zev_msg_t *zev_queue_tail = NULL;
49 static uint64_t zev_queue_len = 0;
50 
51 
52 typedef struct zev_pool_list_entry {
53 	struct zev_pool_list_entry	*next;
54 	char				name[MAXPATHLEN];
55 } zev_pool_list_entry_t;
56 
57 static zev_pool_list_entry_t *zev_muted_pools_head = NULL;
58 
/*
 * poll() wakeup thread.  Used to check periodically whether we have
 * bytes left in the queue that have not yet been made into a
 * pollwakeup() call.  This is meant to ensure a maximum waiting
 * time until an event is presented as a poll wakeup, while at
 * the same time not making every single event into a poll wakeup
 * of its own.
 */
67 
68 static volatile int zev_wakeup_thread_run = 1;
69 static kthread_t *zev_poll_wakeup_thread = NULL;
70 
71 static void
72 zev_poll_wakeup_thread_main(void)
73 {
74 	int wakeup;
75 	while (zev_wakeup_thread_run) {
76 		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
77 		/* check message queue */
78 		mutex_enter(&zev_mutex);
79 		wakeup = 0;
80 		if (zev_queue_head)
81 			wakeup = 1;
82 		mutex_exit(&zev_mutex);
83 		if (wakeup)
84 			pollwakeup(&zev_pollhead, POLLIN);
85 	}
86 	thread_exit();
87 }
88 
89 static int
90 zev_ioc_mute_pool(char *poolname)
91 {
92 	zev_pool_list_entry_t *pe;
93 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
94 	/* pool already muted? */
95 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
96 		if (!strcmp(pe->name, poolname)) {
97 			rw_exit(&zev_pool_list_rwlock);
98 			return EEXIST;
99 		}
100 	}
101 	pe = kmem_zalloc(sizeof(*pe), KM_SLEEP);
102 	if (!pe) {
103 		rw_exit(&zev_pool_list_rwlock);
104 		return ENOMEM;
105 	}
106 	(void) strncpy(pe->name, poolname, sizeof(pe->name));
107 	pe->next = zev_muted_pools_head;
108 	zev_muted_pools_head = pe;
109 	rw_exit(&zev_pool_list_rwlock);
110 	return (0);
111 }
112 
113 static int
114 zev_ioc_unmute_pool(char *poolname)
115 {
116 	zev_pool_list_entry_t *pe, *peprev;
117 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
118 	/* pool muted? */
119 	peprev = NULL;
120 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
121 		if (!strcmp(pe->name, poolname)) {
122 			goto found;
123 		}
124 		peprev = pe;
125 	}
126 	rw_exit(&zev_pool_list_rwlock);
127 	return ENOENT;
128 found:
129 	if (peprev != NULL) {
130 		peprev->next = pe->next;
131 	} else {
132 		zev_muted_pools_head = pe->next;
133 	}
134 	kmem_free(pe, sizeof(*pe));
135 	rw_exit(&zev_pool_list_rwlock);
136 	return (0);
137 }
138 
139 int
140 zev_skip_pool(objset_t *os)
141 {
142 	zev_pool_list_entry_t *pe;
143 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
144 	rw_enter(&zev_pool_list_rwlock, RW_READER);
145 	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
146 		if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
147 			rw_exit(&zev_pool_list_rwlock);
148 			return 1;
149 		}
150 	}
151 	rw_exit(&zev_pool_list_rwlock);
152 	return 0;
153 }
154 
/*
 * Append an event message to the global event queue.
 *
 * Takes ownership of 'msg': if 'op' is out of range the message is
 * freed and an error event is queued in its place.  Blocks (cv_wait
 * on zev_condvar) while the queue holds zev_max_queue_len or more
 * bytes, and wakes poll()ing readers once the queued byte count
 * exceeds the configured wakeup threshold.
 */
void
zev_queue_message(int op, zev_msg_t *msg)
{
	int wakeup = 0;

	msg->next = NULL;

	if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
		/* report the bad op and drop the original message */
		zev_queue_error(op, "unknown op id encountered: %d", op);
		kmem_free(msg, sizeof(*msg) + msg->size);
		return;
	}

	mutex_enter(&zev_mutex);
	while (zev_statistics.zev_max_queue_len &&
	    zev_statistics.zev_queue_len >= zev_statistics.zev_max_queue_len) {
		/* queue full.  block until it's been shrunk. */
		cv_wait(&zev_condvar, &zev_mutex);
	}

	/* append to the tail of the singly-linked message queue */
	if (zev_queue_tail == NULL) {
		zev_queue_head = zev_queue_tail = msg;
	} else {
		zev_queue_tail->next = msg;
		zev_queue_tail = msg;
	}
	zev_queue_len++;

	/* update statistics */
	/* note: zev_statistics.zev_queue_len counts bytes, not messages */
	zev_statistics.zev_cnt_total_events++;
	zev_statistics.zev_queue_len += msg->size;
	if (zev_statistics.zev_queue_len >
	    zev_statistics.zev_poll_wakeup_queue_len)
		wakeup = 1;
	switch (op) {
	case ZEV_OP_ERROR:
		zev_statistics.zev_cnt_errors++;
		break;
	case ZEV_OP_MARK:
		zev_statistics.zev_cnt_marks++;
		break;
	case ZEV_OP_ZFS_MOUNT:
		zev_statistics.zev_cnt_zfs_mount++;
		break;
	case ZEV_OP_ZFS_UMOUNT:
		zev_statistics.zev_cnt_zfs_umount++;
		break;
	case ZEV_OP_ZVOL_WRITE:
		zev_statistics.zev_cnt_zvol_write++;
		break;
	case ZEV_OP_ZVOL_TRUNCATE:
		zev_statistics.zev_cnt_zvol_truncate++;
		break;
	case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
		zev_statistics.zev_cnt_znode_close_after_update++;
		break;
	case ZEV_OP_ZNODE_CREATE:
		zev_statistics.zev_cnt_znode_create++;
		break;
	case ZEV_OP_ZNODE_REMOVE:
		zev_statistics.zev_cnt_znode_remove++;
		break;
	case ZEV_OP_ZNODE_LINK:
		zev_statistics.zev_cnt_znode_link++;
		break;
	case ZEV_OP_ZNODE_SYMLINK:
		zev_statistics.zev_cnt_znode_symlink++;
		break;
	case ZEV_OP_ZNODE_RENAME:
		zev_statistics.zev_cnt_znode_rename++;
		break;
	case ZEV_OP_ZNODE_WRITE:
		zev_statistics.zev_cnt_znode_write++;
		break;
	case ZEV_OP_ZNODE_TRUNCATE:
		zev_statistics.zev_cnt_znode_truncate++;
		break;
	case ZEV_OP_ZNODE_SETATTR:
		zev_statistics.zev_cnt_znode_setattr++;
		break;
	case ZEV_OP_ZNODE_ACL:
		zev_statistics.zev_cnt_znode_acl++;
		break;
	}
	mutex_exit(&zev_mutex);

	/* chpoll event, if necessary.  */
	if (wakeup)
		pollwakeup(&zev_pollhead, POLLIN);

	return;
}
247 
248 void
249 zev_queue_error(int op, char *fmt, ...)
250 {
251 	char buf[ZEV_MAX_MESSAGE_LEN];
252 	va_list ap;
253 	int len;
254 	zev_msg_t *msg = NULL;
255 	zev_error_t *rec;
256 	int msg_size;
257 
258 	va_start(ap, fmt);
259 	len = vsnprintf(buf, sizeof(buf), fmt, ap);
260 	va_end(ap);
261 	if (len >= sizeof(buf)) {
262 		cmn_err(CE_WARN, "zev: can't report error - "
263 		        "dropping event entirely.");
264 		return;
265 	}
266 
267 	msg_size = sizeof(*rec) + len + 1;
268 	msg = kmem_alloc(sizeof(*msg) + msg_size, KM_SLEEP);
269 	msg->size = msg_size;
270 	rec = (zev_error_t *)(msg + 1);
271 	rec->record_len = msg_size;
272 	rec->op = ZEV_OP_ERROR;
273 	rec->op_time = ddi_get_time();
274 	rec->guid = 0;
275 	rec->failed_op = op;
276 	rec->errstr_len = len;
277 	(void) memcpy(ZEV_ERRSTR(rec), buf, len + 1);
278 
279 	zev_queue_message(ZEV_OP_ERROR, msg);
280 	return;
281 }
282 
283 int
284 zev_ioc_get_gen(intptr_t arg, int mode)
285 {
286 	zev_ioctl_get_gen_t gg;
287 	file_t *fp;
288 	uint64_t gen;
289 	uint64_t crtime[2];
290 	uint64_t dummy;
291 	int ret = 0;
292 	zfsvfs_t *zfsvfs;
293 	objset_t *osp;
294 	sa_attr_type_t *sa_table;
295 	sa_handle_t *hdl;
296 	dmu_buf_t *db;
297 	sa_bulk_attr_t bulk[4];
298 	int count = 0;
299 	dmu_object_info_t doi;
300 	dsl_pool_t *dp;
301 	timestruc_t crtime_s;
302 
303 	if (ddi_copyin((void *)arg, &gg, sizeof(gg), mode) != 0)
304 		return EFAULT;
305 	fp = getf(gg.fd);
306 	if (fp == NULL)
307 		return EBADF;
308 	if (fp->f_vnode->v_vfsp->vfs_fstype != zfsfstype) {
309 		ret = EINVAL;
310 		goto out;
311 	}
312 	zfsvfs = (zfsvfs_t *)(fp->f_vnode->v_vfsp->vfs_data);
313 	osp = zfsvfs->z_os;
314 	dsl_dataset_name(zfsvfs->z_os->os_dsl_dataset, gg.dataset);
315 	/* get object attributes */
316 	ret = sa_setup(osp, gg.inode, zfs_attr_table, ZPL_END, &sa_table);
317 	if (ret)
318 		goto out;
319 	ret = sa_buf_hold(osp, gg.inode, FTAG, &db);
320 	if (ret)
321 		goto out;
322 	dmu_object_info_from_db(db, &doi);
323 	if ((doi.doi_bonus_type != DMU_OT_SA &&
324 	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
325 	    doi.doi_bonus_type == DMU_OT_ZNODE &&
326 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
327 		sa_buf_rele(db, FTAG);
328 		ret = ENOTSUP;
329 		goto out;
330 	}
331 	ret = sa_handle_get(osp, gg.inode, NULL, SA_HDL_PRIVATE, &hdl);
332 	if (ret) {
333 		sa_buf_rele(db, FTAG);
334 		goto out;
335 	}
336 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
337 		&dummy, sizeof(dummy));
338 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
339 		&gen, sizeof(gen));
340 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
341 		&dummy, sizeof(dummy));
342 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CRTIME], NULL,
343 		&crtime, sizeof(crtime));
344 	ret = sa_bulk_lookup(hdl, bulk, count);
345 	sa_handle_destroy(hdl);
346 	sa_buf_rele(db, FTAG);
347 	if (ret)
348 		goto out;
349 	dp = osp->os_dsl_dataset->ds_dir->dd_pool;
350 	ZFS_TIME_DECODE(&crtime_s, crtime);
351 	gg.generation = gen;
352 	gg.crtime = crtime_s.tv_sec;
353 	gg.guid = zfsvfs->z_os->os_dsl_dataset->ds_phys->ds_guid;
354 	ddi_copyout(&gg, (void *)arg, sizeof(gg), mode);
355 out:
356 	releasef(gg.fd);
357 	return ret;
358 }
359 
360 /* ARGSUSED */
361 static int
362 zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
363 {
364 	int instance;
365 	zev_statistics_t zs;
366 	zev_ioctl_poolarg_t pa;
367 	zev_ioctl_mark_t mark;
368 	zev_mark_t *rec;
369 	int msg_size;
370 	zev_msg_t *msg;
371 	uint64_t len;
372 	uint64_t mark_id;
373 
374 	instance = getminor(dev);
375 	if (ddi_get_soft_state(statep, instance) == NULL)
376 		return (ENXIO);
377 	/*
378 	 * all structures passed between kernel and userspace
379 	 * are now compatible between 64 and 32 bit.  Model
380 	 * conversion can be ignore.
381 	 */
382 #if 0
383 	/* Remember to do 32/64 bit mode adjustments if
384 	   necessary.  See "Writing Device Drivers", 280pp */
385 	if (ddi_model_convert_from(mode) != DDI_MODEL_NONE) {
386 		/* userland has another data model.  (most
387 		   likely 32-bit) -> not supported. */
388 		return (EINVAL);
389 	}
390 #endif
391 	switch (cmd) {
392 	case ZEV_IOC_GET_STATISTICS:
393 		/* ddi_copyout() can take a long time.  Better make
394 		   a copy to be able to release the mutex faster. */
395 		mutex_enter(&zev_mutex);
396 		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
397 		mutex_exit(&zev_mutex);
398 		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
399 			return EFAULT;
400 		break;
401 	case ZEV_IOC_MUTE_POOL:
402 	case ZEV_IOC_UNMUTE_POOL:
403 		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0)
404 			return EFAULT;
405 		if (pa.zev_poolname_len >=MAXPATHLEN)
406 			return EINVAL;
407 		pa.zev_poolname[pa.zev_poolname_len] = '\0';
408 		if (cmd == ZEV_IOC_MUTE_POOL) {
409 			return zev_ioc_mute_pool(pa.zev_poolname);
410 		} else {
411 			return zev_ioc_unmute_pool(pa.zev_poolname);
412 		}
413 	case ZEV_IOC_SET_MAX_QUEUE_LEN:
414 		if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0)
415 			return EFAULT;
416 		if (len > ZEV_MAX_QUEUE_LEN)
417 			return EINVAL;
418 		mutex_enter(&zev_mutex);
419 		zev_statistics.zev_max_queue_len = len;
420 		cv_broadcast(&zev_condvar);
421 		mutex_exit(&zev_mutex);
422 		break;
423 	case ZEV_IOC_SET_POLL_WAKEUP_QUEUE_LEN:
424 		if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0)
425 			return EFAULT;
426 		mutex_enter(&zev_mutex);
427 		zev_statistics.zev_poll_wakeup_queue_len = len;
428 		mutex_exit(&zev_mutex);
429 		break;
430 	case ZEV_IOC_MARK:
431 		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0)
432 			return EFAULT;
433 		cmn_err(CE_WARN, "mark: guid=%lu payload_len=%d", (long unsigned int)mark.zev_guid, mark.zev_payload_len);
434 		/* prepare message */
435 		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
436 		msg = kmem_alloc(sizeof(*msg) + msg_size, KM_SLEEP);
437 		msg->size = msg_size;
438 		rec = (zev_mark_t *)(msg + 1);
439 		rec->record_len = msg_size;
440 		rec->op = ZEV_OP_MARK;
441 		rec->op_time = ddi_get_time();
442 		rec->guid = mark.zev_guid;
443 		rec->payload_len = mark.zev_payload_len;
444 		/* get payload */
445 		if (ddi_copyin(((char *)arg) + sizeof(mark),
446 		               ZEV_PAYLOAD(rec),
447 		               mark.zev_payload_len, mode) != 0) {
448 			kmem_free(msg, msg_size);
449 			return EFAULT;
450 		}
451 		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
452 		/* get mark id and queue message */
453 		mutex_enter(&zev_mark_id_mutex);
454 		mark_id = zev_mark_id++;
455 		mutex_exit(&zev_mark_id_mutex);
456 		rec->mark_id = mark_id;
457 		zev_queue_message(ZEV_OP_MARK, msg);
458 		/* report mark id to userland, ignore errors */
459 		mark.zev_mark_id = mark_id;
460 		ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
461 		break;
462 	case ZEV_IOC_GET_GEN:
463 		return zev_ioc_get_gen(arg, mode);
464 	default:
465 		/* generic "ioctl unknown" error */
466 		return ENOTTY;
467 	}
468 	return (0);
469 }
470 
471 static int
472 zev_chpoll(dev_t dev, short events, int anyyet,
473     short *reventsp, struct pollhead **phpp)
474 {
475 	int instance;
476 	short revent = 0;
477 
478 	instance = getminor(dev);
479 	if (ddi_get_soft_state(statep, instance) == NULL)
480 		return (ENXIO);
481 	revent = 0;
482 	if ((events & POLLIN)) {
483 		mutex_enter(&zev_mutex);
484 		if (zev_queue_head)
485 			revent |= POLLIN;
486 		mutex_exit(&zev_mutex);
487 	}
488 	if (revent == 0) {
489 		if (!anyyet) {
490 			*phpp = &zev_pollhead;
491 		}
492 	}
493 	*reventsp = revent;
494 	return (0);
495 }
496 
/* ARGSUSED */
/*
 * read(9E) entry point: copy out as many complete event messages as
 * fit into the caller's buffer and dequeue them.  Messages are never
 * split across reads; if the first queued message is larger than the
 * buffer, E2BIG is returned.  The uio offset is restored afterwards,
 * so the device behaves as a stream with no seek position.
 */
static int
zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
{
	int instance;
	offset_t off;
	int ret = 0;
	zev_msg_t *msg;
	char *data;

	instance = getminor(dev);
	if (ddi_get_soft_state(statep, instance) == NULL)
		return (ENXIO);
	off = uio_p->uio_loffset;
	mutex_enter(&zev_mutex);
	msg = zev_queue_head;
	if (msg == NULL) {
		/* nothing queued: empty read */
		mutex_exit(&zev_mutex);
		return 0;
	}
	if (msg->size > uio_p->uio_resid) {
		/* first message won't fit; caller needs a larger buffer */
		mutex_exit(&zev_mutex);
		return E2BIG;
	}
	while (msg && uio_p->uio_resid >= msg->size) {
		data = (char *)(msg + 1);
		ret = uiomove(data, msg->size, UIO_READ, uio_p);
		if (ret != 0) {
			mutex_exit(&zev_mutex);
			cmn_err(CE_WARN, "zev: uiomove failed; messages lost");
			uio_p->uio_loffset = off;
			return (ret);
		}
		/* dequeue and free the message that was just copied out */
		zev_queue_head = msg->next;
		if (zev_queue_head == NULL)
			zev_queue_tail = NULL;
		zev_statistics.zev_bytes_read += msg->size;
		zev_statistics.zev_queue_len -= msg->size;
		zev_queue_len--;
		kmem_free(msg, sizeof(*msg) + msg->size);
		msg = zev_queue_head;
	}
	/* queue shrank: wake producers blocked on the size limit */
	cv_broadcast(&zev_condvar);
	mutex_exit(&zev_mutex);
	uio_p->uio_loffset = off;
	return 0;
}
544 
545 /* ARGSUSED */
546 static int
547 zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
548 {
549 	zev_state_t *sp;
550 	int instance;
551 
552 	instance = getminor(dev);
553 	if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
554 		return (ENXIO);
555 	if (otyp != OTYP_CHR)
556 		return (EINVAL);
557 	mutex_enter(&sp->mutex);
558 	if (sp->busy != B_TRUE) {
559 		mutex_exit(&sp->mutex);
560 		return (EINVAL);
561 	}
562 	sp->busy = B_FALSE;
563 	mutex_exit(&sp->mutex);
564 	return (0);
565 }
566 
567 /* ARGSUSED */
568 static int
569 zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
570 {
571 	zev_state_t *sp;
572 	int instance;
573 
574 	instance = getminor(*devp);
575 	if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
576 		return (ENXIO);
577 	if (otyp != OTYP_CHR)
578 		return (EINVAL);
579 	if (drv_priv(credp) != 0)
580 		return (EPERM);
581 	mutex_enter(&sp->mutex);
582 	if (sp->busy == B_TRUE) {
583 		/* XXX: wait for the instance to become available? */
584 		/* XXX: if we wait, the wait should be signal-interruptable. */
585 		mutex_exit(&sp->mutex);
586 		return (EBUSY);
587 	}
588 	sp->busy = B_TRUE;	/* can only be opened exclusively */
589 	mutex_exit(&sp->mutex);
590 	return (0);
591 }
592 
/*
 * Character device entry points.  zev is a pseudo device: only open,
 * close, read, ioctl and chpoll are implemented; everything else is
 * nodev.
 */
static struct cb_ops zev_cb_ops = {
	zev_open,		/* open */
	zev_close,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	zev_read,		/* read */
	nodev,			/* write */
	zev_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	zev_chpoll,		/* chpoll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* streamtab */
	D_MP | D_64BIT,		/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* aread */
	nodev,			/* awrite */
};
613 
614 static void
615 zev_free_instance(dev_info_t *dip)
616 {
617 	int instance;
618 	zev_state_t *sp;
619 	instance = ddi_get_instance(dip);
620 	//ddi_remove_minor_node(dip, ddi_get_name(dip));
621 	ddi_remove_minor_node(dip, NULL);
622 	sp = ddi_get_soft_state(statep, instance);
623 	if (sp) {
624 		mutex_destroy(&sp->mutex);
625 		ddi_soft_state_free(statep, instance);
626 	}
627 }
628 
629 static int
630 zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
631 {
632 	int instance;
633 	zev_state_t *sp;
634 	/* called once per instance with DDI_DETACH,
635 	   may be called to suspend */
636 	switch (cmd) {
637 	case DDI_DETACH:
638 		/* instance busy? */
639 		instance = ddi_get_instance(dip);
640 		if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
641 			return (DDI_FAILURE);
642 		mutex_enter(&sp->mutex);
643 		if (sp->busy == B_TRUE) {
644 			mutex_exit(&sp->mutex);
645 			return (DDI_FAILURE);
646 		}
647 		mutex_exit(&sp->mutex);
648 		/* free resources allocated for this instance */
649 		zev_free_instance(dip);
650 		return (DDI_SUCCESS);
651 	case DDI_SUSPEND:
652 		/* kernel must not suspend zev devices while ZFS is running */
653 		return (DDI_FAILURE);
654 	default:
655 		return (DDI_FAILURE);
656 	}
657 }
658 
659 static int
660 zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
661 {
662 	/* called once per instance with DDI_ATTACH,
663 	   may be called to resume */
664 	int instance;
665 	zev_state_t *sp;
666 	switch (cmd) {
667 	case DDI_ATTACH:
668 		instance = ddi_get_instance(dip);
669 		if (ddi_soft_state_zalloc(statep, instance) != DDI_SUCCESS) {
670 			return (DDI_FAILURE);
671 		}
672 		sp = ddi_get_soft_state(statep, instance);
673 		ddi_set_driver_private(dip, sp);
674 		sp->dip = dip;
675 		sp->busy = B_FALSE;
676 		mutex_init(&sp->mutex, NULL, MUTEX_DRIVER, NULL);
677 		if (ddi_create_minor_node(dip, ddi_get_name(dip),
678 		    S_IFCHR, instance, DDI_PSEUDO, 0) == DDI_FAILURE) {
679 			zev_free_instance(dip);
680 			return (DDI_FAILURE);
681 		}
682 		ddi_report_dev(dip);
683 		return (DDI_SUCCESS);
684 	case DDI_RESUME:
685 		/* suspendeding zev devices should never happen */
686 		return (DDI_SUCCESS);
687 	default:
688 		return (DDI_FAILURE);
689 	}
690 }
691 
692 /* ARGSUSED */
693 static int
694 zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
695 {
696 	int instance;
697 	zev_state_t *sp;
698 	switch (infocmd) {
699 	case DDI_INFO_DEVT2DEVINFO:
700 		/* arg is dev_t */
701 		instance = getminor((dev_t)arg);
702 		if ((sp = ddi_get_soft_state(statep, instance)) != NULL) {
703 			*resultp = sp->dip;
704 			return (DDI_SUCCESS);
705 		}
706 		*resultp = NULL;
707 		return (DDI_FAILURE);
708 	case DDI_INFO_DEVT2INSTANCE:
709 		/* arg is dev_t */
710 		instance = getminor((dev_t)arg);
711 		*resultp = (void *)(uintptr_t)instance;
712 		return (DDI_FAILURE);
713 	}
714 	return (DDI_FAILURE);
715 }
716 
/* Autoconfiguration entry points for the zev pseudo driver. */
static struct dev_ops zev_dev_ops = {
	DEVO_REV,			/* driver build revision */
	0,				/* driver reference count */
	zev_getinfo,			/* getinfo */
	nulldev,			/* identify (obsolete) */
	nulldev,			/* probe (search for devices) */
	zev_attach,			/* attach */
	zev_detach,			/* detach */
	nodev,				/* reset (obsolete, use quiesce) */
	&zev_cb_ops,			/* character and block device ops */
	NULL,				/* bus driver ops */
	NULL,				/* power management, not needed */
	ddi_quiesce_not_needed,		/* quiesce */
};

/* Loadable-module (driver) linkage. */
static struct modldrv zev_modldrv = {
	&mod_driverops,			/* all loadable modules use this */
	"zev ZFS event provider, v1.0",	/* driver name and version info */
	&zev_dev_ops			/* ops method pointers */
};

static struct modlinkage zev_modlinkage = {
	MODREV_1,	/* fixed value */
	{
		&zev_modldrv,	/* driver linkage structure */
		NULL		/* list terminator */
	}
};
745 
746 int
747 _init(void)
748 {
749 	int error;
750 	boolean_t module_installed = B_FALSE;
751 
752 	if ((error = ddi_soft_state_init(&statep, sizeof(zev_state_t), 1)) != 0)
753 		return (error);
754 	zev_busy = B_FALSE;
755 
756 	mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
757 	cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
758 	rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
759 	mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL);
760 	zev_mark_id = gethrtime();
761 	bzero(&zev_statistics, sizeof(zev_statistics));
762 	zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
763 	zev_statistics.zev_poll_wakeup_queue_len =
764 	    ZEV_MIN_POLL_WAKEUP_QUEUE_LEN;
765 	if (zev_ioc_mute_pool("zg0")) {
766 		cmn_err(CE_WARN, "zev: could not init mute list");
767 		goto FAIL;
768 	}
769 
770 	if ((error = mod_install(&zev_modlinkage)) != 0) {
771 		cmn_err(CE_WARN, "zev: could not install module");
772 		goto FAIL;
773 	}
774 	module_installed = B_TRUE;
775 
776 	/*
777 	 * Note: _init() seems to be a bad place to access other modules'
778 	 * device files, as it can cause a kernel panic.
779 	 *
780 	 * For example, our _init() is called if our module isn't loaded
781 	 * when someone causes a readdir() in "/devices/pseudo".  For that,
782 	 * devfs_readdir() is used, which obtains an rwlock for the
783 	 * directory.
784 	 *
785 	 * Then, if we open a device file here, we will indirectly call
786 	 * devfs_lookup(), which tries to obtain the same rwlock
787 	 * again, which this thread already has.  That will result in
788 	 * a kernel panic. ("recursive entry")
789 	 *
790 	 * Therefor, we have switched from a zfs ioctl() to directly
791 	 * accessing symbols in the zfs module.
792 	 */
793 
794 	/* switch ZFS event callbacks to zev module callback functions */
795 	rw_enter(&rz_zev_rwlock, RW_WRITER);
796 	rz_zev_callbacks = &zev_callbacks;
797 	rw_exit(&rz_zev_rwlock);
798 
799 	zev_poll_wakeup_thread = thread_create(NULL, 0,
800 	    zev_poll_wakeup_thread_main, NULL, 0, &p0, TS_RUN, minclsyspri);
801 	return (0);
802 FAIL:
803 	/* free resources */
804 	if (module_installed == B_TRUE)
805 		(void) mod_remove(&zev_modlinkage);
806 	mutex_destroy(&zev_mutex);
807 	ddi_soft_state_fini(&statep);
808 	return (error);
809 }
810 
/*
 * Module information entry point (modinfo(1M)).
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&zev_modlinkage, modinfop));
}
816 
817 int
818 _fini(void)
819 {
820 	int error = 0;
821 	zev_msg_t *msg;
822 	zev_pool_list_entry_t *pe, *npe;
823 
824 	mutex_enter(&zev_mutex);
825 	if (zev_busy == B_TRUE) {
826 		mutex_exit(&zev_mutex);
827 		return (SET_ERROR(EBUSY));
828 	}
829 	mutex_exit(&zev_mutex);
830 
831 	/* switch ZFS event callbacks back to default */
832 	rw_enter(&rz_zev_rwlock, RW_WRITER);
833 	rz_zev_callbacks = rz_zev_default_callbacks;
834 	rw_exit(&rz_zev_rwlock);
835 
836 	/* no thread is inside of the callbacks anymore.  Safe to remove. */
837 	zev_wakeup_thread_run = 0;
838 	if (zev_poll_wakeup_thread != 0) {
839 		thread_join(zev_poll_wakeup_thread->t_did);
840 		zev_poll_wakeup_thread = 0;
841 	}
842 	if ((error = mod_remove(&zev_modlinkage)) != 0) {
843 		cmn_err(CE_WARN, "mod_remove failed: %d", error);
844 		return (error);
845 	}
846 
847 	/* free resources */
848 	mutex_enter(&zev_mutex);
849 	while (zev_queue_head) {
850 		msg = zev_queue_head;
851 		zev_queue_head = msg->next;
852 		if (msg)
853 			kmem_free(msg, sizeof(*msg) + msg->size);
854 	}
855 	mutex_exit(&zev_mutex);
856 	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
857 	pe = zev_muted_pools_head;
858 	while (pe) {
859 		npe = pe;
860 		pe = pe->next;
861 		kmem_free(npe, sizeof(*npe));
862 	}
863 	rw_exit(&zev_pool_list_rwlock);
864 	ddi_soft_state_fini(&statep);
865 	rw_destroy(&zev_pool_list_rwlock);
866 	cv_destroy(&zev_condvar);
867 	mutex_destroy(&zev_mutex);
868 	mutex_destroy(&zev_mark_id_mutex);
869 
870 	return (0);
871 }
872 
873