#include <sys/types.h>
#include <sys/systm.h>
#include <sys/ksynch.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/spa_impl.h>
#include <sys/fs/zev.h>		/* project-local (assumed name): ZEV_OP_*, */
				/* ZEV_IOC_*, zev_statistics_t, */
				/* zev_ioctl_poolarg_t */
#include <sys/zev_callbacks.h>	/* project-local (assumed name): */
				/* zev_callbacks, rz_zev_* symbols */

typedef struct zev_state {
        kmutex_t        mutex;
        dev_info_t      *dip;
        boolean_t       busy;
} zev_state_t;

static void             *statep;
struct pollhead         zev_pollhead;

kmutex_t                zev_mutex;
kcondvar_t              zev_condvar;
krwlock_t               zev_pool_list_rwlock;
static zev_statistics_t zev_statistics;
static boolean_t        zev_busy;

/*
 * The longest potential message is from zev_zfs_mount() and contains
 * the mountpoint, which might be close to MAXPATHLEN bytes long.
 *
 * Another candidate is zev_znode_rename_cb(), whose message contains
 * three inode numbers and two filenames of up to MAXNAMELEN bytes each.
 */
#define	ZEV_MAX_MESSAGE_LEN	4096

/* If the queue size reaches 1 GB, stop ZFS ops and block the threads. */
#define	ZEV_MAX_QUEUE_LEN	(1 * 1024 * 1024 * 1024)

/* Don't wake up poll()ing processes for every single message. */
#define	ZEV_MIN_POLL_WAKEUP_QUEUE_LEN	8192

typedef struct zev_mq {
        struct zev_mq   *next;
        int             used;
        int             sent;
        char            buf[ZEV_MAX_MESSAGE_LEN];
} zev_mq_t;

static zev_mq_t         *zev_mq_head = NULL;
static zev_mq_t         *zev_mq_tail = NULL;
static uint64_t         zev_mq_len = 0;

typedef struct zev_pool_list_entry {
        struct zev_pool_list_entry      *next;
        char                            name[MAXPATHLEN];
} zev_pool_list_entry_t;

static zev_pool_list_entry_t *zev_muted_pools_head = NULL;

/*
 * poll() wakeup thread.  Used to check periodically whether we have
 * bytes left in the queue that have not yet been turned into a
 * pollwakeup() call.  This is meant to ensure a maximum waiting time
 * until an event is presented as a poll wakeup, while at the same
 * time not making every single event into a poll wakeup of its own.
 */

static volatile int     zev_wakeup_thread_run = 1;
static kthread_t        *zev_poll_wakeup_thread = NULL;

static void
zev_poll_wakeup_thread_main(void)
{
        int wakeup;

        while (zev_wakeup_thread_run) {
                delay(drv_usectohz(100 * 1000));        /* sleep 100ms */
                /* check message queue */
                mutex_enter(&zev_mutex);
                wakeup = 0;
                if (zev_mq_head)
                        wakeup = 1;
                mutex_exit(&zev_mutex);
                if (wakeup)
                        pollwakeup(&zev_pollhead, POLLIN);
        }
        thread_exit();
}

static int
zev_ioc_mute_pool(char *poolname)
{
        zev_pool_list_entry_t *pe;

        rw_enter(&zev_pool_list_rwlock, RW_WRITER);
        /* pool already muted? */
        for (pe = zev_muted_pools_head; pe; pe = pe->next) {
                if (!strcmp(pe->name, poolname)) {
                        rw_exit(&zev_pool_list_rwlock);
                        return (EEXIST);
                }
        }
        pe = kmem_zalloc(sizeof (*pe), KM_SLEEP);       /* cannot fail */
        (void) strncpy(pe->name, poolname, sizeof (pe->name) - 1);
        pe->next = zev_muted_pools_head;
        zev_muted_pools_head = pe;
        rw_exit(&zev_pool_list_rwlock);
        return (0);
}

static int
zev_ioc_unmute_pool(char *poolname)
{
        zev_pool_list_entry_t *pe, *peprev;

        rw_enter(&zev_pool_list_rwlock, RW_WRITER);
        /* pool muted? */
        peprev = NULL;
        for (pe = zev_muted_pools_head; pe; pe = pe->next) {
                if (!strcmp(pe->name, poolname))
                        goto found;
                peprev = pe;
        }
        rw_exit(&zev_pool_list_rwlock);
        return (ENOENT);
found:
        if (peprev != NULL)
                peprev->next = pe->next;
        else
                zev_muted_pools_head = pe->next;
        kmem_free(pe, sizeof (*pe));
        rw_exit(&zev_pool_list_rwlock);
        return (0);
}

int
zev_skip_pool(objset_t *os)
{
        zev_pool_list_entry_t *pe;
        dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;

        rw_enter(&zev_pool_list_rwlock, RW_READER);
        for (pe = zev_muted_pools_head; pe; pe = pe->next) {
                if (!strcmp(pe->name, dp->dp_spa->spa_name)) {
                        rw_exit(&zev_pool_list_rwlock);
                        return (1);
                }
        }
        rw_exit(&zev_pool_list_rwlock);
        return (0);
}
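
/*
 * Usage sketch for zev_skip_pool(): its callers are the ZFS event
 * callbacks, which live elsewhere in this project.  The callback below
 * is hypothetical (its name and signature are assumptions, not part of
 * this file); it only illustrates the intended check-first pattern, so
 * that muted pools generate no queue traffic at all:
 *
 *	void
 *	zev_znode_create_cb(znode_t *zp)
 *	{
 *		if (zev_skip_pool(zp->z_zfsvfs->z_os))
 *			return;
 *		zev_mq_printf(ZEV_OP_ZNODE_CREATE, 0, ...);
 *	}
 */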

void
zev_mq_printf(int op, int error, char *fmt, ...)
{
        char buf[ZEV_MAX_MESSAGE_LEN];
        int len;
        va_list ap;
        zev_mq_t *mq;
        uint64_t bytes_in_queue = 0;
        int wakeup = 0;

        /* render message */
        va_start(ap, fmt);
        len = vsnprintf(buf, sizeof (buf), fmt, ap);
        va_end(ap);
        if (len >= sizeof (buf)) {
                (void) strcpy(buf, "ZEV_ERROR: message too long\n");
                len = strlen(buf);
                error++;
        }

        /* op type ok? */
        if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) {
                len = snprintf(buf, sizeof (buf),
                    "ZEV_ERROR: unknown op %d\n", op);
                error++;
        }

        mutex_enter(&zev_mutex);
        while (zev_statistics.zev_max_queue_len &&
            zev_statistics.zev_queue_len >=
            zev_statistics.zev_max_queue_len) {
                /* queue full.  block until it's been shrunk. */
                cv_wait(&zev_condvar, &zev_mutex);
        }

        mq = zev_mq_tail;
        /* make sure we have enough space in our queue */
        if (!mq || ((ZEV_MAX_MESSAGE_LEN - mq->used) < len)) {
                /* need new mq */
                mq = kmem_zalloc(sizeof (*mq), KM_SLEEP);
                if (zev_mq_tail)
                        zev_mq_tail->next = mq;
                zev_mq_tail = mq;
                zev_mq_len++;
                if (!zev_mq_head)
                        zev_mq_head = mq;
        }

        /* copy message to queue */
        memcpy(mq->buf + mq->used, buf, len);
        mq->used += len;

        /* update statistics */
        zev_statistics.zev_cnt_total_events++;
        zev_statistics.zev_queue_len += len;
        bytes_in_queue = zev_statistics.zev_queue_len;
        if (error)
                zev_statistics.zev_cnt_errors++;
        switch (op) {
        case ZEV_OP_ZFS_MOUNT:
                zev_statistics.zev_cnt_zfs_mount++;
                break;
        case ZEV_OP_ZFS_UMOUNT:
                zev_statistics.zev_cnt_zfs_umount++;
                break;
        case ZEV_OP_ZVOL_WRITE:
                zev_statistics.zev_cnt_zvol_write++;
                break;
        case ZEV_OP_ZVOL_TRUNCATE:
                zev_statistics.zev_cnt_zvol_truncate++;
                break;
        case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE:
                zev_statistics.zev_cnt_znode_close_after_update++;
                break;
        case ZEV_OP_ZNODE_CREATE:
                zev_statistics.zev_cnt_znode_create++;
                break;
        case ZEV_OP_ZNODE_REMOVE:
                zev_statistics.zev_cnt_znode_remove++;
                break;
        case ZEV_OP_ZNODE_LINK:
                zev_statistics.zev_cnt_znode_link++;
                break;
        case ZEV_OP_ZNODE_SYMLINK:
                zev_statistics.zev_cnt_znode_symlink++;
                break;
        case ZEV_OP_ZNODE_RENAME:
                zev_statistics.zev_cnt_znode_rename++;
                break;
        case ZEV_OP_ZNODE_WRITE:
                zev_statistics.zev_cnt_znode_write++;
                break;
        case ZEV_OP_ZNODE_TRUNCATE:
                zev_statistics.zev_cnt_znode_truncate++;
                break;
        case ZEV_OP_ZNODE_SETATTR:
                zev_statistics.zev_cnt_znode_setattr++;
                break;
        case ZEV_OP_ZNODE_ACL:
                zev_statistics.zev_cnt_znode_acl++;
                break;
        }
        if (bytes_in_queue > zev_statistics.zev_poll_wakeup_queue_len)
                wakeup = 1;
        mutex_exit(&zev_mutex);

        /* chpoll event, if necessary. */
        if (wakeup)
                pollwakeup(&zev_pollhead, POLLIN);
}
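
/*
 * Producer-side example (the format string here is hypothetical; the
 * actual event text is defined by the callbacks, not by this file).
 * A mount callback could emit
 *
 *	zev_mq_printf(ZEV_OP_ZFS_MOUNT, 0, "mount %s\n", mountpoint);
 *
 * The rendered line is appended to the tail zev_mq chunk, accounted in
 * zev_statistics, and later drained in order by zev_read().
 */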

static int
zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
        int instance;
        zev_state_t *sp;
        zev_statistics_t zs;
        zev_ioctl_poolarg_t pa;
        uint64_t len;

        instance = getminor(dev);
        if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
                return (ENXIO);
        if (ddi_model_convert_from(mode) != DDI_MODEL_NONE) {
                /*
                 * Userland has another data model (most likely 32-bit),
                 * which is not supported.
                 */
                return (EINVAL);
        }
        /*
         * Remember to do 32/64-bit mode adjustments if necessary.
         * See "Writing Device Drivers", 280pp.
         */
        switch (cmd) {
        case ZEV_IOC_GET_STATISTICS:
                /*
                 * ddi_copyout() can take a long time.  Better make a
                 * copy to be able to release the mutex faster.
                 */
                mutex_enter(&zev_mutex);
                memcpy(&zs, &zev_statistics, sizeof (zs));
                mutex_exit(&zev_mutex);
                if (ddi_copyout(&zs, (void *)arg, sizeof (zs), mode) != 0)
                        return (EFAULT);
                break;
        case ZEV_IOC_MUTE_POOL:
        case ZEV_IOC_UNMUTE_POOL:
                if (ddi_copyin((void *)arg, &pa, sizeof (pa), mode) != 0)
                        return (EFAULT);
                if (pa.zev_poolname_len >= MAXPATHLEN)
                        return (EINVAL);
                pa.zev_poolname[pa.zev_poolname_len] = '\0';
                if (cmd == ZEV_IOC_MUTE_POOL)
                        return (zev_ioc_mute_pool(pa.zev_poolname));
                else
                        return (zev_ioc_unmute_pool(pa.zev_poolname));
        case ZEV_IOC_SET_MAX_QUEUE_LEN:
                if (ddi_copyin((void *)arg, &len, sizeof (len), mode) != 0)
                        return (EFAULT);
                if (len > ZEV_MAX_QUEUE_LEN)
                        return (EINVAL);
                mutex_enter(&zev_mutex);
                zev_statistics.zev_max_queue_len = len;
                cv_broadcast(&zev_condvar);
                mutex_exit(&zev_mutex);
                break;
        case ZEV_IOC_SET_POLL_WAKEUP_QUEUE_LEN:
                if (ddi_copyin((void *)arg, &len, sizeof (len), mode) != 0)
                        return (EFAULT);
                mutex_enter(&zev_mutex);
                zev_statistics.zev_poll_wakeup_queue_len = len;
                mutex_exit(&zev_mutex);
                break;
        default:
                /* generic "ioctl unknown" error */
                return (ENOTTY);
        }
        return (0);
}

static int
zev_chpoll(dev_t dev, short events, int anyyet,
    short *reventsp, struct pollhead **phpp)
{
        int instance;
        zev_state_t *sp;
        short revent = 0;

        instance = getminor(dev);
        if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
                return (ENXIO);
        revent = 0;
        if ((events & POLLIN)) {
                mutex_enter(&zev_mutex);
                if (zev_mq_head)
                        revent |= POLLIN;
                mutex_exit(&zev_mutex);
        }
        if (revent == 0) {
                if (!anyyet)
                        *phpp = &zev_pollhead;
        }
        *reventsp = revent;
        return (0);
}

static int
zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p)
{
        zev_state_t *sp;
        int instance;
        offset_t off;
        int ret = 0;
        int mq_bytes;
        zev_mq_t *mq;
        int len;

        instance = getminor(dev);
        if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
                return (ENXIO);
        off = uio_p->uio_loffset;
        mutex_enter(&zev_mutex);
        while (zev_mq_head && uio_p->uio_resid) {
                mq_bytes = zev_mq_head->used - zev_mq_head->sent;
                if (mq_bytes <= 0) {
                        /* chunk fully drained; unlink and free it */
                        mq = zev_mq_head;
                        zev_mq_head = zev_mq_head->next;
                        if (!zev_mq_head)
                                zev_mq_tail = NULL;
                        kmem_free(mq, sizeof (*mq));
                        zev_mq_len--;
                        continue;
                }
                len = min(uio_p->uio_resid, mq_bytes);
                ret = uiomove(zev_mq_head->buf + zev_mq_head->sent, len,
                    UIO_READ, uio_p);
                if (ret != 0)
                        break;
                zev_statistics.zev_bytes_read += len;
                zev_statistics.zev_queue_len -= len;
                zev_mq_head->sent += len;
                /* queue shrank; unblock writers waiting in zev_mq_printf() */
                cv_broadcast(&zev_condvar);
        }
        mutex_exit(&zev_mutex);
        /* the queue is a stream, not a file: keep the offset unchanged */
        uio_p->uio_loffset = off;
        return (ret);
}

static int
zev_close(dev_t dev, int flag, int otyp, cred_t *crepd)
{
        zev_state_t *sp;
        int instance;

        instance = getminor(dev);
        if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
                return (ENXIO);
        if (otyp != OTYP_CHR)
                return (EINVAL);
        mutex_enter(&sp->mutex);
        if (sp->busy != B_TRUE) {
                mutex_exit(&sp->mutex);
                return (EINVAL);
        }
        sp->busy = B_FALSE;
        mutex_exit(&sp->mutex);
        return (0);
}

static int
zev_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
        zev_state_t *sp;
        int instance;

        instance = getminor(*devp);
        if ((sp = ddi_get_soft_state(statep, instance)) == NULL)
                return (ENXIO);
        if (otyp != OTYP_CHR)
                return (EINVAL);
        if (drv_priv(credp) != 0)
                return (EPERM);
        mutex_enter(&sp->mutex);
        if (sp->busy == B_TRUE) {
                /* XXX: wait for the instance to become available? */
                /* XXX: if we wait, the wait should be signal-interruptible. */
                mutex_exit(&sp->mutex);
                return (EBUSY);
        }
        sp->busy = B_TRUE;      /* can only be opened exclusively */
        mutex_exit(&sp->mutex);
        return (0);
}

static struct cb_ops zev_cb_ops = {
        zev_open,               /* open */
        zev_close,              /* close */
        nodev,                  /* strategy */
        nodev,                  /* print */
        nodev,                  /* dump */
        zev_read,               /* read */
        nodev,                  /* write */
        zev_ioctl,              /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        zev_chpoll,             /* chpoll */
        ddi_prop_op,            /* prop_op */
        NULL,                   /* streamtab */
        D_MP | D_64BIT,         /* cb_flag */
        CB_REV,                 /* cb_rev */
        nodev,                  /* aread */
        nodev,                  /* awrite */
};

static void
zev_free_instance(dev_info_t *dip)
{
        int instance;
        zev_state_t *sp;

        instance = ddi_get_instance(dip);
        ddi_remove_minor_node(dip, NULL);
        sp = ddi_get_soft_state(statep, instance);
        if (sp) {
                mutex_destroy(&sp->mutex);
                ddi_soft_state_free(statep, instance);
        }
}

static int
zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        int instance;
        zev_state_t *sp;

        /* called once per instance with DDI_DETACH, may be called to suspend */
        switch (cmd) {
        case DDI_DETACH:
                /* instance busy? */
                instance = ddi_get_instance(dip);
                if ((sp = ddi_get_soft_state(statep, instance)) == NULL) {
                        /* detach(9E) must return DDI_* codes, not errnos */
                        return (DDI_FAILURE);
                }
                mutex_enter(&sp->mutex);
                if (sp->busy == B_TRUE) {
                        mutex_exit(&sp->mutex);
                        return (DDI_FAILURE);
                }
                mutex_exit(&sp->mutex);
                /* free resources allocated for this instance */
                zev_free_instance(dip);
                return (DDI_SUCCESS);
        case DDI_SUSPEND:
                /* kernel must not suspend zev devices while ZFS is running */
                return (DDI_FAILURE);
        default:
                return (DDI_FAILURE);
        }
}

static int
zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        /* called once per instance with DDI_ATTACH, may be called to resume */
        int instance;
        zev_state_t *sp;

        switch (cmd) {
        case DDI_ATTACH:
                instance = ddi_get_instance(dip);
                if (ddi_soft_state_zalloc(statep, instance) != DDI_SUCCESS)
                        return (DDI_FAILURE);
                sp = ddi_get_soft_state(statep, instance);
                ddi_set_driver_private(dip, sp);
                sp->dip = dip;
                sp->busy = B_FALSE;
                mutex_init(&sp->mutex, NULL, MUTEX_DRIVER, NULL);
                if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
                    instance, DDI_PSEUDO, 0) == DDI_FAILURE) {
                        zev_free_instance(dip);
                        return (DDI_FAILURE);
                }
                ddi_report_dev(dip);
                return (DDI_SUCCESS);
        case DDI_RESUME:
                /* suspending zev devices should never happen */
                return (DDI_SUCCESS);
        default:
                return (DDI_FAILURE);
        }
}

static int
zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
{
        int instance;
        zev_state_t *sp;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
                /* arg is dev_t */
                instance = getminor((dev_t)arg);
                if ((sp = ddi_get_soft_state(statep, instance)) != NULL) {
                        *resultp = sp->dip;
                        return (DDI_SUCCESS);
                }
                *resultp = NULL;
                return (DDI_FAILURE);
        case DDI_INFO_DEVT2INSTANCE:
                /* arg is dev_t */
                instance = getminor((dev_t)arg);
                *resultp = (void *)(uintptr_t)instance;
                /* instance resolved; report success */
                return (DDI_SUCCESS);
        }
        return (DDI_FAILURE);
}

static struct dev_ops zev_dev_ops = {
        DEVO_REV,               /* driver build revision */
        0,                      /* driver reference count */
        zev_getinfo,            /* getinfo */
        nulldev,                /* identify (obsolete) */
        nulldev,                /* probe (search for devices) */
        zev_attach,             /* attach */
        zev_detach,             /* detach */
        nodev,                  /* reset (obsolete, use quiesce) */
        &zev_cb_ops,            /* character and block device ops */
        NULL,                   /* bus driver ops */
        NULL,                   /* power management, not needed */
        ddi_quiesce_not_needed, /* quiesce */
};

static struct modldrv zev_modldrv = {
        &mod_driverops,                 /* all loadable modules use this */
        "zev ZFS event provider, v1.0", /* driver name and version info */
        &zev_dev_ops                    /* ops method pointers */
};

static struct modlinkage zev_modlinkage = {
        MODREV_1,               /* fixed value */
        {
                &zev_modldrv,   /* driver linkage structure */
                NULL            /* list terminator */
        }
};

int
_init(void)
{
        int error;
        boolean_t module_installed = B_FALSE;

        if ((error = ddi_soft_state_init(&statep, sizeof (zev_state_t),
            1)) != 0)
                return (error);
        zev_busy = B_FALSE;

        mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL);
        cv_init(&zev_condvar, NULL, CV_DRIVER, NULL);
        rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL);
        bzero(&zev_statistics, sizeof (zev_statistics));
        zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN;
        zev_statistics.zev_poll_wakeup_queue_len =
            ZEV_MIN_POLL_WAKEUP_QUEUE_LEN;
        if ((error = zev_ioc_mute_pool("zg0")) != 0) {
                cmn_err(CE_WARN, "zev: could not init mute list");
                goto FAIL;
        }

        if ((error = mod_install(&zev_modlinkage)) != 0) {
                cmn_err(CE_WARN, "zev: could not install module");
                goto FAIL;
        }
        module_installed = B_TRUE;

        /*
         * Note: _init() seems to be a bad place to access other modules'
         * device files, as it can cause a kernel panic.
         *
         * For example, our _init() is called if our module isn't loaded
         * when someone causes a readdir() in "/devices/pseudo".  For
         * that, devfs_readdir() is used, which obtains an rwlock for the
         * directory.
         *
         * Then, if we open a device file here, we will indirectly call
         * devfs_lookup(), which tries to obtain the same rwlock again,
         * which this thread already holds.  That will result in a
         * kernel panic ("recursive entry").
         *
         * Therefore, we have switched from a zfs ioctl() to directly
         * accessing symbols in the zfs module.
         */

        /* switch ZFS event callbacks to zev module callback functions */
        rw_enter(&rz_zev_rwlock, RW_WRITER);
        rz_zev_callbacks = &zev_callbacks;
        rw_exit(&rz_zev_rwlock);

        zev_poll_wakeup_thread = thread_create(NULL, 0,
            zev_poll_wakeup_thread_main, NULL, 0, &p0, TS_RUN, minclsyspri);

        return (0);

FAIL:
        /* free resources */
        if (module_installed == B_TRUE)
                (void) mod_remove(&zev_modlinkage);
        rw_destroy(&zev_pool_list_rwlock);
        cv_destroy(&zev_condvar);
        mutex_destroy(&zev_mutex);
        ddi_soft_state_fini(&statep);
        return (error);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&zev_modlinkage, modinfop));
}

int
_fini(void)
{
        int error = 0;
        zev_mq_t *mq;
        zev_pool_list_entry_t *pe, *npe;

        mutex_enter(&zev_mutex);
        if (zev_busy == B_TRUE) {
                mutex_exit(&zev_mutex);
                return (SET_ERROR(EBUSY));
        }
        mutex_exit(&zev_mutex);

        /* switch ZFS event callbacks back to default */
        rw_enter(&rz_zev_rwlock, RW_WRITER);
        rz_zev_callbacks = rz_zev_default_callbacks;
        rw_exit(&rz_zev_rwlock);

        /* no thread is inside of the callbacks anymore.  Safe to remove. */
        zev_wakeup_thread_run = 0;
        if (zev_poll_wakeup_thread != NULL) {
                thread_join(zev_poll_wakeup_thread->t_did);
                zev_poll_wakeup_thread = NULL;
        }

        if ((error = mod_remove(&zev_modlinkage)) != 0) {
                cmn_err(CE_WARN, "mod_remove failed: %d", error);
                return (error);
        }

        /* free resources */
        mutex_enter(&zev_mutex);
        while (zev_mq_head) {
                mq = zev_mq_head;
                zev_mq_head = zev_mq_head->next;
                kmem_free(mq, sizeof (*mq));
        }
        zev_mq_tail = NULL;
        mutex_exit(&zev_mutex);
        rw_enter(&zev_pool_list_rwlock, RW_WRITER);
        pe = zev_muted_pools_head;
        while (pe) {
                npe = pe;
                pe = pe->next;
                kmem_free(npe, sizeof (*npe));
        }
        zev_muted_pools_head = NULL;
        rw_exit(&zev_pool_list_rwlock);
        ddi_soft_state_fini(&statep);
        rw_destroy(&zev_pool_list_rwlock);
        cv_destroy(&zev_condvar);
        mutex_destroy(&zev_mutex);

        return (0);
}
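
/*
 * Userland consumer sketch (illustrative only; the device path below is
 * an assumption derived from the DDI_PSEUDO minor node created in
 * zev_attach(), and the event text format is not defined in this file).
 * A consumer opens the device exclusively, polls for POLLIN, and reads
 * the newline-separated event stream:
 *
 *	#include <fcntl.h>
 *	#include <poll.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		char buf[4096];
 *		int fd = open("/devices/pseudo/zev@0:zev", O_RDONLY);
 *		struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *		for (;;) {
 *			if (poll(&pfd, 1, -1) > 0 &&
 *			    (pfd.revents & POLLIN)) {
 *				ssize_t n = read(fd, buf, sizeof (buf));
 *				if (n > 0)
 *					(void) fwrite(buf, 1, n, stdout);
 *			}
 *		}
 *	}
 *
 * ZEV_IOC_GET_STATISTICS, ZEV_IOC_MUTE_POOL and the other ioctls can be
 * issued on the same descriptor with ioctl(2), passing the structures
 * defined in the project's <sys/fs/zev.h>.
 */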