#include #include #include #include #include #include #include #include #include typedef struct zev_state { kmutex_t mutex; dev_info_t *dip; boolean_t busy; } zev_state_t; static void *statep; struct pollhead zev_pollhead; kmutex_t zev_mutex; kcondvar_t zev_condvar; krwlock_t zev_pool_list_rwlock; static zev_statistics_t zev_statistics; static boolean_t zev_busy; static kmutex_t zev_mark_id_mutex; static uint64_t zev_mark_id = 0; /* * The longest potential message is from zev_zfs_mount() and * contains the mountpoint, which might be close to MAXPATHLEN bytes long. * * Another candidate is zev_znode_rename_cb() and contains three inode * numbers and two filenames of up to MAXNAMELEN bytes each. */ #define ZEV_MAX_MESSAGE_LEN 4096 /* If the queue size reaches 1GB, stop ZFS ops and block the threads. */ #define ZEV_MAX_QUEUE_LEN (1 * 1024 * 1024 * 1024) /* Don't wake up poll()ing processes for every single message. */ #define ZEV_MIN_POLL_WAKEUP_QUEUE_LEN 8192 static zev_msg_t *zev_queue_head = NULL; static zev_msg_t *zev_queue_tail = NULL; static uint64_t zev_queue_len = 0; typedef struct zev_pool_list_entry { struct zev_pool_list_entry *next; char name[MAXPATHLEN]; } zev_pool_list_entry_t; static zev_pool_list_entry_t *zev_muted_pools_head = NULL; /* * poll() wakeup thread. Used to check periodically whether we have * bytes left in the queue that have not yet been made into a * pollwakeup() call. This is meant to insure a maximum waiting * time until an event is presented as a poll wakeup, while at * the same time not making every single event into a poll wakeup * of it's own. */ static volatile int zev_wakeup_thread_run = 1; static kthread_t *zev_poll_wakeup_thread = NULL; static void zev_poll_wakeup_thread_main(void) { int wakeup; while (zev_wakeup_thread_run) { delay(drv_usectohz(100 * 1000)); /* sleep 100ms */ /* check message queue */ mutex_enter(&zev_mutex); wakeup = 0; if (zev_queue_head) wakeup = 1; mutex_exit(&zev_mutex); if (wakeup) pollwakeup(&zev_pollhead, POLLIN); } thread_exit(); } static int zev_ioc_mute_pool(char *poolname) { zev_pool_list_entry_t *pe; rw_enter(&zev_pool_list_rwlock, RW_WRITER); /* pool already muted? */ for (pe=zev_muted_pools_head; pe; pe=pe->next) { if (!strcmp(pe->name, poolname)) { rw_exit(&zev_pool_list_rwlock); return EEXIST; } } pe = kmem_zalloc(sizeof(*pe), KM_SLEEP); if (!pe) { rw_exit(&zev_pool_list_rwlock); return ENOMEM; } strncpy(pe->name, poolname, sizeof(pe->name)); pe->next = zev_muted_pools_head; zev_muted_pools_head = pe; rw_exit(&zev_pool_list_rwlock); return (0); } static int zev_ioc_unmute_pool(char *poolname) { zev_pool_list_entry_t *pe, *peprev; rw_enter(&zev_pool_list_rwlock, RW_WRITER); /* pool muted? */ peprev = NULL; for (pe=zev_muted_pools_head; pe; pe=pe->next) { if (!strcmp(pe->name, poolname)) { goto found; } peprev = pe; } rw_exit(&zev_pool_list_rwlock); return ENOENT; found: if (peprev != NULL) { peprev->next = pe->next; } else { zev_muted_pools_head = pe->next; } kmem_free(pe, sizeof(*pe)); rw_exit(&zev_pool_list_rwlock); return (0); } int zev_skip_pool(objset_t *os) { zev_pool_list_entry_t *pe; dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; rw_enter(&zev_pool_list_rwlock, RW_READER); for (pe=zev_muted_pools_head; pe; pe=pe->next) { if (!strcmp(pe->name, dp->dp_spa->spa_name)) { rw_exit(&zev_pool_list_rwlock); return 1; } } rw_exit(&zev_pool_list_rwlock); return 0; } void zev_queue_message(int op, zev_msg_t *msg) { time_t now = 0; int wakeup = 0; msg->next = NULL; if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) { zev_queue_error(op, "unknown op id encountered: %d", op); kmem_free(msg, sizeof(*msg) + msg->size); return; } mutex_enter(&zev_mutex); while (zev_statistics.zev_max_queue_len && zev_statistics.zev_queue_len >= zev_statistics.zev_max_queue_len) { /* queue full. block until it's been shrunk. */ cv_wait(&zev_condvar, &zev_mutex); } if (zev_queue_tail == NULL) { zev_queue_head = zev_queue_tail = msg; } else { zev_queue_tail->next = msg; zev_queue_tail = msg; } zev_queue_len++; /* update statistics */ zev_statistics.zev_cnt_total_events++; zev_statistics.zev_queue_len += msg->size; if (zev_statistics.zev_queue_len > zev_statistics.zev_poll_wakeup_queue_len) wakeup = 1; switch (op) { case ZEV_OP_ERROR: zev_statistics.zev_cnt_errors++; break; case ZEV_OP_MARK: zev_statistics.zev_cnt_marks++; break; case ZEV_OP_ZFS_MOUNT: zev_statistics.zev_cnt_zfs_mount++; break; case ZEV_OP_ZFS_UMOUNT: zev_statistics.zev_cnt_zfs_umount++; break; case ZEV_OP_ZVOL_WRITE: zev_statistics.zev_cnt_zvol_write++; break; case ZEV_OP_ZVOL_TRUNCATE: zev_statistics.zev_cnt_zvol_truncate++; break; case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE: zev_statistics.zev_cnt_znode_close_after_update++; break; case ZEV_OP_ZNODE_CREATE: zev_statistics.zev_cnt_znode_create++; break; case ZEV_OP_ZNODE_REMOVE: zev_statistics.zev_cnt_znode_remove++; break; case ZEV_OP_ZNODE_LINK: zev_statistics.zev_cnt_znode_link++; break; case ZEV_OP_ZNODE_SYMLINK: zev_statistics.zev_cnt_znode_symlink++; break; case ZEV_OP_ZNODE_RENAME: zev_statistics.zev_cnt_znode_rename++; break; case ZEV_OP_ZNODE_WRITE: zev_statistics.zev_cnt_znode_write++; break; case ZEV_OP_ZNODE_TRUNCATE: zev_statistics.zev_cnt_znode_truncate++; break; case ZEV_OP_ZNODE_SETATTR: zev_statistics.zev_cnt_znode_setattr++; break; case ZEV_OP_ZNODE_ACL: zev_statistics.zev_cnt_znode_acl++; break; } mutex_exit(&zev_mutex); /* chpoll event, if necessary. */ if (wakeup) pollwakeup(&zev_pollhead, POLLIN); return; } void zev_queue_error(int op, char *fmt, ...) { char buf[ZEV_MAX_MESSAGE_LEN]; va_list ap; int len; zev_msg_t *msg = NULL; zev_error_t *rec; int msg_size; va_start(ap, fmt); len = vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); if (len >= sizeof(buf)) { cmn_err(CE_WARN, "zev: can't report error - " "dropping event entirely."); return; } msg_size = sizeof(*rec) + len + 1; msg = kmem_alloc(sizeof(*msg) + msg_size, KM_SLEEP); msg->size = msg_size; rec = (zev_error_t *)(msg + 1); rec->op = ZEV_OP_ERROR; rec->op_time = ddi_get_time(); rec->guid = 0; rec->failed_op = op; rec->errstr_len = len; memcpy(ZEV_ERRSTR(rec), buf, len + 1); zev_queue_message(ZEV_OP_ERROR, msg); return; } static int zev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { int instance; zev_state_t *sp; zev_statistics_t zs; zev_ioctl_poolarg_t pa; zev_ioctl_mark_t mark; zev_mark_t *rec; int msg_size; zev_msg_t *msg; uint64_t len; uint64_t mark_id; instance = getminor(dev); if ((sp = ddi_get_soft_state(statep, instance)) == NULL) return (ENXIO); /* * all structures passed between kernel and userspace * are now compatible between 64 and 32 bit. Model * conversion can be ignore. */ #if 0 /* Remember to do 32/64 bit mode adjustments if necessary. See "Writing Device Drivers", 280pp */ if (ddi_model_convert_from(mode) != DDI_MODEL_NONE) { /* userland has another data model. (most likely 32-bit) -> not supported. */ return (EINVAL); } #endif switch (cmd) { case ZEV_IOC_GET_STATISTICS: /* ddi_copyout() can take a long time. Better make a copy to be able to release the mutex faster. */ mutex_enter(&zev_mutex); memcpy(&zs, &zev_statistics, sizeof(zs)); mutex_exit(&zev_mutex); if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0) return EFAULT; break; case ZEV_IOC_MUTE_POOL: case ZEV_IOC_UNMUTE_POOL: if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) return EFAULT; if (pa.zev_poolname_len >=MAXPATHLEN) return EINVAL; pa.zev_poolname[pa.zev_poolname_len] = '\0'; if (cmd == ZEV_IOC_MUTE_POOL) { return zev_ioc_mute_pool(pa.zev_poolname); } else { return zev_ioc_unmute_pool(pa.zev_poolname); } break; case ZEV_IOC_SET_MAX_QUEUE_LEN: if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) return EFAULT; if (len > ZEV_MAX_QUEUE_LEN) return EINVAL; mutex_enter(&zev_mutex); zev_statistics.zev_max_queue_len = len; cv_broadcast(&zev_condvar); mutex_exit(&zev_mutex); break; case ZEV_IOC_SET_POLL_WAKEUP_QUEUE_LEN: if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) return EFAULT; mutex_enter(&zev_mutex); zev_statistics.zev_poll_wakeup_queue_len = len; mutex_exit(&zev_mutex); break; case ZEV_IOC_MARK: if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) return EFAULT; cmn_err(CE_WARN, "mark: guid=%lu payload_len=%d", (long unsigned int)mark.zev_guid, mark.zev_payload_len); /* prepare message */ msg_size = sizeof(*rec) + mark.zev_payload_len + 1; msg = kmem_alloc(sizeof(*msg) + msg_size, KM_SLEEP); msg->size = msg_size; rec = (zev_mark_t *)(msg + 1); rec->record_len = msg_size; rec->op = ZEV_OP_MARK; rec->op_time = ddi_get_time(); rec->guid = mark.zev_guid; rec->payload_len = mark.zev_payload_len; /* get payload */ if (ddi_copyin(((char *)arg) + sizeof(mark), ZEV_PAYLOAD(rec), mark.zev_payload_len, mode) != 0) { kmem_free(msg, msg_size); return EFAULT; } *(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0'; /* get mark id and queue message */ mutex_enter(&zev_mark_id_mutex); mark_id = zev_mark_id++; mutex_exit(&zev_mark_id_mutex); rec->mark_id = mark_id; zev_queue_message(ZEV_OP_MARK, msg); /* report mark id to userland, ignore errors */ mark.zev_mark_id = mark_id; ddi_copyout(&mark, (void *)arg, sizeof(mark), mode); break; default: /* generic "ioctl unknown" error */ return (ENOTTY); } return (0); } static int zev_chpoll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { int instance; zev_state_t *sp; short revent = 0; instance = getminor(dev); if ((sp = ddi_get_soft_state(statep, instance)) == NULL) return (ENXIO); revent = 0; if ((events & POLLIN)) { mutex_enter(&zev_mutex); if (zev_queue_head) revent |= POLLIN; mutex_exit(&zev_mutex); } if (revent == 0) { if (!anyyet) { *phpp = &zev_pollhead; } } *reventsp = revent; return (0); } static int zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p) { zev_state_t *sp; int instance; offset_t off; int ret = 0; zev_msg_t *msg; char *data; instance = getminor(dev); if ((sp = ddi_get_soft_state(statep, instance)) == NULL) return (ENXIO); off = uio_p->uio_loffset; mutex_enter(&zev_mutex); msg = zev_queue_head; if (msg == NULL) { mutex_exit(&zev_mutex); return 0; } if (msg->size > uio_p->uio_resid) { mutex_exit(&zev_mutex); return E2BIG; } while (msg && uio_p->uio_resid >= msg->size) { data = (char *)(msg + 1); ret = uiomove(data, msg->size, UIO_READ, uio_p); if (ret != 0) { mutex_exit(&zev_mutex); cmn_err(CE_WARN, "zev: uiomove failed; messages lost"); uio_p->uio_loffset = off; return (ret); } zev_queue_head = msg->next; if (zev_queue_head == NULL) zev_queue_tail = NULL; zev_statistics.zev_bytes_read += msg->size; zev_statistics.zev_queue_len -= msg->size; zev_queue_len--; kmem_free(msg, sizeof(*msg) + msg->size); msg = zev_queue_head; } cv_broadcast(&zev_condvar); mutex_exit(&zev_mutex); uio_p->uio_loffset = off; return 0; } static int zev_close(dev_t dev, int flag, int otyp, cred_t *crepd) { zev_state_t *sp; int instance; instance = getminor(dev); if ((sp = ddi_get_soft_state(statep, instance)) == NULL) return (ENXIO); if (otyp != OTYP_CHR) return (EINVAL); mutex_enter(&sp->mutex); if (sp->busy != B_TRUE) { mutex_exit(&sp->mutex); return (EINVAL); } sp->busy = B_FALSE; mutex_exit(&sp->mutex); return (0); } static int zev_open(dev_t *devp, int flag, int otyp, cred_t *credp) { zev_state_t *sp; int instance; instance = getminor(*devp); if ((sp = ddi_get_soft_state(statep, instance)) == NULL) return (ENXIO); if (otyp != OTYP_CHR) return (EINVAL); if (drv_priv(credp) != 0) return (EPERM); mutex_enter(&sp->mutex); if (sp->busy == B_TRUE) { /* XXX: wait for the instance to become available? */ /* XXX: if we wait, the wait should be signal-interruptable. */ mutex_exit(&sp->mutex); return (EBUSY); } sp->busy = B_TRUE; /* can only be opened exclusively */ mutex_exit(&sp->mutex); return (0); } static struct cb_ops zev_cb_ops = { zev_open, /* open */ zev_close, /* close */ nodev, /* strategy */ nodev, /* print */ nodev, /* dump */ zev_read, /* read */ nodev, /* write */ zev_ioctl, /* ioctl */ nodev, /* devmap */ nodev, /* mmap */ nodev, /* segmap */ zev_chpoll, /* chpoll */ ddi_prop_op, /* prop_op */ NULL, /* streamtab */ D_MP | D_64BIT, /* cb_flag */ CB_REV, /* cb_rev */ nodev, /* aread */ nodev, /* awrite */ }; static void zev_free_instance(dev_info_t *dip) { int instance; zev_state_t *sp; instance = ddi_get_instance(dip); //ddi_remove_minor_node(dip, ddi_get_name(dip)); ddi_remove_minor_node(dip, NULL); sp = ddi_get_soft_state(statep, instance); if (sp) { mutex_destroy(&sp->mutex); ddi_soft_state_free(statep, instance); } } static int zev_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { int instance; zev_state_t *sp; /* called once per instance with DDI_DETACH, may be called to suspend */ switch (cmd) { case DDI_DETACH: /* instance busy? */ instance = ddi_get_instance(dip); if ((sp = ddi_get_soft_state(statep, instance)) == NULL) return (ENXIO); mutex_enter(&sp->mutex); if (sp->busy == B_TRUE) { mutex_exit(&sp->mutex); return (EBUSY); } mutex_exit(&sp->mutex); /* free resources allocated for this instance */ zev_free_instance(dip); return (DDI_SUCCESS); case DDI_SUSPEND: /* kernel must not suspend zev devices while ZFS is running */ return (DDI_FAILURE); default: return (DDI_FAILURE); } } static int zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { /* called once per instance with DDI_ATTACH, may be called to resume */ int instance; zev_state_t *sp; switch (cmd) { case DDI_ATTACH: instance = ddi_get_instance(dip); if (ddi_soft_state_zalloc(statep, instance) != DDI_SUCCESS) { return (DDI_FAILURE); } sp = ddi_get_soft_state(statep, instance); ddi_set_driver_private(dip, sp); sp->dip = dip; sp->busy = B_FALSE; mutex_init(&sp->mutex, NULL, MUTEX_DRIVER, NULL); if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR, instance, DDI_PSEUDO, 0) == DDI_FAILURE) { zev_free_instance(dip); return (DDI_FAILURE); } ddi_report_dev(dip); return (DDI_SUCCESS); case DDI_RESUME: /* suspendeding zev devices should never happen */ return (DDI_SUCCESS); default: return (DDI_FAILURE); } } static int zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp) { int instance; zev_state_t *sp; switch (infocmd) { case DDI_INFO_DEVT2DEVINFO: /* arg is dev_t */ instance = getminor((dev_t)arg); if ((sp = ddi_get_soft_state(statep, instance)) != NULL) { *resultp = sp->dip; return (DDI_SUCCESS); } *resultp = NULL; return (DDI_FAILURE); case DDI_INFO_DEVT2INSTANCE: /* arg is dev_t */ instance = getminor((dev_t)arg); *resultp = (void *)(uintptr_t)instance; return (DDI_FAILURE); } return (DDI_FAILURE); } static struct dev_ops zev_dev_ops = { DEVO_REV, /* driver build revision */ 0, /* driver reference count */ zev_getinfo, /* getinfo */ nulldev, /* identify (obsolete) */ nulldev, /* probe (search for devices) */ zev_attach, /* attach */ zev_detach, /* detach */ nodev, /* reset (obsolete, use quiesce) */ &zev_cb_ops, /* character and block device ops */ NULL, /* bus driver ops */ NULL, /* power management, not needed */ ddi_quiesce_not_needed, /* quiesce */ }; static struct modldrv zev_modldrv = { &mod_driverops, /* all loadable modules use this */ "zev ZFS event provider, v1.0", /* driver name and version info */ &zev_dev_ops /* ops method pointers */ }; static struct modlinkage zev_modlinkage = { MODREV_1, /* fixed value */ { &zev_modldrv, /* driver linkage structure */ NULL /* list terminator */ } }; int _init(void) { int error; boolean_t module_installed = B_FALSE; if ((error = ddi_soft_state_init(&statep, sizeof(zev_state_t), 1)) != 0) return (error); zev_busy = B_FALSE; mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL); cv_init(&zev_condvar, NULL, CV_DRIVER, NULL); rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL); mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL); zev_mark_id = gethrtime(); bzero(&zev_statistics, sizeof(zev_statistics)); zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN; zev_statistics.zev_poll_wakeup_queue_len = ZEV_MIN_POLL_WAKEUP_QUEUE_LEN; if (zev_ioc_mute_pool("zg0")) { cmn_err(CE_WARN, "zev: could not init mute list"); goto FAIL; } if ((error = mod_install(&zev_modlinkage)) != 0) { cmn_err(CE_WARN, "zev: could not install module"); goto FAIL; } module_installed = B_TRUE; /* * Note: _init() seems to be a bad place to access other modules' * device files, as it can cause a kernel panic. * * For example, our _init() is called if our module isn't loaded * when someone causes a readdir() in "/devices/pseudo". For that, * devfs_readdir() is used, which obtains an rwlock for the * directory. * * Then, if we open a device file here, we will indirectly call * devfs_lookup(), which tries to obtain the same rwlock * again, which this thread already has. That will result in * a kernel panic. ("recursive entry") * * Therefor, we have switched from a zfs ioctl() to directly * accessing symbols in the zfs module. */ /* switch ZFS event callbacks to zev module callback functions */ rw_enter(&rz_zev_rwlock, RW_WRITER); rz_zev_callbacks = &zev_callbacks; rw_exit(&rz_zev_rwlock); zev_poll_wakeup_thread = thread_create(NULL, 0, zev_poll_wakeup_thread_main, NULL, 0, &p0, TS_RUN, minclsyspri); return (0); FAIL: /* free resources */ if (module_installed == B_TRUE) (void) mod_remove(&zev_modlinkage); mutex_destroy(&zev_mutex); ddi_soft_state_fini(&statep); return (error); } int _info(struct modinfo *modinfop) { return (mod_info(&zev_modlinkage, modinfop)); } int _fini(void) { int error = 0; zev_msg_t *msg; zev_pool_list_entry_t *pe, *npe; mutex_enter(&zev_mutex); if (zev_busy == B_TRUE) { mutex_exit(&zev_mutex); return (SET_ERROR(EBUSY)); } mutex_exit(&zev_mutex); /* switch ZFS event callbacks back to default */ rw_enter(&rz_zev_rwlock, RW_WRITER); rz_zev_callbacks = rz_zev_default_callbacks; rw_exit(&rz_zev_rwlock); /* no thread is inside of the callbacks anymore. Safe to remove. */ zev_wakeup_thread_run = 0; if (zev_poll_wakeup_thread != 0) { thread_join(zev_poll_wakeup_thread->t_did); zev_poll_wakeup_thread = 0; } if ((error = mod_remove(&zev_modlinkage)) != 0) { cmn_err(CE_WARN, "mod_remove failed: %d", error); return (error); } /* free resources */ mutex_enter(&zev_mutex); while (zev_queue_head) { msg = zev_queue_head; zev_queue_head = msg->next; if (msg) kmem_free(msg, sizeof(*msg) + msg->size); } mutex_exit(&zev_mutex); rw_enter(&zev_pool_list_rwlock, RW_WRITER); pe = zev_muted_pools_head; while (pe) { npe = pe; pe = pe->next; kmem_free(npe, sizeof(*npe)); } rw_exit(&zev_pool_list_rwlock); ddi_soft_state_fini(&statep); rw_destroy(&zev_pool_list_rwlock); cv_destroy(&zev_condvar); mutex_destroy(&zev_mutex); mutex_destroy(&zev_mark_id_mutex); return (0); }