/*
 * NOTE(review): the header names of the #include directives below are
 * missing in this copy of the file; restore them from the upstream source
 * before building.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* byte offset of member m inside struct type s */
#define OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))

#define ZEV_DEFAULT_QUEUE_NAME		"beaver"
/* minor 0 is the control device; event queues use the minors above it */
#define ZEV_CONTROL_DEVICE_MINOR	0
#define ZEV_MINOR_MIN		(ZEV_CONTROL_DEVICE_MINOR + 1)
#define ZEV_MINOR_MAX		(ZEV_MINOR_MIN + ZEV_MAX_QUEUES - 1)

/*
 * Per-minor state of one event queue.  Instances live in the soft state
 * table (statep) and are additionally published in zev_queues[], indexed
 * by (zq_minor_number - ZEV_MINOR_MIN).
 */
typedef struct zev_queue {
	/* queue name, NUL-terminated; also used as the minor node name */
	char			zq_name[ZEV_MAX_QUEUE_NAME_LEN+1];
	minor_t			zq_minor_number;
	/* dev_info used to create and remove the queue's minor node */
	dev_info_t		*zq_dip;
	/* handed out by zev_chpoll() and signalled via pollwakeup() */
	struct pollhead		zq_pollhead;
	uint64_t		zq_bytes_read;
	uint64_t		zq_events_read;
	uint64_t		zq_bytes_discarded;
	uint64_t		zq_events_discarded;
	uint64_t		zq_bytes_total;
	uint64_t		zq_events_total;
	/* pending byte count above which a pollwakeup() is issued */
	uint64_t		zq_wakeup_threshold;
	/* ZEV_FL_* flags */
	uint16_t		zq_flags;
	uint16_t		zq_need_wakeup;
	/* protected by zev_mutex */
	/* see zev_queue_hold() / zev_queue_release() */
	int			zq_refcnt;
	/* sum of msg->size over the messages pending in this queue */
	uint64_t		zq_queue_len;
	/* number of messages pending in this queue */
	uint64_t		zq_queue_messages;
	uint64_t		zq_max_queue_len;
	/* oldest pending message in the global list, NULL if none */
	zev_msg_t		*zq_oldest;
	/* set by zev_open(), cleared by zev_close() */
	boolean_t		zq_busy;
	/* set by the remove-queue ioctl once its release has been done */
	boolean_t		zq_to_be_removed;
	/* per-operation counters, see zev_update_statistics() */
	zev_statistics_t	zq_statistics;
	/* broadcast when a message arrives in an empty queue */
	kcondvar_t		zq_condvar;
} zev_queue_t;

/* soft state handle, keyed by minor number */
static void		*statep;
struct pollhead		zev_pollhead;

/* held around accesses to zev_queues[] and the global message list */
kmutex_t		zev_mutex;
/* waited on by producers while a queue is full */
kcondvar_t		zev_condvar;
/* serializes zev_queue_message() callers, see the comment there */
kmutex_t		zev_queue_msg_mutex;
/* protects the muted pool list */
krwlock_t		zev_pool_list_rwlock;
static zev_statistics_t	zev_statistics;
static boolean_t	zev_attached;
static kmutex_t		zev_mark_id_mutex;
static uint64_t		zev_mark_id = 0;

static uint64_t		zev_msg_sequence_number = 0;
static zev_queue_t	*zev_queues[ZEV_MAX_QUEUES];
static int		zev_queue_cnt = 0;

uint64_t	zev_memory_allocated = 0;
uint64_t	zev_memory_freed = 0;

/*
 * The longest potential message is from zev_zfs_mount() and
 * contains the mountpoint, which might be close to MAXPATHLEN bytes long.
 *
 * Another candidate is zev_znode_rename_cb() and contains three inode
 * numbers and two filenames of up to MAXNAMELEN bytes each.
*/ #define ZEV_MAX_MESSAGE_LEN 4096 static zev_msg_t *zev_queue_head = NULL; static zev_msg_t *zev_queue_tail = NULL; static uint64_t zev_queue_len = 0; typedef struct zev_pool_list_entry { struct zev_pool_list_entry *next; char name[MAXPATHLEN]; } zev_pool_list_entry_t; static zev_pool_list_entry_t *zev_muted_pools_head = NULL; static volatile int zev_wakeup_thread_run = 1; static kthread_t *zev_poll_wakeup_thread = NULL; void * zev_alloc(ssize_t sz) { ZEV_MEM_ADD(sz); return kmem_alloc(sz, KM_SLEEP); } void * zev_zalloc(ssize_t sz) { ZEV_MEM_ADD(sz); return kmem_zalloc(sz, KM_SLEEP); } void zev_free(void *ptr, ssize_t sz) { ZEV_MEM_SUB(sz); \ kmem_free(ptr, sz); } int zev_queue_cmp(const void *a, const void *b) { const zev_queue_t *qa = a; const zev_queue_t *qb = b; if (qa->zq_minor_number > qb->zq_minor_number) return 1; if (qa->zq_minor_number < qb->zq_minor_number) return -1; return 0; } /* must be called with zev_mutex held */ void zev_queue_trim(void) { zev_msg_t *m; uint64_t oldest_message; zev_queue_t *q; int i; if (!zev_queue_tail) return; oldest_message = zev_queue_tail->seq + 1; /* does not exist, yet. 
*/
	/* find the lowest sequence number still referenced by any queue */
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		if (!q->zq_oldest)
			continue;
		if (oldest_message > q->zq_oldest->seq)
			oldest_message = q->zq_oldest->seq;
	}

	/* remove msgs between oldest_message and zev_queue_head */
	while(zev_queue_head && (oldest_message > zev_queue_head->seq)) {
		m = zev_queue_head;
		zev_queue_head = m->next;
		if (zev_queue_head == NULL) {
			zev_queue_tail = NULL;
		} else {
			zev_queue_head->prev = NULL;
		}
		/* a message nobody read counts as discarded */
		if (m->read == 0) {
			zev_statistics.zev_bytes_discarded += m->size;
			zev_statistics.zev_cnt_discarded_events++;
		}
		zev_statistics.zev_queue_len -= m->size;
		zev_queue_len--;
		zev_free(m, sizeof(*m) + m->size);
	}
}

/* Take a reference on queue q. */
/* must be called with zev_mutex held */
static void
zev_queue_hold(zev_queue_t *q)
{
	q->zq_refcnt++;
}

/*
 * Drop a reference on queue q.  When the count reaches zero and the queue
 * is not persistent, the queue is unpublished, unreferenced messages are
 * trimmed and the queue's minor node and soft state are released.
 */
/* must be called with zev_mutex held */
static void
zev_queue_release(zev_queue_t *q)
{
	q->zq_refcnt--;
	if (q->zq_refcnt > 0)
		return;

	ASSERT(q->zq_busy == B_FALSE);

	/* persistent queues will not be removed */
	if ((q->zq_flags & ZEV_FL_PERSISTENT) != 0)
		return;

	/* remove queue from queue list */
	zev_queues[q->zq_minor_number - ZEV_MINOR_MIN] = NULL;

	/* discard messages that no queue references anymore */
	zev_queue_trim();

	cv_destroy(&q->zq_condvar);
	ddi_remove_minor_node(q->zq_dip, q->zq_name);
	/* q itself is part of the soft state and is gone after this call */
	ddi_soft_state_free(statep, q->zq_minor_number);
	ZEV_MEM_SUB(sizeof(zev_queue_t));
	zev_queue_cnt--;
}

/*
 * Create a new event queue and its minor node.
 *
 * queue:          receives the new queue on success
 * dip:            dev_info the minor node is created on
 * name:           queue name; only [a-z0-9.] is accepted and the control
 *                 device name is rejected
 * max_queue_len:  0 selects ZEV_MAX_QUEUE_LEN; larger values are rejected
 * flags:          initial ZEV_FL_* flags
 *
 * Returns 0 on success or EINVAL, ENOSPC, EEXIST or EFAULT.
 */
int
zev_queue_new(zev_queue_t **queue,
              dev_info_t *dip,
              char *name,
              uint64_t max_queue_len,
              uint16_t flags)
{
	zev_queue_t *q;
	zev_queue_t *tmp;
	zev_msg_t *msg;
	int name_exists = 0;
	minor_t minor;
	char *p;
	int i;

	if (max_queue_len > ZEV_MAX_QUEUE_LEN)
		return EINVAL;
	if (max_queue_len == 0)
		max_queue_len = ZEV_MAX_QUEUE_LEN;
	if (!strcmp(name, ZEV_CONTROL_DEVICE_NAME))
		return EINVAL;
	/* reject names containing anything but [a-z0-9.] */
	for (p = name; *p; p++) {
		if (*p >= 'a' && *p <= 'z')
			continue;
		if (*p >= '0' && *p <= '9')
			continue;
		if (*p == '.')
			continue;
		return EINVAL;
	}

	mutex_enter(&zev_mutex);

	/* find free minor number.*/
	/* if this were 
a frequent operation we'd have a free-minor list */ for (minor = ZEV_MINOR_MIN; minor <= ZEV_MINOR_MAX; minor++) { tmp = zev_queues[minor - ZEV_MINOR_MIN]; if (tmp == NULL) break; } if (tmp) { mutex_exit(&zev_mutex); return ENOSPC; } if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) { mutex_exit(&zev_mutex); return ENOSPC; } ZEV_MEM_ADD(sizeof(zev_queue_t)); q = ddi_get_soft_state(statep, minor); memset(q, 0, sizeof(*q)); strncpy(q->zq_name, name, ZEV_MAX_QUEUE_NAME_LEN); q->zq_name[ZEV_MAX_QUEUE_NAME_LEN] = '\0'; q->zq_max_queue_len = max_queue_len; q->zq_wakeup_threshold = ZEV_DEFAULT_POLL_WAKEUP_QUEUE_LEN; q->zq_flags = flags; q->zq_refcnt = 1; q->zq_dip = dip; q->zq_minor_number = minor; cv_init(&q->zq_condvar, NULL, CV_DRIVER, NULL); /* insert into queue list */ for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) { /* if this were a frequent operation we'd have a name tree */ if (zev_queues[i - ZEV_MINOR_MIN] == NULL) continue; if (!strcmp(q->zq_name, zev_queues[i-ZEV_MINOR_MIN]->zq_name)) { name_exists = 1; break; } } if (name_exists) { ddi_soft_state_free(statep, minor); ZEV_MEM_SUB(sizeof(zev_queue_t)); mutex_exit(&zev_mutex); return EEXIST; } zev_queues[minor - ZEV_MINOR_MIN] = q; zev_queue_cnt++; /* calculate current queue len and find head and tail */ q->zq_oldest = zev_queue_tail; msg = zev_queue_tail; while ((msg != NULL) && (q->zq_queue_len < q->zq_max_queue_len)) { q->zq_queue_len += msg->size; q->zq_queue_messages++; q->zq_oldest = msg; msg = msg->prev; } mutex_exit(&zev_mutex); if (ddi_create_minor_node(dip, name, S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) { mutex_enter(&zev_mutex); zev_queues[minor - ZEV_MINOR_MIN] = NULL; zev_queue_cnt--; ddi_soft_state_free(statep, minor); ZEV_MEM_SUB(sizeof(zev_queue_t)); mutex_exit(&zev_mutex); return EFAULT; } *queue = q; return 0; } /* * poll() wakeup thread. Used to check periodically whether we have * bytes left in the queue that have not yet been made into a * pollwakeup() call. 
This is meant to ensure a maximum waiting
 * time until an event is presented as a poll wakeup, while at
 * the same time not making every single event into a poll wakeup
 * of its own.
 *
 * flush_all: if true, wake pollers of every opened, non-empty queue;
 * otherwise only those whose pending bytes exceed the queue's threshold.
 */
static void
zev_poll_wakeup(boolean_t flush_all)
{
	zev_queue_t *q;
	int i;

	/*
	 * This loop works with hold() and release() because
	 * pollwakeup() requires us to release our locks before calling it.
	 *
	 * from pollwakeup(9F):
	 *
	 *   "Driver defined locks should not be held across calls
	 *    to this function."
	 */

	/* wake up threads for each individual queue */
	mutex_enter(&zev_mutex);
	for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) {
		q = zev_queues[i - ZEV_MINOR_MIN];
		if (q == NULL)
			continue;
		/* only opened queues are considered */
		if (!q->zq_busy)
			continue;
		if (!q->zq_queue_len)
			continue;
		if ((flush_all) ||
		    (q->zq_queue_len > q->zq_wakeup_threshold)) {
			/* the hold keeps q alive while the mutex is dropped */
			zev_queue_hold(q);
			mutex_exit(&zev_mutex);
			pollwakeup(&q->zq_pollhead, POLLIN);
			mutex_enter(&zev_mutex);
			zev_queue_release(q);
		}
	}
	mutex_exit(&zev_mutex);
}

/*
 * Body of the poll wakeup thread: flushes all queues every 100ms until
 * zev_wakeup_thread_run is cleared, then exits.
 */
static void
zev_poll_wakeup_thread_main(void)
{
	while (zev_wakeup_thread_run) {
		delay(drv_usectohz(100 * 1000)); /* sleep 100ms */
		zev_poll_wakeup(B_TRUE);
	}
	thread_exit();
}

/*
 * Add poolname to the list of muted pools.
 * Returns 0 on success, EEXIST if the pool is already muted.
 */
static int
zev_ioc_mute_pool(char *poolname)
{
	zev_pool_list_entry_t *pe;
	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	/* pool already muted? */
	for (pe=zev_muted_pools_head; pe; pe=pe->next) {
		if (!strcmp(pe->name, poolname)) {
			rw_exit(&zev_pool_list_rwlock);
			return EEXIST;
		}
	}
	pe = zev_zalloc(sizeof(*pe));
	if (!pe) {
		rw_exit(&zev_pool_list_rwlock);
		return ENOMEM;
	}
	/*
	 * NOTE(review): termination relies on the zeroed allocation and on
	 * the caller limiting poolname to fewer than MAXPATHLEN bytes, as
	 * zev_ioctl() does.
	 */
	(void) strncpy(pe->name, poolname, sizeof(pe->name));
	/* new entries go to the front of the list */
	pe->next = zev_muted_pools_head;
	zev_muted_pools_head = pe;
	rw_exit(&zev_pool_list_rwlock);
	return (0);
}

/* Remove poolname from the list of muted pools. */
static int
zev_ioc_unmute_pool(char *poolname)
{
	zev_pool_list_entry_t *pe, *peprev;

	rw_enter(&zev_pool_list_rwlock, RW_WRITER);
	/* pool muted? 
*/ peprev = NULL; for (pe=zev_muted_pools_head; pe; pe=pe->next) { if (!strcmp(pe->name, poolname)) break; peprev = pe; } if (pe) { rw_exit(&zev_pool_list_rwlock); return ENOENT; } if (peprev != NULL) { peprev->next = pe->next; } else { zev_muted_pools_head = pe->next; } zev_free(pe, sizeof(*pe)); rw_exit(&zev_pool_list_rwlock); return (0); } int zev_skip_pool(objset_t *os) { zev_pool_list_entry_t *pe; dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; rw_enter(&zev_pool_list_rwlock, RW_READER); for (pe=zev_muted_pools_head; pe; pe=pe->next) { if (!strcmp(pe->name, dp->dp_spa->spa_name)) { rw_exit(&zev_pool_list_rwlock); return 1; } } rw_exit(&zev_pool_list_rwlock); return 0; } static void zev_update_statistics(int op, zev_statistics_t *stat) { switch (op) { case ZEV_OP_ERROR: stat->zev_cnt_errors++; break; case ZEV_OP_MARK: stat->zev_cnt_marks++; break; case ZEV_OP_ZFS_MOUNT: stat->zev_cnt_zfs_mount++; break; case ZEV_OP_ZFS_UMOUNT: stat->zev_cnt_zfs_umount++; break; case ZEV_OP_ZVOL_WRITE: stat->zev_cnt_zvol_write++; break; case ZEV_OP_ZVOL_TRUNCATE: stat->zev_cnt_zvol_truncate++; break; case ZEV_OP_ZNODE_CLOSE_AFTER_UPDATE: stat->zev_cnt_znode_close_after_update++; break; case ZEV_OP_ZNODE_CREATE: stat->zev_cnt_znode_create++; break; case ZEV_OP_ZNODE_REMOVE: stat->zev_cnt_znode_remove++; break; case ZEV_OP_ZNODE_LINK: stat->zev_cnt_znode_link++; break; case ZEV_OP_ZNODE_SYMLINK: stat->zev_cnt_znode_symlink++; break; case ZEV_OP_ZNODE_RENAME: stat->zev_cnt_znode_rename++; break; case ZEV_OP_ZNODE_WRITE: stat->zev_cnt_znode_write++; break; case ZEV_OP_ZNODE_TRUNCATE: stat->zev_cnt_znode_truncate++; break; case ZEV_OP_ZNODE_SETATTR: stat->zev_cnt_znode_setattr++; break; case ZEV_OP_ZNODE_ACL: stat->zev_cnt_znode_acl++; break; } } void zev_queue_message(int op, zev_msg_t *msg) { zev_queue_t *q; int wakeup = 0; zev_msg_t *m; int i; msg->next = NULL; msg->prev = NULL; msg->read = 0; if (op < ZEV_OP_MIN || op > ZEV_OP_MAX) { zev_queue_error(op, "unknown op id 
encountered: %d", op); zev_free(msg, sizeof(*msg) + msg->size); return; } /* * This mutex protects us agains race conditions when several * threads want to queue a message and one or more queues are * full: we release zev_mutex to wait for the queues to become * less-than-full, but we don't know in which order the waiting * threads will be awoken. If it's not the same order in which * they went to sleep we might mark different messages as "newest" * in different queues, and so we might have dupes or even * skip messages. */ mutex_enter(&zev_queue_msg_mutex); mutex_enter(&zev_mutex); /* * When the module is loaded, the default behavior ist to * put all events into a queue and block if the queue is full. * This is done even before the pseudo device is attached. * This way, no events are lost. * * To discard events entirely the "beaver" queue, * which never discards anything, has to be removed. */ if (zev_queue_cnt == 0) { mutex_exit(&zev_mutex); mutex_exit(&zev_queue_msg_mutex); return; } /* put message into global queue */ msg->seq = zev_msg_sequence_number++; while (zev_statistics.zev_max_queue_len && zev_statistics.zev_queue_len >= zev_statistics.zev_max_queue_len) { /* queue full. block until it's been shrunk. */ cv_wait(&zev_condvar, &zev_mutex); } if (zev_queue_tail == NULL) { zev_queue_head = zev_queue_tail = msg; } else { zev_queue_tail->next = msg; msg->prev = zev_queue_tail; zev_queue_tail = msg; } zev_queue_len++; zev_statistics.zev_cnt_total_events++; zev_statistics.zev_queue_len += msg->size; /* update per-device queues */ for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) { q = zev_queues[i - ZEV_MINOR_MIN]; if (!q) continue; zev_queue_hold(q); /* make sure queue has enough room */ while (q->zq_max_queue_len && q->zq_queue_len > q->zq_max_queue_len) { if (q->zq_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) { /* block until queue has been shrunk. 
*/ cv_wait(&zev_condvar, &zev_mutex); } else { /* discard msgs until queue is small enough */ while (q->zq_queue_len > q->zq_max_queue_len) { m = q->zq_oldest; if (m == NULL) break; q->zq_events_discarded++; q->zq_bytes_discarded += m->size; q->zq_oldest = m->next; q->zq_queue_len -= m->size; q->zq_queue_messages--; } } } /* register new message at the end of the queue */ q->zq_queue_len += msg->size; q->zq_queue_messages++; q->zq_bytes_total += msg->size; q->zq_events_total++; if (q->zq_oldest == NULL) q->zq_oldest = msg; zev_update_statistics(op, &q->zq_statistics); if (q->zq_queue_len > q->zq_wakeup_threshold) wakeup = 1; if (q->zq_queue_len == msg->size) /* queue was empty */ cv_broadcast(&q->zq_condvar); zev_queue_release(q); } zev_queue_trim(); zev_update_statistics(op, &zev_statistics); mutex_exit(&zev_mutex); mutex_exit(&zev_queue_msg_mutex); /* one or more queues need a pollwakeup() */ if (op == ZEV_OP_MARK) { zev_poll_wakeup(B_TRUE); } else if (wakeup) { zev_poll_wakeup(B_FALSE); } return; } void zev_queue_error(int op, char *fmt, ...) 
{ char buf[ZEV_MAX_MESSAGE_LEN]; va_list ap; int len; zev_msg_t *msg = NULL; zev_error_t *rec; int msg_size; va_start(ap, fmt); len = vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); if (len >= sizeof(buf)) { cmn_err(CE_WARN, "zev: can't report error - " "dropping event entirely."); return; } msg_size = sizeof(*rec) + len + 1; msg = zev_alloc(sizeof(*msg) + msg_size); msg->size = msg_size; rec = (zev_error_t *)(msg + 1); rec->record_len = msg_size; rec->op = ZEV_OP_ERROR; rec->op_time = ddi_get_time(); rec->guid = 0; rec->failed_op = op; rec->errstr_len = len; (void) memcpy(ZEV_ERRSTR(rec), buf, len + 1); zev_queue_message(ZEV_OP_ERROR, msg); return; } static int zev_find_queue(zev_queue_t **out, zev_queue_t *req_q, zev_queue_name_t *name) { char namebuf[ZEV_MAX_QUEUE_NAME_LEN+1]; zev_queue_t *q; int i; *out = NULL; if (name->zev_namelen == 0) { if (req_q->zq_minor_number == ZEV_CONTROL_DEVICE_MINOR) return EINVAL; zev_queue_hold(req_q); *out = req_q; return 0; } if (name->zev_namelen > ZEV_MAX_QUEUE_NAME_LEN) return EINVAL; strncpy(namebuf, name->zev_name, name->zev_namelen); namebuf[name->zev_namelen] = '\0'; mutex_enter(&zev_mutex); for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) { q = zev_queues[i - ZEV_MINOR_MIN]; if (!q) continue; if (!strcmp(q->zq_name, namebuf)) { zev_queue_hold(q); mutex_exit(&zev_mutex); *out = q; return 0; } } mutex_exit(&zev_mutex); return ENOENT; } static int zev_ioc_get_queue_statistics(zev_queue_t *req_q, intptr_t arg, int mode) { zev_ioctl_get_queue_statistics_t gs; zev_queue_t *q; int ret; if (ddi_copyin((void *)arg, &gs, sizeof(gs), mode) != 0) return EFAULT; ret = zev_find_queue(&q, req_q, &gs.zev_queue_name); if (ret) return ret; /* ddi_copyout() can take a long time. Better make a copy to be able to release the mutex faster. 
*/
	mutex_enter(&zev_mutex);
	memcpy(&gs.zev_statistics, &q->zq_statistics,sizeof(gs.zev_statistics));
	/* these values are kept in the queue itself, not in zq_statistics */
	gs.zev_statistics.zev_queue_len = q->zq_queue_len;
	gs.zev_statistics.zev_bytes_read = q->zq_bytes_read;
	gs.zev_statistics.zev_bytes_discarded = q->zq_bytes_discarded;
	gs.zev_statistics.zev_max_queue_len = q->zq_max_queue_len;
	gs.zev_statistics.zev_cnt_discarded_events = q->zq_events_discarded;
	gs.zev_statistics.zev_cnt_total_events = q->zq_events_total;
	/* drops the reference taken by zev_find_queue() */
	zev_queue_release(q);
	mutex_exit(&zev_mutex);

	if (ddi_copyout(&gs, (void *)arg, sizeof(gs), mode) != 0)
		return EFAULT;
	return 0;
}

/*
 * ioctl helper: set flags, maximum queue length and poll wakeup
 * threshold of the queue named in the request structure at arg.
 * Returns 0, EFAULT, EINVAL or an error from zev_find_queue().
 */
static int
zev_ioc_set_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode)
{
	zev_ioctl_set_queue_properties_t qp;
	zev_queue_t *q;
	uint64_t old_max;
	uint64_t old_flags;
	int ret;

	if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0)
		return EFAULT;
	if (qp.zev_max_queue_len > ZEV_MAX_QUEUE_LEN)
		return EINVAL;
	if (qp.zev_poll_wakeup_threshold > ZEV_MAX_POLL_WAKEUP_QUEUE_LEN)
		return EINVAL;

	/* takes a reference on q */
	ret = zev_find_queue(&q, req_q, &qp.zev_queue_name);
	if (ret)
		return ret;

	mutex_enter(&zev_mutex);

	/*
	 * Note: if the PERSISTENT flag is cleared, and the queue is not busy,
	 * the queue should be removed by zev_queue_release() in zev_ioctl().
	 
*/ old_flags = qp.zev_flags; q->zq_flags = qp.zev_flags; if ((old_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL) && (!(qp.zev_flags & ZEV_FL_BLOCK_WHILE_QUEUE_FULL))) { /* queue is no longer blocking - wake blocked threads */ cv_broadcast(&zev_condvar); } old_max = q->zq_max_queue_len; q->zq_max_queue_len = qp.zev_max_queue_len; if (q->zq_max_queue_len < old_max) zev_queue_trim(); if (q->zq_max_queue_len > old_max) cv_broadcast(&zev_condvar); /* threads may be waiting */ if ((qp.zev_poll_wakeup_threshold < q->zq_wakeup_threshold) && (qp.zev_poll_wakeup_threshold <= q->zq_queue_len)) pollwakeup(&q->zq_pollhead, POLLIN); q->zq_wakeup_threshold = qp.zev_poll_wakeup_threshold; zev_queue_release(q); mutex_exit(&zev_mutex); return 0; } static int zev_ioc_get_queue_properties(zev_queue_t *req_q, intptr_t arg, int mode) { zev_ioctl_get_queue_properties_t qp; zev_queue_t *q; int ret; if (ddi_copyin((void *)arg, &qp, sizeof(qp), mode) != 0) return EFAULT; ret = zev_find_queue(&q, req_q, &qp.zev_queue_name); if (ret) return ret; mutex_enter(&zev_mutex); qp.zev_max_queue_len = q->zq_max_queue_len; qp.zev_flags = q->zq_flags; qp.zev_poll_wakeup_threshold = q->zq_wakeup_threshold; zev_queue_release(q); mutex_exit(&zev_mutex); if (ddi_copyout(&qp, (void *)arg, sizeof(qp), mode) != 0) return EFAULT; return 0; } static int zev_ioc_add_queue(zev_queue_t *req_q, intptr_t arg, int mode) { zev_ioctl_add_queue_t aq; zev_queue_t *new_q; char name[ZEV_MAX_QUEUE_NAME_LEN+1]; if (ddi_copyin((void *)arg, &aq, sizeof(aq), mode) != 0) return EFAULT; if (aq.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN) return EINVAL; strncpy(name, aq.zev_name, aq.zev_namelen); name[aq.zev_namelen] = '\0'; return zev_queue_new(&new_q, req_q->zq_dip, name, aq.zev_max_queue_len, aq.zev_flags); } static int zev_ioc_remove_queue(zev_queue_t *req_q, intptr_t arg, int mode) { zev_ioctl_remove_queue_t rq; zev_queue_t *q; char name[ZEV_MAX_QUEUE_NAME_LEN+1]; int found = 0; int i; if (ddi_copyin((void *)arg, &rq, sizeof(rq), mode) != 0) 
return EFAULT; if (rq.zev_queue_name.zev_namelen > ZEV_MAX_QUEUE_NAME_LEN) return EINVAL; strncpy(name, rq.zev_queue_name.zev_name, rq.zev_queue_name.zev_namelen); name[rq.zev_queue_name.zev_namelen] = '\0'; mutex_enter(&zev_mutex); for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) { q = zev_queues[i - ZEV_MINOR_MIN]; if (!q) continue; if (!strcmp(q->zq_name, name)) { found = 1; break; } } if (!found) { mutex_exit(&zev_mutex); return ENOENT; } if (q->zq_busy) { mutex_exit(&zev_mutex); return EBUSY; } /* * clear flags, so that persistent queues are removed aswell * and the queue becomes non-blocking. */ q->zq_flags = 0; if (q->zq_to_be_removed == B_FALSE) { q->zq_to_be_removed = B_TRUE; zev_queue_release(q); } /* some threads might be waiting for this queue to become writable */ cv_broadcast(&zev_condvar); mutex_exit(&zev_mutex); return 0; } static int zev_ioc_get_debug_info(zev_queue_t *req_q, intptr_t arg, int mode) { zev_ioctl_debug_info_t di; uint64_t mem_allocated = atomic_add_64_nv(&zev_memory_allocated, 0); uint64_t mem_freed = atomic_add_64_nv(&zev_memory_freed, 0); zev_chksum_stats(&di.zev_chksum_cache_size, &di.zev_chksum_cache_hits, &di.zev_chksum_cache_misses); di.zev_memory_allocated = mem_allocated - mem_freed; if (ddi_copyout(&di, (void *)arg, sizeof(di), mode) != 0) return EFAULT; return 0; } static int zev_ioc_get_queue_list(zev_queue_t *req_q, intptr_t arg, int mode) { zev_ioctl_get_queue_list_t gql; zev_queue_t *q; int i = 0; int count = 0; memset(&gql, 0, sizeof(gql)); mutex_enter(&zev_mutex); for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) { q = zev_queues[i - ZEV_MINOR_MIN]; if (!q) continue; strncpy(gql.zev_queue_name[count].zev_name, q->zq_name, ZEV_MAX_QUEUE_NAME_LEN); gql.zev_queue_name[count].zev_namelen = strlen(q->zq_name); count++; } gql.zev_n_queues = count; mutex_exit(&zev_mutex); if (ddi_copyout(&gql, (void *)arg, sizeof(gql), mode) != 0) return EFAULT; return 0; } /* ARGSUSED */ static int zev_ioctl(dev_t dev, int cmd, intptr_t arg, 
int mode, cred_t *credp, int *rvalp)
{
	/*
	 * ioctl entry point.  Looks up the queue belonging to the minor
	 * the ioctl was issued on, holds it for the duration of the call
	 * and dispatches on cmd.  Returns 0 or an errno value; ENXIO if
	 * the minor has no soft state, ENOTTY for unknown commands.
	 */
	zev_statistics_t zs;
	zev_ioctl_poolarg_t pa;
	zev_ioctl_mark_t mark;
	zev_mark_t *rec;
	int msg_size;
	zev_msg_t *msg;
	uint64_t len;
	uint64_t mark_id;
	minor_t minor;
	zev_queue_t *req_q;
	int ret = 0;

	minor = getminor(dev);
	mutex_enter(&zev_mutex);
	if ((req_q = ddi_get_soft_state(statep, minor)) == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	/* keep the queue alive until the release at the end */
	zev_queue_hold(req_q);
	mutex_exit(&zev_mutex);
	/*
	 * all structures passed between kernel and userspace
	 * are now compatible between 64 and 32 bit.  Model
	 * conversion can be ignored.
	 */
	switch (cmd) {
	case ZEV_IOC_GET_GLOBAL_STATISTICS:
		/* ddi_copyout() can take a long time.  Better make
		   a copy to be able to release the mutex faster. */
		mutex_enter(&zev_mutex);
		(void) memcpy(&zs, &zev_statistics, sizeof(zs));
		mutex_exit(&zev_mutex);
		if (ddi_copyout(&zs, (void *)arg, sizeof(zs), mode) != 0)
			ret = EFAULT;
		break;
	case ZEV_IOC_GET_QUEUE_STATISTICS:
		ret = zev_ioc_get_queue_statistics(req_q, arg, mode);
		break;
	case ZEV_IOC_MUTE_POOL:
	case ZEV_IOC_UNMUTE_POOL:
		if (ddi_copyin((void *)arg, &pa, sizeof(pa), mode) != 0) {
			ret = EFAULT;
			break;
		}
		if (pa.zev_poolname_len >=MAXPATHLEN) {
			ret = EINVAL;
			break;
		}
		/* the name comes with an explicit length; terminate it */
		pa.zev_poolname[pa.zev_poolname_len] = '\0';
		if (cmd == ZEV_IOC_MUTE_POOL) {
			ret = zev_ioc_mute_pool(pa.zev_poolname);
		} else {
			ret = zev_ioc_unmute_pool(pa.zev_poolname);
		}
		break;
	case ZEV_IOC_SET_MAX_QUEUE_LEN:
		if (ddi_copyin((void *)arg, &len, sizeof(len), mode) != 0) {
			ret = EFAULT;
			break;
		}
		if (len > ZEV_MAX_QUEUE_LEN) {
			ret = EINVAL;
			break;
		}
		mutex_enter(&zev_mutex);
		zev_statistics.zev_max_queue_len = len;
		/* producers may be waiting for the global limit to grow */
		cv_broadcast(&zev_condvar);
		mutex_exit(&zev_mutex);
		break;
	case ZEV_IOC_GET_QUEUE_PROPERTIES:
		ret = zev_ioc_get_queue_properties(req_q, arg, mode);
		break;
	case ZEV_IOC_SET_QUEUE_PROPERTIES:
		ret = zev_ioc_set_queue_properties(req_q, arg, mode);
		break;
	case ZEV_IOC_MARK:
		if (ddi_copyin((void *)arg, &mark, sizeof(mark), mode) != 0) {
			ret = EFAULT;
			break;
		}
		/* prepare message */
		/*
		 * NOTE(review): zev_payload_len comes from userland and is
		 * not range-checked here; confirm that its type bounds it
		 * so that msg_size (an int) cannot overflow.
		 */
		msg_size = sizeof(*rec) + mark.zev_payload_len + 1;
		msg = zev_alloc(sizeof(*msg) + msg_size);
		msg->size = msg_size;
		rec = (zev_mark_t *)(msg + 1);
		rec->record_len = msg_size;
		rec->op = ZEV_OP_MARK;
		rec->op_time = ddi_get_time();
		rec->guid = mark.zev_guid;
		rec->payload_len = mark.zev_payload_len;
		/* get payload */
		/* the payload follows the mark structure in user memory */
		if (ddi_copyin(((char *)arg) + sizeof(mark),
		               ZEV_PAYLOAD(rec),
		               mark.zev_payload_len, mode) != 0) {
			/*
			 * Free with the size that was allocated above.  This
			 * used to pass msg_size only, leaving out the
			 * zev_msg_t header.
			 */
			zev_free(msg, sizeof(*msg) + msg_size);
			ret = EFAULT;
			break;
		}
		*(ZEV_PAYLOAD(rec) + mark.zev_payload_len) = '\0';
		/* get mark id and queue message */
		mutex_enter(&zev_mark_id_mutex);
		mark_id = zev_mark_id++;
		mutex_exit(&zev_mark_id_mutex);
		rec->mark_id = mark_id;
		/* zev_queue_message() takes over ownership of msg */
		zev_queue_message(ZEV_OP_MARK, msg);
		/* report mark id to userland, ignore errors */
		mark.zev_mark_id = mark_id;
		(void) ddi_copyout(&mark, (void *)arg, sizeof(mark), mode);
		break;
	case ZEV_IOC_ADD_QUEUE:
		/* queues can only be managed through the control device */
		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
			ret = EACCES;
			break;
		}
		ret = zev_ioc_add_queue(req_q, arg, mode);
		break;
	case ZEV_IOC_REMOVE_QUEUE:
		if (minor != ZEV_CONTROL_DEVICE_MINOR) {
			ret = EACCES;
			break;
		}
		ret = zev_ioc_remove_queue(req_q, arg, mode);
		break;
	case ZEV_IOC_GET_DEBUG_INFO:
		ret = zev_ioc_get_debug_info(req_q, arg, mode);
		break;
	case ZEV_IOC_GET_QUEUE_LIST:
		ret = zev_ioc_get_queue_list(req_q, arg, mode);
		break;
	case ZEV_IOC_GET_FILE_SIGNATURES:
		ret = zev_ioc_get_signatures(arg, mode);
		break;
	default:
		/* generic "ioctl unknown" error */
		ret = ENOTTY;
	}

	mutex_enter(&zev_mutex);
	zev_queue_release(req_q);
	mutex_exit(&zev_mutex);
	if (ret)
		SET_ERROR(ret);
	return (ret);
}

/*
 * chpoll entry point.  Reports POLLIN when the queue has a pending
 * message; otherwise hands out the queue's pollhead unless anyyet is set.
 * Not supported on the control device.
 */
static int
zev_chpoll(dev_t dev, short events, int anyyet,
    short *reventsp, struct pollhead **phpp)
{
	int minor;
	short revent = 0;
	zev_queue_t *q;

	/* use minor-specific queue context and its pollhead */
	minor = getminor(dev);
	if (minor == ZEV_CONTROL_DEVICE_MINOR)
		return (EINVAL);
	mutex_enter(&zev_mutex);
	if ((q = ddi_get_soft_state(statep, minor)) == NULL) {
		mutex_exit(&zev_mutex);
		return (ENXIO);
	}
	revent = 0;
	if ((events & POLLIN)) {
		if (q->zq_oldest)
revent |= POLLIN; } if (revent == 0) { if (!anyyet) { *phpp = &q->zq_pollhead; } } *reventsp = revent; mutex_exit(&zev_mutex); return (0); } /* ARGSUSED */ static int zev_read(dev_t dev, struct uio *uio_p, cred_t *crep_p) { minor_t minor; offset_t off; int ret = 0; zev_msg_t *msg; char *data; zev_queue_t *q; minor = getminor(dev); if (minor == ZEV_CONTROL_DEVICE_MINOR) return (EINVAL); mutex_enter(&zev_mutex); q = ddi_get_soft_state(statep, minor); if (q == NULL) { mutex_exit(&zev_mutex); return (ENXIO); } off = uio_p->uio_loffset; msg = q->zq_oldest; while (msg == NULL) { if (!ddi_can_receive_sig()) { /* * read() shouldn't block because this thread * can't receive signals. (e.g., it might be * torn down by exit() right now.) */ mutex_exit(&zev_mutex); return 0; } if (cv_wait_sig(&q->zq_condvar, &zev_mutex) == 0) { /* signal received. */ mutex_exit(&zev_mutex); return EINTR; } msg = q->zq_oldest; } if (msg->size > uio_p->uio_resid) { mutex_exit(&zev_mutex); return E2BIG; } while (msg && uio_p->uio_resid >= msg->size) { data = (char *)(msg + 1); ret = uiomove(data, msg->size, UIO_READ, uio_p); if (ret != 0) { mutex_exit(&zev_mutex); cmn_err(CE_WARN, "zev: uiomove failed; messages lost"); uio_p->uio_loffset = off; return (ret); } q->zq_oldest = msg->next; q->zq_bytes_read += msg->size; q->zq_queue_len -= msg->size; q->zq_queue_messages--; msg->read++; msg = q->zq_oldest; } cv_broadcast(&zev_condvar); mutex_exit(&zev_mutex); uio_p->uio_loffset = off; return 0; } /* ARGSUSED */ static int zev_close(dev_t dev, int flag, int otyp, cred_t *crepd) { zev_queue_t *q; int minor; minor = getminor(dev); if (otyp != OTYP_CHR) return (EINVAL); mutex_enter(&zev_mutex); if ((q = ddi_get_soft_state(statep, minor)) == NULL) { mutex_exit(&zev_mutex); return (ENXIO); } if (q->zq_busy != B_TRUE) { mutex_exit(&zev_mutex); return (EINVAL); } q->zq_busy = B_FALSE; if ((q->zq_flags & ZEV_FL_PERSISTENT) == 0) zev_queue_release(q); mutex_exit(&zev_mutex); return (0); } /* ARGSUSED */ static 
int zev_open(dev_t *devp, int flag, int otyp, cred_t *credp) { zev_queue_t *q; minor_t minor; minor = getminor(*devp); if (otyp != OTYP_CHR) return (EINVAL); if (drv_priv(credp) != 0) return (EPERM); mutex_enter(&zev_mutex); if ((q = ddi_get_soft_state(statep, minor)) == NULL) { mutex_exit(&zev_mutex); return (ENXIO); } if (minor == ZEV_CONTROL_DEVICE_MINOR) { /* control device may be used in parallel */ q->zq_busy = B_TRUE; mutex_exit(&zev_mutex); return 0; } if (q->zq_busy == B_TRUE) { mutex_exit(&zev_mutex); return (EBUSY); } q->zq_busy = B_TRUE; /* can only be opened exclusively */ mutex_exit(&zev_mutex); return (0); } static struct cb_ops zev_cb_ops = { zev_open, /* open */ zev_close, /* close */ nodev, /* strategy */ nodev, /* print */ nodev, /* dump */ zev_read, /* read */ nodev, /* write */ zev_ioctl, /* ioctl */ nodev, /* devmap */ nodev, /* mmap */ nodev, /* segmap */ zev_chpoll, /* chpoll */ ddi_prop_op, /* prop_op */ NULL, /* streamtab */ D_MP | D_64BIT, /* cb_flag */ CB_REV, /* cb_rev */ nodev, /* aread */ nodev, /* awrite */ }; static void zev_free_instance(dev_info_t *dip) { int instance; zev_queue_t *q; int i; instance = ddi_get_instance(dip); if (instance != 0) { cmn_err(CE_WARN, "zev: tried to free instance != 0 (%d)", instance); return; } ddi_remove_minor_node(dip, NULL); /* stop pollwakeup thread */ zev_wakeup_thread_run = 0; if (zev_poll_wakeup_thread != NULL) { thread_join(zev_poll_wakeup_thread->t_did); zev_poll_wakeup_thread = NULL; } mutex_enter(&zev_mutex); /* remove "ctrl" dummy queue */ q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR); if (q) { ddi_soft_state_free(statep, ZEV_CONTROL_DEVICE_MINOR); ZEV_MEM_SUB(sizeof(zev_queue_t)); } /* remove all other queues */ for (i = ZEV_MINOR_MIN; i <= ZEV_MINOR_MAX; i++) { q = zev_queues[i- ZEV_MINOR_MIN]; if (!q) continue; ASSERT(q->zq_refcnt == 1); zev_queue_release(q); } zev_queue_trim(); bzero(&zev_queues, sizeof(zev_queues)); mutex_exit(&zev_mutex); } static int zev_detach(dev_info_t 
*dip, ddi_detach_cmd_t cmd) { int instance; zev_queue_t *q; /* called once per instance with DDI_DETACH, may be called to suspend */ switch (cmd) { case DDI_DETACH: /* instance busy? */ instance = ddi_get_instance(dip); if (instance != 0) { /* hardcoded in zev.conf */ /* this module only supports one instance. */ return (DDI_FAILURE); } mutex_enter(&zev_mutex); if (!zev_attached) { mutex_exit(&zev_mutex); return (DDI_FAILURE); } /* check "ctrl" queue to see if t is busy */ q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR); if (q == NULL) { mutex_exit(&zev_mutex); return (DDI_FAILURE); } if (q->zq_busy) { mutex_exit(&zev_mutex); return (DDI_FAILURE); } /* are there any queues? */ if (zev_queue_cnt > 0) { mutex_exit(&zev_mutex); return (DDI_FAILURE); } zev_attached = B_FALSE; mutex_exit(&zev_mutex); /* switch ZFS event callbacks back to default */ rw_enter(&rz_zev_rwlock, RW_WRITER); rz_zev_callbacks = rz_zev_default_callbacks; rz_zev_set_active(B_FALSE); rw_exit(&rz_zev_rwlock); /* no thread is inside of the callbacks anymore. */ /* free resources allocated for this instance */ zev_free_instance(dip); zev_chksum_fini(); #if 0 cmn_err(CE_WARN, "zev: allocated memory at detach: %" PRIu64, zev_memory_allocated - zev_memory_freed); #endif return (DDI_SUCCESS); case DDI_SUSPEND: /* kernel must not suspend zev devices while ZFS is running */ return (DDI_FAILURE); default: return (DDI_FAILURE); } } static int zev_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { /* called once per instance with DDI_ATTACH, may be called to resume */ int instance; int error; zev_queue_t *q; switch (cmd) { case DDI_ATTACH: /* create instance state */ instance = ddi_get_instance(dip); if (instance != 0) { /* hardcoded in zev.conf */ /* this module only supports one instance. 
*/ return (DDI_FAILURE); } mutex_enter(&zev_mutex); if (zev_attached) { mutex_exit(&zev_mutex); return (DDI_FAILURE); } if (ddi_soft_state_zalloc(statep, ZEV_CONTROL_DEVICE_MINOR) != DDI_SUCCESS) { mutex_exit(&zev_mutex); return (DDI_FAILURE); } ZEV_MEM_ADD(sizeof(zev_queue_t)); zev_attached = B_TRUE; /* init queue list */ bzero(&zev_queues, sizeof(zev_queues)); mutex_exit(&zev_mutex); /* create a dummy queue for management of "ctrl" */ q = ddi_get_soft_state(statep, ZEV_CONTROL_DEVICE_MINOR); q->zq_dip = dip; q->zq_refcnt = 1; q->zq_busy = B_FALSE; q->zq_minor_number = ZEV_CONTROL_DEVICE_MINOR; q->zq_flags = ZEV_FL_PERSISTENT; strcpy(q->zq_name, ZEV_CONTROL_DEVICE_NAME); /* create device node for "ctrl" */ if (ddi_create_minor_node(dip, ZEV_CONTROL_DEVICE_NAME, S_IFCHR, ZEV_CONTROL_DEVICE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE) { goto fail; } /* note: intentionally not adding ctrl queue to queue list. */ /* default queue */ error = zev_queue_new(&q, dip, ZEV_DEFAULT_QUEUE_NAME, ZEV_MAX_QUEUE_LEN, ZEV_FL_BLOCK_WHILE_QUEUE_FULL| ZEV_FL_PERSISTENT); if (error) goto fail; /* start pollwakeup thread */ zev_wakeup_thread_run = 1; zev_poll_wakeup_thread = thread_create(NULL, 0, zev_poll_wakeup_thread_main, NULL, 0, &p0, TS_RUN, minclsyspri); ddi_report_dev(dip); zev_chksum_init(); /* switch ZFS event callbacks to zev module callbacks */ rw_enter(&rz_zev_rwlock, RW_WRITER); rz_zev_callbacks = &zev_callbacks; rz_zev_set_active(B_TRUE); rw_exit(&rz_zev_rwlock); return (DDI_SUCCESS); case DDI_RESUME: /* suspendeding zev devices should never happen */ return (DDI_SUCCESS); default: return (DDI_FAILURE); } fail: cmn_err(CE_WARN, "zev: attach failed"); zev_free_instance(dip); mutex_enter(&zev_mutex); zev_attached = B_FALSE; mutex_exit(&zev_mutex); return (DDI_FAILURE); } /* ARGSUSED */ static int zev_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp) { minor_t minor; zev_queue_t *q; /* arg is dev_t */ minor = getminor((dev_t)arg); mutex_enter(&zev_mutex); 
q = ddi_get_soft_state(statep, minor); if (q == NULL) { *resultp = NULL; mutex_exit(&zev_mutex); return (DDI_FAILURE); } switch (infocmd) { case DDI_INFO_DEVT2DEVINFO: *resultp = q->zq_dip; break; case DDI_INFO_DEVT2INSTANCE: *resultp = (void *)(uintptr_t)ddi_get_instance(q->zq_dip); break; default: mutex_exit(&zev_mutex); return (DDI_FAILURE); } mutex_exit(&zev_mutex); return (DDI_SUCCESS); } static struct dev_ops zev_dev_ops = { DEVO_REV, /* driver build revision */ 0, /* driver reference count */ zev_getinfo, /* getinfo */ nulldev, /* identify (obsolete) */ nulldev, /* probe (search for devices) */ zev_attach, /* attach */ zev_detach, /* detach */ nodev, /* reset (obsolete, use quiesce) */ &zev_cb_ops, /* character and block device ops */ NULL, /* bus driver ops */ NULL, /* power management, not needed */ ddi_quiesce_not_needed, /* quiesce */ }; static struct modldrv zev_modldrv = { &mod_driverops, /* all loadable modules use this */ "zev ZFS event provider, v1.0", /* driver name and version info */ &zev_dev_ops /* ops method pointers */ }; static struct modlinkage zev_modlinkage = { MODREV_1, /* fixed value */ { &zev_modldrv, /* driver linkage structure */ NULL /* list terminator */ } }; int _init(void) { int error; if ((error = ddi_soft_state_init(&statep, sizeof(zev_queue_t), 1)) != 0) return (error); zev_attached = B_FALSE; zev_queue_head = NULL; zev_queue_tail = NULL; zev_queue_len = 0; zev_muted_pools_head = NULL; zev_memory_allocated = 0; zev_memory_freed = 0; zev_queue_cnt = 0; mutex_init(&zev_mutex, NULL, MUTEX_DRIVER, NULL); cv_init(&zev_condvar, NULL, CV_DRIVER, NULL); rw_init(&zev_pool_list_rwlock, NULL, RW_DRIVER, NULL); mutex_init(&zev_mark_id_mutex, NULL, MUTEX_DRIVER, NULL); zev_mark_id = gethrtime(); mutex_init(&zev_queue_msg_mutex, NULL, MUTEX_DRIVER, NULL); zev_msg_sequence_number = gethrtime(); bzero(&zev_statistics, sizeof(zev_statistics)); bzero(&zev_pollhead, sizeof(zev_pollhead)); bzero(&zev_queues, sizeof(zev_queues)); 
zev_statistics.zev_max_queue_len = ZEV_MAX_QUEUE_LEN; if (zev_ioc_mute_pool("zg0")) { cmn_err(CE_WARN, "zev: could not init mute list"); goto FAIL; } if ((error = mod_install(&zev_modlinkage)) != 0) { cmn_err(CE_WARN, "zev: could not install module"); goto FAIL; } return (0); FAIL: /* free resources */ cmn_err(CE_WARN, "zev: _init failed"); mutex_destroy(&zev_mutex); ddi_soft_state_fini(&statep); return (error); } int _info(struct modinfo *modinfop) { return (mod_info(&zev_modlinkage, modinfop)); } int _fini(void) { int error = 0; zev_msg_t *msg; zev_pool_list_entry_t *pe, *npe; mutex_enter(&zev_mutex); if (zev_attached == B_TRUE) { mutex_exit(&zev_mutex); return (SET_ERROR(EBUSY)); } if (zev_queue_cnt != 0) { /* should never happen */ mutex_exit(&zev_mutex); return (SET_ERROR(EBUSY)); } /* * avoid deadlock if event list is full: make sure threads currently * blocking on the event list can append their event and then release * rz_zev_rwlock. Since there should be no queues left when we * reach this point we can simply empty the event list and then * wake everybody. */ while (zev_queue_head) { msg = zev_queue_head; zev_queue_head = msg->next; zev_free(msg, sizeof(*msg) + msg->size); } cv_broadcast(&zev_condvar); mutex_exit(&zev_mutex); /* switch ZFS event callbacks back to default (again) */ rw_enter(&rz_zev_rwlock, RW_WRITER); rz_zev_callbacks = rz_zev_default_callbacks; rz_zev_set_active(B_FALSE); rw_exit(&rz_zev_rwlock); /* no thread is inside of the callbacks anymore. Safe to remove. 
*/ /* unload module callbacks */ if ((error = mod_remove(&zev_modlinkage)) != 0) { cmn_err(CE_WARN, "mod_remove failed: %d", error); return (error); } /* free resources */ mutex_enter(&zev_mutex); while (zev_queue_head) { msg = zev_queue_head; zev_queue_head = msg->next; zev_free(msg, sizeof(*msg) + msg->size); } mutex_exit(&zev_mutex); rw_enter(&zev_pool_list_rwlock, RW_WRITER); pe = zev_muted_pools_head; while (pe) { npe = pe; pe = pe->next; zev_free(npe, sizeof(*npe)); } rw_exit(&zev_pool_list_rwlock); ddi_soft_state_fini(&statep); rw_destroy(&zev_pool_list_rwlock); cv_destroy(&zev_condvar); mutex_destroy(&zev_mutex); mutex_destroy(&zev_mark_id_mutex); mutex_destroy(&zev_queue_msg_mutex); return (0); }