/*
 * NOTE: the original include list lost its header names; the list below
 * is reconstructed from what this file actually uses -- adjust to the
 * local source tree.
 */
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/sha1.h>
#include <sys/sunddi.h>
#include <sys/dsl_dataset.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zev.h>

typedef struct zev_sig_cache_chksums_t {
    /* start of key */
    uint64_t offset_l1;
    /* end of key */
    avl_node_t avl_node;
    uint8_t sigs[ZEV_L1_SIZE/ZEV_L0_SIZE][SHA1_DIGEST_LENGTH];
} zev_sig_cache_chksums_t;

typedef struct zev_sig_cache_file_t {
    /* start of key */
    uint64_t guid;
    uint64_t ino;
    uint64_t gen;
    /* end of key */
    uint32_t refcnt;
    list_node_t lru_node;
    avl_node_t avl_node;
    avl_tree_t chksums;
} zev_sig_cache_file_t;

typedef struct zev_sig_cache_t {
    kmutex_t mutex;
    uint64_t cache_size;
    uint64_t max_cache_size;
    uint64_t hits;
    uint64_t misses;
    list_t lru;
    avl_tree_t files;
} zev_sig_cache_t;

extern offset_t zfs_read_chunk_size;    /* tunable from zfs_vnops.c */

/* SHA-1 of an all-zero l0 block; such blocks get no signature */
static uint8_t all_zero_sig[SHA1_DIGEST_LENGTH] = {
    0x1c, 0xea, 0xf7, 0x3d, 0xf4, 0x0e, 0x53, 0x1d, 0xf3, 0xbf,
    0xb2, 0x6b, 0x4f, 0xb7, 0xcd, 0x95, 0xfb, 0x7b, 0xff, 0x1d
};

/* sentinel marking a cache slot as "signature not known yet" */
static uint8_t unknown_sig[SHA1_DIGEST_LENGTH] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

static zev_sig_cache_t zev_sig_cache;

static int
zev_cache_file_cmp(const void *entry_a, const void *entry_b)
{
    const zev_sig_cache_file_t *a = entry_a;
    const zev_sig_cache_file_t *b = entry_b;

    if (a->guid < b->guid)
        return (-1);
    if (a->guid > b->guid)
        return (1);
    if (a->ino < b->ino)
        return (-1);
    if (a->ino > b->ino)
        return (1);
    if (a->gen < b->gen)
        return (-1);
    if (a->gen > b->gen)
        return (1);
    return (0);
}

static int
zev_chksum_cache_cmp(const void *entry_a, const void *entry_b)
{
    const zev_sig_cache_chksums_t *a = entry_a;
    const zev_sig_cache_chksums_t *b = entry_b;

    if (a->offset_l1 < b->offset_l1)
        return (-1);
    if (a->offset_l1 > b->offset_l1)
        return (1);
    return (0);
}
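/*
 * Offset arithmetic used throughout this file, with a worked example
 * (assuming ZEV_L0_SIZE = 4 KiB and ZEV_L1_SIZE = 1 MiB, i.e. 256 l0
 * blocks per l1 block): for file offset 0x181000,
 *
 *    l1 offset = P2ALIGN(0x181000, ZEV_L1_SIZE)         = 0x100000
 *    l0 slot   = (0x181000 % ZEV_L1_SIZE) / ZEV_L0_SIZE = 129
 *
 * so the cached l0 signature for that offset lives in the chksums
 * entry keyed by offset_l1 == 0x100000, at sigs[129].
 */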
/* must be called with zev_sig_cache.mutex held */
static void
zev_chksum_cache_file_free(zev_sig_cache_file_t *file)
{
    zev_sig_cache_chksums_t *cs;
    void *c = NULL;    /* cookie */

    /* remove from lru list */
    list_remove(&zev_sig_cache.lru, file);
    /* free resources */
    avl_remove(&zev_sig_cache.files, file);
    while ((cs = avl_destroy_nodes(&file->chksums, &c)) != NULL) {
        zev_sig_cache.cache_size -= sizeof (*cs);
        zev_free(cs, sizeof (*cs));
    }
    avl_destroy(&file->chksums);
    zev_free(file, sizeof (*file));
    zev_sig_cache.cache_size -= sizeof (*file);
}

void
zev_chksum_init(void)
{
    memset(&zev_sig_cache, 0, sizeof (zev_sig_cache));
    mutex_init(&zev_sig_cache.mutex, NULL, MUTEX_DRIVER, NULL);
    avl_create(&zev_sig_cache.files, zev_cache_file_cmp,
        sizeof (zev_sig_cache_file_t),
        offsetof(zev_sig_cache_file_t, avl_node));
    list_create(&zev_sig_cache.lru,
        sizeof (zev_sig_cache_file_t),
        offsetof(zev_sig_cache_file_t, lru_node));
    zev_sig_cache.max_cache_size = ZEV_CHKSUM_DEFAULT_CACHE_SIZE;
}

void
zev_chksum_fini(void)
{
    zev_sig_cache_file_t *file;

    /* no concurrent users at fini time; free everything, then the lock */
    while ((file = avl_first(&zev_sig_cache.files)) != NULL)
        zev_chksum_cache_file_free(file);
    list_destroy(&zev_sig_cache.lru);
    avl_destroy(&zev_sig_cache.files);
    mutex_destroy(&zev_sig_cache.mutex);
}

static zev_sig_cache_file_t *
zev_chksum_cache_file_get_and_hold(znode_t *zp)
{
    zev_sig_cache_file_t find_file;
    zev_sig_cache_file_t *file;
    avl_index_t where;

    find_file.guid =
        dsl_dataset_phys(zp->z_zfsvfs->z_os->os_dsl_dataset)->ds_guid;
    find_file.ino = zp->z_id;
    find_file.gen = zp->z_gen;

    mutex_enter(&zev_sig_cache.mutex);
    file = avl_find(&zev_sig_cache.files, &find_file, &where);
    if (!file) {
        file = zev_alloc(sizeof (*file));
        file->guid = find_file.guid;
        file->ino = zp->z_id;
        file->gen = zp->z_gen;
        file->refcnt = 0;
        avl_create(&file->chksums, zev_chksum_cache_cmp,
            sizeof (zev_sig_cache_chksums_t),
            offsetof(zev_sig_cache_chksums_t, avl_node));
        list_insert_head(&zev_sig_cache.lru, file);
        avl_insert(&zev_sig_cache.files, file, where);
        zev_sig_cache.cache_size += sizeof (*file);
    }
    file->refcnt++;
    mutex_exit(&zev_sig_cache.mutex);
    return (file);
}

static void
zev_chksum_cache_file_release(zev_sig_cache_file_t *file)
{
    mutex_enter(&zev_sig_cache.mutex);

    /* We don't invalidate/free/destroy *file.  Cache expiry does that. */
    file->refcnt--;

    /* Move file to front of lru list */
    list_remove(&zev_sig_cache.lru, file);
    list_insert_head(&zev_sig_cache.lru, file);

    mutex_exit(&zev_sig_cache.mutex);
}

static zev_sig_cache_chksums_t *
zev_chksum_cache_get_lv1_entry(zev_sig_cache_file_t *file, uint64_t off_l1)
{
    zev_sig_cache_chksums_t find_chksum;
    zev_sig_cache_chksums_t *cs;
    avl_index_t where;

    mutex_enter(&zev_sig_cache.mutex);

    find_chksum.offset_l1 = off_l1;
    cs = avl_find(&file->chksums, &find_chksum, &where);
    if (!cs) {
        /* zev_zalloc() zeroes the entry: every slot starts as unknown_sig */
        cs = zev_zalloc(sizeof (*cs));
        cs->offset_l1 = off_l1;
        avl_insert(&file->chksums, cs, where);
        zev_sig_cache.cache_size += sizeof (*cs);
    }

    mutex_exit(&zev_sig_cache.mutex);
    return (cs);
}

void
zev_chksum_stats(uint64_t *c_size, uint64_t *c_hits, uint64_t *c_misses)
{
    mutex_enter(&zev_sig_cache.mutex);
    *c_size = zev_sig_cache.cache_size;
    *c_hits = zev_sig_cache.hits;
    *c_misses = zev_sig_cache.misses;
    mutex_exit(&zev_sig_cache.mutex);
}

static void
zev_chksum_cache_invalidate(zev_sig_cache_file_t *file,
                            znode_t *zp,
                            zev_chksum_mode_t mode,
                            uint64_t off,
                            uint64_t len)
{
    zev_sig_cache_chksums_t find_chksum;
    zev_sig_cache_chksums_t *cs;
    int idx;
    uint64_t off_l1;
    uint64_t len_l1;
    uint64_t pos_l0;
    uint64_t pos_l1;

    mutex_enter(&zev_sig_cache.mutex);

    /* start of this megabyte */
    off_l1 = P2ALIGN(off, ZEV_L1_SIZE);

    if (len == 0) {
        /* truncate() to EOF */
        len_l1 = ZEV_L1_SIZE;
    } else {
        /* full megabytes */
        len_l1 = len + (off - off_l1);
        len_l1 = P2ROUNDUP(len_l1, ZEV_L1_SIZE);
    }

    for (pos_l1 = off_l1;
        pos_l1 < (off_l1 + len_l1);
        pos_l1 += ZEV_L1_SIZE) {

        find_chksum.offset_l1 = pos_l1;
        cs = avl_find(&file->chksums, &find_chksum, NULL);
        if (!cs)
            continue;

        for (pos_l0 = MAX(pos_l1, P2ALIGN(off, ZEV_L0_SIZE));
            pos_l0 < (pos_l1 + ZEV_L1_SIZE);
            pos_l0 += ZEV_L0_SIZE) {

            if ((len > 0) && (pos_l0 > (off + len - 1)))
                break;

            idx = (pos_l0 % ZEV_L1_SIZE) / ZEV_L0_SIZE;
            memcpy(cs->sigs[idx], unknown_sig, SHA1_DIGEST_LENGTH);
        }
    }

    if (len == 0) {
        /* truncate() to EOF -> invalidate all l1 sigs beyond EOF */
        while ((cs = avl_last(&file->chksums)) != NULL) {
            if (cs->offset_l1 < zp->z_size)
                break;
            avl_remove(&file->chksums, cs);
            zev_sig_cache.cache_size -= sizeof (*cs);
            zev_free(cs, sizeof (*cs));
        }
    }

    mutex_exit(&zev_sig_cache.mutex);
}
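/*
 * Worked example for the range arithmetic above (assuming ZEV_L0_SIZE =
 * 4 KiB and ZEV_L1_SIZE = 1 MiB): for off = 0x180000 and len = 0x20000,
 * off_l1 = P2ALIGN(0x180000, 1 MiB) = 0x100000 and len_l1 =
 * P2ROUNDUP(0x20000 + 0x80000, 1 MiB) = 0x100000, so exactly one l1
 * entry (offset_l1 == 0x100000) is visited, and its l0 slots 128..159
 * (the 32 blocks covering 0x180000..0x19ffff) are reset to unknown_sig.
 */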
static int
zev_chksum_cache_get(uint8_t *dst,
                     zev_sig_cache_file_t *file,
                     zev_sig_cache_chksums_t *cs,
                     uint64_t off_l0)
{
    int idx;

    mutex_enter(&zev_sig_cache.mutex);

    idx = (off_l0 % ZEV_L1_SIZE) / ZEV_L0_SIZE;
    if (!memcmp(cs->sigs[idx], unknown_sig, SHA1_DIGEST_LENGTH)) {
        zev_sig_cache.misses++;
        mutex_exit(&zev_sig_cache.mutex);
        return (ENOENT);
    }
    memcpy(dst, cs->sigs[idx], SHA1_DIGEST_LENGTH);

    zev_sig_cache.hits++;
    mutex_exit(&zev_sig_cache.mutex);
    return (0);
}

static void
zev_chksum_cache_put(uint8_t *sig,
                     zev_sig_cache_file_t *file,
                     zev_sig_cache_chksums_t *cs,
                     uint64_t off_l0)
{
    zev_sig_cache_file_t *f;
    zev_sig_cache_file_t *tmp;
    int idx;

    mutex_enter(&zev_sig_cache.mutex);

    if (zev_sig_cache.max_cache_size == 0) {
        /* cache disabled */
        mutex_exit(&zev_sig_cache.mutex);
        return;
    }

    /* expire entries until there's room in the cache */
    f = list_tail(&zev_sig_cache.lru);
    while (f && (zev_sig_cache.cache_size > zev_sig_cache.max_cache_size)) {
        tmp = f;
        f = list_prev(&zev_sig_cache.lru, f);
        if (tmp->refcnt == 0)
            zev_chksum_cache_file_free(tmp);
    }

    idx = (off_l0 % ZEV_L1_SIZE) / ZEV_L0_SIZE;
    memcpy(cs->sigs[idx], sig, SHA1_DIGEST_LENGTH);

    mutex_exit(&zev_sig_cache.mutex);
}

/* verbatim from zfs_vnops.c (unfortunately it's declared static there) */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
    znode_t *zp = VTOZ(vp);
    objset_t *os = zp->z_zfsvfs->z_os;
    int64_t start, off;
    int len = nbytes;
    int error = 0;

    start = uio->uio_loffset;
    off = start & PAGEOFFSET;
    for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
        page_t *pp;
        uint64_t bytes = MIN(PAGESIZE - off, len);

        if ((pp = page_lookup(vp, start, SE_SHARED)) != NULL) {
            caddr_t va;

            va = zfs_map_page(pp, S_READ);
            error = uiomove(va + off, bytes, UIO_READ, uio);
            zfs_unmap_page(pp, va);
            page_unlock(pp);
        } else {
            error = dmu_read_uio(os, zp->z_id, uio, bytes);
        }
        len -= bytes;
        off = 0;
        if (error)
            break;
    }
    return (error);
}

/* returns the number of bytes read, or a negative error code */
static int
zev_safe_read(znode_t *zp, char *buf, uint64_t off, uint64_t len)
{
    uio_t uio;
    struct iovec iov;
    ssize_t n;
    ssize_t nbytes;
    int error = 0;
    vnode_t *vp = ZTOV(zp);
    objset_t *os = zp->z_zfsvfs->z_os;

    /* set up uio */
    iov.iov_base = buf;
    iov.iov_len = ZEV_L0_SIZE;

    uio.uio_iov = &iov;
    uio.uio_iovcnt = 1;
    uio.uio_segflg = (short)UIO_SYSSPACE;
    uio.uio_llimit = RLIM64_INFINITY;
    uio.uio_fmode = FREAD;
    uio.uio_extflg = UIO_COPY_DEFAULT;
    uio.uio_loffset = off;
    uio.uio_resid = len;

again:
    if (uio.uio_loffset >= zp->z_size)
        return (-EINVAL);    /* don't read past EOF */

    n = MIN(uio.uio_resid, zp->z_size - uio.uio_loffset);

    /* this block was essentially copied from zfs_read() in zfs_vnops.c */
    while (n > 0) {
        nbytes = MIN(n, zfs_read_chunk_size -
            P2PHASE(uio.uio_loffset, zfs_read_chunk_size));

        if (vn_has_cached_data(vp)) {
            error = mappedread(vp, nbytes, &uio);
        } else {
            error = dmu_read_uio(os, zp->z_id, &uio, nbytes);
        }
        if (error) {
            if (error == EINTR)
                goto again;
            /* convert checksum errors into IO errors */
            if (error == ECKSUM)
                error = SET_ERROR(EIO);
            break;
        }

        n -= nbytes;
    }
    if (error)
        return (-error);

    return (len - uio.uio_resid);
}

static void
zev_l0_sig(uint8_t *sig, char *buf)
{
    SHA1_CTX ctx;

    SHA1Init(&ctx);
    SHA1Update(&ctx, buf, ZEV_L0_SIZE);
    SHA1Final(sig, &ctx);
}

static void
zev_l0_blocksig(uint8_t *blk_sig, uint8_t *l0_sig, uint8_t block_no)
{
    SHA1_CTX ctx;

    SHA1Init(&ctx);
    SHA1Update(&ctx, l0_sig, SHA1_DIGEST_LENGTH);
    SHA1Update(&ctx, &block_no, sizeof (block_no));
    SHA1Final(blk_sig, &ctx);
}

/*
 * Add a position-salted l0 block signature into an l1 signature.  The
 * l1 signature is the byte-wise sum of all non-zero block signatures,
 * mod 2^160; byte 0 is the most significant byte.
 */
static void
zev_l1_add(uint8_t *sig_l1, uint8_t *sig_l0)
{
    int i;
    int s;
    int carry = 0;

    for (i = SHA1_DIGEST_LENGTH - 1; i >= 0; --i) {
        s = sig_l1[i] + sig_l0[i] + carry;
        carry = s > 255 ? 1 : 0;
        sig_l1[i] = s & 0xff;
    }
}
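/*
 * For reference, the whole signature scheme can be reproduced in
 * userspace.  The sketch below is illustrative only: it assumes
 * OpenSSL's SHA1() and ZEV_L0_SIZE == 4096 with 256 l0 blocks per l1
 * block, and recomputes the l1 signature of up to one megabyte of raw
 * file data the same way zev_l0_sig(), zev_l0_blocksig() and
 * zev_l1_add() do.
 */
#if 0
#include <stdint.h>
#include <string.h>
#include <openssl/sha.h>

#define EX_L0_SIZE    4096
#define EX_L0_PER_L1  256

static void
example_l1_sig(uint8_t l1[SHA_DIGEST_LENGTH], const uint8_t *data, size_t len)
{
    static const uint8_t zero_block[EX_L0_SIZE];    /* all zeroes */
    uint8_t zero_sig[SHA_DIGEST_LENGTH];            /* == all_zero_sig */
    uint8_t l0[SHA_DIGEST_LENGTH];
    uint8_t blk[SHA_DIGEST_LENGTH];
    uint8_t msg[SHA_DIGEST_LENGTH + 1];
    uint8_t buf[EX_L0_SIZE];
    unsigned int block_no;
    size_t off, n;
    int i, s, carry;

    SHA1(zero_block, EX_L0_SIZE, zero_sig);
    memset(l1, 0, SHA_DIGEST_LENGTH);

    for (block_no = 0, off = 0;
        off < len && block_no < EX_L0_PER_L1;
        off += EX_L0_SIZE, block_no++) {
        /* zero-pad the last partial block, like zev_safe_read() */
        n = (len - off < EX_L0_SIZE) ? len - off : EX_L0_SIZE;
        memset(buf, 0, EX_L0_SIZE);
        memcpy(buf, data + off, n);

        SHA1(buf, EX_L0_SIZE, l0);              /* l0 signature */
        if (memcmp(l0, zero_sig, SHA_DIGEST_LENGTH) == 0)
            continue;                           /* zero blocks are omitted */

        /* position-salted block signature: SHA1(l0_sig || block_no) */
        memcpy(msg, l0, SHA_DIGEST_LENGTH);
        msg[SHA_DIGEST_LENGTH] = (uint8_t)block_no;
        SHA1(msg, sizeof (msg), blk);

        /* 160-bit addition mod 2^160, most significant byte first */
        for (carry = 0, i = SHA_DIGEST_LENGTH - 1; i >= 0; i--) {
            s = l1[i] + blk[i] + carry;
            carry = s > 255;
            l1[i] = s & 0xff;
        }
    }
}
#endif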
static int
zev_get_result_buffer(zev_sig_t **buffer,
                      uint64_t *buffer_len,
                      uint64_t max_buffer_len,
                      znode_t *zp,
                      uint64_t off,
                      uint64_t len,
                      zev_chksum_mode_t mode)
{
    uint64_t blk_start;
    uint64_t blk_end;
    uint64_t l0_blocks;
    uint64_t l1_blocks;
    uint64_t sigs;
    uint64_t buflen;

    /* calculate result set size: how many checksums will we provide? */
    ASSERT(len > 0 || (mode == zev_truncate && len == 0));
    if (len == 0) {
        /* truncate */
        l0_blocks = ((off % ZEV_L0_SIZE) == 0) ? 0 : 1;
        l1_blocks = ((off % ZEV_L1_SIZE) == 0) ? 0 : 1;
    } else {
        /* how many lv1 checksums do we update? */
        blk_start = off / ZEV_L1_SIZE;
        blk_end = (off + len - 1) / ZEV_L1_SIZE;
        l1_blocks = blk_end - blk_start + 1;
        /* how many lv0 checksums do we update? */
        blk_start = off / ZEV_L0_SIZE;
        blk_end = (off + len - 1) / ZEV_L0_SIZE;
        l0_blocks = blk_end - blk_start + 1;
    }

    sigs = l1_blocks + l0_blocks;
    if (sigs == 0) {
        *buffer = NULL;
        *buffer_len = 0;
        return (0);
    }

    buflen = sigs * sizeof (zev_sig_t);
    if (max_buffer_len && (buflen > max_buffer_len)) {
        *buffer = NULL;
        *buffer_len = 0;
        return (ENOSPC);
    }
    *buffer_len = buflen;
    *buffer = zev_alloc(buflen);
    return (0);
}
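/*
 * Worked example for the sizing logic above (assuming ZEV_L0_SIZE =
 * 4 KiB and ZEV_L1_SIZE = 1 MiB): a write of len = 1 MiB at off = 4 KiB
 * touches l0 blocks 1..256 (256 l0 sums) and l1 blocks 0..1 (2 l1
 * sums), so 258 signatures are returned and the caller's buffer must
 * hold 258 * sizeof (zev_sig_t) bytes.
 */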
static void
zev_append_sig(zev_sig_t *s, int level, uint64_t off, uint8_t *sig)
{
    s->level = level;
    s->block_offset = off;
    memcpy(s->value, sig, SHA1_DIGEST_LENGTH);
}

/*
 * Calculate all l0 and l1 checksums that are affected by the given range.
 *
 * This function assumes that the ranges it needs to read are already
 * range-locked.
 */
int
zev_get_checksums(zev_sig_t **result,
                  uint64_t *result_buf_len,
                  uint64_t *signature_cnt,
                  uint64_t max_result_len,
                  znode_t *zp,
                  uint64_t off,
                  uint64_t len,
                  zev_chksum_mode_t mode)
{
    uint64_t off_l1;
    uint64_t len_l1;
    uint64_t pos_l1;
    uint64_t pos_l0;
    char *buf;
    int64_t ret;
    uint8_t sig_l0[SHA1_DIGEST_LENGTH];
    uint8_t blk_sig_l0[SHA1_DIGEST_LENGTH];
    uint8_t sig_l1[SHA1_DIGEST_LENGTH];
    uint8_t l0_block_no;
    zev_sig_t *sig;
    int non_empty_l0_blocks;
    zev_sig_cache_file_t *file;
    zev_sig_cache_chksums_t *cs;

    /*
     * Note: for write events, the callback is called via
     *   zfs_write() -> zfs_log_write() -> zev_znode_write_cb()
     *
     * The transaction is not committed, yet.
     *
     * A write() syscall might be split into smaller chunks by zfs_write().
     *
     * zfs_write() has a range lock when this is called. (zfs_vnops.c:925)
     * In zev mode, the range lock will encompass all data we need
     * to calculate our checksums.
     *
     * The same is true for truncates with non-zero length. ("punch hole")
     */
    ASSERT(len > 0 || (mode == zev_truncate && len == 0));
    *signature_cnt = 0;

    /*
     * Under certain circumstances we need the first l0 block's
     * checksum, because we didn't store it in the database and
     * can't easily get it from userspace.  Not for this exact point
     * in time, anyway.  So we cheat a little.
     */
    if (mode == zev_truncate && len == 0 && off == 4096) {
        /*
         * Normally, we'd report no checksums:
         *  - no l0 sum, because no remaining l0 block is changed
         *  - no l1 sum, because the file is now too short for l1 sums
         * Let's pretend we changed the first l0 block, then.
         * Luckily the entire file is range locked during truncate().
         */
        off = 0;
        len = 4096;
    }

    /* start of this megabyte */
    off_l1 = P2ALIGN(off, ZEV_L1_SIZE);

    /* full megabytes */
    if (len == 0) {
        /* truncate(): we'll look at the last lv1 block, only. */
        len_l1 = ZEV_L1_SIZE;
    } else {
        len_l1 = len + (off - off_l1);
        len_l1 = P2ROUNDUP(len_l1, ZEV_L1_SIZE);
    }

    file = zev_chksum_cache_file_get_and_hold(zp);
    zev_chksum_cache_invalidate(file, zp, mode, off, len);
    buf = zev_alloc(ZEV_L0_SIZE);

    ret = zev_get_result_buffer(result, result_buf_len, max_result_len,
        zp, off, len, mode);
    if (ret) {
        zev_free(buf, ZEV_L0_SIZE);
        zev_chksum_cache_file_release(file);
        return (ret);
    }
    if (*result == NULL) {
        /* we're done */
        zev_free(buf, ZEV_L0_SIZE);
        zev_chksum_cache_file_release(file);
        return (0);
    }
    sig = *result;

    for (pos_l1 = off_l1;
        pos_l1 < (off_l1 + len_l1);
        pos_l1 += ZEV_L1_SIZE) {

        if (pos_l1 > zp->z_size) {
            cmn_err(CE_WARN, "zev_get_checksums: off+len beyond "
                "EOF.  Unexpected behaviour; please fix!");
            cmn_err(CE_WARN, "off=%" PRIu64 ", len=%" PRIu64 ", "
                "dataset='%s', inode=%" PRIu64, off, len,
                zp->z_zfsvfs->z_os->
                os_dsl_dataset->ds_dir->dd_myname, zp->z_id);
            /* free the result buffer exactly once */
            zev_free(*result, *result_buf_len);
            *result = NULL;
            break;
        }

        /*
         * Since we have a reference to 'file', 'cs' can't be expired.
         * Since our ranges are range locked, other threads won't
         * touch our checksum entries. (not even read them)
         * Hence, we don't need to hold() or release() 'cs'.
         */
        cs = zev_chksum_cache_get_lv1_entry(file, pos_l1);

        l0_block_no = 0;
        non_empty_l0_blocks = 0;
        bzero(sig_l1, sizeof (sig_l1));
        for (pos_l0 = pos_l1;
            pos_l0 < (pos_l1 + ZEV_L1_SIZE);
            pos_l0 += ZEV_L0_SIZE) {

            if (pos_l0 >= zp->z_size)
                break;    /* EOF */

            if (zev_chksum_cache_get(sig_l0, file, cs, pos_l0) != 0) {
                /* signature is not cached, yet. */
                ret = zev_safe_read(zp, buf, pos_l0, ZEV_L0_SIZE);
                if (ret < 0) {
                    zev_free(*result, *result_buf_len);
                    zev_free(buf, ZEV_L0_SIZE);
                    zev_chksum_cache_file_release(file);
                    return (ret);
                }
                /* pad buffer with zeros if necessary */
                if (ret < ZEV_L0_SIZE)
                    bzero(buf + ret, ZEV_L0_SIZE - ret);

                /* calculate signature */
                zev_l0_sig(sig_l0, buf);
                zev_chksum_cache_put(sig_l0, file, cs, pos_l0);
            }

            if (!memcmp(sig_l0, all_zero_sig, SHA1_DIGEST_LENGTH)) {
                /* all-zero l0 block.  omit signature. */
                l0_block_no++;
                continue;
            }
            non_empty_l0_blocks++;
            zev_l0_blocksig(blk_sig_l0, sig_l0, l0_block_no);
            zev_l1_add(sig_l1, blk_sig_l0);

            if (((pos_l0 + ZEV_L0_SIZE - 1) >= off) &&
                (pos_l0 <= (off + len - 1))) {
                zev_append_sig(sig++, 0, pos_l0, sig_l0);
            }

            l0_block_no++;
        }

        if (non_empty_l0_blocks && (zp->z_size > ZEV_L0_SIZE))
            zev_append_sig(sig++, 1, pos_l1, sig_l1);
    }

    if (*result != NULL) {
        *signature_cnt =
            ((char *)sig - (char *)*result) / sizeof (zev_sig_t);
    }

    zev_free(buf, ZEV_L0_SIZE);
    zev_chksum_cache_file_release(file);
    return (0);
}

int
zev_ioc_get_signatures(intptr_t arg, int mode)
{
    zev_ioctl_get_signatures_t gs;
    file_t *fp;
    int ret = 0;
    znode_t *zp;
    zev_sig_t *sig_buf = NULL;
    uint64_t sig_buf_len = 0;
    uint64_t sig_cnt = 0;
    uint64_t sig_len;
    char *dst;
    int range_locked = 0;
    rl_t *rl;
    ssize_t lock_off;
    ssize_t lock_len;

    if (ddi_copyin((void *)arg, &gs, sizeof (gs), mode) != 0)
        return (EFAULT);
    fp = getf(gs.zev_fd);
    if (fp == NULL)
        return (EBADF);
    if (fp->f_vnode->v_vfsp->vfs_fstype != zfsfstype) {
        ret = EINVAL;
        goto out;
    }
    if (fp->f_vnode->v_type != VREG) {
        ret = EINVAL;
        goto out;
    }
    zp = VTOZ(fp->f_vnode);
    if (gs.zev_offset >= zp->z_size) {
        ret = EINVAL;
        goto out;
    }
    if (gs.zev_len == 0) {
        /* len == 0 is reserved for truncate events */
        ret = EINVAL;
        goto out;
    }

    /* range lock data */
    lock_off = P2ALIGN(gs.zev_offset, ZEV_L1_SIZE);
    lock_len = gs.zev_len + (gs.zev_offset - lock_off);
    lock_len = P2ROUNDUP(lock_len, ZEV_L1_SIZE);
    rl = zfs_range_lock(zp, lock_off, lock_len, RL_READER);
    range_locked = 1;

    /* get checksums */
    ret = zev_get_checksums(&sig_buf, &sig_buf_len, &sig_cnt,
        gs.zev_bufsize, zp, gs.zev_offset, gs.zev_len, zev_write);
    if (ret)
        goto out;

    /* copy to userland */
    sig_len = sig_cnt * sizeof (zev_sig_t);
    gs.zev_signature_cnt = sig_cnt;
    if (ddi_copyout(&gs, (void *)arg, sizeof (gs), mode) != 0) {
        ret = EFAULT;
        goto out;
    }
    if (sig_cnt && sig_buf) {
        /* the signature array is returned right after the header */
        dst = (char *)arg + sizeof (gs);
        if (ddi_copyout(sig_buf, (void *)dst, sig_len, mode) != 0) {
            ret = EFAULT;
            goto out;
        }
    }
out:
    if (sig_buf)
        zev_free(sig_buf, sig_buf_len);
    if (range_locked)
        zfs_range_unlock(rl);
    releasef(gs.zev_fd);
    return (ret);
}
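/*
 * Userspace usage sketch for the ioctl above.  Illustrative only: the
 * device path "/dev/zev" and the command name ZEV_IOC_GET_FILE_SIGNATURES
 * are assumptions -- check sys/fs/zev.h for the real definitions.  The
 * kernel fills in the header struct and appends the zev_sig_t array
 * directly after it in the same buffer.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/fs/zev.h>

static int
print_signatures(const char *path, uint64_t off, uint64_t len)
{
    zev_ioctl_get_signatures_t *gs = NULL;
    zev_sig_t *sigs;
    uint64_t bufsize = 1024 * sizeof (zev_sig_t);
    uint64_t i;
    int fd = -1, dev = -1, ret = -1;

    fd = open(path, O_RDONLY);
    dev = open("/dev/zev", O_RDONLY);    /* assumed device node */
    if (fd < 0 || dev < 0)
        goto out;

    /* header struct, followed by room for the signature array */
    gs = malloc(sizeof (*gs) + bufsize);
    if (gs == NULL)
        goto out;
    gs->zev_fd = fd;
    gs->zev_offset = off;
    gs->zev_len = len;
    gs->zev_bufsize = bufsize;

    if (ioctl(dev, ZEV_IOC_GET_FILE_SIGNATURES, gs) != 0)
        goto out;

    /* the kernel placed the zev_sig_t array right after the header */
    sigs = (zev_sig_t *)(gs + 1);
    for (i = 0; i < gs->zev_signature_cnt; i++)
        printf("level %d, offset %llu\n", (int)sigs[i].level,
            (unsigned long long)sigs[i].block_offset);
    ret = 0;
out:
    free(gs);
    if (fd >= 0)
        close(fd);
    if (dev >= 0)
        close(dev);
    return (ret);
}
#endif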
void
zev_symlink_checksum(zev_znode_symlink_t *rec, char *link)
{
    char buf[ZEV_L0_SIZE];

    /* zero-pad the link target to a full l0 block before hashing */
    memset(buf, 0, sizeof (buf));
    (void) strlcpy(buf, link, sizeof (buf));    /* bounded copy */
    zev_l0_sig(rec->signature.value, buf);
    rec->signature.level = 0;
    rec->signature.block_offset = 0;
}

void
zev_create_checksum(zev_znode_create_t *rec, znode_t *zp)
{
    char buf[ZEV_L0_SIZE];
    vnode_t *vp;
    uint64_t rdev;

    vp = ZTOV(zp);
    if (vp->v_type == VBLK || vp->v_type == VCHR) {
        sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs),
            &rdev, sizeof (rdev));
        memset(buf, 0, sizeof (buf));
        snprintf(buf, sizeof (buf), "%c%d,%d",
            vp->v_type == VBLK ? 'b' : 'c',
            getmajor(rdev), getminor(rdev));
        zev_l0_sig(rec->signature.value, buf);
    } else {
        memset(rec->signature.value, 0, sizeof (rec->signature.value));
    }
    rec->signature.level = 0;
    rec->signature.block_offset = 0;
}
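/*
 * Worked example for zev_create_checksum() (the numbers are made up):
 * for a block device with major 8, minor 1, the hashed buffer contains
 * the string "b8,1" followed by zeros up to ZEV_L0_SIZE, so the
 * recorded l0 signature is the SHA-1 of that zero-padded block.
 * Non-device nodes get an all-zero signature value here.
 */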