#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ARCWATCH_READ_MAGIC "awrd" #define ARCWATCH_READ_VERSION 1 typedef struct arc_read_hdr { char arh_magic[4]; uint32_t arh_version; } arc_read_hdr_t; typedef struct arc_read { uint64_t ar_objset; uint64_t ar_object; uint64_t ar_level; uint64_t ar_blkid; uint64_t ar_size; uint64_t ar_type; uint64_t ar_dva0; uint64_t ar_dva1; uint64_t ar_birth; uint64_t ar_spa; } arc_read_t; #define ARNS_IN_L1_CACHE 1 #define ARNS_IN_L2_CACHE 2 typedef struct arc_read_node { arc_read_t arn_ar; avl_node_t arn_node; uint64_t arn_color; uint64_t arn_state; uint64_t arn_flags; } arc_read_node_t; #define ARCWATCH_CONTENT_MAGIC "awct" #define ARCWATCH_CONTENT_VERSION 1 typedef struct arc_content_hdr { char ach_magic[4]; uint32_t ach_version; uint64_t ach_buckets; uint64_t ach_buf_locks; } arc_content_hdr_t; static const char * state2str(arc_info_state_t state) { switch (state) { case AIS_ANON: return "anon"; case AIS_MRU: return "mru"; case AIS_MRU_GHOST: return "mru_ghost"; case AIS_MFU: return "mfu"; case AIS_MFU_GHOST: return "mfu_ghost"; case AIS_L2C_ONLY: return "l2c_only"; case AIS_NO_L1HDR: return "no_l1hdr"; default: case AIS_UNKNOWN: return "unknown"; } } static int g_verbose = 0; static char *d_prog = "dtrace:::BEGIN\n" "{\n" " trackedpid[pid] = 0;\n" " self->child = 0;\n" " OPT_follow = 1;\n" "}\n" "syscall::fork*:entry\n" "/OPT_follow && (pid == $target || self->child)/\n" "{\n" " trackedpid[pid] = 1;\n" "}\n" "syscall::fork*:return\n" "/OPT_follow && trackedpid[ppid]/\n" "{\n" " self->child = 1;\n" "}\n" "fbt::dbuf_hold_impl:entry\n" "/pid == $target || self->child/\n" "{\n" " self->dbp = args[6];\n" " self->type = args[0]->dn_type;\n" "}\n" "fbt::dbuf_hold_impl:return\n" "/self->dbp && (*self->dbp)->db_state == 4/\n" "{\n" " this->db = *self->dbp;\n" " this->os = this->db->db_objset;\n" " this->hdr = this->db->db_buf ? this->db->db_buf->b_hdr : 0;\n" " trace(this->os->os_dsl_dataset ?\n" " this->os->os_dsl_dataset->ds_object : 0);\n" " trace(this->db->db.db_object);\n" " trace(this->db->db_level);\n" " trace(this->db->db_blkid);\n" " trace(this->db->db.db_size);\n" " trace(self->type);\n" " trace(this->hdr ? this->hdr->b_dva.dva_word[0] : 0);\n" " trace(this->hdr ? this->hdr->b_dva.dva_word[1] : 0);\n" " trace(this->hdr ? this->hdr->b_birth : 0);\n" " trace(this->hdr ? this->hdr->b_spa : 0);\n" " self->dbp = 0;\n" " self->type = 0;\n" "}\n"; static int awr_cmp(const void *x, const void *y) { const arc_read_node_t *a = x; const arc_read_node_t *b = y; if (a->arn_ar.ar_spa < b->arn_ar.ar_spa) return -1; if (a->arn_ar.ar_spa > b->arn_ar.ar_spa) return 1; if (a->arn_ar.ar_dva0 < b->arn_ar.ar_dva0) return -1; if (a->arn_ar.ar_dva0 > b->arn_ar.ar_dva0) return 1; if (a->arn_ar.ar_dva1 < b->arn_ar.ar_dva1) return -1; if (a->arn_ar.ar_dva1 > b->arn_ar.ar_dva1) return 1; if (a->arn_ar.ar_birth < b->arn_ar.ar_birth) return -1; if (a->arn_ar.ar_birth > b->arn_ar.ar_birth) return 1; return 0; } static int drophandler(const dtrace_dropdata_t *data, void *arg) { fprintf(stderr, "type %d drops %lld\n", data->dtdda_kind, data->dtdda_drops); fprintf(stderr, "dtrace drops encountered. Try increasing buffers.\n"); exit(1); } static void prochandler(struct ps_prochandle *P, const char *msg, void *arg) { int *proc_done = arg; *proc_done = 1; } static uint64_t get_val(caddr_t base, dtrace_recdesc_t *rec) { uint64_t val = 0; assert(rec->dtrd_action == DTRACEACT_DIFEXPR); assert(rec->dtrd_size > 0); assert(rec->dtrd_size <= 8); memcpy(&val, base + rec->dtrd_offset, rec->dtrd_size); return val; } typedef struct trace_args { int ofd; avl_tree_t *awr; pthread_mutex_t mtx; int ptr; char buf[16384]; } trace_args_t; static int process_trace(const dtrace_probedata_t *data, void *arg) { dtrace_eprobedesc_t *edesc = data->dtpda_edesc; caddr_t base = data->dtpda_data; dtrace_recdesc_t *rec = edesc->dtepd_rec; trace_args_t *ta = arg; arc_read_t ar; int ret; assert(edesc->dtepd_nrecs == 15); ar.ar_objset = get_val(base, rec + 3); ar.ar_object = get_val(base, rec + 4); ar.ar_level = get_val(base, rec + 5); ar.ar_blkid = get_val(base, rec + 6); ar.ar_size = get_val(base, rec + 7); ar.ar_type = get_val(base, rec + 8); ar.ar_dva0 = get_val(base, rec + 9); ar.ar_dva1 = get_val(base, rec + 10); ar.ar_birth = get_val(base, rec + 11); ar.ar_spa = get_val(base, rec + 12); if (ta->ofd != -1) { pthread_mutex_lock(&ta->mtx); if (ta->ptr + sizeof(ar) > sizeof(ta->buf)) { ret = write(ta->ofd, ta->buf, ta->ptr); if (ret == -1) { fprintf(stderr, "cannot write to output file: %s\n", strerror(errno)); exit(1); } ta->ptr = 0; } memcpy(ta->buf + ta->ptr, &ar, sizeof(ar)); ta->ptr += sizeof(ar); pthread_mutex_unlock(&ta->mtx); } if (ta->awr) { arc_read_node_t *arn; arn = calloc(sizeof(*arn), 1); assert(arn); arn->arn_ar = ar; pthread_mutex_lock(&ta->mtx); if (avl_find(ta->awr, arn, NULL) == NULL) avl_add(ta->awr, arn); pthread_mutex_unlock(&ta->mtx); } if (g_verbose) { printf("spa %llx objset %lld object %lld level %lld blkid " "%lld size %lld type %lld dva %16x:%16x birth %lld\n", ar.ar_spa, ar.ar_objset, ar.ar_object, ar.ar_level, ar.ar_blkid, ar.ar_size, ar.ar_type, ar.ar_dva0, ar.ar_dva1, ar.ar_birth); } return (DTRACE_CONSUME_NEXT); } static void d_fatal(dtrace_hdl_t *dtp, char *msg) { fprintf(stderr, "%s: %s\n", msg, dtrace_errmsg(dtp, dtrace_errno(dtp))); exit(1); } static int run_dtrace(char *bufsize, char *out_fn, avl_tree_t *awr, int argc, char **argv) { dtrace_prog_t *dp; dtrace_hdl_t *dtp; dtrace_proginfo_t info; struct ps_prochandle *p; int err; int proc_done = 0; int done = 0; int ofd = -1; arc_read_hdr_t arh = { 0 }; trace_args_t ta = { 0 }; ta.ofd = -1; ta.awr = awr; pthread_mutex_init(&ta.mtx, NULL); if (out_fn) { ofd = open(out_fn, O_CREAT | O_TRUNC | O_WRONLY, 0644); if (ofd == -1) { printf("cannot open output file %s: %s\n", out_fn, strerror(errno)); exit(1); } memcpy(arh.arh_magic, ARCWATCH_READ_MAGIC, sizeof(arh.arh_magic)); arh.arh_version = ARCWATCH_READ_VERSION; err = write(ofd, &arh, sizeof(arh)); if (err == -1) { printf("cannot write to output file: %s\n", strerror(errno)); exit(1); } ta.ofd = ofd; } dtp = dtrace_open(DTRACE_VERSION, 0, &err); if (dtp == NULL) { printf("cannot open dtrace library: %s\n", dtrace_errmsg(NULL, err)); exit(1); } if (dtrace_handle_drop(dtp, &drophandler, NULL) == -1) d_fatal(dtp, "couldn't establish drop handler"); if (dtrace_handle_proc(dtp, &prochandler, &proc_done) == -1) d_fatal(dtp, "failed to establish proc handler"); if (dtrace_setopt(dtp, "bufsize", bufsize) == -1) d_fatal(dtp, "failed to set bufsize"); /* XXX TODO understand dynvar drops */ if (dtrace_setopt(dtp, "dynvarsize", "4m") == -1) d_fatal(dtp, "failed to set dynvarsize"); if (dtrace_setopt(dtp, "temporal", "no") == -1) d_fatal(dtp, "failed to set temporal"); if (dtrace_setopt(dtp, "switchrate", "100hz") == -1) d_fatal(dtp, "failed to set switchrate"); if (dtrace_setopt(dtp, "cleanrate", "100hz") == -1) d_fatal(dtp, "failed to set cleanrate"); p = dtrace_proc_create(dtp, argv[0], &argv[0]); if (p == NULL) d_fatal(dtp, "creating process failed"); dp = dtrace_program_strcompile(dtp, d_prog, DTRACE_PROBESPEC_NAME, 0, 0, NULL); if (dp == NULL) d_fatal(dtp, "failed to compile program"); if (dtrace_program_exec(dtp, dp, &info) == -1) d_fatal(dtp, "failed to enable probes"); if (dtrace_go(dtp)) d_fatal(dtp, "couldn't start tracing"); (void) dtrace_proc_continue(dtp, p); do { dtrace_sleep(dtp); if (proc_done) { done = 1; (void) dtrace_stop(dtp); } err = dtrace_work(dtp, stdout, process_trace, NULL, &ta); if (err == DTRACE_WORKSTATUS_DONE) done = 1; } while (!done); if (ta.ptr > 0) { err = write(ta.ofd, ta.buf, ta.ptr); if (err == -1) { fprintf(stderr, "cannot write to output file: %s\n", strerror(errno)); exit(1); } } (void) dtrace_close(dtp); if (ofd != -1) close(ofd); return (0); } static void read_awr(avl_tree_t *awr, char *in_fn) { int fd; int ret; arc_read_hdr_t arh; char buf[1000 * sizeof(arc_read_t)]; int blen = 0; int ptr = 0; fd = open(in_fn, O_RDONLY); if (fd == -1) { fprintf(stderr, "failed to open input: %s\n", strerror(errno)); exit(1); } ret = read(fd, &arh, sizeof(arh)); if (ret == -1) { fprintf(stderr, "failed to read input: %s\n", strerror(errno)); exit(1); } if (ret != sizeof(arh)) { fprintf(stderr, "failed to read input: truncated file\n"); exit(1); } if (memcmp(arh.arh_magic, ARCWATCH_READ_MAGIC, 4) != 0) { fprintf(stderr, "failed to read input: bad file magic\n"); exit(1); } if (arh.arh_version != ARCWATCH_READ_VERSION) { fprintf(stderr, "failed to read input: bad file version\n"); exit(1); } while (1) { arc_read_node_t *arn = calloc(sizeof(*arn), 1); assert(arn); if (blen == ptr) { ret = read(fd, buf, sizeof(buf)); if (ret == 0) break; if (ret == -1) { fprintf(stderr, "failed to read input: %s\n", strerror(errno)); exit(1); } blen = ret; ptr = 0; } if ((blen - ptr) < sizeof(arn->arn_ar)) { fprintf(stderr, "failed to read input: truncated file\n"); exit(1); } memcpy(&arn->arn_ar, buf + ptr, sizeof(arn->arn_ar)); ptr += sizeof(arn->arn_ar); if (g_verbose >= 2) { arc_read_t *ar = &arn->arn_ar; printf("spa %llx objset % 8lld object % 8lld " "level %lld blkid % 8lld size % 6lld type % 3lld " "dva %016x:%016x birth % 8lld\n", ar->ar_spa, ar->ar_objset, ar->ar_object, ar->ar_level, ar->ar_blkid, ar->ar_size, ar->ar_type, ar->ar_dva0, ar->ar_dva1, ar->ar_birth); } if (avl_find(awr, arn, NULL) == NULL) avl_add(awr, arn); } close(fd); } static void read_arc(avl_tree_t *awr, char *in_fn, uint64_t color, int just_dump) { int fd; int ret; arc_content_hdr_t ach; char buf[1000 * sizeof(arc_info_t)]; int ptr = 0; int blen = 0; fd = open(in_fn, O_RDONLY); if (fd == -1) { fprintf(stderr, "failed to open input: %s\n", strerror(errno)); exit(1); } ret = read(fd, &ach, sizeof(ach)); if (ret == -1) { fprintf(stderr, "failed to read input: %s\n", strerror(errno)); exit(1); } if (ret != sizeof(ach)) { fprintf(stderr, "failed to read input: truncated file\n"); exit(1); } if (memcmp(ach.ach_magic, ARCWATCH_CONTENT_MAGIC, 4) != 0) { fprintf(stderr, "failed to read input: bad file magic\n"); exit(1); } if (ach.ach_version != ARCWATCH_CONTENT_VERSION) { fprintf(stderr, "failed to read input: bad file version\n"); exit(1); } while (1) { arc_info_t ai; arc_read_node_t search; arc_read_node_t *arn; if (blen == ptr) { ret = read(fd, buf, sizeof(buf)); if (ret == 0) break; if (ret == -1) { fprintf(stderr, "failed to read input: %s\n", strerror(errno)); exit(1); } blen = ret; ptr = 0; } if ((blen - ptr) < sizeof(ai)) { fprintf(stderr, "failed to read input: truncated file\n"); exit(1); } memcpy(&ai, buf + ptr, sizeof(ai)); ptr += sizeof(ai); if (just_dump) { printf("dva %016llx:%016llx birth %8d " "spa %016llx " "size % 8x flags %016x state %s\n", ai.ai_dva.dva_word[0], ai.ai_dva.dva_word[1], ai.ai_birth, ai.ai_spa, ai.ai_size, ai.ai_flags, state2str(ai.ai_state)); } else { search.arn_ar.ar_spa = ai.ai_spa; search.arn_ar.ar_dva0 = ai.ai_dva.dva_word[0]; search.arn_ar.ar_dva1 = ai.ai_dva.dva_word[1]; search.arn_ar.ar_birth = ai.ai_birth; arn = avl_find(awr, &search, NULL); if (arn) { arn->arn_color = color; arn->arn_flags = ai.ai_flags; arn->arn_state = ai.ai_state; } } } close(fd); } #define BUFSZ 1048576 /* 1MB */ static void get_arc(avl_tree_t *awr, uint64_t color, char *out_fn) { int ret; int fd; void *buf = malloc(BUFSZ); zfs_cmd_t cmd = {0}; arc_info_t *ai; arc_info_hdr_t *aih; int ofd = -1; int hdr_written = 0; char wbuf[16384]; int wptr = 0; fd = open("/dev/zfs", O_RDWR); if (fd == -1) { fprintf(stderr, "failed to open /dev/zfs: %s\n", strerror(errno)); exit(1); } assert(buf); cmd.zc_obj = 0; cmd.zc_nvlist_dst = (uint64_t)buf; cmd.zc_nvlist_dst_size = BUFSZ; if (out_fn != NULL) { ofd = open(out_fn, O_CREAT | O_TRUNC | O_WRONLY, 0644); if (ofd == -1) { printf("cannot open output file %s: %s\n", out_fn, strerror(errno)); exit(1); } } do { int i; ret = ioctl(fd, ZFS_IOC_ARC_INFO, &cmd); if (ret == -1) { printf("ioctl failed with %d=%s\n", errno, strerror(errno)); exit(1); } aih = buf; ai = buf + sizeof(aih); if (ofd != -1 && !hdr_written) { arc_content_hdr_t ach; memcpy(ach.ach_magic, ARCWATCH_CONTENT_MAGIC, sizeof(ach.ach_magic)); ach.ach_version = ARCWATCH_CONTENT_VERSION; ach.ach_buckets = aih->aih_buckets; ach.ach_buf_locks = aih->aih_buf_locks; ret = write(ofd, &ach, sizeof(ach)); if (ret == -1) { printf("cannot write to output file: %s\n", strerror(errno)); exit(1); } hdr_written = 1; } for (i = 0; i < aih->aih_entries; ++i) { ai = ((arc_info_t *)(aih + 1)) + i; if (g_verbose) { printf("dva %016llx:%016llx birth %7d " "spa %016llx " "size % 8d flags %016x state %s\n", ai->ai_dva.dva_word[0], ai->ai_dva.dva_word[1], ai->ai_birth, ai->ai_spa, ai->ai_size, ai->ai_flags, state2str(ai->ai_state)); } if (awr) { arc_read_node_t search; arc_read_node_t *arn; search.arn_ar.ar_spa = ai->ai_spa; search.arn_ar.ar_dva0 = ai->ai_dva.dva_word[0]; search.arn_ar.ar_dva1 = ai->ai_dva.dva_word[1]; search.arn_ar.ar_birth = ai->ai_birth; arn = avl_find(awr, &search, NULL); if (arn) { arn->arn_color = color; arn->arn_flags = ai->ai_flags; arn->arn_state = ai->ai_state; } } if (ofd != -1) { if (wptr + sizeof(*ai) > sizeof(wbuf)) { ret = write(ofd, wbuf, wptr); if (ret == -1) { printf("cannot write to output " "file: %s\n", strerror(errno)); exit(1); } wptr = 0; } memcpy(wbuf + wptr, ai, sizeof(*ai)); wptr += sizeof(*ai); } } cmd.zc_obj = aih->aih_next; } while (cmd.zc_obj != 0); if (wptr > 0) { ret = write(ofd, wbuf, wptr); if (ret == -1) { printf("cannot write to output " "file: %s\n", strerror(errno)); } exit(1); } close(fd); close(ofd); free(buf); } static void awr_stat(avl_tree_t *awr, uint64_t color) { arc_read_node_t *arn = avl_first(awr); uint64_t bufs_total = 0; uint64_t bufs_in_l1 = 0; uint64_t bufs_in_l1_ghost = 0; uint64_t bufs_in_l2 = 0; uint64_t bytes_total = 0; uint64_t bytes_in_l1 = 0; uint64_t bytes_in_l1_ghost = 0; uint64_t bytes_in_l2 = 0; while (arn) { arc_read_t *ar = &arn->arn_ar; if (g_verbose) { printf("dva %016llx:%016llx birth % 8d " "spa %016llx size % 8d ", ar->ar_dva0, ar->ar_dva1, ar->ar_birth, ar->ar_spa, ar->ar_size, arn->arn_color); if (arn->arn_color == color) printf("flags %016x state %s\n", arn->arn_flags, state2str(arn->arn_state)); else printf("not in ARC\n"); } if (arn->arn_color == color) { if (arn->arn_state == AIS_MRU || arn->arn_state == AIS_MFU) { ++bufs_in_l1; bytes_in_l1 += ar->ar_size; } else if (arn->arn_state == AIS_MRU_GHOST || arn->arn_state == AIS_MFU_GHOST) { ++bufs_in_l1_ghost; bytes_in_l1_ghost =+ ar->ar_size; } if (arn->arn_flags & ARC_FLAG_HAS_L2HDR) { ++bufs_in_l2; bytes_in_l2 += ar->ar_size; } } ++bufs_total; bytes_total += ar->ar_size; arn = AVL_NEXT(awr, arn); } if (g_verbose) { printf("\n"); } printf(" | bufs | bytes\n"); printf("---------+------------+-----------------\n"); printf(" in l1 | % 10lld | %16lld\n", bufs_in_l1, bytes_in_l1); printf("l1 ghost | % 10lld | %16lld\n", bufs_in_l1_ghost, bytes_in_l1_ghost); printf(" in l2 | % 10lld | %16lld\n", bufs_in_l2, bytes_in_l2); printf(" total | % 10lld | %16lld\n", bufs_total, bytes_total); printf("\n"); } static void usage(const char *basename) { (void) fprintf(stderr, "Usage: %s -d [options]\n" " %s {-c | -i} [options] [command [args]]\n\n" "\tOptions:\n" "\t -c run command and record read blocks\n" "\t -i filename read previously recorded output from -o instead\n" "\t of running a command\n" "\t -b bufsize change tracing bufsize\n" "\t -a dump arc\n" "\t -v verbose\n" "\t -w watch decay of buffers in arc\n" "\t -d seconds watch interval\n" "\t -o filename write output to file\n", basename, basename); exit(1); } /* * TODO: compare 2 traces * TODO: compare 2 arc infos * TODO: persistent spa numbering */ int main(int argc, char **argv) { extern char *optarg; extern int optind; int c; char *bufsize = "4m"; int run_cmd = 0; int watch = 0; char *basename; char *out_fn = NULL; char *in_fn = NULL; avl_tree_t awr; uint64_t color = 0; int interval = 10; int dump_arc = 0; char *arc_fn = NULL; avl_create(&awr, awr_cmp, sizeof(arc_read_node_t), offsetof(arc_read_node_t, arn_node)); basename = strrchr(argv[0], '/'); if (basename == NULL) basename = argv[0]; while ((c = getopt(argc, argv, "b:o:i:cvwhd:aI:")) != EOF) { switch(c) { case 'b': bufsize = optarg; break; case 'c': run_cmd = 1; break; case 'w': watch = 1; break; case 'v': ++g_verbose; break; case 'o': out_fn = optarg; break; case 'i': in_fn = optarg; break; case 'I': arc_fn = optarg; break; case 'a': dump_arc = 1; break; case 'd': interval = atoi(optarg); break; case 'h': default: usage(basename); } } if (optind != argc && !run_cmd) { fprintf(stderr, "command given without -c switch\n"); exit(1); } if (dump_arc) { get_arc(NULL, 0, out_fn); exit(0); } if (arc_fn != NULL && !run_cmd && in_fn == NULL) { read_arc(&awr, arc_fn, 1, 1); exit(1); } if (arc_fn != NULL && watch) { fprintf(stderr, "-I given with -w\n"); exit(1); } if (run_cmd && (in_fn != NULL)) { fprintf(stderr, "-i and -c are mutually exclusive\n"); exit(1); } if (run_cmd) { if (optind == argc) { fprintf(stderr, "no command given\n"); exit(1); } run_dtrace(bufsize, out_fn, &awr, argc - optind, argv + optind); } if (in_fn) read_awr(&awr, in_fn); if (watch) { while (1) { get_arc(&awr, ++color, NULL); awr_stat(&awr, color); sleep(10); } } if (arc_fn) { read_arc(&awr, arc_fn, 1, 0); awr_stat(&awr, 1); } exit(0); }