xref: /freebsd/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * Gather top-level ZFS pool and resilver/scan statistics and print using
4  * influxdb line protocol
5  * usage: [options] [pool_name]
6  * where options are:
7  *   --execd, -e           run in telegraf execd input plugin mode, [CR] on
8  *                         stdin causes a sample to be printed and wait for
9  *                         the next [CR]
10  *   --no-histograms, -n   don't print histogram data (reduces cardinality
11  *                         if you don't care about histograms)
12  *   --sum-histogram-buckets, -s sum histogram bucket values
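 *   --signed-int, -i      use a signed integer data type for metric values
 *                         (default is unsigned)
 *   --tags key=value[,...], -t key=value[,...]
 *                         append the given influxdb tag(s) to every
 *                         measurement
 *   --help, -h            print usage and exit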
13  *
14  * To integrate into telegraf use one of:
15  * 1. the `inputs.execd` plugin with the `--execd` option
16  * 2. the `inputs.exec` plugin to simply run with no options
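 *
 * For example, a telegraf.conf stanza along these lines should work (the
 * install path of the binary is illustrative and varies by platform):
 *
 *   [[inputs.execd]]
 *     command = ["/usr/local/libexec/zfs/zpool_influxdb", "--execd"]
 *     signal = "STDIN"
 *
 * or, for periodic collection:
 *
 *   [[inputs.exec]]
 *     commands = ["/usr/local/libexec/zfs/zpool_influxdb"]
 *     data_format = "influx"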
17  *
18  * NOTE: libzfs is an unstable interface. YMMV.
19  *
20  * The design goals of this software include:
21  * + be as lightweight as possible
22  * + reduce the number of external dependencies as far as possible, hence
23  *   there is no dependency on a client library for managing the metric
24  *   collection -- info is printed, KISS
25  * + broken pools or kernel bugs can cause this process to hang in an
26  *   unkillable state. For this reason, it is best to keep the damage limited
27  *   to a small process like zpool_influxdb rather than a larger collector.
28  *
29  * Copyright 2018-2020 Richard Elling
30  *
31  * This software is dual-licensed MIT and CDDL.
32  *
33  * The MIT License (MIT)
34  *
35  * Permission is hereby granted, free of charge, to any person obtaining a copy
36  * of this software and associated documentation files (the "Software"), to deal
37  * in the Software without restriction, including without limitation the rights
38  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
39  * copies of the Software, and to permit persons to whom the Software is
40  * furnished to do so, subject to the following conditions:
41  *
42  * The above copyright notice and this permission notice shall be included in
43  * all copies or substantial portions of the Software.
44  *
45  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
48  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
50  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
51  * SOFTWARE.
52  *
53  * CDDL HEADER START
54  *
55  * The contents of this file are subject to the terms of the
56  * Common Development and Distribution License (the "License").
57  * You may not use this file except in compliance with the License.
58  *
59  * The contents of this file are subject to the terms of the
60  * Common Development and Distribution License Version 1.0 (CDDL-1.0).
61  * You can obtain a copy of the license from the top-level file
62  * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
63  * You may not use this file except in compliance with the license.
64  *
65  * See the License for the specific language governing permissions
66  * and limitations under the License.
67  *
68  * CDDL HEADER END
69  */
70 #include <string.h>
71 #include <getopt.h>
72 #include <stdio.h>
73 #include <stdint.h>
74 #include <inttypes.h>
75 #include <libzfs.h>
76 
77 #define	POOL_MEASUREMENT	"zpool_stats"
78 #define	SCAN_MEASUREMENT	"zpool_scan_stats"
79 #define	VDEV_MEASUREMENT	"zpool_vdev_stats"
80 #define	POOL_LATENCY_MEASUREMENT	"zpool_latency"
81 #define	POOL_QUEUE_MEASUREMENT	"zpool_vdev_queue"
82 #define	MIN_LAT_INDEX	10  /* minimum latency index 10 = 1024ns */
83 #define	POOL_IO_SIZE_MEASUREMENT	"zpool_io_size"
84 #define	MIN_SIZE_INDEX	9  /* minimum size index 9 = 512 bytes */
85 
86 /* global options */
87 int execd_mode = 0;
88 int no_histograms = 0;
89 int sum_histogram_buckets = 0;
90 char metric_data_type = 'u';
91 uint64_t metric_value_mask = UINT64_MAX;
92 uint64_t timestamp = 0;
93 int complained_about_sync = 0;
94 const char *tags = "";
95 
96 typedef int (*stat_printer_f)(nvlist_t *, const char *, const char *);
97 
98 /*
99  * influxdb line protocol rules for escaping are important because the
100  * zpool name can include characters that need to be escaped
101  *
102  * caller is responsible for freeing result
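 *
 * e.g. a pool named "log pool,ssd" (hypothetical) is returned as
 * "log\ pool\,ssd" so it can be embedded safely in a tag value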
103  */
104 static char *
105 escape_string(const char *s)
106 {
107 	const char *c;
108 	char *d;
109 	char *t = (char *)malloc(ZFS_MAX_DATASET_NAME_LEN * 2);
110 	if (t == NULL) {
111 		fprintf(stderr, "error: cannot allocate memory\n");
112 		exit(1);
113 	}
114 
115 	for (c = s, d = t; *c != '\0'; c++, d++) {
116 		switch (*c) {
117 		case ' ':
118 		case ',':
119 		case '=':
120 		case '\\':
121 			*d++ = '\\';
122 			zfs_fallthrough;
123 		default:
124 			*d = *c;
125 		}
126 	}
127 	*d = '\0';
128 	return (t);
129 }
130 
131 /*
132  * print key=value where value is a uint64_t
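 * e.g. print_kv("alloc", 1234) emits "alloc=1234u" by default, or
 * "alloc=1234i" when --signed-int is in effect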
133  */
134 static void
135 print_kv(const char *key, uint64_t value)
136 {
137 	printf("%s=%llu%c", key,
138 	    (u_longlong_t)value & metric_value_mask, metric_data_type);
139 }
140 
141 /*
142  * print_scan_status() prints the details as often seen in the "zpool status"
143  * output. However, unlike the zpool command, which is intended for humans,
144  * this output is suitable for long-term tracking in influxdb.
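 *
 * One line is emitted per sample, e.g. (pool name and values are
 * illustrative; wrapped here for readability):
 *   zpool_scan_stats,function=scrub,name=tank,state=finished
 *       end_ts=1600000000u,errors=0u,...,to_examine=1000000000u 1600000000000000000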
145  * TODO: update to include issued scan data
146  */
147 static int
148 print_scan_status(nvlist_t *nvroot, const char *pool_name)
149 {
150 	uint_t c;
151 	int64_t elapsed;
152 	uint64_t examined, pass_exam, paused_time, paused_ts, rate;
153 	uint64_t remaining_time;
154 	pool_scan_stat_t *ps = NULL;
155 	double pct_done;
156 	const char *const state[DSS_NUM_STATES] = {
157 	    "none", "scanning", "finished", "canceled"};
158 	const char *func;
159 
160 	(void) nvlist_lookup_uint64_array(nvroot,
161 	    ZPOOL_CONFIG_SCAN_STATS,
162 	    (uint64_t **)&ps, &c);
163 
164 	/*
165 	 * ignore if there are no stats
166 	 */
167 	if (ps == NULL)
168 		return (0);
169 
170 	/*
171 	 * return error if state is bogus
172 	 */
173 	if (ps->pss_state >= DSS_NUM_STATES ||
174 	    ps->pss_func >= POOL_SCAN_FUNCS) {
175 		if (complained_about_sync % 1000 == 0) {
176 			fprintf(stderr, "error: cannot decode scan stats: "
177 			    "ZFS is out of sync with compiled zpool_influxdb");
178 			complained_about_sync++;
179 		}
180 		return (1);
181 	}
182 
183 	switch (ps->pss_func) {
184 	case POOL_SCAN_NONE:
185 		func = "none_requested";
186 		break;
187 	case POOL_SCAN_SCRUB:
188 		func = "scrub";
189 		break;
190 	case POOL_SCAN_RESILVER:
191 		func = "resilver";
192 		break;
193 #ifdef POOL_SCAN_REBUILD
194 	case POOL_SCAN_REBUILD:
195 		func = "rebuild";
196 		break;
197 #endif
198 	default:
199 		func = "scan";
200 	}
201 
202 	/* overall progress */
203 	examined = ps->pss_examined ? ps->pss_examined : 1;
204 	pct_done = 0.0;
205 	if (ps->pss_to_examine > 0)
206 		pct_done = 100.0 * examined / ps->pss_to_examine;
207 
208 #ifdef EZFS_SCRUB_PAUSED
209 	paused_ts = ps->pss_pass_scrub_pause;
210 	paused_time = ps->pss_pass_scrub_spent_paused;
211 #else
212 	paused_ts = 0;
213 	paused_time = 0;
214 #endif
215 
216 	/* calculations for this pass */
217 	if (ps->pss_state == DSS_SCANNING) {
218 		elapsed = (int64_t)time(NULL) - (int64_t)ps->pss_pass_start -
219 		    (int64_t)paused_time;
220 		elapsed = (elapsed > 0) ? elapsed : 1;
221 		pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
222 		rate = pass_exam / elapsed;
223 		rate = (rate > 0) ? rate : 1;
224 		remaining_time = (ps->pss_to_examine - examined) / rate;
225 	} else {
226 		elapsed =
227 		    (int64_t)ps->pss_end_time - (int64_t)ps->pss_pass_start -
228 		    (int64_t)paused_time;
229 		elapsed = (elapsed > 0) ? elapsed : 1;
230 		pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
231 		rate = pass_exam / elapsed;
232 		remaining_time = 0;
233 	}
234 	rate = rate ? rate : 1;
235 
236 	/* influxdb line protocol format: "tags metrics timestamp" */
237 	printf("%s%s,function=%s,name=%s,state=%s ",
238 	    SCAN_MEASUREMENT, tags, func, pool_name, state[ps->pss_state]);
239 	print_kv("end_ts", ps->pss_end_time);
240 	print_kv(",errors", ps->pss_errors);
241 	print_kv(",examined", examined);
242 	print_kv(",skipped", ps->pss_skipped);
243 	print_kv(",issued", ps->pss_issued);
244 	print_kv(",pass_examined", pass_exam);
245 	print_kv(",pass_issued", ps->pss_pass_issued);
246 	print_kv(",paused_ts", paused_ts);
247 	print_kv(",paused_t", paused_time);
248 	printf(",pct_done=%.2f", pct_done);
249 	print_kv(",processed", ps->pss_processed);
250 	print_kv(",rate", rate);
251 	print_kv(",remaining_t", remaining_time);
252 	print_kv(",start_ts", ps->pss_start_time);
253 	print_kv(",to_examine", ps->pss_to_examine);
254 	printf(" %llu\n", (u_longlong_t)timestamp);
255 	return (0);
256 }
257 
258 /*
259  * get a vdev name that corresponds to the top-level vdev names
260  * printed by `zpool status`
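 *
 * e.g. "root" for the root vdev, "root/mirror-0" for its first mirror
 * child, "root/mirror-0/disk-1" for a leaf below that, and so on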
261  */
262 static char *
263 get_vdev_name(nvlist_t *nvroot, const char *parent_name)
264 {
265 	static char vdev_name[256];
266 	uint64_t vdev_id = 0;
267 
268 	const char *vdev_type = "unknown";
269 	(void) nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, &vdev_type);
270 
271 	if (nvlist_lookup_uint64(
272 	    nvroot, ZPOOL_CONFIG_ID, &vdev_id) != 0)
273 		vdev_id = UINT64_MAX;
274 
275 	if (parent_name == NULL) {
276 		(void) snprintf(vdev_name, sizeof (vdev_name), "%s",
277 		    vdev_type);
278 	} else {
279 		(void) snprintf(vdev_name, sizeof (vdev_name),
280 		    "%.220s/%s-%llu",
281 		    parent_name, vdev_type, (u_longlong_t)vdev_id);
282 	}
283 	return (vdev_name);
284 }
285 
286 /*
287  * get a string suitable for an influxdb tag that describes this vdev
288  *
289  * By default only the vdev hierarchical name is shown, separated by '/'
290  * If the vdev has an associated path, which is typical of leaf vdevs,
291  * then the path is added.
292  * It would be nice to have the devid instead of the path, but under
293  * Linux we cannot be sure a devid will exist and we'd rather have
294  * something than nothing, so we'll use path instead.
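 *
 * e.g. "vdev=root/mirror-0" for an interior vdev, or something like
 * "path=/dev/sda1,vdev=root/mirror-0/disk-0" for a leaf with a path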
295  */
296 static char *
get_vdev_desc(nvlist_t * nvroot,const char * parent_name)297 get_vdev_desc(nvlist_t *nvroot, const char *parent_name)
298 {
299 	static char vdev_desc[2 * MAXPATHLEN];
300 	char vdev_value[MAXPATHLEN];
301 	char *s, *t;
302 
303 	const char *vdev_type = "unknown";
304 	uint64_t vdev_id = UINT64_MAX;
305 	const char *vdev_path = NULL;
306 	(void) nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, &vdev_type);
307 	(void) nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_ID, &vdev_id);
308 	(void) nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &vdev_path);
309 
310 	if (parent_name == NULL) {
311 		s = escape_string(vdev_type);
312 		(void) snprintf(vdev_value, sizeof (vdev_value), "vdev=%s", s);
313 		free(s);
314 	} else {
315 		s = escape_string((char *)parent_name);
316 		t = escape_string(vdev_type);
317 		(void) snprintf(vdev_value, sizeof (vdev_value),
318 		    "vdev=%s/%s-%llu", s, t, (u_longlong_t)vdev_id);
319 		free(s);
320 		free(t);
321 	}
322 	if (vdev_path == NULL) {
323 		(void) snprintf(vdev_desc, sizeof (vdev_desc), "%s",
324 		    vdev_value);
325 	} else {
326 		s = escape_string(vdev_path);
327 		(void) snprintf(vdev_desc, sizeof (vdev_desc), "path=%s,%s",
328 		    s, vdev_value);
329 		free(s);
330 	}
331 	return (vdev_desc);
332 }
333 
334 /*
335  * vdev summary stats are a combination of the data shown by
336  * `zpool status` and `zpool list -v`
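 *
 * e.g. (illustrative, wrapped for readability):
 *   zpool_stats,name=tank,state=ONLINE,vdev=root
 *       alloc=1234u,free=5678u,...,fragmentation=3u 1600000000000000000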
337  */
338 static int
339 print_summary_stats(nvlist_t *nvroot, const char *pool_name,
340     const char *parent_name)
341 {
342 	uint_t c;
343 	vdev_stat_t *vs;
344 	char *vdev_desc = NULL;
345 	vdev_desc = get_vdev_desc(nvroot, parent_name);
346 	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
347 	    (uint64_t **)&vs, &c) != 0) {
348 		return (1);
349 	}
350 	printf("%s%s,name=%s,state=%s,%s ", POOL_MEASUREMENT, tags,
351 	    pool_name, zpool_state_to_name((vdev_state_t)vs->vs_state,
352 	    (vdev_aux_t)vs->vs_aux), vdev_desc);
353 	print_kv("alloc", vs->vs_alloc);
354 	print_kv(",free", vs->vs_space - vs->vs_alloc);
355 	print_kv(",size", vs->vs_space);
356 	print_kv(",read_bytes", vs->vs_bytes[ZIO_TYPE_READ]);
357 	print_kv(",read_errors", vs->vs_read_errors);
358 	print_kv(",read_ops", vs->vs_ops[ZIO_TYPE_READ]);
359 	print_kv(",write_bytes", vs->vs_bytes[ZIO_TYPE_WRITE]);
360 	print_kv(",write_errors", vs->vs_write_errors);
361 	print_kv(",write_ops", vs->vs_ops[ZIO_TYPE_WRITE]);
362 	print_kv(",checksum_errors", vs->vs_checksum_errors);
363 	print_kv(",fragmentation", vs->vs_fragmentation);
364 	printf(" %llu\n", (u_longlong_t)timestamp);
365 	return (0);
366 }
367 
368 /*
369  * vdev latency stats are histograms stored as nvlist arrays of uint64.
370  * Latency stats include the ZIO scheduler classes plus lower-level
371  * vdev latencies.
372  *
373  * In many cases, the top-level "root" view obscures the underlying
374  * top-level vdev operations. For example, if a pool has a log, special,
375  * or cache device, then each can behave very differently. It is useful
376  * to see how each is responding.
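 *
 * One line is printed per histogram bucket, tagged with the bucket's
 * upper bound in seconds ("le"), e.g. (illustrative, wrapped):
 *   zpool_latency,le=0.000001,name=tank,vdev=root
 *       total_read=12u,total_write=34u,... 1600000000000000000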
377  */
378 static int
379 print_vdev_latency_stats(nvlist_t *nvroot, const char *pool_name,
380     const char *parent_name)
381 {
382 	uint_t c, end = 0;
383 	nvlist_t *nv_ex;
384 	char *vdev_desc = NULL;
385 
386 	/* short_names become part of the metric name and are influxdb-ready */
387 	struct lat_lookup {
388 	    const char *name;
389 	    const char *short_name;
390 	    uint64_t sum;
391 	    uint64_t *array;
392 	};
393 	struct lat_lookup lat_type[] = {
394 	    {ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,   "total_read", 0},
395 	    {ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,   "total_write", 0},
396 	    {ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,  "disk_read", 0},
397 	    {ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,  "disk_write", 0},
398 	    {ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,  "sync_read", 0},
399 	    {ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,  "sync_write", 0},
400 	    {ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, "async_read", 0},
401 	    {ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, "async_write", 0},
402 	    {ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,   "scrub", 0},
403 #ifdef ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO
404 	    {ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,    "trim", 0},
405 #endif
406 	    {ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO,    "rebuild", 0},
407 	    {NULL,	NULL}
408 	};
409 
410 	if (nvlist_lookup_nvlist(nvroot,
411 	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
412 		return (6);
413 	}
414 
415 	vdev_desc = get_vdev_desc(nvroot, parent_name);
416 
417 	for (int i = 0; lat_type[i].name; i++) {
418 		if (nvlist_lookup_uint64_array(nv_ex,
419 		    lat_type[i].name, &lat_type[i].array, &c) != 0) {
420 			fprintf(stderr, "error: can't get %s\n",
421 			    lat_type[i].name);
422 			return (3);
423 		}
424 		/* remember the end count; all of the arrays are the same size */
425 		end = c - 1;
426 	}
427 
428 	for (int bucket = 0; bucket <= end; bucket++) {
429 		if (bucket < MIN_LAT_INDEX) {
430 			/* don't print, but collect the sum */
431 			for (int i = 0; lat_type[i].name; i++) {
432 				lat_type[i].sum += lat_type[i].array[bucket];
433 			}
434 			continue;
435 		}
436 		if (bucket < end) {
437 			printf("%s%s,le=%0.6f,name=%s,%s ",
438 			    POOL_LATENCY_MEASUREMENT, tags,
439 			    (float)(1ULL << bucket) * 1e-9,
440 			    pool_name, vdev_desc);
441 		} else {
442 			printf("%s%s,le=+Inf,name=%s,%s ",
443 			    POOL_LATENCY_MEASUREMENT, tags, pool_name,
444 			    vdev_desc);
445 		}
446 		for (int i = 0; lat_type[i].name; i++) {
447 			if (bucket <= MIN_LAT_INDEX || sum_histogram_buckets) {
448 				lat_type[i].sum += lat_type[i].array[bucket];
449 			} else {
450 				lat_type[i].sum = lat_type[i].array[bucket];
451 			}
452 			print_kv(lat_type[i].short_name, lat_type[i].sum);
453 			if (lat_type[i + 1].name != NULL) {
454 				printf(",");
455 			}
456 		}
457 		printf(" %llu\n", (u_longlong_t)timestamp);
458 	}
459 	return (0);
460 }
461 
462 /*
463  * vdev request size stats are histograms stored as nvlist arrays of uint64.
464  * Request size stats include the ZIO scheduler classes plus lower-level
465  * vdev sizes. Both independent (ind) and aggregated (agg) sizes are reported.
466  *
467  * In many cases, the top-level "root" view obscures the underlying
468  * top-level vdev operations. For example, if a pool has a log, special,
469  * or cache device, then each can behave very differently. It is useful
470  * to see how each is responding.
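 *
 * One line is printed per histogram bucket, tagged with the bucket's
 * upper bound in bytes ("le"), e.g. (illustrative, wrapped):
 *   zpool_io_size,le=512,name=tank,vdev=root
 *       sync_read_ind=12u,sync_write_ind=34u,... 1600000000000000000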
471  */
472 static int
473 print_vdev_size_stats(nvlist_t *nvroot, const char *pool_name,
474     const char *parent_name)
475 {
476 	uint_t c, end = 0;
477 	nvlist_t *nv_ex;
478 	char *vdev_desc = NULL;
479 
480 	/* short_names become the field name */
481 	struct size_lookup {
482 	    const char *name;
483 	    const char *short_name;
484 	    uint64_t sum;
485 	    uint64_t *array;
486 	};
487 	struct size_lookup size_type[] = {
488 	    {ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,   "sync_read_ind"},
489 	    {ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,   "sync_write_ind"},
490 	    {ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,  "async_read_ind"},
491 	    {ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,  "async_write_ind"},
492 	    {ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,    "scrub_read_ind"},
493 	    {ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,   "sync_read_agg"},
494 	    {ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,   "sync_write_agg"},
495 	    {ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,  "async_read_agg"},
496 	    {ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,  "async_write_agg"},
497 	    {ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,    "scrub_read_agg"},
498 #ifdef ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO
499 	    {ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,    "trim_write_ind"},
500 	    {ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,    "trim_write_agg"},
501 #endif
502 	    {ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO,    "rebuild_write_ind"},
503 	    {ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO,    "rebuild_write_agg"},
504 	    {NULL,	NULL}
505 	};
506 
507 	if (nvlist_lookup_nvlist(nvroot,
508 	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
509 		return (6);
510 	}
511 
512 	vdev_desc = get_vdev_desc(nvroot, parent_name);
513 
514 	for (int i = 0; size_type[i].name; i++) {
515 		if (nvlist_lookup_uint64_array(nv_ex, size_type[i].name,
516 		    &size_type[i].array, &c) != 0) {
517 			fprintf(stderr, "error: can't get %s\n",
518 			    size_type[i].name);
519 			return (3);
520 		}
521 		/* remember the end count; all of the arrays are the same size */
522 		end = c - 1;
523 	}
524 
525 	for (int bucket = 0; bucket <= end; bucket++) {
526 		if (bucket < MIN_SIZE_INDEX) {
527 			/* don't print, but collect the sum */
528 			for (int i = 0; size_type[i].name; i++) {
529 				size_type[i].sum += size_type[i].array[bucket];
530 			}
531 			continue;
532 		}
533 
534 		if (bucket < end) {
535 			printf("%s%s,le=%llu,name=%s,%s ",
536 			    POOL_IO_SIZE_MEASUREMENT, tags, 1ULL << bucket,
537 			    pool_name, vdev_desc);
538 		} else {
539 			printf("%s%s,le=+Inf,name=%s,%s ",
540 			    POOL_IO_SIZE_MEASUREMENT, tags, pool_name,
541 			    vdev_desc);
542 		}
543 		for (int i = 0; size_type[i].name; i++) {
544 			if (bucket <= MIN_SIZE_INDEX || sum_histogram_buckets) {
545 				size_type[i].sum += size_type[i].array[bucket];
546 			} else {
547 				size_type[i].sum = size_type[i].array[bucket];
548 			}
549 			print_kv(size_type[i].short_name, size_type[i].sum);
550 			if (size_type[i + 1].name != NULL) {
551 				printf(",");
552 			}
553 		}
554 		printf(" %llu\n", (u_longlong_t)timestamp);
555 	}
556 	return (0);
557 }
558 
559 /*
560  * ZIO scheduler queue stats are stored as gauges. This is unfortunate
561  * because the values can change very rapidly and any point-in-time
562  * value will quickly be obsoleted. It is also not easy to downsample.
563  * Thus only the top-level queue stats might be beneficial... maybe.
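 *
 * e.g. (illustrative, wrapped):
 *   zpool_vdev_queue,name=tank,vdev=root
 *       sync_r_active=0u,sync_w_active=2u,... 1600000000000000000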
564  */
565 static int
566 print_queue_stats(nvlist_t *nvroot, const char *pool_name,
567     const char *parent_name)
568 {
569 	nvlist_t *nv_ex;
570 	uint64_t value;
571 
572 	/* short_names are used for the field name */
573 	struct queue_lookup {
574 	    const char *name;
575 	    const char *short_name;
576 	};
577 	struct queue_lookup queue_type[] = {
578 	    {ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,	"sync_r_active"},
579 	    {ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,	"sync_w_active"},
580 	    {ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,	"async_r_active"},
581 	    {ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,	"async_w_active"},
582 	    {ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,	"async_scrub_active"},
583 	    {ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE,	"rebuild_active"},
584 	    {ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,	"sync_r_pend"},
585 	    {ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,	"sync_w_pend"},
586 	    {ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,	"async_r_pend"},
587 	    {ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,	"async_w_pend"},
588 	    {ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,	"async_scrub_pend"},
589 	    {ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE,	"rebuild_pend"},
590 	    {NULL,	NULL}
591 	};
592 
593 	if (nvlist_lookup_nvlist(nvroot,
594 	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
595 		return (6);
596 	}
597 
598 	printf("%s%s,name=%s,%s ", POOL_QUEUE_MEASUREMENT, tags, pool_name,
599 	    get_vdev_desc(nvroot, parent_name));
600 	for (int i = 0; queue_type[i].name; i++) {
601 		if (nvlist_lookup_uint64(nv_ex,
602 		    queue_type[i].name, &value) != 0) {
603 			fprintf(stderr, "error: can't get %s\n",
604 			    queue_type[i].name);
605 			return (3);
606 		}
607 		print_kv(queue_type[i].short_name, value);
608 		if (queue_type[i + 1].name != NULL) {
609 			printf(",");
610 		}
611 	}
612 	printf(" %llu\n", (u_longlong_t)timestamp);
613 	return (0);
614 }
615 
616 /*
617  * top-level vdev stats are at the pool level
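 *
 * e.g. (illustrative, wrapped):
 *   zpool_vdev_stats,name=tank,vdev=root
 *       sync_r_active_queue=0u,...,rebuild_pend_queue=0u 1600000000000000000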
618  */
619 static int
620 print_top_level_vdev_stats(nvlist_t *nvroot, const char *pool_name)
621 {
622 	nvlist_t *nv_ex;
623 	uint64_t value;
624 
625 	/* short_names become part of the metric name */
626 	struct queue_lookup {
627 	    const char *name;
628 	    const char *short_name;
629 	};
630 	struct queue_lookup queue_type[] = {
631 	    {ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, "sync_r_active_queue"},
632 	    {ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, "sync_w_active_queue"},
633 	    {ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, "async_r_active_queue"},
634 	    {ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, "async_w_active_queue"},
635 	    {ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, "async_scrub_active_queue"},
636 	    {ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, "rebuild_active_queue"},
637 	    {ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, "sync_r_pend_queue"},
638 	    {ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, "sync_w_pend_queue"},
639 	    {ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, "async_r_pend_queue"},
640 	    {ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, "async_w_pend_queue"},
641 	    {ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, "async_scrub_pend_queue"},
642 	    {ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, "rebuild_pend_queue"},
643 	    {NULL, NULL}
644 	};
645 
646 	if (nvlist_lookup_nvlist(nvroot,
647 	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
648 		return (6);
649 	}
650 
651 	printf("%s%s,name=%s,vdev=root ", VDEV_MEASUREMENT, tags,
652 	    pool_name);
653 	for (int i = 0; queue_type[i].name; i++) {
654 		if (nvlist_lookup_uint64(nv_ex,
655 		    queue_type[i].name, &value) != 0) {
656 			fprintf(stderr, "error: can't get %s\n",
657 			    queue_type[i].name);
658 			return (3);
659 		}
660 		if (i > 0)
661 			printf(",");
662 		print_kv(queue_type[i].short_name, value);
663 	}
664 
665 	printf(" %llu\n", (u_longlong_t)timestamp);
666 	return (0);
667 }
668 
669 /*
670  * recursive stats printer
671  */
672 static int
673 print_recursive_stats(stat_printer_f func, nvlist_t *nvroot,
674     const char *pool_name, const char *parent_name, int descend)
675 {
676 	uint_t c, children;
677 	nvlist_t **child;
678 	char vdev_name[256];
679 	int err;
680 
681 	err = func(nvroot, pool_name, parent_name);
682 	if (err)
683 		return (err);
684 
685 	if (descend && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
686 	    &child, &children) == 0) {
687 		(void) strlcpy(vdev_name, get_vdev_name(nvroot, parent_name),
688 		    sizeof (vdev_name));
689 
690 		for (c = 0; c < children; c++) {
691 			err = print_recursive_stats(func, child[c], pool_name,
692 			    vdev_name, descend);
693 			if (err)
694 				return (err);
695 		}
696 	}
697 	return (0);
698 }
699 
700 /*
701  * call-back to print the stats from the pool config
702  *
703  * Note: if the pool is broken, this can hang indefinitely and perhaps in an
704  * unkillable state.
705  */
706 static int
707 print_stats(zpool_handle_t *zhp, void *data)
708 {
709 	uint_t c;
710 	int err;
711 	boolean_t missing;
712 	nvlist_t *config, *nvroot;
713 	vdev_stat_t *vs;
714 	struct timespec tv;
715 	char *pool_name;
716 
717 	/* if not this pool return quickly */
718 	if (data &&
719 	    strncmp(data, zpool_get_name(zhp), ZFS_MAX_DATASET_NAME_LEN) != 0) {
720 		zpool_close(zhp);
721 		return (0);
722 	}
723 
724 	if (zpool_refresh_stats(zhp, &missing) != 0) {
725 		zpool_close(zhp);
726 		return (1);
727 	}
728 
729 	config = zpool_get_config(zhp, NULL);
730 	if (clock_gettime(CLOCK_REALTIME, &tv) != 0)
731 		timestamp = (uint64_t)time(NULL) * 1000000000;
732 	else
733 		timestamp =
734 		    ((uint64_t)tv.tv_sec * 1000000000) + (uint64_t)tv.tv_nsec;
735 
736 	if (nvlist_lookup_nvlist(
737 	    config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) {
738 		zpool_close(zhp);
739 		return (2);
740 	}
741 	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
742 	    (uint64_t **)&vs, &c) != 0) {
743 		zpool_close(zhp);
744 		return (3);
745 	}
746 
747 	pool_name = escape_string(zpool_get_name(zhp));
748 	err = print_recursive_stats(print_summary_stats, nvroot,
749 	    pool_name, NULL, 1);
750 	/* if any of these return an error, skip the rest */
751 	if (err == 0)
752 		err = print_top_level_vdev_stats(nvroot, pool_name);
753 
754 	if (no_histograms == 0) {
755 	if (err == 0)
756 		err = print_recursive_stats(print_vdev_latency_stats, nvroot,
757 		    pool_name, NULL, 1);
758 	if (err == 0)
759 		err = print_recursive_stats(print_vdev_size_stats, nvroot,
760 		    pool_name, NULL, 1);
761 	if (err == 0)
762 		err = print_recursive_stats(print_queue_stats, nvroot,
763 		    pool_name, NULL, 0);
764 	}
765 	if (err == 0)
766 		err = print_scan_status(nvroot, pool_name);
767 
768 	free(pool_name);
769 	zpool_close(zhp);
770 	return (err);
771 }
772 
773 static void
774 usage(char *name)
775 {
776 	fprintf(stderr, "usage: %s [--execd] [--no-histograms]"
777 	    " [--sum-histogram-buckets] [--signed-int] [poolname]\n", name);
778 	exit(EXIT_FAILURE);
779 }
780 
781 int
782 main(int argc, char *argv[])
783 {
784 	int opt;
785 	int ret = 8;
786 	char *line = NULL, *ttags = NULL;
787 	size_t len = 0, tagslen = 0;
788 	struct option long_options[] = {
789 	    {"execd", no_argument, NULL, 'e'},
790 	    {"help", no_argument, NULL, 'h'},
791 	    {"no-histograms", no_argument, NULL, 'n'},
792 	    {"signed-int", no_argument, NULL, 'i'},
793 	    {"sum-histogram-buckets", no_argument, NULL, 's'},
794 	    {"tags", required_argument, NULL, 't'},
795 	    {0, 0, 0, 0}
796 	};
797 	while ((opt = getopt_long(
798 	    argc, argv, "ehinst:", long_options, NULL)) != -1) {
799 		switch (opt) {
800 		case 'e':
801 			execd_mode = 1;
802 			break;
803 		case 'i':
804 			metric_data_type = 'i';
805 			metric_value_mask = INT64_MAX;
806 			break;
807 		case 'n':
808 			no_histograms = 1;
809 			break;
810 		case 's':
811 			sum_histogram_buckets = 1;
812 			break;
813 		case 't':
814 			free(ttags);
815 			tagslen = strlen(optarg) + 2;
816 			ttags = calloc(1, tagslen);
817 			if (ttags == NULL) {
818 				fprintf(stderr,
819 				    "error: cannot allocate memory "
820 				    "for tags\n");
821 				exit(1);
822 			}
823 			(void) snprintf(ttags, tagslen, ",%s", optarg);
824 			tags = ttags;
825 			break;
826 		default:
827 			usage(argv[0]);
828 		}
829 	}
830 
831 	libzfs_handle_t *g_zfs;
832 	if ((g_zfs = libzfs_init()) == NULL) {
833 		fprintf(stderr,
834 		    "error: cannot initialize libzfs. "
835 		    "Is the zfs module loaded or zrepl running?\n");
836 		exit(EXIT_FAILURE);
837 	}
838 	if (execd_mode == 0) {
839 		ret = zpool_iter(g_zfs, print_stats, argv[optind]);
840 		return (ret);
841 	}
842 	while (getline(&line, &len, stdin) != -1) {
843 		ret = zpool_iter(g_zfs, print_stats, argv[optind]);
844 		fflush(stdout);
845 	}
846 	return (ret);
847 }
848