xref: /freebsd/sys/contrib/openzfs/module/zfs/spa_stats.c (revision c6989859ae9388eeb46a24fe88f9b8d07101c710)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 #include <sys/zfs_context.h>
23 #include <sys/spa_impl.h>
24 #include <sys/vdev_impl.h>
25 #include <sys/spa.h>
26 #include <zfs_comutil.h>
27 
28 /*
29  * Keeps stats on last N reads per spa_t, disabled by default.
30  */
31 int zfs_read_history = 0;
32 
33 /*
34  * Include cache hits in history, disabled by default.
35  */
36 int zfs_read_history_hits = 0;
37 
38 /*
39  * Keeps stats on the last 100 txgs by default.
40  */
41 int zfs_txg_history = 100;
42 
43 /*
44  * Keeps stats on the last N MMP updates, disabled by default.
45  */
46 int zfs_multihost_history = 0;
47 
48 /*
49  * ==========================================================================
50  * SPA Read History Routines
51  * ==========================================================================
52  */
53 
54 /*
55  * Read statistics - Information exported regarding each arc_read call
56  */
/*
 * One record in the per-pool read-history procfs list ("reads" file).
 * A node is allocated per tracked arc_read call in spa_read_history_add().
 */
typedef struct spa_read_history {
	hrtime_t	start;		/* time read completed */
	uint64_t	objset;		/* read from this objset */
	uint64_t	object;		/* read of this object number */
	uint64_t	level;		/* block's indirection level */
	uint64_t	blkid;		/* read of this block id */
	char		origin[24];	/* read originated from here */
	uint32_t	aflags;		/* ARC flags (cached, prefetch, etc.) */
	pid_t		pid;		/* PID of task doing read */
	char		comm[16];	/* process name of task doing read */
	procfs_list_node_t	srh_node;	/* linkage into procfs list */
} spa_read_history_t;
69 
/*
 * Emit the column header line for the per-pool "reads" procfs file.
 * Column widths must stay in sync with spa_read_history_show() below.
 */
static int
spa_read_history_show_header(struct seq_file *f)
{
	seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
	    "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
	    "level", "blkid", "aflags", "origin", "pid", "process");

	return (0);
}
79 
/*
 * Format one spa_read_history_t record as a row of the "reads" file.
 * 'data' is the list node handed back by the procfs_list iterator.
 */
static int
spa_read_history_show(struct seq_file *f, void *data)
{
	spa_read_history_t *srh = (spa_read_history_t *)data;

	/* pln_id serves as the unique record id ("UID" column). */
	seq_printf(f, "%-8llu %-16llu 0x%-6llx "
	    "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
	    (u_longlong_t)srh->srh_node.pln_id, srh->start,
	    (longlong_t)srh->objset, (longlong_t)srh->object,
	    (longlong_t)srh->level, (longlong_t)srh->blkid,
	    srh->aflags, srh->origin, srh->pid, srh->comm);

	return (0);
}
94 
95 /* Remove oldest elements from list until there are no more than 'size' left */
96 static void
97 spa_read_history_truncate(spa_history_list_t *shl, unsigned int size)
98 {
99 	spa_read_history_t *srh;
100 	while (shl->size > size) {
101 		srh = list_remove_head(&shl->procfs_list.pl_list);
102 		ASSERT3P(srh, !=, NULL);
103 		kmem_free(srh, sizeof (spa_read_history_t));
104 		shl->size--;
105 	}
106 
107 	if (size == 0)
108 		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
109 }
110 
111 static int
112 spa_read_history_clear(procfs_list_t *procfs_list)
113 {
114 	spa_history_list_t *shl = procfs_list->pl_private;
115 	mutex_enter(&procfs_list->pl_lock);
116 	spa_read_history_truncate(shl, 0);
117 	mutex_exit(&procfs_list->pl_lock);
118 	return (0);
119 }
120 
/*
 * Create the per-pool "reads" procfs file under zfs/<pool>/.
 */
static void
spa_read_history_init(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;
	char *module;

	shl->size = 0;

	module = kmem_asprintf("zfs/%s", spa_name(spa));

	shl->procfs_list.pl_private = shl;
	/* 0600: read history may expose object ids/process names. */
	procfs_list_install(module,
	    "reads",
	    0600,
	    &shl->procfs_list,
	    spa_read_history_show,
	    spa_read_history_show_header,
	    spa_read_history_clear,
	    offsetof(spa_read_history_t, srh_node));

	kmem_strfree(module);
}
143 
/*
 * Tear down the "reads" procfs file.  Uninstall first so no new readers
 * can arrive, then free all records, then destroy the list itself.
 */
static void
spa_read_history_destroy(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;
	procfs_list_uninstall(&shl->procfs_list);
	spa_read_history_truncate(shl, 0);
	procfs_list_destroy(&shl->procfs_list);
}
152 
/*
 * Record one arc_read in the pool's read history.
 *
 * No-op when the feature is disabled and the list is already empty; when
 * zfs_read_history was just set to 0 but records remain, we still enter so
 * the truncate below can drain them.  Cache hits are skipped unless
 * zfs_read_history_hits is set.
 */
void
spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;
	spa_read_history_t *srh;

	ASSERT3P(spa, !=, NULL);
	ASSERT3P(zb,  !=, NULL);

	if (zfs_read_history == 0 && shl->size == 0)
		return;

	if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
		return;

	/* Allocate and fill the record before taking the list lock. */
	srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
	strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
	srh->start  = gethrtime();
	srh->objset = zb->zb_objset;
	srh->object = zb->zb_object;
	srh->level  = zb->zb_level;
	srh->blkid  = zb->zb_blkid;
	srh->aflags = aflags;
	srh->pid    = getpid();

	mutex_enter(&shl->procfs_list.pl_lock);

	procfs_list_add(&shl->procfs_list, srh);
	shl->size++;

	/* Keep at most zfs_read_history records (0 drains the list). */
	spa_read_history_truncate(shl, zfs_read_history);

	mutex_exit(&shl->procfs_list.pl_lock);
}
187 
188 /*
189  * ==========================================================================
190  * SPA TXG History Routines
191  * ==========================================================================
192  */
193 
194 /*
195  * Txg statistics - Information exported regarding each txg sync
196  */
197 
/*
 * One record in the per-pool "txgs" procfs list.  'times' is indexed by
 * txg_state_t and holds the hrtime at which each state was reached
 * (TXG_STATE_COMMITTED itself is not stored, hence the array bound).
 */
typedef struct spa_txg_history {
	uint64_t	txg;		/* txg id */
	txg_state_t	state;		/* active txg state */
	uint64_t	nread;		/* number of bytes read */
	uint64_t	nwritten;	/* number of bytes written */
	uint64_t	reads;		/* number of read operations */
	uint64_t	writes;		/* number of write operations */
	uint64_t	ndirty;		/* number of dirty bytes */
	hrtime_t	times[TXG_STATE_COMMITTED]; /* completion times */
	procfs_list_node_t	sth_node;	/* linkage into procfs list */
} spa_txg_history_t;
209 
/*
 * Emit the column header line for the per-pool "txgs" procfs file.
 * Column widths must stay in sync with spa_txg_history_show() below.
 */
static int
spa_txg_history_show_header(struct seq_file *f)
{
	seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s "
	    "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
	    "ndirty", "nread", "nwritten", "reads", "writes",
	    "otime", "qtime", "wtime", "stime");
	return (0);
}
219 
/*
 * Format one txg record.  The per-stage durations (open/quiesce/wait/sync)
 * are derived as deltas between consecutive state timestamps; a stage whose
 * end timestamp is still 0 (not yet reached) is reported as 0.
 */
static int
spa_txg_history_show(struct seq_file *f, void *data)
{
	spa_txg_history_t *sth = (spa_txg_history_t *)data;
	uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
	char state;

	/* Single-letter state code for the "state" column. */
	switch (sth->state) {
		case TXG_STATE_BIRTH:		state = 'B';	break;
		case TXG_STATE_OPEN:		state = 'O';	break;
		case TXG_STATE_QUIESCED:	state = 'Q';	break;
		case TXG_STATE_WAIT_FOR_SYNC:	state = 'W';	break;
		case TXG_STATE_SYNCED:		state = 'S';	break;
		case TXG_STATE_COMMITTED:	state = 'C';	break;
		default:			state = '?';	break;
	}

	if (sth->times[TXG_STATE_OPEN])
		open = sth->times[TXG_STATE_OPEN] -
		    sth->times[TXG_STATE_BIRTH];

	if (sth->times[TXG_STATE_QUIESCED])
		quiesce = sth->times[TXG_STATE_QUIESCED] -
		    sth->times[TXG_STATE_OPEN];

	if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
		wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
		    sth->times[TXG_STATE_QUIESCED];

	if (sth->times[TXG_STATE_SYNCED])
		sync = sth->times[TXG_STATE_SYNCED] -
		    sth->times[TXG_STATE_WAIT_FOR_SYNC];

	seq_printf(f, "%-8llu %-16llu %-5c %-12llu "
	    "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
	    (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
	    (u_longlong_t)sth->ndirty,
	    (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
	    (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
	    (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
	    (u_longlong_t)sync);

	return (0);
}
264 
265 /* Remove oldest elements from list until there are no more than 'size' left */
266 static void
267 spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size)
268 {
269 	spa_txg_history_t *sth;
270 	while (shl->size > size) {
271 		sth = list_remove_head(&shl->procfs_list.pl_list);
272 		ASSERT3P(sth, !=, NULL);
273 		kmem_free(sth, sizeof (spa_txg_history_t));
274 		shl->size--;
275 	}
276 
277 	if (size == 0)
278 		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
279 
280 }
281 
282 static int
283 spa_txg_history_clear(procfs_list_t *procfs_list)
284 {
285 	spa_history_list_t *shl = procfs_list->pl_private;
286 	mutex_enter(&procfs_list->pl_lock);
287 	spa_txg_history_truncate(shl, 0);
288 	mutex_exit(&procfs_list->pl_lock);
289 	return (0);
290 }
291 
/*
 * Create the per-pool "txgs" procfs file under zfs/<pool>/.
 */
static void
spa_txg_history_init(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	char *module;

	shl->size = 0;

	module = kmem_asprintf("zfs/%s", spa_name(spa));

	shl->procfs_list.pl_private = shl;
	procfs_list_install(module,
	    "txgs",
	    0644,
	    &shl->procfs_list,
	    spa_txg_history_show,
	    spa_txg_history_show_header,
	    spa_txg_history_clear,
	    offsetof(spa_txg_history_t, sth_node));

	kmem_strfree(module);
}
314 
/*
 * Tear down the "txgs" procfs file: uninstall, drain records, destroy.
 */
static void
spa_txg_history_destroy(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	procfs_list_uninstall(&shl->procfs_list);
	spa_txg_history_truncate(shl, 0);
	procfs_list_destroy(&shl->procfs_list);
}
323 
324 /*
325  * Add a new txg to historical record.
326  */
327 void
328 spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
329 {
330 	spa_history_list_t *shl = &spa->spa_stats.txg_history;
331 	spa_txg_history_t *sth;
332 
333 	if (zfs_txg_history == 0 && shl->size == 0)
334 		return;
335 
336 	sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
337 	sth->txg = txg;
338 	sth->state = TXG_STATE_OPEN;
339 	sth->times[TXG_STATE_BIRTH] = birth_time;
340 
341 	mutex_enter(&shl->procfs_list.pl_lock);
342 	procfs_list_add(&shl->procfs_list, sth);
343 	shl->size++;
344 	spa_txg_history_truncate(shl, zfs_txg_history);
345 	mutex_exit(&shl->procfs_list.pl_lock);
346 }
347 
348 /*
349  * Set txg state completion time and increment current state.
350  */
351 int
352 spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
353     hrtime_t completed_time)
354 {
355 	spa_history_list_t *shl = &spa->spa_stats.txg_history;
356 	spa_txg_history_t *sth;
357 	int error = ENOENT;
358 
359 	if (zfs_txg_history == 0)
360 		return (0);
361 
362 	mutex_enter(&shl->procfs_list.pl_lock);
363 	for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
364 	    sth = list_prev(&shl->procfs_list.pl_list, sth)) {
365 		if (sth->txg == txg) {
366 			sth->times[completed_state] = completed_time;
367 			sth->state++;
368 			error = 0;
369 			break;
370 		}
371 	}
372 	mutex_exit(&shl->procfs_list.pl_lock);
373 
374 	return (error);
375 }
376 
377 /*
378  * Set txg IO stats.
379  */
380 static int
381 spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
382     uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
383 {
384 	spa_history_list_t *shl = &spa->spa_stats.txg_history;
385 	spa_txg_history_t *sth;
386 	int error = ENOENT;
387 
388 	if (zfs_txg_history == 0)
389 		return (0);
390 
391 	mutex_enter(&shl->procfs_list.pl_lock);
392 	for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
393 	    sth = list_prev(&shl->procfs_list.pl_list, sth)) {
394 		if (sth->txg == txg) {
395 			sth->nread = nread;
396 			sth->nwritten = nwritten;
397 			sth->reads = reads;
398 			sth->writes = writes;
399 			sth->ndirty = ndirty;
400 			error = 0;
401 			break;
402 		}
403 	}
404 	mutex_exit(&shl->procfs_list.pl_lock);
405 
406 	return (error);
407 }
408 
/*
 * Snapshot pool-wide vdev stats at the start of txg sync and mark the txg
 * as WAIT_FOR_SYNC.  Returns a txg_stat_t to be handed back to
 * spa_txg_history_fini_io(), or NULL when history is disabled.
 */
txg_stat_t *
spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
{
	txg_stat_t *ts;

	if (zfs_txg_history == 0)
		return (NULL);

	ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);

	/* Hold SCL_CONFIG so the vdev tree is stable while sampling. */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	ts->txg = txg;
	ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];

	spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());

	return (ts);
}
430 
/*
 * Take the second vdev-stat snapshot at the end of txg sync, record the
 * deltas (bytes/ops read and written during the sync) plus the dirty byte
 * count, and free 'ts'.  Tolerates history having been disabled between
 * init_io and here by just freeing the snapshot.
 */
void
spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
{
	if (ts == NULL)
		return;

	if (zfs_txg_history == 0) {
		kmem_free(ts, sizeof (txg_stat_t));
		return;
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
	spa_txg_history_set_io(spa, ts->txg,
	    ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
	    ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
	    ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
	    ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
	    ts->ndirty);

	kmem_free(ts, sizeof (txg_stat_t));
}
456 
457 /*
458  * ==========================================================================
459  * SPA TX Assign Histogram Routines
460  * ==========================================================================
461  */
462 
463 /*
464  * Tx statistics - Information exported regarding dmu_tx_assign time.
465  */
466 
467 /*
468  * When the kstat is written zero all buckets.  When the kstat is read
469  * count the number of trailing buckets set to zero and update ks_ndata
470  * such that they are not output.
471  */
472 static int
473 spa_tx_assign_update(kstat_t *ksp, int rw)
474 {
475 	spa_t *spa = ksp->ks_private;
476 	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
477 	int i;
478 
479 	if (rw == KSTAT_WRITE) {
480 		for (i = 0; i < shk->count; i++)
481 			((kstat_named_t *)shk->priv)[i].value.ui64 = 0;
482 	}
483 
484 	for (i = shk->count; i > 0; i--)
485 		if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0)
486 			break;
487 
488 	ksp->ks_ndata = i;
489 	ksp->ks_data_size = i * sizeof (kstat_named_t);
490 
491 	return (0);
492 }
493 
/*
 * Create the per-pool "dmu_tx_assign" kstat: a histogram of dmu_tx_assign
 * latency with power-of-two nanosecond buckets ("1 ns", "2 ns", ...).
 */
static void
spa_tx_assign_init(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
	char *name;
	kstat_named_t *ks;
	kstat_t *ksp;
	int i;

	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);

	shk->count = 42; /* power of two buckets for 1ns to 2,199s */
	shk->size = shk->count * sizeof (kstat_named_t);
	shk->priv = kmem_alloc(shk->size, KM_SLEEP);

	name = kmem_asprintf("zfs/%s", spa_name(spa));

	/* Label each bucket with its upper bound in nanoseconds. */
	for (i = 0; i < shk->count; i++) {
		ks = &((kstat_named_t *)shk->priv)[i];
		ks->data_type = KSTAT_DATA_UINT64;
		ks->value.ui64 = 0;
		(void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
		    (u_longlong_t)1 << i);
	}

	/* VIRTUAL: we supply ks_data ourselves rather than kstat_create. */
	ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
	    KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
	shk->kstat = ksp;

	if (ksp) {
		ksp->ks_lock = &shk->lock;
		ksp->ks_data = shk->priv;
		ksp->ks_ndata = shk->count;
		ksp->ks_data_size = shk->size;
		ksp->ks_private = spa;
		ksp->ks_update = spa_tx_assign_update;
		kstat_install(ksp);
	}
	kmem_strfree(name);
}
534 
535 static void
536 spa_tx_assign_destroy(spa_t *spa)
537 {
538 	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
539 	kstat_t *ksp;
540 
541 	ksp = shk->kstat;
542 	if (ksp)
543 		kstat_delete(ksp);
544 
545 	kmem_free(shk->priv, shk->size);
546 	mutex_destroy(&shk->lock);
547 }
548 
549 void
550 spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
551 {
552 	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
553 	uint64_t idx = 0;
554 
555 	while (((1ULL << idx) < nsecs) && (idx < shk->size - 1))
556 		idx++;
557 
558 	atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64);
559 }
560 
561 /*
562  * ==========================================================================
563  * SPA IO History Routines
564  * ==========================================================================
565  */
566 static int
567 spa_io_history_update(kstat_t *ksp, int rw)
568 {
569 	if (rw == KSTAT_WRITE)
570 		memset(ksp->ks_data, 0, ksp->ks_data_size);
571 
572 	return (0);
573 }
574 
/*
 * Create the per-pool "io" kstat (KSTAT_TYPE_IO disk statistics).
 */
static void
spa_io_history_init(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
	char *name;
	kstat_t *ksp;

	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);

	name = kmem_asprintf("zfs/%s", spa_name(spa));

	ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
	shk->kstat = ksp;

	if (ksp) {
		ksp->ks_lock = &shk->lock;
		ksp->ks_private = spa;
		ksp->ks_update = spa_io_history_update;
		kstat_install(ksp);
	}
	kmem_strfree(name);
}
597 
598 static void
599 spa_io_history_destroy(spa_t *spa)
600 {
601 	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
602 
603 	if (shk->kstat)
604 		kstat_delete(shk->kstat);
605 
606 	mutex_destroy(&shk->lock);
607 }
608 
609 /*
610  * ==========================================================================
611  * SPA MMP History Routines
612  * ==========================================================================
613  */
614 
615 /*
616  * MMP statistics - Information exported regarding attempted MMP writes
617  *   For MMP writes issued, fields used as per comments below.
618  *   For MMP writes skipped, an entry represents a span of time when
619  *      writes were skipped for same reason (error from mmp_random_leaf).
620  *      Differences are:
621  *      timestamp	time first write skipped, if >1 skipped in a row
622  *      mmp_delay	delay value at timestamp
623  *      vdev_guid	number of writes skipped
624  *      io_error	one of enum mmp_error
625  *      duration	time span (ns) of skipped writes
626  */
627 
/*
 * One record in the per-pool "multihost" procfs list.  See the block
 * comment above for how the fields are reinterpreted for skipped writes.
 */
typedef struct spa_mmp_history {
	uint64_t	mmp_node_id;	/* unique # for updates */
	uint64_t	txg;		/* txg of last sync */
	uint64_t	timestamp;	/* UTC time MMP write issued */
	uint64_t	mmp_delay;	/* mmp_thread.mmp_delay at timestamp */
	uint64_t	vdev_guid;	/* unique ID of leaf vdev */
	char		*vdev_path;	/* kmem_strdup'd; freed on truncate */
	int		vdev_label;	/* vdev label */
	int		io_error;	/* error status of MMP write */
	hrtime_t	error_start;	/* hrtime of start of error period */
	hrtime_t	duration;	/* time from submission to completion */
	procfs_list_node_t	smh_node;	/* linkage into procfs list */
} spa_mmp_history_t;
641 
/*
 * Emit the column header line for the per-pool "multihost" procfs file.
 * Column widths must stay in sync with spa_mmp_history_show() below.
 */
static int
spa_mmp_history_show_header(struct seq_file *f)
{
	seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
	    "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
	    "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
	return (0);
}
650 
651 static int
652 spa_mmp_history_show(struct seq_file *f, void *data)
653 {
654 	spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
655 	char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
656 	    "%-10lld %s\n";
657 	char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
658 	    "%-10lld %s\n";
659 
660 	seq_printf(f, (smh->error_start ? skip_fmt : write_fmt),
661 	    (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg,
662 	    (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
663 	    (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
664 	    (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
665 	    (smh->vdev_path ? smh->vdev_path : "-"));
666 
667 	return (0);
668 }
669 
670 /* Remove oldest elements from list until there are no more than 'size' left */
671 static void
672 spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size)
673 {
674 	spa_mmp_history_t *smh;
675 	while (shl->size > size) {
676 		smh = list_remove_head(&shl->procfs_list.pl_list);
677 		if (smh->vdev_path)
678 			kmem_strfree(smh->vdev_path);
679 		kmem_free(smh, sizeof (spa_mmp_history_t));
680 		shl->size--;
681 	}
682 
683 	if (size == 0)
684 		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
685 
686 }
687 
688 static int
689 spa_mmp_history_clear(procfs_list_t *procfs_list)
690 {
691 	spa_history_list_t *shl = procfs_list->pl_private;
692 	mutex_enter(&procfs_list->pl_lock);
693 	spa_mmp_history_truncate(shl, 0);
694 	mutex_exit(&procfs_list->pl_lock);
695 	return (0);
696 }
697 
/*
 * Create the per-pool "multihost" procfs file under zfs/<pool>/.
 */
static void
spa_mmp_history_init(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
	char *module;

	shl->size = 0;

	module = kmem_asprintf("zfs/%s", spa_name(spa));

	shl->procfs_list.pl_private = shl;
	procfs_list_install(module,
	    "multihost",
	    0644,
	    &shl->procfs_list,
	    spa_mmp_history_show,
	    spa_mmp_history_show_header,
	    spa_mmp_history_clear,
	    offsetof(spa_mmp_history_t, smh_node));

	kmem_strfree(module);
}
720 
/*
 * Tear down the "multihost" procfs file: uninstall, drain records, destroy.
 */
static void
spa_mmp_history_destroy(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
	procfs_list_uninstall(&shl->procfs_list);
	spa_mmp_history_truncate(shl, 0);
	procfs_list_destroy(&shl->procfs_list);
}
729 
730 /*
731  * Set duration in existing "skip" record to how long we have waited for a leaf
732  * vdev to become available.
733  *
734  * Important that we start search at the tail of the list where new
735  * records are inserted, so this is normally an O(1) operation.
736  */
737 int
738 spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
739 {
740 	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
741 	spa_mmp_history_t *smh;
742 	int error = ENOENT;
743 
744 	if (zfs_multihost_history == 0 && shl->size == 0)
745 		return (0);
746 
747 	mutex_enter(&shl->procfs_list.pl_lock);
748 	for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
749 	    smh = list_prev(&shl->procfs_list.pl_list, smh)) {
750 		if (smh->mmp_node_id == mmp_node_id) {
751 			ASSERT3U(smh->io_error, !=, 0);
752 			smh->duration = gethrtime() - smh->error_start;
753 			smh->vdev_guid++;
754 			error = 0;
755 			break;
756 		}
757 	}
758 	mutex_exit(&shl->procfs_list.pl_lock);
759 
760 	return (error);
761 }
762 
763 /*
764  * Set MMP write duration and error status in existing record.
765  * See comment re: search order above spa_mmp_history_set_skip().
766  */
767 int
768 spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
769     hrtime_t duration)
770 {
771 	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
772 	spa_mmp_history_t *smh;
773 	int error = ENOENT;
774 
775 	if (zfs_multihost_history == 0 && shl->size == 0)
776 		return (0);
777 
778 	mutex_enter(&shl->procfs_list.pl_lock);
779 	for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
780 	    smh = list_prev(&shl->procfs_list.pl_list, smh)) {
781 		if (smh->mmp_node_id == mmp_node_id) {
782 			ASSERT(smh->io_error == 0);
783 			smh->io_error = io_error;
784 			smh->duration = duration;
785 			error = 0;
786 			break;
787 		}
788 	}
789 	mutex_exit(&shl->procfs_list.pl_lock);
790 
791 	return (error);
792 }
793 
794 /*
795  * Add a new MMP historical record.
796  * error == 0 : a write was issued.
797  * error != 0 : a write was not issued because no leaves were found.
798  */
799 void
800 spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
801     uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
802     int error)
803 {
804 	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
805 	spa_mmp_history_t *smh;
806 
807 	if (zfs_multihost_history == 0 && shl->size == 0)
808 		return;
809 
810 	smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
811 	smh->txg = txg;
812 	smh->timestamp = timestamp;
813 	smh->mmp_delay = mmp_delay;
814 	if (vd) {
815 		smh->vdev_guid = vd->vdev_guid;
816 		if (vd->vdev_path)
817 			smh->vdev_path = kmem_strdup(vd->vdev_path);
818 	}
819 	smh->vdev_label = label;
820 	smh->mmp_node_id = mmp_node_id;
821 
822 	if (error) {
823 		smh->io_error = error;
824 		smh->error_start = gethrtime();
825 		smh->vdev_guid = 1;
826 	}
827 
828 	mutex_enter(&shl->procfs_list.pl_lock);
829 	procfs_list_add(&shl->procfs_list, smh);
830 	shl->size++;
831 	spa_mmp_history_truncate(shl, zfs_multihost_history);
832 	mutex_exit(&shl->procfs_list.pl_lock);
833 }
834 
835 static void *
836 spa_state_addr(kstat_t *ksp, loff_t n)
837 {
838 	if (n == 0)
839 		return (ksp->ks_private);	/* return the spa_t */
840 	return (NULL);
841 }
842 
/*
 * Raw-kstat data callback: render the pool state name into 'buf'.
 */
static int
spa_state_data(char *buf, size_t size, void *data)
{
	spa_t *spa = (spa_t *)data;
	(void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
	return (0);
}
850 
851 /*
852  * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
853  *
854  * This is a lock-less read of the pool's state (unlike using 'zpool', which
855  * can potentially block for seconds).  Because it doesn't block, it can useful
856  * as a pool heartbeat value.
857  */
858 static void
859 spa_state_init(spa_t *spa)
860 {
861 	spa_history_kstat_t *shk = &spa->spa_stats.state;
862 	char *name;
863 	kstat_t *ksp;
864 
865 	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
866 
867 	name = kmem_asprintf("zfs/%s", spa_name(spa));
868 	ksp = kstat_create(name, 0, "state", "misc",
869 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
870 
871 	shk->kstat = ksp;
872 	if (ksp) {
873 		ksp->ks_lock = &shk->lock;
874 		ksp->ks_data = NULL;
875 		ksp->ks_private = spa;
876 		ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
877 		kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
878 		kstat_install(ksp);
879 	}
880 
881 	kmem_strfree(name);
882 }
883 
/*
 * Tear down the "state" kstat created by spa_state_init().
 * NOTE(review): the name ("health") does not match its init counterpart
 * ("state") — presumably historical; renaming would churn callers.
 */
static void
spa_health_destroy(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.state;
	kstat_t *ksp = shk->kstat;
	if (ksp)
		kstat_delete(ksp);

	mutex_destroy(&shk->lock);
}
894 
/*
 * Initial (all-zero) contents of the per-pool "iostats" named kstat.
 * Field order must match the spa_iostats_t definition; writing the kstat
 * resets it to this template in spa_iostats_update().
 */
static spa_iostats_t spa_iostats_template = {
	{ "trim_extents_written",		KSTAT_DATA_UINT64 },
	{ "trim_bytes_written",			KSTAT_DATA_UINT64 },
	{ "trim_extents_skipped",		KSTAT_DATA_UINT64 },
	{ "trim_bytes_skipped",			KSTAT_DATA_UINT64 },
	{ "trim_extents_failed",		KSTAT_DATA_UINT64 },
	{ "trim_bytes_failed",			KSTAT_DATA_UINT64 },
	{ "autotrim_extents_written",		KSTAT_DATA_UINT64 },
	{ "autotrim_bytes_written",		KSTAT_DATA_UINT64 },
	{ "autotrim_extents_skipped",		KSTAT_DATA_UINT64 },
	{ "autotrim_bytes_skipped",		KSTAT_DATA_UINT64 },
	{ "autotrim_extents_failed",		KSTAT_DATA_UINT64 },
	{ "autotrim_bytes_failed",		KSTAT_DATA_UINT64 },
	{ "simple_trim_extents_written",	KSTAT_DATA_UINT64 },
	{ "simple_trim_bytes_written",		KSTAT_DATA_UINT64 },
	{ "simple_trim_extents_skipped",	KSTAT_DATA_UINT64 },
	{ "simple_trim_bytes_skipped",		KSTAT_DATA_UINT64 },
	{ "simple_trim_extents_failed",		KSTAT_DATA_UINT64 },
	{ "simple_trim_bytes_failed",		KSTAT_DATA_UINT64 },
};
915 
/*
 * Atomically add 'val' to one iostats counter.  Expects a local
 * 'spa_iostats_t *iostats' in scope at the call site.  The trailing
 * semicolon was dropped from the expansion: every call site already
 * supplies its own ';', so the old form produced stray empty statements.
 */
#define	SPA_IOSTATS_ADD(stat, val) \
    atomic_add_64(&iostats->stat.value.ui64, (val))
918 
/*
 * Accumulate TRIM counters into the pool's "iostats" kstat, selecting the
 * manual/auto/simple counter group by 'type'.  No-op if the kstat was
 * never created.
 */
void
spa_iostats_trim_add(spa_t *spa, trim_type_t type,
    uint64_t extents_written, uint64_t bytes_written,
    uint64_t extents_skipped, uint64_t bytes_skipped,
    uint64_t extents_failed, uint64_t bytes_failed)
{
	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
	kstat_t *ksp = shk->kstat;
	spa_iostats_t *iostats;

	if (ksp == NULL)
		return;

	iostats = ksp->ks_data;
	if (type == TRIM_TYPE_MANUAL) {
		SPA_IOSTATS_ADD(trim_extents_written, extents_written);
		SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
		SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
		SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
		SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
		SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
	} else if (type == TRIM_TYPE_AUTO) {
		SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
		SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
		SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
		SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
		SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
		SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
	} else {
		SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written);
		SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written);
		SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped);
		SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped);
		SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed);
		SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed);
	}
}
956 
957 static int
958 spa_iostats_update(kstat_t *ksp, int rw)
959 {
960 	if (rw == KSTAT_WRITE) {
961 		memcpy(ksp->ks_data, &spa_iostats_template,
962 		    sizeof (spa_iostats_t));
963 	}
964 
965 	return (0);
966 }
967 
/*
 * Create the per-pool "iostats" named kstat and seed it from the
 * all-zero template.  ks_data is heap-allocated here and freed in
 * spa_iostats_destroy().
 */
static void
spa_iostats_init(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.iostats;

	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);

	char *name = kmem_asprintf("zfs/%s", spa_name(spa));
	kstat_t *ksp = kstat_create(name, 0, "iostats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	shk->kstat = ksp;
	if (ksp) {
		int size = sizeof (spa_iostats_t);
		ksp->ks_lock = &shk->lock;
		ksp->ks_private = spa;
		ksp->ks_update = spa_iostats_update;
		ksp->ks_data = kmem_alloc(size, KM_SLEEP);
		memcpy(ksp->ks_data, &spa_iostats_template, size);
		kstat_install(ksp);
	}

	kmem_strfree(name);
}
993 
/*
 * Tear down the "iostats" kstat, freeing the ks_data buffer allocated in
 * spa_iostats_init().
 */
static void
spa_iostats_destroy(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
	kstat_t *ksp = shk->kstat;
	if (ksp) {
		kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
		kstat_delete(ksp);
	}

	mutex_destroy(&shk->lock);
}
1006 
/*
 * Create all per-pool statistics files/kstats.  Called once per spa_t
 * during pool setup; paired with spa_stats_destroy().
 */
void
spa_stats_init(spa_t *spa)
{
	spa_read_history_init(spa);
	spa_txg_history_init(spa);
	spa_tx_assign_init(spa);
	spa_io_history_init(spa);
	spa_mmp_history_init(spa);
	spa_state_init(spa);
	spa_iostats_init(spa);
}
1018 
/*
 * Tear down everything created by spa_stats_init().  Roughly the reverse
 * of init order (io/mmp diverge slightly; the teardowns are independent).
 */
void
spa_stats_destroy(spa_t *spa)
{
	spa_iostats_destroy(spa);
	spa_health_destroy(spa);
	spa_tx_assign_destroy(spa);
	spa_txg_history_destroy(spa);
	spa_read_history_destroy(spa);
	spa_io_history_destroy(spa);
	spa_mmp_history_destroy(spa);
}
1030 
/* BEGIN CSTYLED */
/* Expose the tunables above as runtime-writable module parameters. */
ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW,
    "Historical statistics for the last N reads");

ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW,
    "Include cache hits in read history");

ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW,
    "Historical statistics for the last N txgs");

ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW,
    "Historical statistics for last N multihost writes");
/* END CSTYLED */
1044