1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 #include <sys/zfs_context.h> 24 #include <sys/spa_impl.h> 25 #include <sys/vdev_impl.h> 26 #include <sys/spa.h> 27 #include <zfs_comutil.h> 28 29 /* 30 * Keeps stats on last N reads per spa_t, disabled by default. 31 */ 32 static uint_t zfs_read_history = B_FALSE; 33 34 /* 35 * Include cache hits in history, disabled by default. 36 */ 37 static int zfs_read_history_hits = B_FALSE; 38 39 /* 40 * Keeps stats on the last 100 txgs by default. 41 */ 42 static uint_t zfs_txg_history = 100; 43 44 /* 45 * Keeps stats on the last N MMP updates, disabled by default. 46 */ 47 static uint_t zfs_multihost_history = B_FALSE; 48 49 /* 50 * ========================================================================== 51 * SPA Read History Routines 52 * ========================================================================== 53 */ 54 55 /* 56 * Read statistics - Information exported regarding each arc_read call 57 */ 58 typedef struct spa_read_history { 59 hrtime_t start; /* time read completed */ 60 uint64_t objset; /* read from this objset */ 61 uint64_t object; /* read of this object number */ 62 uint64_t level; /* block's indirection level */ 63 uint64_t blkid; /* read of this block id */ 64 char origin[24]; /* read originated from here */ 65 uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ 66 pid_t pid; /* PID of task doing read */ 67 char comm[16]; /* process name of task doing read */ 68 procfs_list_node_t srh_node; 69 } spa_read_history_t; 70 71 static int 72 spa_read_history_show_header(struct seq_file *f) 73 { 74 seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " 75 "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", 76 "level", "blkid", "aflags", "origin", "pid", "process"); 77 78 return (0); 79 } 80 81 static int 82 spa_read_history_show(struct seq_file *f, void *data) 83 { 84 spa_read_history_t *srh = (spa_read_history_t *)data; 85 86 seq_printf(f, "%-8llu %-16llu 0x%-6llx " 87 "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", 88 (u_longlong_t)srh->srh_node.pln_id, srh->start, 89 (longlong_t)srh->objset, (longlong_t)srh->object, 90 (longlong_t)srh->level, (longlong_t)srh->blkid, 91 srh->aflags, srh->origin, srh->pid, srh->comm); 92 93 return (0); 94 } 95 96 /* Remove oldest elements from list until there are no more than 'size' left */ 97 static void 98 spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) 99 { 100 spa_read_history_t *srh; 101 while (shl->size > size) { 102 srh = list_remove_head(&shl->procfs_list.pl_list); 103 ASSERT3P(srh, !=, NULL); 104 kmem_free(srh, sizeof (spa_read_history_t)); 105 shl->size--; 106 } 107 108 if (size == 0) 109 ASSERT(list_is_empty(&shl->procfs_list.pl_list)); 110 } 111 112 static int 113 spa_read_history_clear(procfs_list_t *procfs_list) 114 { 115 spa_history_list_t *shl = procfs_list->pl_private; 116 mutex_enter(&procfs_list->pl_lock); 117 spa_read_history_truncate(shl, 0); 118 mutex_exit(&procfs_list->pl_lock); 119 return (0); 120 } 121 122 static void 123 spa_read_history_init(spa_t *spa) 124 { 125 spa_history_list_t *shl = &spa->spa_stats.read_history; 126 127 shl->size = 0; 128 shl->procfs_list.pl_private = shl; 129 procfs_list_install("zfs", 130 spa_name(spa), 131 "reads", 132 0600, 133 &shl->procfs_list, 134 spa_read_history_show, 135 spa_read_history_show_header, 136 spa_read_history_clear, 137 offsetof(spa_read_history_t, srh_node)); 138 } 139 140 static void 141 spa_read_history_destroy(spa_t *spa) 142 { 143 spa_history_list_t *shl = &spa->spa_stats.read_history; 144 procfs_list_uninstall(&shl->procfs_list); 145 spa_read_history_truncate(shl, 0); 146 procfs_list_destroy(&shl->procfs_list); 147 } 148 149 void 150 spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) 151 { 152 spa_history_list_t *shl = &spa->spa_stats.read_history; 153 spa_read_history_t *srh; 154 155 ASSERT3P(spa, !=, NULL); 156 ASSERT3P(zb, !=, NULL); 157 158 if (zfs_read_history == 0 && shl->size == 0) 159 return; 160 161 if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) 162 return; 163 164 srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); 165 strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); 166 srh->start = gethrtime(); 167 srh->objset = zb->zb_objset; 168 srh->object = zb->zb_object; 169 srh->level = zb->zb_level; 170 srh->blkid = zb->zb_blkid; 171 srh->aflags = aflags; 172 srh->pid = getpid(); 173 174 mutex_enter(&shl->procfs_list.pl_lock); 175 176 procfs_list_add(&shl->procfs_list, srh); 177 shl->size++; 178 179 spa_read_history_truncate(shl, zfs_read_history); 180 181 mutex_exit(&shl->procfs_list.pl_lock); 182 } 183 184 /* 185 * ========================================================================== 186 * SPA TXG History Routines 187 * ========================================================================== 188 */ 189 190 /* 191 * Txg statistics - Information exported regarding each txg sync 192 */ 193 194 typedef struct spa_txg_history { 195 uint64_t txg; /* txg id */ 196 txg_state_t state; /* active txg state */ 197 uint64_t nread; /* number of bytes read */ 198 uint64_t nwritten; /* number of bytes written */ 199 uint64_t reads; /* number of read operations */ 200 uint64_t writes; /* number of write operations */ 201 uint64_t ndirty; /* number of dirty bytes */ 202 hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ 203 procfs_list_node_t sth_node; 204 } spa_txg_history_t; 205 206 static int 207 spa_txg_history_show_header(struct seq_file *f) 208 { 209 seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s " 210 "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", 211 "ndirty", "nread", "nwritten", "reads", "writes", 212 "otime", "qtime", "wtime", "stime"); 213 return (0); 214 } 215 216 static int 217 spa_txg_history_show(struct seq_file *f, void *data) 218 { 219 spa_txg_history_t *sth = (spa_txg_history_t *)data; 220 uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; 221 char state; 222 223 switch (sth->state) { 224 case TXG_STATE_BIRTH: state = 'B'; break; 225 case TXG_STATE_OPEN: state = 'O'; break; 226 case TXG_STATE_QUIESCED: state = 'Q'; break; 227 case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; 228 case TXG_STATE_SYNCED: state = 'S'; break; 229 case TXG_STATE_COMMITTED: state = 'C'; break; 230 default: state = '?'; break; 231 } 232 233 if (sth->times[TXG_STATE_OPEN]) 234 open = sth->times[TXG_STATE_OPEN] - 235 sth->times[TXG_STATE_BIRTH]; 236 237 if (sth->times[TXG_STATE_QUIESCED]) 238 quiesce = sth->times[TXG_STATE_QUIESCED] - 239 sth->times[TXG_STATE_OPEN]; 240 241 if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) 242 wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - 243 sth->times[TXG_STATE_QUIESCED]; 244 245 if (sth->times[TXG_STATE_SYNCED]) 246 sync = sth->times[TXG_STATE_SYNCED] - 247 sth->times[TXG_STATE_WAIT_FOR_SYNC]; 248 249 seq_printf(f, "%-8llu %-16llu %-5c %-12llu " 250 "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", 251 (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, 252 (u_longlong_t)sth->ndirty, 253 (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, 254 (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, 255 (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, 256 (u_longlong_t)sync); 257 258 return (0); 259 } 260 261 /* Remove oldest elements from list until there are no more than 'size' left */ 262 static void 263 spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) 264 { 265 spa_txg_history_t *sth; 266 while (shl->size > size) { 267 sth = list_remove_head(&shl->procfs_list.pl_list); 268 ASSERT3P(sth, !=, NULL); 269 kmem_free(sth, sizeof (spa_txg_history_t)); 270 shl->size--; 271 } 272 273 if (size == 0) 274 ASSERT(list_is_empty(&shl->procfs_list.pl_list)); 275 276 } 277 278 static int 279 spa_txg_history_clear(procfs_list_t *procfs_list) 280 { 281 spa_history_list_t *shl = procfs_list->pl_private; 282 mutex_enter(&procfs_list->pl_lock); 283 spa_txg_history_truncate(shl, 0); 284 mutex_exit(&procfs_list->pl_lock); 285 return (0); 286 } 287 288 static void 289 spa_txg_history_init(spa_t *spa) 290 { 291 spa_history_list_t *shl = &spa->spa_stats.txg_history; 292 293 shl->size = 0; 294 shl->procfs_list.pl_private = shl; 295 procfs_list_install("zfs", 296 spa_name(spa), 297 "txgs", 298 0644, 299 &shl->procfs_list, 300 spa_txg_history_show, 301 spa_txg_history_show_header, 302 spa_txg_history_clear, 303 offsetof(spa_txg_history_t, sth_node)); 304 } 305 306 static void 307 spa_txg_history_destroy(spa_t *spa) 308 { 309 spa_history_list_t *shl = &spa->spa_stats.txg_history; 310 procfs_list_uninstall(&shl->procfs_list); 311 spa_txg_history_truncate(shl, 0); 312 procfs_list_destroy(&shl->procfs_list); 313 } 314 315 /* 316 * Add a new txg to historical record. 317 */ 318 void 319 spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) 320 { 321 spa_history_list_t *shl = &spa->spa_stats.txg_history; 322 spa_txg_history_t *sth; 323 324 if (zfs_txg_history == 0 && shl->size == 0) 325 return; 326 327 sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); 328 sth->txg = txg; 329 sth->state = TXG_STATE_OPEN; 330 sth->times[TXG_STATE_BIRTH] = birth_time; 331 332 mutex_enter(&shl->procfs_list.pl_lock); 333 procfs_list_add(&shl->procfs_list, sth); 334 shl->size++; 335 spa_txg_history_truncate(shl, zfs_txg_history); 336 mutex_exit(&shl->procfs_list.pl_lock); 337 } 338 339 /* 340 * Set txg state completion time and increment current state. 341 */ 342 int 343 spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, 344 hrtime_t completed_time) 345 { 346 spa_history_list_t *shl = &spa->spa_stats.txg_history; 347 spa_txg_history_t *sth; 348 int error = ENOENT; 349 350 if (zfs_txg_history == 0) 351 return (0); 352 353 mutex_enter(&shl->procfs_list.pl_lock); 354 for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; 355 sth = list_prev(&shl->procfs_list.pl_list, sth)) { 356 if (sth->txg == txg) { 357 sth->times[completed_state] = completed_time; 358 sth->state++; 359 error = 0; 360 break; 361 } 362 } 363 mutex_exit(&shl->procfs_list.pl_lock); 364 365 return (error); 366 } 367 368 /* 369 * Set txg IO stats. 370 */ 371 static int 372 spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, 373 uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) 374 { 375 spa_history_list_t *shl = &spa->spa_stats.txg_history; 376 spa_txg_history_t *sth; 377 int error = ENOENT; 378 379 if (zfs_txg_history == 0) 380 return (0); 381 382 mutex_enter(&shl->procfs_list.pl_lock); 383 for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; 384 sth = list_prev(&shl->procfs_list.pl_list, sth)) { 385 if (sth->txg == txg) { 386 sth->nread = nread; 387 sth->nwritten = nwritten; 388 sth->reads = reads; 389 sth->writes = writes; 390 sth->ndirty = ndirty; 391 error = 0; 392 break; 393 } 394 } 395 mutex_exit(&shl->procfs_list.pl_lock); 396 397 return (error); 398 } 399 400 txg_stat_t * 401 spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) 402 { 403 txg_stat_t *ts; 404 405 if (zfs_txg_history == 0) 406 return (NULL); 407 408 ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); 409 410 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 411 vdev_get_stats(spa->spa_root_vdev, &ts->vs1); 412 spa_config_exit(spa, SCL_CONFIG, FTAG); 413 414 ts->txg = txg; 415 ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; 416 417 spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); 418 419 return (ts); 420 } 421 422 void 423 spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) 424 { 425 if (ts == NULL) 426 return; 427 428 if (zfs_txg_history == 0) { 429 kmem_free(ts, sizeof (txg_stat_t)); 430 return; 431 } 432 433 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 434 vdev_get_stats(spa->spa_root_vdev, &ts->vs2); 435 spa_config_exit(spa, SCL_CONFIG, FTAG); 436 437 spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); 438 spa_txg_history_set_io(spa, ts->txg, 439 ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], 440 ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], 441 ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ], 442 ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], 443 ts->ndirty); 444 445 kmem_free(ts, sizeof (txg_stat_t)); 446 } 447 448 /* 449 * ========================================================================== 450 * SPA TX Assign Histogram Routines 451 * ========================================================================== 452 */ 453 454 /* 455 * Tx statistics - Information exported regarding dmu_tx_assign time. 456 */ 457 458 /* 459 * When the kstat is written zero all buckets. When the kstat is read 460 * count the number of trailing buckets set to zero and update ks_ndata 461 * such that they are not output. 462 */ 463 static int 464 spa_tx_assign_update(kstat_t *ksp, int rw) 465 { 466 spa_t *spa = ksp->ks_private; 467 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 468 int i; 469 470 if (rw == KSTAT_WRITE) { 471 for (i = 0; i < shk->count; i++) 472 ((kstat_named_t *)shk->priv)[i].value.ui64 = 0; 473 } 474 475 for (i = shk->count; i > 0; i--) 476 if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0) 477 break; 478 479 ksp->ks_ndata = i; 480 ksp->ks_data_size = i * sizeof (kstat_named_t); 481 482 return (0); 483 } 484 485 static void 486 spa_tx_assign_init(spa_t *spa) 487 { 488 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 489 char *name; 490 kstat_named_t *ks; 491 kstat_t *ksp; 492 int i; 493 494 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 495 496 shk->count = 42; /* power of two buckets for 1ns to 2,199s */ 497 shk->size = shk->count * sizeof (kstat_named_t); 498 shk->priv = kmem_alloc(shk->size, KM_SLEEP); 499 500 name = kmem_asprintf("zfs/%s", spa_name(spa)); 501 502 for (i = 0; i < shk->count; i++) { 503 ks = &((kstat_named_t *)shk->priv)[i]; 504 ks->data_type = KSTAT_DATA_UINT64; 505 ks->value.ui64 = 0; 506 (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", 507 (u_longlong_t)1 << i); 508 } 509 510 ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", 511 KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); 512 shk->kstat = ksp; 513 514 if (ksp) { 515 ksp->ks_lock = &shk->lock; 516 ksp->ks_data = shk->priv; 517 ksp->ks_ndata = shk->count; 518 ksp->ks_data_size = shk->size; 519 ksp->ks_private = spa; 520 ksp->ks_update = spa_tx_assign_update; 521 kstat_install(ksp); 522 } 523 kmem_strfree(name); 524 } 525 526 static void 527 spa_tx_assign_destroy(spa_t *spa) 528 { 529 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 530 kstat_t *ksp; 531 532 ksp = shk->kstat; 533 if (ksp) 534 kstat_delete(ksp); 535 536 kmem_free(shk->priv, shk->size); 537 mutex_destroy(&shk->lock); 538 } 539 540 void 541 spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) 542 { 543 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 544 uint64_t idx = 0; 545 546 while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) 547 idx++; 548 549 atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64); 550 } 551 552 /* 553 * ========================================================================== 554 * SPA MMP History Routines 555 * ========================================================================== 556 */ 557 558 /* 559 * MMP statistics - Information exported regarding attempted MMP writes 560 * For MMP writes issued, fields used as per comments below. 561 * For MMP writes skipped, an entry represents a span of time when 562 * writes were skipped for same reason (error from mmp_random_leaf). 563 * Differences are: 564 * timestamp time first write skipped, if >1 skipped in a row 565 * mmp_delay delay value at timestamp 566 * vdev_guid number of writes skipped 567 * io_error one of enum mmp_error 568 * duration time span (ns) of skipped writes 569 */ 570 571 typedef struct spa_mmp_history { 572 uint64_t mmp_node_id; /* unique # for updates */ 573 uint64_t txg; /* txg of last sync */ 574 uint64_t timestamp; /* UTC time MMP write issued */ 575 uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ 576 uint64_t vdev_guid; /* unique ID of leaf vdev */ 577 char *vdev_path; 578 int vdev_label; /* vdev label */ 579 int io_error; /* error status of MMP write */ 580 hrtime_t error_start; /* hrtime of start of error period */ 581 hrtime_t duration; /* time from submission to completion */ 582 procfs_list_node_t smh_node; 583 } spa_mmp_history_t; 584 585 static int 586 spa_mmp_history_show_header(struct seq_file *f) 587 { 588 seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " 589 "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", 590 "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); 591 return (0); 592 } 593 594 static int 595 spa_mmp_history_show(struct seq_file *f, void *data) 596 { 597 spa_mmp_history_t *smh = (spa_mmp_history_t *)data; 598 char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " 599 "%-10lld %s\n"; 600 char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " 601 "%-10lld %s\n"; 602 603 seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), 604 (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, 605 (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, 606 (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, 607 (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, 608 (smh->vdev_path ? smh->vdev_path : "-")); 609 610 return (0); 611 } 612 613 /* Remove oldest elements from list until there are no more than 'size' left */ 614 static void 615 spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) 616 { 617 spa_mmp_history_t *smh; 618 while (shl->size > size) { 619 smh = list_remove_head(&shl->procfs_list.pl_list); 620 if (smh->vdev_path) 621 kmem_strfree(smh->vdev_path); 622 kmem_free(smh, sizeof (spa_mmp_history_t)); 623 shl->size--; 624 } 625 626 if (size == 0) 627 ASSERT(list_is_empty(&shl->procfs_list.pl_list)); 628 629 } 630 631 static int 632 spa_mmp_history_clear(procfs_list_t *procfs_list) 633 { 634 spa_history_list_t *shl = procfs_list->pl_private; 635 mutex_enter(&procfs_list->pl_lock); 636 spa_mmp_history_truncate(shl, 0); 637 mutex_exit(&procfs_list->pl_lock); 638 return (0); 639 } 640 641 static void 642 spa_mmp_history_init(spa_t *spa) 643 { 644 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 645 646 shl->size = 0; 647 648 shl->procfs_list.pl_private = shl; 649 procfs_list_install("zfs", 650 spa_name(spa), 651 "multihost", 652 0644, 653 &shl->procfs_list, 654 spa_mmp_history_show, 655 spa_mmp_history_show_header, 656 spa_mmp_history_clear, 657 offsetof(spa_mmp_history_t, smh_node)); 658 } 659 660 static void 661 spa_mmp_history_destroy(spa_t *spa) 662 { 663 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 664 procfs_list_uninstall(&shl->procfs_list); 665 spa_mmp_history_truncate(shl, 0); 666 procfs_list_destroy(&shl->procfs_list); 667 } 668 669 /* 670 * Set duration in existing "skip" record to how long we have waited for a leaf 671 * vdev to become available. 672 * 673 * Important that we start search at the tail of the list where new 674 * records are inserted, so this is normally an O(1) operation. 675 */ 676 int 677 spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) 678 { 679 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 680 spa_mmp_history_t *smh; 681 int error = ENOENT; 682 683 if (zfs_multihost_history == 0 && shl->size == 0) 684 return (0); 685 686 mutex_enter(&shl->procfs_list.pl_lock); 687 for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; 688 smh = list_prev(&shl->procfs_list.pl_list, smh)) { 689 if (smh->mmp_node_id == mmp_node_id) { 690 ASSERT3U(smh->io_error, !=, 0); 691 smh->duration = gethrtime() - smh->error_start; 692 smh->vdev_guid++; 693 error = 0; 694 break; 695 } 696 } 697 mutex_exit(&shl->procfs_list.pl_lock); 698 699 return (error); 700 } 701 702 /* 703 * Set MMP write duration and error status in existing record. 704 * See comment re: search order above spa_mmp_history_set_skip(). 705 */ 706 int 707 spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, 708 hrtime_t duration) 709 { 710 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 711 spa_mmp_history_t *smh; 712 int error = ENOENT; 713 714 if (zfs_multihost_history == 0 && shl->size == 0) 715 return (0); 716 717 mutex_enter(&shl->procfs_list.pl_lock); 718 for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; 719 smh = list_prev(&shl->procfs_list.pl_list, smh)) { 720 if (smh->mmp_node_id == mmp_node_id) { 721 ASSERT0(smh->io_error); 722 smh->io_error = io_error; 723 smh->duration = duration; 724 error = 0; 725 break; 726 } 727 } 728 mutex_exit(&shl->procfs_list.pl_lock); 729 730 return (error); 731 } 732 733 /* 734 * Add a new MMP historical record. 735 * error == 0 : a write was issued. 736 * error != 0 : a write was not issued because no leaves were found. 737 */ 738 void 739 spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, 740 uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, 741 int error) 742 { 743 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 744 spa_mmp_history_t *smh; 745 746 if (zfs_multihost_history == 0 && shl->size == 0) 747 return; 748 749 smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP); 750 smh->txg = txg; 751 smh->timestamp = timestamp; 752 smh->mmp_delay = mmp_delay; 753 if (vd) { 754 smh->vdev_guid = vd->vdev_guid; 755 if (vd->vdev_path) 756 smh->vdev_path = kmem_strdup(vd->vdev_path); 757 } 758 smh->vdev_label = label; 759 smh->mmp_node_id = mmp_node_id; 760 761 if (error) { 762 smh->io_error = error; 763 smh->error_start = gethrtime(); 764 smh->vdev_guid = 1; 765 } 766 767 mutex_enter(&shl->procfs_list.pl_lock); 768 procfs_list_add(&shl->procfs_list, smh); 769 shl->size++; 770 spa_mmp_history_truncate(shl, zfs_multihost_history); 771 mutex_exit(&shl->procfs_list.pl_lock); 772 } 773 774 static void * 775 spa_state_addr(kstat_t *ksp, loff_t n) 776 { 777 if (n == 0) 778 return (ksp->ks_private); /* return the spa_t */ 779 return (NULL); 780 } 781 782 static int 783 spa_state_data(char *buf, size_t size, void *data) 784 { 785 spa_t *spa = (spa_t *)data; 786 (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa)); 787 return (0); 788 } 789 790 /* 791 * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state. 792 * 793 * This is a lock-less read of the pool's state (unlike using 'zpool', which 794 * can potentially block for seconds). Because it doesn't block, it can useful 795 * as a pool heartbeat value. 796 */ 797 static void 798 spa_state_init(spa_t *spa) 799 { 800 spa_history_kstat_t *shk = &spa->spa_stats.state; 801 char *name; 802 kstat_t *ksp; 803 804 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 805 806 name = kmem_asprintf("zfs/%s", spa_name(spa)); 807 ksp = kstat_create(name, 0, "state", "misc", 808 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 809 810 shk->kstat = ksp; 811 if (ksp) { 812 ksp->ks_lock = &shk->lock; 813 ksp->ks_data = NULL; 814 ksp->ks_private = spa; 815 ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; 816 kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr); 817 kstat_install(ksp); 818 } 819 820 kmem_strfree(name); 821 } 822 823 static int 824 spa_guid_data(char *buf, size_t size, void *data) 825 { 826 spa_t *spa = (spa_t *)data; 827 (void) snprintf(buf, size, "%llu\n", (u_longlong_t)spa_guid(spa)); 828 return (0); 829 } 830 831 static void 832 spa_guid_init(spa_t *spa) 833 { 834 spa_history_kstat_t *shk = &spa->spa_stats.guid; 835 char *name; 836 kstat_t *ksp; 837 838 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 839 840 name = kmem_asprintf("zfs/%s", spa_name(spa)); 841 842 ksp = kstat_create(name, 0, "guid", "misc", 843 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 844 845 shk->kstat = ksp; 846 if (ksp) { 847 ksp->ks_lock = &shk->lock; 848 ksp->ks_data = NULL; 849 ksp->ks_private = spa; 850 ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; 851 kstat_set_raw_ops(ksp, NULL, spa_guid_data, spa_state_addr); 852 kstat_install(ksp); 853 } 854 855 kmem_strfree(name); 856 } 857 858 static void 859 spa_health_destroy(spa_t *spa) 860 { 861 spa_history_kstat_t *shk = &spa->spa_stats.state; 862 kstat_t *ksp = shk->kstat; 863 if (ksp) 864 kstat_delete(ksp); 865 866 mutex_destroy(&shk->lock); 867 } 868 869 static void 870 spa_guid_destroy(spa_t *spa) 871 { 872 spa_history_kstat_t *shk = &spa->spa_stats.guid; 873 kstat_t *ksp = shk->kstat; 874 if (ksp) 875 kstat_delete(ksp); 876 877 mutex_destroy(&shk->lock); 878 } 879 880 static const spa_iostats_t spa_iostats_template = { 881 { "trim_extents_written", KSTAT_DATA_UINT64 }, 882 { "trim_bytes_written", KSTAT_DATA_UINT64 }, 883 { "trim_extents_skipped", KSTAT_DATA_UINT64 }, 884 { "trim_bytes_skipped", KSTAT_DATA_UINT64 }, 885 { "trim_extents_failed", KSTAT_DATA_UINT64 }, 886 { "trim_bytes_failed", KSTAT_DATA_UINT64 }, 887 { "autotrim_extents_written", KSTAT_DATA_UINT64 }, 888 { "autotrim_bytes_written", KSTAT_DATA_UINT64 }, 889 { "autotrim_extents_skipped", KSTAT_DATA_UINT64 }, 890 { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, 891 { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, 892 { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, 893 { "simple_trim_extents_written", KSTAT_DATA_UINT64 }, 894 { "simple_trim_bytes_written", KSTAT_DATA_UINT64 }, 895 { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 }, 896 { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, 897 { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, 898 { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, 899 { "arc_read_count", KSTAT_DATA_UINT64 }, 900 { "arc_read_bytes", KSTAT_DATA_UINT64 }, 901 { "arc_write_count", KSTAT_DATA_UINT64 }, 902 { "arc_write_bytes", KSTAT_DATA_UINT64 }, 903 { "direct_read_count", KSTAT_DATA_UINT64 }, 904 { "direct_read_bytes", KSTAT_DATA_UINT64 }, 905 { "direct_write_count", KSTAT_DATA_UINT64 }, 906 { "direct_write_bytes", KSTAT_DATA_UINT64 }, 907 }; 908 909 #define SPA_IOSTATS_ADD(stat, val) \ 910 atomic_add_64(&iostats->stat.value.ui64, (val)); 911 912 void 913 spa_iostats_trim_add(spa_t *spa, trim_type_t type, 914 uint64_t extents_written, uint64_t bytes_written, 915 uint64_t extents_skipped, uint64_t bytes_skipped, 916 uint64_t extents_failed, uint64_t bytes_failed) 917 { 918 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 919 kstat_t *ksp = shk->kstat; 920 spa_iostats_t *iostats; 921 922 if (ksp == NULL) 923 return; 924 925 iostats = ksp->ks_data; 926 if (type == TRIM_TYPE_MANUAL) { 927 SPA_IOSTATS_ADD(trim_extents_written, extents_written); 928 SPA_IOSTATS_ADD(trim_bytes_written, bytes_written); 929 SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped); 930 SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); 931 SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); 932 SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); 933 } else if (type == TRIM_TYPE_AUTO) { 934 SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); 935 SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); 936 SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); 937 SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); 938 SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); 939 SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); 940 } else { 941 SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written); 942 SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written); 943 SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped); 944 SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped); 945 SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed); 946 SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed); 947 } 948 } 949 950 void 951 spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, 952 dmu_flags_t flags) 953 { 954 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 955 kstat_t *ksp = shk->kstat; 956 957 if (ksp == NULL) 958 return; 959 960 spa_iostats_t *iostats = ksp->ks_data; 961 if (flags & DMU_DIRECTIO) { 962 SPA_IOSTATS_ADD(direct_read_count, iops); 963 SPA_IOSTATS_ADD(direct_read_bytes, size); 964 } else { 965 SPA_IOSTATS_ADD(arc_read_count, iops); 966 SPA_IOSTATS_ADD(arc_read_bytes, size); 967 } 968 } 969 970 void 971 spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, 972 dmu_flags_t flags) 973 { 974 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 975 kstat_t *ksp = shk->kstat; 976 977 if (ksp == NULL) 978 return; 979 980 spa_iostats_t *iostats = ksp->ks_data; 981 if (flags & DMU_DIRECTIO) { 982 SPA_IOSTATS_ADD(direct_write_count, iops); 983 SPA_IOSTATS_ADD(direct_write_bytes, size); 984 } else { 985 SPA_IOSTATS_ADD(arc_write_count, iops); 986 SPA_IOSTATS_ADD(arc_write_bytes, size); 987 } 988 } 989 990 static int 991 spa_iostats_update(kstat_t *ksp, int rw) 992 { 993 if (rw == KSTAT_WRITE) { 994 memcpy(ksp->ks_data, &spa_iostats_template, 995 sizeof (spa_iostats_t)); 996 } 997 998 return (0); 999 } 1000 1001 static void 1002 spa_iostats_init(spa_t *spa) 1003 { 1004 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 1005 1006 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 1007 1008 char *name = kmem_asprintf("zfs/%s", spa_name(spa)); 1009 kstat_t *ksp = kstat_create(name, 0, "iostats", "misc", 1010 KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t), 1011 KSTAT_FLAG_VIRTUAL); 1012 1013 shk->kstat = ksp; 1014 if (ksp) { 1015 int size = sizeof (spa_iostats_t); 1016 ksp->ks_lock = &shk->lock; 1017 ksp->ks_private = spa; 1018 ksp->ks_update = spa_iostats_update; 1019 ksp->ks_data = kmem_alloc(size, KM_SLEEP); 1020 memcpy(ksp->ks_data, &spa_iostats_template, size); 1021 kstat_install(ksp); 1022 } 1023 1024 kmem_strfree(name); 1025 } 1026 1027 static void 1028 spa_iostats_destroy(spa_t *spa) 1029 { 1030 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 1031 kstat_t *ksp = shk->kstat; 1032 if (ksp) { 1033 kmem_free(ksp->ks_data, sizeof (spa_iostats_t)); 1034 kstat_delete(ksp); 1035 } 1036 1037 mutex_destroy(&shk->lock); 1038 } 1039 1040 void 1041 spa_stats_init(spa_t *spa) 1042 { 1043 spa_read_history_init(spa); 1044 spa_txg_history_init(spa); 1045 spa_tx_assign_init(spa); 1046 spa_mmp_history_init(spa); 1047 spa_state_init(spa); 1048 spa_guid_init(spa); 1049 spa_iostats_init(spa); 1050 } 1051 1052 void 1053 spa_stats_destroy(spa_t *spa) 1054 { 1055 spa_iostats_destroy(spa); 1056 spa_health_destroy(spa); 1057 spa_tx_assign_destroy(spa); 1058 spa_txg_history_destroy(spa); 1059 spa_read_history_destroy(spa); 1060 spa_mmp_history_destroy(spa); 1061 spa_guid_destroy(spa); 1062 } 1063 1064 ZFS_MODULE_PARAM(zfs, zfs_, read_history, UINT, ZMOD_RW, 1065 "Historical statistics for the last N reads"); 1066 1067 ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW, 1068 "Include cache hits in read history"); 1069 1070 ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, UINT, ZMOD_RW, 1071 "Historical statistics for the last N txgs"); 1072 1073 ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, UINT, ZMOD_RW, 1074 "Historical statistics for last N multihost writes"); 1075