/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/spa.h>
#include <zfs_comutil.h>

/*
 * Keeps stats on last N reads per spa_t, disabled by default.
 */
int zfs_read_history = 0;

/*
 * Include cache hits in history, disabled by default.
 */
int zfs_read_history_hits = 0;

/*
 * Keeps stats on the last 100 txgs by default.
 */
int zfs_txg_history = 100;

/*
 * Keeps stats on the last N MMP updates, disabled by default.
 */
int zfs_multihost_history = 0;

/*
 * ==========================================================================
 * SPA Read History Routines
 * ==========================================================================
 */

/*
 * Read statistics - Information exported regarding each arc_read call
 */
typedef struct spa_read_history {
	hrtime_t	start;		/* time read completed */
	uint64_t	objset;		/* read from this objset */
	uint64_t	object;		/* read of this object number */
	uint64_t	level;		/* block's indirection level */
	uint64_t	blkid;		/* read of this block id */
	char		origin[24];	/* read originated from here */
	uint32_t	aflags;		/* ARC flags (cached, prefetch, etc.) */
	pid_t		pid;		/* PID of task doing read */
	char		comm[16];	/* process name of task doing read */
	procfs_list_node_t	srh_node;	/* linkage into procfs list */
} spa_read_history_t;

/* Print the column header line for the per-pool "reads" procfs file. */
static int
spa_read_history_show_header(struct seq_file *f)
{
	seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
	    "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
	    "level", "blkid", "aflags", "origin", "pid", "process");

	return (0);
}

/* Print one read-history record as a single line of the "reads" file. */
static int
spa_read_history_show(struct seq_file *f, void *data)
{
	spa_read_history_t *srh = (spa_read_history_t *)data;

	seq_printf(f, "%-8llu %-16llu 0x%-6llx "
	    "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
	    (u_longlong_t)srh->srh_node.pln_id, srh->start,
	    (longlong_t)srh->objset, (longlong_t)srh->object,
	    (longlong_t)srh->level, (longlong_t)srh->blkid,
	    srh->aflags, srh->origin, srh->pid, srh->comm);

	return (0);
}

/* Remove oldest elements from list until there are no more than 'size' left */
static void
spa_read_history_truncate(spa_history_list_t *shl, unsigned int size)
{
	spa_read_history_t *srh;
	while (shl->size > size) {
		/* Oldest records live at the head of the list. */
		srh = list_remove_head(&shl->procfs_list.pl_list);
		ASSERT3P(srh, !=, NULL);
		kmem_free(srh, sizeof (spa_read_history_t));
		shl->size--;
	}

	if (size == 0)
		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
}

/* procfs write handler: discard all accumulated read-history records. */
static int
spa_read_history_clear(procfs_list_t *procfs_list)
{
	spa_history_list_t *shl = procfs_list->pl_private;
	mutex_enter(&procfs_list->pl_lock);
	spa_read_history_truncate(shl, 0);
	mutex_exit(&procfs_list->pl_lock);
	return (0);
}

/* Register the per-pool "reads" procfs file for this spa. */
static void
spa_read_history_init(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;
	char *module;

	shl->size = 0;

	module = kmem_asprintf("zfs/%s", spa_name(spa));

	shl->procfs_list.pl_private = shl;
	procfs_list_install(module,
	    "reads",
	    0600,
	    &shl->procfs_list,
	    spa_read_history_show,
	    spa_read_history_show_header,
	    spa_read_history_clear,
	    offsetof(spa_read_history_t, srh_node));

	kmem_strfree(module);
}

/* Unregister the "reads" procfs file and free any remaining records. */
static void
spa_read_history_destroy(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;
	procfs_list_uninstall(&shl->procfs_list);
	spa_read_history_truncate(shl, 0);
	procfs_list_destroy(&shl->procfs_list);
}

/*
 * Record one arc_read call in the per-pool read history.  Does nothing
 * when the zfs_read_history tunable is disabled and the list is already
 * empty; if records remain after the tunable is disabled, we fall
 * through so the truncate below drains them.  Cache hits are recorded
 * only when zfs_read_history_hits is set.
 */
void
spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;
	spa_read_history_t *srh;

	ASSERT3P(spa, !=, NULL);
	ASSERT3P(zb, !=, NULL);

	if (zfs_read_history == 0 && shl->size == 0)
		return;

	if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
		return;

	srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
	strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
	srh->start = gethrtime();
	srh->objset = zb->zb_objset;
	srh->object = zb->zb_object;
	srh->level = zb->zb_level;
	srh->blkid = zb->zb_blkid;
	srh->aflags = aflags;
	srh->pid = getpid();

	mutex_enter(&shl->procfs_list.pl_lock);

	procfs_list_add(&shl->procfs_list, srh);
	shl->size++;

	spa_read_history_truncate(shl, zfs_read_history);

	mutex_exit(&shl->procfs_list.pl_lock);
}

/*
 * ==========================================================================
 * SPA TXG History Routines
 * ==========================================================================
 */

/*
 * Txg statistics - Information exported regarding each txg sync
 */

typedef struct spa_txg_history {
	uint64_t	txg;		/* txg id */
	txg_state_t	state;		/* active txg state */
	uint64_t	nread;		/* number of bytes read */
	uint64_t	nwritten;	/* number of bytes written */
	uint64_t	reads;		/* number of read operations */
	uint64_t	writes;		/* number of write operations */
	uint64_t	ndirty;		/* number of dirty bytes */
	hrtime_t	times[TXG_STATE_COMMITTED]; /* completion times */
	procfs_list_node_t	sth_node;	/* linkage into procfs list */
} spa_txg_history_t;

/* Print the column header line for the per-pool "txgs" procfs file. */
static int
spa_txg_history_show_header(struct seq_file *f)
{
	seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s "
	    "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
	    "ndirty", "nread", "nwritten", "reads", "writes",
	    "otime", "qtime", "wtime", "stime");
	return (0);
}

/*
 * Print one txg record as a single line of the "txgs" file.  The per-state
 * durations (open, quiesce, wait, sync) are computed from the recorded
 * state-completion timestamps; a zero timestamp means the state has not
 * completed yet, so the corresponding duration is reported as 0.
 */
static int
spa_txg_history_show(struct seq_file *f, void *data)
{
	spa_txg_history_t *sth = (spa_txg_history_t *)data;
	uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
	char state;

	switch (sth->state) {
		case TXG_STATE_BIRTH:		state = 'B';	break;
		case TXG_STATE_OPEN:		state = 'O';	break;
		case TXG_STATE_QUIESCED:	state = 'Q';	break;
		case TXG_STATE_WAIT_FOR_SYNC:	state = 'W';	break;
		case TXG_STATE_SYNCED:		state = 'S';	break;
		case TXG_STATE_COMMITTED:	state = 'C';	break;
		default:			state = '?';	break;
	}

	if (sth->times[TXG_STATE_OPEN])
		open = sth->times[TXG_STATE_OPEN] -
		    sth->times[TXG_STATE_BIRTH];

	if (sth->times[TXG_STATE_QUIESCED])
		quiesce = sth->times[TXG_STATE_QUIESCED] -
		    sth->times[TXG_STATE_OPEN];

	if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
		wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
		    sth->times[TXG_STATE_QUIESCED];

	if (sth->times[TXG_STATE_SYNCED])
		sync = sth->times[TXG_STATE_SYNCED] -
		    sth->times[TXG_STATE_WAIT_FOR_SYNC];

	seq_printf(f, "%-8llu %-16llu %-5c %-12llu "
	    "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
	    (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
	    (u_longlong_t)sth->ndirty,
	    (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
	    (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
	    (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
	    (u_longlong_t)sync);

	return (0);
}

/* Remove oldest elements from list until there are no more than 'size' left */
static void
spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size)
{
	spa_txg_history_t *sth;
	while (shl->size > size) {
		/* Oldest records live at the head of the list. */
		sth = list_remove_head(&shl->procfs_list.pl_list);
		ASSERT3P(sth, !=, NULL);
		kmem_free(sth, sizeof (spa_txg_history_t));
		shl->size--;
	}

	if (size == 0)
		ASSERT(list_is_empty(&shl->procfs_list.pl_list));

}

/* procfs write handler: discard all accumulated txg-history records. */
static int
spa_txg_history_clear(procfs_list_t *procfs_list)
{
	spa_history_list_t *shl = procfs_list->pl_private;
	mutex_enter(&procfs_list->pl_lock);
	spa_txg_history_truncate(shl, 0);
	mutex_exit(&procfs_list->pl_lock);
	return (0);
}

/* Register the per-pool "txgs" procfs file for this spa. */
static void
spa_txg_history_init(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	char *module;

	shl->size = 0;

	module = kmem_asprintf("zfs/%s", spa_name(spa));

	shl->procfs_list.pl_private = shl;
	procfs_list_install(module,
	    "txgs",
	    0644,
	    &shl->procfs_list,
	    spa_txg_history_show,
	    spa_txg_history_show_header,
	    spa_txg_history_clear,
	    offsetof(spa_txg_history_t, sth_node));

	kmem_strfree(module);
}

/* Unregister the "txgs" procfs file and free any remaining records. */
static void
spa_txg_history_destroy(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	procfs_list_uninstall(&shl->procfs_list);
	spa_txg_history_truncate(shl, 0);
	procfs_list_destroy(&shl->procfs_list);
}

/*
 * Add a new txg to historical record.
 */
void
spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;

	if (zfs_txg_history == 0 && shl->size == 0)
		return;

	sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
	sth->txg = txg;
	sth->state = TXG_STATE_OPEN;
	sth->times[TXG_STATE_BIRTH] = birth_time;

	mutex_enter(&shl->procfs_list.pl_lock);
	procfs_list_add(&shl->procfs_list, sth);
	shl->size++;
	spa_txg_history_truncate(shl, zfs_txg_history);
	mutex_exit(&shl->procfs_list.pl_lock);
}

/*
 * Set txg state completion time and increment current state.
 */
int
spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
    hrtime_t completed_time)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	int error = ENOENT;

	if (zfs_txg_history == 0)
		return (0);

	mutex_enter(&shl->procfs_list.pl_lock);
	/* Search from the tail: recent txgs are near the end of the list. */
	for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
	    sth = list_prev(&shl->procfs_list.pl_list, sth)) {
		if (sth->txg == txg) {
			sth->times[completed_state] = completed_time;
			sth->state++;
			error = 0;
			break;
		}
	}
	mutex_exit(&shl->procfs_list.pl_lock);

	return (error);
}

/*
 * Set txg IO stats.
379 */ 380 static int 381 spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, 382 uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) 383 { 384 spa_history_list_t *shl = &spa->spa_stats.txg_history; 385 spa_txg_history_t *sth; 386 int error = ENOENT; 387 388 if (zfs_txg_history == 0) 389 return (0); 390 391 mutex_enter(&shl->procfs_list.pl_lock); 392 for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; 393 sth = list_prev(&shl->procfs_list.pl_list, sth)) { 394 if (sth->txg == txg) { 395 sth->nread = nread; 396 sth->nwritten = nwritten; 397 sth->reads = reads; 398 sth->writes = writes; 399 sth->ndirty = ndirty; 400 error = 0; 401 break; 402 } 403 } 404 mutex_exit(&shl->procfs_list.pl_lock); 405 406 return (error); 407 } 408 409 txg_stat_t * 410 spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) 411 { 412 txg_stat_t *ts; 413 414 if (zfs_txg_history == 0) 415 return (NULL); 416 417 ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); 418 419 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 420 vdev_get_stats(spa->spa_root_vdev, &ts->vs1); 421 spa_config_exit(spa, SCL_CONFIG, FTAG); 422 423 ts->txg = txg; 424 ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; 425 426 spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); 427 428 return (ts); 429 } 430 431 void 432 spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) 433 { 434 if (ts == NULL) 435 return; 436 437 if (zfs_txg_history == 0) { 438 kmem_free(ts, sizeof (txg_stat_t)); 439 return; 440 } 441 442 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 443 vdev_get_stats(spa->spa_root_vdev, &ts->vs2); 444 spa_config_exit(spa, SCL_CONFIG, FTAG); 445 446 spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); 447 spa_txg_history_set_io(spa, ts->txg, 448 ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], 449 ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], 450 ts->vs2.vs_ops[ZIO_TYPE_READ] - 
ts->vs1.vs_ops[ZIO_TYPE_READ], 451 ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], 452 ts->ndirty); 453 454 kmem_free(ts, sizeof (txg_stat_t)); 455 } 456 457 /* 458 * ========================================================================== 459 * SPA TX Assign Histogram Routines 460 * ========================================================================== 461 */ 462 463 /* 464 * Tx statistics - Information exported regarding dmu_tx_assign time. 465 */ 466 467 /* 468 * When the kstat is written zero all buckets. When the kstat is read 469 * count the number of trailing buckets set to zero and update ks_ndata 470 * such that they are not output. 471 */ 472 static int 473 spa_tx_assign_update(kstat_t *ksp, int rw) 474 { 475 spa_t *spa = ksp->ks_private; 476 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 477 int i; 478 479 if (rw == KSTAT_WRITE) { 480 for (i = 0; i < shk->count; i++) 481 ((kstat_named_t *)shk->priv)[i].value.ui64 = 0; 482 } 483 484 for (i = shk->count; i > 0; i--) 485 if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0) 486 break; 487 488 ksp->ks_ndata = i; 489 ksp->ks_data_size = i * sizeof (kstat_named_t); 490 491 return (0); 492 } 493 494 static void 495 spa_tx_assign_init(spa_t *spa) 496 { 497 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 498 char *name; 499 kstat_named_t *ks; 500 kstat_t *ksp; 501 int i; 502 503 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 504 505 shk->count = 42; /* power of two buckets for 1ns to 2,199s */ 506 shk->size = shk->count * sizeof (kstat_named_t); 507 shk->priv = kmem_alloc(shk->size, KM_SLEEP); 508 509 name = kmem_asprintf("zfs/%s", spa_name(spa)); 510 511 for (i = 0; i < shk->count; i++) { 512 ks = &((kstat_named_t *)shk->priv)[i]; 513 ks->data_type = KSTAT_DATA_UINT64; 514 ks->value.ui64 = 0; 515 (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", 516 (u_longlong_t)1 << i); 517 } 518 519 ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", 520 
KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); 521 shk->kstat = ksp; 522 523 if (ksp) { 524 ksp->ks_lock = &shk->lock; 525 ksp->ks_data = shk->priv; 526 ksp->ks_ndata = shk->count; 527 ksp->ks_data_size = shk->size; 528 ksp->ks_private = spa; 529 ksp->ks_update = spa_tx_assign_update; 530 kstat_install(ksp); 531 } 532 kmem_strfree(name); 533 } 534 535 static void 536 spa_tx_assign_destroy(spa_t *spa) 537 { 538 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 539 kstat_t *ksp; 540 541 ksp = shk->kstat; 542 if (ksp) 543 kstat_delete(ksp); 544 545 kmem_free(shk->priv, shk->size); 546 mutex_destroy(&shk->lock); 547 } 548 549 void 550 spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) 551 { 552 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 553 uint64_t idx = 0; 554 555 while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) 556 idx++; 557 558 atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64); 559 } 560 561 /* 562 * ========================================================================== 563 * SPA IO History Routines 564 * ========================================================================== 565 */ 566 static int 567 spa_io_history_update(kstat_t *ksp, int rw) 568 { 569 if (rw == KSTAT_WRITE) 570 memset(ksp->ks_data, 0, ksp->ks_data_size); 571 572 return (0); 573 } 574 575 static void 576 spa_io_history_init(spa_t *spa) 577 { 578 spa_history_kstat_t *shk = &spa->spa_stats.io_history; 579 char *name; 580 kstat_t *ksp; 581 582 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 583 584 name = kmem_asprintf("zfs/%s", spa_name(spa)); 585 586 ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); 587 shk->kstat = ksp; 588 589 if (ksp) { 590 ksp->ks_lock = &shk->lock; 591 ksp->ks_private = spa; 592 ksp->ks_update = spa_io_history_update; 593 kstat_install(ksp); 594 } 595 kmem_strfree(name); 596 } 597 598 static void 599 spa_io_history_destroy(spa_t *spa) 600 { 601 spa_history_kstat_t *shk = &spa->spa_stats.io_history; 602 
603 if (shk->kstat) 604 kstat_delete(shk->kstat); 605 606 mutex_destroy(&shk->lock); 607 } 608 609 /* 610 * ========================================================================== 611 * SPA MMP History Routines 612 * ========================================================================== 613 */ 614 615 /* 616 * MMP statistics - Information exported regarding attempted MMP writes 617 * For MMP writes issued, fields used as per comments below. 618 * For MMP writes skipped, an entry represents a span of time when 619 * writes were skipped for same reason (error from mmp_random_leaf). 620 * Differences are: 621 * timestamp time first write skipped, if >1 skipped in a row 622 * mmp_delay delay value at timestamp 623 * vdev_guid number of writes skipped 624 * io_error one of enum mmp_error 625 * duration time span (ns) of skipped writes 626 */ 627 628 typedef struct spa_mmp_history { 629 uint64_t mmp_node_id; /* unique # for updates */ 630 uint64_t txg; /* txg of last sync */ 631 uint64_t timestamp; /* UTC time MMP write issued */ 632 uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ 633 uint64_t vdev_guid; /* unique ID of leaf vdev */ 634 char *vdev_path; 635 int vdev_label; /* vdev label */ 636 int io_error; /* error status of MMP write */ 637 hrtime_t error_start; /* hrtime of start of error period */ 638 hrtime_t duration; /* time from submission to completion */ 639 procfs_list_node_t smh_node; 640 } spa_mmp_history_t; 641 642 static int 643 spa_mmp_history_show_header(struct seq_file *f) 644 { 645 seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " 646 "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", 647 "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); 648 return (0); 649 } 650 651 static int 652 spa_mmp_history_show(struct seq_file *f, void *data) 653 { 654 spa_mmp_history_t *smh = (spa_mmp_history_t *)data; 655 char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " 656 "%-10lld %s\n"; 657 char write_fmt[] = 
"%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " 658 "%-10lld %s\n"; 659 660 seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), 661 (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, 662 (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, 663 (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, 664 (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, 665 (smh->vdev_path ? smh->vdev_path : "-")); 666 667 return (0); 668 } 669 670 /* Remove oldest elements from list until there are no more than 'size' left */ 671 static void 672 spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) 673 { 674 spa_mmp_history_t *smh; 675 while (shl->size > size) { 676 smh = list_remove_head(&shl->procfs_list.pl_list); 677 if (smh->vdev_path) 678 kmem_strfree(smh->vdev_path); 679 kmem_free(smh, sizeof (spa_mmp_history_t)); 680 shl->size--; 681 } 682 683 if (size == 0) 684 ASSERT(list_is_empty(&shl->procfs_list.pl_list)); 685 686 } 687 688 static int 689 spa_mmp_history_clear(procfs_list_t *procfs_list) 690 { 691 spa_history_list_t *shl = procfs_list->pl_private; 692 mutex_enter(&procfs_list->pl_lock); 693 spa_mmp_history_truncate(shl, 0); 694 mutex_exit(&procfs_list->pl_lock); 695 return (0); 696 } 697 698 static void 699 spa_mmp_history_init(spa_t *spa) 700 { 701 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 702 char *module; 703 704 shl->size = 0; 705 706 module = kmem_asprintf("zfs/%s", spa_name(spa)); 707 708 shl->procfs_list.pl_private = shl; 709 procfs_list_install(module, 710 "multihost", 711 0644, 712 &shl->procfs_list, 713 spa_mmp_history_show, 714 spa_mmp_history_show_header, 715 spa_mmp_history_clear, 716 offsetof(spa_mmp_history_t, smh_node)); 717 718 kmem_strfree(module); 719 } 720 721 static void 722 spa_mmp_history_destroy(spa_t *spa) 723 { 724 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 725 procfs_list_uninstall(&shl->procfs_list); 726 spa_mmp_history_truncate(shl, 0); 727 
procfs_list_destroy(&shl->procfs_list); 728 } 729 730 /* 731 * Set duration in existing "skip" record to how long we have waited for a leaf 732 * vdev to become available. 733 * 734 * Important that we start search at the tail of the list where new 735 * records are inserted, so this is normally an O(1) operation. 736 */ 737 int 738 spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) 739 { 740 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 741 spa_mmp_history_t *smh; 742 int error = ENOENT; 743 744 if (zfs_multihost_history == 0 && shl->size == 0) 745 return (0); 746 747 mutex_enter(&shl->procfs_list.pl_lock); 748 for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; 749 smh = list_prev(&shl->procfs_list.pl_list, smh)) { 750 if (smh->mmp_node_id == mmp_node_id) { 751 ASSERT3U(smh->io_error, !=, 0); 752 smh->duration = gethrtime() - smh->error_start; 753 smh->vdev_guid++; 754 error = 0; 755 break; 756 } 757 } 758 mutex_exit(&shl->procfs_list.pl_lock); 759 760 return (error); 761 } 762 763 /* 764 * Set MMP write duration and error status in existing record. 765 * See comment re: search order above spa_mmp_history_set_skip(). 766 */ 767 int 768 spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, 769 hrtime_t duration) 770 { 771 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 772 spa_mmp_history_t *smh; 773 int error = ENOENT; 774 775 if (zfs_multihost_history == 0 && shl->size == 0) 776 return (0); 777 778 mutex_enter(&shl->procfs_list.pl_lock); 779 for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; 780 smh = list_prev(&shl->procfs_list.pl_list, smh)) { 781 if (smh->mmp_node_id == mmp_node_id) { 782 ASSERT(smh->io_error == 0); 783 smh->io_error = io_error; 784 smh->duration = duration; 785 error = 0; 786 break; 787 } 788 } 789 mutex_exit(&shl->procfs_list.pl_lock); 790 791 return (error); 792 } 793 794 /* 795 * Add a new MMP historical record. 796 * error == 0 : a write was issued. 
797 * error != 0 : a write was not issued because no leaves were found. 798 */ 799 void 800 spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, 801 uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, 802 int error) 803 { 804 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 805 spa_mmp_history_t *smh; 806 807 if (zfs_multihost_history == 0 && shl->size == 0) 808 return; 809 810 smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP); 811 smh->txg = txg; 812 smh->timestamp = timestamp; 813 smh->mmp_delay = mmp_delay; 814 if (vd) { 815 smh->vdev_guid = vd->vdev_guid; 816 if (vd->vdev_path) 817 smh->vdev_path = kmem_strdup(vd->vdev_path); 818 } 819 smh->vdev_label = label; 820 smh->mmp_node_id = mmp_node_id; 821 822 if (error) { 823 smh->io_error = error; 824 smh->error_start = gethrtime(); 825 smh->vdev_guid = 1; 826 } 827 828 mutex_enter(&shl->procfs_list.pl_lock); 829 procfs_list_add(&shl->procfs_list, smh); 830 shl->size++; 831 spa_mmp_history_truncate(shl, zfs_multihost_history); 832 mutex_exit(&shl->procfs_list.pl_lock); 833 } 834 835 static void * 836 spa_state_addr(kstat_t *ksp, loff_t n) 837 { 838 if (n == 0) 839 return (ksp->ks_private); /* return the spa_t */ 840 return (NULL); 841 } 842 843 static int 844 spa_state_data(char *buf, size_t size, void *data) 845 { 846 spa_t *spa = (spa_t *)data; 847 (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa)); 848 return (0); 849 } 850 851 /* 852 * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state. 853 * 854 * This is a lock-less read of the pool's state (unlike using 'zpool', which 855 * can potentially block for seconds). Because it doesn't block, it can useful 856 * as a pool heartbeat value. 
857 */ 858 static void 859 spa_state_init(spa_t *spa) 860 { 861 spa_history_kstat_t *shk = &spa->spa_stats.state; 862 char *name; 863 kstat_t *ksp; 864 865 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 866 867 name = kmem_asprintf("zfs/%s", spa_name(spa)); 868 ksp = kstat_create(name, 0, "state", "misc", 869 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 870 871 shk->kstat = ksp; 872 if (ksp) { 873 ksp->ks_lock = &shk->lock; 874 ksp->ks_data = NULL; 875 ksp->ks_private = spa; 876 ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; 877 kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr); 878 kstat_install(ksp); 879 } 880 881 kmem_strfree(name); 882 } 883 884 static void 885 spa_health_destroy(spa_t *spa) 886 { 887 spa_history_kstat_t *shk = &spa->spa_stats.state; 888 kstat_t *ksp = shk->kstat; 889 if (ksp) 890 kstat_delete(ksp); 891 892 mutex_destroy(&shk->lock); 893 } 894 895 static spa_iostats_t spa_iostats_template = { 896 { "trim_extents_written", KSTAT_DATA_UINT64 }, 897 { "trim_bytes_written", KSTAT_DATA_UINT64 }, 898 { "trim_extents_skipped", KSTAT_DATA_UINT64 }, 899 { "trim_bytes_skipped", KSTAT_DATA_UINT64 }, 900 { "trim_extents_failed", KSTAT_DATA_UINT64 }, 901 { "trim_bytes_failed", KSTAT_DATA_UINT64 }, 902 { "autotrim_extents_written", KSTAT_DATA_UINT64 }, 903 { "autotrim_bytes_written", KSTAT_DATA_UINT64 }, 904 { "autotrim_extents_skipped", KSTAT_DATA_UINT64 }, 905 { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, 906 { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, 907 { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, 908 { "simple_trim_extents_written", KSTAT_DATA_UINT64 }, 909 { "simple_trim_bytes_written", KSTAT_DATA_UINT64 }, 910 { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 }, 911 { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, 912 { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, 913 { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, 914 }; 915 916 #define SPA_IOSTATS_ADD(stat, val) \ 917 atomic_add_64(&iostats->stat.value.ui64, (val)); 918 
919 void 920 spa_iostats_trim_add(spa_t *spa, trim_type_t type, 921 uint64_t extents_written, uint64_t bytes_written, 922 uint64_t extents_skipped, uint64_t bytes_skipped, 923 uint64_t extents_failed, uint64_t bytes_failed) 924 { 925 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 926 kstat_t *ksp = shk->kstat; 927 spa_iostats_t *iostats; 928 929 if (ksp == NULL) 930 return; 931 932 iostats = ksp->ks_data; 933 if (type == TRIM_TYPE_MANUAL) { 934 SPA_IOSTATS_ADD(trim_extents_written, extents_written); 935 SPA_IOSTATS_ADD(trim_bytes_written, bytes_written); 936 SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped); 937 SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); 938 SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); 939 SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); 940 } else if (type == TRIM_TYPE_AUTO) { 941 SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); 942 SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); 943 SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); 944 SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); 945 SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); 946 SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); 947 } else { 948 SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written); 949 SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written); 950 SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped); 951 SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped); 952 SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed); 953 SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed); 954 } 955 } 956 957 static int 958 spa_iostats_update(kstat_t *ksp, int rw) 959 { 960 if (rw == KSTAT_WRITE) { 961 memcpy(ksp->ks_data, &spa_iostats_template, 962 sizeof (spa_iostats_t)); 963 } 964 965 return (0); 966 } 967 968 static void 969 spa_iostats_init(spa_t *spa) 970 { 971 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 972 973 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 974 
975 char *name = kmem_asprintf("zfs/%s", spa_name(spa)); 976 kstat_t *ksp = kstat_create(name, 0, "iostats", "misc", 977 KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t), 978 KSTAT_FLAG_VIRTUAL); 979 980 shk->kstat = ksp; 981 if (ksp) { 982 int size = sizeof (spa_iostats_t); 983 ksp->ks_lock = &shk->lock; 984 ksp->ks_private = spa; 985 ksp->ks_update = spa_iostats_update; 986 ksp->ks_data = kmem_alloc(size, KM_SLEEP); 987 memcpy(ksp->ks_data, &spa_iostats_template, size); 988 kstat_install(ksp); 989 } 990 991 kmem_strfree(name); 992 } 993 994 static void 995 spa_iostats_destroy(spa_t *spa) 996 { 997 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 998 kstat_t *ksp = shk->kstat; 999 if (ksp) { 1000 kmem_free(ksp->ks_data, sizeof (spa_iostats_t)); 1001 kstat_delete(ksp); 1002 } 1003 1004 mutex_destroy(&shk->lock); 1005 } 1006 1007 void 1008 spa_stats_init(spa_t *spa) 1009 { 1010 spa_read_history_init(spa); 1011 spa_txg_history_init(spa); 1012 spa_tx_assign_init(spa); 1013 spa_io_history_init(spa); 1014 spa_mmp_history_init(spa); 1015 spa_state_init(spa); 1016 spa_iostats_init(spa); 1017 } 1018 1019 void 1020 spa_stats_destroy(spa_t *spa) 1021 { 1022 spa_iostats_destroy(spa); 1023 spa_health_destroy(spa); 1024 spa_tx_assign_destroy(spa); 1025 spa_txg_history_destroy(spa); 1026 spa_read_history_destroy(spa); 1027 spa_io_history_destroy(spa); 1028 spa_mmp_history_destroy(spa); 1029 } 1030 1031 /* BEGIN CSTYLED */ 1032 ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW, 1033 "Historical statistics for the last N reads"); 1034 1035 ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW, 1036 "Include cache hits in read history"); 1037 1038 ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW, 1039 "Historical statistics for the last N txgs"); 1040 1041 ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW, 1042 "Historical statistics for last N multihost writes"); 1043 /* END CSTYLED */ 1044