1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 #include <sys/zfs_context.h> 23 #include <sys/spa_impl.h> 24 #include <sys/vdev_impl.h> 25 #include <sys/spa.h> 26 #include <zfs_comutil.h> 27 28 /* 29 * Keeps stats on last N reads per spa_t, disabled by default. 30 */ 31 int zfs_read_history = 0; 32 33 /* 34 * Include cache hits in history, disabled by default. 35 */ 36 int zfs_read_history_hits = 0; 37 38 /* 39 * Keeps stats on the last 100 txgs by default. 40 */ 41 int zfs_txg_history = 100; 42 43 /* 44 * Keeps stats on the last N MMP updates, disabled by default. 45 */ 46 int zfs_multihost_history = 0; 47 48 /* 49 * ========================================================================== 50 * SPA Read History Routines 51 * ========================================================================== 52 */ 53 54 /* 55 * Read statistics - Information exported regarding each arc_read call 56 */ 57 typedef struct spa_read_history { 58 hrtime_t start; /* time read completed */ 59 uint64_t objset; /* read from this objset */ 60 uint64_t object; /* read of this object number */ 61 uint64_t level; /* block's indirection level */ 62 uint64_t blkid; /* read of this block id */ 63 char origin[24]; /* read originated from here */ 64 uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ 65 pid_t pid; /* PID of task doing read */ 66 char comm[16]; /* process name of task doing read */ 67 procfs_list_node_t srh_node; 68 } spa_read_history_t; 69 70 static int 71 spa_read_history_show_header(struct seq_file *f) 72 { 73 seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " 74 "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", 75 "level", "blkid", "aflags", "origin", "pid", "process"); 76 77 return (0); 78 } 79 80 static int 81 spa_read_history_show(struct seq_file *f, void *data) 82 { 83 spa_read_history_t *srh = (spa_read_history_t *)data; 84 85 seq_printf(f, "%-8llu %-16llu 0x%-6llx " 86 "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", 87 (u_longlong_t)srh->srh_node.pln_id, srh->start, 88 (longlong_t)srh->objset, (longlong_t)srh->object, 89 (longlong_t)srh->level, (longlong_t)srh->blkid, 90 srh->aflags, srh->origin, srh->pid, srh->comm); 91 92 return (0); 93 } 94 95 /* Remove oldest elements from list until there are no more than 'size' left */ 96 static void 97 spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) 98 { 99 spa_read_history_t *srh; 100 while (shl->size > size) { 101 srh = list_remove_head(&shl->procfs_list.pl_list); 102 ASSERT3P(srh, !=, NULL); 103 kmem_free(srh, sizeof (spa_read_history_t)); 104 shl->size--; 105 } 106 107 if (size == 0) 108 ASSERT(list_is_empty(&shl->procfs_list.pl_list)); 109 } 110 111 static int 112 spa_read_history_clear(procfs_list_t *procfs_list) 113 { 114 spa_history_list_t *shl = procfs_list->pl_private; 115 mutex_enter(&procfs_list->pl_lock); 116 spa_read_history_truncate(shl, 0); 117 mutex_exit(&procfs_list->pl_lock); 118 return (0); 119 } 120 121 static void 122 spa_read_history_init(spa_t *spa) 123 { 124 spa_history_list_t *shl = &spa->spa_stats.read_history; 125 126 shl->size = 0; 127 shl->procfs_list.pl_private = shl; 128 procfs_list_install("zfs", 129 spa_name(spa), 130 "reads", 131 0600, 132 &shl->procfs_list, 133 spa_read_history_show, 134 spa_read_history_show_header, 135 spa_read_history_clear, 136 offsetof(spa_read_history_t, srh_node)); 137 } 138 139 static void 140 spa_read_history_destroy(spa_t *spa) 141 { 142 spa_history_list_t *shl = &spa->spa_stats.read_history; 143 procfs_list_uninstall(&shl->procfs_list); 144 spa_read_history_truncate(shl, 0); 145 procfs_list_destroy(&shl->procfs_list); 146 } 147 148 void 149 spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) 150 { 151 spa_history_list_t *shl = &spa->spa_stats.read_history; 152 spa_read_history_t *srh; 153 154 ASSERT3P(spa, !=, NULL); 155 ASSERT3P(zb, !=, NULL); 156 157 if (zfs_read_history == 0 && shl->size == 0) 158 return; 159 160 if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) 161 return; 162 163 srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); 164 strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); 165 srh->start = gethrtime(); 166 srh->objset = zb->zb_objset; 167 srh->object = zb->zb_object; 168 srh->level = zb->zb_level; 169 srh->blkid = zb->zb_blkid; 170 srh->aflags = aflags; 171 srh->pid = getpid(); 172 173 mutex_enter(&shl->procfs_list.pl_lock); 174 175 procfs_list_add(&shl->procfs_list, srh); 176 shl->size++; 177 178 spa_read_history_truncate(shl, zfs_read_history); 179 180 mutex_exit(&shl->procfs_list.pl_lock); 181 } 182 183 /* 184 * ========================================================================== 185 * SPA TXG History Routines 186 * ========================================================================== 187 */ 188 189 /* 190 * Txg statistics - Information exported regarding each txg sync 191 */ 192 193 typedef struct spa_txg_history { 194 uint64_t txg; /* txg id */ 195 txg_state_t state; /* active txg state */ 196 uint64_t nread; /* number of bytes read */ 197 uint64_t nwritten; /* number of bytes written */ 198 uint64_t reads; /* number of read operations */ 199 uint64_t writes; /* number of write operations */ 200 uint64_t ndirty; /* number of dirty bytes */ 201 hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ 202 procfs_list_node_t sth_node; 203 } spa_txg_history_t; 204 205 static int 206 spa_txg_history_show_header(struct seq_file *f) 207 { 208 seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s " 209 "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", 210 "ndirty", "nread", "nwritten", "reads", "writes", 211 "otime", "qtime", "wtime", "stime"); 212 return (0); 213 } 214 215 static int 216 spa_txg_history_show(struct seq_file *f, void *data) 217 { 218 spa_txg_history_t *sth = (spa_txg_history_t *)data; 219 uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; 220 char state; 221 222 switch (sth->state) { 223 case TXG_STATE_BIRTH: state = 'B'; break; 224 case TXG_STATE_OPEN: state = 'O'; break; 225 case TXG_STATE_QUIESCED: state = 'Q'; break; 226 case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; 227 case TXG_STATE_SYNCED: state = 'S'; break; 228 case TXG_STATE_COMMITTED: state = 'C'; break; 229 default: state = '?'; break; 230 } 231 232 if (sth->times[TXG_STATE_OPEN]) 233 open = sth->times[TXG_STATE_OPEN] - 234 sth->times[TXG_STATE_BIRTH]; 235 236 if (sth->times[TXG_STATE_QUIESCED]) 237 quiesce = sth->times[TXG_STATE_QUIESCED] - 238 sth->times[TXG_STATE_OPEN]; 239 240 if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) 241 wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - 242 sth->times[TXG_STATE_QUIESCED]; 243 244 if (sth->times[TXG_STATE_SYNCED]) 245 sync = sth->times[TXG_STATE_SYNCED] - 246 sth->times[TXG_STATE_WAIT_FOR_SYNC]; 247 248 seq_printf(f, "%-8llu %-16llu %-5c %-12llu " 249 "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", 250 (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, 251 (u_longlong_t)sth->ndirty, 252 (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, 253 (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, 254 (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, 255 (u_longlong_t)sync); 256 257 return (0); 258 } 259 260 /* Remove oldest elements from list until there are no more than 'size' left */ 261 static void 262 spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) 263 { 264 spa_txg_history_t *sth; 265 while (shl->size > size) { 266 sth = list_remove_head(&shl->procfs_list.pl_list); 267 ASSERT3P(sth, !=, NULL); 268 kmem_free(sth, sizeof (spa_txg_history_t)); 269 shl->size--; 270 } 271 272 if (size == 0) 273 ASSERT(list_is_empty(&shl->procfs_list.pl_list)); 274 275 } 276 277 static int 278 spa_txg_history_clear(procfs_list_t *procfs_list) 279 { 280 spa_history_list_t *shl = procfs_list->pl_private; 281 mutex_enter(&procfs_list->pl_lock); 282 spa_txg_history_truncate(shl, 0); 283 mutex_exit(&procfs_list->pl_lock); 284 return (0); 285 } 286 287 static void 288 spa_txg_history_init(spa_t *spa) 289 { 290 spa_history_list_t *shl = &spa->spa_stats.txg_history; 291 292 shl->size = 0; 293 shl->procfs_list.pl_private = shl; 294 procfs_list_install("zfs", 295 spa_name(spa), 296 "txgs", 297 0644, 298 &shl->procfs_list, 299 spa_txg_history_show, 300 spa_txg_history_show_header, 301 spa_txg_history_clear, 302 offsetof(spa_txg_history_t, sth_node)); 303 } 304 305 static void 306 spa_txg_history_destroy(spa_t *spa) 307 { 308 spa_history_list_t *shl = &spa->spa_stats.txg_history; 309 procfs_list_uninstall(&shl->procfs_list); 310 spa_txg_history_truncate(shl, 0); 311 procfs_list_destroy(&shl->procfs_list); 312 } 313 314 /* 315 * Add a new txg to historical record. 316 */ 317 void 318 spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) 319 { 320 spa_history_list_t *shl = &spa->spa_stats.txg_history; 321 spa_txg_history_t *sth; 322 323 if (zfs_txg_history == 0 && shl->size == 0) 324 return; 325 326 sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); 327 sth->txg = txg; 328 sth->state = TXG_STATE_OPEN; 329 sth->times[TXG_STATE_BIRTH] = birth_time; 330 331 mutex_enter(&shl->procfs_list.pl_lock); 332 procfs_list_add(&shl->procfs_list, sth); 333 shl->size++; 334 spa_txg_history_truncate(shl, zfs_txg_history); 335 mutex_exit(&shl->procfs_list.pl_lock); 336 } 337 338 /* 339 * Set txg state completion time and increment current state. 340 */ 341 int 342 spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, 343 hrtime_t completed_time) 344 { 345 spa_history_list_t *shl = &spa->spa_stats.txg_history; 346 spa_txg_history_t *sth; 347 int error = ENOENT; 348 349 if (zfs_txg_history == 0) 350 return (0); 351 352 mutex_enter(&shl->procfs_list.pl_lock); 353 for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; 354 sth = list_prev(&shl->procfs_list.pl_list, sth)) { 355 if (sth->txg == txg) { 356 sth->times[completed_state] = completed_time; 357 sth->state++; 358 error = 0; 359 break; 360 } 361 } 362 mutex_exit(&shl->procfs_list.pl_lock); 363 364 return (error); 365 } 366 367 /* 368 * Set txg IO stats. 369 */ 370 static int 371 spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, 372 uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) 373 { 374 spa_history_list_t *shl = &spa->spa_stats.txg_history; 375 spa_txg_history_t *sth; 376 int error = ENOENT; 377 378 if (zfs_txg_history == 0) 379 return (0); 380 381 mutex_enter(&shl->procfs_list.pl_lock); 382 for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; 383 sth = list_prev(&shl->procfs_list.pl_list, sth)) { 384 if (sth->txg == txg) { 385 sth->nread = nread; 386 sth->nwritten = nwritten; 387 sth->reads = reads; 388 sth->writes = writes; 389 sth->ndirty = ndirty; 390 error = 0; 391 break; 392 } 393 } 394 mutex_exit(&shl->procfs_list.pl_lock); 395 396 return (error); 397 } 398 399 txg_stat_t * 400 spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) 401 { 402 txg_stat_t *ts; 403 404 if (zfs_txg_history == 0) 405 return (NULL); 406 407 ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); 408 409 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 410 vdev_get_stats(spa->spa_root_vdev, &ts->vs1); 411 spa_config_exit(spa, SCL_CONFIG, FTAG); 412 413 ts->txg = txg; 414 ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; 415 416 spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); 417 418 return (ts); 419 } 420 421 void 422 spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) 423 { 424 if (ts == NULL) 425 return; 426 427 if (zfs_txg_history == 0) { 428 kmem_free(ts, sizeof (txg_stat_t)); 429 return; 430 } 431 432 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 433 vdev_get_stats(spa->spa_root_vdev, &ts->vs2); 434 spa_config_exit(spa, SCL_CONFIG, FTAG); 435 436 spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); 437 spa_txg_history_set_io(spa, ts->txg, 438 ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], 439 ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], 440 ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ], 441 ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], 442 ts->ndirty); 443 444 kmem_free(ts, sizeof (txg_stat_t)); 445 } 446 447 /* 448 * ========================================================================== 449 * SPA TX Assign Histogram Routines 450 * ========================================================================== 451 */ 452 453 /* 454 * Tx statistics - Information exported regarding dmu_tx_assign time. 455 */ 456 457 /* 458 * When the kstat is written zero all buckets. When the kstat is read 459 * count the number of trailing buckets set to zero and update ks_ndata 460 * such that they are not output. 461 */ 462 static int 463 spa_tx_assign_update(kstat_t *ksp, int rw) 464 { 465 spa_t *spa = ksp->ks_private; 466 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 467 int i; 468 469 if (rw == KSTAT_WRITE) { 470 for (i = 0; i < shk->count; i++) 471 ((kstat_named_t *)shk->priv)[i].value.ui64 = 0; 472 } 473 474 for (i = shk->count; i > 0; i--) 475 if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0) 476 break; 477 478 ksp->ks_ndata = i; 479 ksp->ks_data_size = i * sizeof (kstat_named_t); 480 481 return (0); 482 } 483 484 static void 485 spa_tx_assign_init(spa_t *spa) 486 { 487 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 488 char *name; 489 kstat_named_t *ks; 490 kstat_t *ksp; 491 int i; 492 493 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 494 495 shk->count = 42; /* power of two buckets for 1ns to 2,199s */ 496 shk->size = shk->count * sizeof (kstat_named_t); 497 shk->priv = kmem_alloc(shk->size, KM_SLEEP); 498 499 name = kmem_asprintf("zfs/%s", spa_name(spa)); 500 501 for (i = 0; i < shk->count; i++) { 502 ks = &((kstat_named_t *)shk->priv)[i]; 503 ks->data_type = KSTAT_DATA_UINT64; 504 ks->value.ui64 = 0; 505 (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", 506 (u_longlong_t)1 << i); 507 } 508 509 ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", 510 KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); 511 shk->kstat = ksp; 512 513 if (ksp) { 514 ksp->ks_lock = &shk->lock; 515 ksp->ks_data = shk->priv; 516 ksp->ks_ndata = shk->count; 517 ksp->ks_data_size = shk->size; 518 ksp->ks_private = spa; 519 ksp->ks_update = spa_tx_assign_update; 520 kstat_install(ksp); 521 } 522 kmem_strfree(name); 523 } 524 525 static void 526 spa_tx_assign_destroy(spa_t *spa) 527 { 528 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 529 kstat_t *ksp; 530 531 ksp = shk->kstat; 532 if (ksp) 533 kstat_delete(ksp); 534 535 kmem_free(shk->priv, shk->size); 536 mutex_destroy(&shk->lock); 537 } 538 539 void 540 spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) 541 { 542 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 543 uint64_t idx = 0; 544 545 while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) 546 idx++; 547 548 atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64); 549 } 550 551 /* 552 * ========================================================================== 553 * SPA IO History Routines 554 * ========================================================================== 555 */ 556 static int 557 spa_io_history_update(kstat_t *ksp, int rw) 558 { 559 if (rw == KSTAT_WRITE) 560 memset(ksp->ks_data, 0, ksp->ks_data_size); 561 562 return (0); 563 } 564 565 static void 566 spa_io_history_init(spa_t *spa) 567 { 568 spa_history_kstat_t *shk = &spa->spa_stats.io_history; 569 char *name; 570 kstat_t *ksp; 571 572 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 573 574 name = kmem_asprintf("zfs/%s", spa_name(spa)); 575 576 ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); 577 shk->kstat = ksp; 578 579 if (ksp) { 580 ksp->ks_lock = &shk->lock; 581 ksp->ks_private = spa; 582 ksp->ks_update = spa_io_history_update; 583 kstat_install(ksp); 584 } 585 kmem_strfree(name); 586 } 587 588 static void 589 spa_io_history_destroy(spa_t *spa) 590 { 591 spa_history_kstat_t *shk = &spa->spa_stats.io_history; 592 593 if (shk->kstat) 594 kstat_delete(shk->kstat); 595 596 mutex_destroy(&shk->lock); 597 } 598 599 /* 600 * ========================================================================== 601 * SPA MMP History Routines 602 * ========================================================================== 603 */ 604 605 /* 606 * MMP statistics - Information exported regarding attempted MMP writes 607 * For MMP writes issued, fields used as per comments below. 608 * For MMP writes skipped, an entry represents a span of time when 609 * writes were skipped for same reason (error from mmp_random_leaf). 610 * Differences are: 611 * timestamp time first write skipped, if >1 skipped in a row 612 * mmp_delay delay value at timestamp 613 * vdev_guid number of writes skipped 614 * io_error one of enum mmp_error 615 * duration time span (ns) of skipped writes 616 */ 617 618 typedef struct spa_mmp_history { 619 uint64_t mmp_node_id; /* unique # for updates */ 620 uint64_t txg; /* txg of last sync */ 621 uint64_t timestamp; /* UTC time MMP write issued */ 622 uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ 623 uint64_t vdev_guid; /* unique ID of leaf vdev */ 624 char *vdev_path; 625 int vdev_label; /* vdev label */ 626 int io_error; /* error status of MMP write */ 627 hrtime_t error_start; /* hrtime of start of error period */ 628 hrtime_t duration; /* time from submission to completion */ 629 procfs_list_node_t smh_node; 630 } spa_mmp_history_t; 631 632 static int 633 spa_mmp_history_show_header(struct seq_file *f) 634 { 635 seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " 636 "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", 637 "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); 638 return (0); 639 } 640 641 static int 642 spa_mmp_history_show(struct seq_file *f, void *data) 643 { 644 spa_mmp_history_t *smh = (spa_mmp_history_t *)data; 645 char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " 646 "%-10lld %s\n"; 647 char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " 648 "%-10lld %s\n"; 649 650 seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), 651 (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, 652 (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, 653 (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, 654 (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, 655 (smh->vdev_path ? smh->vdev_path : "-")); 656 657 return (0); 658 } 659 660 /* Remove oldest elements from list until there are no more than 'size' left */ 661 static void 662 spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) 663 { 664 spa_mmp_history_t *smh; 665 while (shl->size > size) { 666 smh = list_remove_head(&shl->procfs_list.pl_list); 667 if (smh->vdev_path) 668 kmem_strfree(smh->vdev_path); 669 kmem_free(smh, sizeof (spa_mmp_history_t)); 670 shl->size--; 671 } 672 673 if (size == 0) 674 ASSERT(list_is_empty(&shl->procfs_list.pl_list)); 675 676 } 677 678 static int 679 spa_mmp_history_clear(procfs_list_t *procfs_list) 680 { 681 spa_history_list_t *shl = procfs_list->pl_private; 682 mutex_enter(&procfs_list->pl_lock); 683 spa_mmp_history_truncate(shl, 0); 684 mutex_exit(&procfs_list->pl_lock); 685 return (0); 686 } 687 688 static void 689 spa_mmp_history_init(spa_t *spa) 690 { 691 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 692 693 shl->size = 0; 694 695 shl->procfs_list.pl_private = shl; 696 procfs_list_install("zfs", 697 spa_name(spa), 698 "multihost", 699 0644, 700 &shl->procfs_list, 701 spa_mmp_history_show, 702 spa_mmp_history_show_header, 703 spa_mmp_history_clear, 704 offsetof(spa_mmp_history_t, smh_node)); 705 } 706 707 static void 708 spa_mmp_history_destroy(spa_t *spa) 709 { 710 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 711 procfs_list_uninstall(&shl->procfs_list); 712 spa_mmp_history_truncate(shl, 0); 713 procfs_list_destroy(&shl->procfs_list); 714 } 715 716 /* 717 * Set duration in existing "skip" record to how long we have waited for a leaf 718 * vdev to become available. 719 * 720 * Important that we start search at the tail of the list where new 721 * records are inserted, so this is normally an O(1) operation. 722 */ 723 int 724 spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) 725 { 726 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 727 spa_mmp_history_t *smh; 728 int error = ENOENT; 729 730 if (zfs_multihost_history == 0 && shl->size == 0) 731 return (0); 732 733 mutex_enter(&shl->procfs_list.pl_lock); 734 for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; 735 smh = list_prev(&shl->procfs_list.pl_list, smh)) { 736 if (smh->mmp_node_id == mmp_node_id) { 737 ASSERT3U(smh->io_error, !=, 0); 738 smh->duration = gethrtime() - smh->error_start; 739 smh->vdev_guid++; 740 error = 0; 741 break; 742 } 743 } 744 mutex_exit(&shl->procfs_list.pl_lock); 745 746 return (error); 747 } 748 749 /* 750 * Set MMP write duration and error status in existing record. 751 * See comment re: search order above spa_mmp_history_set_skip(). 752 */ 753 int 754 spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, 755 hrtime_t duration) 756 { 757 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 758 spa_mmp_history_t *smh; 759 int error = ENOENT; 760 761 if (zfs_multihost_history == 0 && shl->size == 0) 762 return (0); 763 764 mutex_enter(&shl->procfs_list.pl_lock); 765 for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; 766 smh = list_prev(&shl->procfs_list.pl_list, smh)) { 767 if (smh->mmp_node_id == mmp_node_id) { 768 ASSERT(smh->io_error == 0); 769 smh->io_error = io_error; 770 smh->duration = duration; 771 error = 0; 772 break; 773 } 774 } 775 mutex_exit(&shl->procfs_list.pl_lock); 776 777 return (error); 778 } 779 780 /* 781 * Add a new MMP historical record. 782 * error == 0 : a write was issued. 783 * error != 0 : a write was not issued because no leaves were found. 784 */ 785 void 786 spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, 787 uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, 788 int error) 789 { 790 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 791 spa_mmp_history_t *smh; 792 793 if (zfs_multihost_history == 0 && shl->size == 0) 794 return; 795 796 smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP); 797 smh->txg = txg; 798 smh->timestamp = timestamp; 799 smh->mmp_delay = mmp_delay; 800 if (vd) { 801 smh->vdev_guid = vd->vdev_guid; 802 if (vd->vdev_path) 803 smh->vdev_path = kmem_strdup(vd->vdev_path); 804 } 805 smh->vdev_label = label; 806 smh->mmp_node_id = mmp_node_id; 807 808 if (error) { 809 smh->io_error = error; 810 smh->error_start = gethrtime(); 811 smh->vdev_guid = 1; 812 } 813 814 mutex_enter(&shl->procfs_list.pl_lock); 815 procfs_list_add(&shl->procfs_list, smh); 816 shl->size++; 817 spa_mmp_history_truncate(shl, zfs_multihost_history); 818 mutex_exit(&shl->procfs_list.pl_lock); 819 } 820 821 static void * 822 spa_state_addr(kstat_t *ksp, loff_t n) 823 { 824 if (n == 0) 825 return (ksp->ks_private); /* return the spa_t */ 826 return (NULL); 827 } 828 829 static int 830 spa_state_data(char *buf, size_t size, void *data) 831 { 832 spa_t *spa = (spa_t *)data; 833 (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa)); 834 return (0); 835 } 836 837 /* 838 * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state. 839 * 840 * This is a lock-less read of the pool's state (unlike using 'zpool', which 841 * can potentially block for seconds). Because it doesn't block, it can useful 842 * as a pool heartbeat value. 843 */ 844 static void 845 spa_state_init(spa_t *spa) 846 { 847 spa_history_kstat_t *shk = &spa->spa_stats.state; 848 char *name; 849 kstat_t *ksp; 850 851 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 852 853 name = kmem_asprintf("zfs/%s", spa_name(spa)); 854 ksp = kstat_create(name, 0, "state", "misc", 855 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 856 857 shk->kstat = ksp; 858 if (ksp) { 859 ksp->ks_lock = &shk->lock; 860 ksp->ks_data = NULL; 861 ksp->ks_private = spa; 862 ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; 863 kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr); 864 kstat_install(ksp); 865 } 866 867 kmem_strfree(name); 868 } 869 870 static void 871 spa_health_destroy(spa_t *spa) 872 { 873 spa_history_kstat_t *shk = &spa->spa_stats.state; 874 kstat_t *ksp = shk->kstat; 875 if (ksp) 876 kstat_delete(ksp); 877 878 mutex_destroy(&shk->lock); 879 } 880 881 static spa_iostats_t spa_iostats_template = { 882 { "trim_extents_written", KSTAT_DATA_UINT64 }, 883 { "trim_bytes_written", KSTAT_DATA_UINT64 }, 884 { "trim_extents_skipped", KSTAT_DATA_UINT64 }, 885 { "trim_bytes_skipped", KSTAT_DATA_UINT64 }, 886 { "trim_extents_failed", KSTAT_DATA_UINT64 }, 887 { "trim_bytes_failed", KSTAT_DATA_UINT64 }, 888 { "autotrim_extents_written", KSTAT_DATA_UINT64 }, 889 { "autotrim_bytes_written", KSTAT_DATA_UINT64 }, 890 { "autotrim_extents_skipped", KSTAT_DATA_UINT64 }, 891 { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, 892 { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, 893 { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, 894 { "simple_trim_extents_written", KSTAT_DATA_UINT64 }, 895 { "simple_trim_bytes_written", KSTAT_DATA_UINT64 }, 896 { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 }, 897 { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, 898 { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, 899 { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, 900 }; 901 902 #define SPA_IOSTATS_ADD(stat, val) \ 903 atomic_add_64(&iostats->stat.value.ui64, (val)); 904 905 void 906 spa_iostats_trim_add(spa_t *spa, trim_type_t type, 907 uint64_t extents_written, uint64_t bytes_written, 908 uint64_t extents_skipped, uint64_t bytes_skipped, 909 uint64_t extents_failed, uint64_t bytes_failed) 910 { 911 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 912 kstat_t *ksp = shk->kstat; 913 spa_iostats_t *iostats; 914 915 if (ksp == NULL) 916 return; 917 918 iostats = ksp->ks_data; 919 if (type == TRIM_TYPE_MANUAL) { 920 SPA_IOSTATS_ADD(trim_extents_written, extents_written); 921 SPA_IOSTATS_ADD(trim_bytes_written, bytes_written); 922 SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped); 923 SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); 924 SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); 925 SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); 926 } else if (type == TRIM_TYPE_AUTO) { 927 SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); 928 SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); 929 SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); 930 SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); 931 SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); 932 SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); 933 } else { 934 SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written); 935 SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written); 936 SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped); 937 SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped); 938 SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed); 939 SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed); 940 } 941 } 942 943 static int 944 spa_iostats_update(kstat_t *ksp, int rw) 945 { 946 if (rw == KSTAT_WRITE) { 947 memcpy(ksp->ks_data, &spa_iostats_template, 948 sizeof (spa_iostats_t)); 949 } 950 951 return (0); 952 } 953 954 static void 955 spa_iostats_init(spa_t *spa) 956 { 957 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 958 959 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 960 961 char *name = kmem_asprintf("zfs/%s", spa_name(spa)); 962 kstat_t *ksp = kstat_create(name, 0, "iostats", "misc", 963 KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t), 964 KSTAT_FLAG_VIRTUAL); 965 966 shk->kstat = ksp; 967 if (ksp) { 968 int size = sizeof (spa_iostats_t); 969 ksp->ks_lock = &shk->lock; 970 ksp->ks_private = spa; 971 ksp->ks_update = spa_iostats_update; 972 ksp->ks_data = kmem_alloc(size, KM_SLEEP); 973 memcpy(ksp->ks_data, &spa_iostats_template, size); 974 kstat_install(ksp); 975 } 976 977 kmem_strfree(name); 978 } 979 980 static void 981 spa_iostats_destroy(spa_t *spa) 982 { 983 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 984 kstat_t *ksp = shk->kstat; 985 if (ksp) { 986 kmem_free(ksp->ks_data, sizeof (spa_iostats_t)); 987 kstat_delete(ksp); 988 } 989 990 mutex_destroy(&shk->lock); 991 } 992 993 void 994 spa_stats_init(spa_t *spa) 995 { 996 spa_read_history_init(spa); 997 spa_txg_history_init(spa); 998 spa_tx_assign_init(spa); 999 spa_io_history_init(spa); 1000 spa_mmp_history_init(spa); 1001 spa_state_init(spa); 1002 spa_iostats_init(spa); 1003 } 1004 1005 void 1006 spa_stats_destroy(spa_t *spa) 1007 { 1008 spa_iostats_destroy(spa); 1009 spa_health_destroy(spa); 1010 spa_tx_assign_destroy(spa); 1011 spa_txg_history_destroy(spa); 1012 spa_read_history_destroy(spa); 1013 spa_io_history_destroy(spa); 1014 spa_mmp_history_destroy(spa); 1015 } 1016 1017 /* BEGIN CSTYLED */ 1018 ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW, 1019 "Historical statistics for the last N reads"); 1020 1021 ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW, 1022 "Include cache hits in read history"); 1023 1024 ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW, 1025 "Historical statistics for the last N txgs"); 1026 1027 ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW, 1028 "Historical statistics for last N multihost writes"); 1029 /* END CSTYLED */ 1030