/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/spa.h>
#include <zfs_comutil.h>

/*
 * Keeps stats on last N reads per spa_t, disabled by default.
 */
static int zfs_read_history = B_FALSE;

/*
 * Include cache hits in history, disabled by default.
 */
static int zfs_read_history_hits = B_FALSE;

/*
 * Keeps stats on the last 100 txgs by default.
 */
static int zfs_txg_history = 100;

/*
 * Keeps stats on the last N MMP updates, disabled by default.
 */
int zfs_multihost_history = B_FALSE;
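
/*
 * Illustrative note: on Linux these tunables are exported as module
 * parameters via the ZFS_MODULE_PARAM() declarations at the bottom of
 * this file, so (for example) a 256-entry read history could be enabled
 * at runtime with something like:
 *
 *	echo 256 > /sys/module/zfs/parameters/zfs_read_history
 */
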
/*
 * ==========================================================================
 * SPA Read History Routines
 * ==========================================================================
 */

/*
 * Read statistics - Information exported regarding each arc_read call
 */
typedef struct spa_read_history {
	hrtime_t	start;		/* time read completed */
	uint64_t	objset;		/* read from this objset */
	uint64_t	object;		/* read of this object number */
	uint64_t	level;		/* block's indirection level */
	uint64_t	blkid;		/* read of this block id */
	char		origin[24];	/* read originated from here */
	uint32_t	aflags;		/* ARC flags (cached, prefetch, etc.) */
	pid_t		pid;		/* PID of task doing read */
	char		comm[16];	/* process name of task doing read */
	procfs_list_node_t	srh_node;
} spa_read_history_t;

static int
spa_read_history_show_header(struct seq_file *f)
{
	seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
	    "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
	    "level", "blkid", "aflags", "origin", "pid", "process");

	return (0);
}

static int
spa_read_history_show(struct seq_file *f, void *data)
{
	spa_read_history_t *srh = (spa_read_history_t *)data;

	seq_printf(f, "%-8llu %-16llu 0x%-6llx "
	    "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
	    (u_longlong_t)srh->srh_node.pln_id, srh->start,
	    (longlong_t)srh->objset, (longlong_t)srh->object,
	    (longlong_t)srh->level, (longlong_t)srh->blkid,
	    srh->aflags, srh->origin, srh->pid, srh->comm);

	return (0);
}

/* Remove oldest elements from list until there are no more than 'size' left */
static void
spa_read_history_truncate(spa_history_list_t *shl, unsigned int size)
{
	spa_read_history_t *srh;
	while (shl->size > size) {
		srh = list_remove_head(&shl->procfs_list.pl_list);
		ASSERT3P(srh, !=, NULL);
		kmem_free(srh, sizeof (spa_read_history_t));
		shl->size--;
	}

	if (size == 0)
		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
}

static int
spa_read_history_clear(procfs_list_t *procfs_list)
{
	spa_history_list_t *shl = procfs_list->pl_private;
	mutex_enter(&procfs_list->pl_lock);
	spa_read_history_truncate(shl, 0);
	mutex_exit(&procfs_list->pl_lock);
	return (0);
}

static void
spa_read_history_init(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;

	shl->size = 0;
	shl->procfs_list.pl_private = shl;
	procfs_list_install("zfs",
	    spa_name(spa),
	    "reads",
	    0600,
	    &shl->procfs_list,
	    spa_read_history_show,
	    spa_read_history_show_header,
	    spa_read_history_clear,
	    offsetof(spa_read_history_t, srh_node));
}

static void
spa_read_history_destroy(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;
	procfs_list_uninstall(&shl->procfs_list);
	spa_read_history_truncate(shl, 0);
	procfs_list_destroy(&shl->procfs_list);
}

void
spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;
	spa_read_history_t *srh;

	ASSERT3P(spa, !=, NULL);
	ASSERT3P(zb, !=, NULL);

	if (zfs_read_history == 0 && shl->size == 0)
		return;

	if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
		return;

	srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
	strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
	srh->start = gethrtime();
	srh->objset = zb->zb_objset;
	srh->object = zb->zb_object;
	srh->level = zb->zb_level;
	srh->blkid = zb->zb_blkid;
	srh->aflags = aflags;
	srh->pid = getpid();

	mutex_enter(&shl->procfs_list.pl_lock);

	procfs_list_add(&shl->procfs_list, srh);
	shl->size++;

	spa_read_history_truncate(shl, zfs_read_history);

	mutex_exit(&shl->procfs_list.pl_lock);
}
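
/*
 * Example (illustrative, "tank" is a placeholder pool name): once
 * zfs_read_history is nonzero, the history accumulates in procfs, and a
 * write to the file reaches spa_read_history_clear() above:
 *
 *	cat /proc/spl/kstat/zfs/tank/reads
 *	echo 0 > /proc/spl/kstat/zfs/tank/reads
 */
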
/*
 * ==========================================================================
 * SPA TXG History Routines
 * ==========================================================================
 */

/*
 * Txg statistics - Information exported regarding each txg sync
 */

typedef struct spa_txg_history {
	uint64_t	txg;		/* txg id */
	txg_state_t	state;		/* active txg state */
	uint64_t	nread;		/* number of bytes read */
	uint64_t	nwritten;	/* number of bytes written */
	uint64_t	reads;		/* number of read operations */
	uint64_t	writes;		/* number of write operations */
	uint64_t	ndirty;		/* number of dirty bytes */
	hrtime_t	times[TXG_STATE_COMMITTED]; /* completion times */
	procfs_list_node_t	sth_node;
} spa_txg_history_t;

static int
spa_txg_history_show_header(struct seq_file *f)
{
	seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s "
	    "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
	    "ndirty", "nread", "nwritten", "reads", "writes",
	    "otime", "qtime", "wtime", "stime");
	return (0);
}

static int
spa_txg_history_show(struct seq_file *f, void *data)
{
	spa_txg_history_t *sth = (spa_txg_history_t *)data;
	uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
	char state;

	switch (sth->state) {
	case TXG_STATE_BIRTH:		state = 'B';	break;
	case TXG_STATE_OPEN:		state = 'O';	break;
	case TXG_STATE_QUIESCED:	state = 'Q';	break;
	case TXG_STATE_WAIT_FOR_SYNC:	state = 'W';	break;
	case TXG_STATE_SYNCED:		state = 'S';	break;
	case TXG_STATE_COMMITTED:	state = 'C';	break;
	default:			state = '?';	break;
	}

	if (sth->times[TXG_STATE_OPEN])
		open = sth->times[TXG_STATE_OPEN] -
		    sth->times[TXG_STATE_BIRTH];

	if (sth->times[TXG_STATE_QUIESCED])
		quiesce = sth->times[TXG_STATE_QUIESCED] -
		    sth->times[TXG_STATE_OPEN];

	if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
		wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
		    sth->times[TXG_STATE_QUIESCED];

	if (sth->times[TXG_STATE_SYNCED])
		sync = sth->times[TXG_STATE_SYNCED] -
		    sth->times[TXG_STATE_WAIT_FOR_SYNC];

	seq_printf(f, "%-8llu %-16llu %-5c %-12llu "
	    "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
	    (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
	    (u_longlong_t)sth->ndirty,
	    (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
	    (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
	    (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
	    (u_longlong_t)sync);

	return (0);
}

/* Remove oldest elements from list until there are no more than 'size' left */
static void
spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size)
{
	spa_txg_history_t *sth;
	while (shl->size > size) {
		sth = list_remove_head(&shl->procfs_list.pl_list);
		ASSERT3P(sth, !=, NULL);
		kmem_free(sth, sizeof (spa_txg_history_t));
		shl->size--;
	}

	if (size == 0)
		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
}

static int
spa_txg_history_clear(procfs_list_t *procfs_list)
{
	spa_history_list_t *shl = procfs_list->pl_private;
	mutex_enter(&procfs_list->pl_lock);
	spa_txg_history_truncate(shl, 0);
	mutex_exit(&procfs_list->pl_lock);
	return (0);
}

static void
spa_txg_history_init(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;

	shl->size = 0;
	shl->procfs_list.pl_private = shl;
	procfs_list_install("zfs",
	    spa_name(spa),
	    "txgs",
	    0644,
	    &shl->procfs_list,
	    spa_txg_history_show,
	    spa_txg_history_show_header,
	    spa_txg_history_clear,
	    offsetof(spa_txg_history_t, sth_node));
}

static void
spa_txg_history_destroy(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	procfs_list_uninstall(&shl->procfs_list);
	spa_txg_history_truncate(shl, 0);
	procfs_list_destroy(&shl->procfs_list);
}
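
/*
 * Note (illustrative): a txg record is expected to advance through
 * BIRTH -> OPEN -> QUIESCED -> WAIT_FOR_SYNC -> SYNCED -> COMMITTED.
 * spa_txg_history_set() below stores the completion time of one state
 * and increments sth->state to the next, so callers must report the
 * states in that order.
 */
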
/*
 * Add a new txg to historical record.
 */
void
spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;

	if (zfs_txg_history == 0 && shl->size == 0)
		return;

	sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
	sth->txg = txg;
	sth->state = TXG_STATE_OPEN;
	sth->times[TXG_STATE_BIRTH] = birth_time;

	mutex_enter(&shl->procfs_list.pl_lock);
	procfs_list_add(&shl->procfs_list, sth);
	shl->size++;
	spa_txg_history_truncate(shl, zfs_txg_history);
	mutex_exit(&shl->procfs_list.pl_lock);
}

/*
 * Set txg state completion time and increment current state.
 */
int
spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
    hrtime_t completed_time)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	int error = ENOENT;

	if (zfs_txg_history == 0)
		return (0);

	mutex_enter(&shl->procfs_list.pl_lock);
	for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
	    sth = list_prev(&shl->procfs_list.pl_list, sth)) {
		if (sth->txg == txg) {
			sth->times[completed_state] = completed_time;
			sth->state++;
			error = 0;
			break;
		}
	}
	mutex_exit(&shl->procfs_list.pl_lock);

	return (error);
}

/*
 * Set txg IO stats.
 */
static int
spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
    uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	int error = ENOENT;

	if (zfs_txg_history == 0)
		return (0);

	mutex_enter(&shl->procfs_list.pl_lock);
	for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
	    sth = list_prev(&shl->procfs_list.pl_list, sth)) {
		if (sth->txg == txg) {
			sth->nread = nread;
			sth->nwritten = nwritten;
			sth->reads = reads;
			sth->writes = writes;
			sth->ndirty = ndirty;
			error = 0;
			break;
		}
	}
	mutex_exit(&shl->procfs_list.pl_lock);

	return (error);
}

txg_stat_t *
spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
{
	txg_stat_t *ts;

	if (zfs_txg_history == 0)
		return (NULL);

	ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	ts->txg = txg;
	ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];

	spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());

	return (ts);
}

void
spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
{
	if (ts == NULL)
		return;

	if (zfs_txg_history == 0) {
		kmem_free(ts, sizeof (txg_stat_t));
		return;
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
	spa_txg_history_set_io(spa, ts->txg,
	    ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
	    ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
	    ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
	    ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
	    ts->ndirty);

	kmem_free(ts, sizeof (txg_stat_t));
}
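
/*
 * Usage sketch (illustrative): the txg sync path brackets a sync with
 * the two helpers above, so the per-txg IO figures are the delta of two
 * root-vdev stat snapshots (ts->vs1 taken before, ts->vs2 after):
 *
 *	txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp);
 *	... txg syncs here ...
 *	spa_txg_history_fini_io(spa, ts);
 */
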
/*
 * ==========================================================================
 * SPA TX Assign Histogram Routines
 * ==========================================================================
 */

/*
 * Tx statistics - Information exported regarding dmu_tx_assign time.
 */

/*
 * When the kstat is written, zero all buckets. When the kstat is read,
 * count the number of trailing buckets set to zero and update ks_ndata
 * so that they are not output.
 */
static int
spa_tx_assign_update(kstat_t *ksp, int rw)
{
	spa_t *spa = ksp->ks_private;
	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
	int i;

	if (rw == KSTAT_WRITE) {
		for (i = 0; i < shk->count; i++)
			((kstat_named_t *)shk->priv)[i].value.ui64 = 0;
	}

	for (i = shk->count; i > 0; i--)
		if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0)
			break;

	ksp->ks_ndata = i;
	ksp->ks_data_size = i * sizeof (kstat_named_t);

	return (0);
}

static void
spa_tx_assign_init(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
	char *name;
	kstat_named_t *ks;
	kstat_t *ksp;
	int i;

	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);

	shk->count = 42; /* power of two buckets for 1ns to 2,199s */
	shk->size = shk->count * sizeof (kstat_named_t);
	shk->priv = kmem_alloc(shk->size, KM_SLEEP);

	name = kmem_asprintf("zfs/%s", spa_name(spa));

	for (i = 0; i < shk->count; i++) {
		ks = &((kstat_named_t *)shk->priv)[i];
		ks->data_type = KSTAT_DATA_UINT64;
		ks->value.ui64 = 0;
		(void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
		    (u_longlong_t)1 << i);
	}

	ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
	    KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
	shk->kstat = ksp;

	if (ksp) {
		ksp->ks_lock = &shk->lock;
		ksp->ks_data = shk->priv;
		ksp->ks_ndata = shk->count;
		ksp->ks_data_size = shk->size;
		ksp->ks_private = spa;
		ksp->ks_update = spa_tx_assign_update;
		kstat_install(ksp);
	}
	kmem_strfree(name);
}

static void
spa_tx_assign_destroy(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
	kstat_t *ksp;

	ksp = shk->kstat;
	if (ksp)
		kstat_delete(ksp);

	kmem_free(shk->priv, shk->size);
	mutex_destroy(&shk->lock);
}

void
spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
{
	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
	uint64_t idx = 0;

	/*
	 * Clamp to the last bucket: 'count' is the number of buckets;
	 * 'size' is their total size in bytes and must not be used as an
	 * index bound.
	 */
	while (((1ULL << idx) < nsecs) && (idx < shk->count - 1))
		idx++;

	atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64);
}
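
/*
 * Worked example (illustrative): bucket i counts assignments that took
 * at most 2^i ns, so with 42 buckets the largest covers
 * 2^41 ns ~= 2,199 s, matching the comment in spa_tx_assign_init()
 * above.
 */
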
/*
 * ==========================================================================
 * SPA MMP History Routines
 * ==========================================================================
 */

/*
 * MMP statistics - Information exported regarding attempted MMP writes
 *   For MMP writes issued, fields used as per comments below.
 *   For MMP writes skipped, an entry represents a span of time when
 *     writes were skipped for the same reason (error from mmp_random_leaf).
 *     Differences are:
 *     timestamp	time first write skipped, if >1 skipped in a row
 *     mmp_delay	delay value at timestamp
 *     vdev_guid	number of writes skipped
 *     io_error		one of enum mmp_error
 *     duration		time span (ns) of skipped writes
 */

typedef struct spa_mmp_history {
	uint64_t	mmp_node_id;	/* unique # for updates */
	uint64_t	txg;		/* txg of last sync */
	uint64_t	timestamp;	/* UTC time MMP write issued */
	uint64_t	mmp_delay;	/* mmp_thread.mmp_delay at timestamp */
	uint64_t	vdev_guid;	/* unique ID of leaf vdev */
	char		*vdev_path;
	int		vdev_label;	/* vdev label */
	int		io_error;	/* error status of MMP write */
	hrtime_t	error_start;	/* hrtime of start of error period */
	hrtime_t	duration;	/* time from submission to completion */
	procfs_list_node_t	smh_node;
} spa_mmp_history_t;

static int
spa_mmp_history_show_header(struct seq_file *f)
{
	seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
	    "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
	    "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
	return (0);
}

static int
spa_mmp_history_show(struct seq_file *f, void *data)
{
	spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
	char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
	    "%-10lld %s\n";
	char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
	    "%-10lld %s\n";

	seq_printf(f, (smh->error_start ? skip_fmt : write_fmt),
	    (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg,
	    (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
	    (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
	    (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
	    (smh->vdev_path ? smh->vdev_path : "-"));

	return (0);
}

/* Remove oldest elements from list until there are no more than 'size' left */
static void
spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size)
{
	spa_mmp_history_t *smh;
	while (shl->size > size) {
		smh = list_remove_head(&shl->procfs_list.pl_list);
		if (smh->vdev_path)
			kmem_strfree(smh->vdev_path);
		kmem_free(smh, sizeof (spa_mmp_history_t));
		shl->size--;
	}

	if (size == 0)
		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
}

static int
spa_mmp_history_clear(procfs_list_t *procfs_list)
{
	spa_history_list_t *shl = procfs_list->pl_private;
	mutex_enter(&procfs_list->pl_lock);
	spa_mmp_history_truncate(shl, 0);
	mutex_exit(&procfs_list->pl_lock);
	return (0);
}

static void
spa_mmp_history_init(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.mmp_history;

	shl->size = 0;

	shl->procfs_list.pl_private = shl;
	procfs_list_install("zfs",
	    spa_name(spa),
	    "multihost",
	    0644,
	    &shl->procfs_list,
	    spa_mmp_history_show,
	    spa_mmp_history_show_header,
	    spa_mmp_history_clear,
	    offsetof(spa_mmp_history_t, smh_node));
}

static void
spa_mmp_history_destroy(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
	procfs_list_uninstall(&shl->procfs_list);
	spa_mmp_history_truncate(shl, 0);
	procfs_list_destroy(&shl->procfs_list);
}
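
/*
 * Example (illustrative, "tank" is a placeholder pool name): as with the
 * read history, the MMP history appears in procfs once the tunable is
 * nonzero:
 *
 *	echo 100 > /sys/module/zfs/parameters/zfs_multihost_history
 *	cat /proc/spl/kstat/zfs/tank/multihost
 */
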
/*
 * Set the duration in an existing "skip" record to how long we have waited
 * for a leaf vdev to become available.
 *
 * It is important that we start the search at the tail of the list, where
 * new records are inserted, so this is normally an O(1) operation.
 */
int
spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
{
	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
	spa_mmp_history_t *smh;
	int error = ENOENT;

	if (zfs_multihost_history == 0 && shl->size == 0)
		return (0);

	mutex_enter(&shl->procfs_list.pl_lock);
	for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
	    smh = list_prev(&shl->procfs_list.pl_list, smh)) {
		if (smh->mmp_node_id == mmp_node_id) {
			ASSERT3U(smh->io_error, !=, 0);
			smh->duration = gethrtime() - smh->error_start;
			smh->vdev_guid++;
			error = 0;
			break;
		}
	}
	mutex_exit(&shl->procfs_list.pl_lock);

	return (error);
}

/*
 * Set MMP write duration and error status in existing record.
 * See comment re: search order above spa_mmp_history_set_skip().
 */
int
spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
    hrtime_t duration)
{
	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
	spa_mmp_history_t *smh;
	int error = ENOENT;

	if (zfs_multihost_history == 0 && shl->size == 0)
		return (0);

	mutex_enter(&shl->procfs_list.pl_lock);
	for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
	    smh = list_prev(&shl->procfs_list.pl_list, smh)) {
		if (smh->mmp_node_id == mmp_node_id) {
			ASSERT(smh->io_error == 0);
			smh->io_error = io_error;
			smh->duration = duration;
			error = 0;
			break;
		}
	}
	mutex_exit(&shl->procfs_list.pl_lock);

	return (error);
}

/*
 * Add a new MMP historical record.
 * error == 0 : a write was issued.
 * error != 0 : a write was not issued because no leaves were found.
 */
void
spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
    uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
    int error)
{
	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
	spa_mmp_history_t *smh;

	if (zfs_multihost_history == 0 && shl->size == 0)
		return;

	smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
	smh->txg = txg;
	smh->timestamp = timestamp;
	smh->mmp_delay = mmp_delay;
	if (vd) {
		smh->vdev_guid = vd->vdev_guid;
		if (vd->vdev_path)
			smh->vdev_path = kmem_strdup(vd->vdev_path);
	}
	smh->vdev_label = label;
	smh->mmp_node_id = mmp_node_id;

	if (error) {
		smh->io_error = error;
		smh->error_start = gethrtime();
		smh->vdev_guid = 1;
	}

	mutex_enter(&shl->procfs_list.pl_lock);
	procfs_list_add(&shl->procfs_list, smh);
	shl->size++;
	spa_mmp_history_truncate(shl, zfs_multihost_history);
	mutex_exit(&shl->procfs_list.pl_lock);
}

static void *
spa_state_addr(kstat_t *ksp, loff_t n)
{
	if (n == 0)
		return (ksp->ks_private);	/* return the spa_t */
	return (NULL);
}

static int
spa_state_data(char *buf, size_t size, void *data)
{
	spa_t *spa = (spa_t *)data;
	(void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
	return (0);
}
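
/*
 * Example (illustrative): spa_state_data() above renders a single line,
 * so a heartbeat check can poll the state kstat without blocking ("tank"
 * is a placeholder pool name; ONLINE is one possible value from
 * spa_state_to_name()):
 *
 *	$ cat /proc/spl/kstat/zfs/tank/state
 *	ONLINE
 */
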
/*
 * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
 *
 * This is a lock-less read of the pool's state (unlike using 'zpool', which
 * can potentially block for seconds). Because it doesn't block, it can be
 * useful as a pool heartbeat value.
 */
static void
spa_state_init(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.state;
	char *name;
	kstat_t *ksp;

	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);

	name = kmem_asprintf("zfs/%s", spa_name(spa));
	ksp = kstat_create(name, 0, "state", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);

	shk->kstat = ksp;
	if (ksp) {
		ksp->ks_lock = &shk->lock;
		ksp->ks_data = NULL;
		ksp->ks_private = spa;
		ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
		kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
		kstat_install(ksp);
	}

	kmem_strfree(name);
}

static void
spa_health_destroy(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.state;
	kstat_t *ksp = shk->kstat;
	if (ksp)
		kstat_delete(ksp);

	mutex_destroy(&shk->lock);
}

static const spa_iostats_t spa_iostats_template = {
	{ "trim_extents_written",		KSTAT_DATA_UINT64 },
	{ "trim_bytes_written",			KSTAT_DATA_UINT64 },
	{ "trim_extents_skipped",		KSTAT_DATA_UINT64 },
	{ "trim_bytes_skipped",			KSTAT_DATA_UINT64 },
	{ "trim_extents_failed",		KSTAT_DATA_UINT64 },
	{ "trim_bytes_failed",			KSTAT_DATA_UINT64 },
	{ "autotrim_extents_written",		KSTAT_DATA_UINT64 },
	{ "autotrim_bytes_written",		KSTAT_DATA_UINT64 },
	{ "autotrim_extents_skipped",		KSTAT_DATA_UINT64 },
	{ "autotrim_bytes_skipped",		KSTAT_DATA_UINT64 },
	{ "autotrim_extents_failed",		KSTAT_DATA_UINT64 },
	{ "autotrim_bytes_failed",		KSTAT_DATA_UINT64 },
	{ "simple_trim_extents_written",	KSTAT_DATA_UINT64 },
	{ "simple_trim_bytes_written",		KSTAT_DATA_UINT64 },
	{ "simple_trim_extents_skipped",	KSTAT_DATA_UINT64 },
	{ "simple_trim_bytes_skipped",		KSTAT_DATA_UINT64 },
	{ "simple_trim_extents_failed",		KSTAT_DATA_UINT64 },
	{ "simple_trim_bytes_failed",		KSTAT_DATA_UINT64 },
};

#define	SPA_IOSTATS_ADD(stat, val) \
    atomic_add_64(&iostats->stat.value.ui64, (val));

void
spa_iostats_trim_add(spa_t *spa, trim_type_t type,
    uint64_t extents_written, uint64_t bytes_written,
    uint64_t extents_skipped, uint64_t bytes_skipped,
    uint64_t extents_failed, uint64_t bytes_failed)
{
	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
	kstat_t *ksp = shk->kstat;
	spa_iostats_t *iostats;

	if (ksp == NULL)
		return;

	iostats = ksp->ks_data;
	if (type == TRIM_TYPE_MANUAL) {
		SPA_IOSTATS_ADD(trim_extents_written, extents_written);
		SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
		SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
		SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
		SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
		SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
	} else if (type == TRIM_TYPE_AUTO) {
		SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
		SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
		SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
		SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
		SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
		SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
	} else {
		SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written);
		SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written);
		SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped);
		SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped);
		SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed);
		SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed);
	}
}
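
/*
 * Example (illustrative): the TRIM counters above accumulate in
 * /proc/spl/kstat/zfs/<pool>/iostats; writing to that file reaches
 * spa_iostats_update() below with KSTAT_WRITE, which copies the template
 * back over ks_data and so zeroes every counter.
 */
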
static int
spa_iostats_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE) {
		memcpy(ksp->ks_data, &spa_iostats_template,
		    sizeof (spa_iostats_t));
	}

	return (0);
}

static void
spa_iostats_init(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.iostats;

	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);

	char *name = kmem_asprintf("zfs/%s", spa_name(spa));
	kstat_t *ksp = kstat_create(name, 0, "iostats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	shk->kstat = ksp;
	if (ksp) {
		int size = sizeof (spa_iostats_t);
		ksp->ks_lock = &shk->lock;
		ksp->ks_private = spa;
		ksp->ks_update = spa_iostats_update;
		ksp->ks_data = kmem_alloc(size, KM_SLEEP);
		memcpy(ksp->ks_data, &spa_iostats_template, size);
		kstat_install(ksp);
	}

	kmem_strfree(name);
}

static void
spa_iostats_destroy(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
	kstat_t *ksp = shk->kstat;
	if (ksp) {
		kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
		kstat_delete(ksp);
	}

	mutex_destroy(&shk->lock);
}

void
spa_stats_init(spa_t *spa)
{
	spa_read_history_init(spa);
	spa_txg_history_init(spa);
	spa_tx_assign_init(spa);
	spa_mmp_history_init(spa);
	spa_state_init(spa);
	spa_iostats_init(spa);
}

void
spa_stats_destroy(spa_t *spa)
{
	spa_iostats_destroy(spa);
	spa_health_destroy(spa);
	spa_tx_assign_destroy(spa);
	spa_txg_history_destroy(spa);
	spa_read_history_destroy(spa);
	spa_mmp_history_destroy(spa);
}

ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW,
	"Historical statistics for the last N reads");

ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW,
	"Include cache hits in read history");

ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW,
	"Historical statistics for the last N txgs");

ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW,
	"Historical statistics for last N multihost writes");