/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/spa.h>
#include <zfs_comutil.h>

/*
 * Keeps stats on last N reads per spa_t, disabled by default.
 *
 * The value is used as the maximum length of the per-pool read history
 * list; spa_read_history_add() trims the list down to this many entries
 * after every insertion.
 */
static uint_t zfs_read_history = B_FALSE;

/*
 * Include cache hits in history, disabled by default.
 *
 * While zero, spa_read_history_add() ignores reads whose ARC flags include
 * ARC_FLAG_CACHED.
 */
static int zfs_read_history_hits = B_FALSE;

/*
 * Keeps stats on the last 100 txgs by default.
 *
 * Setting this to zero makes the txg history setters return early without
 * touching the list.
 */
static uint_t zfs_txg_history = 100;

/*
 * Keeps stats on the last N MMP updates, disabled by default.
45 */ 46 static uint_t zfs_multihost_history = B_FALSE; 47 48 /* 49 * ========================================================================== 50 * SPA Read History Routines 51 * ========================================================================== 52 */ 53 54 /* 55 * Read statistics - Information exported regarding each arc_read call 56 */ 57 typedef struct spa_read_history { 58 hrtime_t start; /* time read completed */ 59 uint64_t objset; /* read from this objset */ 60 uint64_t object; /* read of this object number */ 61 uint64_t level; /* block's indirection level */ 62 uint64_t blkid; /* read of this block id */ 63 char origin[24]; /* read originated from here */ 64 uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ 65 pid_t pid; /* PID of task doing read */ 66 char comm[16]; /* process name of task doing read */ 67 procfs_list_node_t srh_node; 68 } spa_read_history_t; 69 70 static int 71 spa_read_history_show_header(struct seq_file *f) 72 { 73 seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " 74 "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", 75 "level", "blkid", "aflags", "origin", "pid", "process"); 76 77 return (0); 78 } 79 80 static int 81 spa_read_history_show(struct seq_file *f, void *data) 82 { 83 spa_read_history_t *srh = (spa_read_history_t *)data; 84 85 seq_printf(f, "%-8llu %-16llu 0x%-6llx " 86 "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", 87 (u_longlong_t)srh->srh_node.pln_id, srh->start, 88 (longlong_t)srh->objset, (longlong_t)srh->object, 89 (longlong_t)srh->level, (longlong_t)srh->blkid, 90 srh->aflags, srh->origin, srh->pid, srh->comm); 91 92 return (0); 93 } 94 95 /* Remove oldest elements from list until there are no more than 'size' left */ 96 static void 97 spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) 98 { 99 spa_read_history_t *srh; 100 while (shl->size > size) { 101 srh = list_remove_head(&shl->procfs_list.pl_list); 102 ASSERT3P(srh, !=, NULL); 103 kmem_free(srh, sizeof 
(spa_read_history_t)); 104 shl->size--; 105 } 106 107 if (size == 0) 108 ASSERT(list_is_empty(&shl->procfs_list.pl_list)); 109 } 110 111 static int 112 spa_read_history_clear(procfs_list_t *procfs_list) 113 { 114 spa_history_list_t *shl = procfs_list->pl_private; 115 mutex_enter(&procfs_list->pl_lock); 116 spa_read_history_truncate(shl, 0); 117 mutex_exit(&procfs_list->pl_lock); 118 return (0); 119 } 120 121 static void 122 spa_read_history_init(spa_t *spa) 123 { 124 spa_history_list_t *shl = &spa->spa_stats.read_history; 125 126 shl->size = 0; 127 shl->procfs_list.pl_private = shl; 128 procfs_list_install("zfs", 129 spa_name(spa), 130 "reads", 131 0600, 132 &shl->procfs_list, 133 spa_read_history_show, 134 spa_read_history_show_header, 135 spa_read_history_clear, 136 offsetof(spa_read_history_t, srh_node)); 137 } 138 139 static void 140 spa_read_history_destroy(spa_t *spa) 141 { 142 spa_history_list_t *shl = &spa->spa_stats.read_history; 143 procfs_list_uninstall(&shl->procfs_list); 144 spa_read_history_truncate(shl, 0); 145 procfs_list_destroy(&shl->procfs_list); 146 } 147 148 void 149 spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) 150 { 151 spa_history_list_t *shl = &spa->spa_stats.read_history; 152 spa_read_history_t *srh; 153 154 ASSERT3P(spa, !=, NULL); 155 ASSERT3P(zb, !=, NULL); 156 157 if (zfs_read_history == 0 && shl->size == 0) 158 return; 159 160 if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) 161 return; 162 163 srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); 164 strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); 165 srh->start = gethrtime(); 166 srh->objset = zb->zb_objset; 167 srh->object = zb->zb_object; 168 srh->level = zb->zb_level; 169 srh->blkid = zb->zb_blkid; 170 srh->aflags = aflags; 171 srh->pid = getpid(); 172 173 mutex_enter(&shl->procfs_list.pl_lock); 174 175 procfs_list_add(&shl->procfs_list, srh); 176 shl->size++; 177 178 spa_read_history_truncate(shl, zfs_read_history); 179 180 
mutex_exit(&shl->procfs_list.pl_lock); 181 } 182 183 /* 184 * ========================================================================== 185 * SPA TXG History Routines 186 * ========================================================================== 187 */ 188 189 /* 190 * Txg statistics - Information exported regarding each txg sync 191 */ 192 193 typedef struct spa_txg_history { 194 uint64_t txg; /* txg id */ 195 txg_state_t state; /* active txg state */ 196 uint64_t nread; /* number of bytes read */ 197 uint64_t nwritten; /* number of bytes written */ 198 uint64_t reads; /* number of read operations */ 199 uint64_t writes; /* number of write operations */ 200 uint64_t ndirty; /* number of dirty bytes */ 201 hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ 202 procfs_list_node_t sth_node; 203 } spa_txg_history_t; 204 205 static int 206 spa_txg_history_show_header(struct seq_file *f) 207 { 208 seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s " 209 "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", 210 "ndirty", "nread", "nwritten", "reads", "writes", 211 "otime", "qtime", "wtime", "stime"); 212 return (0); 213 } 214 215 static int 216 spa_txg_history_show(struct seq_file *f, void *data) 217 { 218 spa_txg_history_t *sth = (spa_txg_history_t *)data; 219 uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; 220 char state; 221 222 switch (sth->state) { 223 case TXG_STATE_BIRTH: state = 'B'; break; 224 case TXG_STATE_OPEN: state = 'O'; break; 225 case TXG_STATE_QUIESCED: state = 'Q'; break; 226 case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; 227 case TXG_STATE_SYNCED: state = 'S'; break; 228 case TXG_STATE_COMMITTED: state = 'C'; break; 229 default: state = '?'; break; 230 } 231 232 if (sth->times[TXG_STATE_OPEN]) 233 open = sth->times[TXG_STATE_OPEN] - 234 sth->times[TXG_STATE_BIRTH]; 235 236 if (sth->times[TXG_STATE_QUIESCED]) 237 quiesce = sth->times[TXG_STATE_QUIESCED] - 238 sth->times[TXG_STATE_OPEN]; 239 240 if 
(sth->times[TXG_STATE_WAIT_FOR_SYNC]) 241 wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - 242 sth->times[TXG_STATE_QUIESCED]; 243 244 if (sth->times[TXG_STATE_SYNCED]) 245 sync = sth->times[TXG_STATE_SYNCED] - 246 sth->times[TXG_STATE_WAIT_FOR_SYNC]; 247 248 seq_printf(f, "%-8llu %-16llu %-5c %-12llu " 249 "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", 250 (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, 251 (u_longlong_t)sth->ndirty, 252 (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, 253 (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, 254 (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, 255 (u_longlong_t)sync); 256 257 return (0); 258 } 259 260 /* Remove oldest elements from list until there are no more than 'size' left */ 261 static void 262 spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) 263 { 264 spa_txg_history_t *sth; 265 while (shl->size > size) { 266 sth = list_remove_head(&shl->procfs_list.pl_list); 267 ASSERT3P(sth, !=, NULL); 268 kmem_free(sth, sizeof (spa_txg_history_t)); 269 shl->size--; 270 } 271 272 if (size == 0) 273 ASSERT(list_is_empty(&shl->procfs_list.pl_list)); 274 275 } 276 277 static int 278 spa_txg_history_clear(procfs_list_t *procfs_list) 279 { 280 spa_history_list_t *shl = procfs_list->pl_private; 281 mutex_enter(&procfs_list->pl_lock); 282 spa_txg_history_truncate(shl, 0); 283 mutex_exit(&procfs_list->pl_lock); 284 return (0); 285 } 286 287 static void 288 spa_txg_history_init(spa_t *spa) 289 { 290 spa_history_list_t *shl = &spa->spa_stats.txg_history; 291 292 shl->size = 0; 293 shl->procfs_list.pl_private = shl; 294 procfs_list_install("zfs", 295 spa_name(spa), 296 "txgs", 297 0644, 298 &shl->procfs_list, 299 spa_txg_history_show, 300 spa_txg_history_show_header, 301 spa_txg_history_clear, 302 offsetof(spa_txg_history_t, sth_node)); 303 } 304 305 static void 306 spa_txg_history_destroy(spa_t *spa) 307 { 308 spa_history_list_t *shl = &spa->spa_stats.txg_history; 309 
procfs_list_uninstall(&shl->procfs_list); 310 spa_txg_history_truncate(shl, 0); 311 procfs_list_destroy(&shl->procfs_list); 312 } 313 314 /* 315 * Add a new txg to historical record. 316 */ 317 void 318 spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) 319 { 320 spa_history_list_t *shl = &spa->spa_stats.txg_history; 321 spa_txg_history_t *sth; 322 323 if (zfs_txg_history == 0 && shl->size == 0) 324 return; 325 326 sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); 327 sth->txg = txg; 328 sth->state = TXG_STATE_OPEN; 329 sth->times[TXG_STATE_BIRTH] = birth_time; 330 331 mutex_enter(&shl->procfs_list.pl_lock); 332 procfs_list_add(&shl->procfs_list, sth); 333 shl->size++; 334 spa_txg_history_truncate(shl, zfs_txg_history); 335 mutex_exit(&shl->procfs_list.pl_lock); 336 } 337 338 /* 339 * Set txg state completion time and increment current state. 340 */ 341 int 342 spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, 343 hrtime_t completed_time) 344 { 345 spa_history_list_t *shl = &spa->spa_stats.txg_history; 346 spa_txg_history_t *sth; 347 int error = ENOENT; 348 349 if (zfs_txg_history == 0) 350 return (0); 351 352 mutex_enter(&shl->procfs_list.pl_lock); 353 for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; 354 sth = list_prev(&shl->procfs_list.pl_list, sth)) { 355 if (sth->txg == txg) { 356 sth->times[completed_state] = completed_time; 357 sth->state++; 358 error = 0; 359 break; 360 } 361 } 362 mutex_exit(&shl->procfs_list.pl_lock); 363 364 return (error); 365 } 366 367 /* 368 * Set txg IO stats. 
369 */ 370 static int 371 spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, 372 uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) 373 { 374 spa_history_list_t *shl = &spa->spa_stats.txg_history; 375 spa_txg_history_t *sth; 376 int error = ENOENT; 377 378 if (zfs_txg_history == 0) 379 return (0); 380 381 mutex_enter(&shl->procfs_list.pl_lock); 382 for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; 383 sth = list_prev(&shl->procfs_list.pl_list, sth)) { 384 if (sth->txg == txg) { 385 sth->nread = nread; 386 sth->nwritten = nwritten; 387 sth->reads = reads; 388 sth->writes = writes; 389 sth->ndirty = ndirty; 390 error = 0; 391 break; 392 } 393 } 394 mutex_exit(&shl->procfs_list.pl_lock); 395 396 return (error); 397 } 398 399 txg_stat_t * 400 spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) 401 { 402 txg_stat_t *ts; 403 404 if (zfs_txg_history == 0) 405 return (NULL); 406 407 ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); 408 409 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 410 vdev_get_stats(spa->spa_root_vdev, &ts->vs1); 411 spa_config_exit(spa, SCL_CONFIG, FTAG); 412 413 ts->txg = txg; 414 ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; 415 416 spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); 417 418 return (ts); 419 } 420 421 void 422 spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) 423 { 424 if (ts == NULL) 425 return; 426 427 if (zfs_txg_history == 0) { 428 kmem_free(ts, sizeof (txg_stat_t)); 429 return; 430 } 431 432 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 433 vdev_get_stats(spa->spa_root_vdev, &ts->vs2); 434 spa_config_exit(spa, SCL_CONFIG, FTAG); 435 436 spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); 437 spa_txg_history_set_io(spa, ts->txg, 438 ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], 439 ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], 440 ts->vs2.vs_ops[ZIO_TYPE_READ] - 
ts->vs1.vs_ops[ZIO_TYPE_READ], 441 ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], 442 ts->ndirty); 443 444 kmem_free(ts, sizeof (txg_stat_t)); 445 } 446 447 /* 448 * ========================================================================== 449 * SPA TX Assign Histogram Routines 450 * ========================================================================== 451 */ 452 453 /* 454 * Tx statistics - Information exported regarding dmu_tx_assign time. 455 */ 456 457 /* 458 * When the kstat is written zero all buckets. When the kstat is read 459 * count the number of trailing buckets set to zero and update ks_ndata 460 * such that they are not output. 461 */ 462 static int 463 spa_tx_assign_update(kstat_t *ksp, int rw) 464 { 465 spa_t *spa = ksp->ks_private; 466 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 467 int i; 468 469 if (rw == KSTAT_WRITE) { 470 for (i = 0; i < shk->count; i++) 471 ((kstat_named_t *)shk->priv)[i].value.ui64 = 0; 472 } 473 474 for (i = shk->count; i > 0; i--) 475 if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0) 476 break; 477 478 ksp->ks_ndata = i; 479 ksp->ks_data_size = i * sizeof (kstat_named_t); 480 481 return (0); 482 } 483 484 static void 485 spa_tx_assign_init(spa_t *spa) 486 { 487 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 488 char *name; 489 kstat_named_t *ks; 490 kstat_t *ksp; 491 int i; 492 493 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 494 495 shk->count = 42; /* power of two buckets for 1ns to 2,199s */ 496 shk->size = shk->count * sizeof (kstat_named_t); 497 shk->priv = kmem_alloc(shk->size, KM_SLEEP); 498 499 name = kmem_asprintf("zfs/%s", spa_name(spa)); 500 501 for (i = 0; i < shk->count; i++) { 502 ks = &((kstat_named_t *)shk->priv)[i]; 503 ks->data_type = KSTAT_DATA_UINT64; 504 ks->value.ui64 = 0; 505 (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", 506 (u_longlong_t)1 << i); 507 } 508 509 ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", 510 
KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); 511 shk->kstat = ksp; 512 513 if (ksp) { 514 ksp->ks_lock = &shk->lock; 515 ksp->ks_data = shk->priv; 516 ksp->ks_ndata = shk->count; 517 ksp->ks_data_size = shk->size; 518 ksp->ks_private = spa; 519 ksp->ks_update = spa_tx_assign_update; 520 kstat_install(ksp); 521 } 522 kmem_strfree(name); 523 } 524 525 static void 526 spa_tx_assign_destroy(spa_t *spa) 527 { 528 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 529 kstat_t *ksp; 530 531 ksp = shk->kstat; 532 if (ksp) 533 kstat_delete(ksp); 534 535 kmem_free(shk->priv, shk->size); 536 mutex_destroy(&shk->lock); 537 } 538 539 void 540 spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) 541 { 542 spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; 543 uint64_t idx = 0; 544 545 while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) 546 idx++; 547 548 atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64); 549 } 550 551 /* 552 * ========================================================================== 553 * SPA MMP History Routines 554 * ========================================================================== 555 */ 556 557 /* 558 * MMP statistics - Information exported regarding attempted MMP writes 559 * For MMP writes issued, fields used as per comments below. 560 * For MMP writes skipped, an entry represents a span of time when 561 * writes were skipped for same reason (error from mmp_random_leaf). 
562 * Differences are: 563 * timestamp time first write skipped, if >1 skipped in a row 564 * mmp_delay delay value at timestamp 565 * vdev_guid number of writes skipped 566 * io_error one of enum mmp_error 567 * duration time span (ns) of skipped writes 568 */ 569 570 typedef struct spa_mmp_history { 571 uint64_t mmp_node_id; /* unique # for updates */ 572 uint64_t txg; /* txg of last sync */ 573 uint64_t timestamp; /* UTC time MMP write issued */ 574 uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ 575 uint64_t vdev_guid; /* unique ID of leaf vdev */ 576 char *vdev_path; 577 int vdev_label; /* vdev label */ 578 int io_error; /* error status of MMP write */ 579 hrtime_t error_start; /* hrtime of start of error period */ 580 hrtime_t duration; /* time from submission to completion */ 581 procfs_list_node_t smh_node; 582 } spa_mmp_history_t; 583 584 static int 585 spa_mmp_history_show_header(struct seq_file *f) 586 { 587 seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " 588 "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", 589 "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); 590 return (0); 591 } 592 593 static int 594 spa_mmp_history_show(struct seq_file *f, void *data) 595 { 596 spa_mmp_history_t *smh = (spa_mmp_history_t *)data; 597 char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " 598 "%-10lld %s\n"; 599 char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " 600 "%-10lld %s\n"; 601 602 seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), 603 (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, 604 (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, 605 (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, 606 (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, 607 (smh->vdev_path ? 
smh->vdev_path : "-")); 608 609 return (0); 610 } 611 612 /* Remove oldest elements from list until there are no more than 'size' left */ 613 static void 614 spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) 615 { 616 spa_mmp_history_t *smh; 617 while (shl->size > size) { 618 smh = list_remove_head(&shl->procfs_list.pl_list); 619 if (smh->vdev_path) 620 kmem_strfree(smh->vdev_path); 621 kmem_free(smh, sizeof (spa_mmp_history_t)); 622 shl->size--; 623 } 624 625 if (size == 0) 626 ASSERT(list_is_empty(&shl->procfs_list.pl_list)); 627 628 } 629 630 static int 631 spa_mmp_history_clear(procfs_list_t *procfs_list) 632 { 633 spa_history_list_t *shl = procfs_list->pl_private; 634 mutex_enter(&procfs_list->pl_lock); 635 spa_mmp_history_truncate(shl, 0); 636 mutex_exit(&procfs_list->pl_lock); 637 return (0); 638 } 639 640 static void 641 spa_mmp_history_init(spa_t *spa) 642 { 643 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 644 645 shl->size = 0; 646 647 shl->procfs_list.pl_private = shl; 648 procfs_list_install("zfs", 649 spa_name(spa), 650 "multihost", 651 0644, 652 &shl->procfs_list, 653 spa_mmp_history_show, 654 spa_mmp_history_show_header, 655 spa_mmp_history_clear, 656 offsetof(spa_mmp_history_t, smh_node)); 657 } 658 659 static void 660 spa_mmp_history_destroy(spa_t *spa) 661 { 662 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 663 procfs_list_uninstall(&shl->procfs_list); 664 spa_mmp_history_truncate(shl, 0); 665 procfs_list_destroy(&shl->procfs_list); 666 } 667 668 /* 669 * Set duration in existing "skip" record to how long we have waited for a leaf 670 * vdev to become available. 671 * 672 * Important that we start search at the tail of the list where new 673 * records are inserted, so this is normally an O(1) operation. 
674 */ 675 int 676 spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) 677 { 678 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 679 spa_mmp_history_t *smh; 680 int error = ENOENT; 681 682 if (zfs_multihost_history == 0 && shl->size == 0) 683 return (0); 684 685 mutex_enter(&shl->procfs_list.pl_lock); 686 for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; 687 smh = list_prev(&shl->procfs_list.pl_list, smh)) { 688 if (smh->mmp_node_id == mmp_node_id) { 689 ASSERT3U(smh->io_error, !=, 0); 690 smh->duration = gethrtime() - smh->error_start; 691 smh->vdev_guid++; 692 error = 0; 693 break; 694 } 695 } 696 mutex_exit(&shl->procfs_list.pl_lock); 697 698 return (error); 699 } 700 701 /* 702 * Set MMP write duration and error status in existing record. 703 * See comment re: search order above spa_mmp_history_set_skip(). 704 */ 705 int 706 spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, 707 hrtime_t duration) 708 { 709 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 710 spa_mmp_history_t *smh; 711 int error = ENOENT; 712 713 if (zfs_multihost_history == 0 && shl->size == 0) 714 return (0); 715 716 mutex_enter(&shl->procfs_list.pl_lock); 717 for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; 718 smh = list_prev(&shl->procfs_list.pl_list, smh)) { 719 if (smh->mmp_node_id == mmp_node_id) { 720 ASSERT(smh->io_error == 0); 721 smh->io_error = io_error; 722 smh->duration = duration; 723 error = 0; 724 break; 725 } 726 } 727 mutex_exit(&shl->procfs_list.pl_lock); 728 729 return (error); 730 } 731 732 /* 733 * Add a new MMP historical record. 734 * error == 0 : a write was issued. 735 * error != 0 : a write was not issued because no leaves were found. 
736 */ 737 void 738 spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, 739 uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, 740 int error) 741 { 742 spa_history_list_t *shl = &spa->spa_stats.mmp_history; 743 spa_mmp_history_t *smh; 744 745 if (zfs_multihost_history == 0 && shl->size == 0) 746 return; 747 748 smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP); 749 smh->txg = txg; 750 smh->timestamp = timestamp; 751 smh->mmp_delay = mmp_delay; 752 if (vd) { 753 smh->vdev_guid = vd->vdev_guid; 754 if (vd->vdev_path) 755 smh->vdev_path = kmem_strdup(vd->vdev_path); 756 } 757 smh->vdev_label = label; 758 smh->mmp_node_id = mmp_node_id; 759 760 if (error) { 761 smh->io_error = error; 762 smh->error_start = gethrtime(); 763 smh->vdev_guid = 1; 764 } 765 766 mutex_enter(&shl->procfs_list.pl_lock); 767 procfs_list_add(&shl->procfs_list, smh); 768 shl->size++; 769 spa_mmp_history_truncate(shl, zfs_multihost_history); 770 mutex_exit(&shl->procfs_list.pl_lock); 771 } 772 773 static void * 774 spa_state_addr(kstat_t *ksp, loff_t n) 775 { 776 if (n == 0) 777 return (ksp->ks_private); /* return the spa_t */ 778 return (NULL); 779 } 780 781 static int 782 spa_state_data(char *buf, size_t size, void *data) 783 { 784 spa_t *spa = (spa_t *)data; 785 (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa)); 786 return (0); 787 } 788 789 /* 790 * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state. 791 * 792 * This is a lock-less read of the pool's state (unlike using 'zpool', which 793 * can potentially block for seconds). Because it doesn't block, it can useful 794 * as a pool heartbeat value. 
795 */ 796 static void 797 spa_state_init(spa_t *spa) 798 { 799 spa_history_kstat_t *shk = &spa->spa_stats.state; 800 char *name; 801 kstat_t *ksp; 802 803 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 804 805 name = kmem_asprintf("zfs/%s", spa_name(spa)); 806 ksp = kstat_create(name, 0, "state", "misc", 807 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 808 809 shk->kstat = ksp; 810 if (ksp) { 811 ksp->ks_lock = &shk->lock; 812 ksp->ks_data = NULL; 813 ksp->ks_private = spa; 814 ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; 815 kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr); 816 kstat_install(ksp); 817 } 818 819 kmem_strfree(name); 820 } 821 822 static int 823 spa_guid_data(char *buf, size_t size, void *data) 824 { 825 spa_t *spa = (spa_t *)data; 826 (void) snprintf(buf, size, "%llu\n", (u_longlong_t)spa_guid(spa)); 827 return (0); 828 } 829 830 static void 831 spa_guid_init(spa_t *spa) 832 { 833 spa_history_kstat_t *shk = &spa->spa_stats.guid; 834 char *name; 835 kstat_t *ksp; 836 837 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 838 839 name = kmem_asprintf("zfs/%s", spa_name(spa)); 840 841 ksp = kstat_create(name, 0, "guid", "misc", 842 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 843 844 shk->kstat = ksp; 845 if (ksp) { 846 ksp->ks_lock = &shk->lock; 847 ksp->ks_data = NULL; 848 ksp->ks_private = spa; 849 ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; 850 kstat_set_raw_ops(ksp, NULL, spa_guid_data, spa_state_addr); 851 kstat_install(ksp); 852 } 853 854 kmem_strfree(name); 855 } 856 857 static void 858 spa_health_destroy(spa_t *spa) 859 { 860 spa_history_kstat_t *shk = &spa->spa_stats.state; 861 kstat_t *ksp = shk->kstat; 862 if (ksp) 863 kstat_delete(ksp); 864 865 mutex_destroy(&shk->lock); 866 } 867 868 static void 869 spa_guid_destroy(spa_t *spa) 870 { 871 spa_history_kstat_t *shk = &spa->spa_stats.guid; 872 kstat_t *ksp = shk->kstat; 873 if (ksp) 874 kstat_delete(ksp); 875 876 mutex_destroy(&shk->lock); 877 } 878 879 static const spa_iostats_t 
spa_iostats_template = { 880 { "trim_extents_written", KSTAT_DATA_UINT64 }, 881 { "trim_bytes_written", KSTAT_DATA_UINT64 }, 882 { "trim_extents_skipped", KSTAT_DATA_UINT64 }, 883 { "trim_bytes_skipped", KSTAT_DATA_UINT64 }, 884 { "trim_extents_failed", KSTAT_DATA_UINT64 }, 885 { "trim_bytes_failed", KSTAT_DATA_UINT64 }, 886 { "autotrim_extents_written", KSTAT_DATA_UINT64 }, 887 { "autotrim_bytes_written", KSTAT_DATA_UINT64 }, 888 { "autotrim_extents_skipped", KSTAT_DATA_UINT64 }, 889 { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, 890 { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, 891 { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, 892 { "simple_trim_extents_written", KSTAT_DATA_UINT64 }, 893 { "simple_trim_bytes_written", KSTAT_DATA_UINT64 }, 894 { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 }, 895 { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, 896 { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, 897 { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, 898 { "arc_read_count", KSTAT_DATA_UINT64 }, 899 { "arc_read_bytes", KSTAT_DATA_UINT64 }, 900 { "arc_write_count", KSTAT_DATA_UINT64 }, 901 { "arc_write_bytes", KSTAT_DATA_UINT64 }, 902 { "direct_read_count", KSTAT_DATA_UINT64 }, 903 { "direct_read_bytes", KSTAT_DATA_UINT64 }, 904 { "direct_write_count", KSTAT_DATA_UINT64 }, 905 { "direct_write_bytes", KSTAT_DATA_UINT64 }, 906 }; 907 908 #define SPA_IOSTATS_ADD(stat, val) \ 909 atomic_add_64(&iostats->stat.value.ui64, (val)); 910 911 void 912 spa_iostats_trim_add(spa_t *spa, trim_type_t type, 913 uint64_t extents_written, uint64_t bytes_written, 914 uint64_t extents_skipped, uint64_t bytes_skipped, 915 uint64_t extents_failed, uint64_t bytes_failed) 916 { 917 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 918 kstat_t *ksp = shk->kstat; 919 spa_iostats_t *iostats; 920 921 if (ksp == NULL) 922 return; 923 924 iostats = ksp->ks_data; 925 if (type == TRIM_TYPE_MANUAL) { 926 SPA_IOSTATS_ADD(trim_extents_written, extents_written); 927 
SPA_IOSTATS_ADD(trim_bytes_written, bytes_written); 928 SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped); 929 SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); 930 SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); 931 SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); 932 } else if (type == TRIM_TYPE_AUTO) { 933 SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); 934 SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); 935 SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); 936 SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); 937 SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); 938 SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); 939 } else { 940 SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written); 941 SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written); 942 SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped); 943 SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped); 944 SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed); 945 SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed); 946 } 947 } 948 949 void 950 spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) 951 { 952 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 953 kstat_t *ksp = shk->kstat; 954 955 if (ksp == NULL) 956 return; 957 958 spa_iostats_t *iostats = ksp->ks_data; 959 if (flags & DMU_DIRECTIO) { 960 SPA_IOSTATS_ADD(direct_read_count, iops); 961 SPA_IOSTATS_ADD(direct_read_bytes, size); 962 } else { 963 SPA_IOSTATS_ADD(arc_read_count, iops); 964 SPA_IOSTATS_ADD(arc_read_bytes, size); 965 } 966 } 967 968 void 969 spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) 970 { 971 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 972 kstat_t *ksp = shk->kstat; 973 974 if (ksp == NULL) 975 return; 976 977 spa_iostats_t *iostats = ksp->ks_data; 978 if (flags & DMU_DIRECTIO) { 979 SPA_IOSTATS_ADD(direct_write_count, iops); 980 
SPA_IOSTATS_ADD(direct_write_bytes, size); 981 } else { 982 SPA_IOSTATS_ADD(arc_write_count, iops); 983 SPA_IOSTATS_ADD(arc_write_bytes, size); 984 } 985 } 986 987 static int 988 spa_iostats_update(kstat_t *ksp, int rw) 989 { 990 if (rw == KSTAT_WRITE) { 991 memcpy(ksp->ks_data, &spa_iostats_template, 992 sizeof (spa_iostats_t)); 993 } 994 995 return (0); 996 } 997 998 static void 999 spa_iostats_init(spa_t *spa) 1000 { 1001 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 1002 1003 mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); 1004 1005 char *name = kmem_asprintf("zfs/%s", spa_name(spa)); 1006 kstat_t *ksp = kstat_create(name, 0, "iostats", "misc", 1007 KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t), 1008 KSTAT_FLAG_VIRTUAL); 1009 1010 shk->kstat = ksp; 1011 if (ksp) { 1012 int size = sizeof (spa_iostats_t); 1013 ksp->ks_lock = &shk->lock; 1014 ksp->ks_private = spa; 1015 ksp->ks_update = spa_iostats_update; 1016 ksp->ks_data = kmem_alloc(size, KM_SLEEP); 1017 memcpy(ksp->ks_data, &spa_iostats_template, size); 1018 kstat_install(ksp); 1019 } 1020 1021 kmem_strfree(name); 1022 } 1023 1024 static void 1025 spa_iostats_destroy(spa_t *spa) 1026 { 1027 spa_history_kstat_t *shk = &spa->spa_stats.iostats; 1028 kstat_t *ksp = shk->kstat; 1029 if (ksp) { 1030 kmem_free(ksp->ks_data, sizeof (spa_iostats_t)); 1031 kstat_delete(ksp); 1032 } 1033 1034 mutex_destroy(&shk->lock); 1035 } 1036 1037 void 1038 spa_stats_init(spa_t *spa) 1039 { 1040 spa_read_history_init(spa); 1041 spa_txg_history_init(spa); 1042 spa_tx_assign_init(spa); 1043 spa_mmp_history_init(spa); 1044 spa_state_init(spa); 1045 spa_guid_init(spa); 1046 spa_iostats_init(spa); 1047 } 1048 1049 void 1050 spa_stats_destroy(spa_t *spa) 1051 { 1052 spa_iostats_destroy(spa); 1053 spa_health_destroy(spa); 1054 spa_tx_assign_destroy(spa); 1055 spa_txg_history_destroy(spa); 1056 spa_read_history_destroy(spa); 1057 spa_mmp_history_destroy(spa); 1058 spa_guid_destroy(spa); 1059 } 1060 1061 
/*
 * Module parameter registrations for the history tunables defined at the
 * top of this file.  Each entry gives a scope, a name prefix, a name, a
 * type, a permission flag and a description; prefix plus name matches one
 * of the static tunables (e.g. zfs_ + read_history).
 * NOTE(review): ZMOD_RW presumably makes the parameter writable at
 * runtime -- confirm against the ZFS_MODULE_PARAM definition.
 */

/* Maximum number of entries kept in each pool's read history list. */
ZFS_MODULE_PARAM(zfs, zfs_, read_history, UINT, ZMOD_RW,
	"Historical statistics for the last N reads");

/* Non-zero also records reads flagged ARC_FLAG_CACHED. */
ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW,
	"Include cache hits in read history");

/* Maximum number of entries kept in each pool's txg history list. */
ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, UINT, ZMOD_RW,
	"Historical statistics for the last N txgs");

/* Maximum number of entries kept in each pool's MMP history list. */
ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, UINT, ZMOD_RW,
	"Historical statistics for last N multihost writes");