/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/disk.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/devicestat.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <vm/vm.h>
#include <vm/pmap.h>

#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#endif

#include <machine/atomic.h>

SDT_PROVIDER_DEFINE(io);

SDT_PROBE_DEFINE2(io, , , start, "struct bio *", "struct devstat *");
SDT_PROBE_DEFINE2(io, , , done, "struct bio *", "struct devstat *");

#define	DTRACE_DEVSTAT_BIO_START()	SDT_PROBE2(io, , , start, bp, ds)
#define	DTRACE_DEVSTAT_BIO_DONE()	SDT_PROBE2(io, , , done, bp, ds)

static int devstat_num_devs;
static long devstat_generation = 1;
static int devstat_version = DEVSTAT_VERSION;
static int devstat_current_devnumber;
static struct mtx devstat_mutex;
MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF);

static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq);
static struct devstat *devstat_alloc(void);
static void devstat_free(struct devstat *);
static void devstat_add_entry(struct devstat *ds, const void *dev_name,
			      int unit_number, uint32_t block_size,
			      devstat_support_flags flags,
			      devstat_type_flags device_type,
			      devstat_priority priority);

/*
 * Allocate a devstat and initialize it
 */
struct devstat *
devstat_new_entry(const void *dev_name,
		  int unit_number, uint32_t block_size,
		  devstat_support_flags flags,
		  devstat_type_flags device_type,
		  devstat_priority priority)
{
	struct devstat *ds;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	ds = devstat_alloc();
	mtx_lock(&devstat_mutex);
	if (unit_number == -1) {
		ds->unit_number = unit_number;
		ds->id = dev_name;
		binuptime(&ds->creation_time);
		devstat_generation++;
	} else {
		devstat_add_entry(ds, dev_name, unit_number, block_size,
				  flags, device_type, priority);
	}
	mtx_unlock(&devstat_mutex);
	return (ds);
}
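
/*
 * A minimal usage sketch (hypothetical "foo" driver, not part of this
 * file) of the interface above: allocate a devstat entry at attach time,
 * wrap each bio in start/end transaction calls, and release the entry
 * again at detach:
 *
 *	sc->foo_devstat = devstat_new_entry("foo", device_get_unit(dev),
 *	    DEV_BSIZE, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT,
 *	    DEVSTAT_PRIORITY_DISK);
 *	...
 *	devstat_start_transaction_bio(sc->foo_devstat, bp);
 *	... start the hardware; later, in the completion path ...
 *	devstat_end_transaction_bio(sc->foo_devstat, bp);
 *	...
 *	devstat_remove_entry(sc->foo_devstat);
 */
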
/*
 * Take a malloced and zeroed devstat structure given to us, fill it in
 * and add it to the queue of devices.
 */
static void
devstat_add_entry(struct devstat *ds, const void *dev_name,
		  int unit_number, uint32_t block_size,
		  devstat_support_flags flags,
		  devstat_type_flags device_type,
		  devstat_priority priority)
{
	struct devstatlist *devstat_head;
	struct devstat *ds_tmp;

	mtx_assert(&devstat_mutex, MA_OWNED);
	devstat_num_devs++;

	devstat_head = &device_statq;

	/*
	 * Priority sort.  Each driver passes in its priority when it adds
	 * its devstat entry.  Drivers are sorted first by priority, and
	 * then by probe order.
	 *
	 * For the first device, we just insert it, since the priority
	 * doesn't really matter yet.  Subsequent devices are inserted into
	 * the list using the order outlined above.
	 */
	if (devstat_num_devs == 1)
		STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
	else {
		STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) {
			struct devstat *ds_next;

			ds_next = STAILQ_NEXT(ds_tmp, dev_links);

			/*
			 * If we find a break between higher and lower
			 * priority items, and if this item fits in the
			 * break, insert it.  This also applies if the
			 * "lower priority item" is the end of the list.
			 */
			if ((priority <= ds_tmp->priority)
			 && ((ds_next == NULL)
			   || (priority > ds_next->priority))) {
				STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds,
						    dev_links);
				break;
			} else if (priority > ds_tmp->priority) {
				/*
				 * If this is the case, we should be able
				 * to insert ourselves at the head of the
				 * list.  If we can't, something is wrong.
				 */
				if (ds_tmp == STAILQ_FIRST(devstat_head)) {
					STAILQ_INSERT_HEAD(devstat_head,
							   ds, dev_links);
					break;
				} else {
					STAILQ_INSERT_TAIL(devstat_head,
							   ds, dev_links);
					printf("devstat_add_entry: HELP! "
					       "sorting problem detected "
					       "for name %p unit %d\n",
					       dev_name, unit_number);
					break;
				}
			}
		}
	}

	ds->device_number = devstat_current_devnumber++;
	ds->unit_number = unit_number;
	strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
	ds->block_size = block_size;
	ds->flags = flags;
	ds->device_type = device_type;
	ds->priority = priority;
	binuptime(&ds->creation_time);
	devstat_generation++;
}

/*
 * Remove a devstat structure from the list of devices.
 */
void
devstat_remove_entry(struct devstat *ds)
{
	struct devstatlist *devstat_head;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);
	if (ds == NULL)
		return;

	mtx_lock(&devstat_mutex);

	devstat_head = &device_statq;

	/* Remove this entry from the devstat queue */
	atomic_add_acq_int(&ds->sequence1, 1);
	if (ds->unit_number != -1) {
		devstat_num_devs--;
		STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
	}
	devstat_free(ds);
	devstat_generation++;
	mtx_unlock(&devstat_mutex);
}

/*
 * Record a transaction start.
 *
 * See comments for devstat_end_transaction().  Ordering is very important
 * here.
 */
void
devstat_start_transaction(struct devstat *ds, const struct bintime *now)
{

	/* sanity check */
	if (ds == NULL)
		return;

	atomic_add_acq_int(&ds->sequence1, 1);
	/*
	 * We only want to set the start time when we are going from idle
	 * to busy.  The start time is really the start of the latest busy
	 * period.
	 */
	if (atomic_fetchadd_int(&ds->start_count, 1) == ds->end_count) {
		if (now != NULL)
			ds->busy_from = *now;
		else
			binuptime(&ds->busy_from);
	}
	atomic_add_rel_int(&ds->sequence0, 1);
}
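
/*
 * A sketch (illustrative; "bytes" and "t0" are hypothetical locals) of
 * manual accounting for a consumer that does not use struct bio: record
 * the start time yourself and hand it back at completion, so the
 * per-operation duration is attributed correctly:
 *
 *	struct bintime t0;
 *
 *	binuptime(&t0);
 *	devstat_start_transaction(ds, &t0);
 *	... perform a read of "bytes" bytes ...
 *	devstat_end_transaction(ds, bytes, DEVSTAT_TAG_SIMPLE,
 *	    DEVSTAT_READ, NULL, &t0);
 */
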
void
devstat_start_transaction_bio(struct devstat *ds, struct bio *bp)
{

	/* sanity check */
	if (ds == NULL)
		return;

	binuptime(&bp->bio_t0);
	devstat_start_transaction_bio_t0(ds, bp);
}

void
devstat_start_transaction_bio_t0(struct devstat *ds, struct bio *bp)
{

	/* sanity check */
	if (ds == NULL)
		return;

	devstat_start_transaction(ds, &bp->bio_t0);
	DTRACE_DEVSTAT_BIO_START();
}

/*
 * Record the ending of a transaction, and increment the various counters.
 *
 * Ordering in this function, and in devstat_start_transaction() is VERY
 * important.  The idea here is to run without locks, so we are very
 * careful to only modify some fields on the way "down" (i.e. at
 * transaction start) and some fields on the way "up" (i.e. at transaction
 * completion).  One exception is busy_from, which we only modify in
 * devstat_start_transaction() when there are no outstanding transactions,
 * and thus it can't be modified in devstat_end_transaction()
 * simultaneously.
 *
 * The sequence0 and sequence1 fields are provided to enable an application
 * spying on the structures with mmap(2) to tell when a structure is in a
 * consistent state or not.
 *
 * For this to work 100% reliably, it is important that the two fields
 * are at opposite ends of the structure and that they are incremented
 * in the opposite order of how a memcpy(3) in userland would copy them.
 * We assume that the copying happens front to back, but there is actually
 * no way short of writing your own memcpy(3) replacement to guarantee
 * this will be the case.
 *
 * In addition to this, being a kind of lock, they must be updated with
 * atomic instructions using appropriate memory barriers.
 */
void
devstat_end_transaction(struct devstat *ds, uint32_t bytes,
			devstat_tag_type tag_type, devstat_trans_flags flags,
			const struct bintime *now, const struct bintime *then)
{
	struct bintime dt, lnow;

	/* sanity check */
	if (ds == NULL)
		return;

	if (now == NULL) {
		binuptime(&lnow);
		now = &lnow;
	}

	atomic_add_acq_int(&ds->sequence1, 1);
	/* Update byte and operations counts */
	ds->bytes[flags] += bytes;
	ds->operations[flags]++;

	/*
	 * Keep a count of the various tag types sent.
	 */
	if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
	    tag_type != DEVSTAT_TAG_NONE)
		ds->tag_types[tag_type]++;

	if (then != NULL) {
		/* Update duration of operations */
		dt = *now;
		bintime_sub(&dt, then);
		bintime_add(&ds->duration[flags], &dt);
	}

	/* Accumulate busy time */
	dt = *now;
	bintime_sub(&dt, &ds->busy_from);
	bintime_add(&ds->busy_time, &dt);
	ds->busy_from = *now;

	ds->end_count++;
	atomic_add_rel_int(&ds->sequence0, 1);
}
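
/*
 * A sketch (illustrative only; "mapped_ds" stands for a pointer into an
 * mmap(2)'ed statistics page) of how a userland reader can use the
 * sequence0/sequence1 fields described above to obtain a consistent
 * snapshot, seqlock-style: retry the copy until both ends match.
 *
 *	struct devstat snap;
 *
 *	do {
 *		memcpy(&snap, mapped_ds, sizeof(snap));
 *	} while (snap.sequence0 != snap.sequence1);
 */
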
void
devstat_end_transaction_bio(struct devstat *ds, const struct bio *bp)
{

	devstat_end_transaction_bio_bt(ds, bp, NULL);
}

void
devstat_end_transaction_bio_bt(struct devstat *ds, const struct bio *bp,
    const struct bintime *now)
{
	devstat_trans_flags flg;
	devstat_tag_type tag;

	/* sanity check */
	if (ds == NULL)
		return;

	if (bp->bio_flags & BIO_ORDERED)
		tag = DEVSTAT_TAG_ORDERED;
	else
		tag = DEVSTAT_TAG_SIMPLE;
	if (bp->bio_cmd == BIO_DELETE)
		flg = DEVSTAT_FREE;
	else if ((bp->bio_cmd == BIO_READ)
	      || ((bp->bio_cmd == BIO_ZONE)
	       && (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)))
		flg = DEVSTAT_READ;
	else if (bp->bio_cmd == BIO_WRITE)
		flg = DEVSTAT_WRITE;
	else
		flg = DEVSTAT_NO_DATA;

	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
				tag, flg, now, &bp->bio_t0);
	DTRACE_DEVSTAT_BIO_DONE();
}

/*
 * This is the sysctl handler for the devstat package.  The data pushed out
 * on the kern.devstat.all sysctl variable consists of the current devstat
 * generation number, and then an array of devstat structures, one for each
 * device in the system.
 *
 * This is more cryptic than obvious, but basically we neither can nor
 * want to hold the devstat_mutex for any amount of time, so we grab it
 * only when we need to and keep an eye on devstat_generation all the time.
 */
static int
sysctl_devstat(SYSCTL_HANDLER_ARGS)
{
	int error;
	long mygen;
	struct devstat *nds;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/*
	 * XXX devstat_generation should really be "volatile" but that
	 * XXX freaks out the sysctl macro below.  The places where we
	 * XXX change it and inspect it are bracketed in the mutex which
	 * XXX guarantees us proper write barriers.  I don't believe the
	 * XXX compiler is allowed to optimize mygen away across calls
	 * XXX to other functions, so the following is believed to be safe.
	 */
	mygen = devstat_generation;

#ifdef COMPAT_FREEBSD32
	if ((req->flags & SCTL_MASK32) != 0) {
		int32_t mygen32 = (int32_t)mygen;

		error = SYSCTL_OUT(req, &mygen32, sizeof(mygen32));
	} else
#endif /* COMPAT_FREEBSD32 */
		error = SYSCTL_OUT(req, &mygen, sizeof(mygen));
	if (error != 0)
		return (error);

	if (devstat_num_devs == 0)
		return (0);

	mtx_lock(&devstat_mutex);
	nds = STAILQ_FIRST(&device_statq);
	if (mygen != devstat_generation)
		error = EBUSY;
	mtx_unlock(&devstat_mutex);
	if (error != 0)
		return (error);

	while (nds != NULL) {
#ifdef COMPAT_FREEBSD32
		if ((req->flags & SCTL_MASK32) != 0) {
			struct devstat32 ds32;
			unsigned int i;

			CP(*nds, ds32, sequence0);
			CP(*nds, ds32, allocated);
			CP(*nds, ds32, start_count);
			CP(*nds, ds32, end_count);
			BT_CP(*nds, ds32, busy_from);
			PTROUT_CP(*nds, ds32, dev_links.stqe_next);
			CP(*nds, ds32, device_number);
			strcpy(ds32.device_name, nds->device_name);
			CP(*nds, ds32, unit_number);
			for (i = 0; i < DEVSTAT_N_TRANS_FLAGS; i++) {
				FU64_CP(*nds, ds32, bytes[i]);
				FU64_CP(*nds, ds32, operations[i]);
				BT_CP(*nds, ds32, duration[i]);
			}
			BT_CP(*nds, ds32, busy_time);
			BT_CP(*nds, ds32, creation_time);
			CP(*nds, ds32, block_size);
			for (i = 0; i < nitems(ds32.tag_types); i++) {
				FU64_CP(*nds, ds32, tag_types[i]);
			}
			CP(*nds, ds32, flags);
			CP(*nds, ds32, device_type);
			CP(*nds, ds32, priority);
			PTROUT_CP(*nds, ds32, id);
			CP(*nds, ds32, sequence1);
			error = SYSCTL_OUT(req, &ds32, sizeof(ds32));
		} else
#endif /* COMPAT_FREEBSD32 */
			error = SYSCTL_OUT(req, nds, sizeof(*nds));
		if (error != 0)
			return (error);
		mtx_lock(&devstat_mutex);
		if (mygen != devstat_generation)
			error = EBUSY;
		else
			nds = STAILQ_NEXT(nds, dev_links);
		mtx_unlock(&devstat_mutex);
		if (error != 0)
			return (error);
	}
	return (error);
}
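
/*
 * A sketch (illustrative; error handling omitted, layout as produced by
 * the handler above) of how userland can consume kern.devstat.all: the
 * buffer starts with the generation number, followed by one struct
 * devstat per device.  In practice the devstat(3) library wraps this.
 *
 *	size_t len = 0;
 *
 *	sysctlbyname("kern.devstat.all", NULL, &len, NULL, 0);
 *	buf = malloc(len);
 *	sysctlbyname("kern.devstat.all", buf, &len, NULL, 0);
 *	gen = *(long *)buf;
 *	devs = (struct devstat *)((char *)buf + sizeof(long));
 */
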
/*
 * Sysctl entries for devstat.  The first one is a node that all the rest
 * hang off of.
 */
static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Device Statistics");

SYSCTL_PROC(_kern_devstat, OID_AUTO, all,
    CTLFLAG_RD | CTLTYPE_OPAQUE | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_devstat, "S,devstat",
    "All devices in the devstat list");
/*
 * Export the number of devices in the system so that userland utilities
 * can determine how much memory to allocate to hold all the devices.
 */
SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD,
    &devstat_num_devs, 0, "Number of devices in the devstat list");
SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
    &devstat_generation, 0, "Devstat list generation");
SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD,
    &devstat_version, 0, "Devstat list version number");

/*
 * Allocator for struct devstat structures.  We sub-allocate these from pages
 * which we get from malloc.  These pages are exported for mmap(2)'ing through
 * a miniature device driver.
 */

#define	statsperpage (PAGE_SIZE / sizeof(struct devstat))

static d_ioctl_t devstat_ioctl;
static d_mmap_t devstat_mmap;

static struct cdevsw devstat_cdevsw = {
	.d_version =	D_VERSION,
	.d_ioctl =	devstat_ioctl,
	.d_mmap =	devstat_mmap,
	.d_name =	"devstat",
};

struct statspage {
	TAILQ_ENTRY(statspage)	list;
	struct devstat		*stat;
	u_int			nfree;
};

static size_t pagelist_pages = 0;
static TAILQ_HEAD(, statspage)	pagelist = TAILQ_HEAD_INITIALIZER(pagelist);
static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics");

static int
devstat_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	int error = ENOTTY;

	switch (cmd) {
	case DIOCGMEDIASIZE:
		error = 0;
		*(off_t *)data = pagelist_pages * PAGE_SIZE;
		break;
	}

	return (error);
}

static int
devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
    int nprot, vm_memattr_t *memattr)
{
	struct statspage *spp;

	if (nprot != VM_PROT_READ)
		return (-1);
	mtx_lock(&devstat_mutex);
	TAILQ_FOREACH(spp, &pagelist, list) {
		if (offset == 0) {
			*paddr = vtophys(spp->stat);
			mtx_unlock(&devstat_mutex);
			return (0);
		}
		offset -= PAGE_SIZE;
	}
	mtx_unlock(&devstat_mutex);
	return (-1);
}
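
/*
 * A sketch (illustrative only; error handling omitted, and the node path
 * assumes DEVSTAT_DEVICE_NAME is "devstat") of the matching userland
 * side: statistics pages can be mapped read-only through the device node
 * created in devstat_alloc() below, in list order, one page per offset:
 *
 *	int fd = open("/dev/devstat", O_RDONLY);
 *	struct devstat *page = mmap(NULL, PAGE_SIZE, PROT_READ,
 *	    MAP_SHARED, fd, 0);
 */
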
static struct devstat *
devstat_alloc(void)
{
	struct devstat *dsp;
	struct statspage *spp, *spp2;
	u_int u;
	static int once;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);
	if (!once) {
		make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME,
		    &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0444,
		    DEVSTAT_DEVICE_NAME);
		once = 1;
	}
	spp2 = NULL;
	mtx_lock(&devstat_mutex);
	for (;;) {
		TAILQ_FOREACH(spp, &pagelist, list) {
			if (spp->nfree > 0)
				break;
		}
		if (spp != NULL)
			break;
		mtx_unlock(&devstat_mutex);
		spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK);
		spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK);
		spp2->nfree = statsperpage;

		/*
		 * If free statspages were added while the lock was released,
		 * just reuse them.
		 */
		mtx_lock(&devstat_mutex);
		TAILQ_FOREACH(spp, &pagelist, list)
			if (spp->nfree > 0)
				break;
		if (spp == NULL) {
			spp = spp2;

			/*
			 * It would make more sense to add the new page at the
			 * head, but the order on the list determines the
			 * sequence of the mapping, so we can't do that.
			 */
			pagelist_pages++;
			TAILQ_INSERT_TAIL(&pagelist, spp, list);
		} else
			break;
	}
	dsp = spp->stat;
	for (u = 0; u < statsperpage; u++) {
		if (dsp->allocated == 0)
			break;
		dsp++;
	}
	spp->nfree--;
	dsp->allocated = 1;
	mtx_unlock(&devstat_mutex);
	if (spp2 != NULL && spp2 != spp) {
		free(spp2->stat, M_DEVSTAT);
		free(spp2, M_DEVSTAT);
	}
	return (dsp);
}

static void
devstat_free(struct devstat *dsp)
{
	struct statspage *spp;

	mtx_assert(&devstat_mutex, MA_OWNED);
	bzero(dsp, sizeof *dsp);
	TAILQ_FOREACH(spp, &pagelist, list) {
		if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) {
			spp->nfree++;
			return;
		}
	}
}

SYSCTL_SIZEOF_STRUCT(devstat);