/*
 * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/devicestat.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/atomic.h>

static int devstat_num_devs;
static u_int devstat_generation;
static int devstat_version = DEVSTAT_VERSION;
static int devstat_current_devnumber;
static struct mtx devstat_mutex;

static struct devstatlist device_statq;
static struct devstat *devstat_alloc(void);
static void devstat_free(struct devstat *);
static void devstat_add_entry(struct devstat *ds, const void *dev_name,
	    int unit_number, u_int32_t block_size,
	    devstat_support_flags flags,
	    devstat_type_flags device_type,
	    devstat_priority priority);

/*
 * Allocate a devstat and initialize it
 */
struct devstat *
devstat_new_entry(const void *dev_name,
		  int unit_number, u_int32_t block_size,
		  devstat_support_flags flags,
		  devstat_type_flags device_type,
		  devstat_priority priority)
{
	struct devstat *ds;
	static int once;

	if (!once) {
		STAILQ_INIT(&device_statq);
		mtx_init(&devstat_mutex, "devstat", NULL, MTX_DEF);
		once = 1;
	}
	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	ds = devstat_alloc();
	mtx_lock(&devstat_mutex);
	if (unit_number == -1) {
		ds->id = dev_name;
		binuptime(&ds->creation_time);
		devstat_generation++;
	} else {
		devstat_add_entry(ds, dev_name, unit_number, block_size,
				  flags, device_type, priority);
	}
	mtx_unlock(&devstat_mutex);
	return (ds);
}
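
/*
 * Illustrative sketch (not part of this file, hence #if 0): a driver
 * would typically call devstat_new_entry() once at attach time and pair
 * it with devstat_remove_entry() at detach.  struct foo_softc and
 * foo_attach() are hypothetical.
 */
#if 0
struct foo_softc {
	int		 unit;
	struct devstat	*stats;
};

static int
foo_attach(struct foo_softc *sc)
{

	/* Register a 512-byte-block SCSI direct-access disk. */
	sc->stats = devstat_new_entry("foo", sc->unit, 512,
	    DEVSTAT_NO_ORDERED_TAGS,
	    DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_SCSI,
	    DEVSTAT_PRIORITY_DISK);
	return (0);
}
#endif
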
/*
 * Take a malloced and zeroed devstat structure given to us, fill it in
 * and add it to the queue of devices.
 */
static void
devstat_add_entry(struct devstat *ds, const void *dev_name,
		  int unit_number, u_int32_t block_size,
		  devstat_support_flags flags,
		  devstat_type_flags device_type,
		  devstat_priority priority)
{
	struct devstatlist *devstat_head;
	struct devstat *ds_tmp;

	mtx_assert(&devstat_mutex, MA_OWNED);
	devstat_num_devs++;

	devstat_head = &device_statq;

	/*
	 * Priority sort.  Each driver passes in its priority when it adds
	 * its devstat entry.  Drivers are sorted first by priority, and
	 * then by probe order.
	 *
	 * For the first device, we just insert it, since the priority
	 * doesn't really matter yet.  Subsequent devices are inserted into
	 * the list using the order outlined above.
	 */
	if (devstat_num_devs == 1)
		STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
	else {
		STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) {
			struct devstat *ds_next;

			ds_next = STAILQ_NEXT(ds_tmp, dev_links);

			/*
			 * If we find a break between higher and lower
			 * priority items, and if this item fits in the
			 * break, insert it.  This also applies if the
			 * "lower priority item" is the end of the list.
			 */
			if ((priority <= ds_tmp->priority)
			 && ((ds_next == NULL)
			   || (priority > ds_next->priority))) {
				STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds,
						    dev_links);
				break;
			} else if (priority > ds_tmp->priority) {
				/*
				 * If this is the case, we should be able
				 * to insert ourselves at the head of the
				 * list.  If we can't, something is wrong.
				 */
				if (ds_tmp == STAILQ_FIRST(devstat_head)) {
					STAILQ_INSERT_HEAD(devstat_head,
							   ds, dev_links);
					break;
				} else {
					STAILQ_INSERT_TAIL(devstat_head,
							   ds, dev_links);
					printf("devstat_add_entry: HELP! "
					       "sorting problem detected "
					       "for name %p unit %d\n",
					       dev_name, unit_number);
					break;
				}
			}
		}
	}

	ds->device_number = devstat_current_devnumber++;
	ds->unit_number = unit_number;
	strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
	ds->block_size = block_size;
	ds->flags = flags;
	ds->device_type = device_type;
	ds->priority = priority;
	binuptime(&ds->creation_time);
	devstat_generation++;
}
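
/*
 * Worked example for the priority sort above (hypothetical devices):
 * assuming DEVSTAT_PRIORITY_DISK > DEVSTAT_PRIORITY_CD >
 * DEVSTAT_PRIORITY_TAPE, as defined in <sys/devicestat.h>, devices
 * attaching in the order cd0, da0, sa0, da1 end up on the list as:
 *
 *	da0, da1, cd0, sa0
 *
 * i.e. descending priority, with probe order breaking ties.
 */
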
/*
 * Remove a devstat structure from the list of devices.
 */
void
devstat_remove_entry(struct devstat *ds)
{
	struct devstatlist *devstat_head;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);
	if (ds == NULL)
		return;

	mtx_lock(&devstat_mutex);

	devstat_head = &device_statq;

	/* Remove this entry from the devstat queue */
	atomic_add_acq_int(&ds->sequence1, 1);
	if (ds->id == NULL) {
		devstat_num_devs--;
		STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
	}
	devstat_free(ds);
	devstat_generation++;
	mtx_unlock(&devstat_mutex);
}

/*
 * Record a transaction start.
 *
 * See comments for devstat_end_transaction().  Ordering is very important
 * here.
 */
void
devstat_start_transaction(struct devstat *ds, struct bintime *now)
{

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* sanity check */
	if (ds == NULL)
		return;

	atomic_add_acq_int(&ds->sequence1, 1);
	/*
	 * We only want to set the start time when we are going from idle
	 * to busy.  The start time is really the start of the latest busy
	 * period.
	 */
	if (ds->start_count == ds->end_count) {
		if (now != NULL)
			ds->busy_from = *now;
		else
			binuptime(&ds->busy_from);
	}
	ds->start_count++;
	atomic_add_rel_int(&ds->sequence0, 1);
}

void
devstat_start_transaction_bio(struct devstat *ds, struct bio *bp)
{

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* sanity check */
	if (ds == NULL)
		return;

	binuptime(&bp->bio_t0);
	devstat_start_transaction(ds, &bp->bio_t0);
}

/*
 * Record the ending of a transaction, and increment the various counters.
 *
 * Ordering in this function and in devstat_start_transaction() is VERY
 * important.  The idea here is to run without locks, so we are very
 * careful to only modify some fields on the way "down" (i.e. at
 * transaction start) and some fields on the way "up" (i.e. at transaction
 * completion).  One exception is busy_from, which we only modify in
 * devstat_start_transaction() when there are no outstanding transactions,
 * and thus it can't be modified in devstat_end_transaction()
 * simultaneously.
 *
 * The sequence0 and sequence1 fields are provided to enable an application
 * spying on the structures with mmap(2) to tell when a structure is in a
 * consistent state or not.
 *
 * For this to work 100% reliably, it is important that the two fields
 * are at opposite ends of the structure and that they are incremented
 * in the opposite order of how a memcpy(3) in userland would copy them.
 * We assume that the copying happens front to back, but there is actually
 * no way short of writing your own memcpy(3) replacement to guarantee
 * this will be the case.
 *
 * In addition to this, being a kind of lock, they must be updated with
 * atomic instructions using appropriate memory barriers.
 */
void
devstat_end_transaction(struct devstat *ds, u_int32_t bytes,
			devstat_tag_type tag_type, devstat_trans_flags flags,
			struct bintime *now, struct bintime *then)
{
	struct bintime dt, lnow;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* sanity check */
	if (ds == NULL)
		return;

	if (now == NULL) {
		now = &lnow;
		binuptime(now);
	}

	atomic_add_acq_int(&ds->sequence1, 1);
	/* Update byte and operations counts */
	ds->bytes[flags] += bytes;
	ds->operations[flags]++;

	/*
	 * Keep a count of the various tag types sent.
	 */
	if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
	    tag_type != DEVSTAT_TAG_NONE)
		ds->tag_types[tag_type]++;

	if (then != NULL) {
		/* Update duration of operations */
		dt = *now;
		bintime_sub(&dt, then);
		bintime_add(&ds->duration[flags], &dt);
	}

	/* Accumulate busy time */
	dt = *now;
	bintime_sub(&dt, &ds->busy_from);
	bintime_add(&ds->busy_time, &dt);
	ds->busy_from = *now;

	ds->end_count++;
	atomic_add_rel_int(&ds->sequence0, 1);
}
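
/*
 * Illustrative userland sketch (not part of this file, hence #if 0): a
 * program watching the mmap(2)'ed structures can use the sequence0 and
 * sequence1 counters described above to reject torn copies.  The helper
 * name devstat_snapshot() is hypothetical.
 */
#if 0
#include <sys/devicestat.h>
#include <string.h>

/* Copy one entry; return non-zero if the copy is internally consistent. */
static int
devstat_snapshot(const struct devstat *live, struct devstat *snap)
{

	/*
	 * memcpy(3) is assumed to copy front to back, so sequence0 (at
	 * the front) is captured before sequence1 (at the back).  The
	 * kernel bumps sequence1 before and sequence0 after each update,
	 * so an update overlapping the copy leaves the counters unequal.
	 */
	memcpy(snap, live, sizeof(*snap));
	return (snap->sequence0 == snap->sequence1);
}
#endif
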
void
devstat_end_transaction_bio(struct devstat *ds, struct bio *bp)
{
	devstat_trans_flags flg;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* sanity check */
	if (ds == NULL)
		return;

	if (bp->bio_cmd == BIO_DELETE)
		flg = DEVSTAT_FREE;
	else if (bp->bio_cmd == BIO_READ)
		flg = DEVSTAT_READ;
	else if (bp->bio_cmd == BIO_WRITE)
		flg = DEVSTAT_WRITE;
	else
		flg = DEVSTAT_NO_DATA;

	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
				DEVSTAT_TAG_SIMPLE, flg, NULL, &bp->bio_t0);
}

/*
 * This is the sysctl handler for the devstat package.  The data pushed out
 * on the kern.devstat.all sysctl variable consists of the current devstat
 * generation number, and then an array of devstat structures, one for each
 * device in the system.
 *
 * This is more cryptic than obvious, but basically we neither can nor
 * want to hold the devstat_mutex for any amount of time, so we grab it
 * only when we need to and keep an eye on devstat_generation all the time.
 */
static int
sysctl_devstat(SYSCTL_HANDLER_ARGS)
{
	int error;
	u_int mygen;
	struct devstat *nds;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	if (devstat_num_devs == 0)
		return (EINVAL);

	/*
	 * XXX devstat_generation should really be "volatile" but that
	 * XXX freaks out the sysctl macro below.  The places where we
	 * XXX change it and inspect it are bracketed in the mutex which
	 * XXX guarantees us proper write barriers.  I don't believe the
	 * XXX compiler is allowed to optimize mygen away across calls
	 * XXX to other functions, so the following is believed to be safe.
	 */
	mygen = devstat_generation;

	error = SYSCTL_OUT(req, &mygen, sizeof(mygen));

	if (error != 0)
		return (error);

	mtx_lock(&devstat_mutex);
	nds = STAILQ_FIRST(&device_statq);
	if (mygen != devstat_generation)
		error = EBUSY;
	mtx_unlock(&devstat_mutex);

	if (error != 0)
		return (error);

	for (; nds != NULL;) {
		error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
		if (error != 0)
			return (error);
		mtx_lock(&devstat_mutex);
		if (mygen != devstat_generation)
			error = EBUSY;
		else
			nds = STAILQ_NEXT(nds, dev_links);
		mtx_unlock(&devstat_mutex);
		if (error != 0)
			return (error);
	}
	return (error);
}

/*
 * Sysctl entries for devstat.  The first one is a node that all the rest
 * hang off of.
 */
SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, 0, "Device Statistics");

SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE,
    0, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list");
/*
 * Export the number of devices in the system so that userland utilities
 * can determine how much memory to allocate to hold all the devices.
 */
SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD,
    &devstat_num_devs, 0, "Number of devices in the devstat list");
SYSCTL_UINT(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
    &devstat_generation, 0, "Devstat list generation");
SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD,
    &devstat_version, 0, "Devstat list version number");
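
/*
 * Illustrative userland sketch (not part of this file, hence #if 0):
 * fetching kern.devstat.all with sysctlbyname(3).  The buffer starts
 * with the u_int generation written above, followed by the array of
 * struct devstat.  fetch_devstat_all() is a hypothetical helper; real
 * consumers normally go through libdevstat(3).
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/devicestat.h>
#include <stdlib.h>

static void *
fetch_devstat_all(u_int *generation, size_t *len)
{
	void *buf;

	/* The first call sizes the buffer, the second call fills it. */
	if (sysctlbyname("kern.devstat.all", NULL, len, NULL, 0) == -1)
		return (NULL);
	if ((buf = malloc(*len)) == NULL)
		return (NULL);
	if (sysctlbyname("kern.devstat.all", buf, len, NULL, 0) == -1) {
		free(buf);	/* EBUSY means the list changed; retry */
		return (NULL);
	}
	*generation = *(u_int *)buf;
	return (buf);
}
#endif
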
/*
 * Allocator for struct devstat structures.  We sub-allocate these from pages
 * which we get from malloc.  These pages are exported for mmap(2)'ing through
 * a miniature device driver.
 */

#define statsperpage (PAGE_SIZE / sizeof(struct devstat))

static d_mmap_t devstat_mmap;

static struct cdevsw devstat_cdevsw = {
	.d_open =	nullopen,
	.d_close =	nullclose,
	.d_mmap =	devstat_mmap,
	.d_name =	"devstat",
};

struct statspage {
	TAILQ_ENTRY(statspage)	list;
	struct devstat		*stat;
	u_int			nfree;
};

static TAILQ_HEAD(, statspage)	pagelist = TAILQ_HEAD_INITIALIZER(pagelist);
static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics");

static int
devstat_mmap(dev_t dev, vm_offset_t offset, vm_paddr_t *paddr, int nprot)
{
	struct statspage *spp;

	if (nprot != VM_PROT_READ)
		return (-1);
	TAILQ_FOREACH(spp, &pagelist, list) {
		if (offset == 0) {
			*paddr = vtophys(spp->stat);
			return (0);
		}
		offset -= PAGE_SIZE;
	}
	return (-1);
}

static struct devstat *
devstat_alloc(void)
{
	struct devstat *dsp;
	struct statspage *spp;
	u_int u;
	static int once;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);
	if (!once) {
		make_dev(&devstat_cdevsw, 0,
		    UID_ROOT, GID_WHEEL, 0400, DEVSTAT_DEVICE_NAME);
		once = 1;
	}
	mtx_lock(&devstat_mutex);
	for (;;) {
		TAILQ_FOREACH(spp, &pagelist, list) {
			if (spp->nfree > 0)
				break;
		}
		if (spp != NULL)
			break;
		/*
		 * We had no free slot in any of our pages, drop the mutex
		 * and get another page.  In theory we could have more than
		 * one process doing this at the same time and consequently
		 * we may allocate more pages than we will need.  That is
		 * Just Too Bad[tm], we can live with that.
		 */
		mtx_unlock(&devstat_mutex);
		spp = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK);
		spp->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK);
		spp->nfree = statsperpage;
		mtx_lock(&devstat_mutex);
		/*
		 * It would make more sense to add the new page at the head
		 * but the order on the list determines the sequence of the
		 * mapping so we can't do that.
		 */
		TAILQ_INSERT_TAIL(&pagelist, spp, list);
	}
	dsp = spp->stat;
	for (u = 0; u < statsperpage; u++) {
		if (dsp->allocated == 0)
			break;
		dsp++;
	}
	spp->nfree--;
	dsp->allocated = 1;
	mtx_unlock(&devstat_mutex);
	return (dsp);
}

static void
devstat_free(struct devstat *dsp)
{
	struct statspage *spp;

	mtx_assert(&devstat_mutex, MA_OWNED);
	bzero(dsp, sizeof *dsp);
	TAILQ_FOREACH(spp, &pagelist, list) {
		if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) {
			spp->nfree++;
			return;
		}
	}
}

SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD,
    0, sizeof(struct devstat), "sizeof(struct devstat)");
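
/*
 * Illustrative userland sketch (not part of this file, hence #if 0):
 * mapping the exported pages read-only.  DEVSTAT_DEVICE_NAME is
 * "devstat", so the node shows up as /dev/devstat; each mapped page
 * holds "statsperpage" entries, in the order the pages were allocated
 * above.  map_devstat_page() is a hypothetical helper.
 */
#if 0
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static void *
map_devstat_page(u_int page)
{
	void *p;
	int fd;

	if ((fd = open("/dev/devstat", O_RDONLY)) == -1)
		return (NULL);
	/* Only PROT_READ mappings succeed; see devstat_mmap() above. */
	p = mmap(NULL, (size_t)getpagesize(), PROT_READ, MAP_SHARED, fd,
	    (off_t)page * getpagesize());
	close(fd);
	return (p == MAP_FAILED ? NULL : p);
}
#endif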