/*-
 * CAM IO Scheduler Interface
 *
 * Copyright (c) 2015 Netflix, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification, immediately at the beginning of the file.
 * 2. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "opt_cam.h"
#include "opt_ddb.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>

#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_iosched.h>

#include <ddb/ddb.h>

static MALLOC_DEFINE(M_CAMSCHED, "CAM I/O Scheduler",
    "CAM I/O Scheduler buffers");

/*
 * Default I/O scheduler for FreeBSD. This implementation is just a thin
 * veneer over the bioq_* interface, with notions of separate calls for
 * normal I/O and for trims.
 */

#ifdef CAM_NETFLIX_IOSCHED

SYSCTL_DECL(_kern_cam);
static int do_netflix_iosched = 1;
TUNABLE_INT("kern.cam.do_netflix_iosched", &do_netflix_iosched);
SYSCTL_INT(_kern_cam, OID_AUTO, do_netflix_iosched, CTLFLAG_RD,
    &do_netflix_iosched, 1,
    "Enable Netflix I/O scheduler optimizations.");

static int alpha_bits = 9;
TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits);
SYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW,
    &alpha_bits, 1,
    "Bits in EMA's alpha.");

struct iop_stats;
struct cam_iosched_softc;

int iosched_debug = 0;

typedef enum {
        none = 0,                       /* No limits */
        queue_depth,                    /* Limit how many ops we queue to SIM */
        iops,                           /* Limit # of IOPS to the drive */
        bandwidth,                      /* Limit bandwidth to the drive */
        limiter_max
} io_limiter;

static const char *cam_iosched_limiter_names[] =
    { "none", "queue_depth", "iops", "bandwidth" };

/*
 * Called to initialize the bits of the iop_stats structure relevant to the
 * limiter. Called just after the limiter is set.
 */
typedef int l_init_t(struct iop_stats *);

/*
 * Called every tick.
 */
typedef int l_tick_t(struct iop_stats *);

/*
 * Called to see if the limiter thinks this IOP can be allowed to
 * proceed. If so, the limiter assumes that the IOP proceeded
 * and makes any accounting of it that's needed.
 */
typedef int l_iop_t(struct iop_stats *, struct bio *);

/*
 * Called when an I/O completes so the limiter can update its
 * accounting. Pending I/Os may complete in any order (even when
 * sent to the hardware at the same time), so the limiter may not
 * make any assumptions other than this I/O has completed. If it
 * returns 1, then xpt_schedule() needs to be called again.
 */
typedef int l_iodone_t(struct iop_stats *, struct bio *);

static l_iop_t cam_iosched_qd_iop;
static l_iop_t cam_iosched_qd_caniop;
static l_iodone_t cam_iosched_qd_iodone;

static l_init_t cam_iosched_iops_init;
static l_tick_t cam_iosched_iops_tick;
static l_iop_t cam_iosched_iops_caniop;
static l_iop_t cam_iosched_iops_iop;

static l_init_t cam_iosched_bw_init;
static l_tick_t cam_iosched_bw_tick;
static l_iop_t cam_iosched_bw_caniop;
static l_iop_t cam_iosched_bw_iop;

struct limswitch
{
        l_init_t        *l_init;
        l_tick_t        *l_tick;
        l_iop_t         *l_iop;
        l_iop_t         *l_caniop;
        l_iodone_t      *l_iodone;
} limsw[] =
{
        { /* none */
                .l_init = NULL,
                .l_tick = NULL,
                .l_iop = NULL,
                .l_iodone = NULL,
        },
        { /* queue_depth */
                .l_init = NULL,
                .l_tick = NULL,
                .l_caniop = cam_iosched_qd_caniop,
                .l_iop = cam_iosched_qd_iop,
                .l_iodone = cam_iosched_qd_iodone,
        },
        { /* iops */
                .l_init = cam_iosched_iops_init,
                .l_tick = cam_iosched_iops_tick,
                .l_caniop = cam_iosched_iops_caniop,
                .l_iop = cam_iosched_iops_iop,
                .l_iodone = NULL,
        },
        { /* bandwidth */
                .l_init = cam_iosched_bw_init,
                .l_tick = cam_iosched_bw_tick,
                .l_caniop = cam_iosched_bw_caniop,
                .l_iop = cam_iosched_bw_iop,
                .l_iodone = NULL,
        },
};
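/*
 * Illustrative sketch (not compiled in): a new limiter is added by supplying
 * whichever limswitch methods it needs and a row in limsw[] matching a new
 * io_limiter enum value. For example, a hypothetical limiter that refuses
 * all writes could be as small as:
 *
 *      static int
 *      cam_iosched_wronly_caniop(struct iop_stats *ios, struct bio *bp)
 *      {
 *              return (bp->bio_cmd == BIO_WRITE) ? EAGAIN : 0;
 *      }
 *
 * plus { .l_caniop = cam_iosched_wronly_caniop } in limsw[] and a matching
 * entry in cam_iosched_limiter_names[]. The name and behavior here are made
 * up for illustration.
 */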
struct iop_stats
{
        /*
         * sysctl state for this subnode.
         */
        struct sysctl_ctx_list  sysctl_ctx;
        struct sysctl_oid       *sysctl_tree;

        /*
         * Information about the current rate limiters, if any
         */
        io_limiter      limiter;        /* How are I/Os being limited */
        int             min;            /* Low range of limit */
        int             max;            /* High range of limit */
        int             current;        /* Current rate limiter */
        int             l_value1;       /* per-limiter scratch value 1. */
        int             l_value2;       /* per-limiter scratch value 2. */

        /*
         * Debug information about counts of I/Os that have gone through the
         * scheduler.
         */
        int             pending;        /* I/Os pending in the hardware */
        int             queued;         /* number currently in the queue */
        int             total;          /* Total for all time -- wraps */
        int             in;             /* number queued all time -- wraps */
        int             out;            /* number completed all time -- wraps */

        /*
         * Statistics on different bits of the process.
         */
        /* Exp Moving Average, alpha = 1 / (1 << alpha_bits) */
        sbintime_t      ema;
        sbintime_t      emss;           /* Exp Moving sum of the squares */
        sbintime_t      sd;             /* Last computed sd */

        struct cam_iosched_softc *softc;
};

typedef enum {
        set_max = 0,                    /* current = max */
        read_latency,                   /* Steer read latency by throttling writes */
        cl_max                          /* Keep last */
} control_type;

static const char *cam_iosched_control_type_names[] =
    { "set_max", "read_latency" };

struct control_loop
{
        /*
         * sysctl state for this subnode.
         */
        struct sysctl_ctx_list  sysctl_ctx;
        struct sysctl_oid       *sysctl_tree;

        sbintime_t      next_steer;     /* Time of next steer */
        sbintime_t      steer_interval; /* How often do we steer? */
        sbintime_t      lolat;
        sbintime_t      hilat;
        int             alpha;
        control_type    type;           /* What type of control? */
        int             last_count;     /* Last I/O count */

        struct cam_iosched_softc *softc;
};

#endif

struct cam_iosched_softc
{
        struct bio_queue_head bio_queue;
        struct bio_queue_head trim_queue;
        /* scheduler flags < 16, user flags >= 16 */
        uint32_t        flags;
        int             sort_io_queue;
#ifdef CAM_NETFLIX_IOSCHED
        int             read_bias;              /* Read bias setting */
        int             current_read_bias;      /* Current read bias state */
        int             total_ticks;

        struct bio_queue_head write_queue;
        struct iop_stats read_stats, write_stats, trim_stats;
        struct sysctl_ctx_list sysctl_ctx;
        struct sysctl_oid *sysctl_tree;

        int             quanta;         /* Number of quanta per second */
        struct callout  ticker;         /* Callout for our quota system */
        struct cam_periph *periph;      /* cam periph associated with this device */
        uint32_t        this_frac;      /* Fraction of a second (65536ths) for this tick */
        sbintime_t      last_time;      /* Last time we ticked */
        struct control_loop cl;
#endif
};

#ifdef CAM_NETFLIX_IOSCHED
/*
 * Helper functions to call the limsw functions.
 */
static int
cam_iosched_limiter_init(struct iop_stats *ios)
{
        int lim = ios->limiter;

        /* maybe this should be a kassert */
        if (lim < none || lim >= limiter_max)
                return EINVAL;

        if (limsw[lim].l_init)
                return limsw[lim].l_init(ios);

        return 0;
}

static int
cam_iosched_limiter_tick(struct iop_stats *ios)
{
        int lim = ios->limiter;

        /* maybe this should be a kassert */
        if (lim < none || lim >= limiter_max)
                return EINVAL;

        if (limsw[lim].l_tick)
                return limsw[lim].l_tick(ios);

        return 0;
}

static int
cam_iosched_limiter_iop(struct iop_stats *ios, struct bio *bp)
{
        int lim = ios->limiter;

        /* maybe this should be a kassert */
        if (lim < none || lim >= limiter_max)
                return EINVAL;

        if (limsw[lim].l_iop)
                return limsw[lim].l_iop(ios, bp);

        return 0;
}

static int
cam_iosched_limiter_caniop(struct iop_stats *ios, struct bio *bp)
{
        int lim = ios->limiter;

        /* maybe this should be a kassert */
        if (lim < none || lim >= limiter_max)
                return EINVAL;

        if (limsw[lim].l_caniop)
                return limsw[lim].l_caniop(ios, bp);

        return 0;
}

static int
cam_iosched_limiter_iodone(struct iop_stats *ios, struct bio *bp)
{
        int lim = ios->limiter;

        /* maybe this should be a kassert */
        if (lim < none || lim >= limiter_max)
                return 0;

        if (limsw[lim].l_iodone)
                return limsw[lim].l_iodone(ios, bp);

        return 0;
}

/*
 * Functions to implement the different kinds of limiters
 */

static int
cam_iosched_qd_iop(struct iop_stats *ios, struct bio *bp)
{

        if (ios->current <= 0 || ios->pending < ios->current)
                return 0;

        return EAGAIN;
}

static int
cam_iosched_qd_caniop(struct iop_stats *ios, struct bio *bp)
{

        if (ios->current <= 0 || ios->pending < ios->current)
                return 0;

        return EAGAIN;
}

static int
cam_iosched_qd_iodone(struct iop_stats *ios, struct bio *bp)
{

        if (ios->current <= 0 || ios->pending != ios->current)
                return 0;

        return 1;
}

static int
cam_iosched_iops_init(struct iop_stats *ios)
{

        ios->l_value1 = ios->current / ios->softc->quanta;
        if (ios->l_value1 <= 0)
                ios->l_value1 = 1;

        return 0;
}

static int
cam_iosched_iops_tick(struct iop_stats *ios)
{

        ios->l_value1 = (int)((ios->current * (uint64_t)ios->softc->this_frac) >> 16);
        if (ios->l_value1 <= 0)
                ios->l_value1 = 1;

        return 0;
}

static int
cam_iosched_iops_caniop(struct iop_stats *ios, struct bio *bp)
{

        /*
         * So if we have any more IOPs left, allow it,
         * otherwise wait.
         */
        if (ios->l_value1 <= 0)
                return EAGAIN;
        return 0;
}

static int
cam_iosched_iops_iop(struct iop_stats *ios, struct bio *bp)
{
        int rv;

        rv = cam_iosched_limiter_caniop(ios, bp);
        if (rv == 0)
                ios->l_value1--;

        return rv;
}
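/*
 * Worked example for the iops limiter (illustrative only; the rate is an
 * assumption, not a default from elsewhere in this file): with
 * current = 1000 IOPS and quanta = 200, cam_iosched_iops_init() grants
 * 1000 / 200 = 5 I/Os per quantum. Each tick, cam_iosched_iops_tick()
 * rescales by the measured fraction of a second: a tick of exactly 1/200s
 * gives this_frac of about 2^16 / 200 = 327, so l_value1 becomes
 * (1000 * 327) >> 16 = 4 (integer truncation loses a little; the floor of
 * 1 keeps I/O from stalling entirely).
 */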
static int
cam_iosched_bw_init(struct iop_stats *ios)
{

        /* ios->current is in kB/s, so scale to bytes */
        ios->l_value1 = ios->current * 1000 / ios->softc->quanta;

        return 0;
}

static int
cam_iosched_bw_tick(struct iop_stats *ios)
{
        int bw;

        /*
         * If we're in the hole for available quota from
         * the last time, then add the quantum for this.
         * If we have any left over from the last quantum,
         * then too bad, that's lost. Also, ios->current
         * is in kB/s, so scale.
         *
         * We also allow up to 4 quanta of credits to
         * accumulate to deal with burstiness. 4 is extremely
         * arbitrary.
         */
        bw = (int)((ios->current * 1000ull * (uint64_t)ios->softc->this_frac) >> 16);
        if (ios->l_value1 < bw * 4)
                ios->l_value1 += bw;

        return 0;
}

static int
cam_iosched_bw_caniop(struct iop_stats *ios, struct bio *bp)
{
        /*
         * So if we have any more bw quota left, allow it,
         * otherwise wait. Note, we'll go negative and that's
         * OK. We'll just get a little less next quantum.
         *
         * Note on going negative: that allows us to process
         * requests in order better, since we won't allow
         * shorter reads to get around the long one that we
         * don't have the quota to do just yet. It also prevents
         * starvation by being a little more permissive about
         * what we let through this quantum (to prevent the
         * starvation), at the cost of getting a little less
         * next quantum.
         */
        if (ios->l_value1 <= 0)
                return EAGAIN;

        return 0;
}

static int
cam_iosched_bw_iop(struct iop_stats *ios, struct bio *bp)
{
        int rv;

        rv = cam_iosched_limiter_caniop(ios, bp);
        if (rv == 0)
                ios->l_value1 -= bp->bio_length;

        return rv;
}
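/*
 * Worked example for the bandwidth limiter (illustrative; the rate is an
 * assumption): with current = 100000 kB/s and quanta = 200, one ideal
 * quantum is worth 100000 * 1000 / 200 = 500000 bytes of credit. A 128 kB
 * read leaves l_value1 at 500000 - 131072 = 368928. A request that
 * overshoots pushes l_value1 negative and simply eats into the next
 * quantum's refill, which in turn is capped at 4 quanta of accumulated
 * credit.
 */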
static void cam_iosched_cl_maybe_steer(struct control_loop *clp);

static void
cam_iosched_ticker(void *arg)
{
        struct cam_iosched_softc *isc = arg;
        sbintime_t now, delta;

        callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);

        now = sbinuptime();
        delta = now - isc->last_time;
        isc->this_frac = (uint32_t)delta >> 16; /* Note: discards seconds -- should be 0, harmless if not */
        isc->last_time = now;

        cam_iosched_cl_maybe_steer(&isc->cl);

        cam_iosched_limiter_tick(&isc->read_stats);
        cam_iosched_limiter_tick(&isc->write_stats);
        cam_iosched_limiter_tick(&isc->trim_stats);

        cam_iosched_schedule(isc, isc->periph);

        isc->total_ticks++;
}

static void
cam_iosched_cl_init(struct control_loop *clp, struct cam_iosched_softc *isc)
{

        clp->next_steer = sbinuptime();
        clp->softc = isc;
        clp->steer_interval = SBT_1S * 5;       /* Let's start out steering every 5s */
        clp->lolat = 5 * SBT_1MS;
        clp->hilat = 15 * SBT_1MS;
        clp->alpha = 20;                        /* Alpha == gain. 20 = .2 */
        clp->type = set_max;
}

static void
cam_iosched_cl_maybe_steer(struct control_loop *clp)
{
        struct cam_iosched_softc *isc;
        sbintime_t now, lat;
        int old;

        isc = clp->softc;
        now = isc->last_time;
        if (now < clp->next_steer)
                return;

        clp->next_steer = now + clp->steer_interval;
        switch (clp->type) {
        case set_max:
                if (isc->write_stats.current != isc->write_stats.max)
                        printf("Steering write from %d kBps to %d kBps\n",
                            isc->write_stats.current, isc->write_stats.max);
                isc->read_stats.current = isc->read_stats.max;
                isc->write_stats.current = isc->write_stats.max;
                isc->trim_stats.current = isc->trim_stats.max;
                break;
        case read_latency:
                old = isc->write_stats.current;
                lat = isc->read_stats.ema;
                /*
                 * Simple PLL-like engine. Since we're steering to a range for
                 * the SP (set point), things are a little more complicated.
                 * In addition, we're not directly controlling our PV (process
                 * variable), the read latency, but instead are manipulating
                 * the write bandwidth limit as our MV (manipulated variable),
                 * so analysis of this code gets a bit messy. Also, the MV is
                 * a very noisy control surface for read latency since it is
                 * affected by many hidden processes inside the device which
                 * change how responsive read latency will be in reaction to
                 * changes in write bandwidth. Unlike the classic boiler
                 * control PLL, this may result in over-steering while the SSD
                 * takes its time to react to the new, lower load. This is why
                 * we use a relatively low alpha of between .1 and .25 to
                 * compensate for this effect. At .1, it takes ~22 steering
                 * intervals to back off by a factor of 10. At .2 it only
                 * takes ~10. At .25 it only takes ~8. However, some
                 * preliminary data from the SSD drives suggests a response
                 * time in 10's of seconds before latency drops regardless of
                 * the new write rate. Careful observation will be required to
                 * tune this effectively.
                 *
                 * Also, when there's no read traffic, we jack up the write
                 * limit too regardless of the last read latency. 10 is
                 * somewhat arbitrary.
                 */
                if (lat < clp->lolat || isc->read_stats.total - clp->last_count < 10)
                        isc->write_stats.current = isc->write_stats.current *
                            (100 + clp->alpha) / 100;   /* Scale up */
                else if (lat > clp->hilat)
                        isc->write_stats.current = isc->write_stats.current *
                            (100 - clp->alpha) / 100;   /* Scale down */
                clp->last_count = isc->read_stats.total;

                /*
                 * Even if we don't steer, per se, enforce the min/max limits as
                 * those may have changed.
                 */
                if (isc->write_stats.current < isc->write_stats.min)
                        isc->write_stats.current = isc->write_stats.min;
                if (isc->write_stats.current > isc->write_stats.max)
                        isc->write_stats.current = isc->write_stats.max;
                if (old != isc->write_stats.current && iosched_debug)
                        printf("Steering write from %d kBps to %d kBps due to latency of %juus\n",
                            old, isc->write_stats.current,
                            (uintmax_t)((uint64_t)1000000 * (uint32_t)lat) >> 32);
                break;
        case cl_max:
                break;
        }
}
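/*
 * Where the ~22 / ~10 / ~8 steering-interval counts in the comment above
 * come from (a sanity check, not new tuning data): each down-steer scales
 * the limit by (100 - alpha) / 100, so backing off by a factor of 10 takes
 * n steers where (1 - alpha/100)^n = 0.1, i.e.
 * n = ln(0.1) / ln(1 - alpha/100). For alpha = 10 that's
 * ln(0.1)/ln(0.9) ~= 21.9; for alpha = 20, ln(0.1)/ln(0.8) ~= 10.3; for
 * alpha = 25, ln(0.1)/ln(0.75) ~= 8.0.
 */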
#endif

/* Trim or similar currently pending completion */
#define CAM_IOSCHED_FLAG_TRIM_ACTIVE    (1ul << 0)

/* Callout active, and needs to be torn down */
#define CAM_IOSCHED_FLAG_CALLOUT_ACTIVE (1ul << 1)

/* Periph drivers set these flags to indicate work */
#define CAM_IOSCHED_FLAG_WORK_FLAGS     ((0xffffu) << 16)

#ifdef CAM_NETFLIX_IOSCHED
static void
cam_iosched_io_metric_update(struct cam_iosched_softc *isc,
    sbintime_t sim_latency, int cmd, size_t size);
#endif

static inline int
cam_iosched_has_flagged_work(struct cam_iosched_softc *isc)
{
        return !!(isc->flags & CAM_IOSCHED_FLAG_WORK_FLAGS);
}

static inline int
cam_iosched_has_io(struct cam_iosched_softc *isc)
{
#ifdef CAM_NETFLIX_IOSCHED
        if (do_netflix_iosched) {
                struct bio *rbp = bioq_first(&isc->bio_queue);
                struct bio *wbp = bioq_first(&isc->write_queue);
                int can_write = wbp != NULL &&
                    cam_iosched_limiter_caniop(&isc->write_stats, wbp) == 0;
                int can_read = rbp != NULL &&
                    cam_iosched_limiter_caniop(&isc->read_stats, rbp) == 0;
                if (iosched_debug > 2) {
                        printf("can write %d: pending_writes %d max_writes %d\n", can_write, isc->write_stats.pending, isc->write_stats.max);
                        printf("can read %d: read_stats.pending %d max_reads %d\n", can_read, isc->read_stats.pending, isc->read_stats.max);
                        printf("Queued reads %d writes %d\n", isc->read_stats.queued, isc->write_stats.queued);
                }
                return can_read || can_write;
        }
#endif
        return bioq_first(&isc->bio_queue) != NULL;
}

static inline int
cam_iosched_has_more_trim(struct cam_iosched_softc *isc)
{
        return !(isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) &&
            bioq_first(&isc->trim_queue);
}

#define cam_iosched_sort_queue(isc)     ((isc)->sort_io_queue >= 0 ?    \
    (isc)->sort_io_queue : cam_sort_io_queues)

static inline int
cam_iosched_has_work(struct cam_iosched_softc *isc)
{
#ifdef CAM_NETFLIX_IOSCHED
        if (iosched_debug > 2)
                printf("has work: %d %d %d\n", cam_iosched_has_io(isc),
                    cam_iosched_has_more_trim(isc),
                    cam_iosched_has_flagged_work(isc));
#endif

        return cam_iosched_has_io(isc) ||
            cam_iosched_has_more_trim(isc) ||
            cam_iosched_has_flagged_work(isc);
}

#ifdef CAM_NETFLIX_IOSCHED
static void
cam_iosched_iop_stats_init(struct cam_iosched_softc *isc, struct iop_stats *ios)
{

        ios->limiter = none;
        cam_iosched_limiter_init(ios);
        ios->in = 0;
        ios->max = 300000;
        ios->min = 1;
        ios->out = 0;
        ios->pending = 0;
        ios->queued = 0;
        ios->total = 0;
        ios->ema = 0;
        ios->emss = 0;
        ios->sd = 0;
        ios->softc = isc;
}

static int
cam_iosched_limiter_sysctl(SYSCTL_HANDLER_ARGS)
{
        char buf[16];
        struct iop_stats *ios;
        struct cam_iosched_softc *isc;
        int value, i, error, cantick;
        const char *p;

        ios = arg1;
        isc = ios->softc;
        value = ios->limiter;
        if (value < none || value >= limiter_max)
                p = "UNKNOWN";
        else
                p = cam_iosched_limiter_names[value];

        strlcpy(buf, p, sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return error;

        cam_periph_lock(isc->periph);

        for (i = none; i < limiter_max; i++) {
                if (strcmp(buf, cam_iosched_limiter_names[i]) != 0)
                        continue;
                ios->limiter = i;
                error = cam_iosched_limiter_init(ios);
                if (error != 0) {
                        ios->limiter = value;
                        cam_periph_unlock(isc->periph);
                        return error;
                }
                cantick = !!limsw[isc->read_stats.limiter].l_tick +
                    !!limsw[isc->write_stats.limiter].l_tick +
                    !!limsw[isc->trim_stats.limiter].l_tick +
                    1;  /* Control loop requires it */
                if (isc->flags & CAM_IOSCHED_FLAG_CALLOUT_ACTIVE) {
                        if (cantick == 0) {
                                callout_stop(&isc->ticker);
                                isc->flags &= ~CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
                        }
                } else {
                        if (cantick != 0) {
                                callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
                                isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
                        }
                }

                cam_periph_unlock(isc->periph);
                return 0;
        }

        cam_periph_unlock(isc->periph);
        return EINVAL;
}
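/*
 * Usage sketch for the handler above (the sysctl prefix is an assumption --
 * the exact path depends on which periph driver attached the scheduler,
 * e.g. kern.cam.ada.0 for an ada disk):
 *
 *      # Cap writes at 100 MB/s, assuming that node exists:
 *      sysctl kern.cam.ada.0.iosched.write.limiter=bandwidth
 *      sysctl kern.cam.ada.0.iosched.write.max=100000
 *
 * Writing an unknown name returns EINVAL; switching limiters re-runs
 * l_init and starts or stops the ticker as needed.
 */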
static int
cam_iosched_control_type_sysctl(SYSCTL_HANDLER_ARGS)
{
        char buf[16];
        struct control_loop *clp;
        struct cam_iosched_softc *isc;
        int value, i, error;
        const char *p;

        clp = arg1;
        isc = clp->softc;
        value = clp->type;
        if (value < set_max || value >= cl_max)
                p = "UNKNOWN";
        else
                p = cam_iosched_control_type_names[value];

        strlcpy(buf, p, sizeof(buf));
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return error;

        for (i = set_max; i < cl_max; i++) {
                if (strcmp(buf, cam_iosched_control_type_names[i]) != 0)
                        continue;
                cam_periph_lock(isc->periph);
                clp->type = i;
                cam_periph_unlock(isc->periph);
                return 0;
        }

        return EINVAL;
}

static int
cam_iosched_sbintime_sysctl(SYSCTL_HANDLER_ARGS)
{
        char buf[16];
        sbintime_t value;
        int error;
        uint64_t us;

        value = *(sbintime_t *)arg1;
        us = (uint64_t)value / SBT_1US;
        snprintf(buf, sizeof(buf), "%ju", (uintmax_t)us);
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return error;
        us = strtoul(buf, NULL, 10);
        if (us == 0)
                return EINVAL;
        *(sbintime_t *)arg1 = us * SBT_1US;
        return 0;
}
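/*
 * Example of the fixed-point conversion above (a sanity check, not a new
 * tunable): sbintime_t is a 32.32 binary fraction of a second, so 5 ms is
 * 5 * SBT_1MS ~= 0.005 * 2^32 ~= 21474835. Dividing that by SBT_1US
 * recovers roughly 5000, which is what a 5 ms lolat reports in us.
 */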
static void
cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stats *ios, char *name)
{
        struct sysctl_oid_list *n;
        struct sysctl_ctx_list *ctx;

        ios->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx,
            SYSCTL_CHILDREN(isc->sysctl_tree), OID_AUTO, name,
            CTLFLAG_RD, 0, name);
        n = SYSCTL_CHILDREN(ios->sysctl_tree);
        ctx = &ios->sysctl_ctx;

        SYSCTL_ADD_UQUAD(ctx, n,
            OID_AUTO, "ema", CTLFLAG_RD,
            &ios->ema,
            "Fast Exponentially Weighted Moving Average");
        SYSCTL_ADD_UQUAD(ctx, n,
            OID_AUTO, "emss", CTLFLAG_RD,
            &ios->emss,
            "Fast Exponentially Weighted Moving Sum of Squares (maybe wrong)");
        SYSCTL_ADD_UQUAD(ctx, n,
            OID_AUTO, "sd", CTLFLAG_RD,
            &ios->sd,
            "Estimated SD for fast ema (may be wrong)");

        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "pending", CTLFLAG_RD,
            &ios->pending, 0,
            "Instantaneous # of pending transactions");
        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "count", CTLFLAG_RD,
            &ios->total, 0,
            "# of transactions submitted to hardware");
        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "queued", CTLFLAG_RD,
            &ios->queued, 0,
            "# of transactions in the queue");
        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "in", CTLFLAG_RD,
            &ios->in, 0,
            "# of transactions queued to driver");
        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "out", CTLFLAG_RD,
            &ios->out, 0,
            "# of transactions completed");

        SYSCTL_ADD_PROC(ctx, n,
            OID_AUTO, "limiter", CTLTYPE_STRING | CTLFLAG_RW,
            ios, 0, cam_iosched_limiter_sysctl, "A",
            "Current limiting type.");
        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "min", CTLFLAG_RW,
            &ios->min, 0,
            "min resource");
        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "max", CTLFLAG_RW,
            &ios->max, 0,
            "max resource");
        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "current", CTLFLAG_RW,
            &ios->current, 0,
            "current resource");
}

static void
cam_iosched_iop_stats_fini(struct iop_stats *ios)
{
        if (ios->sysctl_tree)
                if (sysctl_ctx_free(&ios->sysctl_ctx) != 0)
                        printf("can't remove iosched sysctl stats context\n");
}

static void
cam_iosched_cl_sysctl_init(struct cam_iosched_softc *isc)
{
        struct sysctl_oid_list *n;
        struct sysctl_ctx_list *ctx;
        struct control_loop *clp;

        clp = &isc->cl;
        clp->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx,
            SYSCTL_CHILDREN(isc->sysctl_tree), OID_AUTO, "control",
            CTLFLAG_RD, 0, "Control loop info");
        n = SYSCTL_CHILDREN(clp->sysctl_tree);
        ctx = &clp->sysctl_ctx;

        SYSCTL_ADD_PROC(ctx, n,
            OID_AUTO, "type", CTLTYPE_STRING | CTLFLAG_RW,
            clp, 0, cam_iosched_control_type_sysctl, "A",
            "Control loop algorithm");
        SYSCTL_ADD_PROC(ctx, n,
            OID_AUTO, "steer_interval", CTLTYPE_STRING | CTLFLAG_RW,
            &clp->steer_interval, 0, cam_iosched_sbintime_sysctl, "A",
            "How often to steer (in us)");
        SYSCTL_ADD_PROC(ctx, n,
            OID_AUTO, "lolat", CTLTYPE_STRING | CTLFLAG_RW,
            &clp->lolat, 0, cam_iosched_sbintime_sysctl, "A",
            "Low water mark for Latency (in us)");
        SYSCTL_ADD_PROC(ctx, n,
            OID_AUTO, "hilat", CTLTYPE_STRING | CTLFLAG_RW,
            &clp->hilat, 0, cam_iosched_sbintime_sysctl, "A",
            "High water mark for Latency (in us)");
        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "alpha", CTLFLAG_RW,
            &clp->alpha, 0,
            "Alpha for PLL (x100) aka gain");
}

static void
cam_iosched_cl_sysctl_fini(struct control_loop *clp)
{
        if (clp->sysctl_tree)
                if (sysctl_ctx_free(&clp->sysctl_ctx) != 0)
                        printf("can't remove iosched sysctl control loop context\n");
}
#endif

/*
 * Allocate the iosched structure. This also insulates callers from knowing
 * sizeof struct cam_iosched_softc.
 */
int
cam_iosched_init(struct cam_iosched_softc **iscp, struct cam_periph *periph)
{

        *iscp = malloc(sizeof(**iscp), M_CAMSCHED, M_NOWAIT | M_ZERO);
        if (*iscp == NULL)
                return ENOMEM;
#ifdef CAM_NETFLIX_IOSCHED
        if (iosched_debug)
                printf("CAM IOSCHEDULER Allocating entry at %p\n", *iscp);
#endif
        (*iscp)->sort_io_queue = -1;
        bioq_init(&(*iscp)->bio_queue);
        bioq_init(&(*iscp)->trim_queue);
#ifdef CAM_NETFLIX_IOSCHED
        if (do_netflix_iosched) {
                bioq_init(&(*iscp)->write_queue);
                (*iscp)->read_bias = 100;
                (*iscp)->current_read_bias = 100;
                (*iscp)->quanta = 200;
                cam_iosched_iop_stats_init(*iscp, &(*iscp)->read_stats);
                cam_iosched_iop_stats_init(*iscp, &(*iscp)->write_stats);
                cam_iosched_iop_stats_init(*iscp, &(*iscp)->trim_stats);
                (*iscp)->trim_stats.max = 1;    /* Trims are special: one at a time for now */
                (*iscp)->last_time = sbinuptime();
                callout_init_mtx(&(*iscp)->ticker, cam_periph_mtx(periph), 0);
                (*iscp)->periph = periph;
                cam_iosched_cl_init(&(*iscp)->cl, *iscp);
                callout_reset(&(*iscp)->ticker, hz / (*iscp)->quanta - 1, cam_iosched_ticker, *iscp);
                (*iscp)->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
        }
#endif

        return 0;
}

/*
 * Reclaim all used resources. This assumes that other folks have
 * drained the requests in the hardware. Maybe an unwise assumption.
 */
void
cam_iosched_fini(struct cam_iosched_softc *isc)
{
        if (isc) {
                cam_iosched_flush(isc, NULL, ENXIO);
#ifdef CAM_NETFLIX_IOSCHED
                cam_iosched_iop_stats_fini(&isc->read_stats);
                cam_iosched_iop_stats_fini(&isc->write_stats);
                cam_iosched_iop_stats_fini(&isc->trim_stats);
                cam_iosched_cl_sysctl_fini(&isc->cl);
                if (isc->sysctl_tree)
                        if (sysctl_ctx_free(&isc->sysctl_ctx) != 0)
                                printf("can't remove iosched sysctl stats context\n");
                if (isc->flags & CAM_IOSCHED_FLAG_CALLOUT_ACTIVE) {
                        callout_drain(&isc->ticker);
                        isc->flags &= ~CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
                }
#endif
                free(isc, M_CAMSCHED);
        }
}
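/*
 * Sketch of the consumer-side life cycle, for orientation (a hypothetical
 * periph pseudo-driver; locking and error handling elided):
 *
 *      struct cam_iosched_softc *isc;
 *
 *      cam_iosched_init(&isc, periph);               // at attach
 *      cam_iosched_sysctl_init(isc, ctx, node);      // once attach is certain
 *      ...
 *      cam_iosched_queue_work(isc, bp);              // from the strategy routine
 *      cam_iosched_schedule(isc, periph);
 *      ...
 *      bp = cam_iosched_next_bio(isc);               // from the start routine
 *      ...
 *      cam_iosched_bio_complete(isc, bp, done_ccb);  // on I/O completion
 *      ...
 *      cam_iosched_fini(isc);                        // at detach
 */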
/*
 * After we're sure we're attaching a device, go ahead and add
 * hooks for any sysctl we may wish to honor.
 */
void cam_iosched_sysctl_init(struct cam_iosched_softc *isc,
    struct sysctl_ctx_list *ctx, struct sysctl_oid *node)
{
#ifdef CAM_NETFLIX_IOSCHED
        struct sysctl_oid_list *n;
#endif

        SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(node),
            OID_AUTO, "sort_io_queue", CTLFLAG_RW | CTLFLAG_MPSAFE,
            &isc->sort_io_queue, 0,
            "Sort IO queue to try and optimise disk access patterns");

#ifdef CAM_NETFLIX_IOSCHED
        if (!do_netflix_iosched)
                return;

        isc->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx,
            SYSCTL_CHILDREN(node), OID_AUTO, "iosched",
            CTLFLAG_RD, 0, "I/O scheduler statistics");
        n = SYSCTL_CHILDREN(isc->sysctl_tree);
        ctx = &isc->sysctl_ctx;

        cam_iosched_iop_stats_sysctl_init(isc, &isc->read_stats, "read");
        cam_iosched_iop_stats_sysctl_init(isc, &isc->write_stats, "write");
        cam_iosched_iop_stats_sysctl_init(isc, &isc->trim_stats, "trim");
        cam_iosched_cl_sysctl_init(isc);

        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "read_bias", CTLFLAG_RW,
            &isc->read_bias, 100,
            "How biased towards read should we be independent of limits");

        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "quanta", CTLFLAG_RW,
            &isc->quanta, 200,
            "How many quanta per second do we slice the I/O up into");

        SYSCTL_ADD_INT(ctx, n,
            OID_AUTO, "total_ticks", CTLFLAG_RD,
            &isc->total_ticks, 0,
            "Total number of ticks we've done");
#endif
}

/*
 * Flush outstanding I/O. Consumers of this library don't know all the
 * queues we may keep, so this allows all I/O to be flushed in one
 * convenient call.
 */
void
cam_iosched_flush(struct cam_iosched_softc *isc, struct devstat *stp, int err)
{
        bioq_flush(&isc->bio_queue, stp, err);
        bioq_flush(&isc->trim_queue, stp, err);
#ifdef CAM_NETFLIX_IOSCHED
        if (do_netflix_iosched)
                bioq_flush(&isc->write_queue, stp, err);
#endif
}
#ifdef CAM_NETFLIX_IOSCHED
static struct bio *
cam_iosched_get_write(struct cam_iosched_softc *isc)
{
        struct bio *bp;

        /*
         * We control the write rate by controlling how many requests we send
         * down to the drive at any one time. Fewer requests limits the
         * effects of both starvation when the requests take a while and write
         * amplification when each request is causing more than one write to
         * the NAND media. Limiting the queue depth like this will also limit
         * the write throughput and give reads that want to compete a chance
         * to compete.
         */
        bp = bioq_first(&isc->write_queue);
        if (bp == NULL) {
                if (iosched_debug > 3)
                        printf("No writes present in write_queue\n");
                return NULL;
        }

        /*
         * If pending read, prefer that based on current read bias
         * setting.
         */
        if (bioq_first(&isc->bio_queue) && isc->current_read_bias) {
                if (iosched_debug)
                        printf("Reads present and current_read_bias is %d queued writes %d queued reads %d\n", isc->current_read_bias, isc->write_stats.queued, isc->read_stats.queued);
                isc->current_read_bias--;
                return NULL;
        }

        /*
         * See if our current limiter allows this I/O.
         */
        if (cam_iosched_limiter_iop(&isc->write_stats, bp) != 0) {
                if (iosched_debug)
                        printf("Can't write because limiter says no.\n");
                return NULL;
        }

        /*
         * Let's do this: We've passed all the gates and we're a go
         * to schedule the I/O in the SIM.
         */
        isc->current_read_bias = isc->read_bias;
        bioq_remove(&isc->write_queue, bp);
        if (bp->bio_cmd == BIO_WRITE) {
                isc->write_stats.queued--;
                isc->write_stats.total++;
                isc->write_stats.pending++;
        }
        if (iosched_debug > 9)
                printf("HWQ : %p %#x\n", bp, bp->bio_cmd);
        return bp;
}
#endif
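/*
 * Example of the bias mechanism above (the numbers are just the current
 * defaults): with read_bias = 100, a pending read causes up to 100
 * consecutive calls here to yield to reads before one write is released.
 * Dispatching that write resets current_read_bias to read_bias, so the
 * steady-state mix when both queues stay non-empty is about 100 reads per
 * write.
 */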
/*
 * Put back a trim that you weren't able to actually schedule this time.
 */
void
cam_iosched_put_back_trim(struct cam_iosched_softc *isc, struct bio *bp)
{
        bioq_insert_head(&isc->trim_queue, bp);
#ifdef CAM_NETFLIX_IOSCHED
        isc->trim_stats.queued++;
        isc->trim_stats.total--;        /* since we put it back, don't double count */
        isc->trim_stats.pending--;
#endif
}

/*
 * Gets the next trim from the trim queue.
 *
 * Assumes we're called with the periph lock held. It removes this
 * trim from the queue and the device must explicitly reinsert it
 * should the need arise.
 */
struct bio *
cam_iosched_next_trim(struct cam_iosched_softc *isc)
{
        struct bio *bp;

        bp = bioq_first(&isc->trim_queue);
        if (bp == NULL)
                return NULL;
        bioq_remove(&isc->trim_queue, bp);
#ifdef CAM_NETFLIX_IOSCHED
        isc->trim_stats.queued--;
        isc->trim_stats.total++;
        isc->trim_stats.pending++;
#endif
        return bp;
}

/*
 * Gets an available trim from the trim queue, if there's no trim
 * already pending. It removes this trim from the queue and the device
 * must explicitly reinsert it should the need arise.
 *
 * Assumes we're called with the periph lock held.
 */
struct bio *
cam_iosched_get_trim(struct cam_iosched_softc *isc)
{

        if (!cam_iosched_has_more_trim(isc))
                return NULL;

        return cam_iosched_next_trim(isc);
}

/*
 * Determine what the next bit of work to do is for the periph. The
 * default implementation looks to see if we have trims to do, but no
 * trims outstanding. If so, we do that. Otherwise we see if we have
 * other work. If we do, then we do that. Otherwise why were we called?
 */
struct bio *
cam_iosched_next_bio(struct cam_iosched_softc *isc)
{
        struct bio *bp;

        /*
         * See if we have a trim that can be scheduled. We can only send one
         * at a time down, so this takes that into account.
         *
         * XXX newer TRIM commands are queueable. Revisit this when we
         * implement them.
         */
        if ((bp = cam_iosched_get_trim(isc)) != NULL)
                return bp;

#ifdef CAM_NETFLIX_IOSCHED
        /*
         * See if we have any pending writes, and room in the queue for them,
         * and if so, those are next.
         */
        if (do_netflix_iosched) {
                if ((bp = cam_iosched_get_write(isc)) != NULL)
                        return bp;
        }
#endif

        /*
         * Next, see if there's other, normal I/O waiting. If so return that.
         */
        if ((bp = bioq_first(&isc->bio_queue)) == NULL)
                return NULL;

#ifdef CAM_NETFLIX_IOSCHED
        /*
         * For the netflix scheduler, bio_queue is only for reads, so enforce
         * the limits here. Enforce only for reads.
         */
        if (do_netflix_iosched) {
                if (bp->bio_cmd == BIO_READ &&
                    cam_iosched_limiter_iop(&isc->read_stats, bp) != 0)
                        return NULL;
        }
#endif
        bioq_remove(&isc->bio_queue, bp);
#ifdef CAM_NETFLIX_IOSCHED
        if (do_netflix_iosched) {
                if (bp->bio_cmd == BIO_READ) {
                        isc->read_stats.queued--;
                        isc->read_stats.total++;
                        isc->read_stats.pending++;
                } else
                        printf("Found bio_cmd = %#x\n", bp->bio_cmd);
        }
        if (iosched_debug > 9)
                printf("HWQ : %p %#x\n", bp, bp->bio_cmd);
#endif
        return bp;
}

/*
 * Driver has been given some work to do by the block layer. Tell the
 * scheduler about it and have it queue the work up. The scheduler module
 * will then return the currently most useful bit of work later, possibly
 * deferring work for various reasons.
 */
void
cam_iosched_queue_work(struct cam_iosched_softc *isc, struct bio *bp)
{

        /*
         * Put all trims on the trim queue sorted, since we know
         * that the collapsing code requires this. Otherwise put
         * the work on the bio queue.
         */
        if (bp->bio_cmd == BIO_DELETE) {
                bioq_disksort(&isc->trim_queue, bp);
#ifdef CAM_NETFLIX_IOSCHED
                isc->trim_stats.in++;
                isc->trim_stats.queued++;
#endif
        }
#ifdef CAM_NETFLIX_IOSCHED
        else if (do_netflix_iosched &&
            (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) {
                if (cam_iosched_sort_queue(isc))
                        bioq_disksort(&isc->write_queue, bp);
                else
                        bioq_insert_tail(&isc->write_queue, bp);
                if (iosched_debug > 9)
                        printf("Qw : %p %#x\n", bp, bp->bio_cmd);
                if (bp->bio_cmd == BIO_WRITE) {
                        isc->write_stats.in++;
                        isc->write_stats.queued++;
                }
        }
#endif
        else {
                if (cam_iosched_sort_queue(isc))
                        bioq_disksort(&isc->bio_queue, bp);
                else
                        bioq_insert_tail(&isc->bio_queue, bp);
#ifdef CAM_NETFLIX_IOSCHED
                if (iosched_debug > 9)
                        printf("Qr : %p %#x\n", bp, bp->bio_cmd);
                if (bp->bio_cmd == BIO_READ) {
                        isc->read_stats.in++;
                        isc->read_stats.queued++;
                } else if (bp->bio_cmd == BIO_WRITE) {
                        isc->write_stats.in++;
                        isc->write_stats.queued++;
                }
#endif
        }
}

/*
 * If we have work, get it scheduled. Called with the periph lock held.
 */
void
cam_iosched_schedule(struct cam_iosched_softc *isc, struct cam_periph *periph)
{

        if (cam_iosched_has_work(isc))
                xpt_schedule(periph, CAM_PRIORITY_NORMAL);
}

/*
 * Complete a trim request
 */
void
cam_iosched_trim_done(struct cam_iosched_softc *isc)
{

        isc->flags &= ~CAM_IOSCHED_FLAG_TRIM_ACTIVE;
}
/*
 * Complete a bio. Called before we release the ccb with xpt_release_ccb so we
 * might use notes in the ccb for statistics.
 */
int
cam_iosched_bio_complete(struct cam_iosched_softc *isc, struct bio *bp,
    union ccb *done_ccb)
{
        int retval = 0;
#ifdef CAM_NETFLIX_IOSCHED
        if (!do_netflix_iosched)
                return retval;

        if (iosched_debug > 10)
                printf("done: %p %#x\n", bp, bp->bio_cmd);
        if (bp->bio_cmd == BIO_WRITE) {
                retval = cam_iosched_limiter_iodone(&isc->write_stats, bp);
                isc->write_stats.out++;
                isc->write_stats.pending--;
        } else if (bp->bio_cmd == BIO_READ) {
                retval = cam_iosched_limiter_iodone(&isc->read_stats, bp);
                isc->read_stats.out++;
                isc->read_stats.pending--;
        } else if (bp->bio_cmd == BIO_DELETE) {
                isc->trim_stats.out++;
                isc->trim_stats.pending--;
        } else if (bp->bio_cmd != BIO_FLUSH) {
                if (iosched_debug)
                        printf("Completing command with bio_cmd == %#x\n", bp->bio_cmd);
        }

        if (!(bp->bio_flags & BIO_ERROR))
                cam_iosched_io_metric_update(isc, done_ccb->ccb_h.qos.sim_data,
                    bp->bio_cmd, bp->bio_bcount);
#endif
        return retval;
}

/*
 * Tell the io scheduler that you've pushed a trim down into the sim.
 * xxx better place for this?
 */
void
cam_iosched_submit_trim(struct cam_iosched_softc *isc)
{

        isc->flags |= CAM_IOSCHED_FLAG_TRIM_ACTIVE;
}

/*
 * Change the sorting policy hint for I/O transactions for this device.
 */
void
cam_iosched_set_sort_queue(struct cam_iosched_softc *isc, int val)
{

        isc->sort_io_queue = val;
}

int
cam_iosched_has_work_flags(struct cam_iosched_softc *isc, uint32_t flags)
{
        return isc->flags & flags;
}

void
cam_iosched_set_work_flags(struct cam_iosched_softc *isc, uint32_t flags)
{
        isc->flags |= flags;
}

void
cam_iosched_clr_work_flags(struct cam_iosched_softc *isc, uint32_t flags)
{
        isc->flags &= ~flags;
}

#ifdef CAM_NETFLIX_IOSCHED
/*
 * After the method presented in Jack Crenshaw's 1998 article "Integer
 * Square Roots," reprinted at
 * http://www.embedded.com/electronics-blogs/programmer-s-toolbox/4219659/Integer-Square-Roots
 * and well worth the read. Briefly, we find the power of 4 that's the
 * largest smaller than val. We then check each smaller power of 4 to
 * see if val is still bigger. The right shifts at each step divide
 * the result by 2 which after successive application winds up
 * accumulating the right answer. It could also have been accumulated
 * using a separate root counter, but this code is smaller and faster
 * than that method. This method is also integer size invariant.
 * It returns floor(sqrt((float)val)), or the largest integer less than
 * or equal to the square root.
 */
static uint64_t
isqrt64(uint64_t val)
{
        uint64_t res = 0;
        uint64_t bit = 1ULL << (sizeof(uint64_t) * NBBY - 2);

        /*
         * Find the largest power of 4 smaller than val.
         */
        while (bit > val)
                bit >>= 2;

        /*
         * Accumulate the answer, one bit at a time (we keep moving
         * them over since 2 is the square root of 4 and we test
         * powers of 4). We accumulate where we find the bit, but
         * the successive shifts land the bit in the right place
         * by the end.
         */
        while (bit != 0) {
                if (val >= res + bit) {
                        val -= res + bit;
                        res = (res >> 1) + bit;
                } else
                        res >>= 1;
                bit >>= 2;
        }

        return res;
}
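/*
 * Worked trace of the loop above (illustration only): isqrt64(27) shifts
 * bit down to 16, then:
 *      bit 16: 27 >= 0 + 16, so val = 11, res = 16
 *      bit  4: 11 <  16 + 4,  so          res = 8
 *      bit  1: 11 >=  8 + 1, so val = 2,  res = 5
 * and returns 5 = floor(sqrt(27)).
 */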
/*
 * a and b are 32.32 fixed point stored in a 64-bit word.
 * Let al and bl be the .32 part of a and b.
 * Let ah and bh be the 32 part of a and b.
 * R is the radix and is 1 << 32
 *
 * a * b
 * (ah + al / R) * (bh + bl / R)
 * ah * bh + (al * bh + ah * bl) / R + al * bl / R^2
 *
 * After multiplication, we have to renormalize by multiplying by R, so we
 * wind up with
 *      ah * bh * R + al * bh + ah * bl + al * bl / R
 * which turns out to be a very nice way to compute this value. So long as
 * ah and bh are < 65536 there's no loss of high bits, and the low order
 * bits are below the threshold of caring for this application.
 */
static uint64_t
mul(uint64_t a, uint64_t b)
{
        uint64_t al, ah, bl, bh;
        al = a & 0xffffffff;
        ah = a >> 32;
        bl = b & 0xffffffff;
        bh = b >> 32;
        return ((ah * bh) << 32) + al * bh + ah * bl + ((al * bl) >> 32);
}
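/*
 * Quick check of the identity above (illustration only): for a = b = 1.5,
 * i.e. 0x180000000 in 32.32 format (ah = 1, al = 0x80000000), mul()
 * computes (1 * 1) << 32 + 0x80000000 + 0x80000000 +
 * ((0x80000000 * 0x80000000) >> 32) = 0x240000000, which is 2.25 as
 * expected. The same routine squares EMA latencies for the emss estimator
 * below.
 */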
static void
cam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency)
{
        sbintime_t y, yy;
        uint64_t var;

        /*
         * Classic exponentially decaying average with a tiny alpha
         * (2 ^ -alpha_bits). For more info see the NIST statistical
         * handbook.
         *
         * ema_t = y_t * alpha + ema_t-1 * (1 - alpha)
         * alpha = 1 / (1 << alpha_bits)
         *
         * Since alpha is a power of two, we can compute this w/o any mult or
         * division.
         */
        y = sim_latency;
        iop->ema = (y + (iop->ema << alpha_bits) - iop->ema) >> alpha_bits;

        yy = mul(y, y);
        iop->emss = (yy + (iop->emss << alpha_bits) - iop->emss) >> alpha_bits;

        /*
         * s_1 = sum of data
         * s_2 = sum of data * data
         * ema ~ mean (or s_1 / N)
         * emss ~ s_2 / N
         *
         * sd = sqrt((N * s_2 - s_1 ^ 2) / (N * (N - 1)))
         *
         * N ~ 2 / alpha - 1
         * alpha < 1 / 16 (typically much less)
         * N > 31 --> N large so N * (N - 1) is approx N * N
         *
         * substituting and rearranging:
         * sd ~ sqrt(s_2 / N - (s_1 / N) ^ 2)
         *    ~ sqrt(emss - ema ^ 2)
         * which is the formula used here to get a decent estimate of sd which
         * we use to detect outliers. Note that when first starting up, it
         * takes a while for the emss sum of squares estimator to converge on
         * a good value. During this time, it can be less than ema^2. We
         * compute a sd of 0 in that case, and ignore outliers.
         */
        var = iop->emss - mul(iop->ema, iop->ema);
        iop->sd = (int64_t)var < 0 ? 0 : isqrt64(var);
}

#ifdef CAM_NETFLIX_IOSCHED
static void
cam_iosched_io_metric_update(struct cam_iosched_softc *isc,
    sbintime_t sim_latency, int cmd, size_t size)
{
        /* xxx Do we need to scale based on the size of the I/O ? */
        switch (cmd) {
        case BIO_READ:
                cam_iosched_update(&isc->read_stats, sim_latency);
                break;
        case BIO_WRITE:
                cam_iosched_update(&isc->write_stats, sim_latency);
                break;
        case BIO_DELETE:
                cam_iosched_update(&isc->trim_stats, sim_latency);
                break;
        default:
                break;
        }
}
#endif

#ifdef DDB
static int biolen(struct bio_queue_head *bq)
{
        int i = 0;
        struct bio *bp;

        TAILQ_FOREACH(bp, &bq->queue, bio_queue) {
                i++;
        }
        return i;
}

/*
 * Show the internal state of the I/O scheduler.
 */
DB_SHOW_COMMAND(iosched, cam_iosched_db_show)
{
        struct cam_iosched_softc *isc;

        if (!have_addr) {
                db_printf("Need addr\n");
                return;
        }
        isc = (struct cam_iosched_softc *)addr;
        db_printf("pending_reads: %d\n", isc->read_stats.pending);
        db_printf("min_reads: %d\n", isc->read_stats.min);
        db_printf("max_reads: %d\n", isc->read_stats.max);
        db_printf("reads: %d\n", isc->read_stats.total);
        db_printf("in_reads: %d\n", isc->read_stats.in);
        db_printf("out_reads: %d\n", isc->read_stats.out);
        db_printf("queued_reads: %d\n", isc->read_stats.queued);
        db_printf("Current Q len %d\n", biolen(&isc->bio_queue));
        db_printf("pending_writes: %d\n", isc->write_stats.pending);
        db_printf("min_writes: %d\n", isc->write_stats.min);
        db_printf("max_writes: %d\n", isc->write_stats.max);
        db_printf("writes: %d\n", isc->write_stats.total);
        db_printf("in_writes: %d\n", isc->write_stats.in);
        db_printf("out_writes: %d\n", isc->write_stats.out);
        db_printf("queued_writes: %d\n", isc->write_stats.queued);
        db_printf("Current Q len %d\n", biolen(&isc->write_queue));
        db_printf("pending_trims: %d\n", isc->trim_stats.pending);
        db_printf("min_trims: %d\n", isc->trim_stats.min);
        db_printf("max_trims: %d\n", isc->trim_stats.max);
        db_printf("trims: %d\n", isc->trim_stats.total);
        db_printf("in_trims: %d\n", isc->trim_stats.in);
        db_printf("out_trims: %d\n", isc->trim_stats.out);
        db_printf("queued_trims: %d\n", isc->trim_stats.queued);
        db_printf("Current Q len %d\n", biolen(&isc->trim_queue));
        db_printf("read_bias: %d\n", isc->read_bias);
        db_printf("current_read_bias: %d\n", isc->current_read_bias);
        db_printf("Trim active? %s\n",
            (isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) ? "yes" : "no");
}
#endif
#endif