/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The names of the authors may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/stack.h>
#include <sys/sysctl.h>
#include <sys/vmem.h>
#include <machine/stack.h>
#include <machine/stdarg.h>

#include <sys/errno.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <sys/devicestat.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>

static int g_io_transient_map_bio(struct bio *bp);

static struct g_bioq g_bio_run_down;
static struct g_bioq g_bio_run_up;

/*
 * Pace is a hint that we've had some trouble recently allocating
 * bios, so we should back off trying to send I/O down the stack
 * a bit to let the problem resolve. When pacing, we also turn
 * off direct dispatch to also reduce memory pressure from I/Os
 * there, at the expense of some added latency while the memory
 * pressures exist. See g_io_schedule_down() for more details
 * and limitations.
 */
static volatile u_int __read_mostly pace;

static uma_zone_t __read_mostly biozone;

#include <machine/atomic.h>

static void
g_bioq_lock(struct g_bioq *bq)
{

	mtx_lock(&bq->bio_queue_lock);
}

static void
g_bioq_unlock(struct g_bioq *bq)
{

	mtx_unlock(&bq->bio_queue_lock);
}

#if 0
static void
g_bioq_destroy(struct g_bioq *bq)
{

	mtx_destroy(&bq->bio_queue_lock);
}
#endif

static void
g_bioq_init(struct g_bioq *bq)
{

	TAILQ_INIT(&bq->bio_queue);
	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
}

static struct bio *
g_bioq_first(struct g_bioq *bq)
{
	struct bio *bp;

	bp = TAILQ_FIRST(&bq->bio_queue);
	if (bp != NULL) {
		KASSERT((bp->bio_flags & BIO_ONQUEUE),
		    ("Bio not on queue bp=%p target %p", bp, bq));
		bp->bio_flags &= ~BIO_ONQUEUE;
		TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
		bq->bio_queue_length--;
	}
	return (bp);
}
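
/*
 * Note on the two allocators below: g_new_bio() allocates with M_NOWAIT and
 * may return NULL, while g_alloc_bio() allocates with M_WAITOK and always
 * succeeds, so the latter may only be used from a context that is allowed to
 * sleep.  A caller in a non-sleepable context might, for example, do
 * (illustrative sketch only):
 *
 *	bp = g_new_bio();
 *	if (bp == NULL)
 *		return (ENOMEM);
 */
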
struct bio *
g_new_bio(void)
{
	struct bio *bp;

	bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
#ifdef KTR
	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
		struct stack st;

		CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
		stack_save(&st);
		CTRSTACK(KTR_GEOM, &st, 3);
	}
#endif
	return (bp);
}

struct bio *
g_alloc_bio(void)
{
	struct bio *bp;

	bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
#ifdef KTR
	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
		struct stack st;

		CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
		stack_save(&st);
		CTRSTACK(KTR_GEOM, &st, 3);
	}
#endif
	return (bp);
}

void
g_destroy_bio(struct bio *bp)
{
#ifdef KTR
	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
		struct stack st;

		CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
		stack_save(&st);
		CTRSTACK(KTR_GEOM, &st, 3);
	}
#endif
	uma_zfree(biozone, bp);
}
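
/*
 * Illustrative sketch: a transforming GEOM class typically clones an
 * incoming bio in its start() method and forwards the clone to its own
 * consumer, letting the stock completion handler propagate the result back
 * to the parent.  The names below are only an example of the pattern:
 *
 *	cbp = g_clone_bio(bp);
 *	if (cbp == NULL) {
 *		g_io_deliver(bp, ENOMEM);
 *		return;
 *	}
 *	cbp->bio_done = g_std_done;
 *	g_io_request(cbp, cp);
 */
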
struct bio *
g_clone_bio(struct bio *bp)
{
	struct bio *bp2;

	bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
	if (bp2 != NULL) {
		bp2->bio_parent = bp;
		bp2->bio_cmd = bp->bio_cmd;
		/*
		 * BIO_ORDERED flag may be used by disk drivers to enforce
		 * ordering restrictions, so this flag needs to be cloned.
		 * BIO_UNMAPPED, BIO_VLIST, and BIO_SWAP should be inherited,
		 * to properly indicate which way the buffer is passed.
		 * Other bio flags are not suitable for cloning.
		 */
		bp2->bio_flags = bp->bio_flags &
		    (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST | BIO_SWAP);
		bp2->bio_length = bp->bio_length;
		bp2->bio_offset = bp->bio_offset;
		bp2->bio_data = bp->bio_data;
		bp2->bio_ma = bp->bio_ma;
		bp2->bio_ma_n = bp->bio_ma_n;
		bp2->bio_ma_offset = bp->bio_ma_offset;
		bp2->bio_attribute = bp->bio_attribute;
		if (bp->bio_cmd == BIO_ZONE)
			bcopy(&bp->bio_zone, &bp2->bio_zone,
			    sizeof(bp->bio_zone));
#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
		bp2->bio_track_bp = bp->bio_track_bp;
#endif
		bp->bio_children++;
	}
#ifdef KTR
	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
		struct stack st;

		CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
		stack_save(&st);
		CTRSTACK(KTR_GEOM, &st, 3);
	}
#endif
	return (bp2);
}

struct bio *
g_duplicate_bio(struct bio *bp)
{
	struct bio *bp2;

	bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
	bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST | BIO_SWAP);
	bp2->bio_parent = bp;
	bp2->bio_cmd = bp->bio_cmd;
	bp2->bio_length = bp->bio_length;
	bp2->bio_offset = bp->bio_offset;
	bp2->bio_data = bp->bio_data;
	bp2->bio_ma = bp->bio_ma;
	bp2->bio_ma_n = bp->bio_ma_n;
	bp2->bio_ma_offset = bp->bio_ma_offset;
	bp2->bio_attribute = bp->bio_attribute;
	bp->bio_children++;
#ifdef KTR
	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
		struct stack st;

		CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
		stack_save(&st);
		CTRSTACK(KTR_GEOM, &st, 3);
	}
#endif
	return (bp2);
}

void
g_reset_bio(struct bio *bp)
{

	bzero(bp, sizeof(*bp));
}

void
g_io_init(void)
{

	g_bioq_init(&g_bio_run_down);
	g_bioq_init(&g_bio_run_up);
	biozone = uma_zcreate("g_bio", sizeof (struct bio),
	    NULL, NULL,
	    NULL, NULL,
	    0, 0);
}

int
g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
{
	struct bio *bp;
	int error;

	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
	bp = g_alloc_bio();
	bp->bio_cmd = BIO_GETATTR;
	bp->bio_done = NULL;
	bp->bio_attribute = attr;
	bp->bio_length = *len;
	bp->bio_data = ptr;
	g_io_request(bp, cp);
	error = biowait(bp, "ggetattr");
	*len = bp->bio_completed;
	g_destroy_bio(bp);
	return (error);
}
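
/*
 * Example (sketch): synchronously query a provider attribute through a
 * consumer.  "GEOM::candelete" is a commonly used attribute reporting
 * whether the provider supports BIO_DELETE:
 *
 *	int val, len = sizeof(val);
 *	error = g_io_getattr("GEOM::candelete", cp, &len, &val);
 */
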
int
g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp)
{
	struct bio *bp;
	int error;

	g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd);
	bp = g_alloc_bio();
	bp->bio_cmd = BIO_ZONE;
	bp->bio_done = NULL;
	/*
	 * XXX KDM need to handle report zone data.
	 */
	bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args));
	if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES)
		bp->bio_length =
		    zone_args->zone_params.report.entries_allocated *
		    sizeof(struct disk_zone_rep_entry);
	else
		bp->bio_length = 0;

	g_io_request(bp, cp);
	error = biowait(bp, "gzone");
	bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args));
	g_destroy_bio(bp);
	return (error);
}

/*
 * Send a BIO_SPEEDUP down the stack. This is used to tell the lower layers that
 * the upper layers have detected a resource shortage. The lower layers are
 * advised to stop delaying I/O that they might be holding for performance
 * reasons and to schedule it (non-trims) or complete it successfully (trims) as
 * quickly as they can. bio_length is the amount of the shortage. This call
 * should be non-blocking. bio_resid is used to communicate back if the lower
 * layers couldn't find bio_length worth of I/O to schedule or discard. A length
 * of 0 means to do as much as you can (schedule the h/w queues full, discard
 * all trims). flags are a hint from the upper layers to the lower layers as to
 * what operation should be done.
 */
int
g_io_speedup(off_t shortage, u_int flags, size_t *resid, struct g_consumer *cp)
{
	struct bio *bp;
	int error;

	KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0,
	    ("Invalid flags passed to g_io_speedup: %#x", flags));
	g_trace(G_T_BIO, "bio_speedup(%s, %jd, %#x)", cp->provider->name,
	    (intmax_t)shortage, flags);
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_SPEEDUP;
	bp->bio_length = shortage;
	bp->bio_done = NULL;
	bp->bio_flags |= flags;
	g_io_request(bp, cp);
	error = biowait(bp, "gflush");
	*resid = bp->bio_resid;
	g_destroy_bio(bp);
	return (error);
}
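
/*
 * Example (sketch): an upper layer that is short on clean buffers might ask
 * the lower layers to hurry roughly 1 MB worth of writes along; the amount
 * that could not be found is reported back in resid.  The numbers here are
 * purely illustrative:
 *
 *	size_t resid;
 *	error = g_io_speedup(1024 * 1024, BIO_SPEEDUP_WRITE, &resid, cp);
 */
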
int
g_io_flush(struct g_consumer *cp)
{
	struct bio *bp;
	int error;

	g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
	bp = g_alloc_bio();
	bp->bio_cmd = BIO_FLUSH;
	bp->bio_flags |= BIO_ORDERED;
	bp->bio_done = NULL;
	bp->bio_attribute = NULL;
	bp->bio_offset = cp->provider->mediasize;
	bp->bio_length = 0;
	bp->bio_data = NULL;
	g_io_request(bp, cp);
	error = biowait(bp, "gflush");
	g_destroy_bio(bp);
	return (error);
}

static int
g_io_check(struct bio *bp)
{
	struct g_consumer *cp;
	struct g_provider *pp;
	off_t excess;
	int error;

	biotrack(bp, __func__);

	cp = bp->bio_from;
	pp = bp->bio_to;

	/* Fail if access counters don't allow the operation */
	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_GETATTR:
		if (cp->acr == 0)
			return (EPERM);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	case BIO_SPEEDUP:
	case BIO_FLUSH:
		if (cp->acw == 0)
			return (EPERM);
		break;
	case BIO_ZONE:
		if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) ||
		    (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) {
			if (cp->acr == 0)
				return (EPERM);
		} else if (cp->acw == 0)
			return (EPERM);
		break;
	default:
		return (EPERM);
	}
	/* if provider is marked for error, don't disturb. */
	if (pp->error)
		return (pp->error);
	if (cp->flags & G_CF_ORPHAN)
		return (ENXIO);

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		/* Zero sectorsize or mediasize is probably a lack of media. */
		if (pp->sectorsize == 0 || pp->mediasize == 0)
			return (ENXIO);
		/* Reject I/O not on sector boundary */
		if (bp->bio_offset % pp->sectorsize)
			return (EINVAL);
		/* Reject I/O not integral sector long */
		if (bp->bio_length % pp->sectorsize)
			return (EINVAL);
		/* Reject requests before or past the end of media. */
		if (bp->bio_offset < 0)
			return (EIO);
		if (bp->bio_offset > pp->mediasize)
			return (EIO);

		/* Truncate requests to the end of the provider's media. */
		excess = bp->bio_offset + bp->bio_length;
		if (excess > bp->bio_to->mediasize) {
			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
			    round_page(bp->bio_ma_offset +
			    bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
			    ("excess bio %p too short", bp));
			excess -= bp->bio_to->mediasize;
			bp->bio_length -= excess;
			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
				bp->bio_ma_n = round_page(bp->bio_ma_offset +
				    bp->bio_length) / PAGE_SIZE;
			}
			if (excess > 0)
				CTR3(KTR_GEOM, "g_down truncated bio "
				    "%p provider %s by %d", bp,
				    bp->bio_to->name, excess);
		}

		/* Deliver zero length transfers right here. */
		if (bp->bio_length == 0) {
			CTR2(KTR_GEOM, "g_down terminated 0-length "
			    "bp %p provider %s", bp, bp->bio_to->name);
			return (0);
		}

		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
		    (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
		    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
			if ((error = g_io_transient_map_bio(bp)) >= 0)
				return (error);
		}
		break;
	default:
		break;
	}
	return (EJUSTRETURN);
}
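
/*
 * Worked example of the truncation above (illustrative numbers): with
 * mediasize = 1000 * 512, bio_offset = 999 * 512 and bio_length = 2 * 512,
 * "excess" first holds the end offset 1001 * 512, the 512-byte overrun is
 * then subtracted, and the request is trimmed to a single 512-byte sector.
 */
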
void
g_io_request(struct bio *bp, struct g_consumer *cp)
{
	struct g_provider *pp;
	int direct, error, first;
	uint8_t cmd;

	biotrack(bp, __func__);

	KASSERT(cp != NULL, ("NULL cp in g_io_request"));
	KASSERT(bp != NULL, ("NULL bp in g_io_request"));
	pp = cp->provider;
	KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
#ifdef DIAGNOSTIC
	KASSERT(bp->bio_driver1 == NULL,
	    ("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
	KASSERT(bp->bio_driver2 == NULL,
	    ("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
	KASSERT(bp->bio_pflags == 0,
	    ("bio_pflags used by the consumer (geom %s)", cp->geom->name));
	/*
	 * Remember consumer's private fields, so we can detect if they were
	 * modified by the provider.
	 */
	bp->_bio_caller1 = bp->bio_caller1;
	bp->_bio_caller2 = bp->bio_caller2;
	bp->_bio_cflags = bp->bio_cflags;
#endif

	cmd = bp->bio_cmd;
	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) {
		KASSERT(bp->bio_data != NULL,
		    ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd));
	}
	if (cmd == BIO_DELETE || cmd == BIO_FLUSH || cmd == BIO_SPEEDUP) {
		KASSERT(bp->bio_data == NULL,
		    ("non-NULL bp->data in g_io_request(cmd=%hu)",
		    bp->bio_cmd));
	}
	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) {
		KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
		    ("wrong offset %jd for sectorsize %u",
		    bp->bio_offset, cp->provider->sectorsize));
		KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
		    ("wrong length %jd for sectorsize %u",
		    bp->bio_length, cp->provider->sectorsize));
	}

	g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);

	bp->bio_from = cp;
	bp->bio_to = pp;
	bp->bio_error = 0;
	bp->bio_completed = 0;

	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
	    ("Bio already on queue bp=%p", bp));

	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
		binuptime(&bp->bio_t0);
	else
		getbinuptime(&bp->bio_t0);
	if (g_collectstats & G_STATS_CONSUMERS)
		devstat_start_transaction_bio_t0(cp->stat, bp);
	if (g_collectstats & G_STATS_PROVIDERS)
		devstat_start_transaction_bio_t0(pp->stat, bp);
#ifdef INVARIANTS
	atomic_add_int(&cp->nstart, 1);
#endif

	direct = (cp->flags & G_CF_DIRECT_SEND) != 0 &&
	    (pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
	    curthread != g_down_td &&
	    ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
	    (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
	    pace == 0;
	if (direct) {
		/* Block direct execution if less than half of the stack is left. */
		size_t st, su;
		GET_STACK_USAGE(st, su);
		if (su * 2 > st)
			direct = 0;
	}

	if (direct) {
		error = g_io_check(bp);
		if (error >= 0) {
			CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
			    "provider %s returned %d", bp, bp->bio_to->name,
			    error);
			g_io_deliver(bp, error);
			return;
		}
		bp->bio_to->geom->start(bp);
	} else {
		g_bioq_lock(&g_bio_run_down);
		first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
		TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
		bp->bio_flags |= BIO_ONQUEUE;
		g_bio_run_down.bio_queue_length++;
		g_bioq_unlock(&g_bio_run_down);
		/* Pass it on down. */
		if (first)
			wakeup(&g_wait_down);
	}
}
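
/*
 * Illustrative sketch of the provider side: once the hardware (or a lower
 * class) has finished a request that was accepted in start(), the geom
 * reports completion exactly once via g_io_deliver(), after recording how
 * much of the transfer actually completed, e.g.:
 *
 *	bp->bio_completed = bp->bio_length;
 *	g_io_deliver(bp, 0);
 */
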
void
g_io_deliver(struct bio *bp, int error)
{
	struct bintime now;
	struct g_consumer *cp;
	struct g_provider *pp;
	struct mtx *mtxp;
	int direct, first;

	biotrack(bp, __func__);

	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
	pp = bp->bio_to;
	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
	cp = bp->bio_from;
	if (cp == NULL) {
		bp->bio_error = error;
		bp->bio_done(bp);
		return;
	}
	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
#ifdef DIAGNOSTIC
	/*
	 * Some classes - GJournal in particular - can modify bio's
	 * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO
	 * flag means it's an expected behaviour for that particular geom.
	 */
	if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
		KASSERT(bp->bio_caller1 == bp->_bio_caller1,
		    ("bio_caller1 used by the provider %s", pp->name));
		KASSERT(bp->bio_caller2 == bp->_bio_caller2,
		    ("bio_caller2 used by the provider %s", pp->name));
		KASSERT(bp->bio_cflags == bp->_bio_cflags,
		    ("bio_cflags used by the provider %s", pp->name));
	}
#endif
	KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
	KASSERT(bp->bio_completed <= bp->bio_length,
	    ("bio_completed can't be greater than bio_length"));

	g_trace(G_T_BIO,
	    "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);

	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
	    ("Bio already on queue bp=%p", bp));

	/*
	 * XXX: the next two don't belong here
	 */
	bp->bio_bcount = bp->bio_length;
	bp->bio_resid = bp->bio_bcount - bp->bio_completed;

	direct = (pp->flags & G_PF_DIRECT_SEND) &&
	    (cp->flags & G_CF_DIRECT_RECEIVE) &&
	    curthread != g_up_td;
	if (direct) {
		/* Block direct execution if less than half of the stack is left. */
		size_t st, su;
		GET_STACK_USAGE(st, su);
		if (su * 2 > st)
			direct = 0;
	}

	/*
	 * The statistics collection is lockless, as such, but we
	 * can not update one instance of the statistics from more
	 * than one thread at a time, so grab the lock first.
	 */
	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
		binuptime(&now);
	mtxp = mtx_pool_find(mtxpool_sleep, pp);
	mtx_lock(mtxp);
	if (g_collectstats & G_STATS_PROVIDERS)
		devstat_end_transaction_bio_bt(pp->stat, bp, &now);
	if (g_collectstats & G_STATS_CONSUMERS)
		devstat_end_transaction_bio_bt(cp->stat, bp, &now);
#ifdef INVARIANTS
	cp->nend++;
#endif
	mtx_unlock(mtxp);

	if (error != ENOMEM) {
		bp->bio_error = error;
		if (direct) {
			biodone(bp);
		} else {
			g_bioq_lock(&g_bio_run_up);
			first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
			TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
			bp->bio_flags |= BIO_ONQUEUE;
			g_bio_run_up.bio_queue_length++;
			g_bioq_unlock(&g_bio_run_up);
			if (first)
				wakeup(&g_wait_up);
		}
		return;
	}

	if (bootverbose)
		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
	bp->bio_children = 0;
	bp->bio_inbed = 0;
	bp->bio_driver1 = NULL;
	bp->bio_driver2 = NULL;
	bp->bio_pflags = 0;
	g_io_request(bp, cp);
	pace = 1;
	return;
}

SYSCTL_DECL(_kern_geom);

static long transient_maps;
SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
    &transient_maps, 0,
    "Total count of the transient mapping requests");
u_int transient_map_retries = 10;
SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
    &transient_map_retries, 0,
    "Max count of retries used before giving up on creating transient map");
int transient_map_hard_failures;
SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
    &transient_map_hard_failures, 0,
    "Failures to establish the transient mapping due to retry attempts "
    "exhausted");
int transient_map_soft_failures;
SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
    &transient_map_soft_failures, 0,
    "Count of retried failures to establish the transient mapping");
int inflight_transient_maps;
SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
    &inflight_transient_maps, 0,
    "Current count of the active transient maps");
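
/*
 * Example of the mapping arithmetic below (illustrative numbers): for an
 * unmapped bio with bio_ma_offset = 0x200 and bio_length = 0x1800 on a
 * system with 4 KB pages, size = round_page(0x1a00) = 0x2000, so two pages
 * are entered into the transient KVA and bio_data ends up pointing 0x200
 * bytes into the first of them.
 */
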
static int
g_io_transient_map_bio(struct bio *bp)
{
	vm_offset_t addr;
	long size;
	u_int retried;

	KASSERT(unmapped_buf_allowed, ("unmapped disabled"));

	size = round_page(bp->bio_ma_offset + bp->bio_length);
	KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
	addr = 0;
	retried = 0;
	atomic_add_long(&transient_maps, 1);
retry:
	if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
		if (transient_map_retries != 0 &&
		    retried >= transient_map_retries) {
			CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
			    bp, bp->bio_to->name);
			atomic_add_int(&transient_map_hard_failures, 1);
			return (EDEADLK/* XXXKIB */);
		} else {
			/*
			 * Naive attempt to quiesce the I/O to get more
			 * in-flight requests completed and defragment
			 * the transient_arena.
			 */
			CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
			    bp, bp->bio_to->name, retried);
			pause("g_d_tra", hz / 10);
			retried++;
			atomic_add_int(&transient_map_soft_failures, 1);
			goto retry;
		}
	}
	atomic_add_int(&inflight_transient_maps, 1);
	pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
	bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
	bp->bio_flags |= BIO_TRANSIENT_MAPPING;
	bp->bio_flags &= ~BIO_UNMAPPED;
	return (EJUSTRETURN);
}

void
g_io_schedule_down(struct thread *tp __unused)
{
	struct bio *bp;
	int error;

	for (;;) {
		g_bioq_lock(&g_bio_run_down);
		bp = g_bioq_first(&g_bio_run_down);
		if (bp == NULL) {
			CTR0(KTR_GEOM, "g_down going to sleep");
			msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
			    PRIBIO | PDROP, "-", 0);
			continue;
		}
		CTR0(KTR_GEOM, "g_down has work to do");
		g_bioq_unlock(&g_bio_run_down);
		biotrack(bp, __func__);
		if (pace != 0) {
			/*
			 * There has been at least one memory allocation
			 * failure since the last I/O completed. Pause 1ms to
			 * give the system a chance to free up memory. We only
			 * do this once because a large number of allocations
			 * can fail in the direct dispatch case and there's no
			 * relationship between the number of these failures and
			 * the length of the outage. If there's still an outage,
			 * we'll pause again and again until it's
			 * resolved. Older versions paused longer and once per
			 * allocation failure. This was OK for a single threaded
			 * g_down, but with direct dispatch would lead to max of
			 * 10 IOPs for minutes at a time when transient memory
			 * issues prevented allocation for a batch of requests
			 * from the upper layers.
			 *
			 * XXX This pacing is really lame. It needs to be solved
			 * by other methods. This is OK only because the worst
			 * case scenario is so rare. In the worst case scenario
			 * all memory is tied up waiting for I/O to complete
			 * which can never happen since we can't allocate bios
			 * for that I/O.
			 */
			CTR0(KTR_GEOM, "g_down pacing self");
			pause("g_down", min(hz/1000, 1));
			pace = 0;
		}
		CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
		    bp->bio_to->name);
		error = g_io_check(bp);
		if (error >= 0) {
			CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
			    "%s returned %d", bp, bp->bio_to->name, error);
			g_io_deliver(bp, error);
			continue;
		}
		THREAD_NO_SLEEPING();
		CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
		    "len %ld", bp, bp->bio_to->name, bp->bio_offset,
		    bp->bio_length);
		bp->bio_to->geom->start(bp);
		THREAD_SLEEPING_OK();
	}
}

void
g_io_schedule_up(struct thread *tp __unused)
{
	struct bio *bp;

	for (;;) {
		g_bioq_lock(&g_bio_run_up);
		bp = g_bioq_first(&g_bio_run_up);
		if (bp == NULL) {
			CTR0(KTR_GEOM, "g_up going to sleep");
			msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
			    PRIBIO | PDROP, "-", 0);
			continue;
		}
		g_bioq_unlock(&g_bio_run_up);
		THREAD_NO_SLEEPING();
		CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
		    "%jd len %ld", bp, bp->bio_to->name,
		    bp->bio_offset, bp->bio_length);
		biodone(bp);
		THREAD_SLEEPING_OK();
	}
}
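
/*
 * Example (sketch): a classic use of g_read_data() is reading class
 * metadata while tasting a provider, typically the last sector:
 *
 *	buf = g_read_data(cp, pp->mediasize - pp->sectorsize,
 *	    pp->sectorsize, &error);
 *	if (buf != NULL) {
 *		... examine the metadata ...
 *		g_free(buf);
 *	}
 */
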
void *
g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
{
	struct bio *bp;
	void *ptr;
	int errorc;

	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
	    length <= maxphys, ("g_read_data(): invalid length %jd",
	    (intmax_t)length));

	bp = g_alloc_bio();
	bp->bio_cmd = BIO_READ;
	bp->bio_done = NULL;
	bp->bio_offset = offset;
	bp->bio_length = length;
	ptr = g_malloc(length, M_WAITOK);
	bp->bio_data = ptr;
	g_io_request(bp, cp);
	errorc = biowait(bp, "gread");
	if (errorc == 0 && bp->bio_completed != length)
		errorc = EIO;
	if (error != NULL)
		*error = errorc;
	g_destroy_bio(bp);
	if (errorc) {
		g_free(ptr);
		ptr = NULL;
	}
	return (ptr);
}

/*
 * A read function for use by ffs_sbget when used by GEOM-layer routines.
 */
int
g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size)
{
	struct g_consumer *cp;

	KASSERT(*bufp == NULL,
	    ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp));

	cp = (struct g_consumer *)devfd;
	/*
	 * Take care not to issue an invalid I/O request. The offset of
	 * the superblock candidate must be a multiple of the provider's
	 * sector size, otherwise an FFS can't exist on the provider
	 * anyway.
	 */
	if (loc % cp->provider->sectorsize != 0)
		return (ENOENT);
	*bufp = g_read_data(cp, loc, size, NULL);
	if (*bufp == NULL)
		return (ENOENT);
	return (0);
}

int
g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
{
	struct bio *bp;
	int error;

	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
	    length <= maxphys, ("g_write_data(): invalid length %jd",
	    (intmax_t)length));

	bp = g_alloc_bio();
	bp->bio_cmd = BIO_WRITE;
	bp->bio_done = NULL;
	bp->bio_offset = offset;
	bp->bio_length = length;
	bp->bio_data = ptr;
	g_io_request(bp, cp);
	error = biowait(bp, "gwrite");
	if (error == 0 && bp->bio_completed != length)
		error = EIO;
	g_destroy_bio(bp);
	return (error);
}

/*
 * A write function for use by ffs_sbput when used by GEOM-layer routines.
 */
int
g_use_g_write_data(void *devfd, off_t loc, void *buf, int size)
{

	return (g_write_data((struct g_consumer *)devfd, loc, buf, size));
}

int
g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
{
	struct bio *bp;
	int error;

	KASSERT(length > 0 && length >= cp->provider->sectorsize,
	    ("g_delete_data(): invalid length %jd", (intmax_t)length));

	bp = g_alloc_bio();
	bp->bio_cmd = BIO_DELETE;
	bp->bio_done = NULL;
	bp->bio_offset = offset;
	bp->bio_length = length;
	bp->bio_data = NULL;
	g_io_request(bp, cp);
	error = biowait(bp, "gdelete");
	if (error == 0 && bp->bio_completed != length)
		error = EIO;
	g_destroy_bio(bp);
	return (error);
}
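
/*
 * Example of the formatting produced below (hypothetical provider "foo0"):
 *
 *	g_print_bio("GEOM_FOO: request failed: ", bp, " error=%d", error);
 *
 * prints something like
 *
 *	GEOM_FOO: request failed: foo0[READ(offset=65536, length=4096)] error=5
 */
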
void
g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix,
    ...)
{
#ifndef PRINTF_BUFR_SIZE
#define PRINTF_BUFR_SIZE	64
#endif
	char bufr[PRINTF_BUFR_SIZE];
	struct sbuf sb, *sbp __unused;
	va_list ap;

	sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN);
	KASSERT(sbp != NULL, ("sbuf_new misused?"));

	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);

	sbuf_cat(&sb, prefix);
	g_format_bio(&sb, bp);

	va_start(ap, fmtsuffix);
	sbuf_vprintf(&sb, fmtsuffix, ap);
	va_end(ap);

	sbuf_nl_terminate(&sb);

	sbuf_finish(&sb);
	sbuf_delete(&sb);
}

void
g_format_bio(struct sbuf *sb, const struct bio *bp)
{
	const char *pname, *cmd = NULL;

	if (bp->bio_to != NULL)
		pname = bp->bio_to->name;
	else if (bp->bio_parent != NULL && bp->bio_parent->bio_to != NULL)
		pname = bp->bio_parent->bio_to->name;
	else
		pname = "[unknown]";

	switch (bp->bio_cmd) {
	case BIO_GETATTR:
		cmd = "GETATTR";
		sbuf_printf(sb, "%s[%s(attr=%s)]", pname, cmd,
		    bp->bio_attribute);
		return;
	case BIO_FLUSH:
		cmd = "FLUSH";
		sbuf_printf(sb, "%s[%s]", pname, cmd);
		return;
	case BIO_ZONE: {
		char *subcmd = NULL;
		cmd = "ZONE";
		switch (bp->bio_zone.zone_cmd) {
		case DISK_ZONE_OPEN:
			subcmd = "OPEN";
			break;
		case DISK_ZONE_CLOSE:
			subcmd = "CLOSE";
			break;
		case DISK_ZONE_FINISH:
			subcmd = "FINISH";
			break;
		case DISK_ZONE_RWP:
			subcmd = "RWP";
			break;
		case DISK_ZONE_REPORT_ZONES:
			subcmd = "REPORT ZONES";
			break;
		case DISK_ZONE_GET_PARAMS:
			subcmd = "GET PARAMS";
			break;
		default:
			subcmd = "UNKNOWN";
			break;
		}
		sbuf_printf(sb, "%s[%s,%s]", pname, cmd, subcmd);
		return;
	}
	case BIO_READ:
		cmd = "READ";
		break;
	case BIO_WRITE:
		cmd = "WRITE";
		break;
	case BIO_DELETE:
		cmd = "DELETE";
		break;
	default:
		cmd = "UNKNOWN";
		sbuf_printf(sb, "%s[%s()]", pname, cmd);
		return;
	}
	sbuf_printf(sb, "%s[%s(offset=%jd, length=%jd)]", pname, cmd,
	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
}