/*-
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The names of the authors may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/stack.h>
#include <sys/sysctl.h>
#include <sys/vmem.h>

#include <sys/errno.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <sys/devicestat.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>

static int g_io_transient_map_bio(struct bio *bp);

static struct g_bioq g_bio_run_down;
static struct g_bioq g_bio_run_up;
static struct g_bioq g_bio_run_task;

/*
 * Pace is a hint that we've had some trouble recently allocating
 * bios, so we should back off trying to send I/O down the stack
 * a bit to let the problem resolve.  When pacing, we also turn
 * off direct dispatch to reduce memory pressure from I/Os there,
 * at the expense of some added latency while the memory
 * pressure exists.  See g_io_schedule_down() for more details
 * and limitations.
 */
static volatile u_int pace;

static uma_zone_t biozone;

/*
 * The head of the list of classifiers used in g_io_request.
 * Use g_register_classifier() and g_unregister_classifier()
 * to add and remove entries from the list.
 * Classifiers are invoked in registration order.
 */
static TAILQ_HEAD(g_classifier_tailq, g_classifier_hook)
    g_classifier_tailq = TAILQ_HEAD_INITIALIZER(g_classifier_tailq);

#include <machine/atomic.h>

static void
g_bioq_lock(struct g_bioq *bq)
{

	mtx_lock(&bq->bio_queue_lock);
}

static void
g_bioq_unlock(struct g_bioq *bq)
{

	mtx_unlock(&bq->bio_queue_lock);
}

#if 0
static void
g_bioq_destroy(struct g_bioq *bq)
{

	mtx_destroy(&bq->bio_queue_lock);
}
#endif

static void
g_bioq_init(struct g_bioq *bq)
{

	TAILQ_INIT(&bq->bio_queue);
	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
}

static struct bio *
g_bioq_first(struct g_bioq *bq)
{
	struct bio *bp;

	bp = TAILQ_FIRST(&bq->bio_queue);
	if (bp != NULL) {
		KASSERT((bp->bio_flags & BIO_ONQUEUE),
		    ("Bio not on queue bp=%p target %p", bp, bq));
		bp->bio_flags &= ~BIO_ONQUEUE;
		TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
		bq->bio_queue_length--;
	}
	return (bp);
}

struct bio *
g_new_bio(void)
{
	struct bio *bp;

	bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
#ifdef KTR
	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
		struct stack st;

		CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
		stack_save(&st);
		CTRSTACK(KTR_GEOM, &st, 3, 0);
	}
#endif
	return (bp);
}

struct bio *
g_alloc_bio(void)
{
	struct bio *bp;

	bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
#ifdef KTR
	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
		struct stack st;

		CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
		stack_save(&st);
		CTRSTACK(KTR_GEOM, &st, 3, 0);
	}
#endif
	return (bp);
}

void
g_destroy_bio(struct bio *bp)
{
#ifdef KTR
	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
		struct stack st;

		CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
		stack_save(&st);
		CTRSTACK(KTR_GEOM, &st, 3, 0);
	}
#endif
	uma_zfree(biozone, bp);
}

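/*
 * Descriptive note (added for clarity, derived from the two functions
 * below): g_clone_bio() and g_duplicate_bio() both create a child bio for
 * the given parent and bump bp->bio_children; the difference is allocation
 * behaviour.  g_clone_bio() allocates with M_NOWAIT and may therefore
 * return NULL under memory pressure, while g_duplicate_bio() allocates with
 * M_WAITOK and always returns a bio.  Both children share the parent's data
 * pointer rather than copying the data.
 */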
struct bio *
g_clone_bio(struct bio *bp)
{
	struct bio *bp2;

	bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
	if (bp2 != NULL) {
		bp2->bio_parent = bp;
		bp2->bio_cmd = bp->bio_cmd;
		/*
		 * BIO_ORDERED flag may be used by disk drivers to enforce
		 * ordering restrictions, so this flag needs to be cloned.
		 * BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
		 * indicate which way the buffer is passed.
		 * Other bio flags are not suitable for cloning.
		 */
		bp2->bio_flags = bp->bio_flags &
		    (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
		bp2->bio_length = bp->bio_length;
		bp2->bio_offset = bp->bio_offset;
		bp2->bio_data = bp->bio_data;
		bp2->bio_ma = bp->bio_ma;
		bp2->bio_ma_n = bp->bio_ma_n;
		bp2->bio_ma_offset = bp->bio_ma_offset;
		bp2->bio_attribute = bp->bio_attribute;
		/* Inherit classification info from the parent */
		bp2->bio_classifier1 = bp->bio_classifier1;
		bp2->bio_classifier2 = bp->bio_classifier2;
		bp->bio_children++;
	}
#ifdef KTR
	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
		struct stack st;

		CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
		stack_save(&st);
		CTRSTACK(KTR_GEOM, &st, 3, 0);
	}
#endif
	return(bp2);
}

struct bio *
g_duplicate_bio(struct bio *bp)
{
	struct bio *bp2;

	bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
	bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
	bp2->bio_parent = bp;
	bp2->bio_cmd = bp->bio_cmd;
	bp2->bio_length = bp->bio_length;
	bp2->bio_offset = bp->bio_offset;
	bp2->bio_data = bp->bio_data;
	bp2->bio_ma = bp->bio_ma;
	bp2->bio_ma_n = bp->bio_ma_n;
	bp2->bio_ma_offset = bp->bio_ma_offset;
	bp2->bio_attribute = bp->bio_attribute;
	bp->bio_children++;
#ifdef KTR
	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
		struct stack st;

		CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
		stack_save(&st);
		CTRSTACK(KTR_GEOM, &st, 3, 0);
	}
#endif
	return(bp2);
}

void
g_io_init()
{

	g_bioq_init(&g_bio_run_down);
	g_bioq_init(&g_bio_run_up);
	g_bioq_init(&g_bio_run_task);
	biozone = uma_zcreate("g_bio", sizeof (struct bio),
	    NULL, NULL,
	    NULL, NULL,
	    0, 0);
}

int
g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
{
	struct bio *bp;
	int error;

	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
	bp = g_alloc_bio();
	bp->bio_cmd = BIO_GETATTR;
	bp->bio_done = NULL;
	bp->bio_attribute = attr;
	bp->bio_length = *len;
	bp->bio_data = ptr;
	g_io_request(bp, cp);
	error = biowait(bp, "ggetattr");
	*len = bp->bio_completed;
	g_destroy_bio(bp);
	return (error);
}

int
g_io_flush(struct g_consumer *cp)
{
	struct bio *bp;
	int error;

	g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
	bp = g_alloc_bio();
	bp->bio_cmd = BIO_FLUSH;
	bp->bio_flags |= BIO_ORDERED;
	bp->bio_done = NULL;
	bp->bio_attribute = NULL;
	bp->bio_offset = cp->provider->mediasize;
	bp->bio_length = 0;
	bp->bio_data = NULL;
	g_io_request(bp, cp);
	error = biowait(bp, "gflush");
	g_destroy_bio(bp);
	return (error);
}

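/*
 * Descriptive note (added for clarity, summarizing the function below):
 * g_io_check() sanity-checks a bio before it is handed to the provider.
 * It verifies that the consumer's access counts permit the command, that
 * the provider is usable, and that the request is sector-aligned and within
 * the media, truncating requests that run past the end.  It returns
 * EJUSTRETURN when the bio should be passed on to the provider's start
 * method, 0 when a zero-length transfer can be completed right away, or an
 * errno value when the bio must be failed.
 */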
static int
g_io_check(struct bio *bp)
{
	struct g_consumer *cp;
	struct g_provider *pp;
	off_t excess;
	int error;

	cp = bp->bio_from;
	pp = bp->bio_to;

	/* Fail if access counters don't allow the operation */
	switch(bp->bio_cmd) {
	case BIO_READ:
	case BIO_GETATTR:
		if (cp->acr == 0)
			return (EPERM);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	case BIO_FLUSH:
		if (cp->acw == 0)
			return (EPERM);
		break;
	default:
		return (EPERM);
	}
	/* If the provider is marked for error, don't disturb it. */
	if (pp->error)
		return (pp->error);
	if (cp->flags & G_CF_ORPHAN)
		return (ENXIO);

	switch(bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		/* Zero sectorsize or mediasize is probably a lack of media. */
		if (pp->sectorsize == 0 || pp->mediasize == 0)
			return (ENXIO);
		/* Reject I/O not on sector boundary */
		if (bp->bio_offset % pp->sectorsize)
			return (EINVAL);
		/* Reject I/O not integral sector long */
		if (bp->bio_length % pp->sectorsize)
			return (EINVAL);
		/* Reject requests before or past the end of media. */
		if (bp->bio_offset < 0)
			return (EIO);
		if (bp->bio_offset > pp->mediasize)
			return (EIO);

		/* Truncate requests to the end of the provider's media. */
		excess = bp->bio_offset + bp->bio_length;
		if (excess > bp->bio_to->mediasize) {
			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
			    round_page(bp->bio_ma_offset +
			    bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
			    ("excess bio %p too short", bp));
			excess -= bp->bio_to->mediasize;
			bp->bio_length -= excess;
			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
				bp->bio_ma_n = round_page(bp->bio_ma_offset +
				    bp->bio_length) / PAGE_SIZE;
			}
			if (excess > 0)
				CTR3(KTR_GEOM, "g_down truncated bio "
				    "%p provider %s by %d", bp,
				    bp->bio_to->name, excess);
		}

		/* Deliver zero-length transfers right here. */
		if (bp->bio_length == 0) {
			CTR2(KTR_GEOM, "g_down terminated 0-length "
			    "bp %p provider %s", bp, bp->bio_to->name);
			return (0);
		}

		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
		    (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
		    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
			if ((error = g_io_transient_map_bio(bp)) >= 0)
				return (error);
		}
		break;
	default:
		break;
	}
	return (EJUSTRETURN);
}

/*
 * bio classification support.
 *
 * g_register_classifier() and g_unregister_classifier()
 * are used to add/remove a classifier from the list.
 * The list is protected using the g_bio_run_down lock,
 * because the classifiers are called in this path.
 *
 * g_io_request() passes bio's that are not already classified
 * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers().
 * Classifiers can store their result in the two fields
 * bio_classifier1 and bio_classifier2.
 * A classifier that updates one of the fields should
 * return a non-zero value.
 * If no classifier updates the field, g_run_classifiers() sets
 * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls.
 */
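/*
 * Illustrative sketch only (not used anywhere in this file): a minimal
 * classifier hook might look like the fragment below.  The names
 * example_classify, example_hook and example_tag are made up; the field
 * names follow the hook->func(hook->arg, bp) call in g_run_classifiers()
 * and the TAILQ link used by the registration functions.
 *
 *	static int
 *	example_classify(void *arg, struct bio *bp)
 *	{
 *
 *		bp->bio_classifier1 = arg;	(record the result)
 *		return (1);			(non-zero: bio was classified)
 *	}
 *
 *	static struct g_classifier_hook example_hook = {
 *		.func	= example_classify,
 *		.arg	= &example_tag,
 *	};
 *
 *	error = g_register_classifier(&example_hook);
 */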
428 */ 429 430 int 431 g_register_classifier(struct g_classifier_hook *hook) 432 { 433 434 g_bioq_lock(&g_bio_run_down); 435 TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link); 436 g_bioq_unlock(&g_bio_run_down); 437 438 return (0); 439 } 440 441 void 442 g_unregister_classifier(struct g_classifier_hook *hook) 443 { 444 struct g_classifier_hook *entry; 445 446 g_bioq_lock(&g_bio_run_down); 447 TAILQ_FOREACH(entry, &g_classifier_tailq, link) { 448 if (entry == hook) { 449 TAILQ_REMOVE(&g_classifier_tailq, hook, link); 450 break; 451 } 452 } 453 g_bioq_unlock(&g_bio_run_down); 454 } 455 456 static void 457 g_run_classifiers(struct bio *bp) 458 { 459 struct g_classifier_hook *hook; 460 int classified = 0; 461 462 TAILQ_FOREACH(hook, &g_classifier_tailq, link) 463 classified |= hook->func(hook->arg, bp); 464 465 if (!classified) 466 bp->bio_classifier1 = BIO_NOTCLASSIFIED; 467 } 468 469 void 470 g_io_request(struct bio *bp, struct g_consumer *cp) 471 { 472 struct g_provider *pp; 473 struct mtx *mtxp; 474 int direct, error, first; 475 476 KASSERT(cp != NULL, ("NULL cp in g_io_request")); 477 KASSERT(bp != NULL, ("NULL bp in g_io_request")); 478 pp = cp->provider; 479 KASSERT(pp != NULL, ("consumer not attached in g_io_request")); 480 #ifdef DIAGNOSTIC 481 KASSERT(bp->bio_driver1 == NULL, 482 ("bio_driver1 used by the consumer (geom %s)", cp->geom->name)); 483 KASSERT(bp->bio_driver2 == NULL, 484 ("bio_driver2 used by the consumer (geom %s)", cp->geom->name)); 485 KASSERT(bp->bio_pflags == 0, 486 ("bio_pflags used by the consumer (geom %s)", cp->geom->name)); 487 /* 488 * Remember consumer's private fields, so we can detect if they were 489 * modified by the provider. 490 */ 491 bp->_bio_caller1 = bp->bio_caller1; 492 bp->_bio_caller2 = bp->bio_caller2; 493 bp->_bio_cflags = bp->bio_cflags; 494 #endif 495 496 if (bp->bio_cmd & (BIO_READ|BIO_WRITE|BIO_GETATTR)) { 497 KASSERT(bp->bio_data != NULL, 498 ("NULL bp->data in g_io_request(cmd=%hhu)", bp->bio_cmd)); 499 } 500 if (bp->bio_cmd & (BIO_DELETE|BIO_FLUSH)) { 501 KASSERT(bp->bio_data == NULL, 502 ("non-NULL bp->data in g_io_request(cmd=%hhu)", 503 bp->bio_cmd)); 504 } 505 if (bp->bio_cmd & (BIO_READ|BIO_WRITE|BIO_DELETE)) { 506 KASSERT(bp->bio_offset % cp->provider->sectorsize == 0, 507 ("wrong offset %jd for sectorsize %u", 508 bp->bio_offset, cp->provider->sectorsize)); 509 KASSERT(bp->bio_length % cp->provider->sectorsize == 0, 510 ("wrong length %jd for sectorsize %u", 511 bp->bio_length, cp->provider->sectorsize)); 512 } 513 514 g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d", 515 bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd); 516 517 bp->bio_from = cp; 518 bp->bio_to = pp; 519 bp->bio_error = 0; 520 bp->bio_completed = 0; 521 522 KASSERT(!(bp->bio_flags & BIO_ONQUEUE), 523 ("Bio already on queue bp=%p", bp)); 524 if ((g_collectstats & G_STATS_CONSUMERS) != 0 || 525 ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) 526 binuptime(&bp->bio_t0); 527 else 528 getbinuptime(&bp->bio_t0); 529 530 #ifdef GET_STACK_USAGE 531 direct = (cp->flags & G_CF_DIRECT_SEND) != 0 && 532 (pp->flags & G_PF_DIRECT_RECEIVE) != 0 && 533 !g_is_geom_thread(curthread) && 534 ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 || 535 (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) && 536 pace == 0; 537 if (direct) { 538 /* Block direct execution if less then half of stack left. 
		size_t st, su;

		GET_STACK_USAGE(st, su);
		if (su * 2 > st)
			direct = 0;
	}
#else
	direct = 0;
#endif

	if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) {
		g_bioq_lock(&g_bio_run_down);
		g_run_classifiers(bp);
		g_bioq_unlock(&g_bio_run_down);
	}

	/*
	 * The statistics collection is lockless, as such, but we
	 * cannot update one instance of the statistics from more
	 * than one thread at a time, so grab the lock first.
	 */
	mtxp = mtx_pool_find(mtxpool_sleep, pp);
	mtx_lock(mtxp);
	if (g_collectstats & G_STATS_PROVIDERS)
		devstat_start_transaction(pp->stat, &bp->bio_t0);
	if (g_collectstats & G_STATS_CONSUMERS)
		devstat_start_transaction(cp->stat, &bp->bio_t0);
	pp->nstart++;
	cp->nstart++;
	mtx_unlock(mtxp);

	if (direct) {
		error = g_io_check(bp);
		if (error >= 0) {
			CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
			    "provider %s returned %d", bp, bp->bio_to->name,
			    error);
			g_io_deliver(bp, error);
			return;
		}
		bp->bio_to->geom->start(bp);
	} else {
		g_bioq_lock(&g_bio_run_down);
		first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
		TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
		bp->bio_flags |= BIO_ONQUEUE;
		g_bio_run_down.bio_queue_length++;
		g_bioq_unlock(&g_bio_run_down);
		/* Pass it on down. */
		if (first)
			wakeup(&g_wait_down);
	}
}

void
g_io_deliver(struct bio *bp, int error)
{
	struct bintime now;
	struct g_consumer *cp;
	struct g_provider *pp;
	struct mtx *mtxp;
	int direct, first;

	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
	pp = bp->bio_to;
	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
	cp = bp->bio_from;
	if (cp == NULL) {
		bp->bio_error = error;
		bp->bio_done(bp);
		return;
	}
	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
#ifdef DIAGNOSTIC
	/*
	 * Some classes, GJournal in particular, can modify a bio's
	 * private fields while the bio is in transit; the G_GEOM_VOLATILE_BIO
	 * flag means this is expected behaviour for that particular geom.
	 */
	if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
		KASSERT(bp->bio_caller1 == bp->_bio_caller1,
		    ("bio_caller1 used by the provider %s", pp->name));
		KASSERT(bp->bio_caller2 == bp->_bio_caller2,
		    ("bio_caller2 used by the provider %s", pp->name));
		KASSERT(bp->bio_cflags == bp->_bio_cflags,
		    ("bio_cflags used by the provider %s", pp->name));
	}
#endif
	KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
	KASSERT(bp->bio_completed <= bp->bio_length,
	    ("bio_completed can't be greater than bio_length"));

	g_trace(G_T_BIO,
	    "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);

	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
	    ("Bio already on queue bp=%p", bp));

	/*
	 * XXX: the next two don't belong here.
	 */
	bp->bio_bcount = bp->bio_length;
	bp->bio_resid = bp->bio_bcount - bp->bio_completed;

#ifdef GET_STACK_USAGE
	direct = (pp->flags & G_PF_DIRECT_SEND) &&
	    (cp->flags & G_CF_DIRECT_RECEIVE) &&
	    !g_is_geom_thread(curthread);
	if (direct) {
		/* Block direct execution if less than half of the stack is left. */
		size_t st, su;

		GET_STACK_USAGE(st, su);
		if (su * 2 > st)
			direct = 0;
	}
#else
	direct = 0;
#endif

	/*
	 * The statistics collection is lockless, as such, but we
	 * cannot update one instance of the statistics from more
	 * than one thread at a time, so grab the lock first.
	 */
	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
		binuptime(&now);
	mtxp = mtx_pool_find(mtxpool_sleep, cp);
	mtx_lock(mtxp);
	if (g_collectstats & G_STATS_PROVIDERS)
		devstat_end_transaction_bio_bt(pp->stat, bp, &now);
	if (g_collectstats & G_STATS_CONSUMERS)
		devstat_end_transaction_bio_bt(cp->stat, bp, &now);
	cp->nend++;
	pp->nend++;
	mtx_unlock(mtxp);

	if (error != ENOMEM) {
		bp->bio_error = error;
		if (direct) {
			biodone(bp);
		} else {
			g_bioq_lock(&g_bio_run_up);
			first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
			TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
			bp->bio_flags |= BIO_ONQUEUE;
			g_bio_run_up.bio_queue_length++;
			g_bioq_unlock(&g_bio_run_up);
			if (first)
				wakeup(&g_wait_up);
		}
		return;
	}

	if (bootverbose)
		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
	bp->bio_children = 0;
	bp->bio_inbed = 0;
	bp->bio_driver1 = NULL;
	bp->bio_driver2 = NULL;
	bp->bio_pflags = 0;
	g_io_request(bp, cp);
	pace = 1;
	return;
}

SYSCTL_DECL(_kern_geom);

static long transient_maps;
SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
    &transient_maps, 0,
    "Total count of the transient mapping requests");
u_int transient_map_retries = 10;
SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
    &transient_map_retries, 0,
    "Max count of retries used before giving up on creating transient map");
int transient_map_hard_failures;
SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
    &transient_map_hard_failures, 0,
    "Failures to establish the transient mapping due to retry attempts "
    "exhausted");
int transient_map_soft_failures;
SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
    &transient_map_soft_failures, 0,
    "Count of retried failures to establish the transient mapping");
int inflight_transient_maps;
SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
    &inflight_transient_maps, 0,
    "Current count of the active transient maps");

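/*
 * Descriptive note (added for clarity, summarizing the function below):
 * g_io_transient_map_bio() maps the pages of an unmapped bio into the
 * transient KVA arena so that providers which do not accept unmapped I/O
 * can still process the request.  On success the bio's data pointer is
 * redirected to the new mapping and EJUSTRETURN is returned, so the caller
 * keeps processing the bio; if no address space can be allocated after
 * transient_map_retries attempts, EDEADLK is returned and the caller fails
 * the request with that error.
 */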
758 */ 759 CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", 760 bp, bp->bio_to->name, retried); 761 pause("g_d_tra", hz / 10); 762 retried++; 763 atomic_add_int(&transient_map_soft_failures, 1); 764 goto retry; 765 } 766 } 767 atomic_add_int(&inflight_transient_maps, 1); 768 pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); 769 bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; 770 bp->bio_flags |= BIO_TRANSIENT_MAPPING; 771 bp->bio_flags &= ~BIO_UNMAPPED; 772 return (EJUSTRETURN); 773 } 774 775 void 776 g_io_schedule_down(struct thread *tp __unused) 777 { 778 struct bio *bp; 779 int error; 780 781 for(;;) { 782 g_bioq_lock(&g_bio_run_down); 783 bp = g_bioq_first(&g_bio_run_down); 784 if (bp == NULL) { 785 CTR0(KTR_GEOM, "g_down going to sleep"); 786 msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock, 787 PRIBIO | PDROP, "-", 0); 788 continue; 789 } 790 CTR0(KTR_GEOM, "g_down has work to do"); 791 g_bioq_unlock(&g_bio_run_down); 792 if (pace != 0) { 793 /* 794 * There has been at least one memory allocation 795 * failure since the last I/O completed. Pause 1ms to 796 * give the system a chance to free up memory. We only 797 * do this once because a large number of allocations 798 * can fail in the direct dispatch case and there's no 799 * relationship between the number of these failures and 800 * the length of the outage. If there's still an outage, 801 * we'll pause again and again until it's 802 * resolved. Older versions paused longer and once per 803 * allocation failure. This was OK for a single threaded 804 * g_down, but with direct dispatch would lead to max of 805 * 10 IOPs for minutes at a time when transient memory 806 * issues prevented allocation for a batch of requests 807 * from the upper layers. 808 * 809 * XXX This pacing is really lame. It needs to be solved 810 * by other methods. This is OK only because the worst 811 * case scenario is so rare. In the worst case scenario 812 * all memory is tied up waiting for I/O to complete 813 * which can never happen since we can't allocate bios 814 * for that I/O. 815 */ 816 CTR0(KTR_GEOM, "g_down pacing self"); 817 pause("g_down", min(hz/1000, 1)); 818 pace = 0; 819 } 820 CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp, 821 bp->bio_to->name); 822 error = g_io_check(bp); 823 if (error >= 0) { 824 CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider " 825 "%s returned %d", bp, bp->bio_to->name, error); 826 g_io_deliver(bp, error); 827 continue; 828 } 829 THREAD_NO_SLEEPING(); 830 CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " 831 "len %ld", bp, bp->bio_to->name, bp->bio_offset, 832 bp->bio_length); 833 bp->bio_to->geom->start(bp); 834 THREAD_SLEEPING_OK(); 835 } 836 } 837 838 void 839 bio_taskqueue(struct bio *bp, bio_task_t *func, void *arg) 840 { 841 bp->bio_task = func; 842 bp->bio_task_arg = arg; 843 /* 844 * The taskqueue is actually just a second queue off the "up" 845 * queue, so we use the same lock. 
846 */ 847 g_bioq_lock(&g_bio_run_up); 848 KASSERT(!(bp->bio_flags & BIO_ONQUEUE), 849 ("Bio already on queue bp=%p target taskq", bp)); 850 bp->bio_flags |= BIO_ONQUEUE; 851 TAILQ_INSERT_TAIL(&g_bio_run_task.bio_queue, bp, bio_queue); 852 g_bio_run_task.bio_queue_length++; 853 wakeup(&g_wait_up); 854 g_bioq_unlock(&g_bio_run_up); 855 } 856 857 858 void 859 g_io_schedule_up(struct thread *tp __unused) 860 { 861 struct bio *bp; 862 for(;;) { 863 g_bioq_lock(&g_bio_run_up); 864 bp = g_bioq_first(&g_bio_run_task); 865 if (bp != NULL) { 866 g_bioq_unlock(&g_bio_run_up); 867 THREAD_NO_SLEEPING(); 868 CTR1(KTR_GEOM, "g_up processing task bp %p", bp); 869 bp->bio_task(bp->bio_task_arg); 870 THREAD_SLEEPING_OK(); 871 continue; 872 } 873 bp = g_bioq_first(&g_bio_run_up); 874 if (bp != NULL) { 875 g_bioq_unlock(&g_bio_run_up); 876 THREAD_NO_SLEEPING(); 877 CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off " 878 "%jd len %ld", bp, bp->bio_to->name, 879 bp->bio_offset, bp->bio_length); 880 biodone(bp); 881 THREAD_SLEEPING_OK(); 882 continue; 883 } 884 CTR0(KTR_GEOM, "g_up going to sleep"); 885 msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock, 886 PRIBIO | PDROP, "-", 0); 887 } 888 } 889 890 void * 891 g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error) 892 { 893 struct bio *bp; 894 void *ptr; 895 int errorc; 896 897 KASSERT(length > 0 && length >= cp->provider->sectorsize && 898 length <= MAXPHYS, ("g_read_data(): invalid length %jd", 899 (intmax_t)length)); 900 901 bp = g_alloc_bio(); 902 bp->bio_cmd = BIO_READ; 903 bp->bio_done = NULL; 904 bp->bio_offset = offset; 905 bp->bio_length = length; 906 ptr = g_malloc(length, M_WAITOK); 907 bp->bio_data = ptr; 908 g_io_request(bp, cp); 909 errorc = biowait(bp, "gread"); 910 if (error != NULL) 911 *error = errorc; 912 g_destroy_bio(bp); 913 if (errorc) { 914 g_free(ptr); 915 ptr = NULL; 916 } 917 return (ptr); 918 } 919 920 int 921 g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length) 922 { 923 struct bio *bp; 924 int error; 925 926 KASSERT(length > 0 && length >= cp->provider->sectorsize && 927 length <= MAXPHYS, ("g_write_data(): invalid length %jd", 928 (intmax_t)length)); 929 930 bp = g_alloc_bio(); 931 bp->bio_cmd = BIO_WRITE; 932 bp->bio_done = NULL; 933 bp->bio_offset = offset; 934 bp->bio_length = length; 935 bp->bio_data = ptr; 936 g_io_request(bp, cp); 937 error = biowait(bp, "gwrite"); 938 g_destroy_bio(bp); 939 return (error); 940 } 941 942 int 943 g_delete_data(struct g_consumer *cp, off_t offset, off_t length) 944 { 945 struct bio *bp; 946 int error; 947 948 KASSERT(length > 0 && length >= cp->provider->sectorsize, 949 ("g_delete_data(): invalid length %jd", (intmax_t)length)); 950 951 bp = g_alloc_bio(); 952 bp->bio_cmd = BIO_DELETE; 953 bp->bio_done = NULL; 954 bp->bio_offset = offset; 955 bp->bio_length = length; 956 bp->bio_data = NULL; 957 g_io_request(bp, cp); 958 error = biowait(bp, "gdelete"); 959 g_destroy_bio(bp); 960 return (error); 961 } 962 963 void 964 g_print_bio(struct bio *bp) 965 { 966 const char *pname, *cmd = NULL; 967 968 if (bp->bio_to != NULL) 969 pname = bp->bio_to->name; 970 else 971 pname = "[unknown]"; 972 973 switch (bp->bio_cmd) { 974 case BIO_GETATTR: 975 cmd = "GETATTR"; 976 printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute); 977 return; 978 case BIO_FLUSH: 979 cmd = "FLUSH"; 980 printf("%s[%s]", pname, cmd); 981 return; 982 case BIO_READ: 983 cmd = "READ"; 984 break; 985 case BIO_WRITE: 986 cmd = "WRITE"; 987 break; 988 case BIO_DELETE: 989 cmd = "DELETE"; 990 
void
g_print_bio(struct bio *bp)
{
	const char *pname, *cmd = NULL;

	if (bp->bio_to != NULL)
		pname = bp->bio_to->name;
	else
		pname = "[unknown]";

	switch (bp->bio_cmd) {
	case BIO_GETATTR:
		cmd = "GETATTR";
		printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute);
		return;
	case BIO_FLUSH:
		cmd = "FLUSH";
		printf("%s[%s]", pname, cmd);
		return;
	case BIO_READ:
		cmd = "READ";
		break;
	case BIO_WRITE:
		cmd = "WRITE";
		break;
	case BIO_DELETE:
		cmd = "DELETE";
		break;
	default:
		cmd = "UNKNOWN";
		printf("%s[%s()]", pname, cmd);
		return;
	}
	printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd,
	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
}