xref: /freebsd/sys/geom/geom_io.c (revision 0bf48626aaa33768078f5872b922b1487b3a9296)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2002 Poul-Henning Kamp
5  * Copyright (c) 2002 Networks Associates Technology, Inc.
6  * Copyright (c) 2013 The FreeBSD Foundation
7  * All rights reserved.
8  *
9  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
10  * and NAI Labs, the Security Research Division of Network Associates, Inc.
11  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
12  * DARPA CHATS research program.
13  *
14  * Portions of this software were developed by Konstantin Belousov
15  * under sponsorship from the FreeBSD Foundation.
16  *
17  * Redistribution and use in source and binary forms, with or without
18  * modification, are permitted provided that the following conditions
19  * are met:
20  * 1. Redistributions of source code must retain the above copyright
21  *    notice, this list of conditions and the following disclaimer.
22  * 2. Redistributions in binary form must reproduce the above copyright
23  *    notice, this list of conditions and the following disclaimer in the
24  *    documentation and/or other materials provided with the distribution.
25  * 3. The names of the authors may not be used to endorse or promote
26  *    products derived from this software without specific prior written
27  *    permission.
28  *
29  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
30  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
33  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39  * SUCH DAMAGE.
40  */
41 
42 #include <sys/cdefs.h>
43 __FBSDID("$FreeBSD$");
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/kernel.h>
48 #include <sys/malloc.h>
49 #include <sys/bio.h>
50 #include <sys/ktr.h>
51 #include <sys/proc.h>
52 #include <sys/sbuf.h>
53 #include <sys/stack.h>
54 #include <sys/sysctl.h>
55 #include <sys/vmem.h>
56 #include <machine/stdarg.h>
57 
58 #include <sys/errno.h>
59 #include <geom/geom.h>
60 #include <geom/geom_int.h>
61 #include <sys/devicestat.h>
62 
63 #include <vm/uma.h>
64 #include <vm/vm.h>
65 #include <vm/vm_param.h>
66 #include <vm/vm_kern.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_extern.h>
70 #include <vm/vm_map.h>
71 
72 static int	g_io_transient_map_bio(struct bio *bp);
73 
74 static struct g_bioq g_bio_run_down;
75 static struct g_bioq g_bio_run_up;
76 
77 /*
78  * Pace is a hint that we've had some trouble recently allocating
79  * bios, so we should back off trying to send I/O down the stack
80  * a bit to let the problem resolve. When pacing, we also turn
81  * off direct dispatch to reduce memory pressure from I/Os
82  * there, at the expense of some added latency while the memory
83  * pressures exist. See g_io_schedule_down() for more details
84  * and limitations.
85  */
86 static volatile u_int pace;
87 
88 static uma_zone_t	biozone;
89 
90 /*
91  * The head of the list of classifiers used in g_io_request.
92  * Use g_register_classifier() and g_unregister_classifier()
93  * to add entries to and remove entries from the list.
94  * Classifiers are invoked in registration order.
95  */
96 static TAILQ_HEAD(g_classifier_tailq, g_classifier_hook)
97     g_classifier_tailq = TAILQ_HEAD_INITIALIZER(g_classifier_tailq);
98 
99 #include <machine/atomic.h>
100 
101 static void
102 g_bioq_lock(struct g_bioq *bq)
103 {
104 
105 	mtx_lock(&bq->bio_queue_lock);
106 }
107 
108 static void
109 g_bioq_unlock(struct g_bioq *bq)
110 {
111 
112 	mtx_unlock(&bq->bio_queue_lock);
113 }
114 
115 #if 0
116 static void
117 g_bioq_destroy(struct g_bioq *bq)
118 {
119 
120 	mtx_destroy(&bq->bio_queue_lock);
121 }
122 #endif
123 
124 static void
125 g_bioq_init(struct g_bioq *bq)
126 {
127 
128 	TAILQ_INIT(&bq->bio_queue);
129 	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
130 }
131 
132 static struct bio *
133 g_bioq_first(struct g_bioq *bq)
134 {
135 	struct bio *bp;
136 
137 	bp = TAILQ_FIRST(&bq->bio_queue);
138 	if (bp != NULL) {
139 		KASSERT((bp->bio_flags & BIO_ONQUEUE),
140 		    ("Bio not on queue bp=%p target %p", bp, bq));
141 		bp->bio_flags &= ~BIO_ONQUEUE;
142 		TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
143 		bq->bio_queue_length--;
144 	}
145 	return (bp);
146 }
147 
148 struct bio *
149 g_new_bio(void)
150 {
151 	struct bio *bp;
152 
153 	bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
154 #ifdef KTR
155 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
156 		struct stack st;
157 
158 		CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
159 		stack_save(&st);
160 		CTRSTACK(KTR_GEOM, &st, 3);
161 	}
162 #endif
163 	return (bp);
164 }
165 
166 struct bio *
167 g_alloc_bio(void)
168 {
169 	struct bio *bp;
170 
171 	bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
172 #ifdef KTR
173 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
174 		struct stack st;
175 
176 		CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
177 		stack_save(&st);
178 		CTRSTACK(KTR_GEOM, &st, 3);
179 	}
180 #endif
181 	return (bp);
182 }
183 
184 void
185 g_destroy_bio(struct bio *bp)
186 {
187 #ifdef KTR
188 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
189 		struct stack st;
190 
191 		CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
192 		stack_save(&st);
193 		CTRSTACK(KTR_GEOM, &st, 3);
194 	}
195 #endif
196 	uma_zfree(biozone, bp);
197 }
198 
199 struct bio *
200 g_clone_bio(struct bio *bp)
201 {
202 	struct bio *bp2;
203 
204 	bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
205 	if (bp2 != NULL) {
206 		bp2->bio_parent = bp;
207 		bp2->bio_cmd = bp->bio_cmd;
208 		/*
209 		 *  BIO_ORDERED flag may be used by disk drivers to enforce
210 		 *  ordering restrictions, so this flag needs to be cloned.
211 		 *  BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
212 		 *  indicate which way the buffer is passed.
213 		 *  Other bio flags are not suitable for cloning.
214 		 */
215 		bp2->bio_flags = bp->bio_flags &
216 		    (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
217 		bp2->bio_length = bp->bio_length;
218 		bp2->bio_offset = bp->bio_offset;
219 		bp2->bio_data = bp->bio_data;
220 		bp2->bio_ma = bp->bio_ma;
221 		bp2->bio_ma_n = bp->bio_ma_n;
222 		bp2->bio_ma_offset = bp->bio_ma_offset;
223 		bp2->bio_attribute = bp->bio_attribute;
224 		if (bp->bio_cmd == BIO_ZONE)
225 			bcopy(&bp->bio_zone, &bp2->bio_zone,
226 			    sizeof(bp->bio_zone));
227 		/* Inherit classification info from the parent */
228 		bp2->bio_classifier1 = bp->bio_classifier1;
229 		bp2->bio_classifier2 = bp->bio_classifier2;
230 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
231 		bp2->bio_track_bp = bp->bio_track_bp;
232 #endif
233 		bp->bio_children++;
234 	}
235 #ifdef KTR
236 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
237 		struct stack st;
238 
239 		CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
240 		stack_save(&st);
241 		CTRSTACK(KTR_GEOM, &st, 3);
242 	}
243 #endif
244 	return(bp2);
245 }
246 
247 struct bio *
248 g_duplicate_bio(struct bio *bp)
249 {
250 	struct bio *bp2;
251 
252 	bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
253 	bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
254 	bp2->bio_parent = bp;
255 	bp2->bio_cmd = bp->bio_cmd;
256 	bp2->bio_length = bp->bio_length;
257 	bp2->bio_offset = bp->bio_offset;
258 	bp2->bio_data = bp->bio_data;
259 	bp2->bio_ma = bp->bio_ma;
260 	bp2->bio_ma_n = bp->bio_ma_n;
261 	bp2->bio_ma_offset = bp->bio_ma_offset;
262 	bp2->bio_attribute = bp->bio_attribute;
263 	bp->bio_children++;
264 #ifdef KTR
265 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
266 		struct stack st;
267 
268 		CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
269 		stack_save(&st);
270 		CTRSTACK(KTR_GEOM, &st, 3);
271 	}
272 #endif
273 	return(bp2);
274 }
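
/*
 * Illustrative sketch only (kept under #if 0, not compiled): the usual
 * pattern a geom's start() method uses with g_clone_bio(): clone the
 * request, point it at the consumer below, and let g_std_done() finish
 * the parent when the child completes.  The function name is made up.
 */
#if 0
static void
example_start(struct bio *bp)
{
	struct g_geom *gp;
	struct bio *cbp;

	gp = bp->bio_to->geom;
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	cbp->bio_done = g_std_done;
	g_io_request(cbp, LIST_FIRST(&gp->consumer));
}
#endif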
275 
276 void
277 g_reset_bio(struct bio *bp)
278 {
279 
280 	bzero(bp, sizeof(*bp));
281 }
282 
283 void
284 g_io_init(void)
285 {
286 
287 	g_bioq_init(&g_bio_run_down);
288 	g_bioq_init(&g_bio_run_up);
289 	biozone = uma_zcreate("g_bio", sizeof (struct bio),
290 	    NULL, NULL,
291 	    NULL, NULL,
292 	    0, 0);
293 }
294 
295 int
296 g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
297 {
298 	struct bio *bp;
299 	int error;
300 
301 	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
302 	bp = g_alloc_bio();
303 	bp->bio_cmd = BIO_GETATTR;
304 	bp->bio_done = NULL;
305 	bp->bio_attribute = attr;
306 	bp->bio_length = *len;
307 	bp->bio_data = ptr;
308 	g_io_request(bp, cp);
309 	error = biowait(bp, "ggetattr");
310 	*len = bp->bio_completed;
311 	g_destroy_bio(bp);
312 	return (error);
313 }
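
/*
 * Illustrative sketch only (kept under #if 0, not compiled): how a
 * consumer might use g_io_getattr() to query a provider attribute.
 * The helper name is made up; "GEOM::candelete" is a commonly used
 * attribute, but any int-sized attribute follows the same shape.
 */
#if 0
static int
example_provider_candelete(struct g_consumer *cp)
{
	int candelete, len, error;

	len = sizeof(candelete);
	error = g_io_getattr("GEOM::candelete", cp, &len, &candelete);
	if (error != 0)
		return (0);
	return (candelete != 0);
}
#endif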
314 
315 int
316 g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp)
317 {
318 	struct bio *bp;
319 	int error;
320 
321 	g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd);
322 	bp = g_alloc_bio();
323 	bp->bio_cmd = BIO_ZONE;
324 	bp->bio_done = NULL;
325 	/*
326 	 * XXX KDM need to handle report zone data.
327 	 */
328 	bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args));
329 	if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES)
330 		bp->bio_length =
331 		    zone_args->zone_params.report.entries_allocated *
332 		    sizeof(struct disk_zone_rep_entry);
333 	else
334 		bp->bio_length = 0;
335 
336 	g_io_request(bp, cp);
337 	error = biowait(bp, "gzone");
338 	bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args));
339 	g_destroy_bio(bp);
340 	return (error);
341 }
342 
343 int
344 g_io_flush(struct g_consumer *cp)
345 {
346 	struct bio *bp;
347 	int error;
348 
349 	g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
350 	bp = g_alloc_bio();
351 	bp->bio_cmd = BIO_FLUSH;
352 	bp->bio_flags |= BIO_ORDERED;
353 	bp->bio_done = NULL;
354 	bp->bio_attribute = NULL;
355 	bp->bio_offset = cp->provider->mediasize;
356 	bp->bio_length = 0;
357 	bp->bio_data = NULL;
358 	g_io_request(bp, cp);
359 	error = biowait(bp, "gflush");
360 	g_destroy_bio(bp);
361 	return (error);
362 }
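
/*
 * Illustrative sketch only (kept under #if 0, not compiled): issuing a
 * write barrier with g_io_flush() after a batch of writes.  The helper
 * name is made up; g_io_flush() itself blocks until the flush completes.
 */
#if 0
static int
example_flush_after_writes(struct g_consumer *cp)
{

	/* ... queue and wait for the writes first ... */
	return (g_io_flush(cp));
}
#endif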
363 
364 static int
365 g_io_check(struct bio *bp)
366 {
367 	struct g_consumer *cp;
368 	struct g_provider *pp;
369 	off_t excess;
370 	int error;
371 
372 	biotrack(bp, __func__);
373 
374 	cp = bp->bio_from;
375 	pp = bp->bio_to;
376 
377 	/* Fail if access counters don't allow the operation */
378 	switch(bp->bio_cmd) {
379 	case BIO_READ:
380 	case BIO_GETATTR:
381 		if (cp->acr == 0)
382 			return (EPERM);
383 		break;
384 	case BIO_WRITE:
385 	case BIO_DELETE:
386 	case BIO_FLUSH:
387 		if (cp->acw == 0)
388 			return (EPERM);
389 		break;
390 	case BIO_ZONE:
391 		if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) ||
392 		    (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) {
393 			if (cp->acr == 0)
394 				return (EPERM);
395 		} else if (cp->acw == 0)
396 			return (EPERM);
397 		break;
398 	default:
399 		return (EPERM);
400 	}
401 	/* If the provider is marked for error, don't disturb. */
402 	if (pp->error)
403 		return (pp->error);
404 	if (cp->flags & G_CF_ORPHAN)
405 		return (ENXIO);
406 
407 	switch(bp->bio_cmd) {
408 	case BIO_READ:
409 	case BIO_WRITE:
410 	case BIO_DELETE:
411 		/* Zero sectorsize or mediasize is probably a lack of media. */
412 		if (pp->sectorsize == 0 || pp->mediasize == 0)
413 			return (ENXIO);
414 		/* Reject I/O not on sector boundary */
415 		if (bp->bio_offset % pp->sectorsize)
416 			return (EINVAL);
417 		/* Reject I/O not integral sector long */
418 		if (bp->bio_length % pp->sectorsize)
419 			return (EINVAL);
420 		/* Reject requests before or past the end of media. */
421 		if (bp->bio_offset < 0)
422 			return (EIO);
423 		if (bp->bio_offset > pp->mediasize)
424 			return (EIO);
425 
426 		/* Truncate requests to the end of the provider's media. */
427 		excess = bp->bio_offset + bp->bio_length;
428 		if (excess > bp->bio_to->mediasize) {
429 			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
430 			    round_page(bp->bio_ma_offset +
431 			    bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
432 			    ("excess bio %p too short", bp));
433 			excess -= bp->bio_to->mediasize;
434 			bp->bio_length -= excess;
435 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
436 				bp->bio_ma_n = round_page(bp->bio_ma_offset +
437 				    bp->bio_length) / PAGE_SIZE;
438 			}
439 			if (excess > 0)
440 				CTR3(KTR_GEOM, "g_down truncated bio "
441 				    "%p provider %s by %d", bp,
442 				    bp->bio_to->name, excess);
443 		}
444 
445 		/* Deliver zero length transfers right here. */
446 		if (bp->bio_length == 0) {
447 			CTR2(KTR_GEOM, "g_down terminated 0-length "
448 			    "bp %p provider %s", bp, bp->bio_to->name);
449 			return (0);
450 		}
451 
452 		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
453 		    (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
454 		    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
455 			if ((error = g_io_transient_map_bio(bp)) >= 0)
456 				return (error);
457 		}
458 		break;
459 	default:
460 		break;
461 	}
462 	return (EJUSTRETURN);
463 }
464 
465 /*
466  * bio classification support.
467  *
468  * g_register_classifier() and g_unregister_classifier()
469  * are used to add/remove a classifier from the list.
470  * The list is protected using the g_bio_run_down lock,
471  * because the classifiers are called in this path.
472  *
473  * g_io_request() passes bio's that are not already classified
474  * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers().
475  * Classifiers can store their result in the two fields
476  * bio_classifier1 and bio_classifier2.
477  * A classifier that updates one of the fields should
478  * return a non-zero value.
479  * If no classifier updates the field, g_run_classifiers() sets
480  * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls.
481  */
482 
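/*
 * Illustrative sketch only (kept under #if 0, not compiled): what a
 * classifier hook might look like.  The function, hook name and tag
 * value are made up; the calling convention (return non-zero when a
 * classifier field was updated) follows the comment above.
 */
#if 0
static int
example_classify(void *arg, struct bio *bp)
{

	if (bp->bio_cmd != BIO_WRITE)
		return (0);		/* Did not classify this bio. */
	bp->bio_classifier1 = arg;	/* Record the classification. */
	return (1);			/* Field updated. */
}

static struct g_classifier_hook example_hook = {
	.func = example_classify,
	.arg = NULL,
};

/* Registered with g_register_classifier(&example_hook); */
#endif
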
483 int
484 g_register_classifier(struct g_classifier_hook *hook)
485 {
486 
487 	g_bioq_lock(&g_bio_run_down);
488 	TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link);
489 	g_bioq_unlock(&g_bio_run_down);
490 
491 	return (0);
492 }
493 
494 void
495 g_unregister_classifier(struct g_classifier_hook *hook)
496 {
497 	struct g_classifier_hook *entry;
498 
499 	g_bioq_lock(&g_bio_run_down);
500 	TAILQ_FOREACH(entry, &g_classifier_tailq, link) {
501 		if (entry == hook) {
502 			TAILQ_REMOVE(&g_classifier_tailq, hook, link);
503 			break;
504 		}
505 	}
506 	g_bioq_unlock(&g_bio_run_down);
507 }
508 
509 static void
510 g_run_classifiers(struct bio *bp)
511 {
512 	struct g_classifier_hook *hook;
513 	int classified = 0;
514 
515 	biotrack(bp, __func__);
516 
517 	TAILQ_FOREACH(hook, &g_classifier_tailq, link)
518 		classified |= hook->func(hook->arg, bp);
519 
520 	if (!classified)
521 		bp->bio_classifier1 = BIO_NOTCLASSIFIED;
522 }
523 
524 void
525 g_io_request(struct bio *bp, struct g_consumer *cp)
526 {
527 	struct g_provider *pp;
528 	struct mtx *mtxp;
529 	int direct, error, first;
530 	uint8_t cmd;
531 
532 	biotrack(bp, __func__);
533 
534 	KASSERT(cp != NULL, ("NULL cp in g_io_request"));
535 	KASSERT(bp != NULL, ("NULL bp in g_io_request"));
536 	pp = cp->provider;
537 	KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
538 #ifdef DIAGNOSTIC
539 	KASSERT(bp->bio_driver1 == NULL,
540 	    ("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
541 	KASSERT(bp->bio_driver2 == NULL,
542 	    ("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
543 	KASSERT(bp->bio_pflags == 0,
544 	    ("bio_pflags used by the consumer (geom %s)", cp->geom->name));
545 	/*
546 	 * Remember consumer's private fields, so we can detect if they were
547 	 * modified by the provider.
548 	 */
549 	bp->_bio_caller1 = bp->bio_caller1;
550 	bp->_bio_caller2 = bp->bio_caller2;
551 	bp->_bio_cflags = bp->bio_cflags;
552 #endif
553 
554 	cmd = bp->bio_cmd;
555 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) {
556 		KASSERT(bp->bio_data != NULL,
557 		    ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd));
558 	}
559 	if (cmd == BIO_DELETE || cmd == BIO_FLUSH) {
560 		KASSERT(bp->bio_data == NULL,
561 		    ("non-NULL bp->data in g_io_request(cmd=%hu)",
562 		    bp->bio_cmd));
563 	}
564 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) {
565 		KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
566 		    ("wrong offset %jd for sectorsize %u",
567 		    bp->bio_offset, cp->provider->sectorsize));
568 		KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
569 		    ("wrong length %jd for sectorsize %u",
570 		    bp->bio_length, cp->provider->sectorsize));
571 	}
572 
573 	g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
574 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
575 
576 	bp->bio_from = cp;
577 	bp->bio_to = pp;
578 	bp->bio_error = 0;
579 	bp->bio_completed = 0;
580 
581 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
582 	    ("Bio already on queue bp=%p", bp));
583 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
584 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
585 		binuptime(&bp->bio_t0);
586 	else
587 		getbinuptime(&bp->bio_t0);
588 
589 #ifdef GET_STACK_USAGE
590 	direct = (cp->flags & G_CF_DIRECT_SEND) != 0 &&
591 	    (pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
592 	    !g_is_geom_thread(curthread) &&
593 	    ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
594 	    (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
595 	    pace == 0;
596 	if (direct) {
597 		/* Block direct execution if less than half of stack left. */
598 		size_t	st, su;
599 		GET_STACK_USAGE(st, su);
600 		if (su * 2 > st)
601 			direct = 0;
602 	}
603 #else
604 	direct = 0;
605 #endif
606 
607 	if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) {
608 		g_bioq_lock(&g_bio_run_down);
609 		g_run_classifiers(bp);
610 		g_bioq_unlock(&g_bio_run_down);
611 	}
612 
613 	/*
614 	 * The statistics collection is lockless, as such, but we
615 	 * cannot update one instance of the statistics from more
616 	 * than one thread at a time, so grab the lock first.
617 	 */
618 	mtxp = mtx_pool_find(mtxpool_sleep, pp);
619 	mtx_lock(mtxp);
620 	if (g_collectstats & G_STATS_PROVIDERS)
621 		devstat_start_transaction(pp->stat, &bp->bio_t0);
622 	if (g_collectstats & G_STATS_CONSUMERS)
623 		devstat_start_transaction(cp->stat, &bp->bio_t0);
624 	pp->nstart++;
625 	cp->nstart++;
626 	mtx_unlock(mtxp);
627 
628 	if (direct) {
629 		error = g_io_check(bp);
630 		if (error >= 0) {
631 			CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
632 			    "provider %s returned %d", bp, bp->bio_to->name,
633 			    error);
634 			g_io_deliver(bp, error);
635 			return;
636 		}
637 		bp->bio_to->geom->start(bp);
638 	} else {
639 		g_bioq_lock(&g_bio_run_down);
640 		first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
641 		TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
642 		bp->bio_flags |= BIO_ONQUEUE;
643 		g_bio_run_down.bio_queue_length++;
644 		g_bioq_unlock(&g_bio_run_down);
645 		/* Pass it on down. */
646 		if (first)
647 			wakeup(&g_wait_down);
648 	}
649 }
650 
651 void
652 g_io_deliver(struct bio *bp, int error)
653 {
654 	struct bintime now;
655 	struct g_consumer *cp;
656 	struct g_provider *pp;
657 	struct mtx *mtxp;
658 	int direct, first;
659 
660 	biotrack(bp, __func__);
661 
662 	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
663 	pp = bp->bio_to;
664 	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
665 	cp = bp->bio_from;
666 	if (cp == NULL) {
667 		bp->bio_error = error;
668 		bp->bio_done(bp);
669 		return;
670 	}
671 	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
672 	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
673 #ifdef DIAGNOSTIC
674 	/*
675 	 * Some classes - GJournal in particular - can modify bio's
676 	 * private fields while the bio is in transit; the G_GEOM_VOLATILE_BIO
677 	 * flag means this is expected behaviour for that particular geom.
678 	 */
679 	if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
680 		KASSERT(bp->bio_caller1 == bp->_bio_caller1,
681 		    ("bio_caller1 used by the provider %s", pp->name));
682 		KASSERT(bp->bio_caller2 == bp->_bio_caller2,
683 		    ("bio_caller2 used by the provider %s", pp->name));
684 		KASSERT(bp->bio_cflags == bp->_bio_cflags,
685 		    ("bio_cflags used by the provider %s", pp->name));
686 	}
687 #endif
688 	KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
689 	KASSERT(bp->bio_completed <= bp->bio_length,
690 	    ("bio_completed can't be greater than bio_length"));
691 
692 	g_trace(G_T_BIO,
693 "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
694 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
695 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
696 
697 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
698 	    ("Bio already on queue bp=%p", bp));
699 
700 	/*
701 	 * XXX: the next two don't belong here
702 	 */
703 	bp->bio_bcount = bp->bio_length;
704 	bp->bio_resid = bp->bio_bcount - bp->bio_completed;
705 
706 #ifdef GET_STACK_USAGE
707 	direct = (pp->flags & G_PF_DIRECT_SEND) &&
708 		 (cp->flags & G_CF_DIRECT_RECEIVE) &&
709 		 !g_is_geom_thread(curthread);
710 	if (direct) {
711 		/* Block direct execution if less than half of stack left. */
712 		size_t	st, su;
713 		GET_STACK_USAGE(st, su);
714 		if (su * 2 > st)
715 			direct = 0;
716 	}
717 #else
718 	direct = 0;
719 #endif
720 
721 	/*
722 	 * The statistics collection is lockless, as such, but we
723 	 * cannot update one instance of the statistics from more
724 	 * than one thread at a time, so grab the lock first.
725 	 */
726 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
727 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
728 		binuptime(&now);
729 	mtxp = mtx_pool_find(mtxpool_sleep, cp);
730 	mtx_lock(mtxp);
731 	if (g_collectstats & G_STATS_PROVIDERS)
732 		devstat_end_transaction_bio_bt(pp->stat, bp, &now);
733 	if (g_collectstats & G_STATS_CONSUMERS)
734 		devstat_end_transaction_bio_bt(cp->stat, bp, &now);
735 	cp->nend++;
736 	pp->nend++;
737 	mtx_unlock(mtxp);
738 
739 	if (error != ENOMEM) {
740 		bp->bio_error = error;
741 		if (direct) {
742 			biodone(bp);
743 		} else {
744 			g_bioq_lock(&g_bio_run_up);
745 			first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
746 			TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
747 			bp->bio_flags |= BIO_ONQUEUE;
748 			g_bio_run_up.bio_queue_length++;
749 			g_bioq_unlock(&g_bio_run_up);
750 			if (first)
751 				wakeup(&g_wait_up);
752 		}
753 		return;
754 	}
755 
756 	if (bootverbose)
757 		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
758 	bp->bio_children = 0;
759 	bp->bio_inbed = 0;
760 	bp->bio_driver1 = NULL;
761 	bp->bio_driver2 = NULL;
762 	bp->bio_pflags = 0;
763 	g_io_request(bp, cp);
764 	pace = 1;
765 	return;
766 }
767 
768 SYSCTL_DECL(_kern_geom);
769 
770 static long transient_maps;
771 SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
772     &transient_maps, 0,
773     "Total count of the transient mapping requests");
774 u_int transient_map_retries = 10;
775 SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
776     &transient_map_retries, 0,
777     "Max count of retries used before giving up on creating transient map");
778 int transient_map_hard_failures;
779 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
780     &transient_map_hard_failures, 0,
781     "Failures to establish the transient mapping due to retry attempts "
782     "exhausted");
783 int transient_map_soft_failures;
784 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
785     &transient_map_soft_failures, 0,
786     "Count of retried failures to establish the transient mapping");
787 int inflight_transient_maps;
788 SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
789     &inflight_transient_maps, 0,
790     "Current count of the active transient maps");
791 
792 static int
793 g_io_transient_map_bio(struct bio *bp)
794 {
795 	vm_offset_t addr;
796 	long size;
797 	u_int retried;
798 
799 	KASSERT(unmapped_buf_allowed, ("unmapped disabled"));
800 
801 	size = round_page(bp->bio_ma_offset + bp->bio_length);
802 	KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
803 	addr = 0;
804 	retried = 0;
805 	atomic_add_long(&transient_maps, 1);
806 retry:
807 	if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
808 		if (transient_map_retries != 0 &&
809 		    retried >= transient_map_retries) {
810 			CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
811 			    bp, bp->bio_to->name);
812 			atomic_add_int(&transient_map_hard_failures, 1);
813 			return (EDEADLK/* XXXKIB */);
814 		} else {
815 			/*
816 			 * Naive attempt to quiesce the I/O to get more
817 			 * in-flight requests completed and defragment
818 			 * the transient_arena.
819 			 */
820 			CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
821 			    bp, bp->bio_to->name, retried);
822 			pause("g_d_tra", hz / 10);
823 			retried++;
824 			atomic_add_int(&transient_map_soft_failures, 1);
825 			goto retry;
826 		}
827 	}
828 	atomic_add_int(&inflight_transient_maps, 1);
829 	pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
830 	bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
831 	bp->bio_flags |= BIO_TRANSIENT_MAPPING;
832 	bp->bio_flags &= ~BIO_UNMAPPED;
833 	return (EJUSTRETURN);
834 }
835 
836 void
837 g_io_schedule_down(struct thread *tp __unused)
838 {
839 	struct bio *bp;
840 	int error;
841 
842 	for(;;) {
843 		g_bioq_lock(&g_bio_run_down);
844 		bp = g_bioq_first(&g_bio_run_down);
845 		if (bp == NULL) {
846 			CTR0(KTR_GEOM, "g_down going to sleep");
847 			msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
848 			    PRIBIO | PDROP, "-", 0);
849 			continue;
850 		}
851 		CTR0(KTR_GEOM, "g_down has work to do");
852 		g_bioq_unlock(&g_bio_run_down);
853 		biotrack(bp, __func__);
854 		if (pace != 0) {
855 			/*
856 			 * There has been at least one memory allocation
857 			 * failure since the last I/O completed. Pause 1ms to
858 			 * give the system a chance to free up memory. We only
859 			 * do this once because a large number of allocations
860 			 * can fail in the direct dispatch case and there's no
861 			 * relationship between the number of these failures and
862 			 * the length of the outage. If there's still an outage,
863 			 * we'll pause again and again until it's
864 			 * resolved. Older versions paused longer and once per
865 			 * allocation failure. This was OK for a single-threaded
866 			 * g_down, but with direct dispatch would lead to a max of
867 			 * 10 IOPS for minutes at a time when transient memory
868 			 * issues prevented allocation for a batch of requests
869 			 * from the upper layers.
870 			 *
871 			 * XXX This pacing is really lame. It needs to be solved
872 			 * by other methods. This is OK only because the worst
873 			 * case scenario is so rare. In the worst case scenario
874 			 * all memory is tied up waiting for I/O to complete
875 			 * which can never happen since we can't allocate bios
876 			 * for that I/O.
877 			 */
878 			CTR0(KTR_GEOM, "g_down pacing self");
879 			pause("g_down", min(hz/1000, 1));
880 			pace = 0;
881 		}
882 		CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
883 		    bp->bio_to->name);
884 		error = g_io_check(bp);
885 		if (error >= 0) {
886 			CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
887 			    "%s returned %d", bp, bp->bio_to->name, error);
888 			g_io_deliver(bp, error);
889 			continue;
890 		}
891 		THREAD_NO_SLEEPING();
892 		CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
893 		    "len %ld", bp, bp->bio_to->name, bp->bio_offset,
894 		    bp->bio_length);
895 		bp->bio_to->geom->start(bp);
896 		THREAD_SLEEPING_OK();
897 	}
898 }
899 
900 void
901 g_io_schedule_up(struct thread *tp __unused)
902 {
903 	struct bio *bp;
904 
905 	for(;;) {
906 		g_bioq_lock(&g_bio_run_up);
907 		bp = g_bioq_first(&g_bio_run_up);
908 		if (bp == NULL) {
909 			CTR0(KTR_GEOM, "g_up going to sleep");
910 			msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
911 			    PRIBIO | PDROP, "-", 0);
912 			continue;
913 		}
914 		g_bioq_unlock(&g_bio_run_up);
915 		THREAD_NO_SLEEPING();
916 		CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
917 		    "%jd len %ld", bp, bp->bio_to->name,
918 		    bp->bio_offset, bp->bio_length);
919 		biodone(bp);
920 		THREAD_SLEEPING_OK();
921 	}
922 }
923 
924 void *
925 g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
926 {
927 	struct bio *bp;
928 	void *ptr;
929 	int errorc;
930 
931 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
932 	    length <= MAXPHYS, ("g_read_data(): invalid length %jd",
933 	    (intmax_t)length));
934 
935 	bp = g_alloc_bio();
936 	bp->bio_cmd = BIO_READ;
937 	bp->bio_done = NULL;
938 	bp->bio_offset = offset;
939 	bp->bio_length = length;
940 	ptr = g_malloc(length, M_WAITOK);
941 	bp->bio_data = ptr;
942 	g_io_request(bp, cp);
943 	errorc = biowait(bp, "gread");
944 	if (error != NULL)
945 		*error = errorc;
946 	g_destroy_bio(bp);
947 	if (errorc) {
948 		g_free(ptr);
949 		ptr = NULL;
950 	}
951 	return (ptr);
952 }
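
/*
 * Illustrative sketch only (kept under #if 0, not compiled): a typical
 * g_read_data() call reading one sector from an open consumer.  The
 * helper name is made up; the buffer is allocated by g_read_data() and
 * must be released with g_free() by the caller.
 */
#if 0
static int
example_read_first_sector(struct g_consumer *cp)
{
	u_char *buf;
	int error;

	buf = g_read_data(cp, 0, cp->provider->sectorsize, &error);
	if (buf == NULL)
		return (error);
	/* ... examine the sector contents here ... */
	g_free(buf);
	return (0);
}
#endif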
953 
954 /*
955  * A read function for use by ffs_sbget when used by GEOM-layer routines.
956  */
957 int
958 g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size)
959 {
960 	struct g_consumer *cp;
961 
962 	KASSERT(*bufp == NULL,
963 	    ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp));
964 
965 	cp = (struct g_consumer *)devfd;
966 	/*
967 	 * Take care not to issue an invalid I/O request. The offset of
968 	 * the superblock candidate must be a multiple of the provider's
969 	 * sector size, otherwise an FFS can't exist on the provider
970 	 * anyway.
971 	 */
972 	if (loc % cp->provider->sectorsize != 0)
973 		return (ENOENT);
974 	*bufp = g_read_data(cp, loc, size, NULL);
975 	if (*bufp == NULL)
976 		return (ENOENT);
977 	return (0);
978 }
979 
980 int
981 g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
982 {
983 	struct bio *bp;
984 	int error;
985 
986 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
987 	    length <= MAXPHYS, ("g_write_data(): invalid length %jd",
988 	    (intmax_t)length));
989 
990 	bp = g_alloc_bio();
991 	bp->bio_cmd = BIO_WRITE;
992 	bp->bio_done = NULL;
993 	bp->bio_offset = offset;
994 	bp->bio_length = length;
995 	bp->bio_data = ptr;
996 	g_io_request(bp, cp);
997 	error = biowait(bp, "gwrite");
998 	g_destroy_bio(bp);
999 	return (error);
1000 }
1001 
1002 /*
1003  * A write function for use by ffs_sbput when used by GEOM-layer routines.
1004  */
1005 int
1006 g_use_g_write_data(void *devfd, off_t loc, void *buf, int size)
1007 {
1008 
1009 	return (g_write_data((struct g_consumer *)devfd, loc, buf, size));
1010 }
1011 
1012 int
1013 g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
1014 {
1015 	struct bio *bp;
1016 	int error;
1017 
1018 	KASSERT(length > 0 && length >= cp->provider->sectorsize,
1019 	    ("g_delete_data(): invalid length %jd", (intmax_t)length));
1020 
1021 	bp = g_alloc_bio();
1022 	bp->bio_cmd = BIO_DELETE;
1023 	bp->bio_done = NULL;
1024 	bp->bio_offset = offset;
1025 	bp->bio_length = length;
1026 	bp->bio_data = NULL;
1027 	g_io_request(bp, cp);
1028 	error = biowait(bp, "gdelete");
1029 	g_destroy_bio(bp);
1030 	return (error);
1031 }
1032 
1033 void
1034 g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix,
1035     ...)
1036 {
1037 #ifndef PRINTF_BUFR_SIZE
1038 #define PRINTF_BUFR_SIZE 64
1039 #endif
1040 	char bufr[PRINTF_BUFR_SIZE];
1041 	struct sbuf sb, *sbp __unused;
1042 	va_list ap;
1043 
1044 	sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN);
1045 	KASSERT(sbp != NULL, ("sbuf_new misused?"));
1046 
1047 	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
1048 
1049 	sbuf_cat(&sb, prefix);
1050 	g_format_bio(&sb, bp);
1051 
1052 	va_start(ap, fmtsuffix);
1053 	sbuf_vprintf(&sb, fmtsuffix, ap);
1054 	va_end(ap);
1055 
1056 	sbuf_nl_terminate(&sb);
1057 
1058 	sbuf_finish(&sb);
1059 	sbuf_delete(&sb);
1060 }
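
/*
 * Illustrative sketch only (kept under #if 0, not compiled): a typical
 * diagnostic use of g_print_bio().  The prefix and suffix strings are
 * made up; the prefix/bio/format-suffix calling convention is the one
 * implemented above.
 */
#if 0
static void
example_report_bio(struct bio *bp)
{

	g_print_bio("GEOM example: ", bp, " error=%d", bp->bio_error);
}
#endif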
1061 
1062 void
1063 g_format_bio(struct sbuf *sb, const struct bio *bp)
1064 {
1065 	const char *pname, *cmd = NULL;
1066 
1067 	if (bp->bio_to != NULL)
1068 		pname = bp->bio_to->name;
1069 	else
1070 		pname = "[unknown]";
1071 
1072 	switch (bp->bio_cmd) {
1073 	case BIO_GETATTR:
1074 		cmd = "GETATTR";
1075 		sbuf_printf(sb, "%s[%s(attr=%s)]", pname, cmd,
1076 		    bp->bio_attribute);
1077 		return;
1078 	case BIO_FLUSH:
1079 		cmd = "FLUSH";
1080 		sbuf_printf(sb, "%s[%s]", pname, cmd);
1081 		return;
1082 	case BIO_ZONE: {
1083 		const char *subcmd = NULL;
1084 		cmd = "ZONE";
1085 		switch (bp->bio_zone.zone_cmd) {
1086 		case DISK_ZONE_OPEN:
1087 			subcmd = "OPEN";
1088 			break;
1089 		case DISK_ZONE_CLOSE:
1090 			subcmd = "CLOSE";
1091 			break;
1092 		case DISK_ZONE_FINISH:
1093 			subcmd = "FINISH";
1094 			break;
1095 		case DISK_ZONE_RWP:
1096 			subcmd = "RWP";
1097 			break;
1098 		case DISK_ZONE_REPORT_ZONES:
1099 			subcmd = "REPORT ZONES";
1100 			break;
1101 		case DISK_ZONE_GET_PARAMS:
1102 			subcmd = "GET PARAMS";
1103 			break;
1104 		default:
1105 			subcmd = "UNKNOWN";
1106 			break;
1107 		}
1108 		sbuf_printf(sb, "%s[%s,%s]", pname, cmd, subcmd);
1109 		return;
1110 	}
1111 	case BIO_READ:
1112 		cmd = "READ";
1113 		break;
1114 	case BIO_WRITE:
1115 		cmd = "WRITE";
1116 		break;
1117 	case BIO_DELETE:
1118 		cmd = "DELETE";
1119 		break;
1120 	default:
1121 		cmd = "UNKNOWN";
1122 		sbuf_printf(sb, "%s[%s()]", pname, cmd);
1123 		return;
1124 	}
1125 	sbuf_printf(sb, "%s[%s(offset=%jd, length=%jd)]", pname, cmd,
1126 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
1127 }
1128