xref: /freebsd/sys/geom/geom_io.c (revision 38f0b757fd84d17d0fc24739a7cda160c4516d81)
1 /*-
2  * Copyright (c) 2002 Poul-Henning Kamp
3  * Copyright (c) 2002 Networks Associates Technology, Inc.
4  * Copyright (c) 2013 The FreeBSD Foundation
5  * All rights reserved.
6  *
7  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
8  * and NAI Labs, the Security Research Division of Network Associates, Inc.
9  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
10  * DARPA CHATS research program.
11  *
12  * Portions of this software were developed by Konstantin Belousov
13  * under sponsorship from the FreeBSD Foundation.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 3. The names of the authors may not be used to endorse or promote
24  *    products derived from this software without specific prior written
25  *    permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  */
39 
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/malloc.h>
47 #include <sys/bio.h>
48 #include <sys/ktr.h>
49 #include <sys/proc.h>
50 #include <sys/stack.h>
51 #include <sys/sysctl.h>
52 #include <sys/vmem.h>
53 
54 #include <sys/errno.h>
55 #include <geom/geom.h>
56 #include <geom/geom_int.h>
57 #include <sys/devicestat.h>
58 
59 #include <vm/uma.h>
60 #include <vm/vm.h>
61 #include <vm/vm_param.h>
62 #include <vm/vm_kern.h>
63 #include <vm/vm_page.h>
64 #include <vm/vm_object.h>
65 #include <vm/vm_extern.h>
66 #include <vm/vm_map.h>
67 
68 static int	g_io_transient_map_bio(struct bio *bp);
69 
70 static struct g_bioq g_bio_run_down;
71 static struct g_bioq g_bio_run_up;
72 static struct g_bioq g_bio_run_task;
73 
74 static u_int pace;
75 static uma_zone_t	biozone;
76 
77 /*
78  * The head of the list of classifiers used in g_io_request.
79  * Use g_register_classifier() and g_unregister_classifier()
80  * to add entries to or remove them from the list.
81  * Classifiers are invoked in registration order.
82  */
83 static TAILQ_HEAD(g_classifier_tailq, g_classifier_hook)
84     g_classifier_tailq = TAILQ_HEAD_INITIALIZER(g_classifier_tailq);
85 
86 #include <machine/atomic.h>
87 
88 static void
89 g_bioq_lock(struct g_bioq *bq)
90 {
91 
92 	mtx_lock(&bq->bio_queue_lock);
93 }
94 
95 static void
96 g_bioq_unlock(struct g_bioq *bq)
97 {
98 
99 	mtx_unlock(&bq->bio_queue_lock);
100 }
101 
102 #if 0
103 static void
104 g_bioq_destroy(struct g_bioq *bq)
105 {
106 
107 	mtx_destroy(&bq->bio_queue_lock);
108 }
109 #endif
110 
111 static void
112 g_bioq_init(struct g_bioq *bq)
113 {
114 
115 	TAILQ_INIT(&bq->bio_queue);
116 	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
117 }
118 
119 static struct bio *
120 g_bioq_first(struct g_bioq *bq)
121 {
122 	struct bio *bp;
123 
124 	bp = TAILQ_FIRST(&bq->bio_queue);
125 	if (bp != NULL) {
126 		KASSERT((bp->bio_flags & BIO_ONQUEUE),
127 		    ("Bio not on queue bp=%p target %p", bp, bq));
128 		bp->bio_flags &= ~BIO_ONQUEUE;
129 		TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
130 		bq->bio_queue_length--;
131 	}
132 	return (bp);
133 }
134 
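/*
 * Allocate a zeroed bio without sleeping (M_NOWAIT); may return NULL
 * when memory is short, so callers must be prepared to handle failure.
 */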
135 struct bio *
136 g_new_bio(void)
137 {
138 	struct bio *bp;
139 
140 	bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
141 #ifdef KTR
142 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
143 		struct stack st;
144 
145 		CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
146 		stack_save(&st);
147 		CTRSTACK(KTR_GEOM, &st, 3, 0);
148 	}
149 #endif
150 	return (bp);
151 }
152 
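/*
 * Allocate a zeroed bio, sleeping until memory is available (M_WAITOK);
 * never returns NULL.
 */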
153 struct bio *
154 g_alloc_bio(void)
155 {
156 	struct bio *bp;
157 
158 	bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
159 #ifdef KTR
160 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
161 		struct stack st;
162 
163 		CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
164 		stack_save(&st);
165 		CTRSTACK(KTR_GEOM, &st, 3, 0);
166 	}
167 #endif
168 	return (bp);
169 }
170 
171 void
172 g_destroy_bio(struct bio *bp)
173 {
174 #ifdef KTR
175 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
176 		struct stack st;
177 
178 		CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
179 		stack_save(&st);
180 		CTRSTACK(KTR_GEOM, &st, 3, 0);
181 	}
182 #endif
183 	uma_zfree(biozone, bp);
184 }
185 
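/*
 * Create a child bio for passing the request one level down: copy the
 * command, offset, length, data pointers, attribute, classification info
 * and the flags that must be inherited, and bump the parent's
 * bio_children count.  The allocation uses M_NOWAIT, so NULL may be
 * returned.
 */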
186 struct bio *
187 g_clone_bio(struct bio *bp)
188 {
189 	struct bio *bp2;
190 
191 	bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
192 	if (bp2 != NULL) {
193 		bp2->bio_parent = bp;
194 		bp2->bio_cmd = bp->bio_cmd;
195 		/*
196 		 *  BIO_ORDERED flag may be used by disk drivers to enforce
197 		 *  ordering restrictions, so this flag needs to be cloned.
198 		 *  BIO_UNMAPPED should be inherited, to properly indicate
199 		 *  which way the buffer is passed.
200 		 *  Other bio flags are not suitable for cloning.
201 		 */
202 		bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED);
203 		bp2->bio_length = bp->bio_length;
204 		bp2->bio_offset = bp->bio_offset;
205 		bp2->bio_data = bp->bio_data;
206 		bp2->bio_ma = bp->bio_ma;
207 		bp2->bio_ma_n = bp->bio_ma_n;
208 		bp2->bio_ma_offset = bp->bio_ma_offset;
209 		bp2->bio_attribute = bp->bio_attribute;
210 		/* Inherit classification info from the parent */
211 		bp2->bio_classifier1 = bp->bio_classifier1;
212 		bp2->bio_classifier2 = bp->bio_classifier2;
213 		bp->bio_children++;
214 	}
215 #ifdef KTR
216 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
217 		struct stack st;
218 
219 		CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
220 		stack_save(&st);
221 		CTRSTACK(KTR_GEOM, &st, 3, 0);
222 	}
223 #endif
224 	return(bp2);
225 }
226 
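/*
 * Like g_clone_bio(), but the allocation uses M_WAITOK and therefore
 * cannot fail.
 */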
227 struct bio *
228 g_duplicate_bio(struct bio *bp)
229 {
230 	struct bio *bp2;
231 
232 	bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
233 	bp2->bio_flags = bp->bio_flags & BIO_UNMAPPED;
234 	bp2->bio_parent = bp;
235 	bp2->bio_cmd = bp->bio_cmd;
236 	bp2->bio_length = bp->bio_length;
237 	bp2->bio_offset = bp->bio_offset;
238 	bp2->bio_data = bp->bio_data;
239 	bp2->bio_ma = bp->bio_ma;
240 	bp2->bio_ma_n = bp->bio_ma_n;
241 	bp2->bio_ma_offset = bp->bio_ma_offset;
242 	bp2->bio_attribute = bp->bio_attribute;
243 	bp->bio_children++;
244 #ifdef KTR
245 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
246 		struct stack st;
247 
248 		CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
249 		stack_save(&st);
250 		CTRSTACK(KTR_GEOM, &st, 3, 0);
251 	}
252 #endif
253 	return(bp2);
254 }
255 
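/*
 * Called once at GEOM startup: set up the down, up and task queues and
 * create the UMA zone that bios are allocated from.
 */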
256 void
257 g_io_init()
258 {
259 
260 	g_bioq_init(&g_bio_run_down);
261 	g_bioq_init(&g_bio_run_up);
262 	g_bioq_init(&g_bio_run_task);
263 	biozone = uma_zcreate("g_bio", sizeof (struct bio),
264 	    NULL, NULL,
265 	    NULL, NULL,
266 	    0, 0);
267 }
268 
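/*
 * Synchronously issue a BIO_GETATTR request for "attr" and wait for it
 * to complete.  On return *len holds the number of bytes delivered.
 */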
269 int
270 g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
271 {
272 	struct bio *bp;
273 	int error;
274 
275 	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
276 	bp = g_alloc_bio();
277 	bp->bio_cmd = BIO_GETATTR;
278 	bp->bio_done = NULL;
279 	bp->bio_attribute = attr;
280 	bp->bio_length = *len;
281 	bp->bio_data = ptr;
282 	g_io_request(bp, cp);
283 	error = biowait(bp, "ggetattr");
284 	*len = bp->bio_completed;
285 	g_destroy_bio(bp);
286 	return (error);
287 }
288 
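/*
 * Synchronously issue an ordered BIO_FLUSH request and wait for it to
 * complete.  Returns the error reported by the provider.
 */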
289 int
290 g_io_flush(struct g_consumer *cp)
291 {
292 	struct bio *bp;
293 	int error;
294 
295 	g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
296 	bp = g_alloc_bio();
297 	bp->bio_cmd = BIO_FLUSH;
298 	bp->bio_flags |= BIO_ORDERED;
299 	bp->bio_done = NULL;
300 	bp->bio_attribute = NULL;
301 	bp->bio_offset = cp->provider->mediasize;
302 	bp->bio_length = 0;
303 	bp->bio_data = NULL;
304 	g_io_request(bp, cp);
305 	error = biowait(bp, "gflush");
306 	g_destroy_bio(bp);
307 	return (error);
308 }
309 
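/*
 * Sanity-check a bio before it is handed to the provider: verify the
 * consumer's access counts and the provider's state, reject misaligned
 * requests, truncate requests extending past the end of the media and,
 * if necessary, map unmapped bios for providers that cannot handle them.
 * A return value >= 0 means the bio must be completed with that error
 * via g_io_deliver() (0 terminates zero-length transfers successfully);
 * a negative value (EJUSTRETURN) means the bio should be passed to the
 * provider's start routine.
 */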
310 static int
311 g_io_check(struct bio *bp)
312 {
313 	struct g_consumer *cp;
314 	struct g_provider *pp;
315 	off_t excess;
316 	int error;
317 
318 	cp = bp->bio_from;
319 	pp = bp->bio_to;
320 
321 	/* Fail if the access counters don't allow the operation */
322 	switch(bp->bio_cmd) {
323 	case BIO_READ:
324 	case BIO_GETATTR:
325 		if (cp->acr == 0)
326 			return (EPERM);
327 		break;
328 	case BIO_WRITE:
329 	case BIO_DELETE:
330 	case BIO_FLUSH:
331 		if (cp->acw == 0)
332 			return (EPERM);
333 		break;
334 	default:
335 		return (EPERM);
336 	}
337 	/* If the provider is marked with an error, don't disturb it. */
338 	if (pp->error)
339 		return (pp->error);
340 	if (cp->flags & G_CF_ORPHAN)
341 		return (ENXIO);
342 
343 	switch(bp->bio_cmd) {
344 	case BIO_READ:
345 	case BIO_WRITE:
346 	case BIO_DELETE:
347 		/* Zero sectorsize or mediasize is probably a lack of media. */
348 		if (pp->sectorsize == 0 || pp->mediasize == 0)
349 			return (ENXIO);
350 		/* Reject I/O not on sector boundary */
351 		if (bp->bio_offset % pp->sectorsize)
352 			return (EINVAL);
353 		/* Reject I/O that is not an integral number of sectors long */
354 		if (bp->bio_length % pp->sectorsize)
355 			return (EINVAL);
356 		/* Reject requests before the start or past the end of the media. */
357 		if (bp->bio_offset < 0)
358 			return (EIO);
359 		if (bp->bio_offset > pp->mediasize)
360 			return (EIO);
361 
362 		/* Truncate requests to the end of the provider's media. */
363 		excess = bp->bio_offset + bp->bio_length;
364 		if (excess > bp->bio_to->mediasize) {
365 			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
366 			    round_page(bp->bio_ma_offset +
367 			    bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
368 			    ("excess bio %p too short", bp));
369 			excess -= bp->bio_to->mediasize;
370 			bp->bio_length -= excess;
371 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
372 				bp->bio_ma_n = round_page(bp->bio_ma_offset +
373 				    bp->bio_length) / PAGE_SIZE;
374 			}
375 			if (excess > 0)
376 				CTR3(KTR_GEOM, "g_down truncated bio "
377 				    "%p provider %s by %d", bp,
378 				    bp->bio_to->name, excess);
379 		}
380 
381 		/* Deliver zero-length transfers right here. */
382 		if (bp->bio_length == 0) {
383 			CTR2(KTR_GEOM, "g_down terminated 0-length "
384 			    "bp %p provider %s", bp, bp->bio_to->name);
385 			return (0);
386 		}
387 
388 		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
389 		    (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
390 		    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
391 			if ((error = g_io_transient_map_bio(bp)) >= 0)
392 				return (error);
393 		}
394 		break;
395 	default:
396 		break;
397 	}
398 	return (EJUSTRETURN);
399 }
400 
401 /*
402  * bio classification support.
403  *
404  * g_register_classifier() and g_unregister_classifier()
405  * are used to add/remove a classifier from the list.
406  * The list is protected using the g_bio_run_down lock,
407  * because the classifiers are called in this path.
408  *
409  * g_io_request() passes bios that are not already classified
410  * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers().
411  * Classifiers can store their result in the two fields
412  * bio_classifier1 and bio_classifier2.
413  * A classifier that updates one of the fields should
414  * return a non-zero value.
415  * If no classifier updates the field, g_run_classifiers() sets
416  * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls.
417  */
418 
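/*
 * Illustrative sketch (not part of the original source): a minimal
 * classifier hook that tags read requests.  The field names follow
 * struct g_classifier_hook as used by g_run_classifiers() below; the
 * identifiers my_classify, my_hook and my_token are made up for the
 * example.
 *
 *	static int
 *	my_classify(void *arg, struct bio *bp)
 *	{
 *
 *		if (bp->bio_cmd != BIO_READ)
 *			return (0);
 *		bp->bio_classifier1 = arg;
 *		return (1);
 *	}
 *
 *	static struct g_classifier_hook my_hook = {
 *		.func = my_classify,
 *		.arg = &my_token,
 *	};
 *
 *	error = g_register_classifier(&my_hook);
 */
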
419 int
420 g_register_classifier(struct g_classifier_hook *hook)
421 {
422 
423 	g_bioq_lock(&g_bio_run_down);
424 	TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link);
425 	g_bioq_unlock(&g_bio_run_down);
426 
427 	return (0);
428 }
429 
430 void
431 g_unregister_classifier(struct g_classifier_hook *hook)
432 {
433 	struct g_classifier_hook *entry;
434 
435 	g_bioq_lock(&g_bio_run_down);
436 	TAILQ_FOREACH(entry, &g_classifier_tailq, link) {
437 		if (entry == hook) {
438 			TAILQ_REMOVE(&g_classifier_tailq, hook, link);
439 			break;
440 		}
441 	}
442 	g_bioq_unlock(&g_bio_run_down);
443 }
444 
445 static void
446 g_run_classifiers(struct bio *bp)
447 {
448 	struct g_classifier_hook *hook;
449 	int classified = 0;
450 
451 	TAILQ_FOREACH(hook, &g_classifier_tailq, link)
452 		classified |= hook->func(hook->arg, bp);
453 
454 	if (!classified)
455 		bp->bio_classifier1 = BIO_NOTCLASSIFIED;
456 }
457 
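/*
 * Issue a bio from a consumer to its provider.  After sanity checks,
 * classification and statistics bookkeeping the bio is either dispatched
 * directly to the provider's start routine (when direct dispatch is
 * permitted and enough kernel stack is left) or queued for the g_down
 * thread.
 */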
458 void
459 g_io_request(struct bio *bp, struct g_consumer *cp)
460 {
461 	struct g_provider *pp;
462 	struct mtx *mtxp;
463 	int direct, error, first;
464 
465 	KASSERT(cp != NULL, ("NULL cp in g_io_request"));
466 	KASSERT(bp != NULL, ("NULL bp in g_io_request"));
467 	pp = cp->provider;
468 	KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
469 #ifdef DIAGNOSTIC
470 	KASSERT(bp->bio_driver1 == NULL,
471 	    ("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
472 	KASSERT(bp->bio_driver2 == NULL,
473 	    ("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
474 	KASSERT(bp->bio_pflags == 0,
475 	    ("bio_pflags used by the consumer (geom %s)", cp->geom->name));
476 	/*
477 	 * Remember consumer's private fields, so we can detect if they were
478 	 * modified by the provider.
479 	 */
480 	bp->_bio_caller1 = bp->bio_caller1;
481 	bp->_bio_caller2 = bp->bio_caller2;
482 	bp->_bio_cflags = bp->bio_cflags;
483 #endif
484 
485 	if (bp->bio_cmd & (BIO_READ|BIO_WRITE|BIO_GETATTR)) {
486 		KASSERT(bp->bio_data != NULL,
487 		    ("NULL bp->data in g_io_request(cmd=%hhu)", bp->bio_cmd));
488 	}
489 	if (bp->bio_cmd & (BIO_DELETE|BIO_FLUSH)) {
490 		KASSERT(bp->bio_data == NULL,
491 		    ("non-NULL bp->data in g_io_request(cmd=%hhu)",
492 		    bp->bio_cmd));
493 	}
494 	if (bp->bio_cmd & (BIO_READ|BIO_WRITE|BIO_DELETE)) {
495 		KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
496 		    ("wrong offset %jd for sectorsize %u",
497 		    bp->bio_offset, cp->provider->sectorsize));
498 		KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
499 		    ("wrong length %jd for sectorsize %u",
500 		    bp->bio_length, cp->provider->sectorsize));
501 	}
502 
503 	g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
504 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
505 
506 	bp->bio_from = cp;
507 	bp->bio_to = pp;
508 	bp->bio_error = 0;
509 	bp->bio_completed = 0;
510 
511 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
512 	    ("Bio already on queue bp=%p", bp));
513 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
514 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
515 		binuptime(&bp->bio_t0);
516 	else
517 		getbinuptime(&bp->bio_t0);
518 
519 #ifdef GET_STACK_USAGE
520 	direct = (cp->flags & G_CF_DIRECT_SEND) &&
521 		 (pp->flags & G_PF_DIRECT_RECEIVE) &&
522 		 !g_is_geom_thread(curthread) &&
523 		 (((pp->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
524 		   (bp->bio_flags & BIO_UNMAPPED) != 0) || THREAD_CAN_SLEEP());
525 	if (direct) {
526 		/* Block direct execution if less than half of the stack is left. */
527 		size_t	st, su;
528 		GET_STACK_USAGE(st, su);
529 		if (su * 2 > st)
530 			direct = 0;
531 	}
532 #else
533 	direct = 0;
534 #endif
535 
536 	if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) {
537 		g_bioq_lock(&g_bio_run_down);
538 		g_run_classifiers(bp);
539 		g_bioq_unlock(&g_bio_run_down);
540 	}
541 
542 	/*
543 	 * The statistics collection is itself lockless, but we
544 	 * cannot update one instance of the statistics from more
545 	 * than one thread at a time, so grab the lock first.
546 	 */
547 	mtxp = mtx_pool_find(mtxpool_sleep, pp);
548 	mtx_lock(mtxp);
549 	if (g_collectstats & G_STATS_PROVIDERS)
550 		devstat_start_transaction(pp->stat, &bp->bio_t0);
551 	if (g_collectstats & G_STATS_CONSUMERS)
552 		devstat_start_transaction(cp->stat, &bp->bio_t0);
553 	pp->nstart++;
554 	cp->nstart++;
555 	mtx_unlock(mtxp);
556 
557 	if (direct) {
558 		error = g_io_check(bp);
559 		if (error >= 0) {
560 			CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
561 			    "provider %s returned %d", bp, bp->bio_to->name,
562 			    error);
563 			g_io_deliver(bp, error);
564 			return;
565 		}
566 		bp->bio_to->geom->start(bp);
567 	} else {
568 		g_bioq_lock(&g_bio_run_down);
569 		first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
570 		TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
571 		bp->bio_flags |= BIO_ONQUEUE;
572 		g_bio_run_down.bio_queue_length++;
573 		g_bioq_unlock(&g_bio_run_down);
574 		/* Pass it on down. */
575 		if (first)
576 			wakeup(&g_wait_down);
577 	}
578 }
579 
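/*
 * Complete a bio and hand it back to the consumer.  Statistics are
 * updated and the bio is either finished directly via biodone() or
 * queued for the g_up thread.  An ENOMEM error is not delivered;
 * instead the request is re-issued and g_down is asked to pace itself.
 */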
580 void
581 g_io_deliver(struct bio *bp, int error)
582 {
583 	struct bintime now;
584 	struct g_consumer *cp;
585 	struct g_provider *pp;
586 	struct mtx *mtxp;
587 	int direct, first;
588 
589 	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
590 	pp = bp->bio_to;
591 	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
592 	cp = bp->bio_from;
593 	if (cp == NULL) {
594 		bp->bio_error = error;
595 		bp->bio_done(bp);
596 		return;
597 	}
598 	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
599 	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
600 #ifdef DIAGNOSTIC
601 	/*
602 	 * Some classes - GJournal in particular - can modify a bio's
603 	 * private fields while the bio is in transit; the G_GEOM_VOLATILE_BIO
604 	 * flag means this is expected behaviour for that particular geom.
605 	 */
606 	if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
607 		KASSERT(bp->bio_caller1 == bp->_bio_caller1,
608 		    ("bio_caller1 used by the provider %s", pp->name));
609 		KASSERT(bp->bio_caller2 == bp->_bio_caller2,
610 		    ("bio_caller2 used by the provider %s", pp->name));
611 		KASSERT(bp->bio_cflags == bp->_bio_cflags,
612 		    ("bio_cflags used by the provider %s", pp->name));
613 	}
614 #endif
615 	KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
616 	KASSERT(bp->bio_completed <= bp->bio_length,
617 	    ("bio_completed can't be greater than bio_length"));
618 
619 	g_trace(G_T_BIO,
620 "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
621 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
622 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
623 
624 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
625 	    ("Bio already on queue bp=%p", bp));
626 
627 	/*
628 	 * XXX: the next two don't belong here
629 	 */
630 	bp->bio_bcount = bp->bio_length;
631 	bp->bio_resid = bp->bio_bcount - bp->bio_completed;
632 
633 #ifdef GET_STACK_USAGE
634 	direct = (pp->flags & G_PF_DIRECT_SEND) &&
635 		 (cp->flags & G_CF_DIRECT_RECEIVE) &&
636 		 !g_is_geom_thread(curthread);
637 	if (direct) {
638 		/* Block direct execution if less than half of the stack is left. */
639 		size_t	st, su;
640 		GET_STACK_USAGE(st, su);
641 		if (su * 2 > st)
642 			direct = 0;
643 	}
644 #else
645 	direct = 0;
646 #endif
647 
648 	/*
649 	 * The statistics collection is itself lockless, but we
650 	 * cannot update one instance of the statistics from more
651 	 * than one thread at a time, so grab the lock first.
652 	 */
653 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
654 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
655 		binuptime(&now);
656 	mtxp = mtx_pool_find(mtxpool_sleep, cp);
657 	mtx_lock(mtxp);
658 	if (g_collectstats & G_STATS_PROVIDERS)
659 		devstat_end_transaction_bio_bt(pp->stat, bp, &now);
660 	if (g_collectstats & G_STATS_CONSUMERS)
661 		devstat_end_transaction_bio_bt(cp->stat, bp, &now);
662 	cp->nend++;
663 	pp->nend++;
664 	mtx_unlock(mtxp);
665 
666 	if (error != ENOMEM) {
667 		bp->bio_error = error;
668 		if (direct) {
669 			biodone(bp);
670 		} else {
671 			g_bioq_lock(&g_bio_run_up);
672 			first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
673 			TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
674 			bp->bio_flags |= BIO_ONQUEUE;
675 			g_bio_run_up.bio_queue_length++;
676 			g_bioq_unlock(&g_bio_run_up);
677 			if (first)
678 				wakeup(&g_wait_up);
679 		}
680 		return;
681 	}
682 
683 	if (bootverbose)
684 		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
685 	bp->bio_children = 0;
686 	bp->bio_inbed = 0;
687 	bp->bio_driver1 = NULL;
688 	bp->bio_driver2 = NULL;
689 	bp->bio_pflags = 0;
690 	g_io_request(bp, cp);
691 	pace++;
692 	return;
693 }
694 
695 SYSCTL_DECL(_kern_geom);
696 
697 static long transient_maps;
698 SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
699     &transient_maps, 0,
700     "Total count of the transient mapping requests");
701 u_int transient_map_retries = 10;
702 SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
703     &transient_map_retries, 0,
704     "Max count of retries used before giving up on creating transient map");
705 int transient_map_hard_failures;
706 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
707     &transient_map_hard_failures, 0,
708     "Failures to establish the transient mapping after the retry attempts "
709     "were exhausted");
710 int transient_map_soft_failures;
711 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
712     &transient_map_soft_failures, 0,
713     "Count of retried failures to establish the transient mapping");
714 int inflight_transient_maps;
715 SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
716     &inflight_transient_maps, 0,
717     "Current count of the active transient maps");
718 
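/*
 * Map the pages of an unmapped bio into the transient KVA arena so that
 * a provider which cannot handle unmapped I/O can reach the data through
 * bio_data.  Returns EJUSTRETURN when the mapping was established (the
 * bio continues down the stack) and EDEADLK when the configured number
 * of retries has been exhausted.
 */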
719 static int
720 g_io_transient_map_bio(struct bio *bp)
721 {
722 	vm_offset_t addr;
723 	long size;
724 	u_int retried;
725 
726 	KASSERT(unmapped_buf_allowed, ("unmapped disabled"));
727 
728 	size = round_page(bp->bio_ma_offset + bp->bio_length);
729 	KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
730 	addr = 0;
731 	retried = 0;
732 	atomic_add_long(&transient_maps, 1);
733 retry:
734 	if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
735 		if (transient_map_retries != 0 &&
736 		    retried >= transient_map_retries) {
737 			CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
738 			    bp, bp->bio_to->name);
739 			atomic_add_int(&transient_map_hard_failures, 1);
740 			return (EDEADLK/* XXXKIB */);
741 		} else {
742 			/*
743 			 * Naive attempt to quiesce the I/O to get more
744 			 * in-flight requests completed and defragment
745 			 * the transient_arena.
746 			 */
747 			CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
748 			    bp, bp->bio_to->name, retried);
749 			pause("g_d_tra", hz / 10);
750 			retried++;
751 			atomic_add_int(&transient_map_soft_failures, 1);
752 			goto retry;
753 		}
754 	}
755 	atomic_add_int(&inflight_transient_maps, 1);
756 	pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
757 	bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
758 	bp->bio_flags |= BIO_TRANSIENT_MAPPING;
759 	bp->bio_flags &= ~BIO_UNMAPPED;
760 	return (EJUSTRETURN);
761 }
762 
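/*
 * Main loop of the g_down thread: dequeue bios from the down queue,
 * validate them with g_io_check() and hand them to the provider's start
 * routine, pacing itself when asked to by g_io_deliver().
 */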
763 void
764 g_io_schedule_down(struct thread *tp __unused)
765 {
766 	struct bio *bp;
767 	int error;
768 
769 	for(;;) {
770 		g_bioq_lock(&g_bio_run_down);
771 		bp = g_bioq_first(&g_bio_run_down);
772 		if (bp == NULL) {
773 			CTR0(KTR_GEOM, "g_down going to sleep");
774 			msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
775 			    PRIBIO | PDROP, "-", 0);
776 			continue;
777 		}
778 		CTR0(KTR_GEOM, "g_down has work to do");
779 		g_bioq_unlock(&g_bio_run_down);
780 		if (pace > 0) {
781 			CTR1(KTR_GEOM, "g_down pacing self (pace %d)", pace);
782 			pause("g_down", hz/10);
783 			pace--;
784 		}
785 		CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
786 		    bp->bio_to->name);
787 		error = g_io_check(bp);
788 		if (error >= 0) {
789 			CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
790 			    "%s returned %d", bp, bp->bio_to->name, error);
791 			g_io_deliver(bp, error);
792 			continue;
793 		}
794 		THREAD_NO_SLEEPING();
795 		CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
796 		    "len %ld", bp, bp->bio_to->name, bp->bio_offset,
797 		    bp->bio_length);
798 		bp->bio_to->geom->start(bp);
799 		THREAD_SLEEPING_OK();
800 	}
801 }
802 
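/*
 * Queue a task to be executed by the g_up thread.  The task queue is
 * serviced ahead of regular completions and shares the up queue lock.
 */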
803 void
804 bio_taskqueue(struct bio *bp, bio_task_t *func, void *arg)
805 {
806 	bp->bio_task = func;
807 	bp->bio_task_arg = arg;
808 	/*
809 	 * The taskqueue is actually just a second queue off the "up"
810 	 * queue, so we use the same lock.
811 	 */
812 	g_bioq_lock(&g_bio_run_up);
813 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
814 	    ("Bio already on queue bp=%p target taskq", bp));
815 	bp->bio_flags |= BIO_ONQUEUE;
816 	TAILQ_INSERT_TAIL(&g_bio_run_task.bio_queue, bp, bio_queue);
817 	g_bio_run_task.bio_queue_length++;
818 	wakeup(&g_wait_up);
819 	g_bioq_unlock(&g_bio_run_up);
820 }
821 
822 
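/*
 * Main loop of the g_up thread: run queued tasks first, then complete
 * queued bios by calling biodone(), sleeping when both queues are empty.
 */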
823 void
824 g_io_schedule_up(struct thread *tp __unused)
825 {
826 	struct bio *bp;
827 	for(;;) {
828 		g_bioq_lock(&g_bio_run_up);
829 		bp = g_bioq_first(&g_bio_run_task);
830 		if (bp != NULL) {
831 			g_bioq_unlock(&g_bio_run_up);
832 			THREAD_NO_SLEEPING();
833 			CTR1(KTR_GEOM, "g_up processing task bp %p", bp);
834 			bp->bio_task(bp->bio_task_arg);
835 			THREAD_SLEEPING_OK();
836 			continue;
837 		}
838 		bp = g_bioq_first(&g_bio_run_up);
839 		if (bp != NULL) {
840 			g_bioq_unlock(&g_bio_run_up);
841 			THREAD_NO_SLEEPING();
842 			CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
843 			    "%jd len %ld", bp, bp->bio_to->name,
844 			    bp->bio_offset, bp->bio_length);
845 			biodone(bp);
846 			THREAD_SLEEPING_OK();
847 			continue;
848 		}
849 		CTR0(KTR_GEOM, "g_up going to sleep");
850 		msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
851 		    PRIBIO | PDROP, "-", 0);
852 	}
853 }
854 
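/*
 * Synchronous read helper for GEOM classes: allocate a buffer, issue a
 * BIO_READ and wait for it.  Returns the buffer (to be released with
 * g_free() by the caller) or NULL on error; the error code is stored in
 * *error when that pointer is non-NULL.
 */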
855 void *
856 g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
857 {
858 	struct bio *bp;
859 	void *ptr;
860 	int errorc;
861 
862 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
863 	    length <= MAXPHYS, ("g_read_data(): invalid length %jd",
864 	    (intmax_t)length));
865 
866 	bp = g_alloc_bio();
867 	bp->bio_cmd = BIO_READ;
868 	bp->bio_done = NULL;
869 	bp->bio_offset = offset;
870 	bp->bio_length = length;
871 	ptr = g_malloc(length, M_WAITOK);
872 	bp->bio_data = ptr;
873 	g_io_request(bp, cp);
874 	errorc = biowait(bp, "gread");
875 	if (error != NULL)
876 		*error = errorc;
877 	g_destroy_bio(bp);
878 	if (errorc) {
879 		g_free(ptr);
880 		ptr = NULL;
881 	}
882 	return (ptr);
883 }
884 
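/*
 * Synchronous write helper: issue a BIO_WRITE from the supplied buffer
 * and wait for completion.  Returns the error reported by the provider.
 */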
885 int
886 g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
887 {
888 	struct bio *bp;
889 	int error;
890 
891 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
892 	    length <= MAXPHYS, ("g_write_data(): invalid length %jd",
893 	    (intmax_t)length));
894 
895 	bp = g_alloc_bio();
896 	bp->bio_cmd = BIO_WRITE;
897 	bp->bio_done = NULL;
898 	bp->bio_offset = offset;
899 	bp->bio_length = length;
900 	bp->bio_data = ptr;
901 	g_io_request(bp, cp);
902 	error = biowait(bp, "gwrite");
903 	g_destroy_bio(bp);
904 	return (error);
905 }
906 
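/*
 * Synchronous BIO_DELETE helper: ask the provider to discard the given
 * range and wait for completion.
 */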
907 int
908 g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
909 {
910 	struct bio *bp;
911 	int error;
912 
913 	KASSERT(length > 0 && length >= cp->provider->sectorsize,
914 	    ("g_delete_data(): invalid length %jd", (intmax_t)length));
915 
916 	bp = g_alloc_bio();
917 	bp->bio_cmd = BIO_DELETE;
918 	bp->bio_done = NULL;
919 	bp->bio_offset = offset;
920 	bp->bio_length = length;
921 	bp->bio_data = NULL;
922 	g_io_request(bp, cp);
923 	error = biowait(bp, "gdelete");
924 	g_destroy_bio(bp);
925 	return (error);
926 }
927 
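/*
 * Print a one-line, human-readable description of a bio.
 */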
928 void
929 g_print_bio(struct bio *bp)
930 {
931 	const char *pname, *cmd = NULL;
932 
933 	if (bp->bio_to != NULL)
934 		pname = bp->bio_to->name;
935 	else
936 		pname = "[unknown]";
937 
938 	switch (bp->bio_cmd) {
939 	case BIO_GETATTR:
940 		cmd = "GETATTR";
941 		printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute);
942 		return;
943 	case BIO_FLUSH:
944 		cmd = "FLUSH";
945 		printf("%s[%s]", pname, cmd);
946 		return;
947 	case BIO_READ:
948 		cmd = "READ";
949 		break;
950 	case BIO_WRITE:
951 		cmd = "WRITE";
952 		break;
953 	case BIO_DELETE:
954 		cmd = "DELETE";
955 		break;
956 	default:
957 		cmd = "UNKNOWN";
958 		printf("%s[%s()]", pname, cmd);
959 		return;
960 	}
961 	printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd,
962 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
963 }
964