xref: /titanic_52/usr/src/uts/common/avs/ns/sdbc/sd_bio.c (revision af4c679f647cf088543c762e33d41a3ac52cfa14)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/ksynch.h>
29 #include <sys/kmem.h>
30 #include <sys/stat.h>
31 #include <sys/buf.h>
32 #include <sys/open.h>
33 #include <sys/conf.h>
34 #include <sys/file.h>
35 #include <sys/cmn_err.h>
36 #include <sys/errno.h>
37 #include <sys/ddi.h>
38 
39 #include <sys/nsc_thread.h>
40 #include <sys/nsctl/nsctl.h>
41 
42 #include <sys/sdt.h>		/* dtrace is S10 or later */
43 
44 #include <vm/seg_kmem.h>
45 #include "sd_bcache.h"
46 #include "sd_trace.h"
47 #include "sd_io.h"
48 #include "sd_iob.h"
49 #include "sd_misc.h"
50 #if defined(_SD_DEBUG)			/* simulate disk errors */
51 #include "sd_tdaemon.h"
52 #endif
53 
54 #ifndef DS_DDICT
55 extern uintptr_t kobj_getsymvalue(char *, int);	/* DDI violation */
56 #endif
57 
/*
 * DO_PAGE_LIST is tested at run time throughout this file; it is simply an
 * alias for the sdbc_do_page variable.  When non-zero, bufs are obtained
 * with pageio_setup()/pageio_done() and i/o is packed by _sd_pack_pages();
 * when zero, getrbuf()/freerbuf() and _sd_pack_pages_nopageio() are used.
 */
#define	DO_PAGE_LIST	sdbc_do_page	/* enable pagelist code */

/* pagelist i/o is off unless this is set to a non-zero value */
int sdbc_do_page = 0;

/* NOTE(review): SGIO_MAX is not referenced in this part of the file */
#define	SGIO_MAX 254

/*
 * sdbc_bio_mutex is held around every update of sdbc_bio_count and around
 * all manipulation of the iob hook free list (_sd_get_hook/_sd_put_hook).
 * sdbc_bio_count is the number of bufs handed out by _sd_get_iobuf() that
 * have not yet been returned through _sd_put_iobuf().
 */
static kmutex_t sdbc_bio_mutex;
static int sdbc_bio_count;

/*
 * Set by _sdbc_iobuf_configure(): page_size is ptob(1) and
 * page_offset_mask is page_size - 1.
 */
static unsigned long page_size, page_offset_mask;
68 
#ifdef _SD_BIO_STATS
/*
 * Number of sd_start_io() calls since the statistics were last reported;
 * sd_start_io() logs and resets its counters when this reaches 2000.
 * The type is spelled out because implicit int is not valid C99.
 */
static int __start_io_count = 0;
#endif /* _SD_BIO_STATS */
72 
73 /*
74  * Forward declare all statics that are used before defined to enforce
75  * parameter checking.  Also forward-declare all functions that have 64-bit
76  * argument types to enforce correct parameter checking.
77  *
78  * Some (if not all) of these could be removed if the code were reordered
79  */
80 
81 static int _sd_sync_ea(struct buf *, iob_hook_t *);
82 static int _sd_async_ea(struct buf *, iob_hook_t *);
83 static void _sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
84     nsc_off_t offset, nsc_size_t size);
85 static void _sd_pack_pages_nopageio(struct buf *bp, struct buf *list,
86     sd_addr_t *addr, nsc_off_t offset, nsc_size_t size);
87 static void _sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag);
88 
89 #ifdef	DEBUG
90 static int _sdbc_ioj_lookup(dev_t);
91 static void _sdbc_ioj_clear_err(int);
92 #endif
93 
/*
 * Write accounting maintained by sd_start_io() and cleared by
 * _sdbc_iobuf_configure().  SD_WRITES_TOT counts every non-read buf
 * issued; SD_WRITES_LEN is a histogram of those bufs indexed by
 * b_bufsize / 32768, wrapped modulo the number of array elements.
 */
static int SD_WRITES_TOT = 0;
static int SD_WRITES_LEN[100];

/*
 * The iob hook pool: the hook array and its free list, the hook locks
 * and the condition variable used to wait for a free hook.  Populated by
 * _sdbc_iobuf_configure() and zeroed by _sdbc_iobuf_unload().
 */
_sd_buf_list_t _sd_buflist;
98 
99 /*
100  * _sd_add_vm_to_bp_plist - add the page corresponding to the
101  * virtual address "v" (kernel virtaddr) to the pagelist linked
102  * to buffer "bp".
103  *
104  * The virtual address "v" is "known" to be allocated by segkmem
105  * and we can look up the page by using the segkmem vnode kvp.
106  * This violates the ddi/ddk but is workable for now anyway.
107  *
108  *
109  */
110 static void
111 _sd_add_vm_to_bp_plist(struct buf *bp, unsigned char *v)
112 {
113 	page_t   *pp;
114 	page_t   *one_pg = NULL;
115 
116 	pp = page_find(&kvp, (u_offset_t)((uintptr_t)v & ~page_offset_mask));
117 	if (!pp) {
118 		cmn_err(CE_PANIC,
119 		    "_sd_add_vm_to_bp_plist: couldn't find page for 0x%p",
120 		    (void *)v);
121 	}
122 
123 	page_add(&one_pg, pp);
124 	page_list_concat(&(bp->b_pages), &one_pg);
125 
126 }
127 
128 #ifdef _SD_BIO_STATS
129 static int
130 _sd_count_pages(page_t *pp)
131 {
132 	int cnt = 0;
133 	page_t *pp1;
134 	if (pp == NULL)
135 		return (cnt);
136 
137 	for (cnt = 1, pp1 = pp->p_next; pp != pp1; cnt++, pp1 = pp1->p_next)
138 		;
139 
140 	return (cnt);
141 }
142 #endif /* _SD_BIO_STATS */
143 
144 
145 /*
146  * _sdbc_iobuf_load - load time initialization of io bufs structures.
147  *
148  *
149  * RETURNS:
150  *	0  - success.
151  *      -1 - failure.
152  *
153  * USAGE:
154  *	This routine initializes load time buf structures.
155  *      Should be called when the cache is loaded.
156  */
157 
158 int
159 _sdbc_iobuf_load(void)
160 {
161 	mutex_init(&sdbc_bio_mutex, NULL, MUTEX_DRIVER, NULL);
162 
163 	/*
164 	 * HACK add a ref to kvp, to prevent VN_RELE on it from panicing
165 	 * the system
166 	 */
167 	VN_HOLD(&kvp);
168 
169 	return (0);
170 }
171 
172 /*
173  * _sdbc_iobuf_unload - unload time cleanup of io buf structures.
174  *
175  *
176  * USAGE:
177  *	This routine removes load time buf structures.
178  *      Should be called when the cache is unloaded.
179  */
180 void
181 _sdbc_iobuf_unload(void)
182 {
183 	/* Undo our VN_HOLD hack, by putting ref count back to normal state */
184 	mutex_enter(&kvp.v_lock);
185 	kvp.v_count = 0;
186 	mutex_exit(&kvp.v_lock);
187 
188 	mutex_destroy(&sdbc_bio_mutex);
189 	bzero(&_sd_buflist, sizeof (_sd_buf_list_t));
190 }
191 
192 /*
193  * _sdbc_iobuf_configure - configure a list of io bufs for later use.
194  *
195  * ARGUMENTS:
196  *	num_bufs - number of buffers. (from the configuration file)
197  *
198  * RETURNS:
199  *	0  - success.
200  * <0  - failure.
201  *
202  * USAGE:
203  *	This routine configures the buf structures for io.
204  *      Should be called when the cache is configured.
205  */
206 
207 int
208 _sdbc_iobuf_configure(int num)
209 {
210 	int i;
211 	_sd_buf_list_t *buflist;
212 	iob_hook_t *hook;
213 	char symbol_name[32];
214 
215 	if (!num || (num > _SD_DEFAULT_IOBUFS))
216 		num = _SD_DEFAULT_IOBUFS;
217 
218 	if ((_sd_buflist.hooks = (iob_hook_t *)nsc_kmem_zalloc(
219 	    num * sizeof (iob_hook_t), KM_SLEEP, sdbc_iobuf_mem)) == NULL) {
220 		return (-1);
221 	}
222 
223 	buflist = &_sd_buflist;
224 	buflist->bl_init_count = num;
225 	buflist->bl_hooks_avail = num;
226 	buflist->bl_hook_lowmark = num;
227 	hook = buflist->hooks;
228 	buflist->hook_head = hook;
229 	for (i = 0; i < num; i++, hook++) {
230 		cv_init(&hook->wait, NULL, CV_DRIVER, NULL);
231 		(void) sprintf(symbol_name, "sd_iob_dcb%d", i);
232 		hook->iob_drv_iodone = (dcb_t)kobj_getsymvalue(symbol_name, 0);
233 		if (!hook->iob_drv_iodone) {
234 			return (-2);
235 		}
236 		hook->next_hook = hook+1;
237 	}
238 	(hook-1)->next_hook = NULL;
239 
240 	for (i = 0; i < MAX_HOOK_LOCKS; i++)
241 		mutex_init(&_sd_buflist.hook_locks[i], NULL, MUTEX_DRIVER,
242 		    NULL);
243 
244 	cv_init(&_sd_buflist.hook_wait, NULL, CV_DRIVER, NULL);
245 	_sd_buflist.hook_waiters = 0;
246 
247 	sdbc_bio_count = 0;
248 	SD_WRITES_TOT = 0;
249 	bzero(SD_WRITES_LEN, sizeof (SD_WRITES_LEN));
250 
251 	/* pagelist i/o pages must be done in cache_init */
252 
253 	page_size = ptob(1);
254 	page_offset_mask = page_size - 1;
255 
256 	return (0);
257 }
258 
259 /*
260  * _sdbc_iobuf_deconfigure - release all memory allocated for buf list
261  *
262  * ARGUMENTS:
263  *	None.
264  *
265  * RETURNS:
266  *	0
267  */
268 void
269 _sdbc_iobuf_deconfigure(void)
270 {
271 	ushort_t i;
272 
273 	if (_sd_buflist.hooks) {
274 		for (i = 0; i < _sd_buflist.bl_init_count; i ++) {
275 			cv_destroy(&_sd_buflist.hooks[i].wait);
276 		}
277 		cv_destroy(&_sd_buflist.hook_wait);
278 		nsc_kmem_free(_sd_buflist.hooks,
279 		    _sd_buflist.bl_init_count * sizeof (iob_hook_t));
280 		for (i = 0; i < MAX_HOOK_LOCKS; i ++) {
281 			mutex_destroy(&_sd_buflist.hook_locks[i]);
282 		}
283 	}
284 
285 	_sd_buflist.hooks = NULL;
286 
287 #ifdef DEBUG
288 	{
289 	void _sdbc_ioj_clear_err(int);
290 	_sdbc_ioj_clear_err(-1); /* clear any injected i/o errors */
291 	_sdbc_ioj_set_dev(-1, 0); /* clear dev entries */
292 	}
293 #endif
294 
295 }
296 
297 /*
298  * _sd_pending_iobuf()
299  *
300  * Return the number of I/O bufs outstanding
301  */
302 int
303 _sd_pending_iobuf(void)
304 {
305 	return (sdbc_bio_count);
306 }
307 
308 /*
309  * _sd_get_iobuf - allocate a buf.
310  *
311  * ARGUMENTS:
312  *	None.
313  *
314  * RETURNS:
315  *	NULL - failure.
316  *      buf ptr otherwise.
317  *
318  * ASSUMPTIONS - process could block if we run out.
319  *
320  */
321 /*ARGSUSED*/
322 static struct buf *
323 _sd_get_iobuf(int num_bdl)
324 {
325 	struct buf *bp;
326 
327 	/* Get a buffer, ready for page list i/o */
328 
329 	if (DO_PAGE_LIST)
330 		bp = pageio_setup(NULL, 0, &kvp, 0);
331 	else
332 		bp = getrbuf(KM_SLEEP);
333 
334 	if (bp == NULL)
335 		return (NULL);
336 	mutex_enter(&sdbc_bio_mutex);
337 	sdbc_bio_count++;
338 	mutex_exit(&sdbc_bio_mutex);
339 	return (bp);
340 }
341 
342 /*
343  * _sd_put_iobuf - put a buf back in the freelist.
344  *
345  * ARGUMENTS:
346  *	bp - buf pointer.
347  *
348  * RETURNS:
349  *	0
350  *
351  */
352 static void
353 _sd_put_iobuf(struct buf *bp)
354 {
355 	mutex_enter(&sdbc_bio_mutex);
356 	sdbc_bio_count--;
357 	mutex_exit(&sdbc_bio_mutex);
358 	if (DO_PAGE_LIST)
359 		pageio_done(bp);
360 	else
361 		freerbuf(bp);
362 }
363 
364 
365 /* use for ORing only */
366 #define	B_KERNBUF 0
367 
368 static void
369 _sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag)
370 {
371 	bp->b_pages = NULL;
372 	bp->b_un.b_addr = 0;
373 
374 	flag &= (B_READ | B_WRITE);
375 
376 	/*
377 	 * if pagelist i/o, _sd_get_iobuf()/pageio_setup() has already
378 	 * set b_flags to
379 	 * B_KERNBUF | B_PAGEIO | B_NOCACHE | B_BUSY (sol 6,7,8)
380 	 * or
381 	 * B_PAGEIO | B_NOCACHE | B_BUSY (sol 9)
382 	 */
383 
384 	bp->b_flags |= B_KERNBUF | B_BUSY | flag;
385 
386 	bp->b_error = 0;
387 
388 	bp->b_forw = NULL;
389 	bp->b_back = NULL;
390 
391 	bp->b_lblkno = (diskaddr_t)pos;
392 	bp->b_bufsize = 0;
393 	bp->b_resid = 0;
394 	bp->b_proc = NULL;
395 	bp->b_edev = dev;
396 }
397 
398 
399 /*
400  * _sd_get_hook - get an iob hook from the free list.
401  *
402  * ARGUMENTS:
403  *	none
404  *
405  * RETURNS:
406  *	the newly allocated iob_hook.
407  *
408  */
409 static iob_hook_t *
410 _sd_get_hook(void)
411 {
412 
413 	iob_hook_t *ret;
414 
415 	mutex_enter(&sdbc_bio_mutex);
416 
417 retry:
418 	ret = _sd_buflist.hook_head;
419 	if (ret)
420 		_sd_buflist.hook_head = ret->next_hook;
421 	else {
422 		++_sd_buflist.hook_waiters;
423 		if (_sd_buflist.max_hook_waiters < _sd_buflist.hook_waiters)
424 			_sd_buflist.max_hook_waiters = _sd_buflist.hook_waiters;
425 		cv_wait(&_sd_buflist.hook_wait, &sdbc_bio_mutex);
426 		--_sd_buflist.hook_waiters;
427 		goto retry;
428 	}
429 
430 	if (_sd_buflist.bl_hook_lowmark > --_sd_buflist.bl_hooks_avail)
431 		_sd_buflist.bl_hook_lowmark = _sd_buflist.bl_hooks_avail;
432 
433 	mutex_exit(&sdbc_bio_mutex);
434 	ret->skipped = 0;
435 
436 	ret->count = 0;
437 
438 #ifdef _SD_BIO_STATS
439 	ret->PAGE_IO = 0;
440 	ret->NORM_IO = 0;
441 	ret->NORM_IO_SIZE = 0;
442 	ret->SKIP_IO = 0;
443 	ret->PAGE_COMBINED = 0;
444 #endif /* _SD_BIO_STATS */
445 
446 	return (ret);
447 }
448 
449 /*
450  * _sd_put_hook - put an iob hook back on the free list.
451  *
452  * ARGUMENTS:
453  *	hook - an iob_hook to be returned to the freelist.
454  *
455  *
456  */
457 static void
458 _sd_put_hook(iob_hook_t *hook)
459 {
460 
461 	mutex_enter(&sdbc_bio_mutex);
462 
463 	if (_sd_buflist.hook_waiters) {
464 		cv_signal(&_sd_buflist.hook_wait);
465 	}
466 	hook->next_hook = _sd_buflist.hook_head;
467 	_sd_buflist.hook_head = hook;
468 
469 	++_sd_buflist.bl_hooks_avail;
470 
471 	mutex_exit(&sdbc_bio_mutex);
472 }
473 
474 /*
475  * _sd_extend_iob - the i/o block we are handling needs a new struct buf to
476  *    describe the next hunk of i/o. Get a new struct buf initialize it based
477  *    on the state in the struct buf we are passed as an arg.
478  * ARGUMENTS:
479  *    head_bp - a buffer header in the current i/o block we are handling.
480  *              (generally the initial header but in fact could be any
481  *               of the ones [if any] that were chained to the initial
482  *		 one).
483  */
484 static struct buf *
485 _sd_extend_iob(struct buf *head_bp)
486 {
487 	struct buf *bp;
488 	iob_hook_t *hook = (iob_hook_t *)head_bp->b_private;
489 
490 
491 	if (!(bp = _sd_get_iobuf(0)))
492 		return (0);
493 
494 	bp->b_pages = NULL;
495 	bp->b_un.b_addr = 0;
496 
497 	bp->b_flags |=  (head_bp->b_flags & (B_READ | B_WRITE));
498 
499 	if (!DO_PAGE_LIST)
500 		bp->b_flags |= B_KERNBUF | B_BUSY;
501 
502 	bp->b_error = 0;
503 
504 	/*
505 	 *  b_forw/b_back  will form a doubly linked list of all the buffers
506 	 *  associated with this block of i/o.
507 	 *  hook->tail points to the last buffer in the chain.
508 	 */
509 	bp->b_forw = NULL;
510 	bp->b_back = hook->tail;
511 	hook->tail->b_forw = bp;
512 	hook->tail = bp;
513 	hook->count++;
514 
515 	ASSERT(BLK_FBA_OFF(hook->size) == 0);
516 
517 	bp->b_lblkno = (diskaddr_t)hook->start_fba +
518 	    (diskaddr_t)FBA_NUM(hook->size);
519 
520 	bp->b_bufsize = 0;
521 	bp->b_resid = 0;
522 	bp->b_proc = NULL;
523 	bp->b_edev = head_bp->b_edev;
524 
525 	bp->b_iodone = NULL; /* for now */
526 	bp->b_private = hook;
527 
528 	return (bp);
529 }
530 
531 /*
532  * sd_alloc_iob - start processing a block of i/o. This allocates an initial
533  *	buffer header for describing the i/o and a iob_hook for collecting
534  *	information about all the i/o requests added to this buffer.
535  *
536  * ARGUMENTS:
537  *      dev - the device all the i/o is destined for.
538  *	fba_pos - the initial disk block to read.
539  *	blks - ignored
540  *	flag - signal whether this is a read or write request.
541  *
542  * RETURNS:
543  *	pointer to free struct buf which will be used to describe i/o request.
544  */
545 /* ARGSUSED */
546 struct buf *
547 sd_alloc_iob(dev_t dev, nsc_off_t fba_pos, int blks, int flag)
548 {
549 	struct buf *bp;
550 	iob_hook_t *hook;
551 
552 	if (!(bp = _sd_get_iobuf(0)))
553 		return (0);
554 
555 	_sd_setup_iob(bp, dev, fba_pos, flag);
556 
557 	bp->b_iodone = NULL; /* for now */
558 	hook = _sd_get_hook();
559 	if (!hook) {
560 		/* can't see how this could happen */
561 		_sd_put_iobuf(bp);
562 		return (0);
563 	}
564 
565 	/*
566 	 *  pick an arbitrary lock
567 	 */
568 	hook->lockp = &_sd_buflist.hook_locks[((long)hook >> 9) &
569 	    (MAX_HOOK_LOCKS - 1)];
570 	hook->start_fba = fba_pos;
571 	hook->last_fba = fba_pos;
572 	hook->size = 0;
573 	hook->tail = bp;
574 	hook->chain = bp;
575 	hook->count = 1;
576 	hook->error = 0;
577 	bp->b_private = hook;
578 
579 	return (bp);
580 }
581 
/*
 * _sd_pack_pages - produce i/o requests that will perform the type of i/o
 *      described by bp (READ/WRITE). It attempts to tack the i/o onto the
 *      buf pointed to by list to minimize the number of bufs required.
 *
 * ARGUMENTS:
 *  bp - is the i/o description i.e. head
 *  list - is where to start adding this i/o request (null if we should extend)
 *  addr - address describing where the data is.
 *  offset - offset from addr where data begins
 *  size - size of the i/o request.
 *
 * There is no error return: if a needed buf cannot be obtained from
 * _sd_extend_iob() the system panics.
 */
static void
_sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size)
{
	uintptr_t start_addr, end_addr;
	int page_end_aligned;
#ifdef _SD_BIO_STATS
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *orig_list = list;
#endif /* _SD_BIO_STATS */

	start_addr = (uintptr_t)addr->sa_virt + offset;
	end_addr = start_addr + size;

	/* non-zero when the request ends exactly on a page boundary */
	page_end_aligned = !(end_addr & page_offset_mask);

	/* caller gave us no buf to add to, so chain a fresh one */
	if (!list && !(list = _sd_extend_iob(bp))) {
		/*
		 *  we're hosed since we have no error return...
		 *  though we could ignore stuff from here on out
		 *  and return ENOMEM when we get to sd_start_io.
		 *  This will do for now.
		 */
		cmn_err(CE_PANIC, "_sd_pack_pages: couldn't extend iob");
	}

	/*
	 *	We only want to do pagelist i/o if we end on a page boundary.
	 *	If we don't end on a page boundary we won't combine with the
	 *	next request and so we may as well do it as normal as it
	 *	will only use one buffer.
	 */

	if (DO_PAGE_LIST && page_end_aligned) {
		if (start_addr & page_offset_mask) {
			/*
			 * handle the partial page
			 *
			 * A request that starts in mid-page cannot be
			 * appended to a buf that already carries data, so
			 * such a buf is replaced by a fresh one first.
			 */
			if (list->b_bufsize) {
				if (!(list = _sd_extend_iob(bp))) {
					/*
					 * we're hosed since we have no error
					 * return though we could ignore stuff
					 * from here on out and return ENOMEM
					 * when we get to sd_start_io.
					 *  This will do for now.
					 */
					cmn_err(CE_PANIC,
					"_sd_pack_pages: couldn't extend iob");
				}
			}
#ifdef _SD_BIO_STATS
			hook->PAGE_IO++;
#endif /* _SD_BIO_STATS */
			_sd_add_vm_to_bp_plist(list,
			    (unsigned char *) start_addr);
			/* the transfer covers the rest of this first page */
			list->b_bufsize = page_size -
			    (start_addr & page_offset_mask);
			/* b_addr holds the offset into the first page */
			list->b_un.b_addr = (caddr_t)
			    (start_addr & page_offset_mask);
			size -= list->b_bufsize;
			start_addr += list->b_bufsize;
		}
		/*
		 *	Now fill with all the full pages remaining.
		 *
		 *	NOTE(review): the loop only ends when size reaches
		 *	exactly zero; if nsc_size_t is unsigned the "bad size"
		 *	check after the loop can never fire - confirm.
		 */
		for (; size > 0; size -= page_size) {
#ifdef _SD_BIO_STATS
			hook->PAGE_IO++;
#endif /* _SD_BIO_STATS */

			_sd_add_vm_to_bp_plist(list,
			    (unsigned char *) start_addr);
			start_addr += page_size;
			list->b_bufsize += page_size;
#ifdef _SD_BIO_STATS
			/* still on the caller's buf: pages were combined */
			if (list == orig_list)
				hook->PAGE_COMBINED++;
#endif /* _SD_BIO_STATS */
		}
		if (size)
			cmn_err(CE_PANIC, "_sd_pack_pages: bad size: %"
			    NSC_SZFMT, size);
	} else {
		/*
		 *  Wasn't worth it as pagelist i/o, do as normal
		 *  (one buf per request, addressed by kernel virtual
		 *  address; a buf that already has data is not reused).
		 */
		if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
			/*
			 *  we're hosed since we have no error return...
			 *  though we could ignore stuff from here on out
			 *  and return ENOMEM when we get to sd_start_io.
			 *  This will do for now.
			 */
			cmn_err(CE_PANIC,
			    "_sd_pack_pages: couldn't extend iob");
		}

		/* kernel virtual */
		list->b_flags &= ~(B_PHYS | B_PAGEIO);
		list->b_un.b_addr = (caddr_t)start_addr;
#ifdef _SD_BIO_STATS
		hook->NORM_IO++;
		hook->NORM_IO_SIZE += size;
#endif /* _SD_BIO_STATS */
		list->b_bufsize = (size_t)size;
	}

}
703 
704 /*
705  * perform same function as _sd_pack_pages() when not doing pageio
706  */
707 static void
708 _sd_pack_pages_nopageio(struct buf *bp, struct buf *list, sd_addr_t *addr,
709 	nsc_off_t offset, nsc_size_t size)
710 {
711 	uintptr_t start_addr;
712 #ifdef _SD_BIO_STATS
713 	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
714 	struct buf *orig_list = list;
715 #endif /* _SD_BIO_STATS */
716 
717 	start_addr = (uintptr_t)addr->sa_virt + offset;
718 
719 	if (!list && !(list = _sd_extend_iob(bp))) {
720 		/*
721 		 *  we're hosed since we have no error return...
722 		 *  though we could ignore stuff from here on out
723 		 *  and return ENOMEM when we get to sd_start_io.
724 		 *  This will do for now.
725 		 */
726 		cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
727 		    "extend iob");
728 	}
729 
730 	if (list->b_bufsize &&
731 	    (start_addr == (uintptr_t)(list->b_un.b_addr + list->b_bufsize))) {
732 		/* contiguous */
733 		list->b_bufsize += (size_t)size;
734 	} else {
735 		/*
736 		 * not contiguous mem (extend) or first buffer (bufsize == 0).
737 		 */
738 		if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
739 			/*
740 			 *  we're hosed since we have no error return...
741 			 *  though we could ignore stuff from here on out
742 			 *  and return ENOMEM when we get to sd_start_io.
743 			 *  This will do for now.
744 			 */
745 			cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
746 			    "extend iob");
747 		}
748 		list->b_un.b_addr = (caddr_t)start_addr;
749 		list->b_bufsize = (size_t)size;
750 	}
751 
752 #ifdef _SD_BIO_STATS
753 	hook->NORM_IO++;
754 	hook->NORM_IO_SIZE += size;
755 #endif /* _SD_BIO_STATS */
756 }
757 
758 /*
759  * sd_add_fba - add an i/o request to the block of i/o described by bp.
760  *	We try and combine this request with the previous request. In
761  *	Addition we try and do the i/o as PAGELIST_IO if it satisfies
762  *	the restrictions for it. If the i/o request can't be combined
763  *	we extend the i/o description with a new buffer header and add
764  *	it to the chain headed by bp.
765  *
766  * ARGUMENTS:
767  *      bp - the struct buf describing the block i/o we are collecting.
768  *	addr - description of the address where the data will read/written to.
769  *             A NULL indicates that this i/o request doesn't need to actually
770  *             happen. Used to mark reads when the fba is already in cache and
771  *             dirty.
772  *
773  *	fba_pos - offset from address in addr where the i/o is to start.
774  *
775  *	fba_len - number of consecutive fbas to transfer.
776  *
777  *  NOTE: It is assumed that the memory is physically contiguous but may span
778  *  multiple pages (should a cache block be larger than a page).
779  *
780  */
781 void
782 sd_add_fba(struct buf *bp, sd_addr_t *addr, nsc_off_t fba_pos,
783     nsc_size_t fba_len)
784 {
785 	nsc_off_t offset;
786 	nsc_size_t size;
787 	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
788 
789 	size = FBA_SIZE(fba_len);
790 	offset = FBA_SIZE(fba_pos);
791 
792 	if (addr) {
793 		/*
794 		 *  See if this can be combined with previous request(s)
795 		 */
796 		if (!bp->b_bufsize) {
797 			if (DO_PAGE_LIST)
798 				_sd_pack_pages(bp, bp, addr, offset, size);
799 			else
800 				_sd_pack_pages_nopageio(bp, bp, addr, offset,
801 				    size);
802 		} else {
803 			if (DO_PAGE_LIST) {
804 				if (hook->tail->b_flags & B_PAGEIO) {
805 					/*
806 					 * Last buffer was a pagelist. Unless a
807 					 * skip was detected the last request
808 					 * ended on a page boundary. If this
809 					 * one starts on one we combine the
810 					 * best we can.
811 					 */
812 					if (hook->skipped)
813 						_sd_pack_pages(bp, NULL, addr,
814 						    offset, size);
815 					else
816 						_sd_pack_pages(bp, hook->tail,
817 						    addr, offset, size);
818 				} else {
819 					/*
820 					 * Last buffer was vanilla i/o or worse
821 					 * (sd_add_mem)
822 					 */
823 					_sd_pack_pages(bp, NULL, addr, offset,
824 					    size);
825 				}
826 			} else {
827 				if (hook->skipped)
828 					_sd_pack_pages_nopageio(bp, NULL,
829 					    addr, offset, size);
830 				else
831 					_sd_pack_pages_nopageio(bp,
832 					    hook->tail, addr, offset, size);
833 			}
834 		}
835 		hook->skipped = 0;
836 	} else {
837 		/* Must be a read of dirty block we want to discard */
838 
839 		ASSERT(bp->b_flags & B_READ);
840 #ifdef _SD_BIO_STATS
841 		hook->SKIP_IO++;
842 #endif /* _SD_BIO_STATS */
843 		hook->skipped = 1;
844 		if (!bp->b_bufsize)
845 			bp->b_lblkno += fba_len;
846 	}
847 	hook->size += size;
848 
849 }
850 
851 /*
852  * sd_add_mem - add an i/o request to the block of i/o described by bp.
853  *	The memory target for this i/o may span multiple pages and may
854  *	not be physically contiguous.
855  *      also the len might also not be a multiple of an fba.
856  *
857  * ARGUMENTS:
858  *      bp - the struct buf describing the block i/o we are collecting.
859  *
860  *	buf - target of this i/o request.
861  *
862  *	len - number of bytes to transfer.
863  *
864  */
865 void
866 sd_add_mem(struct buf *bp, char *buf, nsc_size_t len)
867 {
868 	nsc_size_t n;
869 	uintptr_t start;
870 	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
871 
872 	start = (uintptr_t)buf & page_offset_mask;
873 
874 	for (; len > 0; buf += n, len -= n, start = 0) {
875 		n = min((nsc_size_t)len, (nsc_size_t)(page_size - start));
876 		/*
877 		 *  i/o size must be multiple of an FBA since we can't
878 		 *  count on lower level drivers to understand b_offset
879 		 */
880 		if (BLK_FBA_OFF(n) != 0) {
881 			cmn_err(CE_WARN,
882 			    "!sdbc(sd_add_mem) i/o request not FBA sized (%"
883 			    NSC_SZFMT ")", n);
884 		}
885 
886 		if (!bp->b_bufsize) {
887 			/* first request */
888 			bp->b_flags &= ~(B_PHYS | B_PAGEIO);
889 			bp->b_un.b_addr = buf;
890 			bp->b_bufsize = (size_t)n;
891 		} else {
892 			struct buf *new_bp;
893 			if (!(new_bp = _sd_extend_iob(bp))) {
894 				/* we're hosed */
895 				cmn_err(CE_PANIC,
896 				"sd_add_mem: couldn't extend iob");
897 			}
898 			new_bp->b_flags &= ~(B_PHYS | B_PAGEIO);
899 			new_bp->b_un.b_addr = buf;
900 			new_bp->b_bufsize = (size_t)n;
901 		}
902 		hook->size += n;
903 	}
904 }
905 
906 
/*
 * sd_start_io - start all the i/o needed to satisfy the i/o request described
 *	by bp. If supplied with a non-NULL fn then this is an async request
 *	and we will return NSC_PENDING and call fn when all the i/o completes.
 *	Otherwise this is a synchronous request and we sleep until all the
 *	i/o is complete. If any buffer in the chain gets an error we return
 *	the first error we see (once all the i/o is complete).
 *
 * ARGUMENTS:
 *      bp - the struct buf describing the block i/o we are collecting.
 *
 *	strategy - strategy function to call if known by the user, or NULL.
 *
 *	fn - user's callback function. NULL implies synchronous request.
 *
 *	arg - an argument passed to user's callback function.
 *
 * RETURNS:
 *	NSC_PENDING - async request; fn will be called on completion.
 *	NSC_DONE    - sync request completed without error.
 *	otherwise   - sync request: the error recorded in the iob hook.
 *
 */
int
sd_start_io(struct buf *bp, strategy_fn_t strategy, sdbc_ea_fn_t fn,
		blind_t arg)
{
	int err;
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *bp_next;
	int (*ea_fn)(struct buf *, iob_hook_t *);
#ifdef _SD_BIO_STATS
	static int total_pages, total_pages_combined, total_norm;
	static int total_norm_combined, total_skipped;
	static nsc_size_t total_norm_size;

	static int total_bufs;
	static int total_xpages_w, total_ypages_w;
	static int total_xpages_r, total_ypages_r;
	static int max_run_r, max_run_w;

#endif /* _SD_BIO_STATS */

	/* record the caller's callback and select the end action to match */
	hook->func = fn;
	hook->param = arg;
	if (fn != NULL)
		ea_fn = _sd_async_ea;
	else
		ea_fn = _sd_sync_ea;

	hook->iob_hook_iodone = ea_fn;

#ifdef _SD_BIO_STATS
	__start_io_count++;
	total_pages += hook->PAGE_IO;
	total_pages_combined += hook->PAGE_COMBINED;
	total_norm += hook->NORM_IO;
	total_norm_size += hook->NORM_IO_SIZE;
	total_skipped += hook->SKIP_IO;
#endif /* _SD_BIO_STATS */

	/* issue every buf on the chain, walking it by b_forw */
	for (; bp; bp = bp_next) {

	DTRACE_PROBE4(sd_start_io_bufs, struct buf *, bp, long, bp->b_bufsize,
	    int, bp->b_flags, iob_hook_t *, hook);

		/*
		 * The successor is picked up before the link fields are
		 * cleared below and before the buf is handed off.
		 */
		bp_next = bp->b_forw;
		if (!(bp->b_flags & B_READ)) {
			/* write accounting: total and size histogram */
			SD_WRITES_TOT++;
			SD_WRITES_LEN[(bp->b_bufsize/32768) %
			    (sizeof (SD_WRITES_LEN)/sizeof (int))]++;
		}
		bp->b_iodone = hook->iob_drv_iodone;
		bp->b_bcount = bp->b_bufsize;
		bp->b_forw = NULL;
		bp->b_back = NULL;
		bp->b_private = NULL;

#ifdef _SD_BIO_STATS
		total_bufs ++;
		if (bp->b_flags & B_PAGEIO) {
			int i;
			i = _sd_count_pages(bp->b_pages);
			if (bp->b_flags & B_READ) {
				if (i > max_run_r)
					max_run_r = i;
				total_xpages_r += i;
				total_ypages_r++;
			} else {
				if (i > max_run_w)
					max_run_w = i;
				total_xpages_w += i;
				total_ypages_w++;
			}
		}
#endif /* _SD_BIO_STATS */


		/*
		 *  It's possible for us to be told to read a dirty block
		 *  where all the i/o can go away (e.g. read one fba, it's
		 *  in cache and dirty) so we really have nothing to do but
		 *  say we're done.
		 */
		if (bp->b_bcount) {
			/* look the strategy routine up if none was supplied */
			if (!strategy) {
				strategy =
				    nsc_get_strategy(getmajor(bp->b_edev));
			}

			if (!strategy) {
				/* no driver to hand the buf to: fail it */
				bp->b_flags |= B_ERROR;
				bp->b_error = ENXIO;
				(*bp->b_iodone)(bp);
			} else
#ifdef DEBUG
			/* inject i/o error for testing */
			if (bp->b_error = _sdbc_ioj_lookup(bp->b_edev)) {
				bp->b_flags |= B_ERROR;
				(*bp->b_iodone)(bp);
			} else
#endif
			{
				(*strategy)(bp);
			}
		} else {
			/* empty buf: complete it straight away */
			(*bp->b_iodone)(bp);
		}

	}

#ifdef _SD_BIO_STATS
	/* report and reset the statistics every 2000 calls */
	if (__start_io_count == 2000) {
		__start_io_count = 0;
		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io) t_bufs %d pages %d "
		    "combined %d norm %d norm_size %" NSC_SZFMT " skipped %d",
		    total_bufs,
		    total_pages, total_pages_combined, total_norm,
		    total_norm_size, total_skipped);

		total_bufs = 0;
		total_pages = 0;
		total_pages_combined = 0;
		total_norm = 0;
		total_norm_combined = 0;
		total_skipped = 0;
		total_norm_size = 0;

		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io)(r) max_run %d, total_xp %d total yp %d",
		    max_run_r, total_xpages_r, total_ypages_r);

		total_xpages_r = 0;
		total_ypages_r = 0;
		max_run_r = 0;

		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io)(w) max_run %d, total_xp %d total yp %d",
		    max_run_w, total_xpages_w, total_ypages_w);

		total_xpages_w = 0;
		total_ypages_w = 0;
		max_run_w = 0;
	}
#endif /* _SD_BIO_STATS */

	/*
	 * Async: the hook is released by _sd_async_ea() when the last buf
	 * completes, so the test uses the local ea_fn and the hook is not
	 * touched again on this path.
	 */
	if (ea_fn == _sd_async_ea) {
		DTRACE_PROBE(sd_start_io_end);

		return (NSC_PENDING);
	}

	/* Sync: sleep until _sd_sync_ea() has seen every buf complete */
	mutex_enter(hook->lockp);

	while (hook->count) {
		cv_wait(&hook->wait, hook->lockp);
	}
	mutex_exit(hook->lockp);

	err = hook->error ? hook->error : NSC_DONE;
	/* _sd_sync_ea() left the last completed buf in hook->tail for us */
	bp = hook->tail;
	_sd_put_hook(hook);
	_sd_put_iobuf(bp);

	return (err);
}
1088 }
1089 
1090 /*
1091  * _sd_sync_ea - called when a single i/o operation is complete. If this
1092  *      is the last outstanding i/o we wakeup the sleeper.
1093  *	If this i/o had an error then we store the error result in the
1094  *	iob_hook if this was the first error.
1095  *
1096  * ARGUMENTS:
1097  *      bp - the struct buf describing the block i/o that just completed.
1098  *
1099  * Comments:
1100  *	This routine is called at interrupt level when the io is done.
1101  */
1102 
1103 static int
1104 _sd_sync_ea(struct buf *bp, iob_hook_t *hook)
1105 {
1106 
1107 	int error;
1108 	int done;
1109 
1110 	/*
1111 	 *  We get called for each buf that completes. When they are all done.
1112 	 *  we wakeup the waiter.
1113 	 */
1114 	error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;
1115 
1116 	mutex_enter(hook->lockp);
1117 
1118 	if (!hook->error)
1119 		hook->error = error;
1120 
1121 	done = !(--hook->count);
1122 	if (done) {
1123 		/* remember the last buffer so we can free it later */
1124 		hook->tail = bp;
1125 		cv_signal(&hook->wait);
1126 	}
1127 	mutex_exit(hook->lockp);
1128 
1129 	/*
1130 	 *  let sd_start_io free the final buffer so the hook can be returned
1131 	 *  first.
1132 	 */
1133 	if (!done)
1134 		_sd_put_iobuf(bp);
1135 
1136 	return (0);
1137 }
1138 
1139 /*
1140  * static int
1141  * _sd_async_ea - End action for async read/write.
1142  *
1143  * ARGUMENTS:
1144  *	bp 	- io buf pointer.
1145  *
1146  * RETURNS:
1147  *	NONE.
1148  *
1149  * Comments:
1150  *	This routine is called at interrupt level when the io is done.
1151  *	This is only called when the operation is asynchronous.
1152  */
1153 static int
1154 _sd_async_ea(struct buf *bp, iob_hook_t *hook)
1155 {
1156 	int done, error;
1157 
1158 	/*
1159 	 *  We get called for each buf that completes. When they are all done.
1160 	 *  we call the requestor's callback function.
1161 	 */
1162 	error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;
1163 
1164 	mutex_enter(hook->lockp);
1165 	done = !(--hook->count);
1166 
1167 	if (!hook->error)
1168 		hook->error = error;
1169 
1170 	mutex_exit(hook->lockp);
1171 
1172 	bp->b_forw = NULL;
1173 	bp->b_back = NULL;
1174 
1175 	if (done) {
1176 		nsc_off_t fba_pos;
1177 		nsc_size_t fba_len;
1178 		int error;
1179 		sdbc_ea_fn_t fn;
1180 		blind_t arg;
1181 
1182 		arg   =  hook->param;
1183 		fn    =  hook->func;
1184 		error = hook->error;
1185 #if defined(_SD_DEBUG)			/* simulate disk errors */
1186 		if (_test_async_fail == bp->b_edev) error = EIO;
1187 #endif
1188 
1189 		/* MAKE SURE b_lblkno, b_count never changes!! */
1190 		fba_pos = hook->start_fba;
1191 		fba_len = FBA_LEN(hook->size);
1192 
1193 		_sd_put_hook(hook);
1194 		_sd_put_iobuf(bp);
1195 		(*fn)(arg, fba_pos, fba_len, error);
1196 	} else
1197 		_sd_put_iobuf(bp);
1198 
1199 	return (0);
1200 }
1201 
1202 #ifdef DEBUG
/*
 * One i/o error injection entry (DEBUG kernels only).
 */
typedef struct ioerr_inject_s {
	dev_t ioj_dev;	/* device matched by _sdbc_ioj_lookup() */
	int   ioj_err;	/* error returned once ioj_cnt is not positive */
	int   ioj_cnt;	/* lookups still to be let through with no error */
} ioerr_inject_t;

/*
 * Table of sdbc_max_devs entries, indexed by cache descriptor.  It is
 * allocated by _sdbc_ioj_load() and freed by _sdbc_ioj_unload(); NULL
 * outside of that window.
 */
static ioerr_inject_t *ioerr_inject_table = NULL;
1210 
1211 void
1212 _sdbc_ioj_load()
1213 {
1214 	ioerr_inject_table =
1215 	    kmem_zalloc(sdbc_max_devs * sizeof (ioerr_inject_t), KM_SLEEP);
1216 }
1217 
1218 void
1219 _sdbc_ioj_unload()
1220 {
1221 	if (ioerr_inject_table != NULL) {
1222 		kmem_free(ioerr_inject_table,
1223 		    sdbc_max_devs * sizeof (ioerr_inject_t));
1224 		ioerr_inject_table = NULL;
1225 	}
1226 }
1227 
1228 static int
1229 _sdbc_ioj_lookup(dev_t dev)
1230 {
1231 	int cd;
1232 
1233 	for (cd = 0; cd < sdbc_max_devs; ++cd)
1234 		if (ioerr_inject_table[cd].ioj_dev == dev) {
1235 			if (ioerr_inject_table[cd].ioj_cnt > 0) {
1236 				--ioerr_inject_table[cd].ioj_cnt;
1237 				return (0);
1238 			} else {
1239 				return (ioerr_inject_table[cd].ioj_err);
1240 			}
1241 		}
1242 	return (0);
1243 }
1244 
1245 void
1246 _sdbc_ioj_set_dev(int cd, dev_t crdev)
1247 {
1248 	int i;
1249 
1250 	if (cd == -1) {  /* all  -- used for clearing table on shutdown */
1251 		for (i = 0; i < sdbc_max_devs; ++i)  {
1252 			ioerr_inject_table[i].ioj_dev = crdev;
1253 		}
1254 	} else
1255 		ioerr_inject_table[cd].ioj_dev = crdev; /* assume valid cd */
1256 }
1257 
1258 static
1259 void
1260 _sdbc_ioj_set_err(int cd, int err, int count)
1261 {
1262 	int i;
1263 
1264 	if (cd == -1) {  /* all */
1265 		for (i = 0; i < sdbc_max_devs; ++i)  {
1266 			ioerr_inject_table[i].ioj_err = err;
1267 			ioerr_inject_table[i].ioj_cnt = count;
1268 		}
1269 	} else {
1270 		ioerr_inject_table[cd].ioj_err = err;
1271 		ioerr_inject_table[cd].ioj_cnt = count;
1272 	}
1273 }
1274 
/*
 * _sdbc_ioj_clear_err - stop injecting errors for entry cd (-1 for all)
 * by resetting both its error and its countdown to zero.
 */
static void
_sdbc_ioj_clear_err(int cd)
{
	const int no_err = 0;
	const int no_count = 0;

	_sdbc_ioj_set_err(cd, no_err, no_count);
}
1280 
1281 int
1282 _sdbc_inject_ioerr(int cd, int ioj_err, int count)
1283 {
1284 	if ((cd < -1) || (cd >= sdbc_max_devs))
1285 		return (EINVAL);
1286 
1287 	_sdbc_ioj_set_err(cd, ioj_err, count);
1288 
1289 	return (0);
1290 }
1291 
1292 int
1293 _sdbc_clear_ioerr(int cd)
1294 {
1295 	if ((cd < -1) || (cd >= sdbc_max_devs))
1296 		return (EINVAL);
1297 
1298 	_sdbc_ioj_clear_err(cd);
1299 
1300 	return (0);
1301 }
1302 #endif
1303