/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/open.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/ddi.h>

#include <sys/nsc_thread.h>
#include <sys/nsctl/nsctl.h>

#include <sys/sdt.h>		/* dtrace is S10 or later */

#include <vm/seg_kmem.h>
#include "sd_bcache.h"
#include "sd_trace.h"
#include "sd_io.h"
#include "sd_iob.h"
#include "sd_misc.h"
#if defined(_SD_DEBUG)		/* simulate disk errors */
#include "sd_tdaemon.h"
#endif

#ifndef DS_DDICT
extern uintptr_t kobj_getsymvalue(char *, int);	/* DDI violation */
#endif

#define	DO_PAGE_LIST	sdbc_do_page	/* enable pagelist code */

int sdbc_do_page = 0;

#define	SGIO_MAX 254

static kmutex_t sdbc_bio_mutex;
static int sdbc_bio_count;

static unsigned long page_size, page_offset_mask;
#ifdef _SD_BIO_STATS
static int __start_io_count = 0;
#endif /* _SD_BIO_STATS */

/*
 * Forward declare all statics that are used before they are defined to
 * enforce parameter checking. Also forward-declare all functions that
 * have 64-bit argument types to enforce correct parameter checking.
 *
 * Some (if not all) of these could be removed if the code were reordered.
 */

static int _sd_sync_ea(struct buf *, iob_hook_t *);
static int _sd_async_ea(struct buf *, iob_hook_t *);
static void _sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size);
static void _sd_pack_pages_nopageio(struct buf *bp, struct buf *list,
    sd_addr_t *addr, nsc_off_t offset, nsc_size_t size);
static void _sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag);

#ifdef DEBUG
static int _sdbc_ioj_lookup(dev_t);
static void _sdbc_ioj_clear_err(int);
#endif

static int SD_WRITES_TOT = 0;
static int SD_WRITES_LEN[100];

_sd_buf_list_t _sd_buflist;

/*
 * _sd_add_vm_to_bp_plist - add the page corresponding to the
 *	virtual address "v" (a kernel virtual address) to the pagelist
 *	linked to buffer "bp".
 *
 *	The virtual address "v" is known to be allocated by segkmem,
 *	so we can look up its page by using the segkmem vnode kvp.
 *	This violates the DDI/DKI, but is workable for now.
 */
static void
_sd_add_vm_to_bp_plist(struct buf *bp, unsigned char *v)
{
	page_t *pp;
	page_t *one_pg = NULL;

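	/*
	 * Mask off the low bits so the page_find() offset is the
	 * page-aligned base of "v" within the segkmem vnode.
	 */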
	pp = page_find(&kvp, (u_offset_t)((uintptr_t)v & ~page_offset_mask));
	if (!pp) {
		cmn_err(CE_PANIC,
		    "_sd_add_vm_to_bp_plist: couldn't find page for 0x%p",
		    (void *)v);
	}

	page_add(&one_pg, pp);
	page_list_concat(&(bp->b_pages), &one_pg);
}

#ifdef _SD_BIO_STATS
static int
_sd_count_pages(page_t *pp)
{
	int cnt = 0;
	page_t *pp1;

	if (pp == NULL)
		return (cnt);

	for (cnt = 1, pp1 = pp->p_next; pp != pp1; cnt++, pp1 = pp1->p_next)
		;

	return (cnt);
}
#endif /* _SD_BIO_STATS */

/*
 * _sdbc_iobuf_load - load-time initialization of io buf structures.
 *
 * RETURNS:
 *	0  - success.
 *	-1 - failure.
 *
 * USAGE:
 *	This routine initializes the load-time buf structures.
 *	Should be called when the cache is loaded.
 */

int
_sdbc_iobuf_load(void)
{
	mutex_init(&sdbc_bio_mutex, NULL, MUTEX_DRIVER, NULL);

	/*
	 * HACK add a ref to kvp, to prevent VN_RELE on it from panicking
	 * the system
	 */
	VN_HOLD(&kvp);

	return (0);
}

/*
 * _sdbc_iobuf_unload - unload-time cleanup of io buf structures.
 *
 * USAGE:
 *	This routine removes the load-time buf structures.
 *	Should be called when the cache is unloaded.
 */
void
_sdbc_iobuf_unload(void)
{
	/* Undo our VN_HOLD hack, by putting ref count back to normal state */
	mutex_enter(&kvp.v_lock);
	kvp.v_count = 0;
	mutex_exit(&kvp.v_lock);

	mutex_destroy(&sdbc_bio_mutex);
	bzero(&_sd_buflist, sizeof (_sd_buf_list_t));
}
/*
 * _sdbc_iobuf_configure - configure a list of io bufs for later use.
 *
 * ARGUMENTS:
 *	num_bufs - number of buffers (from the configuration file).
 *
 * RETURNS:
 *	0  - success.
 *	<0 - failure.
 *
 * USAGE:
 *	This routine configures the buf structures for io.
 *	Should be called when the cache is configured.
 */

int
_sdbc_iobuf_configure(int num)
{
	int i;
	_sd_buf_list_t *buflist;
	iob_hook_t *hook;
	char symbol_name[32];

	if (!num || (num > _SD_DEFAULT_IOBUFS))
		num = _SD_DEFAULT_IOBUFS;

	if ((_sd_buflist.hooks = (iob_hook_t *)nsc_kmem_zalloc(
	    num * sizeof (iob_hook_t), KM_SLEEP, sdbc_iobuf_mem)) == NULL) {
		return (-1);
	}

	buflist = &_sd_buflist;
	buflist->bl_init_count = num;
	buflist->bl_hooks_avail = num;
	buflist->bl_hook_lowmark = num;
	hook = buflist->hooks;
	buflist->hook_head = hook;
	for (i = 0; i < num; i++, hook++) {
		cv_init(&hook->wait, NULL, CV_DRIVER, NULL);
		(void) sprintf(symbol_name, "sd_iob_dcb%d", i);
		hook->iob_drv_iodone = (dcb_t)kobj_getsymvalue(symbol_name, 0);
		if (!hook->iob_drv_iodone) {
			return (-2);
		}
		hook->next_hook = hook + 1;
	}
	(hook - 1)->next_hook = NULL;

	for (i = 0; i < MAX_HOOK_LOCKS; i++)
		mutex_init(&_sd_buflist.hook_locks[i], NULL, MUTEX_DRIVER,
		    NULL);

	cv_init(&_sd_buflist.hook_wait, NULL, CV_DRIVER, NULL);
	_sd_buflist.hook_waiters = 0;

	sdbc_bio_count = 0;
	SD_WRITES_TOT = 0;
	bzero(SD_WRITES_LEN, sizeof (SD_WRITES_LEN));

	/* pagelist i/o pages must be done in cache_init */

	page_size = ptob(1);
	page_offset_mask = page_size - 1;

	return (0);
}
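
/*
 * Each hook's completion callback is resolved by name ("sd_iob_dcbN")
 * rather than taken from an array, because b_private is cleared before
 * the i/o is started and so each hook needs a distinct code address to
 * hand to b_iodone.  A sketch of the shape such a generated stub
 * presumably takes (the real definitions come from sd_iob.h and are
 * not spelled out here):
 *
 *	int
 *	sd_iob_dcb0(struct buf *bp)
 *	{
 *		return ((*_sd_buflist.hooks[0].iob_hook_iodone)(bp,
 *		    &_sd_buflist.hooks[0]));
 *	}
 */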

/*
 * _sdbc_iobuf_deconfigure - release all memory allocated for the buf list.
 *
 * ARGUMENTS:
 *	None.
 *
 * RETURNS:
 *	Nothing.
 */
void
_sdbc_iobuf_deconfigure(void)
{
	ushort_t i;

	if (_sd_buflist.hooks) {
		for (i = 0; i < _sd_buflist.bl_init_count; i++) {
			cv_destroy(&_sd_buflist.hooks[i].wait);
		}
		cv_destroy(&_sd_buflist.hook_wait);
		nsc_kmem_free(_sd_buflist.hooks,
		    _sd_buflist.bl_init_count * sizeof (iob_hook_t));
		for (i = 0; i < MAX_HOOK_LOCKS; i++) {
			mutex_destroy(&_sd_buflist.hook_locks[i]);
		}
	}

	_sd_buflist.hooks = NULL;

#ifdef DEBUG
	_sdbc_ioj_clear_err(-1);	/* clear any injected i/o errors */
	_sdbc_ioj_set_dev(-1, 0);	/* clear dev entries */
#endif
}

/*
 * _sd_pending_iobuf()
 *
 * Return the number of I/O bufs outstanding.
 */
int
_sd_pending_iobuf(void)
{
	return (sdbc_bio_count);
}

/*
 * _sd_get_iobuf - allocate a buf.
 *
 * ARGUMENTS:
 *	None.
 *
 * RETURNS:
 *	NULL - failure.
 *	buf ptr otherwise.
 *
 * ASSUMPTIONS - process could block if we run out.
 */
/*ARGSUSED*/
static struct buf *
_sd_get_iobuf(int num_bdl)
{
	struct buf *bp;

	/* Get a buffer, ready for page list i/o */

	if (DO_PAGE_LIST)
		bp = pageio_setup(NULL, 0, &kvp, 0);
	else
		bp = getrbuf(KM_SLEEP);

	if (bp == NULL)
		return (NULL);

	mutex_enter(&sdbc_bio_mutex);
	sdbc_bio_count++;
	mutex_exit(&sdbc_bio_mutex);

	return (bp);
}

/*
 * _sd_put_iobuf - put a buf back in the freelist.
 *
 * ARGUMENTS:
 *	bp - buf pointer.
 *
 * RETURNS:
 *	Nothing.
 */
static void
_sd_put_iobuf(struct buf *bp)
{
	mutex_enter(&sdbc_bio_mutex);
	sdbc_bio_count--;
	mutex_exit(&sdbc_bio_mutex);

	if (DO_PAGE_LIST)
		pageio_done(bp);
	else
		freerbuf(bp);
}


/*
 * B_KERNBUF was removed from later Solaris releases; define it as 0
 * here so it can still be OR'd into b_flags harmlessly (use for
 * ORing only).
 */
#define	B_KERNBUF 0

static void
_sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag)
{
	bp->b_pages = NULL;
	bp->b_un.b_addr = 0;

	flag &= (B_READ | B_WRITE);

	/*
	 * if pagelist i/o, _sd_get_iobuf()/pageio_setup() has already
	 * set b_flags to
	 *	B_KERNBUF | B_PAGEIO | B_NOCACHE | B_BUSY	(sol 6,7,8)
	 * or
	 *	B_PAGEIO | B_NOCACHE | B_BUSY			(sol 9)
	 */

	bp->b_flags |= B_KERNBUF | B_BUSY | flag;

	bp->b_error = 0;

	bp->b_forw = NULL;
	bp->b_back = NULL;

	bp->b_lblkno = (diskaddr_t)pos;
	bp->b_bufsize = 0;
	bp->b_resid = 0;
	bp->b_proc = NULL;
	bp->b_edev = dev;
}


/*
 * _sd_get_hook - get an iob hook from the free list.
 *
 * ARGUMENTS:
 *	None.
 *
 * RETURNS:
 *	The newly allocated iob_hook.
 */
static iob_hook_t *
_sd_get_hook(void)
{
	iob_hook_t *ret;

	mutex_enter(&sdbc_bio_mutex);

retry:
	ret = _sd_buflist.hook_head;
	if (ret)
		_sd_buflist.hook_head = ret->next_hook;
	else {
		++_sd_buflist.hook_waiters;
		if (_sd_buflist.max_hook_waiters < _sd_buflist.hook_waiters)
			_sd_buflist.max_hook_waiters = _sd_buflist.hook_waiters;
		cv_wait(&_sd_buflist.hook_wait, &sdbc_bio_mutex);
		--_sd_buflist.hook_waiters;
		goto retry;
	}

	if (_sd_buflist.bl_hook_lowmark > --_sd_buflist.bl_hooks_avail)
		_sd_buflist.bl_hook_lowmark = _sd_buflist.bl_hooks_avail;

	mutex_exit(&sdbc_bio_mutex);

	ret->skipped = 0;
	ret->count = 0;

#ifdef _SD_BIO_STATS
	ret->PAGE_IO = 0;
	ret->NORM_IO = 0;
	ret->NORM_IO_SIZE = 0;
	ret->SKIP_IO = 0;
	ret->PAGE_COMBINED = 0;
#endif /* _SD_BIO_STATS */

	return (ret);
}

/*
 * _sd_put_hook - put an iob hook back on the free list.
 *
 * ARGUMENTS:
 *	hook - an iob_hook to be returned to the freelist.
 */
static void
_sd_put_hook(iob_hook_t *hook)
{
	mutex_enter(&sdbc_bio_mutex);

	if (_sd_buflist.hook_waiters) {
		cv_signal(&_sd_buflist.hook_wait);
	}
	hook->next_hook = _sd_buflist.hook_head;
	_sd_buflist.hook_head = hook;

	++_sd_buflist.bl_hooks_avail;

	mutex_exit(&sdbc_bio_mutex);
}

/*
 * _sd_extend_iob - the i/o block we are handling needs a new struct buf to
 *	describe the next hunk of i/o.  Get a new struct buf and initialize
 *	it based on the state in the struct buf we are passed as an arg.
 *
 * ARGUMENTS:
 *	head_bp - a buffer header in the current i/o block we are handling
 *	    (generally the initial header, but in fact could be any of the
 *	    ones, if any, that were chained to the initial one).
 */
static struct buf *
_sd_extend_iob(struct buf *head_bp)
{
	struct buf *bp;
	iob_hook_t *hook = (iob_hook_t *)head_bp->b_private;

	if (!(bp = _sd_get_iobuf(0)))
		return (0);

	bp->b_pages = NULL;
	bp->b_un.b_addr = 0;

	bp->b_flags |= (head_bp->b_flags & (B_READ | B_WRITE));

	if (!DO_PAGE_LIST)
		bp->b_flags |= B_KERNBUF | B_BUSY;

	bp->b_error = 0;

	/*
	 * b_forw/b_back will form a doubly linked list of all the buffers
	 * associated with this block of i/o.
	 * hook->tail points to the last buffer in the chain.
	 */
	bp->b_forw = NULL;
	bp->b_back = hook->tail;
	hook->tail->b_forw = bp;
	hook->tail = bp;
	hook->count++;

	ASSERT(BLK_FBA_OFF(hook->size) == 0);

	bp->b_lblkno = (diskaddr_t)hook->start_fba +
	    (diskaddr_t)FBA_NUM(hook->size);

	bp->b_bufsize = 0;
	bp->b_resid = 0;
	bp->b_proc = NULL;
	bp->b_edev = head_bp->b_edev;

	bp->b_iodone = NULL;	/* for now */
	bp->b_private = hook;

	return (bp);
}
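
/*
 * After two extensions the hook and its buffers look like this
 * (an illustrative sketch of the b_forw/b_back links built above):
 *
 *	hook->chain -> bp0 <-> bp1 <-> bp2 <- hook->tail
 *
 * where each bp's b_lblkno picks up at start_fba plus the FBAs
 * already described by the earlier buffers in the chain.
 */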

/*
 * sd_alloc_iob - start processing a block of i/o.  This allocates an initial
 *	buffer header for describing the i/o and an iob_hook for collecting
 *	information about all the i/o requests added to this buffer.
 *
 * ARGUMENTS:
 *	dev - the device all the i/o is destined for.
 *	fba_pos - the initial disk block of the transfer.
 *	blks - ignored.
 *	flag - signals whether this is a read or write request.
 *
 * RETURNS:
 *	Pointer to a free struct buf which will be used to describe the
 *	i/o request.
 */
/* ARGSUSED */
struct buf *
sd_alloc_iob(dev_t dev, nsc_off_t fba_pos, int blks, int flag)
{
	struct buf *bp;
	iob_hook_t *hook;

	if (!(bp = _sd_get_iobuf(0)))
		return (0);

	_sd_setup_iob(bp, dev, fba_pos, flag);

	bp->b_iodone = NULL;	/* for now */
	hook = _sd_get_hook();
	if (!hook) {
		/* can't see how this could happen */
		_sd_put_iobuf(bp);
		return (0);
	}

	/*
	 * pick an arbitrary lock: hash on the hook's address, which is
	 * fixed for its lifetime.
	 */
	hook->lockp = &_sd_buflist.hook_locks[((long)hook >> 9) &
	    (MAX_HOOK_LOCKS - 1)];
	hook->start_fba = fba_pos;
	hook->last_fba = fba_pos;
	hook->size = 0;
	hook->tail = bp;
	hook->chain = bp;
	hook->count = 1;
	hook->error = 0;
	bp->b_private = hook;

	return (bp);
}
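
/*
 * An illustrative sketch of the intended calling sequence (the caller,
 * "addr", and the lengths are hypothetical, not taken from this file):
 *
 *	bp = sd_alloc_iob(dev, fba_pos, 0, B_READ);
 *	sd_add_fba(bp, &addr, 0, fba_len);	(repeat as needed)
 *	rc = sd_start_io(bp, strategy, NULL, 0);
 *
 * With fn == NULL this is synchronous and rc is NSC_DONE or the first
 * error; with a non-NULL fn it returns NSC_PENDING and fn is called
 * later with (arg, fba_pos, fba_len, error).
 */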

/*
 * _sd_pack_pages - produce i/o requests that will perform the type of i/o
 *	described by bp (READ/WRITE).  It attempts to tack the i/o onto the
 *	buf pointed to by list, to minimize the number of bufs required.
 *
 * ARGUMENTS:
 *	bp - is the i/o description, i.e. head.
 *	list - is where to start adding this i/o request (null if we should
 *	    extend).
 *	addr - address describing where the data is.
 *	offset - offset from addr where data begins.
 *	size - size of the i/o request.
 */
static void
_sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size)
{
	uintptr_t start_addr, end_addr;
	int page_end_aligned;
#ifdef _SD_BIO_STATS
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *orig_list = list;
#endif /* _SD_BIO_STATS */

	start_addr = (uintptr_t)addr->sa_virt + offset;
	end_addr = start_addr + size;

	page_end_aligned = !(end_addr & page_offset_mask);

	if (!list && !(list = _sd_extend_iob(bp))) {
		/*
		 * we're hosed since we have no error return...
		 * though we could ignore stuff from here on out
		 * and return ENOMEM when we get to sd_start_io.
		 * This will do for now.
		 */
		cmn_err(CE_PANIC, "_sd_pack_pages: couldn't extend iob");
	}

	/*
	 * We only want to do pagelist i/o if we end on a page boundary.
	 * If we don't end on a page boundary we won't combine with the
	 * next request and so we may as well do it as normal as it
	 * will only use one buffer.
	 */

	if (DO_PAGE_LIST && page_end_aligned) {
		if (start_addr & page_offset_mask) {
			/*
			 * handle the partial page
			 */
			if (list->b_bufsize) {
				if (!(list = _sd_extend_iob(bp))) {
					/*
					 * we're hosed since we have no error
					 * return though we could ignore stuff
					 * from here on out and return ENOMEM
					 * when we get to sd_start_io.
					 * This will do for now.
					 */
					cmn_err(CE_PANIC,
					    "_sd_pack_pages: couldn't extend iob");
				}
			}
#ifdef _SD_BIO_STATS
			hook->PAGE_IO++;
#endif /* _SD_BIO_STATS */
			_sd_add_vm_to_bp_plist(list,
			    (unsigned char *)start_addr);
			list->b_bufsize = page_size -
			    (start_addr & page_offset_mask);
			list->b_un.b_addr = (caddr_t)
			    (start_addr & page_offset_mask);
			size -= list->b_bufsize;
			start_addr += list->b_bufsize;
		}
		/*
		 * Now fill with all the full pages remaining.
		 */
		for (; size > 0; size -= page_size) {
#ifdef _SD_BIO_STATS
			hook->PAGE_IO++;
#endif /* _SD_BIO_STATS */

			_sd_add_vm_to_bp_plist(list,
			    (unsigned char *)start_addr);
			start_addr += page_size;
			list->b_bufsize += page_size;
#ifdef _SD_BIO_STATS
			if (list == orig_list)
				hook->PAGE_COMBINED++;
#endif /* _SD_BIO_STATS */
		}
		if (size)
			cmn_err(CE_PANIC, "_sd_pack_pages: bad size: %"
			    NSC_SZFMT, size);
	} else {
		/*
		 * Wasn't worth it as pagelist i/o, do as normal
		 */
		if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
			/*
			 * we're hosed since we have no error return...
			 * though we could ignore stuff from here on out
			 * and return ENOMEM when we get to sd_start_io.
			 * This will do for now.
			 */
			cmn_err(CE_PANIC,
			    "_sd_pack_pages: couldn't extend iob");
		}

		/* kernel virtual */
		list->b_flags &= ~(B_PHYS | B_PAGEIO);
		list->b_un.b_addr = (caddr_t)start_addr;
#ifdef _SD_BIO_STATS
		hook->NORM_IO++;
		hook->NORM_IO_SIZE += size;
#endif /* _SD_BIO_STATS */
		list->b_bufsize = (size_t)size;
	}
}

/*
 * _sd_pack_pages_nopageio - performs the same function as _sd_pack_pages()
 *	when not doing pageio.
 */
static void
_sd_pack_pages_nopageio(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size)
{
	uintptr_t start_addr;
#ifdef _SD_BIO_STATS
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *orig_list = list;
#endif /* _SD_BIO_STATS */

	start_addr = (uintptr_t)addr->sa_virt + offset;

	if (!list && !(list = _sd_extend_iob(bp))) {
		/*
		 * we're hosed since we have no error return...
		 * though we could ignore stuff from here on out
		 * and return ENOMEM when we get to sd_start_io.
		 * This will do for now.
		 */
		cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
		    "extend iob");
	}

	if (list->b_bufsize &&
	    (start_addr == (uintptr_t)(list->b_un.b_addr + list->b_bufsize))) {
		/* contiguous */
		list->b_bufsize += (size_t)size;
	} else {
		/*
		 * not contiguous mem (extend) or first buffer (bufsize == 0).
		 */
		if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
			/*
			 * we're hosed since we have no error return...
			 * though we could ignore stuff from here on out
			 * and return ENOMEM when we get to sd_start_io.
			 * This will do for now.
			 */
			cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
			    "extend iob");
		}
		list->b_un.b_addr = (caddr_t)start_addr;
		list->b_bufsize = (size_t)size;
	}

#ifdef _SD_BIO_STATS
	hook->NORM_IO++;
	hook->NORM_IO_SIZE += size;
#endif /* _SD_BIO_STATS */
}
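
/*
 * For example (hypothetical values): if the previous request left
 * list->b_un.b_addr = 0x1000 with b_bufsize = 0x2000, a new request
 * starting at 0x3000 is contiguous and simply grows b_bufsize; one
 * starting anywhere else forces a fresh buf via _sd_extend_iob().
 */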

/*
 * sd_add_fba - add an i/o request to the block of i/o described by bp.
 *	We try to combine this request with the previous request.  In
 *	addition we try to do the i/o as PAGELIST_IO if it satisfies
 *	the restrictions for it.  If the i/o request can't be combined
 *	we extend the i/o description with a new buffer header and add
 *	it to the chain headed by bp.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *	addr - description of the address the data will be read from or
 *	    written to.  A NULL indicates that this i/o request doesn't
 *	    need to actually happen.  Used to mark reads when the fba is
 *	    already in cache and dirty.
 *	fba_pos - offset from address in addr where the i/o is to start.
 *	fba_len - number of consecutive fbas to transfer.
 *
 * NOTE: It is assumed that the memory is physically contiguous but may span
 *	multiple pages (should a cache block be larger than a page).
 */
void
sd_add_fba(struct buf *bp, sd_addr_t *addr, nsc_off_t fba_pos,
    nsc_size_t fba_len)
{
	nsc_off_t offset;
	nsc_size_t size;
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;

	size = FBA_SIZE(fba_len);
	offset = FBA_SIZE(fba_pos);

	if (addr) {
		/*
		 * See if this can be combined with previous request(s)
		 */
		if (!bp->b_bufsize) {
			if (DO_PAGE_LIST)
				_sd_pack_pages(bp, bp, addr, offset, size);
			else
				_sd_pack_pages_nopageio(bp, bp, addr, offset,
				    size);
		} else {
			if (DO_PAGE_LIST) {
				if (hook->tail->b_flags & B_PAGEIO) {
					/*
					 * Last buffer was a pagelist.  Unless a
					 * skip was detected the last request
					 * ended on a page boundary.  If this
					 * one starts on one we combine the
					 * best we can.
					 */
					if (hook->skipped)
						_sd_pack_pages(bp, NULL, addr,
						    offset, size);
					else
						_sd_pack_pages(bp, hook->tail,
						    addr, offset, size);
				} else {
					/*
					 * Last buffer was vanilla i/o or worse
					 * (sd_add_mem)
					 */
					_sd_pack_pages(bp, NULL, addr, offset,
					    size);
				}
			} else {
				if (hook->skipped)
					_sd_pack_pages_nopageio(bp, NULL,
					    addr, offset, size);
				else
					_sd_pack_pages_nopageio(bp,
					    hook->tail, addr, offset, size);
			}
		}
		hook->skipped = 0;
	} else {
		/* Must be a read of a dirty block we want to discard */

		ASSERT(bp->b_flags & B_READ);
#ifdef _SD_BIO_STATS
		hook->SKIP_IO++;
#endif /* _SD_BIO_STATS */
		hook->skipped = 1;
		if (!bp->b_bufsize)
			bp->b_lblkno += fba_len;
	}
	hook->size += size;
}

/*
 * sd_add_mem - add an i/o request to the block of i/o described by bp.
 *	The memory target for this i/o may span multiple pages and may
 *	not be physically contiguous.  The length also might not be a
 *	multiple of an fba.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *	buf - target of this i/o request.
 *	len - number of bytes to transfer.
 */
void
sd_add_mem(struct buf *bp, char *buf, nsc_size_t len)
{
	nsc_size_t n;
	uintptr_t start;
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;

	start = (uintptr_t)buf & page_offset_mask;

	for (; len > 0; buf += n, len -= n, start = 0) {
		n = min((nsc_size_t)len, (nsc_size_t)(page_size - start));
		/*
		 * i/o size must be a multiple of an FBA since we can't
		 * count on lower level drivers to understand b_offset
		 */
		if (BLK_FBA_OFF(n) != 0) {
			cmn_err(CE_WARN,
			    "!sdbc(sd_add_mem) i/o request not FBA sized (%"
			    NSC_SZFMT ")", n);
		}

		if (!bp->b_bufsize) {
			/* first request */
			bp->b_flags &= ~(B_PHYS | B_PAGEIO);
			bp->b_un.b_addr = buf;
			bp->b_bufsize = (size_t)n;
		} else {
			struct buf *new_bp;

			if (!(new_bp = _sd_extend_iob(bp))) {
				/* we're hosed */
				cmn_err(CE_PANIC,
				    "sd_add_mem: couldn't extend iob");
			}
			new_bp->b_flags &= ~(B_PHYS | B_PAGEIO);
			new_bp->b_un.b_addr = buf;
			new_bp->b_bufsize = (size_t)n;
		}
		hook->size += n;
	}
}


/*
 * sd_start_io - start all the i/o needed to satisfy the i/o request described
 *	by bp.  If supplied a non-NULL fn, this is an async request and we
 *	return NSC_PENDING and call fn when all the i/o completes.  Otherwise
 *	this is a synchronous request and we sleep until all the i/o is
 *	complete.  If any buffer in the chain gets an error we return the
 *	first error we see (once all the i/o is complete).
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *	strategy - strategy function to call if known by the user, or NULL.
 *	fn - user's callback function.  NULL implies a synchronous request.
 *	arg - an argument passed to the user's callback function.
 */
int
sd_start_io(struct buf *bp, strategy_fn_t strategy, sdbc_ea_fn_t fn,
    blind_t arg)
{
	int err;
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *bp_next;
	int (*ea_fn)(struct buf *, iob_hook_t *);
#ifdef _SD_BIO_STATS
	static int total_pages, total_pages_combined, total_norm;
	static int total_norm_combined, total_skipped;
	static nsc_size_t total_norm_size;

	static int total_bufs;
	static int total_xpages_w, total_ypages_w;
	static int total_xpages_r, total_ypages_r;
	static int max_run_r, max_run_w;
#endif /* _SD_BIO_STATS */

	hook->func = fn;
	hook->param = arg;
	if (fn != NULL)
		ea_fn = _sd_async_ea;
	else
		ea_fn = _sd_sync_ea;

	hook->iob_hook_iodone = ea_fn;

#ifdef _SD_BIO_STATS
	__start_io_count++;
	total_pages += hook->PAGE_IO;
	total_pages_combined += hook->PAGE_COMBINED;
	total_norm += hook->NORM_IO;
	total_norm_size += hook->NORM_IO_SIZE;
	total_skipped += hook->SKIP_IO;
#endif /* _SD_BIO_STATS */

	for (; bp; bp = bp_next) {

		DTRACE_PROBE4(sd_start_io_bufs, struct buf *, bp,
		    long, bp->b_bufsize, int, bp->b_flags,
		    iob_hook_t *, hook);

		bp_next = bp->b_forw;
		if (!(bp->b_flags & B_READ)) {
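			/*
			 * Histogram of write sizes: each SD_WRITES_LEN
			 * bucket is 32KB wide, wrapping modulo the
			 * table size.
			 */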
			SD_WRITES_TOT++;
			SD_WRITES_LEN[(bp->b_bufsize / 32768) %
			    (sizeof (SD_WRITES_LEN) / sizeof (int))]++;
		}
		bp->b_iodone = hook->iob_drv_iodone;
		bp->b_bcount = bp->b_bufsize;
		bp->b_forw = NULL;
		bp->b_back = NULL;
		bp->b_private = NULL;

#ifdef _SD_BIO_STATS
		total_bufs++;
		if (bp->b_flags & B_PAGEIO) {
			int i;

			i = _sd_count_pages(bp->b_pages);
			if (bp->b_flags & B_READ) {
				if (i > max_run_r)
					max_run_r = i;
				total_xpages_r += i;
				total_ypages_r++;
			} else {
				if (i > max_run_w)
					max_run_w = i;
				total_xpages_w += i;
				total_ypages_w++;
			}
		}
#endif /* _SD_BIO_STATS */

		/*
		 * It's possible for us to be told to read a dirty block
		 * where all the i/o can go away (e.g. read one fba, it's
		 * in cache and dirty) so we really have nothing to do but
		 * say we're done.
		 */
		if (bp->b_bcount) {
			if (!strategy) {
				strategy =
				    nsc_get_strategy(getmajor(bp->b_edev));
			}

			if (!strategy) {
				bp->b_flags |= B_ERROR;
				bp->b_error = ENXIO;
				(*bp->b_iodone)(bp);
			} else
#ifdef DEBUG
			/* inject i/o error for testing */
			if (bp->b_error = _sdbc_ioj_lookup(bp->b_edev)) {
				bp->b_flags |= B_ERROR;
				(*bp->b_iodone)(bp);
			} else
#endif
			{
				(*strategy)(bp);
			}
		} else {
			(*bp->b_iodone)(bp);
		}
	}

#ifdef _SD_BIO_STATS
	if (__start_io_count == 2000) {
		__start_io_count = 0;
		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io) t_bufs %d pages %d "
		    "combined %d norm %d norm_size %" NSC_SZFMT " skipped %d",
		    total_bufs,
		    total_pages, total_pages_combined, total_norm,
		    total_norm_size, total_skipped);

		total_bufs = 0;
		total_pages = 0;
		total_pages_combined = 0;
		total_norm = 0;
		total_norm_combined = 0;
		total_skipped = 0;
		total_norm_size = 0;

		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io)(r) max_run %d, total_xp %d total yp %d",
		    max_run_r, total_xpages_r, total_ypages_r);

		total_xpages_r = 0;
		total_ypages_r = 0;
		max_run_r = 0;

		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io)(w) max_run %d, total_xp %d total yp %d",
		    max_run_w, total_xpages_w, total_ypages_w);

		total_xpages_w = 0;
		total_ypages_w = 0;
		max_run_w = 0;
	}
#endif /* _SD_BIO_STATS */

	if (ea_fn == _sd_async_ea) {
		DTRACE_PROBE(sd_start_io_end);

		return (NSC_PENDING);
	}

	mutex_enter(hook->lockp);

	while (hook->count) {
		cv_wait(&hook->wait, hook->lockp);
	}
	mutex_exit(hook->lockp);

	err = hook->error ? hook->error : NSC_DONE;
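	/*
	 * _sd_sync_ea left the last completing buf in hook->tail so it
	 * could be freed here, after the hook itself has been returned.
	 */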
	bp = hook->tail;
	_sd_put_hook(hook);
	_sd_put_iobuf(bp);

	return (err);
}

/*
 * _sd_sync_ea - called when a single i/o operation is complete.  If this
 *	is the last outstanding i/o we wake up the sleeper.  If this i/o
 *	had an error then we store the error result in the iob_hook if
 *	this was the first error.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o that just completed.
 *
 * Comments:
 *	This routine is called at interrupt level when the io is done.
 */
static int
_sd_sync_ea(struct buf *bp, iob_hook_t *hook)
{
	int error;
	int done;

	/*
	 * We get called for each buf that completes.  When they are all
	 * done we wake up the waiter.
	 */
	error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;

	mutex_enter(hook->lockp);

	if (!hook->error)
		hook->error = error;

	done = !(--hook->count);
	if (done) {
		/* remember the last buffer so we can free it later */
		hook->tail = bp;
		cv_signal(&hook->wait);
	}
	mutex_exit(hook->lockp);

	/*
	 * let sd_start_io free the final buffer so the hook can be returned
	 * first.
	 */
	if (!done)
		_sd_put_iobuf(bp);

	return (0);
}

/*
 * _sd_async_ea - end action for async read/write.
 *
 * ARGUMENTS:
 *	bp - io buf pointer.
 *
 * RETURNS:
 *	0.
 *
 * Comments:
 *	This routine is called at interrupt level when the io is done.
 *	This is only called when the operation is asynchronous.
 */
static int
_sd_async_ea(struct buf *bp, iob_hook_t *hook)
{
	int done, error;

	/*
	 * We get called for each buf that completes.  When they are all
	 * done we call the requestor's callback function.
	 */
	error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;

	mutex_enter(hook->lockp);
	done = !(--hook->count);

	if (!hook->error)
		hook->error = error;

	mutex_exit(hook->lockp);

	bp->b_forw = NULL;
	bp->b_back = NULL;

	if (done) {
		nsc_off_t fba_pos;
		nsc_size_t fba_len;
		int error;
		sdbc_ea_fn_t fn;
		blind_t arg;

		arg = hook->param;
		fn = hook->func;
		error = hook->error;
#if defined(_SD_DEBUG)	/* simulate disk errors */
		if (_test_async_fail == bp->b_edev)
			error = EIO;
#endif

		/* MAKE SURE b_lblkno, b_count never changes!! */
		fba_pos = hook->start_fba;
		fba_len = FBA_LEN(hook->size);

		_sd_put_hook(hook);
		_sd_put_iobuf(bp);
		(*fn)(arg, fba_pos, fba_len, error);
	} else
		_sd_put_iobuf(bp);

	return (0);
}

#ifdef DEBUG
typedef struct ioerr_inject_s {
	dev_t ioj_dev;
	int ioj_err;
	int ioj_cnt;
} ioerr_inject_t;

static ioerr_inject_t *ioerr_inject_table = NULL;

void
_sdbc_ioj_load()
{
	ioerr_inject_table =
	    kmem_zalloc(sdbc_max_devs * sizeof (ioerr_inject_t), KM_SLEEP);
}

void
_sdbc_ioj_unload()
{
	if (ioerr_inject_table != NULL) {
		kmem_free(ioerr_inject_table,
		    sdbc_max_devs * sizeof (ioerr_inject_t));
		ioerr_inject_table = NULL;
	}
}

static int
_sdbc_ioj_lookup(dev_t dev)
{
	int cd;

	for (cd = 0; cd < sdbc_max_devs; ++cd)
		if (ioerr_inject_table[cd].ioj_dev == dev) {
			if (ioerr_inject_table[cd].ioj_cnt > 0) {
				--ioerr_inject_table[cd].ioj_cnt;
				return (0);
			} else {
				return (ioerr_inject_table[cd].ioj_err);
			}
		}
	return (0);
}

void
_sdbc_ioj_set_dev(int cd, dev_t crdev)
{
	int i;

	if (cd == -1) {	/* all -- used for clearing table on shutdown */
		for (i = 0; i < sdbc_max_devs; ++i) {
			ioerr_inject_table[i].ioj_dev = crdev;
		}
	} else
		ioerr_inject_table[cd].ioj_dev = crdev;	/* assume valid cd */
}

static void
_sdbc_ioj_set_err(int cd, int err, int count)
{
	int i;

	if (cd == -1) {	/* all */
		for (i = 0; i < sdbc_max_devs; ++i) {
			ioerr_inject_table[i].ioj_err = err;
			ioerr_inject_table[i].ioj_cnt = count;
		}
	} else {
		ioerr_inject_table[cd].ioj_err = err;
		ioerr_inject_table[cd].ioj_cnt = count;
	}
}

static void
_sdbc_ioj_clear_err(int cd)
{
	_sdbc_ioj_set_err(cd, 0, 0);
}

int
_sdbc_inject_ioerr(int cd, int ioj_err, int count)
{
	if ((cd < -1) || (cd >= sdbc_max_devs))
		return (EINVAL);

	_sdbc_ioj_set_err(cd, ioj_err, count);

	return (0);
}
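
/*
 * Illustrative use (hypothetical cd): _sdbc_inject_ioerr(cd, EIO, 2)
 * lets the next two i/os to the device complete normally, then fails
 * every subsequent i/o with EIO until _sdbc_clear_ioerr(cd) is called.
 */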

int
_sdbc_clear_ioerr(int cd)
{
	if ((cd < -1) || (cd >= sdbc_max_devs))
		return (EINVAL);

	_sdbc_ioj_clear_err(cd);

	return (0);
}
#endif