/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/sdt.h>
#include <sys/swap.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/fs/snode.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/anon.h>
#include <vm/faultcode.h>

extern int maxphys;

void
minphys(struct buf *bp)
{
	if (bp->b_bcount > maxphys)
		bp->b_bcount = maxphys;
}
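
/*
 * Illustrative sketch (not part of the original source): a driver whose
 * hardware imposes a transfer limit smaller than maxphys would pass its
 * own mincnt routine to physio(9F) instead of minphys(), and may still
 * call minphys() afterwards so the system-wide cap is honored as well.
 * The names xx_minphys and XX_MAXXFER below are hypothetical.
 *
 *	#define	XX_MAXXFER	(64 * 1024)
 *
 *	static void
 *	xx_minphys(struct buf *bp)
 *	{
 *		if (bp->b_bcount > XX_MAXXFER)
 *			bp->b_bcount = XX_MAXXFER;
 *		minphys(bp);
 *	}
 */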

/*
 * Use kmem_cache_create for physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/destruction
 * per request.
 */

static struct kmem_cache *physio_buf_cache;

/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)buf);
	return (0);
}

/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	biofini((struct buf *)buf);
}

void
physio_bufs_init(void)
{
	physio_buf_cache = kmem_cache_create("physio_buf_cache",
	    sizeof (struct buf), 0,
	    physio_buf_constructor, physio_buf_destructor,
	    NULL, NULL, NULL, 0);
}
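
/*
 * Illustrative sketch (not part of the original source): without the
 * constructed cache above, each raw I/O request that needed a transient
 * buf header would pay for bioinit()/biofini() itself, i.e. a semaphore
 * construction and destruction per request, roughly:
 *
 *	struct buf *bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
 *	bioinit(bp);
 *	... issue the transfer and biowait() ...
 *	biofini(bp);
 *	kmem_free(bp, sizeof (struct buf));
 *
 * With the cache, bioinit()/biofini() run only at object construction and
 * destruction time, so a warm cache hands back a ready-to-use buf.
 */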

/*
 * initiate raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */
int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
	int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
	struct iovec *iov;
	struct proc *procp;
	struct as *asp;
	ssize_t c;
	char *a;
	int error = 0;
	page_t **pplist;
	int allocbuf = 0;

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

	/* Kernel probe */
	TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
	    tnf_device,		device,		dev,
	    tnf_offset,		offset,		uio->uio_loffset,
	    tnf_size,		size,		uio->uio_resid,
	    tnf_bioflags,	rw,		rw);

	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);

	if (bp == NULL) {
		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
		bp->b_iodone = NULL;
		bp->b_resid = 0;
		allocbuf = 1;
	}
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		asp = procp->p_as;
	} else {
		procp = NULL;
		asp = &kas;
	}
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 * b_dip field.
	 */
	bp->b_file = NULL;
	bp->b_offset = -1;

	if (dev != NODEV) {
		(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
		    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	} else {
		bp->b_dip = NULL;
	}

	while (uio->uio_iovcnt > 0) {
		iov = uio->uio_iov;

		bp->b_error = 0;
		bp->b_proc = procp;

		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
				break;
			if (uio->uio_loffset < 0) {
				error = EINVAL;
				break;
			}
#ifdef	_ILP32
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
				error = EINVAL;
				break;
			}
#endif	/* _ILP32 */
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_edev = dev;
			bp->b_lblkno = btodt(uio->uio_loffset);

			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 * instead.
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			(*mincnt)(bp);
			c = bp->b_bcount;

			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
			    "as_pagelock_end:");

			if (error != 0) {
				bp->b_flags |= B_ERROR;
				bp->b_error = error;
				bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
				break;
			}
			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			}

			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;

			(void) (*strat)(bp);
			error = biowait(bp);

			/*
			 * unlock the pages
			 */
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");

			c -= bp->b_resid;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_resid -= c;
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
				break;
		}
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);

		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
			break;
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	if (allocbuf) {
		kmem_cache_free(physio_buf_cache, bp);
	}

	/* Kernel probe */
	TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
	    tnf_device,	device,	dev);

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

	return (error);
}
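
/*
 * Illustrative sketch (not part of the original source): a character
 * driver's read(9E) entry point typically reaches this code through the
 * DDI wrapper physio(9F), passing its strategy(9E) routine and a mincnt
 * routine such as minphys().  The xx_read/xx_strategy names below are
 * hypothetical.
 *
 *	static int
 *	xx_read(dev_t dev, struct uio *uiop, cred_t *credp)
 *	{
 *		return (physio(xx_strategy, NULL, dev, B_READ,
 *		    minphys, uiop));
 *	}
 *
 * Passing bp == NULL lets default_physio() borrow a buf header from
 * physio_buf_cache for the duration of the transfer.
 */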

/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
int
useracc(void *addr, size_t count, int access)
{
	uint_t prot;

	prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
	return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}

#define	MAX_MAPIN_PAGES	8

/*
 * This function temporarily "borrows" user pages for kernel use. If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 * pages from any changes by the user. The caller is responsible for
 * unlocking and tearing down cow settings when it's done with the pages.
 * For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1. On entering this function, cached_ppp contains a list
 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 * previous call). Thus if the same pages remain behind
 * [uaddr..uaddr+*lenp], the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count, and changes the user-mapping to read-only. This
 * scheme should work on all types of segment drivers. But to be safe,
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user-mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in. For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * Error return:
 * ENOTSUP - operation like this is not supported either on this segment
 * type, or on this platform type.
 */
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr,
    struct page **cached_ppp, struct anon **app, size_t *lenp, int cow)
{
	struct hat *hat;
	struct seg *seg;
	caddr_t base;
	page_t *pp, *ppp[MAX_MAPIN_PAGES];
	long i;
	int flags;
	size_t size, total = *lenp;
	char first = 1;
	faultcode_t res;

	*lenp = 0;
	if (cow) {
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		seg = as_findseg(as, uaddr, 0);
		if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
		    (uaddr + total) > base + seg->s_size) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (EINVAL);
		}
		/*
		 * The COW scheme should work for all segment types.
		 * But to be safe, we check against segvn.
		 */
		if (seg->s_ops != &segvn_ops) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		}
	}
	hat = as->a_hat;
	size = total;
tryagain:
	/*
	 * If (cow), hat_softlock will also change the usr protection to RO.
	 * This is the first step toward setting up cow. Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address. Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	size = total - size;
	*lenp += size;
	size = size >> PAGESHIFT;
	i = 0;
	while (i < size) {
		pp = ppp[i];

		if (cow) {
			kmutex_t *ahm;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE. This also works for MAP_SHARED
			 * segment. The disadvantage is that it locks the
			 * page from being used by anybody else.
			 */
			ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
			mutex_enter(ahm);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref. (segvn_unmap and
			 * segvn_free need the as writer lock to do anon_free.)
			 */
			if (*app != NULL) {
#if 0
				if ((*app)->an_refcnt == 0)
					/*
					 * Consider the following scenario
					 * (unlikely though):
					 * 1. an_refcnt == 2
					 * 2. we softlock the page.
					 * 3. cow occurs on this addr. So a new
					 * ap, page and mapping is established
					 * on addr.
					 * 4. an_refcnt drops to 1
					 * (segvn_faultpage ->
					 * anon_decref(oldap))
					 * 5. the last ref to ap also drops
					 * (from another as). It ends up
					 * blocked inside anon_decref trying
					 * to get page's excl lock.
					 * 6. Later kcfree unlocks the page,
					 * calls anon_decref -> oops, ap is
					 * gone already.
					 *
					 * Holding as writer lock solves all
					 * problems.
					 */
					*app = NULL;
				else
#endif
					(*app)->an_refcnt++;
			}
			mutex_exit(ahm);
		} else {
			*app = NULL;
		}

		if (kaddr != (caddr_t)-1) {
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					    HAT_LOAD_NOCONSIST;
				else
					flags = HAT_LOAD_REMAP |
					    HAT_LOAD_NOCONSIST;
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world. This
				 * is not very pretty. But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_NOCONSIST will get us an uncached
				 * mapping on VAC. hat_softlock will flush
				 * a VAC_WRITEBACK cache. Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is - change hat_devload
				 * to get a cached mapping. Allocate a kaddr
				 * with the same vcolor as uaddr. Then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
				*cached_ppp = pp;
			}
			kaddr += PAGESIZE;
		}
		cached_ppp++;
		app++;
		++i;
	}
	if (cow) {
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in. We could've fallen back to copy and
		 * let it fault in the pages. But for a mapped file, we
		 * normally reference each page only once. For zero-copy to
		 * be of any use, we'd better fault in the page now and try
		 * again.
		 */
		first = 0;
		size = size << PAGESHIFT;
		uaddr += size;
		total -= size;
		size = total;
		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		if (cow)
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		goto tryagain;
	}

	switch (res) {
	case FC_NOSUPPORT:
		return (ENOTSUP);
	case FC_PROT:	/* Pretend we don't know about it. This will be */
			/* caught by the caller when uiomove fails. */
	case FC_NOMAP:
	case FC_OBJERR:
	default:
		return (0);
	}
}
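
/*
 * Illustrative sketch (not part of the original source): per the contract
 * described in the block comment above, a zero-copy caller asks for up to
 * MAX_MAPIN_PAGES worth of user data, consumes whatever cow_mapin() managed
 * to lock and map, and falls back to copyin() for the remainder.  All
 * variable names below are hypothetical.
 *
 *	size_t len = MIN(resid, MAX_MAPIN_PAGES * PAGESIZE);
 *	size_t want = len;
 *
 *	error = cow_mapin(as, uaddr, kaddr, cached_ppp, app, &len, 1);
 *	if (error == 0) {
 *		... use the first "len" bytes through kaddr ...
 *		if (len < want)
 *			... copyin() the rest, or retry later ...
 *	}
 *
 * When the caller is done, it must drop the page locks and undo the COW
 * bookkeeping (see the reference to kcfree() above).
 */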