/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/inline.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/cpuvar.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vm.h>

#include <sys/swap.h>
#include <sys/vtrace.h>
#include <sys/fs/snode.h>
#include <sys/copyops.h>
#include <sys/conf.h>
#include <sys/sdt.h>

#include <vm/anon.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

extern int maxphys;

void
minphys(struct buf *bp)
{
	if (bp->b_bcount > maxphys)
		bp->b_bcount = maxphys;
}
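
/*
 * Illustrative sketch (not part of this file's interfaces): a driver whose
 * hardware cannot handle maxphys-sized transfers typically supplies its own
 * mincnt routine to physio(9F), clamping b_bcount to the device limit before
 * applying the system-wide cap via minphys().  The xx-prefixed names and the
 * XX_MAXXFER limit below are hypothetical.
 *
 *	static void
 *	xxminphys(struct buf *bp)
 *	{
 *		if (bp->b_bcount > XX_MAXXFER)
 *			bp->b_bcount = XX_MAXXFER;
 *		minphys(bp);
 *	}
 */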

/*
 * Use kmem_cache_create for physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/destruction
 * per request.
 */

static struct kmem_cache *physio_buf_cache;

/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)buf);
	return (0);
}

/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	biofini((struct buf *)buf);
}

void
physio_bufs_init(void)
{
	physio_buf_cache = kmem_cache_create("physio_buf_cache",
	    sizeof (struct buf), 0, physio_buf_constructor,
	    physio_buf_destructor, NULL, NULL, NULL, 0);
}

/*
 * initiate raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */

int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
    int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
	struct iovec *iov;
	struct proc *procp;
	struct as *asp;
	ssize_t c;
	char *a;
	int error = 0;
	page_t **pplist;
	int allocbuf = 0;

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);

	if (bp == NULL) {
		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
		bp->b_iodone = NULL;
		bp->b_resid = 0;
		allocbuf = 1;
	}
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		asp = procp->p_as;
	} else {
		procp = NULL;
		asp = &kas;
	}
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 * b_dip field.
	 */
	bp->b_file = NULL;
	bp->b_offset = -1;

	if (dev != NODEV) {
		(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
		    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	} else {
		bp->b_dip = NULL;
	}

	while (uio->uio_iovcnt > 0) {
		iov = uio->uio_iov;

		bp->b_error = 0;
		bp->b_proc = procp;

		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
				break;
			if (uio->uio_loffset < 0) {
				error = EINVAL;
				break;
			}
#ifdef	_ILP32
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
				error = EINVAL;
				break;
			}
#endif	/* _ILP32 */
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_edev = dev;
			bp->b_lblkno = btodt(uio->uio_loffset);

			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 * instead.
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			(*mincnt)(bp);
			c = bp->b_bcount;

			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
			    "as_pagelock_end:");

			if (error != 0) {
				bp->b_flags |= B_ERROR;
				bp->b_error = error;
				bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
				break;
			}
			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			}

			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;

			(void) (*strat)(bp);
			error = biowait(bp);

			/*
			 * unlock the pages
			 */
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");

			c -= bp->b_resid;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_resid -= c;
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
				break;
		}
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
			break;
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	if (allocbuf) {
		kmem_cache_free(physio_buf_cache, bp);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

	return (error);
}
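
/*
 * Illustrative sketch (hypothetical xx-prefixed names): character drivers
 * normally reach this code through the DDI routine physio(9F), which hands
 * the request to default_physio() above.  A typical read(9E) entry point
 * looks roughly like:
 *
 *	static int
 *	xxread(dev_t dev, struct uio *uiop, cred_t *credp)
 *	{
 *		return (physio(xxstrategy, NULL, dev, B_READ,
 *		    xxminphys, uiop));
 *	}
 *
 * Passing a NULL buf makes default_physio() allocate one from
 * physio_buf_cache; xxstrategy is the driver's strategy(9E) routine and
 * xxminphys bounds each transfer as sketched after minphys() above.
 */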

/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
int
useracc(void *addr, size_t count, int access)
{
	uint_t prot;

	prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
	return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}

#define	MAX_MAPIN_PAGES	8

/*
 * This function temporarily "borrows" user pages for kernel use. If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on MAP_PRIVATE segments) on the user mappings, to protect the borrowed
 * pages from any changes by the user. The caller is responsible for
 * unlocking and tearing down the cow settings when it's done with the
 * pages. For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1. On entering this function, cached_ppp contains a list
 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 * previous call). Thus if the same pages remain behind
 * [uaddr..uaddr+*lenp], the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count, and changes the user mapping to read-only. This
 * scheme should work on all types of segment drivers. But to be safe,
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in. For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * Error return:
 * ENOTSUP - an operation like this is not supported either on this
 * segment type, or on this platform type.
 */
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
{
	struct hat *hat;
	struct seg *seg;
	caddr_t base;
	page_t *pp, *ppp[MAX_MAPIN_PAGES];
	long i;
	int flags;
	size_t size, total = *lenp;
	char first = 1;
	faultcode_t res;

	*lenp = 0;
	if (cow) {
		AS_LOCK_ENTER(as, RW_WRITER);
		seg = as_findseg(as, uaddr, 0);
		if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
		    (uaddr + total) > base + seg->s_size) {
			AS_LOCK_EXIT(as);
			return (EINVAL);
		}
		/*
		 * The COW scheme should work for all segment types.
		 * But to be safe, we check against segvn.
		 */
		if (seg->s_ops != &segvn_ops) {
			AS_LOCK_EXIT(as);
			return (ENOTSUP);
		} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
			AS_LOCK_EXIT(as);
			return (ENOTSUP);
		}
	}
	hat = as->a_hat;
	size = total;
tryagain:
	/*
	 * If (cow), hat_softlock will also change the user protection to RO.
	 * This is the first step toward setting up cow. Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address. Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	size = total - size;
	*lenp += size;
	size = size >> PAGESHIFT;
	i = 0;
	while (i < size) {
		pp = ppp[i];
		if (cow) {
			kmutex_t *ahm;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE. This also works for MAP_SHARED
			 * segments. The disadvantage is that it locks the
			 * page from being used by anybody else.
			 */
			ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
			mutex_enter(ahm);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref. (segvn_unmap and
			 * segvn_free need the as writer lock to do anon_free.)
			 */
			if (*app != NULL) {
#if 0
				if ((*app)->an_refcnt == 0)
					/*
					 * Consider the following scenario
					 * (unlikely though):
					 * 1. an_refcnt == 2
					 * 2. we softlock the page.
					 * 3. cow occurs on this addr. So a new
					 *    ap, page and mapping is
					 *    established on addr.
					 * 4. an_refcnt drops to 1
					 *    (segvn_faultpage ->
					 *    anon_decref(oldap))
					 * 5. the last ref to ap also drops
					 *    (from another as). It ends up
					 *    blocked inside anon_decref trying
					 *    to get the page's excl lock.
					 * 6. Later kcfree unlocks the page,
					 *    calls anon_decref -> oops, ap is
					 *    gone already.
					 *
					 * Holding the as writer lock solves
					 * all problems.
					 */
					*app = NULL;
				else
#endif
				(*app)->an_refcnt++;
			}
			mutex_exit(ahm);
		} else {
			*app = NULL;
		}
		if (kaddr != (caddr_t)-1) {
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					    HAT_LOAD_NOCONSIST;
				else
					flags = HAT_LOAD_REMAP |
					    HAT_LOAD_NOCONSIST;
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world. This
				 * is not very pretty. But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_LOAD_NOCONSIST will get us an uncached
				 * mapping on VAC. hat_softlock will flush
				 * a VAC_WRITEBACK cache. Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is to change hat_devload
				 * to get a cached mapping and allocate a
				 * kaddr with the same vcolor as uaddr; then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
				*cached_ppp = pp;
			}
			kaddr += PAGESIZE;
		}
		cached_ppp++;
		app++;
		++i;
	}
	if (cow) {
		AS_LOCK_EXIT(as);
	}
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in. We could've fallen back to copy and
		 * let it fault in the pages. But for a mapped file, we
		 * normally reference each page only once. For zero-copy to
		 * be of any use, we'd better fault the pages in now and try
		 * again.
		 */
		first = 0;
		size = size << PAGESHIFT;
		uaddr += size;
		total -= size;
		size = total;
		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		if (cow)
			AS_LOCK_ENTER(as, RW_WRITER);
		goto tryagain;
	}
	switch (res) {
	case FC_NOSUPPORT:
		return (ENOTSUP);
	case FC_PROT:	/* Pretend we don't know about it. This will be */
			/* caught by the caller when uiomove fails. */
	case FC_NOMAP:
	case FC_OBJERR:
	default:
		return (0);
	}
}
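
/*
 * Illustrative sketch of a cow_mapin() caller (hypothetical names), based on
 * the contract in the block comment above the function: map in as much as
 * possible, fall back to copyin() for whatever could not be locked and
 * mapped, and later undo the locking/COW setup (see kcfree() for the
 * tear-down side).
 *
 *	size_t mapped = want;
 *
 *	error = cow_mapin(as, uaddr, kaddr, cached_ppp, app, &mapped, 1);
 *	if (error == 0 && mapped < want)
 *		error = copyin(uaddr + mapped, kbuf + mapped, want - mapped);
 */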