/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/inline.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/cpuvar.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vm.h>

#include <sys/swap.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/snode.h>
#include <sys/copyops.h>
#include <sys/conf.h>
#include <sys/sdt.h>

#include <vm/anon.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

extern int maxphys;

void
minphys(struct buf *bp)
{
        if (bp->b_bcount > maxphys)
                bp->b_bcount = maxphys;
}

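/*
 * Illustrative sketch: a driver whose hardware cannot handle maxphys-sized
 * transfers typically supplies its own mincnt routine, clamping b_bcount to
 * the device limit and then deferring to minphys() so the system-wide limit
 * still applies.  XX_MAXXFER and xxminphys are hypothetical names used only
 * for illustration.
 *
 *      static void
 *      xxminphys(struct buf *bp)
 *      {
 *              if (bp->b_bcount > XX_MAXXFER)
 *                      bp->b_bcount = XX_MAXXFER;
 *              minphys(bp);
 *      }
 */
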
/*
 * use kmem_cache_create for physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/destruction
 * per request.
 */

static struct kmem_cache *physio_buf_cache;

/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
        bioinit((struct buf *)buf);
        return (0);
}

/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
        biofini((struct buf *)buf);
}

void
physio_bufs_init(void)
{
        physio_buf_cache = kmem_cache_create("physio_buf_cache",
            sizeof (struct buf), 0, physio_buf_constructor,
            physio_buf_destructor, NULL, NULL, NULL, 0);
}

/*
 * initiate raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */

int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
    int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
        struct iovec *iov;
        struct proc *procp;
        struct as *asp;
        ssize_t c;
        char *a;
        int error = 0;
        page_t **pplist;
        int allocbuf = 0;

        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

        /* Kernel probe */
        TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
            tnf_device, device, dev,
            tnf_offset, offset, uio->uio_loffset,
            tnf_size, size, uio->uio_resid,
            tnf_bioflags, rw, rw);

        if (rw == B_READ) {
                CPU_STATS_ADD_K(sys, phread, 1);
        } else {
                CPU_STATS_ADD_K(sys, phwrite, 1);
        }

        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
            "getbuf_start: bp %p", bp);

        if (bp == NULL) {
                bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
                bp->b_iodone = NULL;
                bp->b_resid = 0;
                allocbuf = 1;
        }
        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

        if (uio->uio_segflg == UIO_USERSPACE) {
                procp = ttoproc(curthread);
                asp = procp->p_as;
        } else {
                procp = NULL;
                asp = &kas;
        }
        ASSERT(SEMA_HELD(&bp->b_sem));

        /*
         * We need to prepare this buffer for the io:::start probe, including
         * NULL'ing out the file, clearing the offset, and filling in the
         * b_dip field.
         */
        bp->b_file = NULL;
        bp->b_offset = -1;

        if (dev != NODEV) {
                (void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
                    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
        } else {
                bp->b_dip = NULL;
        }

        while (uio->uio_iovcnt > 0) {
                iov = uio->uio_iov;

                bp->b_error = 0;
                bp->b_proc = procp;

                while (iov->iov_len > 0) {
                        if (uio->uio_resid == 0)
                                break;
                        if (uio->uio_loffset < 0) {
                                error = EINVAL;
                                break;
                        }
#ifdef _ILP32
                        /*
                         * For 32-bit kernels, check against SPEC_MAXOFFSET_T
                         * which represents the maximum size that can be
                         * supported by the IO subsystem.
                         * XXX this code assumes a D_64BIT driver.
                         */
                        if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
                                error = EINVAL;
                                break;
                        }
#endif /* _ILP32 */
                        bp->b_flags = B_BUSY | B_PHYS | rw;
                        bp->b_edev = dev;
                        bp->b_lblkno = btodt(uio->uio_loffset);

                        /*
                         * Don't count on b_addr remaining untouched by the
                         * code below (it may be reset because someone does
                         * a bp_mapin on the buffer) -- reset from the iov
                         * each time through, updating the iov's base address
                         * instead.
                         */
                        a = bp->b_un.b_addr = iov->iov_base;
                        bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
                        (*mincnt)(bp);
                        c = bp->b_bcount;

                        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
                            "as_pagelock_start: bp %p", bp);

                        error = as_pagelock(asp, &pplist, a,
                            c, rw == B_READ? S_WRITE : S_READ);

                        TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
                            "as_pagelock_end:");

                        if (error != 0) {
                                bp->b_flags |= B_ERROR;
                                bp->b_error = error;
                                bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
                                break;
                        }
                        bp->b_shadow = pplist;
                        if (pplist != NULL) {
                                bp->b_flags |= B_SHADOW;
                        }

                        DTRACE_IO1(start, struct buf *, bp);
                        bp->b_flags |= B_STARTED;

                        (void) (*strat)(bp);
                        error = biowait(bp);

                        /*
                         * unlock the pages
                         */
                        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
                            "as_pageunlock_start: bp %p", bp);

                        as_pageunlock(asp, pplist, a, c,
                            rw == B_READ? S_WRITE : S_READ);

                        TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
                            "as_pageunlock_end:");

                        c -= bp->b_resid;
                        iov->iov_base += c;
                        iov->iov_len -= c;
                        uio->uio_resid -= c;
                        uio->uio_loffset += c;
                        /* bp->b_resid - temp kludge for tape drives */
                        if (bp->b_resid || error)
                                break;
                }
                bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
                /* bp->b_resid - temp kludge for tape drives */
                if (bp->b_resid || error)
                        break;
                uio->uio_iov++;
                uio->uio_iovcnt--;
        }

        if (allocbuf) {
                kmem_cache_free(physio_buf_cache, bp);
        }

        /* Kernel probe */
        TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
            tnf_device, device, dev);

        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

        return (error);
}

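/*
 * Illustrative sketch: character drivers normally reach this code through
 * physio(9F), which is typically implemented in terms of default_physio().
 * A raw read(9E) entry point passes its strategy routine, the device, the
 * transfer direction, and a mincnt routine.  xx_read, xx_strategy and
 * xxminphys are hypothetical driver routines used only for illustration:
 *
 *      static int
 *      xx_read(dev_t dev, struct uio *uiop, cred_t *credp)
 *      {
 *              return (physio(xx_strategy, NULL, dev, B_READ,
 *                  xxminphys, uiop));
 *      }
 *
 * Passing bp == NULL lets default_physio() allocate a buf header from
 * physio_buf_cache for the duration of the request.
 */
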
/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
int
useracc(void *addr, size_t count, int access)
{
        uint_t prot;

        prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
        return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}

#define MAX_MAPIN_PAGES 8

/*
 * This function temporarily "borrows" user pages for kernel use. If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on a MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 * pages from any changes by the user. The caller is responsible for
 * unlocking and tearing down the cow settings when it's done with the
 * pages. For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1. On entering this function, cached_ppp contains a list
 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 * previous call). Thus if the same pages remain behind
 * [uaddr..uaddr+*lenp], the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count, and changes the user-mapping to read-only. This
 * scheme should work on all types of segment drivers. But to be safe,
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user-mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in. For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * Error return:
 * ENOTSUP - the operation is not supported either on this segment type,
 * or on this platform type.
 */
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
{
        struct hat *hat;
        struct seg *seg;
        caddr_t base;
        page_t *pp, *ppp[MAX_MAPIN_PAGES];
        long i;
        int flags;
        size_t size, total = *lenp;
        char first = 1;
        faultcode_t res;

        *lenp = 0;
        if (cow) {
                AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
                seg = as_findseg(as, uaddr, 0);
                if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
                    (uaddr + total) > base + seg->s_size) {
                        AS_LOCK_EXIT(as, &as->a_lock);
                        return (EINVAL);
                }
                /*
                 * The COW scheme should work for all segment types.
                 * But to be safe, we check against segvn.
                 */
                if (seg->s_ops != &segvn_ops) {
                        AS_LOCK_EXIT(as, &as->a_lock);
                        return (ENOTSUP);
                } else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
                        AS_LOCK_EXIT(as, &as->a_lock);
                        return (ENOTSUP);
                }
        }
        hat = as->a_hat;
        size = total;
tryagain:
        /*
         * If (cow), hat_softlock will also change the usr protection to RO.
         * This is the first step toward setting up cow. Before we
         * bump up an_refcnt, we can't allow any cow-fault on this
         * address. Otherwise segvn_fault will change the protection back
         * to RW upon seeing an_refcnt == 1.
         * The solution is to hold the writer lock on "as".
         */
        res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
        size = total - size;
        *lenp += size;
        size = size >> PAGESHIFT;
        i = 0;
        while (i < size) {
                pp = ppp[i];
                if (cow) {
                        kmutex_t *ahm;
                        /*
                         * Another solution is to hold SE_EXCL on pp, and
                         * disable PROT_WRITE. This also works for a
                         * MAP_SHARED segment. The disadvantage is that it
                         * locks the page from being used by anybody else.
                         */
                        ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
                        mutex_enter(ahm);
                        *app = swap_anon(pp->p_vnode, pp->p_offset);
                        /*
                         * Since we are holding the as lock, this avoids a
                         * potential race with anon_decref. (segvn_unmap and
                         * segvn_free need the as writer lock to do
                         * anon_free.)
                         */
                        if (*app != NULL) {
#if 0
                                if ((*app)->an_refcnt == 0)
                                        /*
                                         * Consider the following scenario
                                         * (unlikely though):
                                         * 1. an_refcnt == 2
                                         * 2. we softlock the page.
                                         * 3. cow occurs on this addr. So a
                                         *    new ap, page and mapping is
                                         *    established on addr.
                                         * 4. an_refcnt drops to 1
                                         *    (segvn_faultpage ->
                                         *    anon_decref(oldap))
                                         * 5. the last ref to ap also drops
                                         *    (from another as). It ends up
                                         *    blocked inside anon_decref
                                         *    trying to get the page's excl
                                         *    lock.
                                         * 6. Later kcfree unlocks the page,
                                         *    calls anon_decref -> oops, ap
                                         *    is gone already.
                                         *
                                         * Holding the as writer lock solves
                                         * all problems.
                                         */
                                        *app = NULL;
                                else
#endif
                                        (*app)->an_refcnt++;
                        }
                        mutex_exit(ahm);
                } else {
                        *app = NULL;
                }
                if (kaddr != (caddr_t)-1) {
                        if (pp != *cached_ppp) {
                                if (*cached_ppp == NULL)
                                        flags = HAT_LOAD_LOCK | HAT_NOSYNC |
                                            HAT_LOAD_NOCONSIST;
                                else
                                        flags = HAT_LOAD_REMAP |
                                            HAT_LOAD_NOCONSIST;
                                /*
                                 * In order to cache the kernel mapping after
                                 * the user page is unlocked, we call
                                 * hat_devload instead of hat_memload so
                                 * that the kernel mapping we set up here is
                                 * "invisible" to the rest of the world. This
                                 * is not very pretty. But as long as the
                                 * caller bears the responsibility of keeping
                                 * cache consistency, we should be ok -
                                 * HAT_NOCONSIST will get us an uncached
                                 * mapping on VAC. hat_softlock will flush
                                 * a VAC_WRITEBACK cache. Therefore the kaddr
                                 * doesn't have to be of the same vcolor as
                                 * uaddr.
                                 * The alternative is - change hat_devload
                                 * to get a cached mapping. Allocate a kaddr
                                 * with the same vcolor as uaddr. Then
                                 * hat_softlock won't need to flush the VAC.
                                 */
                                hat_devload(kas.a_hat, kaddr, PAGESIZE,
                                    page_pptonum(pp), PROT_READ, flags);
                                *cached_ppp = pp;
                        }
                        kaddr += PAGESIZE;
                }
                cached_ppp++;
                app++;
                ++i;
        }
        if (cow) {
                AS_LOCK_EXIT(as, &as->a_lock);
        }
        if (first && res == FC_NOMAP) {
                /*
                 * If the address is not mapped yet, we call as_fault to
                 * fault the pages in. We could've fallen back to copying
                 * and let it fault in the pages. But for a mapped file, we
                 * normally reference each page only once. For zero-copy to
                 * be of any use, we'd better fault in the page now and try
                 * again.
                 */
                first = 0;
                size = size << PAGESHIFT;
                uaddr += size;
                total -= size;
                size = total;
                res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
                if (cow)
                        AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
                goto tryagain;
        }
        switch (res) {
        case FC_NOSUPPORT:
                return (ENOTSUP);
        case FC_PROT:   /* Pretend we don't know about it. This will be */
                        /* caught by the caller when uiomove fails. */
        case FC_NOMAP:
        case FC_OBJERR:
        default:
                return (0);
        }
}