/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/inline.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/cpuvar.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vm.h>

#include <sys/swap.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/snode.h>
#include <sys/copyops.h>
#include <sys/conf.h>
#include <sys/sdt.h>

#include <vm/anon.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

extern int maxphys;

void
minphys(struct buf *bp)
{
	if (bp->b_bcount > maxphys)
		bp->b_bcount = maxphys;
}
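
/*
 * Usage sketch (illustrative only, not part of this file): a character
 * driver built on physio(9F) typically passes its strategy(9E) routine,
 * together with either minphys() above or its own mincnt routine, from
 * its read(9E)/write(9E) entry points.  The xx* names and XX_MAXXFER
 * below are hypothetical:
 *
 *	static void
 *	xxminphys(struct buf *bp)
 *	{
 *		if (bp->b_bcount > XX_MAXXFER)
 *			bp->b_bcount = XX_MAXXFER;
 *		minphys(bp);
 *	}
 *
 *	static int
 *	xxread(dev_t dev, struct uio *uiop, cred_t *credp)
 *	{
 *		return (physio(xxstrategy, NULL, dev, B_READ,
 *		    xxminphys, uiop));
 *	}
 */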

/*
 * Use kmem_cache_create for physio buffers.  This has shown
 * a better cache distribution compared to buffers on the
 * stack.  It also avoids semaphore construction/destruction
 * per request.
 */

static struct kmem_cache *physio_buf_cache;

/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)buf);
	return (0);
}

/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	biofini((struct buf *)buf);
}

void
physio_bufs_init(void)
{
	physio_buf_cache = kmem_cache_create("physio_buf_cache",
	    sizeof (struct buf), 0,
	    physio_buf_constructor, physio_buf_destructor,
	    NULL, NULL, NULL, 0);
}



/*
 * initiate raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */

int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
	int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
	struct iovec *iov;
	struct proc *procp;
	struct as *asp;
	ssize_t c;
	char *a;
	int error = 0;
	page_t **pplist;
	int allocbuf = 0;

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

	/* Kernel probe */
	TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
		tnf_device,	device,		dev,
		tnf_offset,	offset,		uio->uio_loffset,
		tnf_size,	size,		uio->uio_resid,
		tnf_bioflags,	rw,		rw);

	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);

	if (bp == NULL) {
		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
		bp->b_iodone = NULL;
		bp->b_resid = 0;
		allocbuf = 1;
	}
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		asp = procp->p_as;
	} else {
		procp = NULL;
		asp = &kas;
	}
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 * b_dip field.
	 */
	bp->b_file = NULL;
	bp->b_offset = -1;

	if (dev != NODEV) {
		(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
		    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	} else {
		bp->b_dip = NULL;
	}

	while (uio->uio_iovcnt > 0) {
		iov = uio->uio_iov;

		bp->b_error = 0;
		bp->b_proc = procp;

		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
				break;
			if (uio->uio_loffset < 0) {
				error = EINVAL;
				break;
			}
#ifdef	_ILP32
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
				error = EINVAL;
				break;
			}
#endif	/* _ILP32 */
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_edev = dev;
			bp->b_lblkno = btodt(uio->uio_loffset);

			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 * instead.
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			(*mincnt)(bp);
			c = bp->b_bcount;

			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
			    "as_pagelock_end:");

			if (error != 0) {
				bp->b_flags |= B_ERROR;
				bp->b_error = error;
				bp->b_flags &=
				    ~(B_BUSY|B_WANTED|B_PHYS);
				break;
			}
			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			}

			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;

			(void) (*strat)(bp);
			error = biowait(bp);

			/*
			 * unlock the pages
			 */
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");

			c -= bp->b_resid;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_resid -= c;
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
				break;
		}
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
			break;
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	if (allocbuf) {
		kmem_cache_free(physio_buf_cache, bp);
	}

	/* Kernel probe */
	TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
		tnf_device,	device,		dev);

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

	return (error);
}

/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
int
useracc(void *addr, size_t count, int access)
{
	uint_t prot;

	prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
	return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}
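
/*
 * Usage sketch (illustrative only): a driver that touches a user buffer
 * directly, rather than going through physio(9F), could verify the
 * caller's access first, e.g. check that the range is user-writable
 * before filling it in.  Per the "Returns 0 on success" note above, a
 * nonzero return means the buffer is not accessible in that mode:
 *
 *	if (useracc(uaddr, len, B_WRITE) != 0)
 *		return (EFAULT);
 */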

#define	MAX_MAPIN_PAGES	8

/*
 * This function temporarily "borrows" user pages for kernel use. If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on MAP_PRIVATE segments) on the user mappings, to protect the borrowed
 * pages from any changes by the user. The caller is responsible for
 * unlocking and tearing down cow settings when it's done with the pages.
 * For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1. On entering this function, cached_ppp contains a list
 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 * previous call). Thus if the same pages remain behind
 * [uaddr..uaddr+*lenp], the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count, and changes the user mapping to read-only. This
 * scheme should work on all types of segment drivers. But to be safe,
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in. For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * Error return:
 * ENOTSUP - an operation like this is not supported either on this
 * segment type, or on this platform type.
 */
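/*
 * Usage sketch (illustrative only, not an actual caller): to borrow a
 * single page of a user buffer under the contract described above, a
 * consumer could do roughly the following.  Error handling and the
 * eventual unlock/teardown of the cow state (see kcfree()) are omitted,
 * and uaddr/kaddr are assumed to be page-aligned addresses supplied by
 * the caller:
 *
 *	struct page *cpp = NULL;
 *	struct anon *ap = NULL;
 *	size_t len = PAGESIZE;
 *	int error;
 *
 *	error = cow_mapin(curproc->p_as, uaddr, kaddr, &cpp, &ap, &len, 1);
 *	if (error != 0 || len < PAGESIZE) {
 *		... fall back to copyin() for the remainder ...
 *	}
 */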
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr,
	struct page **cached_ppp, struct anon **app, size_t *lenp, int cow)
{
	struct hat *hat;
	struct seg *seg;
	caddr_t base;
	page_t *pp, *ppp[MAX_MAPIN_PAGES];
	long i;
	int flags;
	size_t size, total = *lenp;
	char first = 1;
	faultcode_t res;

	*lenp = 0;
	if (cow) {
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		seg = as_findseg(as, uaddr, 0);
		if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
		    (uaddr + total) > base + seg->s_size) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (EINVAL);
		}
		/*
		 * The COW scheme should work for all segment types.
		 * But to be safe, we check against segvn.
		 */
		if (seg->s_ops != &segvn_ops) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		}
	}
	hat = as->a_hat;
	size = total;
tryagain:
	/*
	 * If (cow), hat_softlock will also change the user protection to RO.
	 * This is the first step toward setting up cow. Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address. Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	size = total - size;
	*lenp += size;
	size = size >> PAGESHIFT;
	i = 0;
	while (i < size) {
		pp = ppp[i];
		if (cow) {
			kmutex_t *ahm;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE. This also works for MAP_SHARED
			 * segments. The disadvantage is that it locks the
			 * page from being used by anybody else.
			 */
			ahm = &anonhash_lock[
			    AH_LOCK(pp->p_vnode, pp->p_offset)];
			mutex_enter(ahm);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref. (segvn_unmap and
			 * segvn_free need the as writer lock to do anon_free.)
			 */
			if (*app != NULL) {
#if 0
				if ((*app)->an_refcnt == 0)
					/*
					 * Consider the following scenario
					 * (unlikely though):
					 * 1. an_refcnt == 2
					 * 2. we softlock the page.
					 * 3. cow occurs on this addr. So a new
					 * ap, page and mapping is established
					 * on addr.
					 * 4. an_refcnt drops to 1
					 * (segvn_faultpage ->
					 * anon_decref(oldap))
					 * 5. the last ref to ap also drops
					 * (from another as). It ends up blocked
					 * inside anon_decref trying to get
					 * page's excl lock.
					 * 6. Later kcfree unlocks the page,
					 * calls anon_decref -> oops, ap is
					 * gone already.
					 *
					 * Holding as writer lock solves all
					 * problems.
					 */
					*app = NULL;
				else
#endif
					(*app)->an_refcnt++;
			}
			mutex_exit(ahm);
		} else {
			*app = NULL;
		}
		if (kaddr != (caddr_t)-1) {
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					    HAT_LOAD_NOCONSIST;
				else
					flags = HAT_LOAD_REMAP |
					    HAT_LOAD_NOCONSIST;
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world. This
				 * is not very pretty. But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_NOCONSIST will get us an uncached
				 * mapping on VAC. hat_softlock will flush
				 * a VAC_WRITEBACK cache. Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is - change hat_devload
				 * to get a cached mapping. Allocate a kaddr
				 * with the same vcolor as uaddr. Then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
				*cached_ppp = pp;
			}
			kaddr += PAGESIZE;
		}
		cached_ppp++;
		app++;
		++i;
	}
	if (cow) {
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in. We could've fallen back to copy and
		 * let it fault in the pages. But for a mapped file, we
		 * normally reference each page only once. For zero-copy to
		 * be of any use, we'd better fault in the page now and try
		 * again.
		 */
		first = 0;
		size = size << PAGESHIFT;
		uaddr += size;
		total -= size;
		size = total;
		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		if (cow)
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		goto tryagain;
	}
	switch (res) {
	case FC_NOSUPPORT:
		return (ENOTSUP);
	case FC_PROT:	/* Pretend we don't know about it. This will be */
			/* caught by the caller when uiomove fails. */
	case FC_NOMAP:
	case FC_OBJERR:
	default:
		return (0);
	}
}