/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/inline.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/cpuvar.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vm.h>

#include <sys/swap.h>
#include <sys/vtrace.h>
#include <sys/fs/snode.h>
#include <sys/copyops.h>
#include <sys/conf.h>
#include <sys/sdt.h>

#include <vm/anon.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

extern int maxphys;

void
minphys(struct buf *bp)
{
	if (bp->b_bcount > maxphys)
		bp->b_bcount = maxphys;
}
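
/*
 * Drivers normally reach this code through physio(9F): a character
 * driver's read(9E)/write(9E) entry point passes its strategy routine
 * and a mincnt routine (minphys() by default) so that each transfer is
 * clamped to maxphys before the user pages are locked.  A minimal,
 * illustrative sketch -- xxread and xxstrategy are hypothetical driver
 * routines, not part of this file:
 *
 *	static int
 *	xxread(dev_t dev, struct uio *uiop, cred_t *credp)
 *	{
 *		return (physio(xxstrategy, NULL, dev, B_READ, minphys, uiop));
 *	}
 *
 * Passing a NULL buf lets a buf header be allocated from
 * physio_buf_cache (see default_physio() below).
 */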

/*
 * Use kmem_cache_create() for physio buffers.  This has shown better
 * cache distribution than buffers allocated on the stack, and it
 * avoids semaphore construction/destruction on every request.
 */

static struct kmem_cache *physio_buf_cache;

/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)buf);
	return (0);
}

/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	biofini((struct buf *)buf);
}

void
physio_bufs_init(void)
{
	physio_buf_cache = kmem_cache_create("physio_buf_cache",
	    sizeof (struct buf), 0, physio_buf_constructor,
	    physio_buf_destructor, NULL, NULL, NULL, 0);
}

/*
 * initiate raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */

int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
    int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
	struct iovec *iov;
	struct proc *procp;
	struct as *asp;
	ssize_t c;
	char *a;
	int error = 0;
	page_t **pplist;
	int allocbuf = 0;

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);

	if (bp == NULL) {
		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
		bp->b_iodone = NULL;
		bp->b_resid = 0;
		allocbuf = 1;
	}
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		asp = procp->p_as;
	} else {
		procp = NULL;
		asp = &kas;
	}
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 * b_dip field.
	 */
	bp->b_file = NULL;
	bp->b_offset = -1;

	if (dev != NODEV) {
		(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
		    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	} else {
		bp->b_dip = NULL;
	}

	while (uio->uio_iovcnt > 0) {
		iov = uio->uio_iov;

		bp->b_error = 0;
		bp->b_proc = procp;

		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
				break;
			if (uio->uio_loffset < 0) {
				error = EINVAL;
				break;
			}
#ifdef _ILP32
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
				error = EINVAL;
				break;
			}
#endif /* _ILP32 */
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_edev = dev;
			bp->b_lblkno = btodt(uio->uio_loffset);

			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 * instead.
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			(*mincnt)(bp);
			c = bp->b_bcount;

			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
			    "as_pagelock_end:");

			if (error != 0) {
				bp->b_flags |= B_ERROR;
				bp->b_error = error;
				bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
				break;
			}
			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			}

			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;

			(void) (*strat)(bp);
			error = biowait(bp);

			/*
			 * unlock the pages
			 */
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");

			c -= bp->b_resid;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_resid -= c;
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
				break;
		}
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
			break;
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	if (allocbuf) {
		kmem_cache_free(physio_buf_cache, bp);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

	return (error);
}

/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
int
useracc(void *addr, size_t count, int access)
{
	uint_t prot;

	prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
	return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}

#define	MAX_MAPIN_PAGES	8

/*
 * This function temporarily "borrows" user pages for kernel use.  If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on a MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 * pages from any changes by the user.  The caller is responsible for
 * unlocking and tearing down the cow settings when it's done with the
 * pages.  For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1.  On entering this function, cached_ppp contains a list
 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 * previous call).  Thus if the same pages remain behind
 * [uaddr..uaddr+*lenp], the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, this function also
 * bumps the anon reference count and changes the user mapping to
 * read-only.  This scheme should work on all types of segment drivers,
 * but to be safe, we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in.  For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * Error return:
 * ENOTSUP - operation like this is not supported on this segment
 * type, or on this platform type.
 */
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
{
	struct hat *hat;
	struct seg *seg;
	caddr_t base;
	page_t *pp, *ppp[MAX_MAPIN_PAGES];
	long i;
	int flags;
	size_t size, total = *lenp;
	char first = 1;
	faultcode_t res;

	*lenp = 0;
	if (cow) {
		AS_LOCK_ENTER(as, RW_WRITER);
		seg = as_findseg(as, uaddr, 0);
		if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
		    (uaddr + total) > base + seg->s_size) {
			AS_LOCK_EXIT(as);
			return (EINVAL);
		}
		/*
		 * The COW scheme should work for all segment types.
		 * But to be safe, we check against segvn.
		 */
		if (seg->s_ops != &segvn_ops) {
			AS_LOCK_EXIT(as);
			return (ENOTSUP);
		} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
			AS_LOCK_EXIT(as);
			return (ENOTSUP);
		}
	}
	hat = as->a_hat;
	size = total;
tryagain:
	/*
	 * If (cow), hat_softlock will also change the usr protection to RO.
	 * This is the first step toward setting up cow.  Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address.  Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
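	/*
	 * hat_softlock() leaves in "size" the number of bytes it could not
	 * lock; the subtraction below therefore recovers the byte count that
	 * was actually locked (accumulated into *lenp), which in turn drives
	 * the per-page loop that follows.
	 */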
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	size = total - size;
	*lenp += size;
	size = size >> PAGESHIFT;
	i = 0;
	while (i < size) {
		pp = ppp[i];
		if (cow) {
			kmutex_t *ahm;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE.  This also works for a
			 * MAP_SHARED segment.  The disadvantage is that it
			 * locks the page from being used by anybody else.
			 */
			ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
			mutex_enter(ahm);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref.  (segvn_unmap and
			 * segvn_free need the as writer lock to do anon_free.)
			 */
			if (*app != NULL) {
#if 0
				if ((*app)->an_refcnt == 0)
					/*
					 * Consider the following scenario
					 * (unlikely though):
					 * 1. an_refcnt == 2
					 * 2. we softlock the page.
					 * 3. cow occurs on this addr.  So a
					 * new ap, page and mapping is
					 * established on addr.
					 * 4. an_refcnt drops to 1
					 * (segvn_faultpage -> anon_decref(oldap))
					 * 5. the last ref to ap also drops
					 * (from another as).  It ends up
					 * blocked inside anon_decref trying to
					 * get the page's excl lock.
					 * 6. Later kcfree unlocks the page,
					 * calls anon_decref -> oops, ap is
					 * gone already.
					 *
					 * Holding the as writer lock solves
					 * all problems.
					 */
					*app = NULL;
				else
#endif
					(*app)->an_refcnt++;
			}
			mutex_exit(ahm);
		} else {
			*app = NULL;
		}
		if (kaddr != (caddr_t)-1) {
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					    HAT_LOAD_NOCONSIST;
				else
					flags = HAT_LOAD_REMAP |
					    HAT_LOAD_NOCONSIST;
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world.  This
				 * is not very pretty.  But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_NOCONSIST will get us an uncached
				 * mapping on VAC.  hat_softlock will flush
				 * a VAC_WRITEBACK cache.  Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is - change hat_devload
				 * to get a cached mapping.  Allocate a kaddr
				 * with the same vcolor as uaddr.  Then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
				*cached_ppp = pp;
			}
			kaddr += PAGESIZE;
		}
		cached_ppp++;
		app++;
		++i;
	}
	if (cow) {
		AS_LOCK_EXIT(as);
	}
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in.  We could've fallen back to copy and
		 * let it fault in the pages.  But for a mapped file, we
		 * normally reference each page only once.  For zero-copy to
		 * be of any use, we'd better fault in the page now and try
		 * again.
		 */
		first = 0;
		size = size << PAGESHIFT;
		uaddr += size;
		total -= size;
		size = total;
		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		if (cow)
			AS_LOCK_ENTER(as, RW_WRITER);
		goto tryagain;
	}
	switch (res) {
	case FC_NOSUPPORT:
		return (ENOTSUP);
	case FC_PROT:	/* Pretend we don't know about it.  This will be */
			/* caught by the caller when uiomove fails. */
	case FC_NOMAP:
	case FC_OBJERR:
	default:
		return (0);
	}
}