xref: /illumos-gate/usr/src/uts/common/os/vm_subr.c (revision 5f10ef697f250374b7b917e10961c4e02d4e3112)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
26 /*	  All Rights Reserved  	*/
27 
28 /*
29  * University Copyright- Copyright (c) 1982, 1986, 1988
30  * The Regents of the University of California
31  * All Rights Reserved
32  *
33  * University Acknowledgment- Portions of this document are derived from
34  * software developed by the University of California, Berkeley, and its
35  * contributors.
36  */
37 
38 #include <sys/types.h>
39 #include <sys/t_lock.h>
40 #include <sys/param.h>
41 #include <sys/errno.h>
42 #include <sys/debug.h>
43 #include <sys/cmn_err.h>
44 #include <sys/kmem.h>
45 #include <sys/sysmacros.h>
46 #include <sys/inline.h>
47 #include <sys/buf.h>
48 #include <sys/uio.h>
49 #include <sys/user.h>
50 #include <sys/proc.h>
51 #include <sys/systm.h>
52 #include <sys/vmsystm.h>
53 #include <sys/cpuvar.h>
54 #include <sys/mman.h>
55 #include <sys/cred.h>
56 #include <sys/vnode.h>
57 #include <sys/file.h>
58 #include <sys/vm.h>
59 
60 #include <sys/swap.h>
61 #include <sys/vtrace.h>
62 #include <sys/tnf_probe.h>
63 #include <sys/fs/snode.h>
64 #include <sys/copyops.h>
65 #include <sys/conf.h>
66 #include <sys/sdt.h>
67 
68 #include <vm/anon.h>
69 #include <vm/hat.h>
70 #include <vm/as.h>
71 #include <vm/seg.h>
72 #include <vm/page.h>
73 #include <vm/seg_vn.h>
74 #include <vm/seg_kmem.h>
75 
76 extern int maxphys;
77 
78 void
79 minphys(struct buf *bp)
80 {
81 	if (bp->b_bcount > maxphys)
82 		bp->b_bcount = maxphys;
83 }
84 
85 /*
 * Object cache of struct buf headers for physio.
 *
 * The cache is created by physio_bufs_init() with kmem_cache_create()
 * and is used by default_physio() whenever the caller does not supply
 * its own buf.  Each cached object is set up once with bioinit() by the
 * cache constructor and torn down with biofini() by the destructor.
 *
 * Using a kmem cache has shown a better cache distribution compared to
 * buffers on the stack.  It also avoids semaphore construction and
 * destruction on every request.
 */
91 
92 static struct kmem_cache *physio_buf_cache;
93 
/*
 * kmem cache constructor for physio_buf_cache.  Initializes the new
 * object as a struct buf via bioinit().  The callback argument and the
 * kmem flags are not used.  Always reports success (0).
 */
/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct buf *bp = buf;

	bioinit(bp);

	return (0);
}
101 
/*
 * kmem cache destructor for physio_buf_cache.  Undoes the work of the
 * constructor by handing the struct buf to biofini().  The callback
 * argument is not used.
 */
/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	struct buf *bp = buf;

	biofini(bp);
}
108 
109 void
110 physio_bufs_init(void)
111 {
112 	physio_buf_cache = kmem_cache_create("physio_buf_cache",
113 	    sizeof (struct buf), 0, physio_buf_constructor,
114 	    physio_buf_destructor, NULL, NULL, NULL, 0);
115 }
116 
117 
118 
119 /*
 * default_physio: initiate a raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 *
 * Arguments:
 *	strat	strategy routine called once for every chunk of the request.
 *	bp	buf header to use.  If NULL, one is taken from
 *		physio_buf_cache and returned to it before this function
 *		returns.  In either case b_sem is asserted to be held.
 *	dev	device number; stored in b_edev and, unless it is NODEV,
 *		used to look up b_dip through the driver's devo_getinfo.
 *	rw	B_READ for a read, anything else is counted as a write;
 *		the value is OR'ed into b_flags.
 *	mincnt	routine that may reduce b_bcount for each chunk.
 *	uio	describes the request.  For every completed chunk the
 *		current iovec's base/length and uio_resid/uio_loffset are
 *		advanced by the number of bytes transferred; uio_iov and
 *		uio_iovcnt are advanced for every fully processed iovec.
 *
 * Returns 0 on success.  On failure returns EINVAL for an unsupported
 * offset, or the error reported by as_pagelock() or biowait().
 * Processing stops at the first error or at the first chunk that
 * completes with a non-zero b_resid.
 */
129 
130 int
131 default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
132 	int rw, void (*mincnt)(struct buf *), struct uio *uio)
133 {
134 	struct iovec *iov;
135 	struct proc *procp;
136 	struct as *asp;
137 	ssize_t c;
138 	char *a;
139 	int error = 0;
140 	page_t **pplist;
	/* set when bp was allocated here and therefore must be freed here */
141 	int allocbuf = 0;
142 
143 	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);
144 
145 	/* Kernel probe */
146 	TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
147 	    tnf_device,		device,		dev,
148 	    tnf_offset,		offset,		uio->uio_loffset,
149 	    tnf_size,		size,		uio->uio_resid,
150 	    tnf_bioflags,	rw,		rw);
151 
	/* Account the request in the per-CPU raw read/write statistics. */
152 	if (rw == B_READ) {
153 		CPU_STATS_ADD_K(sys, phread, 1);
154 	} else {
155 		CPU_STATS_ADD_K(sys, phwrite, 1);
156 	}
157 
158 	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
159 	    "getbuf_start: bp %p", bp);
160 
	/*
	 * No buf supplied by the caller: take one from the cache.  KM_SLEEP
	 * is used, so the result is not checked for NULL.
	 */
161 	if (bp == NULL) {
162 		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
163 		bp->b_iodone = NULL;
164 		bp->b_resid = 0;
165 		allocbuf = 1;
166 	}
167 	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);
168 
	/*
	 * Pick the address space whose pages will be locked: the current
	 * process for user-space buffers, the kernel's (kas) otherwise.
	 */
169 	if (uio->uio_segflg == UIO_USERSPACE) {
170 		procp = ttoproc(curthread);
171 		asp = procp->p_as;
172 	} else {
173 		procp = NULL;
174 		asp = &kas;
175 	}
176 	ASSERT(SEMA_HELD(&bp->b_sem));
177 
178 	/*
179 	 * We need to prepare this buffer for the io:::start probe, including
180 	 * NULL'ing out the file, clearing the offset, and filling in the
181 	 * b_dip field.
182 	 */
183 	bp->b_file = NULL;
184 	bp->b_offset = -1;
185 
	/*
	 * NOTE(review): the result of devo_getinfo is deliberately ignored;
	 * what b_dip holds if the lookup fails is not visible from here.
	 */
186 	if (dev != NODEV) {
187 		(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
188 		    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
189 	} else {
190 		bp->b_dip = NULL;
191 	}
192 
	/* Outer loop: one iteration per iovec of the request. */
193 	while (uio->uio_iovcnt > 0) {
194 		iov = uio->uio_iov;
195 
196 		bp->b_error = 0;
197 		bp->b_proc = procp;
198 
		/*
		 * Inner loop: transfer the current iovec in chunks no larger
		 * than what mincnt allows.
		 */
199 		while (iov->iov_len > 0) {
200 			if (uio->uio_resid == 0)
201 				break;
202 			if (uio->uio_loffset < 0) {
203 				error = EINVAL;
204 				break;
205 			}
206 #ifdef	_ILP32
207 			/*
208 			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
209 			 * which represents the maximum size that can be
210 			 * supported by the IO subsystem.
211 			 * XXX this code assumes a D_64BIT driver.
212 			 */
213 			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
214 				error = EINVAL;
215 				break;
216 			}
217 #endif	/* _ILP32 */
			/* b_flags is rebuilt from scratch for every chunk. */
218 			bp->b_flags = B_BUSY | B_PHYS | rw;
219 			bp->b_edev = dev;
220 			bp->b_lblkno = btodt(uio->uio_loffset);
221 
222 			/*
223 			 * Don't count on b_addr remaining untouched by the
224 			 * code below (it may be reset because someone does
225 			 * a bp_mapin on the buffer) -- reset from the iov
226 			 * each time through, updating the iov's base address
227 			 * instead.
228 			 */
229 			a = bp->b_un.b_addr = iov->iov_base;
230 			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			/* Let the caller-supplied routine shrink the chunk. */
231 			(*mincnt)(bp);
232 			c = bp->b_bcount;
233 
234 			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
235 			    "as_pagelock_start: bp %p", bp);
236 
			/*
			 * Lock the pages behind [a, a + c).  B_READ requests
			 * lock with S_WRITE and everything else with S_READ
			 * (presumably because a device read stores into this
			 * memory while a device write only reads it).
			 */
237 			error = as_pagelock(asp, &pplist, a,
238 			    c, rw == B_READ? S_WRITE : S_READ);
239 
240 			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
241 			    "as_pagelock_end:");
242 
			/*
			 * Locking failed: record the error in the buf, drop
			 * the busy/phys flags and give up on this iovec.  The
			 * strategy routine is not called for this chunk.
			 */
243 			if (error != 0) {
244 				bp->b_flags |= B_ERROR;
245 				bp->b_error = error;
246 				bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
247 				break;
248 			}
			/*
			 * Hand the page list (if as_pagelock produced one)
			 * to the driver through b_shadow.
			 */
249 			bp->b_shadow = pplist;
250 			if (pplist != NULL) {
251 				bp->b_flags |= B_SHADOW;
252 			}
253 
254 			DTRACE_IO1(start, struct buf *, bp);
255 			bp->b_flags |= B_STARTED;
256 
			/*
			 * Submit the chunk and wait for it to complete.  The
			 * strategy routine's own return value is ignored; the
			 * outcome is taken from biowait().
			 */
257 			(void) (*strat)(bp);
258 			error = biowait(bp);
259 
260 			/*
261 			 * unlock the pages
262 			 */
263 			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
264 			    "as_pageunlock_start: bp %p", bp);
265 
			/* Same range and access type as the as_pagelock above. */
266 			as_pageunlock(asp, pplist, a, c,
267 			    rw == B_READ? S_WRITE : S_READ);
268 
269 			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
270 			    "as_pageunlock_end:");
271 
			/*
			 * c becomes the number of bytes actually moved (the
			 * requested count minus the residual); advance the
			 * iovec and the uio by that amount.
			 */
272 			c -= bp->b_resid;
273 			iov->iov_base += c;
274 			iov->iov_len -= c;
275 			uio->uio_resid -= c;
276 			uio->uio_loffset += c;
277 			/* bp->b_resid - temp kludge for tape drives */
278 			if (bp->b_resid || error)
279 				break;
280 		}
281 		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
282 		/* bp->b_resid - temp kludge for tape drives */
283 		if (bp->b_resid || error)
284 			break;
		/* This iovec is done; move on to the next one. */
285 		uio->uio_iov++;
286 		uio->uio_iovcnt--;
287 	}
288 
	/* Only a buf that was allocated above is returned to the cache. */
289 	if (allocbuf) {
290 		kmem_cache_free(physio_buf_cache, bp);
291 	}
292 
293 	/* Kernel probe */
294 	TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
295 		tnf_device,	device,		dev);
296 
	/*
	 * NOTE(review): bp may already have been freed above; it is passed
	 * here only as a pointer value for the trace record.
	 */
297 	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);
298 
299 	return (error);
300 }
301 
302 /*
303  * Returns 0 on success, or an error on failure.
304  *
305  * This function is no longer a part of the DDI/DKI.
306  * However, for compatibility, its interface should not
307  * be changed and it should not be removed from the kernel.
308  */
309 int
310 useracc(void *addr, size_t count, int access)
311 {
312 	uint_t prot;
313 
314 	prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
315 	return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
316 }
317 
/*
 * Size of the on-stack page pointer array that cow_mapin() hands to
 * hat_softlock().
 */
318 #define	MAX_MAPIN_PAGES	8
319 
320 /*
 * This function temporarily "borrows" user pages for kernel use. If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on a MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 * pages from any changes by the user. The caller is responsible for
 * unlocking and tearing down cow settings when it's done with the pages.
 * For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1. On entering this function, cached_ppp contains a list
 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 * previous call). Thus if the same pages remain behind
 * [uaddr..uaddr+*lenp], the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count, and changes the user-mapping to read-only. This
 * scheme should work on all types of segment drivers. But to be safe,
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user-mappings allow "user-read".
 *
 * Arguments:
 *	as		address space that uaddr belongs to.
 *	uaddr		start of the user range to borrow.
 *	kaddr		kernel address to map the pages at, or (caddr_t)-1
 *			to skip the kernel mapping.
 *	cached_ppp	in/out array, one entry per page: the page currently
 *			mapped at the corresponding kaddr slot. Updated when
 *			a slot is (re)loaded.
 *	app		out array, one entry per page: the anon slot whose
 *			reference count was bumped, or NULL.
 *	lenp		in: number of bytes requested; out: number of bytes
 *			successfully locked.
 *	cow		non-zero to set up copy-on-write protection.
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in. For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * Return value:
 * 0       - otherwise; "lenp" tells how much was actually locked.
 * EINVAL  - cow was requested and no single segment covers the whole
 *           range [uaddr..uaddr+*lenp].
 * ENOTSUP - operation like this is not supported either on this segment
 *           type, or on this platform type.
 */
351 int
352 cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
353     struct anon **app, size_t *lenp, int cow)
354 {
355 	struct		hat *hat;
356 	struct seg	*seg;
357 	caddr_t		base;
358 	page_t		*pp, *ppp[MAX_MAPIN_PAGES];
359 	long		i;
360 	int		flags;
361 	size_t		size, total = *lenp;
	/* cleared once the single as_fault retry has been used up */
362 	char		first = 1;
363 	faultcode_t	res;
364 
	/* Nothing locked yet; *lenp is accumulated below. */
365 	*lenp = 0;
366 	if (cow) {
		/*
		 * Validate the range while holding the address space
		 * writer lock.  On success the lock stays held until after
		 * the page loop below; every failure path drops it first.
		 */
367 		AS_LOCK_ENTER(as, RW_WRITER);
368 		seg = as_findseg(as, uaddr, 0);
369 		if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
370 		    (uaddr + total) > base + seg->s_size) {
371 			AS_LOCK_EXIT(as);
372 			return (EINVAL);
373 		}
374 		/*
375 		 * The COW scheme should work for all segment types.
376 		 * But to be safe, we check against segvn.
377 		 */
378 		if (seg->s_ops != &segvn_ops) {
379 			AS_LOCK_EXIT(as);
380 			return (ENOTSUP);
381 		} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
382 			AS_LOCK_EXIT(as);
383 			return (ENOTSUP);
384 		}
385 	}
386 	hat = as->a_hat;
387 	size = total;
388 tryagain:
389 	/*
390 	 * If (cow), hat_softlock will also change the usr protection to RO.
391 	 * This is the first step toward setting up cow. Before we
392 	 * bump up an_refcnt, we can't allow any cow-fault on this
393 	 * address. Otherwise segvn_fault will change the protection back
394 	 * to RW upon seeing an_refcnt == 1.
395 	 * The solution is to hold the writer lock on "as".
396 	 */
397 	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	/*
	 * hat_softlock updates size; total - size is treated as the number
	 * of bytes it locked (presumably size comes back as the byte count
	 * left unprocessed).  Accumulate that into *lenp and convert it to
	 * a page count for the loop below.
	 */
398 	size = total - size;
399 	*lenp += size;
400 	size = size >> PAGESHIFT;
	/*
	 * ppp[] is indexed from 0 on every pass, while kaddr, cached_ppp
	 * and app keep advancing across a retry.
	 */
401 	i = 0;
402 	while (i < size) {
403 		pp = ppp[i];
404 		if (cow) {
405 			kmutex_t *ahm;
406 			/*
407 			 * Another solution is to hold SE_EXCL on pp, and
408 			 * disable PROT_WRITE. This also works for MAP_SHARED
409 			 * segment. The disadvantage is that it locks the
410 			 * page from being used by anybody else.
411 			 */
412 			ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
413 			mutex_enter(ahm);
414 			*app = swap_anon(pp->p_vnode, pp->p_offset);
415 			/*
416 			 * Since we are holding the as lock, this avoids a
417 			 * potential race with anon_decref. (segvn_unmap and
418 			 * segvn_free needs the as writer lock to do anon_free.)
419 			 */
420 			if (*app != NULL) {
421 #if 0
422 				if ((*app)->an_refcnt == 0)
423 				/*
424 				 * Consider the following scenario (unlikely
425 				 * though):
426 				 * 1. an_refcnt == 2
427 				 * 2. we softlock the page.
428 				 * 3. cow occurs on this addr. So a new ap,
429 				 * page and mapping is established on addr.
430 				 * 4. an_refcnt drops to 1 (segvn_faultpage
431 				 * -> anon_decref(oldap))
432 				 * 5. the last ref to ap also drops (from
433 				 * another as). It ends up blocked inside
434 				 * anon_decref trying to get page's excl lock.
435 				 * 6. Later kcfree unlocks the page, call
436 				 * anon_decref -> oops, ap is gone already.
437 				 *
438 				 * Holding as writer lock solves all problems.
439 				 */
440 					*app = NULL;
441 				else
442 #endif
					/* take our own reference on the anon slot */
443 					(*app)->an_refcnt++;
444 			}
445 			mutex_exit(ahm);
446 		} else {
			/* no cow: there is no anon reference to report */
447 			*app = NULL;
448 		}
449 		if (kaddr != (caddr_t)-1) {
			/*
			 * Reload the kernel mapping only when this slot does
			 * not already map the page we just locked.
			 */
450 			if (pp != *cached_ppp) {
				/*
				 * An empty slot gets a fresh locked mapping;
				 * an occupied one is remapped to the new page.
				 */
451 				if (*cached_ppp == NULL)
452 					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
453 					    HAT_LOAD_NOCONSIST;
454 				else
455 					flags = HAT_LOAD_REMAP |
456 					    HAT_LOAD_NOCONSIST;
457 				/*
458 				 * In order to cache the kernel mapping after
459 				 * the user page is unlocked, we call
460 				 * hat_devload instead of hat_memload so
461 				 * that the kernel mapping we set up here is
462 				 * "invisible" to the rest of the world. This
463 				 * is not very pretty. But as long as the
464 				 * caller bears the responsibility of keeping
465 				 * cache consistency, we should be ok -
466 				 * HAT_NOCONSIST will get us an uncached
467 				 * mapping on VAC. hat_softlock will flush
468 				 * a VAC_WRITEBACK cache. Therefore the kaddr
469 				 * doesn't have to be of the same vcolor as
470 				 * uaddr.
471 				 * The alternative is - change hat_devload
472 				 * to get a cached mapping. Allocate a kaddr
473 				 * with the same vcolor as uaddr. Then
474 				 * hat_softlock won't need to flush the VAC.
475 				 */
476 				hat_devload(kas.a_hat, kaddr, PAGESIZE,
477 				    page_pptonum(pp), PROT_READ, flags);
478 				*cached_ppp = pp;
479 			}
480 			kaddr += PAGESIZE;
481 		}
		/* advance the per-page output cursors together with i */
482 		cached_ppp++;
483 		app++;
484 		++i;
485 	}
	/* Drop the writer lock taken for cow before possibly faulting. */
486 	if (cow) {
487 		AS_LOCK_EXIT(as);
488 	}
489 	if (first && res == FC_NOMAP) {
490 		/*
491 		 * If the address is not mapped yet, we call as_fault to
492 		 * fault the pages in. We could've fallen back to copy and
493 		 * let it fault in the pages. But for a mapped file, we
494 		 * normally reference each page only once. For zero-copy to
495 		 * be of any use, we'd better fault in the page now and try
496 		 * again.
497 		 */
498 		first = 0;
		/*
		 * Skip past what was already locked and retry once on the
		 * remainder.  The value as_fault returns is overwritten by
		 * the hat_softlock call at tryagain, so it does not affect
		 * the final result.
		 */
499 		size = size << PAGESHIFT;
500 		uaddr += size;
501 		total -= size;
502 		size = total;
503 		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		/* re-take the writer lock that the cow path expects to hold */
504 		if (cow)
505 			AS_LOCK_ENTER(as, RW_WRITER);
506 		goto tryagain;
507 	}
	/* Only FC_NOSUPPORT is reported; everything else returns 0. */
508 	switch (res) {
509 	case FC_NOSUPPORT:
510 		return (ENOTSUP);
511 	case FC_PROT:	/* Pretend we don't know about it. This will be */
512 			/* caught by the caller when uiomove fails. */
513 	case FC_NOMAP:
514 	case FC_OBJERR:
515 	default:
516 		return (0);
517 	}
518 }
519