xref: /illumos-gate/usr/src/uts/common/os/move.c (revision 67ce1dada345581246cd990d73516418f321a793)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/sysmacros.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/uio.h>
44 #include <sys/errno.h>
45 #include <sys/vmsystm.h>
46 #include <sys/cmn_err.h>
47 #include <vm/as.h>
48 #include <vm/page.h>
49 
50 #include <sys/dcopy.h>
51 
/* Max dcopy_cmd_poll() polls in uioafini() before falling back to blocking */
int64_t uioa_maxpoll = -1;	/* <0 = noblock, 0 = block, >0 = block after */

/* Indices into uioa_t uioa_hwst[]: the dcopy channel and current command */
#define	UIO_DCOPY_CHANNEL	0
#define	UIO_DCOPY_CMD		1
55 
56 /*
57  * Move "n" bytes at byte address "p"; "rw" indicates the direction
58  * of the move, and the I/O parameters are provided in "uio", which is
59  * update to reflect the data which was moved.  Returns 0 on success or
60  * a non-zero errno on failure.
61  */
62 int
63 uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
64 {
65 	struct iovec *iov;
66 	ulong_t cnt;
67 	int error;
68 
69 	while (n && uio->uio_resid) {
70 		iov = uio->uio_iov;
71 		cnt = MIN(iov->iov_len, n);
72 		if (cnt == 0l) {
73 			uio->uio_iov++;
74 			uio->uio_iovcnt--;
75 			continue;
76 		}
77 		switch (uio->uio_segflg) {
78 
79 		case UIO_USERSPACE:
80 		case UIO_USERISPACE:
81 			if (rw == UIO_READ) {
82 				error = xcopyout_nta(p, iov->iov_base, cnt,
83 				    (uio->uio_extflg & UIO_COPY_CACHED));
84 			} else {
85 				error = xcopyin_nta(iov->iov_base, p, cnt,
86 				    (uio->uio_extflg & UIO_COPY_CACHED));
87 			}
88 
89 			if (error)
90 				return (error);
91 			break;
92 
93 		case UIO_SYSSPACE:
94 			if (rw == UIO_READ)
95 				error = kcopy_nta(p, iov->iov_base, cnt,
96 				    (uio->uio_extflg & UIO_COPY_CACHED));
97 			else
98 				error = kcopy_nta(iov->iov_base, p, cnt,
99 				    (uio->uio_extflg & UIO_COPY_CACHED));
100 			if (error)
101 				return (error);
102 			break;
103 		}
104 		iov->iov_base += cnt;
105 		iov->iov_len -= cnt;
106 		uio->uio_resid -= cnt;
107 		uio->uio_loffset += cnt;
108 		p = (caddr_t)p + cnt;
109 		n -= cnt;
110 	}
111 	return (0);
112 }
113 
114 /*
115  * transfer a character value into the address space
116  * delineated by a uio and update fields within the
117  * uio for next character. Return 0 for success, EFAULT
118  * for error.
119  */
120 int
121 ureadc(int val, struct uio *uiop)
122 {
123 	struct iovec *iovp;
124 	unsigned char c;
125 
126 	/*
127 	 * first determine if uio is valid.  uiop should be
128 	 * non-NULL and the resid count > 0.
129 	 */
130 	if (!(uiop && uiop->uio_resid > 0))
131 		return (EFAULT);
132 
133 	/*
134 	 * scan through iovecs until one is found that is non-empty.
135 	 * Return EFAULT if none found.
136 	 */
137 	while (uiop->uio_iovcnt > 0) {
138 		iovp = uiop->uio_iov;
139 		if (iovp->iov_len <= 0) {
140 			uiop->uio_iovcnt--;
141 			uiop->uio_iov++;
142 		} else
143 			break;
144 	}
145 
146 	if (uiop->uio_iovcnt <= 0)
147 		return (EFAULT);
148 
149 	/*
150 	 * Transfer character to uio space.
151 	 */
152 
153 	c = (unsigned char) (val & 0xFF);
154 
155 	switch (uiop->uio_segflg) {
156 
157 	case UIO_USERISPACE:
158 	case UIO_USERSPACE:
159 		if (copyout(&c, iovp->iov_base, sizeof (unsigned char)))
160 			return (EFAULT);
161 		break;
162 
163 	case UIO_SYSSPACE: /* can do direct copy since kernel-kernel */
164 		*iovp->iov_base = c;
165 		break;
166 
167 	default:
168 		return (EFAULT); /* invalid segflg value */
169 	}
170 
171 	/*
172 	 * bump up/down iovec and uio members to reflect transfer.
173 	 */
174 	iovp->iov_base++;
175 	iovp->iov_len--;
176 	uiop->uio_resid--;
177 	uiop->uio_loffset++;
178 	return (0); /* success */
179 }
180 
181 /*
182  * return a character value from the address space
183  * delineated by a uio and update fields within the
184  * uio for next character. Return the character for success,
185  * -1 for error.
186  */
187 int
188 uwritec(struct uio *uiop)
189 {
190 	struct iovec *iovp;
191 	unsigned char c;
192 
193 	/*
194 	 * verify we were passed a valid uio structure.
195 	 * (1) non-NULL uiop, (2) positive resid count
196 	 * (3) there is an iovec with positive length
197 	 */
198 
199 	if (!(uiop && uiop->uio_resid > 0))
200 		return (-1);
201 
202 	while (uiop->uio_iovcnt > 0) {
203 		iovp = uiop->uio_iov;
204 		if (iovp->iov_len <= 0) {
205 			uiop->uio_iovcnt--;
206 			uiop->uio_iov++;
207 		} else
208 			break;
209 	}
210 
211 	if (uiop->uio_iovcnt <= 0)
212 		return (-1);
213 
214 	/*
215 	 * Get the character from the uio address space.
216 	 */
217 	switch (uiop->uio_segflg) {
218 
219 	case UIO_USERISPACE:
220 	case UIO_USERSPACE:
221 		if (copyin(iovp->iov_base, &c, sizeof (unsigned char)))
222 			return (-1);
223 		break;
224 
225 	case UIO_SYSSPACE:
226 		c = *iovp->iov_base;
227 		break;
228 
229 	default:
230 		return (-1); /* invalid segflg */
231 	}
232 
233 	/*
234 	 * Adjust fields of iovec and uio appropriately.
235 	 */
236 	iovp->iov_base++;
237 	iovp->iov_len--;
238 	uiop->uio_resid--;
239 	uiop->uio_loffset++;
240 	return ((int)c & 0xFF); /* success */
241 }
242 
243 /*
244  * Drop the next n chars out of *uiop.
245  */
246 void
247 uioskip(uio_t *uiop, size_t n)
248 {
249 	if (n > uiop->uio_resid)
250 		return;
251 	while (n != 0) {
252 		register iovec_t	*iovp = uiop->uio_iov;
253 		register size_t		niovb = MIN(iovp->iov_len, n);
254 
255 		if (niovb == 0) {
256 			uiop->uio_iov++;
257 			uiop->uio_iovcnt--;
258 			continue;
259 		}
260 		iovp->iov_base += niovb;
261 		uiop->uio_loffset += niovb;
262 		iovp->iov_len -= niovb;
263 		uiop->uio_resid -= niovb;
264 		n -= niovb;
265 	}
266 }
267 
268 /*
269  * Dup the suio into the duio and diovec of size diov_cnt. If diov
270  * is too small to dup suio then an error will be returned, else 0.
271  */
272 int
273 uiodup(uio_t *suio, uio_t *duio, iovec_t *diov, int diov_cnt)
274 {
275 	int ix;
276 	iovec_t *siov = suio->uio_iov;
277 
278 	*duio = *suio;
279 	for (ix = 0; ix < suio->uio_iovcnt; ix++) {
280 		diov[ix] = siov[ix];
281 		if (ix >= diov_cnt)
282 			return (1);
283 	}
284 	duio->uio_iov = diov;
285 	return (0);
286 }
287 
/*
 * Shadow state for checking if a platform has hardware asynchronous
 * copy capability and minimum copy size, e.g. Intel's I/OAT dma engine.
 *
 * Dcopy does a call-back to uioa_dcopy_enable() when a dma device calls
 * into dcopy to register and uioa_dcopy_disable() when the device calls
 * into dcopy to unregister.
 */
uioasync_t uioasync = {B_FALSE, 1024};	/* disabled until dcopy registers */
297 
298 void
299 uioa_dcopy_enable()
300 {
301 	uioasync.enabled = B_TRUE;
302 }
303 
304 void
305 uioa_dcopy_disable()
306 {
307 	uioasync.enabled = B_FALSE;
308 }
309 
/*
 * Schedule an asynchronous move of "n" bytes at byte address "p",
 * "rw" indicates the direction of the move, I/O parameters and
 * async state are provided in "uioa" which is updated to reflect
 * the data which is to be moved.
 *
 * Returns 0 on success or a non-zero errno on failure.
 *
 * Note, while the uioasync APIs are general purpose in design
 * the current implementation is Intel I/OAT specific.
 */
int
uioamove(void *p, size_t n, enum uio_rw rw, uioa_t *uioa)
{
	int		soff, doff;	/* src/dst offsets within a page */
	uint64_t	pa;		/* physical address for the dma */
	int		cnt;		/* bytes scheduled per dma command */
	iovec_t		*iov;
	dcopy_handle_t	channel;
	dcopy_cmd_t	cmd;
	int		ret = 0;
	int		dcopy_flags;

	if (!(uioa->uioa_state & UIOA_ENABLED)) {
		/* The uioa_t isn't enabled */
		return (ENXIO);
	}

	if (uioa->uio_segflg != UIO_USERSPACE || rw != UIO_READ) {
		/* Only support to user-land from kernel */
		return (ENOTSUP);
	}


	channel = uioa->uioa_hwst[UIO_DCOPY_CHANNEL];
	cmd = uioa->uioa_hwst[UIO_DCOPY_CMD];
	dcopy_flags = DCOPY_NOSLEEP;

	/*
	 * While source bytes and destination bytes.
	 */
	while (n > 0 && uioa->uio_resid > 0) {
		iov = uioa->uio_iov;
		if (iov->iov_len == 0l) {
			/*
			 * Empty iovec: advance to the next one and to its
			 * corresponding locked-page list (uioa_lcur/lppp
			 * track the pages locked down by uioainit()).
			 */
			uioa->uio_iov++;
			uioa->uio_iovcnt--;
			uioa->uioa_lcur++;
			uioa->uioa_lppp = uioa->uioa_lcur->uioa_ppp;
			continue;
		}
		/*
		 * While source bytes schedule an async
		 * dma for destination page by page.
		 */
		while (n > 0) {
			/* Addr offset in page src/dst */
			soff = (uintptr_t)p & PAGEOFFSET;
			doff = (uintptr_t)iov->iov_base & PAGEOFFSET;
			/* Min copy count src and dst and page sized */
			cnt = MIN(n, iov->iov_len);
			cnt = MIN(cnt, PAGESIZE - soff);
			cnt = MIN(cnt, PAGESIZE - doff);
			/* XXX if next page(s) contiguous could use multipage */

			/*
			 * if we have an old command, we want to link all
			 * other commands to the next command we alloced so
			 * we only need to track the last command but can
			 * still free them all.
			 */
			if (cmd != NULL) {
				dcopy_flags |= DCOPY_ALLOC_LINK;
			}
			ret = dcopy_cmd_alloc(channel, dcopy_flags, &cmd);
			if (ret != DCOPY_SUCCESS) {
				/* Error of some sort */
				return (EIO);
			}
			/* Remember latest command for uioafini() to reap */
			uioa->uioa_hwst[UIO_DCOPY_CMD] = cmd;

			ASSERT(cmd->dp_version == DCOPY_CMD_V0);
			if (uioa_maxpoll >= 0) {
				/* Blocking (>0 may be) used in uioafini() */
				cmd->dp_flags = DCOPY_CMD_INTR;
			} else {
				/* Non blocking uioafini() so no intr */
				cmd->dp_flags = DCOPY_CMD_NOFLAGS;
			}
			cmd->dp_cmd = DCOPY_CMD_COPY;
			/* Source physical address, via the kernel hat */
			pa = ptob((uint64_t)hat_getpfnum(kas.a_hat, p));
			cmd->dp.copy.cc_source = pa + soff;
			if (uioa->uioa_lcur->uioa_pfncnt == 0) {
				/* Have a (page_t **) */
				pa = ptob((uint64_t)(
				    *(page_t **)uioa->uioa_lppp)->p_pagenum);
			} else {
				/* Have a (pfn_t *) */
				pa = ptob((uint64_t)(
				    *(pfn_t *)uioa->uioa_lppp));
			}
			cmd->dp.copy.cc_dest = pa + doff;
			cmd->dp.copy.cc_size = cnt;
			ret = dcopy_cmd_post(cmd);
			if (ret != DCOPY_SUCCESS) {
				/* Error of some sort */
				return (EIO);
			}
			ret = 0;

			/* If UIOA_POLL not set, set it */
			if (!(uioa->uioa_state & UIOA_POLL))
				uioa->uioa_state |= UIOA_POLL;

			/* Update iov, uio, and local pointers/counters */
			iov->iov_base += cnt;
			iov->iov_len -= cnt;
			uioa->uio_resid -= cnt;
			uioa->uioa_mbytes += cnt;
			uioa->uio_loffset += cnt;
			p = (caddr_t)p + cnt;
			n -= cnt;

			/* End of iovec? */
			if (iov->iov_len == 0) {
				/* Yup, next iovec */
				break;
			}

			/* Next dst addr page? */
			if (doff + cnt == PAGESIZE) {
				/* Yup, next page_t */
				uioa->uioa_lppp++;
			}
		}
	}

	return (ret);
}
448 
449 /*
450  * Initialize a uioa_t for a given uio_t for the current user context,
451  * copy the common uio_t to the uioa_t, walk the shared iovec_t and
452  * lock down the user-land page(s) containing iovec_t data, then mapin
453  * user-land pages using segkpm.
454  */
455 int
456 uioainit(uio_t *uiop, uioa_t *uioap)
457 {
458 	caddr_t	addr;
459 	page_t		**pages;
460 	int		off;
461 	int		len;
462 	proc_t		*procp = ttoproc(curthread);
463 	struct as	*as = procp->p_as;
464 	iovec_t		*iov = uiop->uio_iov;
465 	int32_t		iovcnt = uiop->uio_iovcnt;
466 	uioa_page_t	*locked = uioap->uioa_locked;
467 	dcopy_handle_t	channel;
468 	int		error;
469 
470 	if (! (uioap->uioa_state & UIOA_ALLOC)) {
471 		/* Can only init() a freshly allocated uioa_t */
472 		return (EINVAL);
473 	}
474 
475 	error = dcopy_alloc(DCOPY_NOSLEEP, &channel);
476 	if (error == DCOPY_NORESOURCES) {
477 		/* Turn off uioa */
478 		uioasync.enabled = B_FALSE;
479 		return (ENODEV);
480 	}
481 	if (error != DCOPY_SUCCESS) {
482 		/* Alloc failed */
483 		return (EIO);
484 	}
485 
486 	uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = channel;
487 	uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL;
488 
489 	/* Indicate uioa_t (will be) initialized */
490 	uioap->uioa_state = UIOA_INIT;
491 
492 	uioap->uioa_mbytes = 0;
493 
494 	uioap->uioa_mbytes = 0;
495 
496 	/* uio_t/uioa_t uio_t common struct copy */
497 	*((uio_t *)uioap) = *uiop;
498 
499 	/* initialize *uiop->uio_iov */
500 	if (iovcnt > UIOA_IOV_MAX) {
501 		/* Too big? */
502 		return (E2BIG);
503 	}
504 	uioap->uio_iov = iov;
505 	uioap->uio_iovcnt = iovcnt;
506 
507 	/* Mark the uioap as such */
508 	uioap->uio_extflg |= UIO_ASYNC;
509 
510 	/*
511 	 * For each iovec_t, lock-down the page(s) backing the iovec_t
512 	 * and save the page_t list for phys addr use in uioamove().
513 	 */
514 	iov = uiop->uio_iov;
515 	iovcnt = uiop->uio_iovcnt;
516 	while (iovcnt > 0) {
517 		addr = iov->iov_base;
518 		off = (uintptr_t)addr & PAGEOFFSET;
519 		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
520 		len = iov->iov_len + off;
521 
522 		/* Lock down page(s) for the iov span */
523 		if ((error = as_pagelock(as, &pages,
524 		    iov->iov_base, iov->iov_len, S_WRITE)) != 0) {
525 			/* Error */
526 			goto cleanup;
527 		}
528 
529 		if (pages == NULL) {
530 			/*
531 			 * Need page_t list, really only need
532 			 * a pfn list so build one.
533 			 */
534 			pfn_t   *pfnp;
535 			int	pcnt = len >> PAGESHIFT;
536 
537 			if (off)
538 				pcnt++;
539 			if ((pfnp = kmem_alloc(pcnt * sizeof (pfnp),
540 			    KM_NOSLEEP)) == NULL) {
541 				error = ENOMEM;
542 				goto cleanup;
543 			}
544 			locked->uioa_ppp = (void **)pfnp;
545 			locked->uioa_pfncnt = pcnt;
546 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
547 			while (pcnt-- > 0) {
548 				*pfnp++ = hat_getpfnum(as->a_hat, addr);
549 				addr += PAGESIZE;
550 			}
551 			AS_LOCK_EXIT(as, &as->a_lock);
552 		} else {
553 			/* Have a page_t list, save it */
554 			locked->uioa_ppp = (void **)pages;
555 			locked->uioa_pfncnt = 0;
556 		}
557 		/* Save for as_pageunlock() in uioafini() */
558 		locked->uioa_base = iov->iov_base;
559 		locked->uioa_len = iov->iov_len;
560 		locked++;
561 
562 		/* Next iovec_t */
563 		iov++;
564 		iovcnt--;
565 	}
566 	/* Initialize curret pointer into uioa_locked[] and it's uioa_ppp */
567 	uioap->uioa_lcur = uioap->uioa_locked;
568 	uioap->uioa_lppp = uioap->uioa_lcur->uioa_ppp;
569 	return (0);
570 
571 cleanup:
572 	/* Unlock any previously locked page_t(s) */
573 	while (locked > uioap->uioa_locked) {
574 		locked--;
575 		as_pageunlock(as, (page_t **)locked->uioa_ppp,
576 		    locked->uioa_base, locked->uioa_len, S_WRITE);
577 	}
578 
579 	/* Last indicate uioa_t still in alloc state */
580 	uioap->uioa_state = UIOA_ALLOC;
581 	uioap->uioa_mbytes = 0;
582 
583 	return (error);
584 }
585 
/*
 * Finish processing of a uioa_t by cleanup any pending "uioap" actions.
 *
 * Waits for (or polls out) the last posted dcopy command, frees the
 * dcopy command chain and channel, unlocks all user pages locked by
 * uioainit(), copies the common uio_t state back to "uiop", and
 * returns the uioa_t to the UIOA_ALLOC state.
 *
 * Returns 0 on success or a non-zero errno on failure.
 */
int
uioafini(uio_t *uiop, uioa_t *uioap)
{
	int32_t		iovcnt = uiop->uio_iovcnt;
	uioa_page_t	*locked = uioap->uioa_locked;
	struct as	*as = ttoproc(curthread)->p_as;
	dcopy_handle_t	channel;
	dcopy_cmd_t	cmd;
	int		ret = 0;

	ASSERT(uioap->uio_extflg & UIO_ASYNC);

	if (!(uioap->uioa_state & (UIOA_ENABLED|UIOA_FINI))) {
		/* Must be an active uioa_t */
		return (EINVAL);
	}

	channel = uioap->uioa_hwst[UIO_DCOPY_CHANNEL];
	cmd = uioap->uioa_hwst[UIO_DCOPY_CMD];

	/* XXX - why do we get cmd == NULL sometimes? */
	if (cmd != NULL) {
		if (uioap->uioa_state & UIOA_POLL) {
			/* Wait for last dcopy() to finish */
			int64_t poll = 1;
			int poll_flag = DCOPY_POLL_NOFLAGS;

			/*
			 * Poll up to uioa_maxpoll times, then block;
			 * uioa_maxpoll == 0 blocks immediately and
			 * uioa_maxpoll < 0 never blocks (polls forever).
			 */
			do {
				if (uioa_maxpoll == 0 ||
				    (uioa_maxpoll > 0 &&
				    poll >= uioa_maxpoll)) {
					/* Always block or after maxpoll */
					poll_flag = DCOPY_POLL_BLOCK;
				} else {
					/* No block, poll */
					poll++;
				}
				ret = dcopy_cmd_poll(cmd, poll_flag);
			} while (ret == DCOPY_PENDING);

			if (ret == DCOPY_COMPLETED) {
				/* Poll/block succeeded */
				ret = 0;
			} else {
				/* Poll/block failed */
				ret = EIO;
			}
		}
		dcopy_cmd_free(&cmd);
	}

	dcopy_free(&channel);

	/* Unlock all page(s) iovec_t by iovec_t */
	while (iovcnt-- > 0) {
		page_t **pages;

		if (locked->uioa_pfncnt == 0) {
			/* A as_pagelock() returned (page_t **) */
			pages = (page_t **)locked->uioa_ppp;
		} else {
			/*
			 * Our pfn_t array, built by uioainit(); free it
			 * and pass a NULL page list to as_pageunlock().
			 */
			pages = NULL;
			kmem_free(locked->uioa_ppp, locked->uioa_pfncnt *
			    sizeof (pfn_t *));
		}
		as_pageunlock(as, pages, locked->uioa_base, locked->uioa_len,
		    S_WRITE);

		locked++;
	}
	/* uioa_t->uio_t common struct copy */
	*uiop = *((uio_t *)uioap);

	/*
	 * Last, reset uioa state to alloc.
	 *
	 * Note, we only initialize the state here, all other members
	 * will be initialized in a subsequent uioainit().
	 */
	uioap->uioa_state = UIOA_ALLOC;
	uioap->uioa_mbytes = 0;

	uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL;
	uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = NULL;

	return (ret);
}
677