xref: /illumos-gate/usr/src/uts/common/os/move.c (revision 36615d24946b849e48cedbbafa9adfb4a02b590c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #pragma ident	"%Z%%M%	%I%	%E% SMI"
40 
41 #include <sys/types.h>
42 #include <sys/sysmacros.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/uio.h>
46 #include <sys/errno.h>
47 #include <sys/vmsystm.h>
48 #include <sys/cmn_err.h>
49 #include <vm/as.h>
50 #include <vm/page.h>
51 
52 #include <sys/dcopy.h>
53 
/*
 * uioa_maxpoll governs how uioafini() waits for in-flight dcopy commands:
 * <0 = never block (poll only), 0 = always block, >0 = poll up to that
 * many times, then block.
 */
int64_t uioa_maxpoll = -1;	/* <0 = noblock, 0 = block, >0 = block after */
/* Indices into uioa_t's uioa_hwst[] for the dcopy channel and command. */
#define	UIO_DCOPY_CHANNEL	0
#define	UIO_DCOPY_CMD		1
57 
58 /*
59  * Move "n" bytes at byte address "p"; "rw" indicates the direction
60  * of the move, and the I/O parameters are provided in "uio", which is
61  * updated to reflect the data which was moved.  Returns 0 on success or
62  * a non-zero errno on failure.
63  */
64 int
65 uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
66 {
67 	struct iovec *iov;
68 	ulong_t cnt;
69 	int error;
70 
71 	while (n && uio->uio_resid) {
72 		iov = uio->uio_iov;
73 		cnt = MIN(iov->iov_len, n);
74 		if (cnt == 0l) {
75 			uio->uio_iov++;
76 			uio->uio_iovcnt--;
77 			continue;
78 		}
79 		switch (uio->uio_segflg) {
80 
81 		case UIO_USERSPACE:
82 		case UIO_USERISPACE:
83 			if (rw == UIO_READ) {
84 				error = xcopyout_nta(p, iov->iov_base, cnt,
85 				    (uio->uio_extflg & UIO_COPY_CACHED));
86 			} else {
87 				error = xcopyin_nta(iov->iov_base, p, cnt,
88 				    (uio->uio_extflg & UIO_COPY_CACHED));
89 			}
90 
91 			if (error)
92 				return (error);
93 			break;
94 
95 		case UIO_SYSSPACE:
96 			if (rw == UIO_READ)
97 				error = kcopy_nta(p, iov->iov_base, cnt,
98 				    (uio->uio_extflg & UIO_COPY_CACHED));
99 			else
100 				error = kcopy_nta(iov->iov_base, p, cnt,
101 				    (uio->uio_extflg & UIO_COPY_CACHED));
102 			if (error)
103 				return (error);
104 			break;
105 		}
106 		iov->iov_base += cnt;
107 		iov->iov_len -= cnt;
108 		uio->uio_resid -= cnt;
109 		uio->uio_loffset += cnt;
110 		p = (caddr_t)p + cnt;
111 		n -= cnt;
112 	}
113 	return (0);
114 }
115 
116 /*
117  * transfer a character value into the address space
118  * delineated by a uio and update fields within the
119  * uio for next character. Return 0 for success, EFAULT
120  * for error.
121  */
122 int
123 ureadc(int val, struct uio *uiop)
124 {
125 	struct iovec *iovp;
126 	unsigned char c;
127 
128 	/*
129 	 * first determine if uio is valid.  uiop should be
130 	 * non-NULL and the resid count > 0.
131 	 */
132 	if (!(uiop && uiop->uio_resid > 0))
133 		return (EFAULT);
134 
135 	/*
136 	 * scan through iovecs until one is found that is non-empty.
137 	 * Return EFAULT if none found.
138 	 */
139 	while (uiop->uio_iovcnt > 0) {
140 		iovp = uiop->uio_iov;
141 		if (iovp->iov_len <= 0) {
142 			uiop->uio_iovcnt--;
143 			uiop->uio_iov++;
144 		} else
145 			break;
146 	}
147 
148 	if (uiop->uio_iovcnt <= 0)
149 		return (EFAULT);
150 
151 	/*
152 	 * Transfer character to uio space.
153 	 */
154 
155 	c = (unsigned char) (val & 0xFF);
156 
157 	switch (uiop->uio_segflg) {
158 
159 	case UIO_USERISPACE:
160 	case UIO_USERSPACE:
161 		if (copyout(&c, iovp->iov_base, sizeof (unsigned char)))
162 			return (EFAULT);
163 		break;
164 
165 	case UIO_SYSSPACE: /* can do direct copy since kernel-kernel */
166 		*iovp->iov_base = c;
167 		break;
168 
169 	default:
170 		return (EFAULT); /* invalid segflg value */
171 	}
172 
173 	/*
174 	 * bump up/down iovec and uio members to reflect transfer.
175 	 */
176 	iovp->iov_base++;
177 	iovp->iov_len--;
178 	uiop->uio_resid--;
179 	uiop->uio_loffset++;
180 	return (0); /* success */
181 }
182 
183 /*
184  * return a character value from the address space
185  * delineated by a uio and update fields within the
186  * uio for next character. Return the character for success,
187  * -1 for error.
188  */
189 int
190 uwritec(struct uio *uiop)
191 {
192 	struct iovec *iovp;
193 	unsigned char c;
194 
195 	/*
196 	 * verify we were passed a valid uio structure.
197 	 * (1) non-NULL uiop, (2) positive resid count
198 	 * (3) there is an iovec with positive length
199 	 */
200 
201 	if (!(uiop && uiop->uio_resid > 0))
202 		return (-1);
203 
204 	while (uiop->uio_iovcnt > 0) {
205 		iovp = uiop->uio_iov;
206 		if (iovp->iov_len <= 0) {
207 			uiop->uio_iovcnt--;
208 			uiop->uio_iov++;
209 		} else
210 			break;
211 	}
212 
213 	if (uiop->uio_iovcnt <= 0)
214 		return (-1);
215 
216 	/*
217 	 * Get the character from the uio address space.
218 	 */
219 	switch (uiop->uio_segflg) {
220 
221 	case UIO_USERISPACE:
222 	case UIO_USERSPACE:
223 		if (copyin(iovp->iov_base, &c, sizeof (unsigned char)))
224 			return (-1);
225 		break;
226 
227 	case UIO_SYSSPACE:
228 		c = *iovp->iov_base;
229 		break;
230 
231 	default:
232 		return (-1); /* invalid segflg */
233 	}
234 
235 	/*
236 	 * Adjust fields of iovec and uio appropriately.
237 	 */
238 	iovp->iov_base++;
239 	iovp->iov_len--;
240 	uiop->uio_resid--;
241 	uiop->uio_loffset++;
242 	return ((int)c & 0xFF); /* success */
243 }
244 
245 /*
246  * Drop the next n chars out of *uiop.
247  */
248 void
249 uioskip(uio_t *uiop, size_t n)
250 {
251 	if (n > uiop->uio_resid)
252 		return;
253 	while (n != 0) {
254 		register iovec_t	*iovp = uiop->uio_iov;
255 		register size_t		niovb = MIN(iovp->iov_len, n);
256 
257 		if (niovb == 0) {
258 			uiop->uio_iov++;
259 			uiop->uio_iovcnt--;
260 			continue;
261 		}
262 		iovp->iov_base += niovb;
263 		uiop->uio_loffset += niovb;
264 		iovp->iov_len -= niovb;
265 		uiop->uio_resid -= niovb;
266 		n -= niovb;
267 	}
268 }
269 
270 /*
271  * Dup the suio into the duio and diovec of size diov_cnt. If diov
272  * is too small to dup suio then an error will be returned, else 0.
273  */
274 int
275 uiodup(uio_t *suio, uio_t *duio, iovec_t *diov, int diov_cnt)
276 {
277 	int ix;
278 	iovec_t *siov = suio->uio_iov;
279 
280 	*duio = *suio;
281 	for (ix = 0; ix < suio->uio_iovcnt; ix++) {
282 		diov[ix] = siov[ix];
283 		if (ix >= diov_cnt)
284 			return (1);
285 	}
286 	duio->uio_iov = diov;
287 	return (0);
288 }
289 
/*
 * Shadow state for checking if a platform has hardware asynchronous
 * copy capability and a minimum copy size, e.g. Intel's I/OAT dma engine.
 *
 * Dcopy does a call-back to uioa_dcopy_enable() when a dma device calls
 * into dcopy to register and uioa_dcopy_disable() when the device calls
 * into dcopy to unregister.
 */
uioasync_t uioasync = {B_FALSE, 1024};
299 
300 void
301 uioa_dcopy_enable()
302 {
303 	uioasync.enabled = B_TRUE;
304 }
305 
306 void
307 uioa_dcopy_disable()
308 {
309 	uioasync.enabled = B_FALSE;
310 }
311 
312 /*
313  * Schedule an asynchronous move of "n" bytes at byte address "p",
314  * "rw" indicates the direction of the move, I/O parameters and
315  * async state are provided in "uioa" which is updated to reflect
316  * the data which is to be moved.
317  *
318  * Returns 0 on success or a non-zero errno on failure.
319  *
320  * Note, while the uioasync APIs are general purpose in design
321  * the current implementation is Intel I/OAT specific.
322  */
int
uioamove(void *p, size_t n, enum uio_rw rw, uioa_t *uioa)
{
	int		soff, doff;	/* intra-page offsets of src/dst */
	uint64_t	pa;		/* physical address handed to dcopy */
	int		cnt;		/* bytes scheduled per dma command */
	iovec_t		*iov;
	dcopy_handle_t	channel;	/* dcopy channel set up by uioainit() */
	dcopy_cmd_t	cmd;		/* most recently allocated dcopy cmd */
	int		ret = 0;
	int		dcopy_flags;

	if (!(uioa->uioa_state & UIOA_ENABLED)) {
		/* The uioa_t isn't enabled */
		return (ENXIO);
	}

	if (uioa->uio_segflg != UIO_USERSPACE || rw != UIO_READ) {
		/* Only support to user-land from kernel */
		return (ENOTSUP);
	}


	channel = uioa->uioa_hwst[UIO_DCOPY_CHANNEL];
	cmd = uioa->uioa_hwst[UIO_DCOPY_CMD];
	dcopy_flags = DCOPY_NOSLEEP;

	/*
	 * While source bytes and destination bytes.
	 */
	while (n > 0 && uioa->uio_resid > 0) {
		iov = uioa->uio_iov;
		if (iov->iov_len == 0l) {
			/*
			 * Current iovec exhausted; advance to the next
			 * iovec and its locked-page state in lockstep.
			 */
			uioa->uio_iov++;
			uioa->uio_iovcnt--;
			uioa->uioa_lcur++;
			uioa->uioa_lppp = uioa->uioa_lcur->uioa_ppp;
			continue;
		}
		/*
		 * While source bytes schedule an async
		 * dma for destination page by page.
		 */
		while (n > 0) {
			/* Addr offset in page src/dst */
			soff = (uintptr_t)p & PAGEOFFSET;
			doff = (uintptr_t)iov->iov_base & PAGEOFFSET;
			/* Min copy count src and dst and page sized */
			cnt = MIN(n, iov->iov_len);
			cnt = MIN(cnt, PAGESIZE - soff);
			cnt = MIN(cnt, PAGESIZE - doff);
			/* XXX if next page(s) contiguous could use multipage */

			/*
			 * if we have an old command, we want to link all
			 * other commands to the next command we alloced so
			 * we only need to track the last command but can
			 * still free them all.
			 */
			if (cmd != NULL) {
				dcopy_flags |= DCOPY_ALLOC_LINK;
			}
			ret = dcopy_cmd_alloc(channel, dcopy_flags, &cmd);
			if (ret != DCOPY_SUCCESS) {
				/* Error of some sort */
				return (EIO);
			}
			/* Remember the last cmd for uioafini() to poll/free */
			uioa->uioa_hwst[UIO_DCOPY_CMD] = cmd;

			ASSERT(cmd->dp_version == DCOPY_CMD_V0);
			if (uioa_maxpoll >= 0) {
				/* Blocking (>0 may be) used in uioafini() */
				cmd->dp_flags = DCOPY_CMD_INTR;
			} else {
				/* Non blocking uioafini() so no intr */
				cmd->dp_flags = DCOPY_CMD_NOFLAGS;
			}
			cmd->dp_cmd = DCOPY_CMD_COPY;
			/* Source is a kernel virtual address; get its pfn */
			pa = ptob((uint64_t)hat_getpfnum(kas.a_hat, p));
			cmd->dp.copy.cc_source = pa + soff;
			/*
			 * Destination pfn comes from the list uioainit()
			 * saved: pfncnt == 0 means a page_t pointer list,
			 * otherwise a raw pfn_t list.
			 */
			if (uioa->uioa_lcur->uioa_pfncnt == 0) {
				/* Have a (page_t **) */
				pa = ptob((uint64_t)(
				    *(page_t **)uioa->uioa_lppp)->p_pagenum);
			} else {
				/* Have a (pfn_t *) */
				pa = ptob((uint64_t)(
				    *(pfn_t *)uioa->uioa_lppp));
			}
			cmd->dp.copy.cc_dest = pa + doff;
			cmd->dp.copy.cc_size = cnt;
			/* Hand the command to the dma engine */
			ret = dcopy_cmd_post(cmd);
			if (ret != DCOPY_SUCCESS) {
				/* Error of some sort */
				return (EIO);
			}
			ret = 0;

			/* If UIOA_POLL not set, set it */
			if (!(uioa->uioa_state & UIOA_POLL))
				uioa->uioa_state |= UIOA_POLL;

			/* Update iov, uio, and local pointers/counters */
			iov->iov_base += cnt;
			iov->iov_len -= cnt;
			uioa->uio_resid -= cnt;
			uioa->uio_loffset += cnt;
			p = (caddr_t)p + cnt;
			n -= cnt;

			/* End of iovec? */
			if (iov->iov_len == 0) {
				/* Yup, next iovec */
				break;
			}

			/* Next dst addr page? */
			if (doff + cnt == PAGESIZE) {
				/* Yup, next page_t */
				uioa->uioa_lppp++;
			}
		}
	}

	return (ret);
}
449 
450 /*
451  * Initialize a uioa_t for a given uio_t for the current user context,
452  * copy the common uio_t to the uioa_t, walk the shared iovec_t and
453  * lock down the user-land page(s) containing iovec_t data, then mapin
454  * user-land pages using segkpm.
455  */
456 int
457 uioainit(uio_t *uiop, uioa_t *uioap)
458 {
459 	caddr_t	addr;
460 	page_t		**pages;
461 	int		off;
462 	int		len;
463 	proc_t		*procp = ttoproc(curthread);
464 	struct as	*as = procp->p_as;
465 	iovec_t		*iov = uiop->uio_iov;
466 	int32_t		iovcnt = uiop->uio_iovcnt;
467 	uioa_page_t	*locked = uioap->uioa_locked;
468 	dcopy_handle_t	channel;
469 	int		error;
470 
471 	if (! (uioap->uioa_state & UIOA_ALLOC)) {
472 		/* Can only init() a freshly allocated uioa_t */
473 		return (EINVAL);
474 	}
475 
476 	error = dcopy_alloc(DCOPY_NOSLEEP, &channel);
477 	if (error == DCOPY_NORESOURCES) {
478 		/* Turn off uioa */
479 		uioasync.enabled = B_FALSE;
480 		return (ENODEV);
481 	}
482 	if (error != DCOPY_SUCCESS) {
483 		/* Alloc failed */
484 		return (EIO);
485 	}
486 
487 	uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = channel;
488 	uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL;
489 
490 	/* Indicate uioa_t (will be) initialized */
491 	uioap->uioa_state = UIOA_INIT;
492 
493 	/* uio_t/uioa_t uio_t common struct copy */
494 	*((uio_t *)uioap) = *uiop;
495 
496 	/* initialize *uiop->uio_iov */
497 	if (iovcnt > UIOA_IOV_MAX) {
498 		/* Too big? */
499 		return (E2BIG);
500 	}
501 	uioap->uio_iov = iov;
502 	uioap->uio_iovcnt = iovcnt;
503 
504 	/* Mark the uioap as such */
505 	uioap->uio_extflg |= UIO_ASYNC;
506 
507 	/*
508 	 * For each iovec_t, lock-down the page(s) backing the iovec_t
509 	 * and save the page_t list for phys addr use in uioamove().
510 	 */
511 	iov = uiop->uio_iov;
512 	iovcnt = uiop->uio_iovcnt;
513 	while (iovcnt > 0) {
514 		addr = iov->iov_base;
515 		off = (uintptr_t)addr & PAGEOFFSET;
516 		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
517 		len = iov->iov_len + off;
518 
519 		/* Lock down page(s) for the iov span */
520 		if ((error = as_pagelock(as, &pages,
521 		    iov->iov_base, iov->iov_len, S_WRITE)) != 0) {
522 			/* Error */
523 			goto cleanup;
524 		}
525 
526 		if (pages == NULL) {
527 			/*
528 			 * Need page_t list, really only need
529 			 * a pfn list so build one.
530 			 */
531 			pfn_t   *pfnp;
532 			int	pcnt = len >> PAGESHIFT;
533 
534 			if (off)
535 				pcnt++;
536 			if ((pfnp = kmem_alloc(pcnt * sizeof (pfnp),
537 			    KM_NOSLEEP)) == NULL) {
538 				error = ENOMEM;
539 				goto cleanup;
540 			}
541 			locked->uioa_ppp = (void **)pfnp;
542 			locked->uioa_pfncnt = pcnt;
543 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
544 			while (pcnt-- > 0) {
545 				*pfnp++ = hat_getpfnum(as->a_hat, addr);
546 				addr += PAGESIZE;
547 			}
548 			AS_LOCK_EXIT(as, &as->a_lock);
549 		} else {
550 			/* Have a page_t list, save it */
551 			locked->uioa_ppp = (void **)pages;
552 			locked->uioa_pfncnt = 0;
553 		}
554 		/* Save for as_pageunlock() in uioafini() */
555 		locked->uioa_base = iov->iov_base;
556 		locked->uioa_len = iov->iov_len;
557 		locked++;
558 
559 		/* Next iovec_t */
560 		iov++;
561 		iovcnt--;
562 	}
563 	/* Initialize curret pointer into uioa_locked[] and it's uioa_ppp */
564 	uioap->uioa_lcur = uioap->uioa_locked;
565 	uioap->uioa_lppp = uioap->uioa_lcur->uioa_ppp;
566 	return (0);
567 
568 cleanup:
569 	/* Unlock any previously locked page_t(s) */
570 	while (locked > uioap->uioa_locked) {
571 		locked--;
572 		as_pageunlock(as, (page_t **)locked->uioa_ppp,
573 		    locked->uioa_base, locked->uioa_len, S_WRITE);
574 	}
575 
576 	/* Last indicate uioa_t still in alloc state */
577 	uioap->uioa_state = UIOA_ALLOC;
578 
579 	return (error);
580 }
581 
582 /*
583  * Finish processing of a uioa_t by cleanup any pending "uioap" actions.
584  */
585 int
586 uioafini(uio_t *uiop, uioa_t *uioap)
587 {
588 	int32_t		iovcnt = uiop->uio_iovcnt;
589 	uioa_page_t	*locked = uioap->uioa_locked;
590 	struct as	*as = ttoproc(curthread)->p_as;
591 	dcopy_handle_t	channel;
592 	dcopy_cmd_t	cmd;
593 	int		ret = 0;
594 
595 	ASSERT(uioap->uio_extflg & UIO_ASYNC);
596 
597 	if (!(uioap->uioa_state & (UIOA_ENABLED|UIOA_FINI))) {
598 		/* Must be an active uioa_t */
599 		return (EINVAL);
600 	}
601 
602 	channel = uioap->uioa_hwst[UIO_DCOPY_CHANNEL];
603 	cmd = uioap->uioa_hwst[UIO_DCOPY_CMD];
604 
605 	/* XXX - why do we get cmd == NULL sometimes? */
606 	if (cmd != NULL) {
607 		if (uioap->uioa_state & UIOA_POLL) {
608 			/* Wait for last dcopy() to finish */
609 			int64_t poll = 1;
610 			int poll_flag = DCOPY_POLL_NOFLAGS;
611 
612 			do {
613 				if (uioa_maxpoll == 0 ||
614 				    (uioa_maxpoll > 0 &&
615 				    poll >= uioa_maxpoll)) {
616 					/* Always block or after maxpoll */
617 					poll_flag = DCOPY_POLL_BLOCK;
618 				} else {
619 					/* No block, poll */
620 					poll++;
621 				}
622 				ret = dcopy_cmd_poll(cmd, poll_flag);
623 			} while (ret == DCOPY_PENDING);
624 
625 			if (ret == DCOPY_COMPLETED) {
626 				/* Poll/block succeeded */
627 				ret = 0;
628 			} else {
629 				/* Poll/block failed */
630 				ret = EIO;
631 			}
632 		}
633 		dcopy_cmd_free(&cmd);
634 	}
635 
636 	dcopy_free(&channel);
637 
638 	/* Unlock all page(s) iovec_t by iovec_t */
639 	while (iovcnt-- > 0) {
640 		page_t **pages;
641 
642 		if (locked->uioa_pfncnt == 0) {
643 			/* A as_pagelock() returned (page_t **) */
644 			pages = (page_t **)locked->uioa_ppp;
645 		} else {
646 			/* Our pfn_t array */
647 			pages = NULL;
648 			kmem_free(locked->uioa_ppp, locked->uioa_pfncnt *
649 			    sizeof (pfn_t *));
650 		}
651 		as_pageunlock(as, pages, locked->uioa_base, locked->uioa_len,
652 		    S_WRITE);
653 
654 		locked++;
655 	}
656 	/* uioa_t->uio_t common struct copy */
657 	*uiop = *((uio_t *)uioap);
658 
659 	/*
660 	 * Last, reset uioa state to alloc.
661 	 *
662 	 * Note, we only initialize the state here, all other members
663 	 * will be initialized in a subsequent uioainit().
664 	 */
665 	uioap->uioa_state = UIOA_ALLOC;
666 
667 	uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL;
668 	uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = NULL;
669 
670 	return (ret);
671 }
672