/* xref: /freebsd/sys/kern/subr_uio.c (revision 9fd69f37d28cfd7438cac3eeb45fe9dd46b4d7dd) */
/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef ZERO_COPY_SOCKETS
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#endif

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

/*
 * Identify the physical page mapped at the given kernel virtual
 * address.  Insert this physical page into the given address space at
 * the given virtual address, replacing the physical page, if any,
 * that already exists there.
 */
static int
vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex;
	vm_prot_t prot;
	boolean_t wired;

	KASSERT((uaddr & PAGE_MASK) == 0,
	    ("vm_pgmoveco: uaddr is not page aligned"));

	/*
	 * Herein the physical page is validated and dirtied.  It is
	 * unwired in sf_buf_mext().
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
	kern_pg->valid = VM_PAGE_BITS_ALL;
	KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1,
	    ("vm_pgmoveco: kern_pg is not correctly wired"));

	if ((vm_map_lookup(&map, uaddr,
			   VM_PROT_WRITE, &entry, &uobject,
			   &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return (EFAULT);
	}
	VM_OBJECT_LOCK(uobject);
retry:
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		if (vm_page_sleep_if_busy(user_pg, TRUE, "vm_pgmoveco"))
			goto retry;
		vm_page_lock_queues();
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
	} else {
		/*
		 * Even if a physical page does not exist in the
		 * object chain's first object, a physical page from a
		 * backing object may be mapped read only.
		 */
		if (uobject->backing_object != NULL)
			pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
		vm_page_lock_queues();
	}
	vm_page_insert(kern_pg, uobject, upindex);
	vm_page_dirty(kern_pg);
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return (KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

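/*
 * General routine to move data between a kernel-space buffer and the
 * user- or kernel-space regions described by a struct uio.  For
 * UIO_READ the data flows from cp into the iovecs; for UIO_WRITE it
 * flows from the iovecs into cp.  The uio's iovecs, residual count and
 * offset are advanced as the data is transferred.
 */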
int
uiomove(void *cp, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling uiomove()");

	save = td->td_pflags & TDP_DEADLKTREAT;
	td->td_pflags |= TDP_DEADLKTREAT;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	if (save == 0)
		td->td_pflags &= ~TDP_DEADLKTREAT;
	return (error);
}

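/*
 * Illustrative sketch (not part of this file): a typical consumer of
 * uiomove() is a character device read routine that exposes a kernel
 * buffer to userland.  The device name, buffer and FOO_BUFSIZE below
 * are hypothetical; only the uiomove() call pattern is the point.
 *
 *	static char foo_buf[FOO_BUFSIZE];	// hypothetical kernel buffer
 *
 *	static int
 *	foo_read(struct cdev *dev, struct uio *uio, int ioflag)
 *	{
 *		size_t amt;
 *
 *		if (uio->uio_offset >= FOO_BUFSIZE)
 *			return (0);		// EOF
 *		amt = MIN(uio->uio_resid, FOO_BUFSIZE - uio->uio_offset);
 *		// uiomove() advances uio_offset/uio_resid and the iovecs.
 *		return (uiomove(foo_buf + uio->uio_offset, amt, uio));
 *	}
 */
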
/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
 * is almost certainly a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
	unsigned int offset, n;

	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset)
		return (EINVAL);
	if (buflen <= 0 || offset >= buflen)
		return (0);
	if ((n = buflen - offset) > INT_MAX)
		return (EINVAL);
	return (uiomove((char *)buf + offset, n, uio));
}

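/*
 * Illustrative sketch (not part of this file): uiomove_frombuf() is
 * convenient when a read routine generates a complete, bounded kernel
 * buffer and wants the offset/length validation handled for it.  The
 * device entry point and message below are hypothetical.
 *
 *	static int
 *	bar_read(struct cdev *dev, struct uio *uio, int ioflag)
 *	{
 *		char msg[64];
 *		int len;
 *
 *		len = snprintf(msg, sizeof(msg), "ticks %d\n", ticks);
 *		// Offsets at or past 'len' simply transfer 0 bytes (EOF).
 *		return (uiomove_frombuf(msg, len, uio));
 *	}
 */
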
#ifdef ZERO_COPY_SOCKETS
/*
 * Experimental support for zero-copy I/O
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)
		 && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
			    (vm_offset_t)cp, (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

int
uiomoveco(void *cp, int n, struct uio *uio, int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();

			error = userspaceco(cp, cnt, uio, disposable);

			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Give next character to user as result of read.
 */
int
ureadc(int c, struct uio *uio)
{
	struct iovec *iov;
	char *iov_base;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling ureadc()");

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		iov_base = iov->iov_base;
		*iov_base = c;
		iov->iov_base = iov_base;
		break;

	case UIO_NOCOPY:
		break;
	}
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}

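/*
 * Voluntarily surrender the CPU: drop Giant, drop this thread's
 * priority back to its user priority and reschedule.  The uiomove()
 * loops above call this so that a large transfer cannot hog a CPU.
 */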
void
uio_yield(void)
{
	struct thread *td;

	td = curthread;
	DROP_GIANT();
	thread_lock(td);
	sched_prio(td, td->td_user_pri);
	mi_switch(SW_INVOL | SWT_RELINQUISH, NULL);
	thread_unlock(td);
	PICKUP_GIANT();
}

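/*
 * copyinfrom() and copyinstrfrom() are segment-aware variants of
 * copyin() and copyinstr(): "seg" selects whether the source address
 * lies in user space (UIO_USERSPACE) or kernel space (UIO_SYSSPACE).
 */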
int
copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
    int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyin(src, dst, len);
		break;
	case UIO_SYSSPACE:
		bcopy(src, dst, len);
		break;
	default:
		panic("copyinfrom: bad seg %d\n", seg);
	}
	return (error);
}

int
copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
    size_t * __restrict copied, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyinstr(src, dst, len, copied);
		break;
	case UIO_SYSSPACE:
		error = copystr(src, dst, len, copied);
		break;
	default:
		panic("copyinstrfrom: bad seg %d\n", seg);
	}
	return (error);
}

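/*
 * Copy an iovec array in from user space into a freshly allocated
 * kernel array.  "error" is the errno to hand back if iovcnt exceeds
 * UIO_MAXIOV (callers differ on which value applies, e.g. EINVAL or
 * EMSGSIZE).  On success the caller owns *iov and must release it
 * with free(*iov, M_IOV).
 */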
int
copyiniov(struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
{
	u_int iovlen;

	*iov = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (error);
	iovlen = iovcnt * sizeof (struct iovec);
	*iov = malloc(iovlen, M_IOV, M_WAITOK);
	error = copyin(iovp, *iov, iovlen);
	if (error) {
		free(*iov, M_IOV);
		*iov = NULL;
	}
	return (error);
}

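/*
 * Copy an iovec array in from user space and wrap it in a freshly
 * allocated struct uio describing a user-space transfer.  uio_offset
 * is left at -1 and uio_resid is the validated total length; the
 * caller sets uio_rw and uio_td and frees the result with
 * free(uio, M_IOV).
 */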
int
copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop)
{
	struct iovec *iov;
	struct uio *uio;
	u_int iovlen;
	int error, i;

	*uiop = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (EINVAL);
	iovlen = iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	iov = (struct iovec *)(uio + 1);
	error = copyin(iovp, iov, iovlen);
	if (error) {
		free(uio, M_IOV);
		return (error);
	}
	uio->uio_iov = iov;
	uio->uio_iovcnt = iovcnt;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_offset = -1;
	uio->uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		if (iov->iov_len > INT_MAX - uio->uio_resid) {
			free(uio, M_IOV);
			return (EINVAL);
		}
		uio->uio_resid += iov->iov_len;
		iov++;
	}
	*uiop = uio;
	return (0);
}

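/*
 * Illustrative sketch (not part of this file): a readv()-style code
 * path would typically build its uio with copyinuio() and release it
 * once the I/O completes.  The variables user_iovp, user_iovcnt and
 * td below are hypothetical.
 *
 *	struct uio *auio;
 *	int error;
 *
 *	error = copyinuio(user_iovp, user_iovcnt, &auio);
 *	if (error)
 *		return (error);
 *	auio->uio_rw = UIO_READ;
 *	auio->uio_td = td;
 *	// ... hand auio to the file/vnode layer ...
 *	free(auio, M_IOV);
 */

/*
 * Duplicate a uio, including its iovec array, in a single M_IOV
 * allocation.  The caller frees the copy with free(uio, M_IOV).
 */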
struct uio *
cloneuio(struct uio *uiop)
{
	struct uio *uio;
	int iovlen;

	iovlen = uiop->uio_iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	*uio = *uiop;
	uio->uio_iov = (struct iovec *)(uio + 1);
	bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
	return (uio);
}
479