/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef ZERO_COPY_SOCKETS
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#endif

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
    "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

/*
 * Identify the physical page mapped at the given kernel virtual
 * address.  Insert this physical page into the given address space at
 * the given virtual address, replacing the physical page, if any,
 * that already exists there.
 */
static int
vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex;
	vm_prot_t prot;
	boolean_t wired;

	KASSERT((uaddr & PAGE_MASK) == 0,
	    ("vm_pgmoveco: uaddr is not page aligned"));

	/*
	 * Herein the physical page is validated and dirtied.  It is
	 * unwired in sf_buf_mext().
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
	kern_pg->valid = VM_PAGE_BITS_ALL;
	KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1,
	    ("vm_pgmoveco: kern_pg is not correctly wired"));

	if ((vm_map_lookup(&map, uaddr,
	    VM_PROT_WRITE, &entry, &uobject,
	    &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return (EFAULT);
	}
	VM_OBJECT_LOCK(uobject);
retry:
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		if (vm_page_sleep_if_busy(user_pg, TRUE, "vm_pgmoveco"))
			goto retry;
		vm_page_lock(user_pg);
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
		vm_page_unlock(user_pg);
	} else {
		/*
		 * Even if a physical page does not exist in the
		 * object chain's first object, a physical page from a
		 * backing object may be mapped read only.
		 */
		if (uobject->backing_object != NULL)
			pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
	}
	vm_page_insert(kern_pg, uobject, upindex);
	vm_page_dirty(kern_pg);
	VM_OBJECT_UNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return (KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Copy up to n bytes between the kernel buffer cp and the regions
 * described by uio, in the direction given by uio->uio_rw, advancing
 * the iovec array and updating uio_resid and uio_offset as data is
 * transferred.
 */
int
uiomove(void *cp, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling uiomove()");

	save = td->td_pflags & TDP_DEADLKTREAT;
	td->td_pflags |= TDP_DEADLKTREAT;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			maybe_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	if (save == 0)
		td->td_pflags &= ~TDP_DEADLKTREAT;
	return (error);
}

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove() accepts a signed (n) argument, which
 * is almost certainly a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
	unsigned int offset, n;

	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset)
		return (EINVAL);
	if (buflen <= 0 || offset >= buflen)
		return (0);
	if ((n = buflen - offset) > INT_MAX)
		return (EINVAL);
	return (uiomove((char *)buf + offset, n, uio));
}
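
/*
 * Example usage (a minimal sketch, not compiled): a read routine that
 * exposes a small fixed kernel buffer can hand the request straight to
 * uiomove_frombuf(), which clips the transfer against the buffer using
 * uio_offset and uio_resid and rejects negative or oversized requests.
 * The example_msg buffer and example_read() below are hypothetical.
 */
#if 0
static char example_msg[] = "hello from the kernel\n";

static int
example_read(struct uio *uio)
{

	/* Copies whatever part of the buffer lies past uio_offset. */
	return (uiomove_frombuf(example_msg, sizeof(example_msg), uio));
}
#endif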

#ifdef ZERO_COPY_SOCKETS
/*
 * Experimental support for zero-copy I/O
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)
		 && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
			    (vm_offset_t)cp, (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

int
uiomoveco(void *cp, int n, struct uio *uio, int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			maybe_yield();
			error = userspaceco(cp, cnt, uio, disposable);
			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Give the next character to the user as the result of a read.
 */
int
ureadc(int c, struct uio *uio)
{
	struct iovec *iov;
	char *iov_base;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling ureadc()");

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		iov_base = iov->iov_base;
		*iov_base = c;
		iov->iov_base = iov_base;
		break;

	case UIO_NOCOPY:
		break;
	}
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}
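
/*
 * Example usage (a minimal sketch, not compiled): ureadc() suits code that
 * produces output one character at a time, such as draining a small
 * software FIFO into a read request.  example_getc() and example_drain()
 * below are hypothetical.
 */
#if 0
static int example_getc(void);	/* hypothetical producer: next byte or -1 */

static int
example_drain(struct uio *uio)
{
	int c, error;

	error = 0;
	while (uio->uio_resid > 0) {
		c = example_getc();
		if (c < 0)
			break;
		error = ureadc(c, uio);
		if (error != 0)
			break;
	}
	return (error);
}
#endif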

/*
 * Copy len bytes from src to the kernel buffer dst, where seg indicates
 * whether src is a user-space or a kernel-space address.
 */
int
copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
    int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyin(src, dst, len);
		break;
	case UIO_SYSSPACE:
		bcopy(src, dst, len);
		break;
	default:
		panic("copyinfrom: bad seg %d\n", seg);
	}
	return (error);
}

/*
 * String variant of copyinfrom(): copy a NUL-terminated string of at most
 * len bytes from src to dst and report the number of bytes actually copied
 * in *copied.
 */
int
copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
    size_t * __restrict copied, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyinstr(src, dst, len, copied);
		break;
	case UIO_SYSSPACE:
		error = copystr(src, dst, len, copied);
		break;
	default:
		panic("copyinstrfrom: bad seg %d\n", seg);
	}
	return (error);
}

/*
 * Copy an array of iovcnt iovec structures in from user space into a
 * malloc(9)'ed buffer.  On success the caller owns *iov and must release
 * it with free(*iov, M_IOV); on failure *iov is NULL.  The caller-supplied
 * error value is returned when iovcnt exceeds UIO_MAXIOV.
 */
int
copyiniov(struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
{
	u_int iovlen;

	*iov = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (error);
	iovlen = iovcnt * sizeof (struct iovec);
	*iov = malloc(iovlen, M_IOV, M_WAITOK);
	error = copyin(iovp, *iov, iovlen);
	if (error) {
		free(*iov, M_IOV);
		*iov = NULL;
	}
	return (error);
}

/*
 * Build a struct uio describing a user-space I/O request from the array of
 * iovcnt iovec structures at iovp.  The uio and its copy of the iovec array
 * are allocated as a single buffer that the caller must release with
 * free(uio, M_IOV).  uio_segflg, uio_resid and uio_offset (-1) are
 * initialized here; uio_rw and uio_td are left for the caller to fill in.
 */
int
copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop)
{
	struct iovec *iov;
	struct uio *uio;
	u_int iovlen;
	int error, i;

	*uiop = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (EINVAL);
	iovlen = iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	iov = (struct iovec *)(uio + 1);
	error = copyin(iovp, iov, iovlen);
	if (error) {
		free(uio, M_IOV);
		return (error);
	}
	uio->uio_iov = iov;
	uio->uio_iovcnt = iovcnt;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_offset = -1;
	uio->uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		if (iov->iov_len > INT_MAX - uio->uio_resid) {
			free(uio, M_IOV);
			return (EINVAL);
		}
		uio->uio_resid += iov->iov_len;
		iov++;
	}
	*uiop = uio;
	return (0);
}

/*
 * Duplicate uiop, including its iovec array, into a single malloc(9)'ed
 * buffer that the caller must release with free(.., M_IOV).
 */
struct uio *
cloneuio(struct uio *uiop)
{
	struct uio *uio;
	int iovlen;

	iovlen = uiop->uio_iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	*uio = *uiop;
	uio->uio_iov = (struct iovec *)(uio + 1);
	bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
	return (uio);
}
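
/*
 * Example usage (a minimal sketch, not compiled): a readv(2)-style path
 * typically builds its struct uio with copyinuio() and releases it with
 * free(9) when the I/O is complete.  example_readv() below is
 * hypothetical.
 */
#if 0
static int
example_readv(struct thread *td, struct iovec *uiovp, u_int iovcnt)
{
	struct uio *auio;
	int error;

	error = copyinuio(uiovp, iovcnt, &auio);
	if (error != 0)
		return (error);
	auio->uio_rw = UIO_READ;
	auio->uio_td = td;
	/* ... hand auio to the producer of the data, e.g. uiomove() ... */
	free(auio, M_IOV);
	return (error);
}
#endif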

/*
 * Map sz bytes of anonymous memory into the calling process's user address
 * space, with sz rounded up to a page boundary; the chosen address is
 * returned in *addr.
 */
int
copyout_map(struct thread *td, vm_offset_t *addr, size_t sz)
{
	struct vmspace *vms;
	int error;
	vm_size_t size;

	vms = td->td_proc->p_vmspace;

	/*
	 * Map somewhere after the heap in the process's memory.
	 */
	PROC_LOCK(td->td_proc);
	*addr = round_page((vm_offset_t)vms->vm_daddr +
	    lim_max(td->td_proc, RLIMIT_DATA));
	PROC_UNLOCK(td->td_proc);

	/* Round the size up to a page boundary. */
	size = (vm_size_t)round_page(sz);

	error = vm_mmap(&vms->vm_map, addr, size, PROT_READ | PROT_WRITE,
	    VM_PROT_ALL, MAP_PRIVATE | MAP_ANON, OBJT_DEFAULT, NULL, 0);

	return (error);
}

/*
 * Unmap a region previously mapped into user space, e.g. by copyout_map().
 */
int
copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz)
{
	vm_map_t map;
	vm_size_t size;

	if (sz == 0)
		return (0);

	map = &td->td_proc->p_vmspace->vm_map;
	size = (vm_size_t)round_page(sz);

	if (vm_map_remove(map, addr, addr + size) != KERN_SUCCESS)
		return (EINVAL);

	return (0);
}
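
/*
 * Example usage (a minimal sketch, not compiled): copyout_map() and
 * copyout_unmap() are handy when kernel code needs scratch space in a
 * process's address space, e.g. to hand a structure to a process that
 * expects to find it in its own memory.  example_push_to_user() below is
 * hypothetical.
 */
#if 0
static int
example_push_to_user(struct thread *td, const void *data, size_t len,
    vm_offset_t *uaddrp)
{
	vm_offset_t uaddr;
	int error;

	error = copyout_map(td, &uaddr, len);
	if (error != 0)
		return (error);
	error = copyout(data, (void *)uaddr, len);
	if (error != 0) {
		(void)copyout_unmap(td, uaddr, len);
		return (error);
	}
	*uaddrp = uaddr;
	return (0);
}
#endif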