/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_map.h>
#ifdef SOCKET_SEND_COW
#include <vm/vm_object.h>
#endif

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
    "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault);

#ifdef SOCKET_SEND_COW
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;
/*
 * Identify the physical page mapped at the given kernel virtual
 * address.  Insert this physical page into the given address space at
 * the given virtual address, replacing the physical page, if any,
 * that already exists there.
 */
static int
vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex;
	vm_prot_t prot;
	boolean_t wired;

	KASSERT((uaddr & PAGE_MASK) == 0,
	    ("vm_pgmoveco: uaddr is not page aligned"));

	/*
	 * Herein the physical page is validated and dirtied.  It is
	 * unwired in sf_buf_mext().
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
	kern_pg->valid = VM_PAGE_BITS_ALL;
	KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1,
	    ("vm_pgmoveco: kern_pg is not correctly wired"));

	if ((vm_map_lookup(&map, uaddr,
	    VM_PROT_WRITE, &entry, &uobject,
	    &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return (EFAULT);
	}
	VM_OBJECT_WLOCK(uobject);
retry:
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		if (vm_page_sleep_if_busy(user_pg, "vm_pgmoveco"))
			goto retry;
		vm_page_lock(user_pg);
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
		vm_page_unlock(user_pg);
	} else {
		/*
		 * Even if a physical page does not exist in the
		 * object chain's first object, a physical page from a
		 * backing object may be mapped read only.
		 */
		if (uobject->backing_object != NULL)
			pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
	}
	if (vm_page_insert(kern_pg, uobject, upindex)) {
		VM_OBJECT_WUNLOCK(uobject);
		VM_WAIT;
		VM_OBJECT_WLOCK(uobject);
		goto retry;
	}
	vm_page_dirty(kern_pg);
	VM_OBJECT_WUNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return (KERN_SUCCESS);
}
#endif /* SOCKET_SEND_COW */

int
copyin_nofault(const void *udaddr, void *kaddr, size_t len)
{
	int error, save;

	save = vm_fault_disable_pagefaults();
	error = copyin(udaddr, kaddr, len);
	vm_fault_enable_pagefaults(save);
	return (error);
}

int
copyout_nofault(const void *kaddr, void *udaddr, size_t len)
{
	int error, save;

	save = vm_fault_disable_pagefaults();
	error = copyout(kaddr, udaddr, len);
	vm_fault_enable_pagefaults(save);
	return (error);
}

#define	PHYS_PAGE_COUNT(len)	(howmany(len, PAGE_SIZE) + 1)

int
physcopyin(void *src, vm_paddr_t dst, size_t len)
{
	vm_page_t m[PHYS_PAGE_COUNT(len)];
	struct iovec iov[1];
	struct uio uio;
	int i;

	iov[0].iov_base = src;
	iov[0].iov_len = len;
	uio.uio_iov = iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = len;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	for (i = 0; i < PHYS_PAGE_COUNT(len); i++, dst += PAGE_SIZE)
		m[i] = PHYS_TO_VM_PAGE(dst);
	return (uiomove_fromphys(m, dst & PAGE_MASK, len, &uio));
}

int
physcopyout(vm_paddr_t src, void *dst, size_t len)
{
	vm_page_t m[PHYS_PAGE_COUNT(len)];
	struct iovec iov[1];
	struct uio uio;
	int i;

	iov[0].iov_base = dst;
	iov[0].iov_len = len;
	uio.uio_iov = iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = len;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	for (i = 0; i < PHYS_PAGE_COUNT(len); i++, src += PAGE_SIZE)
		m[i] = PHYS_TO_VM_PAGE(src);
	return (uiomove_fromphys(m, src & PAGE_MASK, len, &uio));
}

#undef PHYS_PAGE_COUNT

int
uiomove(void *cp, int n, struct uio *uio)
{

	return (uiomove_faultflag(cp, n, uio, 0));
}
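/*
 * Illustrative sketch (not part of the original file): a typical consumer of
 * uiomove() is a read routine that hands out a slice of a kernel buffer and
 * lets the uio bookkeeping (uio_offset, uio_resid, the iovec cursor) describe
 * what the caller wants.  The "example_msg" buffer and the handler name below
 * are assumptions made for the sake of the example, so the code is kept under
 * "#if 0".
 */
#if 0
static char example_msg[] = "hello from the kernel\n";

static int
example_read(struct uio *uio)
{
	size_t off, todo;

	if (uio->uio_offset < 0)
		return (EINVAL);
	off = uio->uio_offset;
	if (off >= sizeof(example_msg))
		return (0);		/* EOF */
	todo = MIN((size_t)uio->uio_resid, sizeof(example_msg) - off);
	/* uiomove() advances uio_offset/uio_resid and the iovec cursor. */
	return (uiomove(example_msg + off, todo, uio));
}
#endif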
int
uiomove_nofault(void *cp, int n, struct uio *uio)
{

	return (uiomove_faultflag(cp, n, uio, 1));
}

static int
uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
{
	struct thread *td;
	struct iovec *iov;
	size_t cnt;
	int error, newflags, save;

	td = curthread;
	error = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
	    ("uiomove proc"));
	if (!nofault)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
		    "Calling uiomove()");

	/* XXX does it make sense to set TDP_DEADLKTREAT for UIO_SYSSPACE? */
	newflags = TDP_DEADLKTREAT;
	if (uio->uio_segflg == UIO_USERSPACE && nofault) {
		/*
		 * Fail if a non-spurious page fault occurs.
		 */
		newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
	}
	save = curthread_pflags_set(newflags);

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			maybe_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	curthread_pflags_restore(save);
	return (error);
}

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
 * is almost definitely a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
	size_t offset, n;

	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset)
		return (EINVAL);
	if (buflen <= 0 || offset >= buflen)
		return (0);
	if ((n = buflen - offset) > IOSIZE_MAX)
		return (EINVAL);
	return (uiomove((char *)buf + offset, n, uio));
}
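/*
 * Illustrative sketch (not part of the original file): uiomove_frombuf() is
 * convenient when a handler exposes one bounded kernel object and wants the
 * caller's uio_offset/uio_resid to select the slice, with all of the bounds
 * and overflow checking centralized here.  The structure and the handler
 * below are assumptions for the example, so they are kept under "#if 0".
 */
#if 0
struct example_stats {
	uint64_t	calls;
	uint64_t	errors;
};

static struct example_stats example_stats;

static int
example_stats_read(struct uio *uio)
{

	/*
	 * Offset validation and clamping against sizeof(example_stats)
	 * happen inside uiomove_frombuf().
	 */
	return (uiomove_frombuf(&example_stats, sizeof(example_stats), uio));
}
#endif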
#ifdef SOCKET_RECV_PFLIP
/*
 * Experimental support for zero-copy I/O
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)
		 && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
			    (vm_offset_t)cp, (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

int
uiomoveco(void *cp, int n, struct uio *uio, int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			maybe_yield();
			error = userspaceco(cp, cnt, uio, disposable);
			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* SOCKET_RECV_PFLIP */

/*
 * Give next character to user as result of read.
 */
int
ureadc(int c, struct uio *uio)
{
	struct iovec *iov;
	char *iov_base;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling ureadc()");

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		iov_base = iov->iov_base;
		*iov_base = c;
		break;

	case UIO_NOCOPY:
		break;
	}
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}

int
copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
    int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyin(src, dst, len);
		break;
	case UIO_SYSSPACE:
		bcopy(src, dst, len);
		break;
	default:
		panic("copyinfrom: bad seg %d\n", seg);
	}
	return (error);
}

int
copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
    size_t * __restrict copied, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyinstr(src, dst, len, copied);
		break;
	case UIO_SYSSPACE:
		error = copystr(src, dst, len, copied);
		break;
	default:
		panic("copyinstrfrom: bad seg %d\n", seg);
	}
	return (error);
}

int
copyiniov(const struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
{
	u_int iovlen;

	*iov = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (error);
	iovlen = iovcnt * sizeof (struct iovec);
	*iov = malloc(iovlen, M_IOV, M_WAITOK);
	error = copyin(iovp, *iov, iovlen);
	if (error) {
		free(*iov, M_IOV);
		*iov = NULL;
	}
	return (error);
}
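/*
 * Illustrative sketch (not part of the original file): copyiniov() is the
 * usual way to pull a user-supplied iovec array into the kernel before
 * walking it.  The hypothetical helper below passes EINVAL as the error to
 * return for an oversized vector and frees the array with free(..., M_IOV)
 * when it is done; it is kept under "#if 0" because the helper itself is an
 * assumption.
 */
#if 0
static int
example_count_bytes(const struct iovec *uiovp, u_int iovcnt, size_t *lenp)
{
	struct iovec *iov;
	size_t len;
	u_int i;
	int error;

	error = copyiniov(uiovp, iovcnt, &iov, EINVAL);
	if (error != 0)
		return (error);
	len = 0;
	for (i = 0; i < iovcnt; i++)
		len += iov[i].iov_len;
	*lenp = len;
	free(iov, M_IOV);
	return (0);
}
#endif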
int
copyinuio(const struct iovec *iovp, u_int iovcnt, struct uio **uiop)
{
	struct iovec *iov;
	struct uio *uio;
	u_int iovlen;
	int error, i;

	*uiop = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (EINVAL);
	iovlen = iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	iov = (struct iovec *)(uio + 1);
	error = copyin(iovp, iov, iovlen);
	if (error) {
		free(uio, M_IOV);
		return (error);
	}
	uio->uio_iov = iov;
	uio->uio_iovcnt = iovcnt;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_offset = -1;
	uio->uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		if (iov->iov_len > IOSIZE_MAX - uio->uio_resid) {
			free(uio, M_IOV);
			return (EINVAL);
		}
		uio->uio_resid += iov->iov_len;
		iov++;
	}
	*uiop = uio;
	return (0);
}

struct uio *
cloneuio(struct uio *uiop)
{
	struct uio *uio;
	int iovlen;

	iovlen = uiop->uio_iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	*uio = *uiop;
	uio->uio_iov = (struct iovec *)(uio + 1);
	bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
	return (uio);
}

/*
 * Map some anonymous memory in user space of size sz, rounded up to the page
 * boundary.
 */
int
copyout_map(struct thread *td, vm_offset_t *addr, size_t sz)
{
	struct vmspace *vms;
	int error;
	vm_size_t size;

	vms = td->td_proc->p_vmspace;

	/*
	 * Map somewhere after heap in process memory.
	 */
	PROC_LOCK(td->td_proc);
	*addr = round_page((vm_offset_t)vms->vm_daddr +
	    lim_max(td->td_proc, RLIMIT_DATA));
	PROC_UNLOCK(td->td_proc);

	/* round size up to page boundary */
	size = (vm_size_t)round_page(sz);

	error = vm_mmap(&vms->vm_map, addr, size, PROT_READ | PROT_WRITE,
	    VM_PROT_ALL, MAP_PRIVATE | MAP_ANON, OBJT_DEFAULT, NULL, 0);

	return (error);
}

/*
 * Unmap memory in user space.
 */
int
copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz)
{
	vm_map_t map;
	vm_size_t size;

	if (sz == 0)
		return (0);

	map = &td->td_proc->p_vmspace->vm_map;
	size = (vm_size_t)round_page(sz);

	if (vm_map_remove(map, addr, addr + size) != KERN_SUCCESS)
		return (EINVAL);

	return (0);
}
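/*
 * Illustrative sketch (not part of the original file): copyout_map() and
 * copyout_unmap() are typically paired when the kernel needs to hand a block
 * of data to a process in freshly mapped anonymous memory.  The helper below
 * is an assumption made for the example, so it is kept under "#if 0".
 */
#if 0
static int
example_publish(struct thread *td, const void *data, size_t len,
    vm_offset_t *uaddrp)
{
	vm_offset_t uaddr;
	int error;

	/* Map enough anonymous user memory (rounded up to a page). */
	error = copyout_map(td, &uaddr, len);
	if (error != 0)
		return (error);
	/* Copy the payload out; undo the mapping if that fails. */
	error = copyout(data, (void *)uaddr, len);
	if (error != 0) {
		(void)copyout_unmap(td, uaddr, len);
		return (error);
	}
	*uaddrp = uaddr;
	return (0);
}
#endif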