/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef SOCKET_SEND_COW
#include <vm/vm_object.h>
#endif

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault);

#ifdef SOCKET_SEND_COW
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

/*
 * Identify the physical page mapped at the given kernel virtual
 * address.  Insert this physical page into the given address space at
 * the given virtual address, replacing the physical page, if any,
 * that already exists there.
 */
static int
vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex;
	vm_prot_t prot;
	boolean_t wired;

	KASSERT((uaddr & PAGE_MASK) == 0,
	    ("vm_pgmoveco: uaddr is not page aligned"));

	/*
	 * Herein the physical page is validated and dirtied.  It is
	 * unwired in sf_buf_mext().
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
	kern_pg->valid = VM_PAGE_BITS_ALL;
	KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1,
	    ("vm_pgmoveco: kern_pg is not correctly wired"));

	if ((vm_map_lookup(&map, uaddr,
			   VM_PROT_WRITE, &entry, &uobject,
			   &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return(EFAULT);
	}
	VM_OBJECT_WLOCK(uobject);
retry:
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		if (vm_page_sleep_if_busy(user_pg, TRUE, "vm_pgmoveco"))
			goto retry;
		vm_page_lock(user_pg);
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
		vm_page_unlock(user_pg);
	} else {
		/*
		 * Even if a physical page does not exist in the
		 * object chain's first object, a physical page from a
		 * backing object may be mapped read only.
		 */
		if (uobject->backing_object != NULL)
			pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
	}
	vm_page_insert(kern_pg, uobject, upindex);
	vm_page_dirty(kern_pg);
	VM_OBJECT_WUNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return(KERN_SUCCESS);
}
#endif /* SOCKET_SEND_COW */

int
copyin_nofault(const void *udaddr, void *kaddr, size_t len)
{
	int error, save;

	save = vm_fault_disable_pagefaults();
	error = copyin(udaddr, kaddr, len);
	vm_fault_enable_pagefaults(save);
	return (error);
}

int
copyout_nofault(const void *kaddr, void *udaddr, size_t len)
{
	int error, save;

	save = vm_fault_disable_pagefaults();
	error = copyout(kaddr, udaddr, len);
	vm_fault_enable_pagefaults(save);
	return (error);
}

#define	PHYS_PAGE_COUNT(len)	(howmany(len, PAGE_SIZE) + 1)

int
physcopyin(void *src, vm_paddr_t dst, size_t len)
{
	vm_page_t m[PHYS_PAGE_COUNT(len)];
	struct iovec iov[1];
	struct uio uio;
	int i;

	iov[0].iov_base = src;
	iov[0].iov_len = len;
	uio.uio_iov = iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = len;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	for (i = 0; i < PHYS_PAGE_COUNT(len); i++, dst += PAGE_SIZE)
		m[i] = PHYS_TO_VM_PAGE(dst);
	return (uiomove_fromphys(m, dst & PAGE_MASK, len, &uio));
}

int
physcopyout(vm_paddr_t src, void *dst, size_t len)
{
	vm_page_t m[PHYS_PAGE_COUNT(len)];
	struct iovec iov[1];
	struct uio uio;
	int i;

	iov[0].iov_base = dst;
	iov[0].iov_len = len;
	uio.uio_iov = iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = len;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	for (i = 0; i < PHYS_PAGE_COUNT(len); i++, src += PAGE_SIZE)
		m[i] = PHYS_TO_VM_PAGE(src);
	return (uiomove_fromphys(m, src & PAGE_MASK, len, &uio));
}

#undef PHYS_PAGE_COUNT

int
uiomove(void *cp, int n, struct uio *uio)
{

	return (uiomove_faultflag(cp, n, uio, 0));
}

int
uiomove_nofault(void *cp, int n, struct uio *uio)
{

	return (uiomove_faultflag(cp, n, uio, 1));
}
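
/*
 * A minimal usage sketch (not part of the original file): the typical
 * consumer of uiomove() is a character-device read routine that copies
 * out of a kernel buffer at the offset and residual count described by
 * the caller's uio.  The device method and buffer names below are
 * hypothetical and serve only as an illustration.
 *
 *	static char hello_buf[] = "hello, world\n";
 *
 *	static int
 *	hello_read(struct cdev *dev, struct uio *uio, int ioflag)
 *	{
 *		size_t amt;
 *
 *		if (uio->uio_offset >= sizeof(hello_buf) - 1)
 *			return (0);
 *		amt = MIN(uio->uio_resid,
 *		    sizeof(hello_buf) - 1 - uio->uio_offset);
 *		return (uiomove(hello_buf + uio->uio_offset, amt, uio));
 *	}
 */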

static int
uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
{
	struct thread *td;
	struct iovec *iov;
	size_t cnt;
	int error, newflags, save;

	td = curthread;
	error = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
	    ("uiomove proc"));
	if (!nofault)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
		    "Calling uiomove()");

	/* XXX does it make sense to set TDP_DEADLKTREAT for UIO_SYSSPACE? */
	newflags = TDP_DEADLKTREAT;
	if (uio->uio_segflg == UIO_USERSPACE && nofault) {
		/*
		 * Fail if a non-spurious page fault occurs.
		 */
		newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
	}
	save = curthread_pflags_set(newflags);

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			maybe_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	curthread_pflags_restore(save);
	return (error);
}

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
 * is almost definitely a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
	size_t offset, n;

	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset)
		return (EINVAL);
	if (buflen <= 0 || offset >= buflen)
		return (0);
	if ((n = buflen - offset) > IOSIZE_MAX)
		return (EINVAL);
	return (uiomove((char *)buf + offset, n, uio));
}

#ifdef SOCKET_RECV_PFLIP
/*
 * Experimental support for zero-copy I/O
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)
		 && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
			    (vm_offset_t)cp, (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

int
uiomoveco(void *cp, int n, struct uio *uio, int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			maybe_yield();
			error = userspaceco(cp, cnt, uio, disposable);
			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* SOCKET_RECV_PFLIP */

/*
 * Give next character to user as result of read.
 */
int
ureadc(int c, struct uio *uio)
{
	struct iovec *iov;
	char *iov_base;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling ureadc()");

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		iov_base = iov->iov_base;
		*iov_base = c;
		break;

	case UIO_NOCOPY:
		break;
	}
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}

int
copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
    int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyin(src, dst, len);
		break;
	case UIO_SYSSPACE:
		bcopy(src, dst, len);
		break;
	default:
		panic("copyinfrom: bad seg %d\n", seg);
	}
	return (error);
}

int
copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
    size_t * __restrict copied, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyinstr(src, dst, len, copied);
		break;
	case UIO_SYSSPACE:
		error = copystr(src, dst, len, copied);
		break;
	default:
		panic("copyinstrfrom: bad seg %d\n", seg);
	}
	return (error);
}

int
copyiniov(const struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
{
	u_int iovlen;

	*iov = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (error);
	iovlen = iovcnt * sizeof (struct iovec);
	*iov = malloc(iovlen, M_IOV, M_WAITOK);
	error = copyin(iovp, *iov, iovlen);
	if (error) {
		free(*iov, M_IOV);
		*iov = NULL;
	}
	return (error);
}

int
copyinuio(const struct iovec *iovp, u_int iovcnt, struct uio **uiop)
{
	struct iovec *iov;
	struct uio *uio;
	u_int iovlen;
	int error, i;

	*uiop = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (EINVAL);
	iovlen = iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	iov = (struct iovec *)(uio + 1);
	error = copyin(iovp, iov, iovlen);
	if (error) {
		free(uio, M_IOV);
		return (error);
	}
	uio->uio_iov = iov;
	uio->uio_iovcnt = iovcnt;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_offset = -1;
	uio->uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		if (iov->iov_len > IOSIZE_MAX - uio->uio_resid) {
			free(uio, M_IOV);
			return (EINVAL);
		}
		uio->uio_resid += iov->iov_len;
		iov++;
	}
	*uiop = uio;
	return (0);
}

struct uio *
cloneuio(struct uio *uiop)
{
	struct uio *uio;
	int iovlen;

	iovlen = uiop->uio_iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	*uio = *uiop;
	uio->uio_iov = (struct iovec *)(uio + 1);
	bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
	return (uio);
}

/*
 * Map some anonymous memory in user space of size sz, rounded up to the page
 * boundary.
 */
int
copyout_map(struct thread *td, vm_offset_t *addr, size_t sz)
{
	struct vmspace *vms;
	int error;
	vm_size_t size;

	vms = td->td_proc->p_vmspace;

	/*
	 * Map somewhere after heap in process memory.
	 */
	PROC_LOCK(td->td_proc);
	*addr = round_page((vm_offset_t)vms->vm_daddr +
	    lim_max(td->td_proc, RLIMIT_DATA));
	PROC_UNLOCK(td->td_proc);

	/* round size up to page boundary */
	size = (vm_size_t)round_page(sz);

	error = vm_mmap(&vms->vm_map, addr, size, PROT_READ | PROT_WRITE,
	    VM_PROT_ALL, MAP_PRIVATE | MAP_ANON, OBJT_DEFAULT, NULL, 0);

	return (error);
}

/*
 * Unmap memory in user space.
 */
int
copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz)
{
	vm_map_t map;
	vm_size_t size;

	if (sz == 0)
		return (0);

	map = &td->td_proc->p_vmspace->vm_map;
	size = (vm_size_t)round_page(sz);

	if (vm_map_remove(map, addr, addr + size) != KERN_SUCCESS)
		return (EINVAL);

	return (0);
}
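
/*
 * A minimal usage sketch (not part of the original file): copyout_map() and
 * copyout_unmap() are aimed at compatibility/emulation code that must place
 * data into the target process's address space.  The record type, variable
 * names, and error handling below are hypothetical.
 *
 *	struct some_record krec;
 *	vm_offset_t uva;
 *	int error;
 *
 *	error = copyout_map(td, &uva, sizeof(krec));
 *	if (error != 0)
 *		return (error);
 *	error = copyout(&krec, (void *)uva, sizeof(krec));
 *	if (error != 0)
 *		(void)copyout_unmap(td, uva, sizeof(krec));
 */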