/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef ZERO_COPY_SOCKETS
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#endif

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
    "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

/*
 * Identify the physical page mapped at the given kernel virtual
 * address.  Insert this physical page into the given address space at
 * the given virtual address, replacing the physical page, if any,
 * that already exists there.
 */
static int
vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
{
        vm_map_t map = mapa;
        vm_page_t kern_pg, user_pg;
        vm_object_t uobject;
        vm_map_entry_t entry;
        vm_pindex_t upindex;
        vm_prot_t prot;
        boolean_t wired;

        KASSERT((uaddr & PAGE_MASK) == 0,
            ("vm_pgmoveco: uaddr is not page aligned"));

        /*
         * Herein the physical page is validated and dirtied.  It is
         * unwired in sf_buf_mext().
         */
        kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
        kern_pg->valid = VM_PAGE_BITS_ALL;
        KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1,
            ("vm_pgmoveco: kern_pg is not correctly wired"));

        if ((vm_map_lookup(&map, uaddr,
            VM_PROT_WRITE, &entry, &uobject,
            &upindex, &prot, &wired)) != KERN_SUCCESS) {
                return(EFAULT);
        }
        VM_OBJECT_LOCK(uobject);
retry:
        if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
                if (vm_page_sleep_if_busy(user_pg, TRUE, "vm_pgmoveco"))
                        goto retry;
                vm_page_lock(user_pg);
                pmap_remove_all(user_pg);
                vm_page_free(user_pg);
                vm_page_unlock(user_pg);
        } else {
                /*
                 * Even if a physical page does not exist in the
                 * object chain's first object, a physical page from a
                 * backing object may be mapped read only.
                 */
                if (uobject->backing_object != NULL)
                        pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
        }
        vm_page_insert(kern_pg, uobject, upindex);
        vm_page_dirty(kern_pg);
        VM_OBJECT_UNLOCK(uobject);
        vm_map_lookup_done(map, entry);
        return(KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

int
uiomove(void *cp, int n, struct uio *uio)
{
        struct thread *td = curthread;
        struct iovec *iov;
        u_int cnt;
        int error = 0;
        int save = 0;

        KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
            ("uiomove: mode"));
        KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
            ("uiomove proc"));
        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
            "Calling uiomove()");

        save = td->td_pflags & TDP_DEADLKTREAT;
        td->td_pflags |= TDP_DEADLKTREAT;

        while (n > 0 && uio->uio_resid) {
                iov = uio->uio_iov;
                cnt = iov->iov_len;
                if (cnt == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }
                if (cnt > n)
                        cnt = n;

                switch (uio->uio_segflg) {

                case UIO_USERSPACE:
                        if (ticks - PCPU_GET(switchticks) >= hogticks)
                                uio_yield();
                        if (uio->uio_rw == UIO_READ)
                                error = copyout(cp, iov->iov_base, cnt);
                        else
                                error = copyin(iov->iov_base, cp, cnt);
                        if (error)
                                goto out;
                        break;

                case UIO_SYSSPACE:
                        if (uio->uio_rw == UIO_READ)
                                bcopy(cp, iov->iov_base, cnt);
                        else
                                bcopy(iov->iov_base, cp, cnt);
                        break;
                case UIO_NOCOPY:
                        break;
                }
                iov->iov_base = (char *)iov->iov_base + cnt;
                iov->iov_len -= cnt;
                uio->uio_resid -= cnt;
                uio->uio_offset += cnt;
                cp = (char *)cp + cnt;
                n -= cnt;
        }
out:
        if (save == 0)
                td->td_pflags &= ~TDP_DEADLKTREAT;
        return (error);
}

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
 * is almost definitely a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
        unsigned int offset, n;

        if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
            (offset = uio->uio_offset) != uio->uio_offset)
                return (EINVAL);
        if (buflen <= 0 || offset >= buflen)
                return (0);
        if ((n = buflen - offset) > INT_MAX)
                return (EINVAL);
        return (uiomove((char *)buf + offset, n, uio));
}

#ifdef ZERO_COPY_SOCKETS
/*
 * Experimental support for zero-copy I/O
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, int disposable)
{
        struct iovec *iov;
        int error;

        iov = uio->uio_iov;
        if (uio->uio_rw == UIO_READ) {
                if ((so_zero_copy_receive != 0)
                    && ((cnt & PAGE_MASK) == 0)
                    && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
                    && ((uio->uio_offset & PAGE_MASK) == 0)
                    && ((((intptr_t) cp) & PAGE_MASK) == 0)
                    && (disposable != 0)) {
                        /* SOCKET: use page-trading */
                        /*
                         * We only want to call vm_pgmoveco() on
                         * disposable pages, since it gives the
                         * kernel page to the userland process.
                         */
                        error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
                            (vm_offset_t)cp, (vm_offset_t)iov->iov_base);

                        /*
                         * If we get an error back, attempt
                         * to use copyout() instead.  The
                         * disposable page should be freed
                         * automatically if we weren't able to move
                         * it into userland.
                         */
                        if (error != 0)
                                error = copyout(cp, iov->iov_base, cnt);
                } else {
                        error = copyout(cp, iov->iov_base, cnt);
                }
        } else {
                error = copyin(iov->iov_base, cp, cnt);
        }
        return (error);
}

int
uiomoveco(void *cp, int n, struct uio *uio, int disposable)
{
        struct iovec *iov;
        u_int cnt;
        int error;

        KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
            ("uiomoveco: mode"));
        KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
            ("uiomoveco proc"));

        while (n > 0 && uio->uio_resid) {
                iov = uio->uio_iov;
                cnt = iov->iov_len;
                if (cnt == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }
                if (cnt > n)
                        cnt = n;

                switch (uio->uio_segflg) {

                case UIO_USERSPACE:
                        if (ticks - PCPU_GET(switchticks) >= hogticks)
                                uio_yield();

                        error = userspaceco(cp, cnt, uio, disposable);

                        if (error)
                                return (error);
                        break;

                case UIO_SYSSPACE:
                        if (uio->uio_rw == UIO_READ)
                                bcopy(cp, iov->iov_base, cnt);
                        else
                                bcopy(iov->iov_base, cp, cnt);
                        break;
                case UIO_NOCOPY:
                        break;
                }
                iov->iov_base = (char *)iov->iov_base + cnt;
                iov->iov_len -= cnt;
                uio->uio_resid -= cnt;
                uio->uio_offset += cnt;
                cp = (char *)cp + cnt;
                n -= cnt;
        }
        return (0);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Give next character to user as result of read.
 */
int
ureadc(int c, struct uio *uio)
{
        struct iovec *iov;
        char *iov_base;

        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
            "Calling ureadc()");

again:
        if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
                panic("ureadc");
        iov = uio->uio_iov;
        if (iov->iov_len == 0) {
                uio->uio_iovcnt--;
                uio->uio_iov++;
                goto again;
        }
        switch (uio->uio_segflg) {

        case UIO_USERSPACE:
                if (subyte(iov->iov_base, c) < 0)
                        return (EFAULT);
                break;

        case UIO_SYSSPACE:
                iov_base = iov->iov_base;
                *iov_base = c;
                iov->iov_base = iov_base;
                break;

        case UIO_NOCOPY:
                break;
        }
        iov->iov_base = (char *)iov->iov_base + 1;
        iov->iov_len--;
        uio->uio_resid--;
        uio->uio_offset++;
        return (0);
}

/*
 * Voluntarily surrender the CPU to other runnable threads, dropping
 * Giant across the context switch.
 */
void
uio_yield(void)
{
        struct thread *td;

        td = curthread;
        DROP_GIANT();
        thread_lock(td);
        sched_prio(td, td->td_user_pri);
        mi_switch(SW_INVOL | SWT_RELINQUISH, NULL);
        thread_unlock(td);
        PICKUP_GIANT();
}

int
copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
    int seg)
{
        int error = 0;

        switch (seg) {
        case UIO_USERSPACE:
                error = copyin(src, dst, len);
                break;
        case UIO_SYSSPACE:
                bcopy(src, dst, len);
                break;
        default:
                panic("copyinfrom: bad seg %d\n", seg);
        }
        return (error);
}

int
copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
    size_t * __restrict copied, int seg)
{
        int error = 0;

        switch (seg) {
        case UIO_USERSPACE:
                error = copyinstr(src, dst, len, copied);
                break;
        case UIO_SYSSPACE:
                error = copystr(src, dst, len, copied);
                break;
        default:
                panic("copyinstrfrom: bad seg %d\n", seg);
        }
        return (error);
}

/*
 * Copy an array of "iovcnt" iovec structures in from user space into a
 * freshly malloc'ed buffer; the caller-supplied "error" value is returned
 * unchanged if the count exceeds UIO_MAXIOV.
 */
int
copyiniov(struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
{
        u_int iovlen;

        *iov = NULL;
        if (iovcnt > UIO_MAXIOV)
                return (error);
        iovlen = iovcnt * sizeof (struct iovec);
        *iov = malloc(iovlen, M_IOV, M_WAITOK);
        error = copyin(iovp, *iov, iovlen);
        if (error) {
                free(*iov, M_IOV);
                *iov = NULL;
        }
        return (error);
}

/*
 * Copy an iovec array in from user space and wrap it in a struct uio;
 * the uio and its iovec array share a single allocation.  The total
 * transfer length is validated against INT_MAX.
 */
int
copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop)
{
        struct iovec *iov;
        struct uio *uio;
        u_int iovlen;
        int error, i;

        *uiop = NULL;
        if (iovcnt > UIO_MAXIOV)
                return (EINVAL);
        iovlen = iovcnt * sizeof (struct iovec);
        uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
        iov = (struct iovec *)(uio + 1);
        error = copyin(iovp, iov, iovlen);
        if (error) {
                free(uio, M_IOV);
                return (error);
        }
        uio->uio_iov = iov;
        uio->uio_iovcnt = iovcnt;
        uio->uio_segflg = UIO_USERSPACE;
        uio->uio_offset = -1;
        uio->uio_resid = 0;
        for (i = 0; i < iovcnt; i++) {
                if (iov->iov_len > INT_MAX - uio->uio_resid) {
                        free(uio, M_IOV);
                        return (EINVAL);
                }
                uio->uio_resid += iov->iov_len;
                iov++;
        }
        *uiop = uio;
        return (0);
}

/*
 * Duplicate "uiop", including its iovec array, into a single new
 * allocation.
 */
struct uio *
cloneuio(struct uio *uiop)
{
        struct uio *uio;
        int iovlen;

        iovlen = uiop->uio_iovcnt * sizeof (struct iovec);
        uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
        *uio = *uiop;
        uio->uio_iov = (struct iovec *)(uio + 1);
        bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
        return (uio);
}
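
/*
 * Illustrative usage (not part of this file's interfaces): a typical
 * consumer of uiomove() is a character-device read routine, which copies
 * data from a driver-owned kernel buffer into the user addresses described
 * by the uio.  uiomove_frombuf() folds the offset and length validation
 * into a single call.  The device, softc layout, and buffer fields below
 * are hypothetical; this is a sketch only.
 *
 *      static int
 *      foo_read(struct cdev *dev, struct uio *uio, int ioflag)
 *      {
 *              struct foo_softc *sc = dev->si_drv1;
 *
 *              return (uiomove_frombuf(sc->sc_buf, sc->sc_buflen, uio));
 *      }
 */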