1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved. 7 * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c 34 */ 35 36 /* 37 * 38 * Copyright (c) 2010 Isilon Systems, Inc. 
39 * Copyright (c) 2010 iX Systems, Inc. 40 * Copyright (c) 2010 Panasas, Inc. 41 * All rights reserved. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice unmodified, this list of conditions, and the following 48 * disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 54 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 55 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 56 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 57 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 58 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 59 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 60 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 61 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 62 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

/* UMA zone that sdp_sock structures are allocated from and freed to. */
uma_zone_t	sdp_zone;
/* Global lock; taken around every sdp_list / sdp_count update in this file. */
struct rwlock	sdp_lock;
/* All live sdp_sock structures, linked through their "list" member. */
LIST_HEAD(, sdp_sock) sdp_list;

/* NOTE(review): not referenced in this part of the file; users live elsewhere. */
struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
/* Convenience wrappers around sdp_lock. */
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

/* Number of entries on sdp_list; updated together with the list. */
static int sdp_count;

/*
 * Disable async. CMA events for sockets which are being torn down.
 *
 * Destroys the RDMA CM id attached to the pcb, if there is one, and clears
 * the pointer so that a second call is a no-op.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

/*
 * Bind the pcb to a local address through the RDMA CM.
 *
 * ssk:  pcb to bind; must be write-locked by the caller.
 * nam:  address to bind to, or NULL to bind to INADDR_ANY with port 0.
 * cred: not used by this function.
 *
 * The pcb lock is dropped around the RDMA CM calls and is held again on
 * every return path.  A CM id is created on demand.
 *
 * Returns EINVAL if a local port or address is already set, ENOMEM if no
 * CM id could be created, otherwise the negated result of rdma_bind_addr().
 * On success the local address and port are copied from the CM id's source
 * address; on bind failure the CM id is destroyed.
 */
static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk,
		    RDMA_PS_SDP, IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		/* No address supplied: build a wildcard sockaddr_in. */
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		/* Record what the CM actually bound us to. */
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

/*
 * Release a pcb and everything hanging off it.
 *
 * The caller must hold the pcb write lock and the pcb must already be
 * detached from its socket (ssk->socket == NULL).  The lock is released
 * here, the pcb is unlinked from sdp_list, and the structure is returned
 * to sdp_zone; the pointer is invalid once this function returns.
 */
static void
sdp_pcbfree(struct sdp_sock *ssk)
{

	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
	KASSERT((ssk->flags & SDP_DESTROY) == 0,
	    ("ssk %p already destroyed", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	/* Mark the pcb before the lock is dropped. */
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	sdp_destroy_cma(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
194 */ 195 static struct sockaddr * 196 sdp_sockaddr(in_port_t port, struct in_addr *addr_p) 197 { 198 struct sockaddr_in *sin; 199 200 sin = malloc(sizeof *sin, M_SONAME, 201 M_WAITOK | M_ZERO); 202 sin->sin_family = AF_INET; 203 sin->sin_len = sizeof(*sin); 204 sin->sin_addr = *addr_p; 205 sin->sin_port = port; 206 207 return (struct sockaddr *)sin; 208 } 209 210 static int 211 sdp_getsockaddr(struct socket *so, struct sockaddr **nam) 212 { 213 struct sdp_sock *ssk; 214 struct in_addr addr; 215 in_port_t port; 216 217 ssk = sdp_sk(so); 218 SDP_RLOCK(ssk); 219 port = ssk->lport; 220 addr.s_addr = ssk->laddr; 221 SDP_RUNLOCK(ssk); 222 223 *nam = sdp_sockaddr(port, &addr); 224 return 0; 225 } 226 227 static int 228 sdp_getpeeraddr(struct socket *so, struct sockaddr **nam) 229 { 230 struct sdp_sock *ssk; 231 struct in_addr addr; 232 in_port_t port; 233 234 ssk = sdp_sk(so); 235 SDP_RLOCK(ssk); 236 port = ssk->fport; 237 addr.s_addr = ssk->faddr; 238 SDP_RUNLOCK(ssk); 239 240 *nam = sdp_sockaddr(port, &addr); 241 return 0; 242 } 243 244 static void 245 sdp_pcbnotifyall(struct in_addr faddr, int errno, 246 struct sdp_sock *(*notify)(struct sdp_sock *, int)) 247 { 248 struct sdp_sock *ssk, *ssk_temp; 249 250 SDP_LIST_WLOCK(); 251 LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) { 252 SDP_WLOCK(ssk); 253 if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) { 254 SDP_WUNLOCK(ssk); 255 continue; 256 } 257 if ((ssk->flags & SDP_DESTROY) == 0) 258 if ((*notify)(ssk, errno)) 259 SDP_WUNLOCK(ssk); 260 } 261 SDP_LIST_WUNLOCK(); 262 } 263 264 #if 0 265 static void 266 sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg) 267 { 268 struct sdp_sock *ssk; 269 270 SDP_LIST_RLOCK(); 271 LIST_FOREACH(ssk, &sdp_list, list) { 272 SDP_WLOCK(ssk); 273 func(ssk, arg); 274 SDP_WUNLOCK(ssk); 275 } 276 SDP_LIST_RUNLOCK(); 277 } 278 #endif 279 280 static void 281 sdp_output_reset(struct sdp_sock *ssk) 282 { 283 struct rdma_cm_id *id; 284 285 SDP_WLOCK_ASSERT(ssk); 286 if 
(ssk->id) { 287 id = ssk->id; 288 ssk->qp_active = 0; 289 SDP_WUNLOCK(ssk); 290 rdma_disconnect(id); 291 SDP_WLOCK(ssk); 292 } 293 ssk->state = TCPS_CLOSED; 294 } 295 296 /* 297 * Attempt to close a SDP socket, marking it as dropped, and freeing 298 * the socket if we hold the only reference. 299 */ 300 static struct sdp_sock * 301 sdp_closed(struct sdp_sock *ssk) 302 { 303 struct socket *so; 304 305 SDP_WLOCK_ASSERT(ssk); 306 307 ssk->flags |= SDP_DROPPED; 308 so = ssk->socket; 309 soisdisconnected(so); 310 if (ssk->flags & SDP_SOCKREF) { 311 ssk->flags &= ~SDP_SOCKREF; 312 SDP_WUNLOCK(ssk); 313 sorele(so); 314 return (NULL); 315 } 316 return (ssk); 317 } 318 319 /* 320 * Perform timer based shutdowns which can not operate in 321 * callout context. 322 */ 323 static void 324 sdp_shutdown_task(void *data, int pending) 325 { 326 struct sdp_sock *ssk; 327 328 ssk = data; 329 SDP_WLOCK(ssk); 330 /* 331 * I don't think this can race with another call to pcbfree() 332 * because SDP_TIMEWAIT protects it. SDP_DESTROY may be redundant. 333 */ 334 if (ssk->flags & SDP_DESTROY) 335 panic("sdp_shutdown_task: Racing with pcbfree for ssk %p", 336 ssk); 337 if (ssk->flags & SDP_DISCON) 338 sdp_output_reset(ssk); 339 /* We have to clear this so sdp_detach() will call pcbfree(). */ 340 ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT); 341 if ((ssk->flags & SDP_DROPPED) == 0 && 342 sdp_closed(ssk) == NULL) 343 return; 344 if (ssk->socket == NULL) { 345 sdp_pcbfree(ssk); 346 return; 347 } 348 SDP_WUNLOCK(ssk); 349 } 350 351 /* 352 * 2msl has expired, schedule the shutdown task. 353 */ 354 static void 355 sdp_2msl_timeout(void *data) 356 { 357 struct sdp_sock *ssk; 358 359 ssk = data; 360 /* Callout canceled. */ 361 if (!callout_active(&ssk->keep2msl)) 362 goto out; 363 callout_deactivate(&ssk->keep2msl); 364 /* Should be impossible, defensive programming. 
*/ 365 if ((ssk->flags & SDP_TIMEWAIT) == 0) 366 goto out; 367 taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task); 368 out: 369 SDP_WUNLOCK(ssk); 370 return; 371 } 372 373 /* 374 * Schedule the 2msl wait timer. 375 */ 376 static void 377 sdp_2msl_wait(struct sdp_sock *ssk) 378 { 379 380 SDP_WLOCK_ASSERT(ssk); 381 ssk->flags |= SDP_TIMEWAIT; 382 ssk->state = TCPS_TIME_WAIT; 383 soisdisconnected(ssk->socket); 384 callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk); 385 } 386 387 /* 388 * Timed out waiting for the final fin/ack from rdma_disconnect(). 389 */ 390 static void 391 sdp_dreq_timeout(void *data) 392 { 393 struct sdp_sock *ssk; 394 395 ssk = data; 396 /* Callout canceled. */ 397 if (!callout_active(&ssk->keep2msl)) 398 goto out; 399 /* Callout rescheduled, probably as a different timer. */ 400 if (callout_pending(&ssk->keep2msl)) 401 goto out; 402 callout_deactivate(&ssk->keep2msl); 403 if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK) 404 goto out; 405 if ((ssk->flags & SDP_DREQWAIT) == 0) 406 goto out; 407 ssk->flags &= ~SDP_DREQWAIT; 408 ssk->flags |= SDP_DISCON; 409 sdp_2msl_wait(ssk); 410 ssk->qp_active = 0; 411 out: 412 SDP_WUNLOCK(ssk); 413 } 414 415 /* 416 * Received the final fin/ack. Cancel the 2msl. 
 */
/*
 * Clears SDP_DREQWAIT and moves the pcb into the 2msl wait, which re-arms
 * the shared keep2msl callout with the 2msl handler.  sdp_2msl_wait()
 * asserts the pcb write lock, so the caller must hold it.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

/*
 * Initialise the timer, task and ring state of the pcb attached to sk.
 *
 * The pcb is looked up through sdp_sk(), so so_pcb must already point at
 * it, and the pcb lock must already be initialised because the callout is
 * bound to it.  Always returns 0.
 */
static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	/*
	 * The callout runs with the pcb lock held and its handlers are
	 * responsible for releasing it (CALLOUT_RETURNUNLOCKED).
	 */
	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return 0;
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 *
 * proto and td are not used.  Buffer space is reserved only when either
 * high-water mark is still zero.  Returns the soreserve() error, ENOBUFS
 * when the pcb cannot be allocated without sleeping, or 0 once the pcb is
 * initialised and linked onto sdp_list.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	/* Lock and so_pcb must be in place before sdp_init_sock() runs. */
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	mbufq_init(&ssk->rxctlq, INT_MAX);
	/* Publish the pcb on the global list last. */
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
488 */ 489 static void 490 sdp_detach(struct socket *so) 491 { 492 struct sdp_sock *ssk; 493 494 ssk = sdp_sk(so); 495 SDP_WLOCK(ssk); 496 KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL")); 497 ssk->socket->so_pcb = NULL; 498 ssk->socket = NULL; 499 if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT)) 500 SDP_WUNLOCK(ssk); 501 else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT) 502 sdp_pcbfree(ssk); 503 else 504 panic("sdp_detach: Unexpected state, ssk %p.\n", ssk); 505 } 506 507 /* 508 * Allocate a local address for the socket. 509 */ 510 static int 511 sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 512 { 513 int error = 0; 514 struct sdp_sock *ssk; 515 struct sockaddr_in *sin; 516 517 sin = (struct sockaddr_in *)nam; 518 if (sin->sin_family != AF_INET) 519 return (EAFNOSUPPORT); 520 if (nam->sa_len != sizeof(*sin)) 521 return (EINVAL); 522 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 523 return (EAFNOSUPPORT); 524 525 ssk = sdp_sk(so); 526 SDP_WLOCK(ssk); 527 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 528 error = EINVAL; 529 goto out; 530 } 531 error = sdp_pcbbind(ssk, nam, td->td_ucred); 532 out: 533 SDP_WUNLOCK(ssk); 534 535 return (error); 536 } 537 538 /* 539 * Prepare to accept connections. 
540 */ 541 static int 542 sdp_listen(struct socket *so, int backlog, struct thread *td) 543 { 544 int error = 0; 545 struct sdp_sock *ssk; 546 547 ssk = sdp_sk(so); 548 SDP_WLOCK(ssk); 549 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 550 error = EINVAL; 551 goto out; 552 } 553 if (error == 0 && ssk->lport == 0) 554 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 555 SOCK_LOCK(so); 556 if (error == 0) 557 error = solisten_proto_check(so); 558 if (error == 0) { 559 solisten_proto(so, backlog); 560 ssk->state = TCPS_LISTEN; 561 } 562 SOCK_UNLOCK(so); 563 564 out: 565 SDP_WUNLOCK(ssk); 566 if (error == 0) 567 error = -rdma_listen(ssk->id, backlog); 568 return (error); 569 } 570 571 /* 572 * Initiate a SDP connection to nam. 573 */ 574 static int 575 sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td) 576 { 577 struct sockaddr_in src; 578 struct socket *so; 579 int error; 580 581 so = ssk->socket; 582 583 SDP_WLOCK_ASSERT(ssk); 584 if (ssk->lport == 0) { 585 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 586 if (error) 587 return error; 588 } 589 src.sin_family = AF_INET; 590 src.sin_len = sizeof(src); 591 bzero(&src.sin_zero, sizeof(src.sin_zero)); 592 src.sin_port = ssk->lport; 593 src.sin_addr.s_addr = ssk->laddr; 594 soisconnecting(so); 595 SDP_WUNLOCK(ssk); 596 error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam, 597 SDP_RESOLVE_TIMEOUT); 598 SDP_WLOCK(ssk); 599 if (error == 0) 600 ssk->state = TCPS_SYN_SENT; 601 602 return 0; 603 } 604 605 /* 606 * Initiate SDP connection. 
607 */ 608 static int 609 sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 610 { 611 int error = 0; 612 struct sdp_sock *ssk; 613 struct sockaddr_in *sin; 614 615 sin = (struct sockaddr_in *)nam; 616 if (nam->sa_len != sizeof(*sin)) 617 return (EINVAL); 618 if (sin->sin_family != AF_INET) 619 return (EAFNOSUPPORT); 620 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 621 return (EAFNOSUPPORT); 622 if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0) 623 return (error); 624 ssk = sdp_sk(so); 625 SDP_WLOCK(ssk); 626 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) 627 error = EINVAL; 628 else 629 error = sdp_start_connect(ssk, nam, td); 630 SDP_WUNLOCK(ssk); 631 return (error); 632 } 633 634 /* 635 * Drop a SDP socket, reporting 636 * the specified error. If connection is synchronized, 637 * then send a RST to peer. 638 */ 639 static struct sdp_sock * 640 sdp_drop(struct sdp_sock *ssk, int errno) 641 { 642 struct socket *so; 643 644 SDP_WLOCK_ASSERT(ssk); 645 so = ssk->socket; 646 if (TCPS_HAVERCVDSYN(ssk->state)) 647 sdp_output_reset(ssk); 648 if (errno == ETIMEDOUT && ssk->softerror) 649 errno = ssk->softerror; 650 so->so_error = errno; 651 return (sdp_closed(ssk)); 652 } 653 654 /* 655 * User issued close, and wish to trail through shutdown states: 656 * if never received SYN, just forget it. If got a SYN from peer, 657 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 658 * If already got a FIN from peer, then almost done; go to LAST_ACK 659 * state. In all other cases, have already sent FIN to peer (e.g. 660 * after PRU_SHUTDOWN), and just have to play tedious game waiting 661 * for peer to send FIN or not respond to keep-alives, etc. 662 * We can let the user exit from the close as soon as the FIN is acked. 
 */
/*
 * Called with the pcb write lock held.  The lock is dropped and re-taken
 * only on the TCPS_LISTEN path, around the CM id teardown.  States not
 * listed in the switch are left unchanged by it.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		/* Not connected yet: remember that a FIN is owed. */
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

/*
 * Start an orderly disconnect: arm the keep2msl callout with the
 * disconnect-request timeout handler, flag the pcb as owing a FIN and
 * waiting for the disconnect reply, and kick the send path.
 *
 * Called with the pcb write lock held.
 */
static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 *
 * Called with the pcb write lock held.
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		/*
		 * NOTE(review): the message below names sdp_close(), but the
		 * function that was called is sdp_closed().
		 */
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		/* Remember whether data was discarded before flushing it. */
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			/* Discarded receive data forces a reset. */
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 *
 * Returns ECONNRESET when the pcb is in timewait or already dropped,
 * otherwise starts the disconnect and returns 0.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
/*
 * Returns ECONNABORTED when the socket is already disconnected or the pcb
 * is in timewait / dropped; otherwise stores a newly allocated sockaddr
 * holding the peer address in *nam and returns 0.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	/* Allocate only after the lock is dropped; sdp_sockaddr() may sleep. */
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return error;
}

/*
 * Mark the connection as being incapable of further output.
 *
 * Returns ECONNRESET when the pcb is in timewait or already dropped.
 * Otherwise the socket is marked unable to send, the state machine is
 * advanced and, unless that dropped the pcb, a disconnect is started.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Append the packet mb to the socket buffer sb, merging it into the last
 * queued record when possible.
 *
 * ssk: pcb, consulted for xmit_size_goal.
 * sb:  socket buffer; must be locked by the caller.
 * mb:  chain to queue; must carry a packet header.
 * cnt: count for mb as computed by the caller (sdp_send() passes the number
 *      of m_next links in the chain).
 */
static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
		("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	/* The walk moved n to the end of the chain; go back to its head. */
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile, normal sends will come from sdp_sosend().
921 */ 922 static int 923 sdp_send(struct socket *so, int flags, struct mbuf *m, 924 struct sockaddr *nam, struct mbuf *control, struct thread *td) 925 { 926 struct sdp_sock *ssk; 927 struct mbuf *n; 928 int error; 929 int cnt; 930 931 if (nam != NULL) { 932 if (nam->sa_family != AF_INET) { 933 if (control) 934 m_freem(control); 935 m_freem(m); 936 return (EAFNOSUPPORT); 937 } 938 if (nam->sa_len != sizeof(struct sockaddr_in)) { 939 if (control) 940 m_freem(control); 941 m_freem(m); 942 return (EINVAL); 943 } 944 } 945 946 error = 0; 947 ssk = sdp_sk(so); 948 KASSERT(m->m_flags & M_PKTHDR, 949 ("sdp_send: %p no packet header", m)); 950 M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK); 951 mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA; 952 for (n = m, cnt = 0; n->m_next; n = n->m_next) 953 cnt++; 954 if (cnt > SDP_MAX_SEND_SGES) { 955 n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES); 956 if (n == NULL) { 957 m_freem(m); 958 return (EMSGSIZE); 959 } 960 m = n; 961 for (cnt = 0; n->m_next; n = n->m_next) 962 cnt++; 963 } 964 SDP_WLOCK(ssk); 965 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 966 if (control) 967 m_freem(control); 968 if (m) 969 m_freem(m); 970 error = ECONNRESET; 971 goto out; 972 } 973 if (control) { 974 /* SDP doesn't support control messages. */ 975 if (control->m_len) { 976 m_freem(control); 977 if (m) 978 m_freem(m); 979 error = EINVAL; 980 goto out; 981 } 982 m_freem(control); /* empty control, just free it */ 983 } 984 if (!(flags & PRUS_OOB)) { 985 SOCKBUF_LOCK(&so->so_snd); 986 sdp_append(ssk, &so->so_snd, m, cnt); 987 SOCKBUF_UNLOCK(&so->so_snd); 988 if (nam && ssk->state < TCPS_SYN_SENT) { 989 /* 990 * Do implied connect if not yet connected. 991 */ 992 error = sdp_start_connect(ssk, nam, td); 993 if (error) 994 goto out; 995 } 996 if (flags & PRUS_EOF) { 997 /* 998 * Close the send side of the connection after 999 * the data is sent. 
1000 */ 1001 socantsendmore(so); 1002 sdp_usrclosed(ssk); 1003 if (!(ssk->flags & SDP_DROPPED)) 1004 sdp_output_disconnect(ssk); 1005 } else if (!(ssk->flags & SDP_DROPPED) && 1006 !(flags & PRUS_MORETOCOME)) 1007 sdp_post_sends(ssk, M_NOWAIT); 1008 SDP_WUNLOCK(ssk); 1009 return (0); 1010 } else { 1011 SOCKBUF_LOCK(&so->so_snd); 1012 if (sbspace(&so->so_snd) < -512) { 1013 SOCKBUF_UNLOCK(&so->so_snd); 1014 m_freem(m); 1015 error = ENOBUFS; 1016 goto out; 1017 } 1018 /* 1019 * According to RFC961 (Assigned Protocols), 1020 * the urgent pointer points to the last octet 1021 * of urgent data. We continue, however, 1022 * to consider it to indicate the first octet 1023 * of data past the urgent section. 1024 * Otherwise, snd_up should be one lower. 1025 */ 1026 m->m_flags |= M_URG | M_PUSH; 1027 sdp_append(ssk, &so->so_snd, m, cnt); 1028 SOCKBUF_UNLOCK(&so->so_snd); 1029 if (nam && ssk->state < TCPS_SYN_SENT) { 1030 /* 1031 * Do implied connect if not yet connected. 1032 */ 1033 error = sdp_start_connect(ssk, nam, td); 1034 if (error) 1035 goto out; 1036 } 1037 sdp_post_sends(ssk, M_NOWAIT); 1038 SDP_WUNLOCK(ssk); 1039 return (0); 1040 } 1041 out: 1042 SDP_WUNLOCK(ssk); 1043 return (error); 1044 } 1045 1046 /* 1047 * Send on a socket. If send must go all at once and message is larger than 1048 * send buffering, then hard error. Lock against other senders. If must go 1049 * all at once and not enough room now, then inform user that this would 1050 * block and do nothing. Otherwise, if nonblocking, send as much as 1051 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1052 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1053 * in mbuf chain must be small enough to send all at once. 1054 * 1055 * Returns nonzero on error, timeout or signal; callers must check for short 1056 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1057 * on return. 
 */
/*
 * A non-empty control chain is rejected with EINVAL; an empty one is simply
 * freed.  The send I/O lock is held for the whole transfer.  Each pass of
 * the outer loop waits for buffer space; the inner loop copies up to that
 * much user data into mbufs and hands each chain to sdp_send(), which
 * always consumes it.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* A caller-supplied chain has to go out in a single piece. */
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			/* Report a pending socket error once, then clear it. */
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		/* Out-of-band data gets some extra allowance. */
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			/* Wait for room, then re-run every check above. */
			error = sbwait(so, SO_SND);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				/* Charge what was just copied against space. */
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			/* sdp_send() consumed the chain, success or not. */
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	SOCK_IO_SEND_UNLOCK(so);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 *
 * The protocol's pru_rcvoob handler fills an mbuf, which is then copied out
 * through uio one mbuf at a time.  Returns the pru_rcvoob or uiomove()
 * error, or 0.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		/* m_free() returns the next mbuf in the chain. */
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	/* Release whatever was not copied out. */
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
1234 */ 1235 static int 1236 sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio, 1237 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1238 { 1239 int len = 0, error = 0, flags, oresid; 1240 struct sockbuf *sb; 1241 struct mbuf *m, *n = NULL; 1242 struct sdp_sock *ssk; 1243 1244 /* We only do stream sockets. */ 1245 if (so->so_type != SOCK_STREAM) 1246 return (EINVAL); 1247 if (psa != NULL) 1248 *psa = NULL; 1249 if (controlp != NULL) 1250 return (EINVAL); 1251 if (flagsp != NULL) 1252 flags = *flagsp &~ MSG_EOR; 1253 else 1254 flags = 0; 1255 if (flags & MSG_OOB) 1256 return (soreceive_rcvoob(so, uio, flags)); 1257 if (mp0 != NULL) 1258 *mp0 = NULL; 1259 1260 sb = &so->so_rcv; 1261 ssk = sdp_sk(so); 1262 1263 /* Prevent other readers from entering the socket. */ 1264 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 1265 if (error) 1266 return (error); 1267 SOCKBUF_LOCK(sb); 1268 1269 /* Easy one, no space to copyout anything. */ 1270 if (uio->uio_resid == 0) { 1271 error = EINVAL; 1272 goto out; 1273 } 1274 oresid = uio->uio_resid; 1275 1276 /* We will never ever get anything unless we are connected. */ 1277 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 1278 /* When disconnecting there may be still some data left. */ 1279 if (sbavail(sb)) 1280 goto deliver; 1281 if (!(so->so_state & SS_ISDISCONNECTED)) 1282 error = ENOTCONN; 1283 goto out; 1284 } 1285 1286 /* Socket buffer is empty and we shall not block. */ 1287 if (sbavail(sb) == 0 && 1288 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 1289 error = EAGAIN; 1290 goto out; 1291 } 1292 1293 restart: 1294 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1295 1296 /* Abort if socket has reported problems. */ 1297 if (so->so_error) { 1298 if (sbavail(sb)) 1299 goto deliver; 1300 if (oresid > uio->uio_resid) 1301 goto out; 1302 error = so->so_error; 1303 if (!(flags & MSG_PEEK)) 1304 so->so_error = 0; 1305 goto out; 1306 } 1307 1308 /* Door is closed. 
Deliver what is left, if any. */ 1309 if (sb->sb_state & SBS_CANTRCVMORE) { 1310 if (sbavail(sb)) 1311 goto deliver; 1312 else 1313 goto out; 1314 } 1315 1316 /* Socket buffer got some data that we shall deliver now. */ 1317 if (sbavail(sb) && !(flags & MSG_WAITALL) && 1318 ((so->so_state & SS_NBIO) || 1319 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 1320 sbavail(sb) >= sb->sb_lowat || 1321 sbavail(sb) >= uio->uio_resid || 1322 sbavail(sb) >= sb->sb_hiwat) ) { 1323 goto deliver; 1324 } 1325 1326 /* On MSG_WAITALL we must wait until all data or error arrives. */ 1327 if ((flags & MSG_WAITALL) && 1328 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat)) 1329 goto deliver; 1330 1331 /* 1332 * Wait and block until (more) data comes in. 1333 * NB: Drops the sockbuf lock during wait. 1334 */ 1335 error = sbwait(so, SO_RCV); 1336 if (error) 1337 goto out; 1338 goto restart; 1339 1340 deliver: 1341 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1342 KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__)); 1343 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 1344 1345 /* Statistics. */ 1346 if (uio->uio_td) 1347 uio->uio_td->td_ru.ru_msgrcv++; 1348 1349 /* Fill uio until full or current end of socket buffer is reached. */ 1350 len = min(uio->uio_resid, sbavail(sb)); 1351 if (mp0 != NULL) { 1352 /* Dequeue as many mbufs as possible. */ 1353 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 1354 for (*mp0 = m = sb->sb_mb; 1355 m != NULL && m->m_len <= len; 1356 m = m->m_next) { 1357 len -= m->m_len; 1358 uio->uio_resid -= m->m_len; 1359 sbfree(sb, m); 1360 n = m; 1361 } 1362 sb->sb_mb = m; 1363 if (sb->sb_mb == NULL) 1364 SB_EMPTY_FIXUP(sb); 1365 n->m_next = NULL; 1366 } 1367 /* Copy the remainder. */ 1368 if (len > 0) { 1369 KASSERT(sb->sb_mb != NULL, 1370 ("%s: len > 0 && sb->sb_mb empty", __func__)); 1371 1372 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 1373 if (m == NULL) 1374 len = 0; /* Don't flush data from sockbuf. 
*/ 1375 else 1376 uio->uio_resid -= m->m_len; 1377 if (*mp0 != NULL) 1378 n->m_next = m; 1379 else 1380 *mp0 = m; 1381 if (*mp0 == NULL) { 1382 error = ENOBUFS; 1383 goto out; 1384 } 1385 } 1386 } else { 1387 /* NB: Must unlock socket buffer as uiomove may sleep. */ 1388 SOCKBUF_UNLOCK(sb); 1389 error = m_mbuftouio(uio, sb->sb_mb, len); 1390 SOCKBUF_LOCK(sb); 1391 if (error) 1392 goto out; 1393 } 1394 SBLASTRECORDCHK(sb); 1395 SBLASTMBUFCHK(sb); 1396 1397 /* 1398 * Remove the delivered data from the socket buffer unless we 1399 * were only peeking. 1400 */ 1401 if (!(flags & MSG_PEEK)) { 1402 if (len > 0) 1403 sbdrop_locked(sb, len); 1404 1405 /* Notify protocol that we drained some data. */ 1406 SOCKBUF_UNLOCK(sb); 1407 SDP_WLOCK(ssk); 1408 sdp_do_posts(ssk); 1409 SDP_WUNLOCK(ssk); 1410 SOCKBUF_LOCK(sb); 1411 } 1412 1413 /* 1414 * For MSG_WAITALL we may have to loop again and wait for 1415 * more data to come in. 1416 */ 1417 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 1418 goto restart; 1419 out: 1420 SBLASTRECORDCHK(sb); 1421 SBLASTMBUFCHK(sb); 1422 SOCKBUF_UNLOCK(sb); 1423 SOCK_IO_RECV_UNLOCK(so); 1424 return (error); 1425 } 1426 1427 /* 1428 * Abort is used to teardown a connection typically while sitting in 1429 * the accept queue. 1430 */ 1431 void 1432 sdp_abort(struct socket *so) 1433 { 1434 struct sdp_sock *ssk; 1435 1436 ssk = sdp_sk(so); 1437 SDP_WLOCK(ssk); 1438 /* 1439 * If we have not yet dropped, do it now. 1440 */ 1441 if (!(ssk->flags & SDP_TIMEWAIT) && 1442 !(ssk->flags & SDP_DROPPED)) 1443 sdp_drop(ssk, ECONNABORTED); 1444 KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X", 1445 ssk, ssk->flags)); 1446 SDP_WUNLOCK(ssk); 1447 } 1448 1449 /* 1450 * Close a SDP socket and initiate a friendly disconnect. 1451 */ 1452 static void 1453 sdp_close(struct socket *so) 1454 { 1455 struct sdp_sock *ssk; 1456 1457 ssk = sdp_sk(so); 1458 SDP_WLOCK(ssk); 1459 /* 1460 * If we have not yet dropped, do it now. 
1461 */ 1462 if (!(ssk->flags & SDP_TIMEWAIT) && 1463 !(ssk->flags & SDP_DROPPED)) 1464 sdp_start_disconnect(ssk); 1465 1466 /* 1467 * If we've still not dropped let the socket layer know we're 1468 * holding on to the socket and pcb for a while. 1469 */ 1470 if (!(ssk->flags & SDP_DROPPED)) { 1471 ssk->flags |= SDP_SOCKREF; 1472 soref(so); 1473 } 1474 SDP_WUNLOCK(ssk); 1475 } 1476 1477 /* 1478 * User requests out-of-band data. 1479 */ 1480 static int 1481 sdp_rcvoob(struct socket *so, struct mbuf *m, int flags) 1482 { 1483 int error = 0; 1484 struct sdp_sock *ssk; 1485 1486 ssk = sdp_sk(so); 1487 SDP_WLOCK(ssk); 1488 if (!rx_ring_trylock(&ssk->rx_ring)) { 1489 SDP_WUNLOCK(ssk); 1490 return (ECONNRESET); 1491 } 1492 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1493 error = ECONNRESET; 1494 goto out; 1495 } 1496 if ((so->so_oobmark == 0 && 1497 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || 1498 so->so_options & SO_OOBINLINE || 1499 ssk->oobflags & SDP_HADOOB) { 1500 error = EINVAL; 1501 goto out; 1502 } 1503 if ((ssk->oobflags & SDP_HAVEOOB) == 0) { 1504 error = EWOULDBLOCK; 1505 goto out; 1506 } 1507 m->m_len = 1; 1508 *mtod(m, caddr_t) = ssk->iobc; 1509 if ((flags & MSG_PEEK) == 0) 1510 ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB); 1511 out: 1512 rx_ring_unlock(&ssk->rx_ring); 1513 SDP_WUNLOCK(ssk); 1514 return (error); 1515 } 1516 1517 void 1518 sdp_urg(struct sdp_sock *ssk, struct mbuf *mb) 1519 { 1520 struct mbuf *m; 1521 struct socket *so; 1522 1523 so = ssk->socket; 1524 if (so == NULL) 1525 return; 1526 1527 so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1; 1528 sohasoutofband(so); 1529 ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB); 1530 if (!(so->so_options & SO_OOBINLINE)) { 1531 for (m = mb; m->m_next != NULL; m = m->m_next); 1532 ssk->iobc = *(mtod(m, char *) + m->m_len - 1); 1533 ssk->oobflags |= SDP_HAVEOOB; 1534 m->m_len--; 1535 mb->m_pkthdr.len--; 1536 } 1537 } 1538 1539 /* 1540 * Notify a sdp socket of an asynchronous error. 
1541 * 1542 * Do not wake up user since there currently is no mechanism for 1543 * reporting soft errors (yet - a kqueue filter may be added). 1544 */ 1545 struct sdp_sock * 1546 sdp_notify(struct sdp_sock *ssk, int error) 1547 { 1548 1549 SDP_WLOCK_ASSERT(ssk); 1550 1551 if ((ssk->flags & SDP_TIMEWAIT) || 1552 (ssk->flags & SDP_DROPPED)) 1553 return (ssk); 1554 1555 /* 1556 * Ignore some errors if we are hooked up. 1557 */ 1558 if (ssk->state == TCPS_ESTABLISHED && 1559 (error == EHOSTUNREACH || error == ENETUNREACH || 1560 error == EHOSTDOWN)) 1561 return (ssk); 1562 ssk->softerror = error; 1563 return sdp_drop(ssk, error); 1564 } 1565 1566 static void 1567 sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip) 1568 { 1569 struct in_addr faddr; 1570 1571 faddr = ((struct sockaddr_in *)sa)->sin_addr; 1572 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1573 return; 1574 1575 sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify); 1576 } 1577 1578 static void 1579 sdp_keepalive_timeout(void *data) 1580 { 1581 struct sdp_sock *ssk; 1582 1583 ssk = data; 1584 /* Callout canceled. */ 1585 if (!callout_active(&ssk->keep2msl)) 1586 return; 1587 /* Callout rescheduled as a different kind of timer. 
*/ 1588 if (callout_pending(&ssk->keep2msl)) 1589 goto out; 1590 callout_deactivate(&ssk->keep2msl); 1591 if (ssk->flags & SDP_DROPPED || 1592 (ssk->socket->so_options & SO_KEEPALIVE) == 0) 1593 goto out; 1594 sdp_post_keepalive(ssk); 1595 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, 1596 sdp_keepalive_timeout, ssk); 1597 out: 1598 SDP_WUNLOCK(ssk); 1599 } 1600 1601 1602 void 1603 sdp_start_keepalive_timer(struct socket *so) 1604 { 1605 struct sdp_sock *ssk; 1606 1607 ssk = sdp_sk(so); 1608 if (!callout_pending(&ssk->keep2msl)) 1609 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, 1610 sdp_keepalive_timeout, ssk); 1611 } 1612 1613 static void 1614 sdp_stop_keepalive_timer(struct socket *so) 1615 { 1616 struct sdp_sock *ssk; 1617 1618 ssk = sdp_sk(so); 1619 callout_stop(&ssk->keep2msl); 1620 } 1621 1622 /* 1623 * sdp_ctloutput() must drop the inpcb lock before performing copyin on 1624 * socket option arguments. When it re-acquires the lock after the copy, it 1625 * has to revalidate that the connection is still valid for the socket 1626 * option. 
1627 */ 1628 #define SDP_WLOCK_RECHECK(inp) do { \ 1629 SDP_WLOCK(ssk); \ 1630 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \ 1631 SDP_WUNLOCK(ssk); \ 1632 return (ECONNRESET); \ 1633 } \ 1634 } while(0) 1635 1636 static int 1637 sdp_ctloutput(struct socket *so, struct sockopt *sopt) 1638 { 1639 int error, opt, optval; 1640 struct sdp_sock *ssk; 1641 1642 error = 0; 1643 ssk = sdp_sk(so); 1644 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) { 1645 SDP_WLOCK(ssk); 1646 if (so->so_options & SO_KEEPALIVE) 1647 sdp_start_keepalive_timer(so); 1648 else 1649 sdp_stop_keepalive_timer(so); 1650 SDP_WUNLOCK(ssk); 1651 } 1652 if (sopt->sopt_level != IPPROTO_TCP) 1653 return (error); 1654 1655 SDP_WLOCK(ssk); 1656 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1657 SDP_WUNLOCK(ssk); 1658 return (ECONNRESET); 1659 } 1660 1661 switch (sopt->sopt_dir) { 1662 case SOPT_SET: 1663 switch (sopt->sopt_name) { 1664 case TCP_NODELAY: 1665 SDP_WUNLOCK(ssk); 1666 error = sooptcopyin(sopt, &optval, sizeof optval, 1667 sizeof optval); 1668 if (error) 1669 return (error); 1670 1671 SDP_WLOCK_RECHECK(ssk); 1672 opt = SDP_NODELAY; 1673 if (optval) 1674 ssk->flags |= opt; 1675 else 1676 ssk->flags &= ~opt; 1677 sdp_do_posts(ssk); 1678 SDP_WUNLOCK(ssk); 1679 break; 1680 1681 default: 1682 SDP_WUNLOCK(ssk); 1683 error = ENOPROTOOPT; 1684 break; 1685 } 1686 break; 1687 1688 case SOPT_GET: 1689 switch (sopt->sopt_name) { 1690 case TCP_NODELAY: 1691 optval = ssk->flags & SDP_NODELAY; 1692 SDP_WUNLOCK(ssk); 1693 error = sooptcopyout(sopt, &optval, sizeof optval); 1694 break; 1695 default: 1696 SDP_WUNLOCK(ssk); 1697 error = ENOPROTOOPT; 1698 break; 1699 } 1700 break; 1701 } 1702 return (error); 1703 } 1704 #undef SDP_WLOCK_RECHECK 1705 1706 int sdp_mod_count = 0; 1707 int sdp_mod_usec = 0; 1708 1709 void 1710 sdp_set_default_moderation(struct sdp_sock *ssk) 1711 { 1712 if (sdp_mod_count <= 0 || sdp_mod_usec <= 0) 1713 return; 1714 ib_modify_cq(ssk->rx_ring.cq, 
sdp_mod_count, sdp_mod_usec); 1715 } 1716 1717 static void 1718 sdp_dev_add(struct ib_device *device) 1719 { 1720 struct ib_fmr_pool_param param; 1721 struct sdp_device *sdp_dev; 1722 1723 sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO); 1724 sdp_dev->pd = ib_alloc_pd(device, 0); 1725 if (IS_ERR(sdp_dev->pd)) 1726 goto out_pd; 1727 memset(¶m, 0, sizeof param); 1728 param.max_pages_per_fmr = SDP_FMR_SIZE; 1729 param.page_shift = PAGE_SHIFT; 1730 param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ); 1731 param.pool_size = SDP_FMR_POOL_SIZE; 1732 param.dirty_watermark = SDP_FMR_DIRTY_SIZE; 1733 param.cache = 1; 1734 sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, ¶m); 1735 if (IS_ERR(sdp_dev->fmr_pool)) 1736 goto out_fmr; 1737 ib_set_client_data(device, &sdp_client, sdp_dev); 1738 return; 1739 1740 out_fmr: 1741 ib_dealloc_pd(sdp_dev->pd); 1742 out_pd: 1743 free(sdp_dev, M_SDP); 1744 } 1745 1746 static void 1747 sdp_dev_rem(struct ib_device *device, void *client_data) 1748 { 1749 struct sdp_device *sdp_dev; 1750 struct sdp_sock *ssk; 1751 1752 SDP_LIST_WLOCK(); 1753 LIST_FOREACH(ssk, &sdp_list, list) { 1754 if (ssk->ib_device != device) 1755 continue; 1756 SDP_WLOCK(ssk); 1757 if ((ssk->flags & SDP_DESTROY) == 0) 1758 ssk = sdp_notify(ssk, ECONNRESET); 1759 if (ssk) 1760 SDP_WUNLOCK(ssk); 1761 } 1762 SDP_LIST_WUNLOCK(); 1763 /* 1764 * XXX Do I need to wait between these two? 
1765 */ 1766 sdp_dev = ib_get_client_data(device, &sdp_client); 1767 if (!sdp_dev) 1768 return; 1769 ib_flush_fmr_pool(sdp_dev->fmr_pool); 1770 ib_destroy_fmr_pool(sdp_dev->fmr_pool); 1771 ib_dealloc_pd(sdp_dev->pd); 1772 free(sdp_dev, M_SDP); 1773 } 1774 1775 struct ib_client sdp_client = 1776 { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem }; 1777 1778 1779 static int 1780 sdp_pcblist(SYSCTL_HANDLER_ARGS) 1781 { 1782 int error, n, i; 1783 struct sdp_sock *ssk; 1784 struct xinpgen xig; 1785 1786 /* 1787 * The process of preparing the TCB list is too time-consuming and 1788 * resource-intensive to repeat twice on every request. 1789 */ 1790 if (req->oldptr == NULL) { 1791 n = sdp_count; 1792 n += imax(n / 8, 10); 1793 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); 1794 return (0); 1795 } 1796 1797 if (req->newptr != NULL) 1798 return (EPERM); 1799 1800 /* 1801 * OK, now we're committed to doing something. 1802 */ 1803 SDP_LIST_RLOCK(); 1804 n = sdp_count; 1805 SDP_LIST_RUNLOCK(); 1806 1807 error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) 1808 + n * sizeof(struct xtcpcb)); 1809 if (error != 0) 1810 return (error); 1811 1812 bzero(&xig, sizeof(xig)); 1813 xig.xig_len = sizeof xig; 1814 xig.xig_count = n; 1815 xig.xig_gen = 0; 1816 xig.xig_sogen = so_gencnt; 1817 error = SYSCTL_OUT(req, &xig, sizeof xig); 1818 if (error) 1819 return (error); 1820 1821 SDP_LIST_RLOCK(); 1822 for (ssk = LIST_FIRST(&sdp_list), i = 0; 1823 ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) { 1824 struct xtcpcb xt; 1825 1826 SDP_RLOCK(ssk); 1827 if (ssk->flags & SDP_TIMEWAIT) { 1828 if (ssk->cred != NULL) 1829 error = cr_cansee(req->td->td_ucred, 1830 ssk->cred); 1831 else 1832 error = EINVAL; /* Skip this inp. 
*/ 1833 } else if (ssk->socket) 1834 error = cr_canseesocket(req->td->td_ucred, 1835 ssk->socket); 1836 else 1837 error = EINVAL; 1838 if (error) { 1839 error = 0; 1840 goto next; 1841 } 1842 1843 bzero(&xt, sizeof(xt)); 1844 xt.xt_len = sizeof xt; 1845 xt.xt_inp.inp_gencnt = 0; 1846 xt.xt_inp.inp_vflag = INP_IPV4; 1847 memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr)); 1848 xt.xt_inp.inp_lport = ssk->lport; 1849 memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr)); 1850 xt.xt_inp.inp_fport = ssk->fport; 1851 xt.t_state = ssk->state; 1852 if (ssk->socket != NULL) 1853 sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket); 1854 xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; 1855 SDP_RUNLOCK(ssk); 1856 error = SYSCTL_OUT(req, &xt, sizeof xt); 1857 if (error) 1858 break; 1859 i++; 1860 continue; 1861 next: 1862 SDP_RUNLOCK(ssk); 1863 } 1864 if (!error) { 1865 /* 1866 * Give the user an updated idea of our state. 1867 * If the generation differs from what we told 1868 * her before, she knows that something happened 1869 * while we were processing this request, and it 1870 * might be necessary to retry. 
1871 */ 1872 xig.xig_gen = 0; 1873 xig.xig_sogen = so_gencnt; 1874 xig.xig_count = sdp_count; 1875 error = SYSCTL_OUT(req, &xig, sizeof xig); 1876 } 1877 SDP_LIST_RUNLOCK(); 1878 return (error); 1879 } 1880 1881 SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1882 "SDP"); 1883 1884 SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist, 1885 CTLFLAG_RD | CTLTYPE_STRUCT | CTLFLAG_MPSAFE, 1886 0, 0, sdp_pcblist, "S,xtcpcb", 1887 "List of active SDP connections"); 1888 1889 static void 1890 sdp_zone_change(void *tag) 1891 { 1892 1893 uma_zone_set_max(sdp_zone, maxsockets); 1894 } 1895 1896 static void 1897 sdp_init(void *arg __unused) 1898 { 1899 1900 LIST_INIT(&sdp_list); 1901 sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock), 1902 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1903 uma_zone_set_max(sdp_zone, maxsockets); 1904 EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL, 1905 EVENTHANDLER_PRI_ANY); 1906 rx_comp_wq = create_singlethread_workqueue("rx_comp_wq"); 1907 ib_register_client(&sdp_client); 1908 } 1909 SYSINIT(sdp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, sdp_init, NULL); 1910 1911 extern struct domain sdpdomain; 1912 1913 struct pr_usrreqs sdp_usrreqs = { 1914 .pru_abort = sdp_abort, 1915 .pru_accept = sdp_accept, 1916 .pru_attach = sdp_attach, 1917 .pru_bind = sdp_bind, 1918 .pru_connect = sdp_connect, 1919 .pru_detach = sdp_detach, 1920 .pru_disconnect = sdp_disconnect, 1921 .pru_listen = sdp_listen, 1922 .pru_peeraddr = sdp_getpeeraddr, 1923 .pru_rcvoob = sdp_rcvoob, 1924 .pru_send = sdp_send, 1925 .pru_sosend = sdp_sosend, 1926 .pru_soreceive = sdp_sorecv, 1927 .pru_shutdown = sdp_shutdown, 1928 .pru_sockaddr = sdp_getsockaddr, 1929 .pru_close = sdp_close, 1930 }; 1931 1932 struct protosw sdpsw[] = { 1933 { 1934 .pr_type = SOCK_STREAM, 1935 .pr_domain = &sdpdomain, 1936 .pr_protocol = IPPROTO_IP, 1937 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, 1938 .pr_ctlinput = sdp_ctlinput, 1939 
.pr_ctloutput = sdp_ctloutput, 1940 .pr_usrreqs = &sdp_usrreqs 1941 }, 1942 { 1943 .pr_type = SOCK_STREAM, 1944 .pr_domain = &sdpdomain, 1945 .pr_protocol = IPPROTO_TCP, 1946 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, 1947 .pr_ctlinput = sdp_ctlinput, 1948 .pr_ctloutput = sdp_ctloutput, 1949 .pr_usrreqs = &sdp_usrreqs 1950 }, 1951 }; 1952 1953 struct domain sdpdomain = { 1954 .dom_family = AF_INET_SDP, 1955 .dom_name = "SDP", 1956 .dom_protosw = sdpsw, 1957 .dom_protoswNPROTOSW = &sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])], 1958 }; 1959 1960 DOMAIN_SET(sdp); 1961 1962 int sdp_debug_level = 1; 1963 int sdp_data_debug_level = 0; 1964