/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
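
/*
 * This file implements the socket-layer glue for the Sockets Direct
 * Protocol (SDP): the protosw entry points for AF_INET_SDP sockets
 * (attach, bind, listen, connect, send, receive, shutdown, close, ...)
 * and the mapping of those operations onto the RDMA connection manager
 * and InfiniBand verbs (rdma_* / ib_*) used below.
 */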
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int sdp_count;

/*
 * Disable async. CMA events for sockets which are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk,
		    RDMA_PS_SDP, IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

static void
sdp_pcbfree(struct sdp_sock *ssk)
{

	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
	KASSERT((ssk->flags & SDP_DESTROY) == 0,
	    ("ssk %p already destroyed", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	sdp_destroy_cma(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof *sin, M_SONAME,
	    M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return (struct sockaddr *)sin;
}

static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return 0;
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return 0;
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close a SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		sorele(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer based shutdowns which can not operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree().  */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}
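
/*
 * A note on the timers below: keep2msl is initialized with
 * callout_init_rw() and CALLOUT_RETURNUNLOCKED, so each handler runs
 * with the pcb write lock held and is responsible for releasing it.
 * The checks follow the usual callout(9) idiom: callout_active() is
 * false once the callout has been stopped, callout_pending() is true
 * if it has since been rescheduled, and callout_deactivate() clears
 * the active flag before the handler does its real work.
 */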

/*
 * 2msl has expired, schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the 2msl.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return 0;
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	mbufq_init(&ssk->rxctlq, INT_MAX);
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate a SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return error;
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return 0;
}

/*
 * Initiate SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop a SDP socket, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return error;
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
	    ("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile, normal sends will come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	if (nam != NULL) {
		if (nam->sa_family != AF_INET) {
			if (control)
				m_freem(control);
			m_freem(m);
			return (EAFNOSUPPORT);
		}
		if (nam->sa_len != sizeof(struct sockaddr_in)) {
			if (control)
				m_freem(control);
			m_freem(m);
			return (EINVAL);
		}
	}

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(so, SO_SND);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * Set EOF on the last send if the user specified
			     * MSG_EOF.
			     */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	SOCK_IO_SEND_UNLOCK(so);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = pr->pr_rcvoob(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(so, SO_RCV);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);
	return (error);
}

/*
 * Abort is used to teardown a connection typically while sitting in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close a SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		ssk->flags |= SDP_SOCKREF;
		soref(so);
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify a sdp socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return sdp_drop(ssk, error);
}

static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		return;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}

void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the SDP socket (pcb) lock before performing
 * copyin on socket option arguments.  When it re-acquires the lock after
 * the copy, it has to revalidate that the connection is still valid for
 * the socket option.
 */
#define	SDP_WLOCK_RECHECK(inp) do {					\
	SDP_WLOCK(ssk);							\
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while(0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
}

static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device, 0);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

static void
sdp_dev_rem(struct ib_device *device, void *client_data)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT | CTLFLAG_MPSAFE,
    0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

static void
sdp_init(void *arg __unused)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}
SYSINIT(sdp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, sdp_init, NULL);
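
/*
 * The handlers above are exposed to the socket layer through the
 * protosw/domain definitions below.  As a usage sketch (userland code,
 * not part of this file), an application reaches this protocol by
 * opening a stream socket in the SDP domain, e.g.:
 *
 *	int s = socket(AF_INET_SDP, SOCK_STREAM, 0);
 *
 * after which the usual bind(2)/listen(2)/connect(2)/send(2)/recv(2)
 * calls are dispatched to the sdp_* functions defined in this file.
 */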

#define	SDP_PROTOSW							\
	.pr_type =		SOCK_STREAM,				\
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,\
	.pr_ctloutput =		sdp_ctloutput,				\
	.pr_abort =		sdp_abort,				\
	.pr_accept =		sdp_accept,				\
	.pr_attach =		sdp_attach,				\
	.pr_bind =		sdp_bind,				\
	.pr_connect =		sdp_connect,				\
	.pr_detach =		sdp_detach,				\
	.pr_disconnect =	sdp_disconnect,				\
	.pr_listen =		sdp_listen,				\
	.pr_peeraddr =		sdp_getpeeraddr,			\
	.pr_rcvoob =		sdp_rcvoob,				\
	.pr_send =		sdp_send,				\
	.pr_sosend =		sdp_sosend,				\
	.pr_soreceive =		sdp_sorecv,				\
	.pr_shutdown =		sdp_shutdown,				\
	.pr_sockaddr =		sdp_getsockaddr,			\
	.pr_close =		sdp_close

static struct protosw sdp_ip_protosw = {
	.pr_protocol =		IPPROTO_IP,
	SDP_PROTOSW
};
static struct protosw sdp_tcp_protosw = {
	.pr_protocol =		IPPROTO_TCP,
	SDP_PROTOSW
};

static struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_nprotosw =		2,
	.dom_protosw = {
		&sdp_ip_protosw,
		&sdp_tcp_protosw,
	},
};

DOMAIN_SET(sdp);

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;