1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved. 7 * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c 34 */ 35 36 /* 37 * 38 * Copyright (c) 2010 Isilon Systems, Inc. 39 * Copyright (c) 2010 iX Systems, Inc. 40 * Copyright (c) 2010 Panasas, Inc. 41 * All rights reserved. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice unmodified, this list of conditions, and the following 48 * disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 54 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 55 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 56 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 57 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 58 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 59 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 60 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 61 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 62 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
63 * 64 */ 65 66 #include <sys/param.h> 67 #include <sys/eventhandler.h> 68 #include <sys/kernel.h> 69 #include <sys/malloc.h> 70 71 #include "sdp.h" 72 73 #include <net/if.h> 74 #include <net/route.h> 75 #include <net/vnet.h> 76 #include <sys/sysctl.h> 77 78 uma_zone_t sdp_zone; 79 struct rwlock sdp_lock; 80 LIST_HEAD(, sdp_sock) sdp_list; 81 82 struct workqueue_struct *rx_comp_wq; 83 84 RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock"); 85 #define SDP_LIST_WLOCK() rw_wlock(&sdp_lock) 86 #define SDP_LIST_RLOCK() rw_rlock(&sdp_lock) 87 #define SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock) 88 #define SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock) 89 #define SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED) 90 #define SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED) 91 #define SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED) 92 93 MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol"); 94 95 static void sdp_stop_keepalive_timer(struct socket *so); 96 97 /* 98 * SDP protocol interface to socket abstraction. 99 */ 100 /* 101 * sdp_sendspace and sdp_recvspace are the default send and receive window 102 * sizes, respectively. 103 */ 104 u_long sdp_sendspace = 1024*32; 105 u_long sdp_recvspace = 1024*64; 106 107 static int sdp_count; 108 109 /* 110 * Disable async. CMA events for sockets which are being torn down. 111 */ 112 static void 113 sdp_destroy_cma(struct sdp_sock *ssk) 114 { 115 116 if (ssk->id == NULL) 117 return; 118 rdma_destroy_id(ssk->id); 119 ssk->id = NULL; 120 } 121 122 static int 123 sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred) 124 { 125 struct sockaddr_in *sin; 126 struct sockaddr_in null; 127 int error; 128 129 SDP_WLOCK_ASSERT(ssk); 130 131 if (ssk->lport != 0 || ssk->laddr != INADDR_ANY) 132 return (EINVAL); 133 /* rdma_bind_addr handles bind races. 
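 * A rough sketch of the flow below, as this file implements it: the pcb
 * lock is dropped around the CMA calls, the id is created lazily with
 * sdp_cma_handler as its event handler, and a NULL nam is rewritten as
 * INADDR_ANY/port 0 so rdma_bind_addr() picks the local address and port.
 * On success the chosen values are read back from id->route.addr.src_addr.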
*/ 134 SDP_WUNLOCK(ssk); 135 if (ssk->id == NULL) 136 ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC); 137 if (ssk->id == NULL) { 138 SDP_WLOCK(ssk); 139 return (ENOMEM); 140 } 141 if (nam == NULL) { 142 null.sin_family = AF_INET; 143 null.sin_len = sizeof(null); 144 null.sin_addr.s_addr = INADDR_ANY; 145 null.sin_port = 0; 146 bzero(&null.sin_zero, sizeof(null.sin_zero)); 147 nam = (struct sockaddr *)&null; 148 } 149 error = -rdma_bind_addr(ssk->id, nam); 150 SDP_WLOCK(ssk); 151 if (error == 0) { 152 sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr; 153 ssk->laddr = sin->sin_addr.s_addr; 154 ssk->lport = sin->sin_port; 155 } else 156 sdp_destroy_cma(ssk); 157 return (error); 158 } 159 160 static void 161 sdp_pcbfree(struct sdp_sock *ssk) 162 { 163 164 KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk)); 165 KASSERT((ssk->flags & SDP_DESTROY) == 0, 166 ("ssk %p already destroyed", ssk)); 167 168 sdp_dbg(ssk->socket, "Freeing pcb"); 169 SDP_WLOCK_ASSERT(ssk); 170 ssk->flags |= SDP_DESTROY; 171 SDP_WUNLOCK(ssk); 172 SDP_LIST_WLOCK(); 173 sdp_count--; 174 LIST_REMOVE(ssk, list); 175 SDP_LIST_WUNLOCK(); 176 crfree(ssk->cred); 177 ssk->qp_active = 0; 178 if (ssk->qp) { 179 ib_destroy_qp(ssk->qp); 180 ssk->qp = NULL; 181 } 182 sdp_tx_ring_destroy(ssk); 183 sdp_rx_ring_destroy(ssk); 184 sdp_destroy_cma(ssk); 185 rw_destroy(&ssk->rx_ring.destroyed_lock); 186 rw_destroy(&ssk->lock); 187 uma_zfree(sdp_zone, ssk); 188 } 189 190 static int 191 sdp_getsockaddr(struct socket *so, struct sockaddr *sa) 192 { 193 struct sdp_sock *ssk = sdp_sk(so); 194 195 SDP_RLOCK(ssk); 196 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 197 .sin_family = AF_INET, 198 .sin_len = sizeof(struct sockaddr_in), 199 .sin_addr.s_addr = ssk->laddr, 200 .sin_port = ssk->lport, 201 }; 202 SDP_RUNLOCK(ssk); 203 204 return (0); 205 } 206 207 static int 208 sdp_getpeeraddr(struct socket *so, struct sockaddr *sa) 209 { 210 struct sdp_sock *ssk = sdp_sk(so); 211 212 SDP_RLOCK(ssk); 213 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 214 .sin_family = AF_INET, 215 .sin_len = sizeof(struct sockaddr_in), 216 .sin_addr.s_addr = ssk->faddr, 217 .sin_port = ssk->fport, 218 }; 219 SDP_RUNLOCK(ssk); 220 221 return (0); 222 } 223 224 #if 0 225 static void 226 sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg) 227 { 228 struct sdp_sock *ssk; 229 230 SDP_LIST_RLOCK(); 231 LIST_FOREACH(ssk, &sdp_list, list) { 232 SDP_WLOCK(ssk); 233 func(ssk, arg); 234 SDP_WUNLOCK(ssk); 235 } 236 SDP_LIST_RUNLOCK(); 237 } 238 #endif 239 240 static void 241 sdp_output_reset(struct sdp_sock *ssk) 242 { 243 struct rdma_cm_id *id; 244 245 SDP_WLOCK_ASSERT(ssk); 246 if (ssk->id) { 247 id = ssk->id; 248 ssk->qp_active = 0; 249 SDP_WUNLOCK(ssk); 250 rdma_disconnect(id); 251 SDP_WLOCK(ssk); 252 } 253 ssk->state = TCPS_CLOSED; 254 } 255 256 /* 257 * Attempt to close a SDP socket, marking it as dropped, and freeing 258 * the socket if we hold the only reference. 259 */ 260 static struct sdp_sock * 261 sdp_closed(struct sdp_sock *ssk) 262 { 263 struct socket *so; 264 265 SDP_WLOCK_ASSERT(ssk); 266 267 ssk->flags |= SDP_DROPPED; 268 so = ssk->socket; 269 soisdisconnected(so); 270 if (ssk->flags & SDP_SOCKREF) { 271 ssk->flags &= ~SDP_SOCKREF; 272 SDP_WUNLOCK(ssk); 273 sorele(so); 274 return (NULL); 275 } 276 return (ssk); 277 } 278 279 /* 280 * Perform timer based shutdowns which can not operate in 281 * callout context. 
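 * The task is enqueued on taskqueue_thread by sdp_2msl_timeout() when the
 * TIME_WAIT callout fires; running in thread context lets it call
 * rdma_disconnect() via sdp_output_reset() and free the pcb, neither of
 * which is safe from the callout itself.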
282 */ 283 static void 284 sdp_shutdown_task(void *data, int pending) 285 { 286 struct sdp_sock *ssk; 287 288 ssk = data; 289 SDP_WLOCK(ssk); 290 /* 291 * I don't think this can race with another call to pcbfree() 292 * because SDP_TIMEWAIT protects it. SDP_DESTROY may be redundant. 293 */ 294 if (ssk->flags & SDP_DESTROY) 295 panic("sdp_shutdown_task: Racing with pcbfree for ssk %p", 296 ssk); 297 if (ssk->flags & SDP_DISCON) 298 sdp_output_reset(ssk); 299 /* We have to clear this so sdp_detach() will call pcbfree(). */ 300 ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT); 301 if ((ssk->flags & SDP_DROPPED) == 0 && 302 sdp_closed(ssk) == NULL) 303 return; 304 if (ssk->socket == NULL) { 305 sdp_pcbfree(ssk); 306 return; 307 } 308 SDP_WUNLOCK(ssk); 309 } 310 311 /* 312 * 2msl has expired, schedule the shutdown task. 313 */ 314 static void 315 sdp_2msl_timeout(void *data) 316 { 317 struct sdp_sock *ssk; 318 319 ssk = data; 320 /* Callout canceled. */ 321 if (!callout_active(&ssk->keep2msl)) 322 goto out; 323 callout_deactivate(&ssk->keep2msl); 324 /* Should be impossible, defensive programming. */ 325 if ((ssk->flags & SDP_TIMEWAIT) == 0) 326 goto out; 327 taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task); 328 out: 329 SDP_WUNLOCK(ssk); 330 return; 331 } 332 333 /* 334 * Schedule the 2msl wait timer. 335 */ 336 static void 337 sdp_2msl_wait(struct sdp_sock *ssk) 338 { 339 340 SDP_WLOCK_ASSERT(ssk); 341 ssk->flags |= SDP_TIMEWAIT; 342 ssk->state = TCPS_TIME_WAIT; 343 soisdisconnected(ssk->socket); 344 callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk); 345 } 346 347 /* 348 * Timed out waiting for the final fin/ack from rdma_disconnect(). 349 */ 350 static void 351 sdp_dreq_timeout(void *data) 352 { 353 struct sdp_sock *ssk; 354 355 ssk = data; 356 /* Callout canceled. */ 357 if (!callout_active(&ssk->keep2msl)) 358 goto out; 359 /* Callout rescheduled, probably as a different timer. */ 360 if (callout_pending(&ssk->keep2msl)) 361 goto out; 362 callout_deactivate(&ssk->keep2msl); 363 if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK) 364 goto out; 365 if ((ssk->flags & SDP_DREQWAIT) == 0) 366 goto out; 367 ssk->flags &= ~SDP_DREQWAIT; 368 ssk->flags |= SDP_DISCON; 369 sdp_2msl_wait(ssk); 370 ssk->qp_active = 0; 371 out: 372 SDP_WUNLOCK(ssk); 373 } 374 375 /* 376 * Received the final fin/ack. Cancel the 2msl. 377 */ 378 void 379 sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk) 380 { 381 sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n"); 382 ssk->flags &= ~SDP_DREQWAIT; 383 sdp_2msl_wait(ssk); 384 } 385 386 static int 387 sdp_init_sock(struct socket *sk) 388 { 389 struct sdp_sock *ssk = sdp_sk(sk); 390 391 sdp_dbg(sk, "%s\n", __func__); 392 393 callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED); 394 TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk); 395 #ifdef SDP_ZCOPY 396 INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout); 397 ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */ 398 ssk->tx_ring.rdma_inflight = NULL; 399 #endif 400 atomic_set(&ssk->mseq_ack, 0); 401 sdp_rx_ring_init(ssk); 402 ssk->tx_ring.buffer = NULL; 403 404 return 0; 405 } 406 407 /* 408 * Allocate an sdp_sock for the socket and reserve socket buffer space. 
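 * Reached via pr_attach when a socket is created in the AF_INET_SDP
 * domain (see the protosw/domain glue at the end of this file).  The pcb
 * is allocated from sdp_zone, the socket buffers are reserved with the
 * sdp_sendspace/sdp_recvspace defaults, and the new pcb is linked onto
 * the global sdp_list under the list write lock.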
409 */ 410 static int 411 sdp_attach(struct socket *so, int proto, struct thread *td) 412 { 413 struct sdp_sock *ssk; 414 int error; 415 416 ssk = sdp_sk(so); 417 KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so)); 418 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 419 error = soreserve(so, sdp_sendspace, sdp_recvspace); 420 if (error) 421 return (error); 422 } 423 so->so_rcv.sb_flags |= SB_AUTOSIZE; 424 so->so_snd.sb_flags |= SB_AUTOSIZE; 425 ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO); 426 if (ssk == NULL) 427 return (ENOBUFS); 428 rw_init(&ssk->lock, "sdpsock"); 429 ssk->socket = so; 430 ssk->cred = crhold(so->so_cred); 431 so->so_pcb = (caddr_t)ssk; 432 sdp_init_sock(so); 433 ssk->flags = 0; 434 ssk->qp_active = 0; 435 ssk->state = TCPS_CLOSED; 436 mbufq_init(&ssk->rxctlq, INT_MAX); 437 SDP_LIST_WLOCK(); 438 LIST_INSERT_HEAD(&sdp_list, ssk, list); 439 sdp_count++; 440 SDP_LIST_WUNLOCK(); 441 442 return (0); 443 } 444 445 /* 446 * Detach SDP from the socket, potentially leaving it around for the 447 * timewait to expire. 448 */ 449 static void 450 sdp_detach(struct socket *so) 451 { 452 struct sdp_sock *ssk; 453 454 ssk = sdp_sk(so); 455 SDP_WLOCK(ssk); 456 KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL")); 457 ssk->socket->so_pcb = NULL; 458 ssk->socket = NULL; 459 if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT)) 460 SDP_WUNLOCK(ssk); 461 else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT) 462 sdp_pcbfree(ssk); 463 else 464 panic("sdp_detach: Unexpected state, ssk %p.\n", ssk); 465 } 466 467 /* 468 * Allocate a local address for the socket. 469 */ 470 static int 471 sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 472 { 473 int error = 0; 474 struct sdp_sock *ssk; 475 struct sockaddr_in *sin; 476 477 sin = (struct sockaddr_in *)nam; 478 if (sin->sin_family != AF_INET) 479 return (EAFNOSUPPORT); 480 if (nam->sa_len != sizeof(*sin)) 481 return (EINVAL); 482 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 483 return (EAFNOSUPPORT); 484 485 ssk = sdp_sk(so); 486 SDP_WLOCK(ssk); 487 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 488 error = EINVAL; 489 goto out; 490 } 491 error = sdp_pcbbind(ssk, nam, td->td_ucred); 492 out: 493 SDP_WUNLOCK(ssk); 494 495 return (error); 496 } 497 498 /* 499 * Prepare to accept connections. 500 */ 501 static int 502 sdp_listen(struct socket *so, int backlog, struct thread *td) 503 { 504 int error = 0; 505 struct sdp_sock *ssk; 506 507 ssk = sdp_sk(so); 508 SDP_WLOCK(ssk); 509 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 510 error = EINVAL; 511 goto out; 512 } 513 if (error == 0 && ssk->lport == 0) 514 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 515 SOCK_LOCK(so); 516 if (error == 0) 517 error = solisten_proto_check(so); 518 if (error == 0) { 519 solisten_proto(so, backlog); 520 ssk->state = TCPS_LISTEN; 521 } 522 SOCK_UNLOCK(so); 523 524 out: 525 SDP_WUNLOCK(ssk); 526 if (error == 0) 527 error = -rdma_listen(ssk->id, backlog); 528 return (error); 529 } 530 531 /* 532 * Initiate a SDP connection to nam. 
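 * The connect is asynchronous: after an implicit bind (if needed) the
 * socket is marked connecting and rdma_resolve_addr() starts address
 * resolution; the remainder of the handshake is driven by CMA events
 * delivered to sdp_cma_handler (registered when the id was created in
 * sdp_pcbbind()), while this function only advances the state to
 * TCPS_SYN_SENT.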
533 */ 534 static int 535 sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td) 536 { 537 struct sockaddr_in src; 538 struct socket *so; 539 int error; 540 541 so = ssk->socket; 542 543 SDP_WLOCK_ASSERT(ssk); 544 if (ssk->lport == 0) { 545 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 546 if (error) 547 return error; 548 } 549 src.sin_family = AF_INET; 550 src.sin_len = sizeof(src); 551 bzero(&src.sin_zero, sizeof(src.sin_zero)); 552 src.sin_port = ssk->lport; 553 src.sin_addr.s_addr = ssk->laddr; 554 soisconnecting(so); 555 SDP_WUNLOCK(ssk); 556 error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam, 557 SDP_RESOLVE_TIMEOUT); 558 SDP_WLOCK(ssk); 559 if (error == 0) 560 ssk->state = TCPS_SYN_SENT; 561 562 return 0; 563 } 564 565 /* 566 * Initiate SDP connection. 567 */ 568 static int 569 sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 570 { 571 int error = 0; 572 struct sdp_sock *ssk; 573 struct sockaddr_in *sin; 574 575 sin = (struct sockaddr_in *)nam; 576 if (nam->sa_len != sizeof(*sin)) 577 return (EINVAL); 578 if (sin->sin_family != AF_INET) 579 return (EAFNOSUPPORT); 580 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 581 return (EAFNOSUPPORT); 582 if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0) 583 return (error); 584 ssk = sdp_sk(so); 585 SDP_WLOCK(ssk); 586 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) 587 error = EINVAL; 588 else 589 error = sdp_start_connect(ssk, nam, td); 590 SDP_WUNLOCK(ssk); 591 return (error); 592 } 593 594 /* 595 * Drop a SDP socket, reporting 596 * the specified error. If connection is synchronized, 597 * then send a RST to peer. 598 */ 599 static struct sdp_sock * 600 sdp_drop(struct sdp_sock *ssk, int errno) 601 { 602 struct socket *so; 603 604 SDP_WLOCK_ASSERT(ssk); 605 so = ssk->socket; 606 if (TCPS_HAVERCVDSYN(ssk->state)) 607 sdp_output_reset(ssk); 608 if (errno == ETIMEDOUT && ssk->softerror) 609 errno = ssk->softerror; 610 so->so_error = errno; 611 return (sdp_closed(ssk)); 612 } 613 614 /* 615 * User issued close, and wish to trail through shutdown states: 616 * if never received SYN, just forget it. If got a SYN from peer, 617 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 618 * If already got a FIN from peer, then almost done; go to LAST_ACK 619 * state. In all other cases, have already sent FIN to peer (e.g. 620 * after PRU_SHUTDOWN), and just have to play tedious game waiting 621 * for peer to send FIN or not respond to keep-alives, etc. 622 * We can let the user exit from the close as soon as the FIN is acked. 623 */ 624 static void 625 sdp_usrclosed(struct sdp_sock *ssk) 626 { 627 628 SDP_WLOCK_ASSERT(ssk); 629 630 switch (ssk->state) { 631 case TCPS_LISTEN: 632 ssk->state = TCPS_CLOSED; 633 SDP_WUNLOCK(ssk); 634 sdp_destroy_cma(ssk); 635 SDP_WLOCK(ssk); 636 /* FALLTHROUGH */ 637 case TCPS_CLOSED: 638 ssk = sdp_closed(ssk); 639 /* 640 * sdp_closed() should never return NULL here as the socket is 641 * still open. 642 */ 643 KASSERT(ssk != NULL, 644 ("sdp_usrclosed: sdp_closed() returned NULL")); 645 break; 646 647 case TCPS_SYN_SENT: 648 /* FALLTHROUGH */ 649 case TCPS_SYN_RECEIVED: 650 ssk->flags |= SDP_NEEDFIN; 651 break; 652 653 case TCPS_ESTABLISHED: 654 ssk->flags |= SDP_NEEDFIN; 655 ssk->state = TCPS_FIN_WAIT_1; 656 break; 657 658 case TCPS_CLOSE_WAIT: 659 ssk->state = TCPS_LAST_ACK; 660 break; 661 } 662 if (ssk->state >= TCPS_FIN_WAIT_2) { 663 /* Prevent the connection hanging in FIN_WAIT_2 forever. 
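 * sdp_2msl_wait() arms the keep2msl callout, so even if the peer never
 * completes the close the shutdown task eventually reaps the pcb.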
*/ 664 if (ssk->state == TCPS_FIN_WAIT_2) 665 sdp_2msl_wait(ssk); 666 else 667 soisdisconnected(ssk->socket); 668 } 669 } 670 671 static void 672 sdp_output_disconnect(struct sdp_sock *ssk) 673 { 674 675 SDP_WLOCK_ASSERT(ssk); 676 callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT, 677 sdp_dreq_timeout, ssk); 678 ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT; 679 sdp_post_sends(ssk, M_NOWAIT); 680 } 681 682 /* 683 * Initiate or continue a disconnect. 684 * If embryonic state, just send reset (once). 685 * If in ``let data drain'' option and linger null, just drop. 686 * Otherwise (hard), mark socket disconnecting and drop 687 * current input data; switch states based on user close, and 688 * send segment to peer (with FIN). 689 */ 690 static void 691 sdp_start_disconnect(struct sdp_sock *ssk) 692 { 693 struct socket *so; 694 int unread; 695 696 so = ssk->socket; 697 SDP_WLOCK_ASSERT(ssk); 698 sdp_stop_keepalive_timer(so); 699 /* 700 * Neither sdp_closed() nor sdp_drop() should return NULL, as the 701 * socket is still open. 702 */ 703 if (ssk->state < TCPS_ESTABLISHED) { 704 ssk = sdp_closed(ssk); 705 KASSERT(ssk != NULL, 706 ("sdp_start_disconnect: sdp_close() returned NULL")); 707 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 708 ssk = sdp_drop(ssk, 0); 709 KASSERT(ssk != NULL, 710 ("sdp_start_disconnect: sdp_drop() returned NULL")); 711 } else { 712 soisdisconnecting(so); 713 unread = sbused(&so->so_rcv); 714 sbflush(&so->so_rcv); 715 sdp_usrclosed(ssk); 716 if (!(ssk->flags & SDP_DROPPED)) { 717 if (unread) 718 sdp_output_reset(ssk); 719 else 720 sdp_output_disconnect(ssk); 721 } 722 } 723 } 724 725 /* 726 * User initiated disconnect. 727 */ 728 static int 729 sdp_disconnect(struct socket *so) 730 { 731 struct sdp_sock *ssk; 732 int error = 0; 733 734 ssk = sdp_sk(so); 735 SDP_WLOCK(ssk); 736 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 737 error = ECONNRESET; 738 goto out; 739 } 740 sdp_start_disconnect(ssk); 741 out: 742 SDP_WUNLOCK(ssk); 743 return (error); 744 } 745 746 /* 747 * Accept a connection. Essentially all the work is done at higher levels; 748 * just return the address of the peer, storing through addr. 749 * 750 * 751 * XXX This is broken XXX 752 * 753 * The rationale for acquiring the sdp lock here is somewhat complicated, 754 * and is described in detail in the commit log entry for r175612. Acquiring 755 * it delays an accept(2) racing with sonewconn(), which inserts the socket 756 * before the address/port fields are initialized. A better fix would 757 * prevent the socket from being placed in the listen queue until all fields 758 * are fully initialized. 759 */ 760 static int 761 sdp_accept(struct socket *so, struct sockaddr *sa) 762 { 763 struct sdp_sock *ssk = NULL; 764 int error; 765 766 if (so->so_state & SS_ISDISCONNECTED) 767 return (ECONNABORTED); 768 769 error = 0; 770 ssk = sdp_sk(so); 771 SDP_WLOCK(ssk); 772 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) 773 error = ECONNABORTED; 774 else 775 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 776 .sin_family = AF_INET, 777 .sin_len = sizeof(struct sockaddr_in), 778 .sin_addr.s_addr = ssk->faddr, 779 .sin_port = ssk->fport, 780 }; 781 SDP_WUNLOCK(ssk); 782 783 return (error); 784 } 785 786 /* 787 * Mark the connection as being incapable of further output. 
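 * shutdown(2) support.  In the switch below SHUT_RD marks the receive
 * side closed and releases the receive buffer, SHUT_WR runs the
 * usrclosed path and posts the SDP disconnect, and SHUT_RDWR does both;
 * a listening socket is instead woken up with ECONNABORTED unless only
 * the write side is being shut down.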
788 */ 789 static int 790 sdp_shutdown(struct socket *so, enum shutdown_how how) 791 { 792 struct sdp_sock *ssk = sdp_sk(so); 793 int error = 0; 794 795 SOCK_LOCK(so); 796 if ((so->so_state & 797 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { 798 SOCK_UNLOCK(so); 799 return (ENOTCONN); 800 } 801 if (SOLISTENING(so)) { 802 if (how != SHUT_WR) { 803 so->so_error = ECONNABORTED; 804 solisten_wakeup(so); /* unlocks so */ 805 } else 806 SOCK_UNLOCK(so); 807 return (0); 808 } 809 SOCK_UNLOCK(so); 810 811 switch (how) { 812 case SHUT_RD: 813 socantrcvmore(so); 814 sbrelease(so, SO_RCV); 815 break; 816 case SHUT_RDWR: 817 socantrcvmore(so); 818 sbrelease(so, SO_RCV); 819 /* FALLTHROUGH */ 820 case SHUT_WR: 821 SDP_WLOCK(ssk); 822 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 823 SDP_WUNLOCK(ssk); 824 error = ECONNRESET; 825 break; 826 } 827 socantsendmore(so); 828 sdp_usrclosed(ssk); 829 if (!(ssk->flags & SDP_DROPPED)) 830 sdp_output_disconnect(ssk); 831 SDP_WUNLOCK(ssk); 832 } 833 wakeup(&so->so_timeo); 834 835 return (error); 836 } 837 838 static void 839 sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt) 840 { 841 struct mbuf *n; 842 int ncnt; 843 844 SOCKBUF_LOCK_ASSERT(sb); 845 SBLASTRECORDCHK(sb); 846 KASSERT(mb->m_flags & M_PKTHDR, 847 ("sdp_append: %p Missing packet header.\n", mb)); 848 n = sb->sb_lastrecord; 849 /* 850 * If the queue is empty just set all pointers and proceed. 851 */ 852 if (n == NULL) { 853 sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb; 854 for (; mb; mb = mb->m_next) { 855 sb->sb_mbtail = mb; 856 sballoc(sb, mb); 857 } 858 return; 859 } 860 /* 861 * Count the number of mbufs in the current tail. 862 */ 863 for (ncnt = 0; n->m_next; n = n->m_next) 864 ncnt++; 865 n = sb->sb_lastrecord; 866 /* 867 * If the two chains can fit in a single sdp packet and 868 * the last record has not been sent yet (WRITABLE) coalesce 869 * them. The lastrecord remains the same but we must strip the 870 * packet header and then let sbcompress do the hard part. 871 */ 872 if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES && 873 n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE < 874 ssk->xmit_size_goal) { 875 m_adj(mb, SDP_HEAD_SIZE); 876 n->m_pkthdr.len += mb->m_pkthdr.len; 877 n->m_flags |= mb->m_flags & (M_PUSH | M_URG); 878 m_demote(mb, 1, 0); 879 sbcompress(sb, mb, sb->sb_mbtail); 880 return; 881 } 882 /* 883 * Not compressible, just append to the end and adjust counters. 884 */ 885 sb->sb_lastrecord->m_flags |= M_PUSH; 886 sb->sb_lastrecord->m_nextpkt = mb; 887 sb->sb_lastrecord = mb; 888 if (sb->sb_sndptr == NULL) 889 sb->sb_sndptr = mb; 890 for (; mb; mb = mb->m_next) { 891 sb->sb_mbtail = mb; 892 sballoc(sb, mb); 893 } 894 } 895 896 /* 897 * Do a send by putting data in output queue and updating urgent 898 * marker if URG set. Possibly send more data. Unlike the other 899 * pru_*() routines, the mbuf chains are our responsibility. We 900 * must either enqueue them or free them. The other pru_* routines 901 * generally are caller-frees. 902 * 903 * This comes from sendfile, normal sends will come from sdp_sosend(). 
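 * Each chain handed to us gets an SDP header (SDP_HEAD_SIZE bytes, mid
 * SDP_MID_DATA) prepended, is collapsed with m_collapse() if it carries
 * more mbufs than SDP_MAX_SEND_SGES allows, is appended to so_snd by
 * sdp_append(), and is finally pushed to the transmit ring by
 * sdp_post_sends().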
904 */ 905 static int 906 sdp_send(struct socket *so, int flags, struct mbuf *m, 907 struct sockaddr *nam, struct mbuf *control, struct thread *td) 908 { 909 struct sdp_sock *ssk; 910 struct mbuf *n; 911 int error; 912 int cnt; 913 914 if (nam != NULL) { 915 if (nam->sa_family != AF_INET) { 916 if (control) 917 m_freem(control); 918 m_freem(m); 919 return (EAFNOSUPPORT); 920 } 921 if (nam->sa_len != sizeof(struct sockaddr_in)) { 922 if (control) 923 m_freem(control); 924 m_freem(m); 925 return (EINVAL); 926 } 927 } 928 929 error = 0; 930 ssk = sdp_sk(so); 931 KASSERT(m->m_flags & M_PKTHDR, 932 ("sdp_send: %p no packet header", m)); 933 M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK); 934 mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA; 935 for (n = m, cnt = 0; n->m_next; n = n->m_next) 936 cnt++; 937 if (cnt > SDP_MAX_SEND_SGES) { 938 n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES); 939 if (n == NULL) { 940 m_freem(m); 941 return (EMSGSIZE); 942 } 943 m = n; 944 for (cnt = 0; n->m_next; n = n->m_next) 945 cnt++; 946 } 947 SDP_WLOCK(ssk); 948 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 949 if (control) 950 m_freem(control); 951 if (m) 952 m_freem(m); 953 error = ECONNRESET; 954 goto out; 955 } 956 if (control) { 957 /* SDP doesn't support control messages. */ 958 if (control->m_len) { 959 m_freem(control); 960 if (m) 961 m_freem(m); 962 error = EINVAL; 963 goto out; 964 } 965 m_freem(control); /* empty control, just free it */ 966 } 967 if (!(flags & PRUS_OOB)) { 968 SOCKBUF_LOCK(&so->so_snd); 969 sdp_append(ssk, &so->so_snd, m, cnt); 970 SOCKBUF_UNLOCK(&so->so_snd); 971 if (nam && ssk->state < TCPS_SYN_SENT) { 972 /* 973 * Do implied connect if not yet connected. 974 */ 975 error = sdp_start_connect(ssk, nam, td); 976 if (error) 977 goto out; 978 } 979 if (flags & PRUS_EOF) { 980 /* 981 * Close the send side of the connection after 982 * the data is sent. 983 */ 984 socantsendmore(so); 985 sdp_usrclosed(ssk); 986 if (!(ssk->flags & SDP_DROPPED)) 987 sdp_output_disconnect(ssk); 988 } else if (!(ssk->flags & SDP_DROPPED) && 989 !(flags & PRUS_MORETOCOME)) 990 sdp_post_sends(ssk, M_NOWAIT); 991 SDP_WUNLOCK(ssk); 992 return (0); 993 } else { 994 SOCKBUF_LOCK(&so->so_snd); 995 if (sbspace(&so->so_snd) < -512) { 996 SOCKBUF_UNLOCK(&so->so_snd); 997 m_freem(m); 998 error = ENOBUFS; 999 goto out; 1000 } 1001 /* 1002 * According to RFC961 (Assigned Protocols), 1003 * the urgent pointer points to the last octet 1004 * of urgent data. We continue, however, 1005 * to consider it to indicate the first octet 1006 * of data past the urgent section. 1007 * Otherwise, snd_up should be one lower. 1008 */ 1009 m->m_flags |= M_URG | M_PUSH; 1010 sdp_append(ssk, &so->so_snd, m, cnt); 1011 SOCKBUF_UNLOCK(&so->so_snd); 1012 if (nam && ssk->state < TCPS_SYN_SENT) { 1013 /* 1014 * Do implied connect if not yet connected. 1015 */ 1016 error = sdp_start_connect(ssk, nam, td); 1017 if (error) 1018 goto out; 1019 } 1020 sdp_post_sends(ssk, M_NOWAIT); 1021 SDP_WUNLOCK(ssk); 1022 return (0); 1023 } 1024 out: 1025 SDP_WUNLOCK(ssk); 1026 return (error); 1027 } 1028 1029 /* 1030 * Send on a socket. If send must go all at once and message is larger than 1031 * send buffering, then hard error. Lock against other senders. If must go 1032 * all at once and not enough room now, then inform user that this would 1033 * block and do nothing. Otherwise, if nonblocking, send as much as 1034 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1035 * by the mbuf chain "top" (which must be null if uio is not). 
Data provided 1036 * in mbuf chain must be small enough to send all at once. 1037 * 1038 * Returns nonzero on error, timeout or signal; callers must check for short 1039 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1040 * on return. 1041 */ 1042 static int 1043 sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1044 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1045 { 1046 struct sdp_sock *ssk; 1047 long space, resid; 1048 int atomic; 1049 int error; 1050 int copy; 1051 1052 if (uio != NULL) 1053 resid = uio->uio_resid; 1054 else 1055 resid = top->m_pkthdr.len; 1056 atomic = top != NULL; 1057 if (control != NULL) { 1058 if (control->m_len) { 1059 m_freem(control); 1060 if (top) 1061 m_freem(top); 1062 return (EINVAL); 1063 } 1064 m_freem(control); 1065 control = NULL; 1066 } 1067 /* 1068 * In theory resid should be unsigned. However, space must be 1069 * signed, as it might be less than 0 if we over-committed, and we 1070 * must use a signed comparison of space and resid. On the other 1071 * hand, a negative resid causes us to loop sending 0-length 1072 * segments to the protocol. 1073 * 1074 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1075 * type sockets since that's an error. 1076 */ 1077 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1078 error = EINVAL; 1079 goto out; 1080 } 1081 if (td != NULL) 1082 td->td_ru.ru_msgsnd++; 1083 1084 ssk = sdp_sk(so); 1085 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 1086 if (error) 1087 goto out; 1088 1089 restart: 1090 do { 1091 SOCKBUF_LOCK(&so->so_snd); 1092 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1093 SOCKBUF_UNLOCK(&so->so_snd); 1094 error = EPIPE; 1095 goto release; 1096 } 1097 if (so->so_error) { 1098 error = so->so_error; 1099 so->so_error = 0; 1100 SOCKBUF_UNLOCK(&so->so_snd); 1101 goto release; 1102 } 1103 if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) { 1104 SOCKBUF_UNLOCK(&so->so_snd); 1105 error = ENOTCONN; 1106 goto release; 1107 } 1108 space = sbspace(&so->so_snd); 1109 if (flags & MSG_OOB) 1110 space += 1024; 1111 if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) { 1112 SOCKBUF_UNLOCK(&so->so_snd); 1113 error = EMSGSIZE; 1114 goto release; 1115 } 1116 if (space < resid && 1117 (atomic || space < so->so_snd.sb_lowat)) { 1118 if ((so->so_state & SS_NBIO) || 1119 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 1120 SOCKBUF_UNLOCK(&so->so_snd); 1121 error = EWOULDBLOCK; 1122 goto release; 1123 } 1124 error = sbwait(so, SO_SND); 1125 SOCKBUF_UNLOCK(&so->so_snd); 1126 if (error) 1127 goto release; 1128 goto restart; 1129 } 1130 SOCKBUF_UNLOCK(&so->so_snd); 1131 do { 1132 if (uio == NULL) { 1133 resid = 0; 1134 if (flags & MSG_EOR) 1135 top->m_flags |= M_EOR; 1136 } else { 1137 /* 1138 * Copy the data from userland into a mbuf 1139 * chain. If no data is to be copied in, 1140 * a single empty mbuf is returned. 1141 */ 1142 copy = min(space, 1143 ssk->xmit_size_goal - SDP_HEAD_SIZE); 1144 top = m_uiotombuf(uio, M_WAITOK, copy, 1145 0, M_PKTHDR | 1146 ((flags & MSG_EOR) ? M_EOR : 0)); 1147 if (top == NULL) { 1148 /* only possible error */ 1149 error = EFAULT; 1150 goto release; 1151 } 1152 space -= resid - uio->uio_resid; 1153 resid = uio->uio_resid; 1154 } 1155 /* 1156 * XXX all the SBS_CANTSENDMORE checks previously 1157 * done could be out of date after dropping the 1158 * socket lock. 1159 */ 1160 error = sdp_send(so, (flags & MSG_OOB) ? 
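/* MSG_OOB maps to PRUS_OOB so sdp_send() flags the chain M_URG. */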
PRUS_OOB : 1161 /* 1162 * Set EOF on the last send if the user specified 1163 * MSG_EOF. 1164 */ 1165 ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF : 1166 /* If there is more to send set PRUS_MORETOCOME. */ 1167 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 1168 top, addr, NULL, td); 1169 top = NULL; 1170 if (error) 1171 goto release; 1172 } while (resid && space > 0); 1173 } while (resid); 1174 1175 release: 1176 SOCK_IO_SEND_UNLOCK(so); 1177 out: 1178 if (top != NULL) 1179 m_freem(top); 1180 return (error); 1181 } 1182 1183 /* 1184 * The part of soreceive() that implements reading non-inline out-of-band 1185 * data from a socket. For more complete comments, see soreceive(), from 1186 * which this code originated. 1187 * 1188 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1189 * unable to return an mbuf chain to the caller. 1190 */ 1191 static int 1192 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1193 { 1194 struct protosw *pr = so->so_proto; 1195 struct mbuf *m; 1196 int error; 1197 1198 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1199 1200 m = m_get(M_WAITOK, MT_DATA); 1201 error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); 1202 if (error) 1203 goto bad; 1204 do { 1205 error = uiomove(mtod(m, void *), 1206 (int) min(uio->uio_resid, m->m_len), uio); 1207 m = m_free(m); 1208 } while (uio->uio_resid && error == 0 && m); 1209 bad: 1210 if (m != NULL) 1211 m_freem(m); 1212 return (error); 1213 } 1214 1215 /* 1216 * Optimized version of soreceive() for stream (TCP) sockets. 1217 */ 1218 static int 1219 sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio, 1220 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1221 { 1222 int len = 0, error = 0, flags, oresid; 1223 struct sockbuf *sb; 1224 struct mbuf *m, *n = NULL; 1225 struct sdp_sock *ssk; 1226 1227 /* We only do stream sockets. */ 1228 if (so->so_type != SOCK_STREAM) 1229 return (EINVAL); 1230 if (psa != NULL) 1231 *psa = NULL; 1232 if (controlp != NULL) 1233 return (EINVAL); 1234 if (flagsp != NULL) 1235 flags = *flagsp &~ MSG_EOR; 1236 else 1237 flags = 0; 1238 if (flags & MSG_OOB) 1239 return (soreceive_rcvoob(so, uio, flags)); 1240 if (mp0 != NULL) 1241 *mp0 = NULL; 1242 1243 sb = &so->so_rcv; 1244 ssk = sdp_sk(so); 1245 1246 /* Prevent other readers from entering the socket. */ 1247 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 1248 if (error) 1249 return (error); 1250 SOCKBUF_LOCK(sb); 1251 1252 /* Easy one, no space to copyout anything. */ 1253 if (uio->uio_resid == 0) { 1254 error = EINVAL; 1255 goto out; 1256 } 1257 oresid = uio->uio_resid; 1258 1259 /* We will never ever get anything unless we are connected. */ 1260 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 1261 /* When disconnecting there may be still some data left. */ 1262 if (sbavail(sb)) 1263 goto deliver; 1264 if (!(so->so_state & SS_ISDISCONNECTED)) 1265 error = ENOTCONN; 1266 goto out; 1267 } 1268 1269 /* Socket buffer is empty and we shall not block. */ 1270 if (sbavail(sb) == 0 && 1271 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 1272 error = EAGAIN; 1273 goto out; 1274 } 1275 1276 restart: 1277 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1278 1279 /* Abort if socket has reported problems. */ 1280 if (so->so_error) { 1281 if (sbavail(sb)) 1282 goto deliver; 1283 if (oresid > uio->uio_resid) 1284 goto out; 1285 error = so->so_error; 1286 if (!(flags & MSG_PEEK)) 1287 so->so_error = 0; 1288 goto out; 1289 } 1290 1291 /* Door is closed. 
Deliver what is left, if any. */ 1292 if (sb->sb_state & SBS_CANTRCVMORE) { 1293 if (sbavail(sb)) 1294 goto deliver; 1295 else 1296 goto out; 1297 } 1298 1299 /* Socket buffer got some data that we shall deliver now. */ 1300 if (sbavail(sb) && !(flags & MSG_WAITALL) && 1301 ((so->so_state & SS_NBIO) || 1302 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 1303 sbavail(sb) >= sb->sb_lowat || 1304 sbavail(sb) >= uio->uio_resid || 1305 sbavail(sb) >= sb->sb_hiwat) ) { 1306 goto deliver; 1307 } 1308 1309 /* On MSG_WAITALL we must wait until all data or error arrives. */ 1310 if ((flags & MSG_WAITALL) && 1311 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat)) 1312 goto deliver; 1313 1314 /* 1315 * Wait and block until (more) data comes in. 1316 * NB: Drops the sockbuf lock during wait. 1317 */ 1318 error = sbwait(so, SO_RCV); 1319 if (error) 1320 goto out; 1321 goto restart; 1322 1323 deliver: 1324 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1325 KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__)); 1326 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 1327 1328 /* Statistics. */ 1329 if (uio->uio_td) 1330 uio->uio_td->td_ru.ru_msgrcv++; 1331 1332 /* Fill uio until full or current end of socket buffer is reached. */ 1333 len = min(uio->uio_resid, sbavail(sb)); 1334 if (mp0 != NULL) { 1335 /* Dequeue as many mbufs as possible. */ 1336 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 1337 for (*mp0 = m = sb->sb_mb; 1338 m != NULL && m->m_len <= len; 1339 m = m->m_next) { 1340 len -= m->m_len; 1341 uio->uio_resid -= m->m_len; 1342 sbfree(sb, m); 1343 n = m; 1344 } 1345 sb->sb_mb = m; 1346 if (sb->sb_mb == NULL) 1347 SB_EMPTY_FIXUP(sb); 1348 n->m_next = NULL; 1349 } 1350 /* Copy the remainder. */ 1351 if (len > 0) { 1352 KASSERT(sb->sb_mb != NULL, 1353 ("%s: len > 0 && sb->sb_mb empty", __func__)); 1354 1355 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 1356 if (m == NULL) 1357 len = 0; /* Don't flush data from sockbuf. */ 1358 else 1359 uio->uio_resid -= m->m_len; 1360 if (*mp0 != NULL) 1361 n->m_next = m; 1362 else 1363 *mp0 = m; 1364 if (*mp0 == NULL) { 1365 error = ENOBUFS; 1366 goto out; 1367 } 1368 } 1369 } else { 1370 /* NB: Must unlock socket buffer as uiomove may sleep. */ 1371 SOCKBUF_UNLOCK(sb); 1372 error = m_mbuftouio(uio, sb->sb_mb, len); 1373 SOCKBUF_LOCK(sb); 1374 if (error) 1375 goto out; 1376 } 1377 SBLASTRECORDCHK(sb); 1378 SBLASTMBUFCHK(sb); 1379 1380 /* 1381 * Remove the delivered data from the socket buffer unless we 1382 * were only peeking. 1383 */ 1384 if (!(flags & MSG_PEEK)) { 1385 if (len > 0) 1386 sbdrop_locked(sb, len); 1387 1388 /* Notify protocol that we drained some data. */ 1389 SOCKBUF_UNLOCK(sb); 1390 SDP_WLOCK(ssk); 1391 sdp_do_posts(ssk); 1392 SDP_WUNLOCK(ssk); 1393 SOCKBUF_LOCK(sb); 1394 } 1395 1396 /* 1397 * For MSG_WAITALL we may have to loop again and wait for 1398 * more data to come in. 1399 */ 1400 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 1401 goto restart; 1402 out: 1403 SBLASTRECORDCHK(sb); 1404 SBLASTMBUFCHK(sb); 1405 SOCKBUF_UNLOCK(sb); 1406 SOCK_IO_RECV_UNLOCK(so); 1407 return (error); 1408 } 1409 1410 /* 1411 * Abort is used to teardown a connection typically while sitting in 1412 * the accept queue. 1413 */ 1414 void 1415 sdp_abort(struct socket *so) 1416 { 1417 struct sdp_sock *ssk; 1418 1419 ssk = sdp_sk(so); 1420 SDP_WLOCK(ssk); 1421 /* 1422 * If we have not yet dropped, do it now. 
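 * sdp_drop() resets the connection if a SYN was ever received and leaves
 * the pcb marked SDP_DROPPED, which the KASSERT below relies on.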
1423 */ 1424 if (!(ssk->flags & SDP_TIMEWAIT) && 1425 !(ssk->flags & SDP_DROPPED)) 1426 sdp_drop(ssk, ECONNABORTED); 1427 KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X", 1428 ssk, ssk->flags)); 1429 SDP_WUNLOCK(ssk); 1430 } 1431 1432 /* 1433 * Close a SDP socket and initiate a friendly disconnect. 1434 */ 1435 static void 1436 sdp_close(struct socket *so) 1437 { 1438 struct sdp_sock *ssk; 1439 1440 ssk = sdp_sk(so); 1441 SDP_WLOCK(ssk); 1442 /* 1443 * If we have not yet dropped, do it now. 1444 */ 1445 if (!(ssk->flags & SDP_TIMEWAIT) && 1446 !(ssk->flags & SDP_DROPPED)) 1447 sdp_start_disconnect(ssk); 1448 1449 /* 1450 * If we've still not dropped let the socket layer know we're 1451 * holding on to the socket and pcb for a while. 1452 */ 1453 if (!(ssk->flags & SDP_DROPPED)) { 1454 ssk->flags |= SDP_SOCKREF; 1455 soref(so); 1456 } 1457 SDP_WUNLOCK(ssk); 1458 } 1459 1460 /* 1461 * User requests out-of-band data. 1462 */ 1463 static int 1464 sdp_rcvoob(struct socket *so, struct mbuf *m, int flags) 1465 { 1466 int error = 0; 1467 struct sdp_sock *ssk; 1468 1469 ssk = sdp_sk(so); 1470 SDP_WLOCK(ssk); 1471 if (!rx_ring_trylock(&ssk->rx_ring)) { 1472 SDP_WUNLOCK(ssk); 1473 return (ECONNRESET); 1474 } 1475 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1476 error = ECONNRESET; 1477 goto out; 1478 } 1479 if ((so->so_oobmark == 0 && 1480 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || 1481 so->so_options & SO_OOBINLINE || 1482 ssk->oobflags & SDP_HADOOB) { 1483 error = EINVAL; 1484 goto out; 1485 } 1486 if ((ssk->oobflags & SDP_HAVEOOB) == 0) { 1487 error = EWOULDBLOCK; 1488 goto out; 1489 } 1490 m->m_len = 1; 1491 *mtod(m, caddr_t) = ssk->iobc; 1492 if ((flags & MSG_PEEK) == 0) 1493 ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB); 1494 out: 1495 rx_ring_unlock(&ssk->rx_ring); 1496 SDP_WUNLOCK(ssk); 1497 return (error); 1498 } 1499 1500 void 1501 sdp_urg(struct sdp_sock *ssk, struct mbuf *mb) 1502 { 1503 struct mbuf *m; 1504 struct socket *so; 1505 1506 so = ssk->socket; 1507 if (so == NULL) 1508 return; 1509 1510 so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1; 1511 sohasoutofband(so); 1512 ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB); 1513 if (!(so->so_options & SO_OOBINLINE)) { 1514 for (m = mb; m->m_next != NULL; m = m->m_next); 1515 ssk->iobc = *(mtod(m, char *) + m->m_len - 1); 1516 ssk->oobflags |= SDP_HAVEOOB; 1517 m->m_len--; 1518 mb->m_pkthdr.len--; 1519 } 1520 } 1521 1522 /* 1523 * Notify a sdp socket of an asynchronous error. 1524 * 1525 * Do not wake up user since there currently is no mechanism for 1526 * reporting soft errors (yet - a kqueue filter may be added). 1527 */ 1528 struct sdp_sock * 1529 sdp_notify(struct sdp_sock *ssk, int error) 1530 { 1531 1532 SDP_WLOCK_ASSERT(ssk); 1533 1534 if ((ssk->flags & SDP_TIMEWAIT) || 1535 (ssk->flags & SDP_DROPPED)) 1536 return (ssk); 1537 1538 /* 1539 * Ignore some errors if we are hooked up. 1540 */ 1541 if (ssk->state == TCPS_ESTABLISHED && 1542 (error == EHOSTUNREACH || error == ENETUNREACH || 1543 error == EHOSTDOWN)) 1544 return (ssk); 1545 ssk->softerror = error; 1546 return sdp_drop(ssk, error); 1547 } 1548 1549 static void 1550 sdp_keepalive_timeout(void *data) 1551 { 1552 struct sdp_sock *ssk; 1553 1554 ssk = data; 1555 /* Callout canceled. */ 1556 if (!callout_active(&ssk->keep2msl)) 1557 return; 1558 /* Callout rescheduled as a different kind of timer. 
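 * The keep2msl callout is shared with the DREQ-wait and 2MSL timers, so
 * a pending reschedule means one of those owns it now and this
 * invocation must leave it alone.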
*/ 1559 if (callout_pending(&ssk->keep2msl)) 1560 goto out; 1561 callout_deactivate(&ssk->keep2msl); 1562 if (ssk->flags & SDP_DROPPED || 1563 (ssk->socket->so_options & SO_KEEPALIVE) == 0) 1564 goto out; 1565 sdp_post_keepalive(ssk); 1566 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, 1567 sdp_keepalive_timeout, ssk); 1568 out: 1569 SDP_WUNLOCK(ssk); 1570 } 1571 1572 1573 void 1574 sdp_start_keepalive_timer(struct socket *so) 1575 { 1576 struct sdp_sock *ssk; 1577 1578 ssk = sdp_sk(so); 1579 if (!callout_pending(&ssk->keep2msl)) 1580 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, 1581 sdp_keepalive_timeout, ssk); 1582 } 1583 1584 static void 1585 sdp_stop_keepalive_timer(struct socket *so) 1586 { 1587 struct sdp_sock *ssk; 1588 1589 ssk = sdp_sk(so); 1590 callout_stop(&ssk->keep2msl); 1591 } 1592 1593 /* 1594 * sdp_ctloutput() must drop the inpcb lock before performing copyin on 1595 * socket option arguments. When it re-acquires the lock after the copy, it 1596 * has to revalidate that the connection is still valid for the socket 1597 * option. 1598 */ 1599 #define SDP_WLOCK_RECHECK(inp) do { \ 1600 SDP_WLOCK(ssk); \ 1601 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \ 1602 SDP_WUNLOCK(ssk); \ 1603 return (ECONNRESET); \ 1604 } \ 1605 } while(0) 1606 1607 static int 1608 sdp_ctloutput(struct socket *so, struct sockopt *sopt) 1609 { 1610 int error, opt, optval; 1611 struct sdp_sock *ssk; 1612 1613 error = 0; 1614 ssk = sdp_sk(so); 1615 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) { 1616 SDP_WLOCK(ssk); 1617 if (so->so_options & SO_KEEPALIVE) 1618 sdp_start_keepalive_timer(so); 1619 else 1620 sdp_stop_keepalive_timer(so); 1621 SDP_WUNLOCK(ssk); 1622 } 1623 if (sopt->sopt_level != IPPROTO_TCP) 1624 return (error); 1625 1626 SDP_WLOCK(ssk); 1627 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1628 SDP_WUNLOCK(ssk); 1629 return (ECONNRESET); 1630 } 1631 1632 switch (sopt->sopt_dir) { 1633 case SOPT_SET: 1634 switch (sopt->sopt_name) { 1635 case TCP_NODELAY: 1636 SDP_WUNLOCK(ssk); 1637 error = sooptcopyin(sopt, &optval, sizeof optval, 1638 sizeof optval); 1639 if (error) 1640 return (error); 1641 1642 SDP_WLOCK_RECHECK(ssk); 1643 opt = SDP_NODELAY; 1644 if (optval) 1645 ssk->flags |= opt; 1646 else 1647 ssk->flags &= ~opt; 1648 sdp_do_posts(ssk); 1649 SDP_WUNLOCK(ssk); 1650 break; 1651 1652 default: 1653 SDP_WUNLOCK(ssk); 1654 error = ENOPROTOOPT; 1655 break; 1656 } 1657 break; 1658 1659 case SOPT_GET: 1660 switch (sopt->sopt_name) { 1661 case TCP_NODELAY: 1662 optval = ssk->flags & SDP_NODELAY; 1663 SDP_WUNLOCK(ssk); 1664 error = sooptcopyout(sopt, &optval, sizeof optval); 1665 break; 1666 default: 1667 SDP_WUNLOCK(ssk); 1668 error = ENOPROTOOPT; 1669 break; 1670 } 1671 break; 1672 } 1673 return (error); 1674 } 1675 #undef SDP_WLOCK_RECHECK 1676 1677 int sdp_mod_count = 0; 1678 int sdp_mod_usec = 0; 1679 1680 void 1681 sdp_set_default_moderation(struct sdp_sock *ssk) 1682 { 1683 if (sdp_mod_count <= 0 || sdp_mod_usec <= 0) 1684 return; 1685 ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec); 1686 } 1687 1688 static void 1689 sdp_dev_add(struct ib_device *device) 1690 { 1691 struct ib_fmr_pool_param param; 1692 struct sdp_device *sdp_dev; 1693 1694 sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO); 1695 sdp_dev->pd = ib_alloc_pd(device, 0); 1696 if (IS_ERR(sdp_dev->pd)) 1697 goto out_pd; 1698 memset(¶m, 0, sizeof param); 1699 param.max_pages_per_fmr = SDP_FMR_SIZE; 1700 param.page_shift = PAGE_SHIFT; 1701 param.access = 
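/* FMR access rights; remote read presumably serves the SDP_ZCOPY path. */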
(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ); 1702 param.pool_size = SDP_FMR_POOL_SIZE; 1703 param.dirty_watermark = SDP_FMR_DIRTY_SIZE; 1704 param.cache = 1; 1705 sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, ¶m); 1706 if (IS_ERR(sdp_dev->fmr_pool)) 1707 goto out_fmr; 1708 ib_set_client_data(device, &sdp_client, sdp_dev); 1709 return; 1710 1711 out_fmr: 1712 ib_dealloc_pd(sdp_dev->pd); 1713 out_pd: 1714 free(sdp_dev, M_SDP); 1715 } 1716 1717 static void 1718 sdp_dev_rem(struct ib_device *device, void *client_data) 1719 { 1720 struct sdp_device *sdp_dev; 1721 struct sdp_sock *ssk; 1722 1723 SDP_LIST_WLOCK(); 1724 LIST_FOREACH(ssk, &sdp_list, list) { 1725 if (ssk->ib_device != device) 1726 continue; 1727 SDP_WLOCK(ssk); 1728 if ((ssk->flags & SDP_DESTROY) == 0) 1729 ssk = sdp_notify(ssk, ECONNRESET); 1730 if (ssk) 1731 SDP_WUNLOCK(ssk); 1732 } 1733 SDP_LIST_WUNLOCK(); 1734 /* 1735 * XXX Do I need to wait between these two? 1736 */ 1737 sdp_dev = ib_get_client_data(device, &sdp_client); 1738 if (!sdp_dev) 1739 return; 1740 ib_flush_fmr_pool(sdp_dev->fmr_pool); 1741 ib_destroy_fmr_pool(sdp_dev->fmr_pool); 1742 ib_dealloc_pd(sdp_dev->pd); 1743 free(sdp_dev, M_SDP); 1744 } 1745 1746 struct ib_client sdp_client = 1747 { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem }; 1748 1749 1750 static int 1751 sdp_pcblist(SYSCTL_HANDLER_ARGS) 1752 { 1753 int error, n, i; 1754 struct sdp_sock *ssk; 1755 struct xinpgen xig; 1756 1757 /* 1758 * The process of preparing the TCB list is too time-consuming and 1759 * resource-intensive to repeat twice on every request. 1760 */ 1761 if (req->oldptr == NULL) { 1762 n = sdp_count; 1763 n += imax(n / 8, 10); 1764 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); 1765 return (0); 1766 } 1767 1768 if (req->newptr != NULL) 1769 return (EPERM); 1770 1771 /* 1772 * OK, now we're committed to doing something. 1773 */ 1774 SDP_LIST_RLOCK(); 1775 n = sdp_count; 1776 SDP_LIST_RUNLOCK(); 1777 1778 error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) 1779 + n * sizeof(struct xtcpcb)); 1780 if (error != 0) 1781 return (error); 1782 1783 bzero(&xig, sizeof(xig)); 1784 xig.xig_len = sizeof xig; 1785 xig.xig_count = n; 1786 xig.xig_gen = 0; 1787 xig.xig_sogen = so_gencnt; 1788 error = SYSCTL_OUT(req, &xig, sizeof xig); 1789 if (error) 1790 return (error); 1791 1792 SDP_LIST_RLOCK(); 1793 for (ssk = LIST_FIRST(&sdp_list), i = 0; 1794 ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) { 1795 struct xtcpcb xt; 1796 1797 SDP_RLOCK(ssk); 1798 if (ssk->flags & SDP_TIMEWAIT) { 1799 if (ssk->cred != NULL) 1800 error = cr_cansee(req->td->td_ucred, 1801 ssk->cred); 1802 else 1803 error = EINVAL; /* Skip this inp. 
*/ 1804 } else if (ssk->socket) 1805 error = cr_canseesocket(req->td->td_ucred, 1806 ssk->socket); 1807 else 1808 error = EINVAL; 1809 if (error) { 1810 error = 0; 1811 goto next; 1812 } 1813 1814 bzero(&xt, sizeof(xt)); 1815 xt.xt_len = sizeof xt; 1816 xt.xt_inp.inp_gencnt = 0; 1817 xt.xt_inp.inp_vflag = INP_IPV4; 1818 memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr)); 1819 xt.xt_inp.inp_lport = ssk->lport; 1820 memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr)); 1821 xt.xt_inp.inp_fport = ssk->fport; 1822 xt.t_state = ssk->state; 1823 if (ssk->socket != NULL) 1824 sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket); 1825 xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; 1826 SDP_RUNLOCK(ssk); 1827 error = SYSCTL_OUT(req, &xt, sizeof xt); 1828 if (error) 1829 break; 1830 i++; 1831 continue; 1832 next: 1833 SDP_RUNLOCK(ssk); 1834 } 1835 if (!error) { 1836 /* 1837 * Give the user an updated idea of our state. 1838 * If the generation differs from what we told 1839 * her before, she knows that something happened 1840 * while we were processing this request, and it 1841 * might be necessary to retry. 1842 */ 1843 xig.xig_gen = 0; 1844 xig.xig_sogen = so_gencnt; 1845 xig.xig_count = sdp_count; 1846 error = SYSCTL_OUT(req, &xig, sizeof xig); 1847 } 1848 SDP_LIST_RUNLOCK(); 1849 return (error); 1850 } 1851 1852 SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1853 "SDP"); 1854 1855 SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist, 1856 CTLFLAG_RD | CTLTYPE_STRUCT | CTLFLAG_MPSAFE, 1857 0, 0, sdp_pcblist, "S,xtcpcb", 1858 "List of active SDP connections"); 1859 1860 static void 1861 sdp_zone_change(void *tag) 1862 { 1863 1864 uma_zone_set_max(sdp_zone, maxsockets); 1865 } 1866 1867 static void 1868 sdp_init(void *arg __unused) 1869 { 1870 1871 LIST_INIT(&sdp_list); 1872 sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock), 1873 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1874 uma_zone_set_max(sdp_zone, maxsockets); 1875 EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL, 1876 EVENTHANDLER_PRI_ANY); 1877 rx_comp_wq = create_singlethread_workqueue("rx_comp_wq"); 1878 ib_register_client(&sdp_client); 1879 } 1880 SYSINIT(sdp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, sdp_init, NULL); 1881 1882 #define SDP_PROTOSW \ 1883 .pr_type = SOCK_STREAM, \ 1884 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,\ 1885 .pr_ctloutput = sdp_ctloutput, \ 1886 .pr_abort = sdp_abort, \ 1887 .pr_accept = sdp_accept, \ 1888 .pr_attach = sdp_attach, \ 1889 .pr_bind = sdp_bind, \ 1890 .pr_connect = sdp_connect, \ 1891 .pr_detach = sdp_detach, \ 1892 .pr_disconnect = sdp_disconnect, \ 1893 .pr_listen = sdp_listen, \ 1894 .pr_peeraddr = sdp_getpeeraddr, \ 1895 .pr_rcvoob = sdp_rcvoob, \ 1896 .pr_send = sdp_send, \ 1897 .pr_sosend = sdp_sosend, \ 1898 .pr_soreceive = sdp_sorecv, \ 1899 .pr_shutdown = sdp_shutdown, \ 1900 .pr_sockaddr = sdp_getsockaddr, \ 1901 .pr_close = sdp_close 1902 1903 1904 static struct protosw sdp_ip_protosw = { 1905 .pr_protocol = IPPROTO_IP, 1906 SDP_PROTOSW 1907 }; 1908 static struct protosw sdp_tcp_protosw = { 1909 .pr_protocol = IPPROTO_TCP, 1910 SDP_PROTOSW 1911 }; 1912 1913 static struct domain sdpdomain = { 1914 .dom_family = AF_INET_SDP, 1915 .dom_name = "SDP", 1916 .dom_nprotosw = 2, 1917 .dom_protosw = { 1918 &sdp_ip_protosw, 1919 &sdp_tcp_protosw, 1920 }, 1921 }; 1922 1923 DOMAIN_SET(sdp); 1924 1925 int sdp_debug_level = 1; 1926 int sdp_data_debug_level = 0; 1927
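/*
 * Both protosw entries above hang off the AF_INET_SDP domain, so an SDP
 * stream socket can be created with either protocol number.  An
 * illustrative userland sketch (not part of this file):
 *
 *	socket(AF_INET_SDP, SOCK_STREAM, 0);            // IPPROTO_IP entry
 *	socket(AF_INET_SDP, SOCK_STREAM, IPPROTO_TCP);  // IPPROTO_TCP entry
 *
 * sdp_debug_level and sdp_data_debug_level are plain globals, presumably
 * consumed by the sdp_dbg()-style debug macros in sdp.h.
 */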