/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int sdp_count;

/*
 * Disable async. CMA events for sockets which are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

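/*
 * Bind the SDP socket to a local address and port via the RDMA CM and
 * record the address the CM selected.
 */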
static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk,
		    RDMA_PS_SDP, IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

static void
sdp_pcbfree(struct sdp_sock *ssk)
{

	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
	KASSERT((ssk->flags & SDP_DESTROY) == 0,
	    ("ssk %p already destroyed", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	sdp_destroy_cma(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof *sin, M_SONAME,
	    M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return (struct sockaddr *)sin;
}

static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return 0;
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return 0;
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

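/*
 * Disconnect the RDMA CM id, if any, and mark the connection closed.
 */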
static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close an SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		sorele(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer-based shutdowns which cannot operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * 2msl has expired, schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the 2msl.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

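/*
 * Initialize the timers, tasks and rings embedded in a newly allocated
 * sdp_sock.
 */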
static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return 0;
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	mbufq_init(&ssk->rxctlq, INT_MAX);
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate an SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return error;
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return 0;
}

/*
 * Initiate an SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop an SDP socket, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

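/*
 * Queue a disconnect request (FIN) to the peer and arm the DREQ wait
 * timer.
 */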
static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return error;
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

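/*
 * Enqueue an mbuf chain on the send socket buffer, coalescing it with the
 * previous record when both fit within a single SDP packet.
 */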
static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
	    ("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile, normal sends will come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	if (nam != NULL) {
		if (nam->sa_family != AF_INET) {
			if (control)
				m_freem(control);
			m_freem(m);
			return (EAFNOSUPPORT);
		}
		if (nam->sa_len != sizeof(struct sockaddr_in)) {
			if (control)
				m_freem(control);
			m_freem(m);
			return (EINVAL);
		}
	}

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(so, SO_SND);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * Set EOF on the last send if the user specified
			     * MSG_EOF.
			     */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	SOCK_IO_SEND_UNLOCK(so);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = pr->pr_rcvoob(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(so, SO_RCV);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);
	return (error);
}

/*
 * Abort is used to teardown a connection typically while sitting in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close an SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		ssk->flags |= SDP_SOCKREF;
		soref(so);
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

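/*
 * Record the urgent (out-of-band) mark for incoming data and, unless
 * SO_OOBINLINE is set, pull the OOB byte out of the mbuf chain.
 */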
void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify an SDP socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return sdp_drop(ssk, error);
}

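/*
 * Keepalive timer fired; post a keepalive message and reschedule while
 * SO_KEEPALIVE remains set and the connection has not been dropped.
 */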
static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		return;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}

void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the inpcb lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 */
#define	SDP_WLOCK_RECHECK(inp) do {					\
	SDP_WLOCK(ssk);							\
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while(0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
}

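/*
 * A new IB device has appeared: allocate a protection domain and FMR pool
 * for it and register them as client data.
 */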
static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device, 0);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

static void
sdp_dev_rem(struct ib_device *device, void *client_data)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

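/*
 * Export the list of SDP connections to userland in struct xtcpcb form,
 * in the style of the tcp pcblist sysctl.
 */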
static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT | CTLFLAG_MPSAFE,
    0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

static void
sdp_init(void *arg __unused)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}
SYSINIT(sdp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, sdp_init, NULL);

#define	SDP_PROTOSW							\
	.pr_type =		SOCK_STREAM,				\
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,\
	.pr_ctloutput =		sdp_ctloutput,				\
	.pr_abort =		sdp_abort,				\
	.pr_accept =		sdp_accept,				\
	.pr_attach =		sdp_attach,				\
	.pr_bind =		sdp_bind,				\
	.pr_connect =		sdp_connect,				\
	.pr_detach =		sdp_detach,				\
	.pr_disconnect =	sdp_disconnect,				\
	.pr_listen =		sdp_listen,				\
	.pr_peeraddr =		sdp_getpeeraddr,			\
	.pr_rcvoob =		sdp_rcvoob,				\
	.pr_send =		sdp_send,				\
	.pr_sosend =		sdp_sosend,				\
	.pr_soreceive =		sdp_sorecv,				\
	.pr_shutdown =		sdp_shutdown,				\
	.pr_sockaddr =		sdp_getsockaddr,			\
	.pr_close =		sdp_close

static struct protosw sdp_ip_protosw = {
	.pr_protocol =		IPPROTO_IP,
	SDP_PROTOSW
};
static struct protosw sdp_tcp_protosw = {
	.pr_protocol =		IPPROTO_TCP,
	SDP_PROTOSW
};

static struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_nprotosw =		2,
	.dom_protosw = {
		&sdp_ip_protosw,
		&sdp_tcp_protosw,
	},
};

DOMAIN_SET(sdp);

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;