1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved. 7 * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c 34 */ 35 36 /* 37 * 38 * Copyright (c) 2010 Isilon Systems, Inc. 39 * Copyright (c) 2010 iX Systems, Inc. 40 * Copyright (c) 2010 Panasas, Inc. 41 * All rights reserved. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice unmodified, this list of conditions, and the following 48 * disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 54 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 55 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 56 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 57 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 58 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 59 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 60 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 61 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 62 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 63 * 64 */ 65 #include <sys/cdefs.h> 66 __FBSDID("$FreeBSD$"); 67 68 #include <sys/param.h> 69 #include <sys/eventhandler.h> 70 #include <sys/kernel.h> 71 #include <sys/malloc.h> 72 73 #include "sdp.h" 74 75 #include <net/if.h> 76 #include <net/route.h> 77 #include <net/vnet.h> 78 #include <sys/sysctl.h> 79 80 uma_zone_t sdp_zone; 81 struct rwlock sdp_lock; 82 LIST_HEAD(, sdp_sock) sdp_list; 83 84 struct workqueue_struct *rx_comp_wq; 85 86 RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock"); 87 #define SDP_LIST_WLOCK() rw_wlock(&sdp_lock) 88 #define SDP_LIST_RLOCK() rw_rlock(&sdp_lock) 89 #define SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock) 90 #define SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock) 91 #define SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED) 92 #define SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED) 93 #define SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED) 94 95 MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol"); 96 97 static void sdp_stop_keepalive_timer(struct socket *so); 98 99 /* 100 * SDP protocol interface to socket abstraction. 101 */ 102 /* 103 * sdp_sendspace and sdp_recvspace are the default send and receive window 104 * sizes, respectively. 105 */ 106 u_long sdp_sendspace = 1024*32; 107 u_long sdp_recvspace = 1024*64; 108 109 static int sdp_count; 110 111 /* 112 * Disable async. CMA events for sockets which are being torn down. 113 */ 114 static void 115 sdp_destroy_cma(struct sdp_sock *ssk) 116 { 117 118 if (ssk->id == NULL) 119 return; 120 rdma_destroy_id(ssk->id); 121 ssk->id = NULL; 122 } 123 124 static int 125 sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred) 126 { 127 struct sockaddr_in *sin; 128 struct sockaddr_in null; 129 int error; 130 131 SDP_WLOCK_ASSERT(ssk); 132 133 if (ssk->lport != 0 || ssk->laddr != INADDR_ANY) 134 return (EINVAL); 135 /* rdma_bind_addr handles bind races. */ 136 SDP_WUNLOCK(ssk); 137 if (ssk->id == NULL) 138 ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC); 139 if (ssk->id == NULL) { 140 SDP_WLOCK(ssk); 141 return (ENOMEM); 142 } 143 if (nam == NULL) { 144 null.sin_family = AF_INET; 145 null.sin_len = sizeof(null); 146 null.sin_addr.s_addr = INADDR_ANY; 147 null.sin_port = 0; 148 bzero(&null.sin_zero, sizeof(null.sin_zero)); 149 nam = (struct sockaddr *)&null; 150 } 151 error = -rdma_bind_addr(ssk->id, nam); 152 SDP_WLOCK(ssk); 153 if (error == 0) { 154 sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr; 155 ssk->laddr = sin->sin_addr.s_addr; 156 ssk->lport = sin->sin_port; 157 } else 158 sdp_destroy_cma(ssk); 159 return (error); 160 } 161 162 static void 163 sdp_pcbfree(struct sdp_sock *ssk) 164 { 165 166 KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk)); 167 KASSERT((ssk->flags & SDP_DESTROY) == 0, 168 ("ssk %p already destroyed", ssk)); 169 170 sdp_dbg(ssk->socket, "Freeing pcb"); 171 SDP_WLOCK_ASSERT(ssk); 172 ssk->flags |= SDP_DESTROY; 173 SDP_WUNLOCK(ssk); 174 SDP_LIST_WLOCK(); 175 sdp_count--; 176 LIST_REMOVE(ssk, list); 177 SDP_LIST_WUNLOCK(); 178 crfree(ssk->cred); 179 ssk->qp_active = 0; 180 if (ssk->qp) { 181 ib_destroy_qp(ssk->qp); 182 ssk->qp = NULL; 183 } 184 sdp_tx_ring_destroy(ssk); 185 sdp_rx_ring_destroy(ssk); 186 sdp_destroy_cma(ssk); 187 rw_destroy(&ssk->rx_ring.destroyed_lock); 188 rw_destroy(&ssk->lock); 189 uma_zfree(sdp_zone, ssk); 190 } 191 192 /* 193 * Common routines to return a socket address. 194 */ 195 static struct sockaddr * 196 sdp_sockaddr(in_port_t port, struct in_addr *addr_p) 197 { 198 struct sockaddr_in *sin; 199 200 sin = malloc(sizeof *sin, M_SONAME, 201 M_WAITOK | M_ZERO); 202 sin->sin_family = AF_INET; 203 sin->sin_len = sizeof(*sin); 204 sin->sin_addr = *addr_p; 205 sin->sin_port = port; 206 207 return (struct sockaddr *)sin; 208 } 209 210 static int 211 sdp_getsockaddr(struct socket *so, struct sockaddr **nam) 212 { 213 struct sdp_sock *ssk; 214 struct in_addr addr; 215 in_port_t port; 216 217 ssk = sdp_sk(so); 218 SDP_RLOCK(ssk); 219 port = ssk->lport; 220 addr.s_addr = ssk->laddr; 221 SDP_RUNLOCK(ssk); 222 223 *nam = sdp_sockaddr(port, &addr); 224 return 0; 225 } 226 227 static int 228 sdp_getpeeraddr(struct socket *so, struct sockaddr **nam) 229 { 230 struct sdp_sock *ssk; 231 struct in_addr addr; 232 in_port_t port; 233 234 ssk = sdp_sk(so); 235 SDP_RLOCK(ssk); 236 port = ssk->fport; 237 addr.s_addr = ssk->faddr; 238 SDP_RUNLOCK(ssk); 239 240 *nam = sdp_sockaddr(port, &addr); 241 return 0; 242 } 243 244 static void 245 sdp_pcbnotifyall(struct in_addr faddr, int errno, 246 struct sdp_sock *(*notify)(struct sdp_sock *, int)) 247 { 248 struct sdp_sock *ssk, *ssk_temp; 249 250 SDP_LIST_WLOCK(); 251 LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) { 252 SDP_WLOCK(ssk); 253 if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) { 254 SDP_WUNLOCK(ssk); 255 continue; 256 } 257 if ((ssk->flags & SDP_DESTROY) == 0) 258 if ((*notify)(ssk, errno)) 259 SDP_WUNLOCK(ssk); 260 } 261 SDP_LIST_WUNLOCK(); 262 } 263 264 #if 0 265 static void 266 sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg) 267 { 268 struct sdp_sock *ssk; 269 270 SDP_LIST_RLOCK(); 271 LIST_FOREACH(ssk, &sdp_list, list) { 272 SDP_WLOCK(ssk); 273 func(ssk, arg); 274 SDP_WUNLOCK(ssk); 275 } 276 SDP_LIST_RUNLOCK(); 277 } 278 #endif 279 280 static void 281 sdp_output_reset(struct sdp_sock *ssk) 282 { 283 struct rdma_cm_id *id; 284 285 SDP_WLOCK_ASSERT(ssk); 286 if (ssk->id) { 287 id = ssk->id; 288 ssk->qp_active = 0; 289 SDP_WUNLOCK(ssk); 290 rdma_disconnect(id); 291 SDP_WLOCK(ssk); 292 } 293 ssk->state = TCPS_CLOSED; 294 } 295 296 /* 297 * Attempt to close a SDP socket, marking it as dropped, and freeing 298 * the socket if we hold the only reference. 299 */ 300 static struct sdp_sock * 301 sdp_closed(struct sdp_sock *ssk) 302 { 303 struct socket *so; 304 305 SDP_WLOCK_ASSERT(ssk); 306 307 ssk->flags |= SDP_DROPPED; 308 so = ssk->socket; 309 soisdisconnected(so); 310 if (ssk->flags & SDP_SOCKREF) { 311 KASSERT(so->so_state & SS_PROTOREF, 312 ("sdp_closed: !SS_PROTOREF")); 313 ssk->flags &= ~SDP_SOCKREF; 314 SDP_WUNLOCK(ssk); 315 SOCK_LOCK(so); 316 so->so_state &= ~SS_PROTOREF; 317 sofree(so); 318 return (NULL); 319 } 320 return (ssk); 321 } 322 323 /* 324 * Perform timer based shutdowns which can not operate in 325 * callout context. 326 */ 327 static void 328 sdp_shutdown_task(void *data, int pending) 329 { 330 struct sdp_sock *ssk; 331 332 ssk = data; 333 SDP_WLOCK(ssk); 334 /* 335 * I don't think this can race with another call to pcbfree() 336 * because SDP_TIMEWAIT protects it. SDP_DESTROY may be redundant. 337 */ 338 if (ssk->flags & SDP_DESTROY) 339 panic("sdp_shutdown_task: Racing with pcbfree for ssk %p", 340 ssk); 341 if (ssk->flags & SDP_DISCON) 342 sdp_output_reset(ssk); 343 /* We have to clear this so sdp_detach() will call pcbfree(). */ 344 ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT); 345 if ((ssk->flags & SDP_DROPPED) == 0 && 346 sdp_closed(ssk) == NULL) 347 return; 348 if (ssk->socket == NULL) { 349 sdp_pcbfree(ssk); 350 return; 351 } 352 SDP_WUNLOCK(ssk); 353 } 354 355 /* 356 * 2msl has expired, schedule the shutdown task. 357 */ 358 static void 359 sdp_2msl_timeout(void *data) 360 { 361 struct sdp_sock *ssk; 362 363 ssk = data; 364 /* Callout canceled. */ 365 if (!callout_active(&ssk->keep2msl)) 366 goto out; 367 callout_deactivate(&ssk->keep2msl); 368 /* Should be impossible, defensive programming. */ 369 if ((ssk->flags & SDP_TIMEWAIT) == 0) 370 goto out; 371 taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task); 372 out: 373 SDP_WUNLOCK(ssk); 374 return; 375 } 376 377 /* 378 * Schedule the 2msl wait timer. 379 */ 380 static void 381 sdp_2msl_wait(struct sdp_sock *ssk) 382 { 383 384 SDP_WLOCK_ASSERT(ssk); 385 ssk->flags |= SDP_TIMEWAIT; 386 ssk->state = TCPS_TIME_WAIT; 387 soisdisconnected(ssk->socket); 388 callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk); 389 } 390 391 /* 392 * Timed out waiting for the final fin/ack from rdma_disconnect(). 393 */ 394 static void 395 sdp_dreq_timeout(void *data) 396 { 397 struct sdp_sock *ssk; 398 399 ssk = data; 400 /* Callout canceled. */ 401 if (!callout_active(&ssk->keep2msl)) 402 goto out; 403 /* Callout rescheduled, probably as a different timer. */ 404 if (callout_pending(&ssk->keep2msl)) 405 goto out; 406 callout_deactivate(&ssk->keep2msl); 407 if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK) 408 goto out; 409 if ((ssk->flags & SDP_DREQWAIT) == 0) 410 goto out; 411 ssk->flags &= ~SDP_DREQWAIT; 412 ssk->flags |= SDP_DISCON; 413 sdp_2msl_wait(ssk); 414 ssk->qp_active = 0; 415 out: 416 SDP_WUNLOCK(ssk); 417 } 418 419 /* 420 * Received the final fin/ack. Cancel the 2msl. 421 */ 422 void 423 sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk) 424 { 425 sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n"); 426 ssk->flags &= ~SDP_DREQWAIT; 427 sdp_2msl_wait(ssk); 428 } 429 430 static int 431 sdp_init_sock(struct socket *sk) 432 { 433 struct sdp_sock *ssk = sdp_sk(sk); 434 435 sdp_dbg(sk, "%s\n", __func__); 436 437 callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED); 438 TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk); 439 #ifdef SDP_ZCOPY 440 INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout); 441 ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */ 442 ssk->tx_ring.rdma_inflight = NULL; 443 #endif 444 atomic_set(&ssk->mseq_ack, 0); 445 sdp_rx_ring_init(ssk); 446 ssk->tx_ring.buffer = NULL; 447 448 return 0; 449 } 450 451 /* 452 * Allocate an sdp_sock for the socket and reserve socket buffer space. 453 */ 454 static int 455 sdp_attach(struct socket *so, int proto, struct thread *td) 456 { 457 struct sdp_sock *ssk; 458 int error; 459 460 ssk = sdp_sk(so); 461 KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so)); 462 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 463 error = soreserve(so, sdp_sendspace, sdp_recvspace); 464 if (error) 465 return (error); 466 } 467 so->so_rcv.sb_flags |= SB_AUTOSIZE; 468 so->so_snd.sb_flags |= SB_AUTOSIZE; 469 ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO); 470 if (ssk == NULL) 471 return (ENOBUFS); 472 rw_init(&ssk->lock, "sdpsock"); 473 ssk->socket = so; 474 ssk->cred = crhold(so->so_cred); 475 so->so_pcb = (caddr_t)ssk; 476 sdp_init_sock(so); 477 ssk->flags = 0; 478 ssk->qp_active = 0; 479 ssk->state = TCPS_CLOSED; 480 mbufq_init(&ssk->rxctlq, INT_MAX); 481 SDP_LIST_WLOCK(); 482 LIST_INSERT_HEAD(&sdp_list, ssk, list); 483 sdp_count++; 484 SDP_LIST_WUNLOCK(); 485 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 486 so->so_linger = TCP_LINGERTIME; 487 488 return (0); 489 } 490 491 /* 492 * Detach SDP from the socket, potentially leaving it around for the 493 * timewait to expire. 494 */ 495 static void 496 sdp_detach(struct socket *so) 497 { 498 struct sdp_sock *ssk; 499 500 ssk = sdp_sk(so); 501 SDP_WLOCK(ssk); 502 KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL")); 503 ssk->socket->so_pcb = NULL; 504 ssk->socket = NULL; 505 if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT)) 506 SDP_WUNLOCK(ssk); 507 else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT) 508 sdp_pcbfree(ssk); 509 else 510 panic("sdp_detach: Unexpected state, ssk %p.\n", ssk); 511 } 512 513 /* 514 * Allocate a local address for the socket. 515 */ 516 static int 517 sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 518 { 519 int error = 0; 520 struct sdp_sock *ssk; 521 struct sockaddr_in *sin; 522 523 sin = (struct sockaddr_in *)nam; 524 if (nam->sa_len != sizeof (*sin)) 525 return (EINVAL); 526 if (sin->sin_family != AF_INET) 527 return (EINVAL); 528 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 529 return (EAFNOSUPPORT); 530 531 ssk = sdp_sk(so); 532 SDP_WLOCK(ssk); 533 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 534 error = EINVAL; 535 goto out; 536 } 537 error = sdp_pcbbind(ssk, nam, td->td_ucred); 538 out: 539 SDP_WUNLOCK(ssk); 540 541 return (error); 542 } 543 544 /* 545 * Prepare to accept connections. 546 */ 547 static int 548 sdp_listen(struct socket *so, int backlog, struct thread *td) 549 { 550 int error = 0; 551 struct sdp_sock *ssk; 552 553 ssk = sdp_sk(so); 554 SDP_WLOCK(ssk); 555 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 556 error = EINVAL; 557 goto out; 558 } 559 if (error == 0 && ssk->lport == 0) 560 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 561 SOCK_LOCK(so); 562 if (error == 0) 563 error = solisten_proto_check(so); 564 if (error == 0) { 565 solisten_proto(so, backlog); 566 ssk->state = TCPS_LISTEN; 567 } 568 SOCK_UNLOCK(so); 569 570 out: 571 SDP_WUNLOCK(ssk); 572 if (error == 0) 573 error = -rdma_listen(ssk->id, backlog); 574 return (error); 575 } 576 577 /* 578 * Initiate a SDP connection to nam. 579 */ 580 static int 581 sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td) 582 { 583 struct sockaddr_in src; 584 struct socket *so; 585 int error; 586 587 so = ssk->socket; 588 589 SDP_WLOCK_ASSERT(ssk); 590 if (ssk->lport == 0) { 591 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 592 if (error) 593 return error; 594 } 595 src.sin_family = AF_INET; 596 src.sin_len = sizeof(src); 597 bzero(&src.sin_zero, sizeof(src.sin_zero)); 598 src.sin_port = ssk->lport; 599 src.sin_addr.s_addr = ssk->laddr; 600 soisconnecting(so); 601 SDP_WUNLOCK(ssk); 602 error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam, 603 SDP_RESOLVE_TIMEOUT); 604 SDP_WLOCK(ssk); 605 if (error == 0) 606 ssk->state = TCPS_SYN_SENT; 607 608 return 0; 609 } 610 611 /* 612 * Initiate SDP connection. 613 */ 614 static int 615 sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 616 { 617 int error = 0; 618 struct sdp_sock *ssk; 619 struct sockaddr_in *sin; 620 621 sin = (struct sockaddr_in *)nam; 622 if (nam->sa_len != sizeof (*sin)) 623 return (EINVAL); 624 if (sin->sin_family != AF_INET) 625 return (EINVAL); 626 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 627 return (EAFNOSUPPORT); 628 if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0) 629 return (error); 630 ssk = sdp_sk(so); 631 SDP_WLOCK(ssk); 632 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) 633 error = EINVAL; 634 else 635 error = sdp_start_connect(ssk, nam, td); 636 SDP_WUNLOCK(ssk); 637 return (error); 638 } 639 640 /* 641 * Drop a SDP socket, reporting 642 * the specified error. If connection is synchronized, 643 * then send a RST to peer. 644 */ 645 static struct sdp_sock * 646 sdp_drop(struct sdp_sock *ssk, int errno) 647 { 648 struct socket *so; 649 650 SDP_WLOCK_ASSERT(ssk); 651 so = ssk->socket; 652 if (TCPS_HAVERCVDSYN(ssk->state)) 653 sdp_output_reset(ssk); 654 if (errno == ETIMEDOUT && ssk->softerror) 655 errno = ssk->softerror; 656 so->so_error = errno; 657 return (sdp_closed(ssk)); 658 } 659 660 /* 661 * User issued close, and wish to trail through shutdown states: 662 * if never received SYN, just forget it. If got a SYN from peer, 663 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 664 * If already got a FIN from peer, then almost done; go to LAST_ACK 665 * state. In all other cases, have already sent FIN to peer (e.g. 666 * after PRU_SHUTDOWN), and just have to play tedious game waiting 667 * for peer to send FIN or not respond to keep-alives, etc. 668 * We can let the user exit from the close as soon as the FIN is acked. 669 */ 670 static void 671 sdp_usrclosed(struct sdp_sock *ssk) 672 { 673 674 SDP_WLOCK_ASSERT(ssk); 675 676 switch (ssk->state) { 677 case TCPS_LISTEN: 678 ssk->state = TCPS_CLOSED; 679 SDP_WUNLOCK(ssk); 680 sdp_destroy_cma(ssk); 681 SDP_WLOCK(ssk); 682 /* FALLTHROUGH */ 683 case TCPS_CLOSED: 684 ssk = sdp_closed(ssk); 685 /* 686 * sdp_closed() should never return NULL here as the socket is 687 * still open. 688 */ 689 KASSERT(ssk != NULL, 690 ("sdp_usrclosed: sdp_closed() returned NULL")); 691 break; 692 693 case TCPS_SYN_SENT: 694 /* FALLTHROUGH */ 695 case TCPS_SYN_RECEIVED: 696 ssk->flags |= SDP_NEEDFIN; 697 break; 698 699 case TCPS_ESTABLISHED: 700 ssk->flags |= SDP_NEEDFIN; 701 ssk->state = TCPS_FIN_WAIT_1; 702 break; 703 704 case TCPS_CLOSE_WAIT: 705 ssk->state = TCPS_LAST_ACK; 706 break; 707 } 708 if (ssk->state >= TCPS_FIN_WAIT_2) { 709 /* Prevent the connection hanging in FIN_WAIT_2 forever. */ 710 if (ssk->state == TCPS_FIN_WAIT_2) 711 sdp_2msl_wait(ssk); 712 else 713 soisdisconnected(ssk->socket); 714 } 715 } 716 717 static void 718 sdp_output_disconnect(struct sdp_sock *ssk) 719 { 720 721 SDP_WLOCK_ASSERT(ssk); 722 callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT, 723 sdp_dreq_timeout, ssk); 724 ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT; 725 sdp_post_sends(ssk, M_NOWAIT); 726 } 727 728 /* 729 * Initiate or continue a disconnect. 730 * If embryonic state, just send reset (once). 731 * If in ``let data drain'' option and linger null, just drop. 732 * Otherwise (hard), mark socket disconnecting and drop 733 * current input data; switch states based on user close, and 734 * send segment to peer (with FIN). 735 */ 736 static void 737 sdp_start_disconnect(struct sdp_sock *ssk) 738 { 739 struct socket *so; 740 int unread; 741 742 so = ssk->socket; 743 SDP_WLOCK_ASSERT(ssk); 744 sdp_stop_keepalive_timer(so); 745 /* 746 * Neither sdp_closed() nor sdp_drop() should return NULL, as the 747 * socket is still open. 748 */ 749 if (ssk->state < TCPS_ESTABLISHED) { 750 ssk = sdp_closed(ssk); 751 KASSERT(ssk != NULL, 752 ("sdp_start_disconnect: sdp_close() returned NULL")); 753 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 754 ssk = sdp_drop(ssk, 0); 755 KASSERT(ssk != NULL, 756 ("sdp_start_disconnect: sdp_drop() returned NULL")); 757 } else { 758 soisdisconnecting(so); 759 unread = sbused(&so->so_rcv); 760 sbflush(&so->so_rcv); 761 sdp_usrclosed(ssk); 762 if (!(ssk->flags & SDP_DROPPED)) { 763 if (unread) 764 sdp_output_reset(ssk); 765 else 766 sdp_output_disconnect(ssk); 767 } 768 } 769 } 770 771 /* 772 * User initiated disconnect. 773 */ 774 static int 775 sdp_disconnect(struct socket *so) 776 { 777 struct sdp_sock *ssk; 778 int error = 0; 779 780 ssk = sdp_sk(so); 781 SDP_WLOCK(ssk); 782 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 783 error = ECONNRESET; 784 goto out; 785 } 786 sdp_start_disconnect(ssk); 787 out: 788 SDP_WUNLOCK(ssk); 789 return (error); 790 } 791 792 /* 793 * Accept a connection. Essentially all the work is done at higher levels; 794 * just return the address of the peer, storing through addr. 795 * 796 * 797 * XXX This is broken XXX 798 * 799 * The rationale for acquiring the sdp lock here is somewhat complicated, 800 * and is described in detail in the commit log entry for r175612. Acquiring 801 * it delays an accept(2) racing with sonewconn(), which inserts the socket 802 * before the address/port fields are initialized. A better fix would 803 * prevent the socket from being placed in the listen queue until all fields 804 * are fully initialized. 805 */ 806 static int 807 sdp_accept(struct socket *so, struct sockaddr **nam) 808 { 809 struct sdp_sock *ssk = NULL; 810 struct in_addr addr; 811 in_port_t port; 812 int error; 813 814 if (so->so_state & SS_ISDISCONNECTED) 815 return (ECONNABORTED); 816 817 port = 0; 818 addr.s_addr = 0; 819 error = 0; 820 ssk = sdp_sk(so); 821 SDP_WLOCK(ssk); 822 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 823 error = ECONNABORTED; 824 goto out; 825 } 826 port = ssk->fport; 827 addr.s_addr = ssk->faddr; 828 out: 829 SDP_WUNLOCK(ssk); 830 if (error == 0) 831 *nam = sdp_sockaddr(port, &addr); 832 return error; 833 } 834 835 /* 836 * Mark the connection as being incapable of further output. 837 */ 838 static int 839 sdp_shutdown(struct socket *so) 840 { 841 int error = 0; 842 struct sdp_sock *ssk; 843 844 ssk = sdp_sk(so); 845 SDP_WLOCK(ssk); 846 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 847 error = ECONNRESET; 848 goto out; 849 } 850 socantsendmore(so); 851 sdp_usrclosed(ssk); 852 if (!(ssk->flags & SDP_DROPPED)) 853 sdp_output_disconnect(ssk); 854 855 out: 856 SDP_WUNLOCK(ssk); 857 858 return (error); 859 } 860 861 static void 862 sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt) 863 { 864 struct mbuf *n; 865 int ncnt; 866 867 SOCKBUF_LOCK_ASSERT(sb); 868 SBLASTRECORDCHK(sb); 869 KASSERT(mb->m_flags & M_PKTHDR, 870 ("sdp_append: %p Missing packet header.\n", mb)); 871 n = sb->sb_lastrecord; 872 /* 873 * If the queue is empty just set all pointers and proceed. 874 */ 875 if (n == NULL) { 876 sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb; 877 for (; mb; mb = mb->m_next) { 878 sb->sb_mbtail = mb; 879 sballoc(sb, mb); 880 } 881 return; 882 } 883 /* 884 * Count the number of mbufs in the current tail. 885 */ 886 for (ncnt = 0; n->m_next; n = n->m_next) 887 ncnt++; 888 n = sb->sb_lastrecord; 889 /* 890 * If the two chains can fit in a single sdp packet and 891 * the last record has not been sent yet (WRITABLE) coalesce 892 * them. The lastrecord remains the same but we must strip the 893 * packet header and then let sbcompress do the hard part. 894 */ 895 if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES && 896 n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE < 897 ssk->xmit_size_goal) { 898 m_adj(mb, SDP_HEAD_SIZE); 899 n->m_pkthdr.len += mb->m_pkthdr.len; 900 n->m_flags |= mb->m_flags & (M_PUSH | M_URG); 901 m_demote(mb, 1, 0); 902 sbcompress(sb, mb, sb->sb_mbtail); 903 return; 904 } 905 /* 906 * Not compressible, just append to the end and adjust counters. 907 */ 908 sb->sb_lastrecord->m_flags |= M_PUSH; 909 sb->sb_lastrecord->m_nextpkt = mb; 910 sb->sb_lastrecord = mb; 911 if (sb->sb_sndptr == NULL) 912 sb->sb_sndptr = mb; 913 for (; mb; mb = mb->m_next) { 914 sb->sb_mbtail = mb; 915 sballoc(sb, mb); 916 } 917 } 918 919 /* 920 * Do a send by putting data in output queue and updating urgent 921 * marker if URG set. Possibly send more data. Unlike the other 922 * pru_*() routines, the mbuf chains are our responsibility. We 923 * must either enqueue them or free them. The other pru_* routines 924 * generally are caller-frees. 925 * 926 * This comes from sendfile, normal sends will come from sdp_sosend(). 927 */ 928 static int 929 sdp_send(struct socket *so, int flags, struct mbuf *m, 930 struct sockaddr *nam, struct mbuf *control, struct thread *td) 931 { 932 struct sdp_sock *ssk; 933 struct mbuf *n; 934 int error; 935 int cnt; 936 937 error = 0; 938 ssk = sdp_sk(so); 939 KASSERT(m->m_flags & M_PKTHDR, 940 ("sdp_send: %p no packet header", m)); 941 M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK); 942 mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA; 943 for (n = m, cnt = 0; n->m_next; n = n->m_next) 944 cnt++; 945 if (cnt > SDP_MAX_SEND_SGES) { 946 n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES); 947 if (n == NULL) { 948 m_freem(m); 949 return (EMSGSIZE); 950 } 951 m = n; 952 for (cnt = 0; n->m_next; n = n->m_next) 953 cnt++; 954 } 955 SDP_WLOCK(ssk); 956 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 957 if (control) 958 m_freem(control); 959 if (m) 960 m_freem(m); 961 error = ECONNRESET; 962 goto out; 963 } 964 if (control) { 965 /* SDP doesn't support control messages. */ 966 if (control->m_len) { 967 m_freem(control); 968 if (m) 969 m_freem(m); 970 error = EINVAL; 971 goto out; 972 } 973 m_freem(control); /* empty control, just free it */ 974 } 975 if (!(flags & PRUS_OOB)) { 976 SOCKBUF_LOCK(&so->so_snd); 977 sdp_append(ssk, &so->so_snd, m, cnt); 978 SOCKBUF_UNLOCK(&so->so_snd); 979 if (nam && ssk->state < TCPS_SYN_SENT) { 980 /* 981 * Do implied connect if not yet connected. 982 */ 983 error = sdp_start_connect(ssk, nam, td); 984 if (error) 985 goto out; 986 } 987 if (flags & PRUS_EOF) { 988 /* 989 * Close the send side of the connection after 990 * the data is sent. 991 */ 992 socantsendmore(so); 993 sdp_usrclosed(ssk); 994 if (!(ssk->flags & SDP_DROPPED)) 995 sdp_output_disconnect(ssk); 996 } else if (!(ssk->flags & SDP_DROPPED) && 997 !(flags & PRUS_MORETOCOME)) 998 sdp_post_sends(ssk, M_NOWAIT); 999 SDP_WUNLOCK(ssk); 1000 return (0); 1001 } else { 1002 SOCKBUF_LOCK(&so->so_snd); 1003 if (sbspace(&so->so_snd) < -512) { 1004 SOCKBUF_UNLOCK(&so->so_snd); 1005 m_freem(m); 1006 error = ENOBUFS; 1007 goto out; 1008 } 1009 /* 1010 * According to RFC961 (Assigned Protocols), 1011 * the urgent pointer points to the last octet 1012 * of urgent data. We continue, however, 1013 * to consider it to indicate the first octet 1014 * of data past the urgent section. 1015 * Otherwise, snd_up should be one lower. 1016 */ 1017 m->m_flags |= M_URG | M_PUSH; 1018 sdp_append(ssk, &so->so_snd, m, cnt); 1019 SOCKBUF_UNLOCK(&so->so_snd); 1020 if (nam && ssk->state < TCPS_SYN_SENT) { 1021 /* 1022 * Do implied connect if not yet connected. 1023 */ 1024 error = sdp_start_connect(ssk, nam, td); 1025 if (error) 1026 goto out; 1027 } 1028 sdp_post_sends(ssk, M_NOWAIT); 1029 SDP_WUNLOCK(ssk); 1030 return (0); 1031 } 1032 out: 1033 SDP_WUNLOCK(ssk); 1034 return (error); 1035 } 1036 1037 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 1038 1039 /* 1040 * Send on a socket. If send must go all at once and message is larger than 1041 * send buffering, then hard error. Lock against other senders. If must go 1042 * all at once and not enough room now, then inform user that this would 1043 * block and do nothing. Otherwise, if nonblocking, send as much as 1044 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1045 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1046 * in mbuf chain must be small enough to send all at once. 1047 * 1048 * Returns nonzero on error, timeout or signal; callers must check for short 1049 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1050 * on return. 1051 */ 1052 static int 1053 sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1054 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1055 { 1056 struct sdp_sock *ssk; 1057 long space, resid; 1058 int atomic; 1059 int error; 1060 int copy; 1061 1062 if (uio != NULL) 1063 resid = uio->uio_resid; 1064 else 1065 resid = top->m_pkthdr.len; 1066 atomic = top != NULL; 1067 if (control != NULL) { 1068 if (control->m_len) { 1069 m_freem(control); 1070 if (top) 1071 m_freem(top); 1072 return (EINVAL); 1073 } 1074 m_freem(control); 1075 control = NULL; 1076 } 1077 /* 1078 * In theory resid should be unsigned. However, space must be 1079 * signed, as it might be less than 0 if we over-committed, and we 1080 * must use a signed comparison of space and resid. On the other 1081 * hand, a negative resid causes us to loop sending 0-length 1082 * segments to the protocol. 1083 * 1084 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1085 * type sockets since that's an error. 1086 */ 1087 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1088 error = EINVAL; 1089 goto out; 1090 } 1091 if (td != NULL) 1092 td->td_ru.ru_msgsnd++; 1093 1094 ssk = sdp_sk(so); 1095 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 1096 if (error) 1097 goto out; 1098 1099 restart: 1100 do { 1101 SOCKBUF_LOCK(&so->so_snd); 1102 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1103 SOCKBUF_UNLOCK(&so->so_snd); 1104 error = EPIPE; 1105 goto release; 1106 } 1107 if (so->so_error) { 1108 error = so->so_error; 1109 so->so_error = 0; 1110 SOCKBUF_UNLOCK(&so->so_snd); 1111 goto release; 1112 } 1113 if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) { 1114 SOCKBUF_UNLOCK(&so->so_snd); 1115 error = ENOTCONN; 1116 goto release; 1117 } 1118 space = sbspace(&so->so_snd); 1119 if (flags & MSG_OOB) 1120 space += 1024; 1121 if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) { 1122 SOCKBUF_UNLOCK(&so->so_snd); 1123 error = EMSGSIZE; 1124 goto release; 1125 } 1126 if (space < resid && 1127 (atomic || space < so->so_snd.sb_lowat)) { 1128 if ((so->so_state & SS_NBIO) || 1129 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 1130 SOCKBUF_UNLOCK(&so->so_snd); 1131 error = EWOULDBLOCK; 1132 goto release; 1133 } 1134 error = sbwait(&so->so_snd); 1135 SOCKBUF_UNLOCK(&so->so_snd); 1136 if (error) 1137 goto release; 1138 goto restart; 1139 } 1140 SOCKBUF_UNLOCK(&so->so_snd); 1141 do { 1142 if (uio == NULL) { 1143 resid = 0; 1144 if (flags & MSG_EOR) 1145 top->m_flags |= M_EOR; 1146 } else { 1147 /* 1148 * Copy the data from userland into a mbuf 1149 * chain. If no data is to be copied in, 1150 * a single empty mbuf is returned. 1151 */ 1152 copy = min(space, 1153 ssk->xmit_size_goal - SDP_HEAD_SIZE); 1154 top = m_uiotombuf(uio, M_WAITOK, copy, 1155 0, M_PKTHDR | 1156 ((flags & MSG_EOR) ? M_EOR : 0)); 1157 if (top == NULL) { 1158 /* only possible error */ 1159 error = EFAULT; 1160 goto release; 1161 } 1162 space -= resid - uio->uio_resid; 1163 resid = uio->uio_resid; 1164 } 1165 /* 1166 * XXX all the SBS_CANTSENDMORE checks previously 1167 * done could be out of date after dropping the 1168 * socket lock. 1169 */ 1170 error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB : 1171 /* 1172 * Set EOF on the last send if the user specified 1173 * MSG_EOF. 1174 */ 1175 ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF : 1176 /* If there is more to send set PRUS_MORETOCOME. */ 1177 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 1178 top, addr, NULL, td); 1179 top = NULL; 1180 if (error) 1181 goto release; 1182 } while (resid && space > 0); 1183 } while (resid); 1184 1185 release: 1186 sbunlock(&so->so_snd); 1187 out: 1188 if (top != NULL) 1189 m_freem(top); 1190 return (error); 1191 } 1192 1193 /* 1194 * The part of soreceive() that implements reading non-inline out-of-band 1195 * data from a socket. For more complete comments, see soreceive(), from 1196 * which this code originated. 1197 * 1198 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1199 * unable to return an mbuf chain to the caller. 1200 */ 1201 static int 1202 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1203 { 1204 struct protosw *pr = so->so_proto; 1205 struct mbuf *m; 1206 int error; 1207 1208 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1209 1210 m = m_get(M_WAITOK, MT_DATA); 1211 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 1212 if (error) 1213 goto bad; 1214 do { 1215 error = uiomove(mtod(m, void *), 1216 (int) min(uio->uio_resid, m->m_len), uio); 1217 m = m_free(m); 1218 } while (uio->uio_resid && error == 0 && m); 1219 bad: 1220 if (m != NULL) 1221 m_freem(m); 1222 return (error); 1223 } 1224 1225 /* 1226 * Optimized version of soreceive() for stream (TCP) sockets. 1227 */ 1228 static int 1229 sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio, 1230 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1231 { 1232 int len = 0, error = 0, flags, oresid; 1233 struct sockbuf *sb; 1234 struct mbuf *m, *n = NULL; 1235 struct sdp_sock *ssk; 1236 1237 /* We only do stream sockets. */ 1238 if (so->so_type != SOCK_STREAM) 1239 return (EINVAL); 1240 if (psa != NULL) 1241 *psa = NULL; 1242 if (controlp != NULL) 1243 return (EINVAL); 1244 if (flagsp != NULL) 1245 flags = *flagsp &~ MSG_EOR; 1246 else 1247 flags = 0; 1248 if (flags & MSG_OOB) 1249 return (soreceive_rcvoob(so, uio, flags)); 1250 if (mp0 != NULL) 1251 *mp0 = NULL; 1252 1253 sb = &so->so_rcv; 1254 ssk = sdp_sk(so); 1255 1256 /* Prevent other readers from entering the socket. */ 1257 error = sblock(sb, SBLOCKWAIT(flags)); 1258 if (error) 1259 goto out; 1260 SOCKBUF_LOCK(sb); 1261 1262 /* Easy one, no space to copyout anything. */ 1263 if (uio->uio_resid == 0) { 1264 error = EINVAL; 1265 goto out; 1266 } 1267 oresid = uio->uio_resid; 1268 1269 /* We will never ever get anything unless we are connected. */ 1270 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 1271 /* When disconnecting there may be still some data left. */ 1272 if (sbavail(sb)) 1273 goto deliver; 1274 if (!(so->so_state & SS_ISDISCONNECTED)) 1275 error = ENOTCONN; 1276 goto out; 1277 } 1278 1279 /* Socket buffer is empty and we shall not block. */ 1280 if (sbavail(sb) == 0 && 1281 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 1282 error = EAGAIN; 1283 goto out; 1284 } 1285 1286 restart: 1287 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1288 1289 /* Abort if socket has reported problems. */ 1290 if (so->so_error) { 1291 if (sbavail(sb)) 1292 goto deliver; 1293 if (oresid > uio->uio_resid) 1294 goto out; 1295 error = so->so_error; 1296 if (!(flags & MSG_PEEK)) 1297 so->so_error = 0; 1298 goto out; 1299 } 1300 1301 /* Door is closed. Deliver what is left, if any. */ 1302 if (sb->sb_state & SBS_CANTRCVMORE) { 1303 if (sbavail(sb)) 1304 goto deliver; 1305 else 1306 goto out; 1307 } 1308 1309 /* Socket buffer got some data that we shall deliver now. */ 1310 if (sbavail(sb) && !(flags & MSG_WAITALL) && 1311 ((so->so_state & SS_NBIO) || 1312 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 1313 sbavail(sb) >= sb->sb_lowat || 1314 sbavail(sb) >= uio->uio_resid || 1315 sbavail(sb) >= sb->sb_hiwat) ) { 1316 goto deliver; 1317 } 1318 1319 /* On MSG_WAITALL we must wait until all data or error arrives. */ 1320 if ((flags & MSG_WAITALL) && 1321 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat)) 1322 goto deliver; 1323 1324 /* 1325 * Wait and block until (more) data comes in. 1326 * NB: Drops the sockbuf lock during wait. 1327 */ 1328 error = sbwait(sb); 1329 if (error) 1330 goto out; 1331 goto restart; 1332 1333 deliver: 1334 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1335 KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__)); 1336 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 1337 1338 /* Statistics. */ 1339 if (uio->uio_td) 1340 uio->uio_td->td_ru.ru_msgrcv++; 1341 1342 /* Fill uio until full or current end of socket buffer is reached. */ 1343 len = min(uio->uio_resid, sbavail(sb)); 1344 if (mp0 != NULL) { 1345 /* Dequeue as many mbufs as possible. */ 1346 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 1347 for (*mp0 = m = sb->sb_mb; 1348 m != NULL && m->m_len <= len; 1349 m = m->m_next) { 1350 len -= m->m_len; 1351 uio->uio_resid -= m->m_len; 1352 sbfree(sb, m); 1353 n = m; 1354 } 1355 sb->sb_mb = m; 1356 if (sb->sb_mb == NULL) 1357 SB_EMPTY_FIXUP(sb); 1358 n->m_next = NULL; 1359 } 1360 /* Copy the remainder. */ 1361 if (len > 0) { 1362 KASSERT(sb->sb_mb != NULL, 1363 ("%s: len > 0 && sb->sb_mb empty", __func__)); 1364 1365 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 1366 if (m == NULL) 1367 len = 0; /* Don't flush data from sockbuf. */ 1368 else 1369 uio->uio_resid -= m->m_len; 1370 if (*mp0 != NULL) 1371 n->m_next = m; 1372 else 1373 *mp0 = m; 1374 if (*mp0 == NULL) { 1375 error = ENOBUFS; 1376 goto out; 1377 } 1378 } 1379 } else { 1380 /* NB: Must unlock socket buffer as uiomove may sleep. */ 1381 SOCKBUF_UNLOCK(sb); 1382 error = m_mbuftouio(uio, sb->sb_mb, len); 1383 SOCKBUF_LOCK(sb); 1384 if (error) 1385 goto out; 1386 } 1387 SBLASTRECORDCHK(sb); 1388 SBLASTMBUFCHK(sb); 1389 1390 /* 1391 * Remove the delivered data from the socket buffer unless we 1392 * were only peeking. 1393 */ 1394 if (!(flags & MSG_PEEK)) { 1395 if (len > 0) 1396 sbdrop_locked(sb, len); 1397 1398 /* Notify protocol that we drained some data. */ 1399 SOCKBUF_UNLOCK(sb); 1400 SDP_WLOCK(ssk); 1401 sdp_do_posts(ssk); 1402 SDP_WUNLOCK(ssk); 1403 SOCKBUF_LOCK(sb); 1404 } 1405 1406 /* 1407 * For MSG_WAITALL we may have to loop again and wait for 1408 * more data to come in. 1409 */ 1410 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 1411 goto restart; 1412 out: 1413 SOCKBUF_LOCK_ASSERT(sb); 1414 SBLASTRECORDCHK(sb); 1415 SBLASTMBUFCHK(sb); 1416 SOCKBUF_UNLOCK(sb); 1417 sbunlock(sb); 1418 return (error); 1419 } 1420 1421 /* 1422 * Abort is used to teardown a connection typically while sitting in 1423 * the accept queue. 1424 */ 1425 void 1426 sdp_abort(struct socket *so) 1427 { 1428 struct sdp_sock *ssk; 1429 1430 ssk = sdp_sk(so); 1431 SDP_WLOCK(ssk); 1432 /* 1433 * If we have not yet dropped, do it now. 1434 */ 1435 if (!(ssk->flags & SDP_TIMEWAIT) && 1436 !(ssk->flags & SDP_DROPPED)) 1437 sdp_drop(ssk, ECONNABORTED); 1438 KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X", 1439 ssk, ssk->flags)); 1440 SDP_WUNLOCK(ssk); 1441 } 1442 1443 /* 1444 * Close a SDP socket and initiate a friendly disconnect. 1445 */ 1446 static void 1447 sdp_close(struct socket *so) 1448 { 1449 struct sdp_sock *ssk; 1450 1451 ssk = sdp_sk(so); 1452 SDP_WLOCK(ssk); 1453 /* 1454 * If we have not yet dropped, do it now. 1455 */ 1456 if (!(ssk->flags & SDP_TIMEWAIT) && 1457 !(ssk->flags & SDP_DROPPED)) 1458 sdp_start_disconnect(ssk); 1459 1460 /* 1461 * If we've still not dropped let the socket layer know we're 1462 * holding on to the socket and pcb for a while. 1463 */ 1464 if (!(ssk->flags & SDP_DROPPED)) { 1465 SOCK_LOCK(so); 1466 so->so_state |= SS_PROTOREF; 1467 SOCK_UNLOCK(so); 1468 ssk->flags |= SDP_SOCKREF; 1469 } 1470 SDP_WUNLOCK(ssk); 1471 } 1472 1473 /* 1474 * User requests out-of-band data. 1475 */ 1476 static int 1477 sdp_rcvoob(struct socket *so, struct mbuf *m, int flags) 1478 { 1479 int error = 0; 1480 struct sdp_sock *ssk; 1481 1482 ssk = sdp_sk(so); 1483 SDP_WLOCK(ssk); 1484 if (!rx_ring_trylock(&ssk->rx_ring)) { 1485 SDP_WUNLOCK(ssk); 1486 return (ECONNRESET); 1487 } 1488 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1489 error = ECONNRESET; 1490 goto out; 1491 } 1492 if ((so->so_oobmark == 0 && 1493 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || 1494 so->so_options & SO_OOBINLINE || 1495 ssk->oobflags & SDP_HADOOB) { 1496 error = EINVAL; 1497 goto out; 1498 } 1499 if ((ssk->oobflags & SDP_HAVEOOB) == 0) { 1500 error = EWOULDBLOCK; 1501 goto out; 1502 } 1503 m->m_len = 1; 1504 *mtod(m, caddr_t) = ssk->iobc; 1505 if ((flags & MSG_PEEK) == 0) 1506 ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB); 1507 out: 1508 rx_ring_unlock(&ssk->rx_ring); 1509 SDP_WUNLOCK(ssk); 1510 return (error); 1511 } 1512 1513 void 1514 sdp_urg(struct sdp_sock *ssk, struct mbuf *mb) 1515 { 1516 struct mbuf *m; 1517 struct socket *so; 1518 1519 so = ssk->socket; 1520 if (so == NULL) 1521 return; 1522 1523 so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1; 1524 sohasoutofband(so); 1525 ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB); 1526 if (!(so->so_options & SO_OOBINLINE)) { 1527 for (m = mb; m->m_next != NULL; m = m->m_next); 1528 ssk->iobc = *(mtod(m, char *) + m->m_len - 1); 1529 ssk->oobflags |= SDP_HAVEOOB; 1530 m->m_len--; 1531 mb->m_pkthdr.len--; 1532 } 1533 } 1534 1535 /* 1536 * Notify a sdp socket of an asynchronous error. 1537 * 1538 * Do not wake up user since there currently is no mechanism for 1539 * reporting soft errors (yet - a kqueue filter may be added). 1540 */ 1541 struct sdp_sock * 1542 sdp_notify(struct sdp_sock *ssk, int error) 1543 { 1544 1545 SDP_WLOCK_ASSERT(ssk); 1546 1547 if ((ssk->flags & SDP_TIMEWAIT) || 1548 (ssk->flags & SDP_DROPPED)) 1549 return (ssk); 1550 1551 /* 1552 * Ignore some errors if we are hooked up. 1553 */ 1554 if (ssk->state == TCPS_ESTABLISHED && 1555 (error == EHOSTUNREACH || error == ENETUNREACH || 1556 error == EHOSTDOWN)) 1557 return (ssk); 1558 ssk->softerror = error; 1559 return sdp_drop(ssk, error); 1560 } 1561 1562 static void 1563 sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip) 1564 { 1565 struct in_addr faddr; 1566 1567 faddr = ((struct sockaddr_in *)sa)->sin_addr; 1568 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1569 return; 1570 1571 sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify); 1572 } 1573 1574 static int 1575 sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, 1576 struct thread *td) 1577 { 1578 return (EOPNOTSUPP); 1579 } 1580 1581 static void 1582 sdp_keepalive_timeout(void *data) 1583 { 1584 struct sdp_sock *ssk; 1585 1586 ssk = data; 1587 /* Callout canceled. */ 1588 if (!callout_active(&ssk->keep2msl)) 1589 return; 1590 /* Callout rescheduled as a different kind of timer. */ 1591 if (callout_pending(&ssk->keep2msl)) 1592 goto out; 1593 callout_deactivate(&ssk->keep2msl); 1594 if (ssk->flags & SDP_DROPPED || 1595 (ssk->socket->so_options & SO_KEEPALIVE) == 0) 1596 goto out; 1597 sdp_post_keepalive(ssk); 1598 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, 1599 sdp_keepalive_timeout, ssk); 1600 out: 1601 SDP_WUNLOCK(ssk); 1602 } 1603 1604 1605 void 1606 sdp_start_keepalive_timer(struct socket *so) 1607 { 1608 struct sdp_sock *ssk; 1609 1610 ssk = sdp_sk(so); 1611 if (!callout_pending(&ssk->keep2msl)) 1612 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, 1613 sdp_keepalive_timeout, ssk); 1614 } 1615 1616 static void 1617 sdp_stop_keepalive_timer(struct socket *so) 1618 { 1619 struct sdp_sock *ssk; 1620 1621 ssk = sdp_sk(so); 1622 callout_stop(&ssk->keep2msl); 1623 } 1624 1625 /* 1626 * sdp_ctloutput() must drop the inpcb lock before performing copyin on 1627 * socket option arguments. When it re-acquires the lock after the copy, it 1628 * has to revalidate that the connection is still valid for the socket 1629 * option. 1630 */ 1631 #define SDP_WLOCK_RECHECK(inp) do { \ 1632 SDP_WLOCK(ssk); \ 1633 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \ 1634 SDP_WUNLOCK(ssk); \ 1635 return (ECONNRESET); \ 1636 } \ 1637 } while(0) 1638 1639 static int 1640 sdp_ctloutput(struct socket *so, struct sockopt *sopt) 1641 { 1642 int error, opt, optval; 1643 struct sdp_sock *ssk; 1644 1645 error = 0; 1646 ssk = sdp_sk(so); 1647 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) { 1648 SDP_WLOCK(ssk); 1649 if (so->so_options & SO_KEEPALIVE) 1650 sdp_start_keepalive_timer(so); 1651 else 1652 sdp_stop_keepalive_timer(so); 1653 SDP_WUNLOCK(ssk); 1654 } 1655 if (sopt->sopt_level != IPPROTO_TCP) 1656 return (error); 1657 1658 SDP_WLOCK(ssk); 1659 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1660 SDP_WUNLOCK(ssk); 1661 return (ECONNRESET); 1662 } 1663 1664 switch (sopt->sopt_dir) { 1665 case SOPT_SET: 1666 switch (sopt->sopt_name) { 1667 case TCP_NODELAY: 1668 SDP_WUNLOCK(ssk); 1669 error = sooptcopyin(sopt, &optval, sizeof optval, 1670 sizeof optval); 1671 if (error) 1672 return (error); 1673 1674 SDP_WLOCK_RECHECK(ssk); 1675 opt = SDP_NODELAY; 1676 if (optval) 1677 ssk->flags |= opt; 1678 else 1679 ssk->flags &= ~opt; 1680 sdp_do_posts(ssk); 1681 SDP_WUNLOCK(ssk); 1682 break; 1683 1684 default: 1685 SDP_WUNLOCK(ssk); 1686 error = ENOPROTOOPT; 1687 break; 1688 } 1689 break; 1690 1691 case SOPT_GET: 1692 switch (sopt->sopt_name) { 1693 case TCP_NODELAY: 1694 optval = ssk->flags & SDP_NODELAY; 1695 SDP_WUNLOCK(ssk); 1696 error = sooptcopyout(sopt, &optval, sizeof optval); 1697 break; 1698 default: 1699 SDP_WUNLOCK(ssk); 1700 error = ENOPROTOOPT; 1701 break; 1702 } 1703 break; 1704 } 1705 return (error); 1706 } 1707 #undef SDP_WLOCK_RECHECK 1708 1709 int sdp_mod_count = 0; 1710 int sdp_mod_usec = 0; 1711 1712 void 1713 sdp_set_default_moderation(struct sdp_sock *ssk) 1714 { 1715 if (sdp_mod_count <= 0 || sdp_mod_usec <= 0) 1716 return; 1717 ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec); 1718 } 1719 1720 static void 1721 sdp_dev_add(struct ib_device *device) 1722 { 1723 struct ib_fmr_pool_param param; 1724 struct sdp_device *sdp_dev; 1725 1726 sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO); 1727 sdp_dev->pd = ib_alloc_pd(device, 0); 1728 if (IS_ERR(sdp_dev->pd)) 1729 goto out_pd; 1730 memset(¶m, 0, sizeof param); 1731 param.max_pages_per_fmr = SDP_FMR_SIZE; 1732 param.page_shift = PAGE_SHIFT; 1733 param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ); 1734 param.pool_size = SDP_FMR_POOL_SIZE; 1735 param.dirty_watermark = SDP_FMR_DIRTY_SIZE; 1736 param.cache = 1; 1737 sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, ¶m); 1738 if (IS_ERR(sdp_dev->fmr_pool)) 1739 goto out_fmr; 1740 ib_set_client_data(device, &sdp_client, sdp_dev); 1741 return; 1742 1743 out_fmr: 1744 ib_dealloc_pd(sdp_dev->pd); 1745 out_pd: 1746 free(sdp_dev, M_SDP); 1747 } 1748 1749 static void 1750 sdp_dev_rem(struct ib_device *device, void *client_data) 1751 { 1752 struct sdp_device *sdp_dev; 1753 struct sdp_sock *ssk; 1754 1755 SDP_LIST_WLOCK(); 1756 LIST_FOREACH(ssk, &sdp_list, list) { 1757 if (ssk->ib_device != device) 1758 continue; 1759 SDP_WLOCK(ssk); 1760 if ((ssk->flags & SDP_DESTROY) == 0) 1761 ssk = sdp_notify(ssk, ECONNRESET); 1762 if (ssk) 1763 SDP_WUNLOCK(ssk); 1764 } 1765 SDP_LIST_WUNLOCK(); 1766 /* 1767 * XXX Do I need to wait between these two? 1768 */ 1769 sdp_dev = ib_get_client_data(device, &sdp_client); 1770 if (!sdp_dev) 1771 return; 1772 ib_flush_fmr_pool(sdp_dev->fmr_pool); 1773 ib_destroy_fmr_pool(sdp_dev->fmr_pool); 1774 ib_dealloc_pd(sdp_dev->pd); 1775 free(sdp_dev, M_SDP); 1776 } 1777 1778 struct ib_client sdp_client = 1779 { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem }; 1780 1781 1782 static int 1783 sdp_pcblist(SYSCTL_HANDLER_ARGS) 1784 { 1785 int error, n, i; 1786 struct sdp_sock *ssk; 1787 struct xinpgen xig; 1788 1789 /* 1790 * The process of preparing the TCB list is too time-consuming and 1791 * resource-intensive to repeat twice on every request. 1792 */ 1793 if (req->oldptr == NULL) { 1794 n = sdp_count; 1795 n += imax(n / 8, 10); 1796 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); 1797 return (0); 1798 } 1799 1800 if (req->newptr != NULL) 1801 return (EPERM); 1802 1803 /* 1804 * OK, now we're committed to doing something. 1805 */ 1806 SDP_LIST_RLOCK(); 1807 n = sdp_count; 1808 SDP_LIST_RUNLOCK(); 1809 1810 error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) 1811 + n * sizeof(struct xtcpcb)); 1812 if (error != 0) 1813 return (error); 1814 1815 bzero(&xig, sizeof(xig)); 1816 xig.xig_len = sizeof xig; 1817 xig.xig_count = n; 1818 xig.xig_gen = 0; 1819 xig.xig_sogen = so_gencnt; 1820 error = SYSCTL_OUT(req, &xig, sizeof xig); 1821 if (error) 1822 return (error); 1823 1824 SDP_LIST_RLOCK(); 1825 for (ssk = LIST_FIRST(&sdp_list), i = 0; 1826 ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) { 1827 struct xtcpcb xt; 1828 1829 SDP_RLOCK(ssk); 1830 if (ssk->flags & SDP_TIMEWAIT) { 1831 if (ssk->cred != NULL) 1832 error = cr_cansee(req->td->td_ucred, 1833 ssk->cred); 1834 else 1835 error = EINVAL; /* Skip this inp. */ 1836 } else if (ssk->socket) 1837 error = cr_canseesocket(req->td->td_ucred, 1838 ssk->socket); 1839 else 1840 error = EINVAL; 1841 if (error) { 1842 error = 0; 1843 goto next; 1844 } 1845 1846 bzero(&xt, sizeof(xt)); 1847 xt.xt_len = sizeof xt; 1848 xt.xt_inp.inp_gencnt = 0; 1849 xt.xt_inp.inp_vflag = INP_IPV4; 1850 memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr)); 1851 xt.xt_inp.inp_lport = ssk->lport; 1852 memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr)); 1853 xt.xt_inp.inp_fport = ssk->fport; 1854 xt.t_state = ssk->state; 1855 if (ssk->socket != NULL) 1856 sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket); 1857 xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; 1858 SDP_RUNLOCK(ssk); 1859 error = SYSCTL_OUT(req, &xt, sizeof xt); 1860 if (error) 1861 break; 1862 i++; 1863 continue; 1864 next: 1865 SDP_RUNLOCK(ssk); 1866 } 1867 if (!error) { 1868 /* 1869 * Give the user an updated idea of our state. 1870 * If the generation differs from what we told 1871 * her before, she knows that something happened 1872 * while we were processing this request, and it 1873 * might be necessary to retry. 1874 */ 1875 xig.xig_gen = 0; 1876 xig.xig_sogen = so_gencnt; 1877 xig.xig_count = sdp_count; 1878 error = SYSCTL_OUT(req, &xig, sizeof xig); 1879 } 1880 SDP_LIST_RUNLOCK(); 1881 return (error); 1882 } 1883 1884 SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP"); 1885 1886 SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist, 1887 CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb", 1888 "List of active SDP connections"); 1889 1890 static void 1891 sdp_zone_change(void *tag) 1892 { 1893 1894 uma_zone_set_max(sdp_zone, maxsockets); 1895 } 1896 1897 static void 1898 sdp_init(void) 1899 { 1900 1901 LIST_INIT(&sdp_list); 1902 sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock), 1903 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1904 uma_zone_set_max(sdp_zone, maxsockets); 1905 EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL, 1906 EVENTHANDLER_PRI_ANY); 1907 rx_comp_wq = create_singlethread_workqueue("rx_comp_wq"); 1908 ib_register_client(&sdp_client); 1909 } 1910 1911 extern struct domain sdpdomain; 1912 1913 struct pr_usrreqs sdp_usrreqs = { 1914 .pru_abort = sdp_abort, 1915 .pru_accept = sdp_accept, 1916 .pru_attach = sdp_attach, 1917 .pru_bind = sdp_bind, 1918 .pru_connect = sdp_connect, 1919 .pru_control = sdp_control, 1920 .pru_detach = sdp_detach, 1921 .pru_disconnect = sdp_disconnect, 1922 .pru_listen = sdp_listen, 1923 .pru_peeraddr = sdp_getpeeraddr, 1924 .pru_rcvoob = sdp_rcvoob, 1925 .pru_send = sdp_send, 1926 .pru_sosend = sdp_sosend, 1927 .pru_soreceive = sdp_sorecv, 1928 .pru_shutdown = sdp_shutdown, 1929 .pru_sockaddr = sdp_getsockaddr, 1930 .pru_close = sdp_close, 1931 }; 1932 1933 struct protosw sdpsw[] = { 1934 { 1935 .pr_type = SOCK_STREAM, 1936 .pr_domain = &sdpdomain, 1937 .pr_protocol = IPPROTO_IP, 1938 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, 1939 .pr_ctlinput = sdp_ctlinput, 1940 .pr_ctloutput = sdp_ctloutput, 1941 .pr_usrreqs = &sdp_usrreqs 1942 }, 1943 { 1944 .pr_type = SOCK_STREAM, 1945 .pr_domain = &sdpdomain, 1946 .pr_protocol = IPPROTO_TCP, 1947 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, 1948 .pr_ctlinput = sdp_ctlinput, 1949 .pr_ctloutput = sdp_ctloutput, 1950 .pr_usrreqs = &sdp_usrreqs 1951 }, 1952 }; 1953 1954 struct domain sdpdomain = { 1955 .dom_family = AF_INET_SDP, 1956 .dom_name = "SDP", 1957 .dom_init = sdp_init, 1958 .dom_protosw = sdpsw, 1959 .dom_protoswNPROTOSW = &sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])], 1960 }; 1961 1962 DOMAIN_SET(sdp); 1963 1964 int sdp_debug_level = 1; 1965 int sdp_data_debug_level = 0; 1966