/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int	sdp_count;

/*
 * Disable async. CMA events for sockets which are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk,
		    RDMA_PS_SDP, IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

static void
sdp_pcbfree(struct sdp_sock *ssk)
{

	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
	KASSERT((ssk->flags & SDP_DESTROY) == 0,
	    ("ssk %p already destroyed", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	sdp_destroy_cma(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof *sin, M_SONAME,
	    M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return (struct sockaddr *)sin;
}

static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return 0;
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return 0;
}

static void
sdp_pcbnotifyall(struct in_addr faddr, int errno,
    struct sdp_sock *(*notify)(struct sdp_sock *, int))
{
	struct sdp_sock *ssk, *ssk_temp;

	SDP_LIST_WLOCK();
	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
		SDP_WLOCK(ssk);
		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
			SDP_WUNLOCK(ssk);
			continue;
		}
		if ((ssk->flags & SDP_DESTROY) == 0)
			if ((*notify)(ssk, errno))
				SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close a SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		KASSERT(so->so_state & SS_PROTOREF,
		    ("sdp_closed: !SS_PROTOREF"));
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		SOCK_LOCK(so);
		so->so_state &= ~SS_PROTOREF;
		sofree(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer based shutdowns which can not operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * 2msl has expired, schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the 2msl.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return 0;
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	mbufq_init(&ssk->rxctlq, INT_MAX);
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate a SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return error;
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return 0;
}

/*
 * Initiate SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop a SDP socket, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return error;
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
	    ("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile, normal sends will come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * Set EOF on the last send if the user specified
			     * MSG_EOF.
			     */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		goto out;
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

/*
 * Abort is used to teardown a connection typically while sitting in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close a SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		SOCK_LOCK(so);
		so->so_state |= SS_PROTOREF;
		SOCK_UNLOCK(so);
		ssk->flags |= SDP_SOCKREF;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify a sdp socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return sdp_drop(ssk, error);
}

static void
sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct in_addr faddr;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
}

static int
sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
    struct thread *td)
{
	return (EOPNOTSUPP);
}

static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		return;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}


void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the inpcb lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 */
#define	SDP_WLOCK_RECHECK(inp) do {					\
	SDP_WLOCK(ssk);							\
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while(0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
}

static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device, 0);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

static void
sdp_dev_rem(struct ib_device *device, void *client_data)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };


static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

static void
sdp_init(void)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}

extern struct domain sdpdomain;

struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};

struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};

struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;