1 2 /*- 3 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 4 * The Regents of the University of California. All rights reserved. 5 * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved. 6 * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c 33 */ 34 35 /* 36 * 37 * Copyright (c) 2010 Isilon Systems, Inc. 38 * Copyright (c) 2010 iX Systems, Inc. 39 * Copyright (c) 2010 Panasas, Inc. 40 * All rights reserved. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice unmodified, this list of conditions, and the following 47 * disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 53 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 54 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 55 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 56 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 57 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 61 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 62 * 63 */ 64 #include <sys/cdefs.h> 65 __FBSDID("$FreeBSD$"); 66 67 #include "sdp.h" 68 69 #include <net/if.h> 70 #include <net/route.h> 71 #include <net/vnet.h> 72 73 uma_zone_t sdp_zone; 74 struct rwlock sdp_lock; 75 LIST_HEAD(, sdp_sock) sdp_list; 76 77 struct workqueue_struct *rx_comp_wq; 78 79 RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock"); 80 #define SDP_LIST_WLOCK() rw_wlock(&sdp_lock) 81 #define SDP_LIST_RLOCK() rw_rlock(&sdp_lock) 82 #define SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock) 83 #define SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock) 84 #define SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED) 85 #define SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED) 86 #define SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED) 87 88 static MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol"); 89 90 static void sdp_stop_keepalive_timer(struct socket *so); 91 92 /* 93 * SDP protocol interface to socket abstraction. 94 */ 95 /* 96 * sdp_sendspace and sdp_recvspace are the default send and receive window 97 * sizes, respectively. 98 */ 99 u_long sdp_sendspace = 1024*32; 100 u_long sdp_recvspace = 1024*64; 101 102 static int sdp_count; 103 104 /* 105 * Disable async. CMA events for sockets which are being torn down. 106 */ 107 static void 108 sdp_destroy_cma(struct sdp_sock *ssk) 109 { 110 111 if (ssk->id == NULL) 112 return; 113 rdma_destroy_id(ssk->id); 114 ssk->id = NULL; 115 } 116 117 static int 118 sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred) 119 { 120 struct sockaddr_in *sin; 121 struct sockaddr_in null; 122 int error; 123 124 SDP_WLOCK_ASSERT(ssk); 125 126 if (ssk->lport != 0 || ssk->laddr != INADDR_ANY) 127 return (EINVAL); 128 /* rdma_bind_addr handles bind races. */ 129 SDP_WUNLOCK(ssk); 130 if (ssk->id == NULL) 131 ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP); 132 if (ssk->id == NULL) { 133 SDP_WLOCK(ssk); 134 return (ENOMEM); 135 } 136 if (nam == NULL) { 137 null.sin_family = AF_INET; 138 null.sin_len = sizeof(null); 139 null.sin_addr.s_addr = INADDR_ANY; 140 null.sin_port = 0; 141 bzero(&null.sin_zero, sizeof(null.sin_zero)); 142 nam = (struct sockaddr *)&null; 143 } 144 error = -rdma_bind_addr(ssk->id, nam); 145 SDP_WLOCK(ssk); 146 if (error == 0) { 147 sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr; 148 ssk->laddr = sin->sin_addr.s_addr; 149 ssk->lport = sin->sin_port; 150 } else 151 sdp_destroy_cma(ssk); 152 return (error); 153 } 154 155 static void 156 sdp_pcbfree(struct sdp_sock *ssk) 157 { 158 KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk)); 159 160 sdp_dbg(ssk->socket, "Freeing pcb"); 161 SDP_WLOCK_ASSERT(ssk); 162 ssk->flags |= SDP_DESTROY; 163 SDP_WUNLOCK(ssk); 164 SDP_LIST_WLOCK(); 165 sdp_count--; 166 LIST_REMOVE(ssk, list); 167 SDP_LIST_WUNLOCK(); 168 crfree(ssk->cred); 169 sdp_destroy_cma(ssk); 170 ssk->qp_active = 0; 171 if (ssk->qp) { 172 ib_destroy_qp(ssk->qp); 173 ssk->qp = NULL; 174 } 175 sdp_tx_ring_destroy(ssk); 176 sdp_rx_ring_destroy(ssk); 177 rw_destroy(&ssk->rx_ring.destroyed_lock); 178 uma_zfree(sdp_zone, ssk); 179 rw_destroy(&ssk->lock); 180 } 181 182 /* 183 * Common routines to return a socket address. 184 */ 185 static struct sockaddr * 186 sdp_sockaddr(in_port_t port, struct in_addr *addr_p) 187 { 188 struct sockaddr_in *sin; 189 190 sin = malloc(sizeof *sin, M_SONAME, 191 M_WAITOK | M_ZERO); 192 sin->sin_family = AF_INET; 193 sin->sin_len = sizeof(*sin); 194 sin->sin_addr = *addr_p; 195 sin->sin_port = port; 196 197 return (struct sockaddr *)sin; 198 } 199 200 static int 201 sdp_getsockaddr(struct socket *so, struct sockaddr **nam) 202 { 203 struct sdp_sock *ssk; 204 struct in_addr addr; 205 in_port_t port; 206 207 ssk = sdp_sk(so); 208 SDP_RLOCK(ssk); 209 port = ssk->lport; 210 addr.s_addr = ssk->laddr; 211 SDP_RUNLOCK(ssk); 212 213 *nam = sdp_sockaddr(port, &addr); 214 return 0; 215 } 216 217 static int 218 sdp_getpeeraddr(struct socket *so, struct sockaddr **nam) 219 { 220 struct sdp_sock *ssk; 221 struct in_addr addr; 222 in_port_t port; 223 224 ssk = sdp_sk(so); 225 SDP_RLOCK(ssk); 226 port = ssk->fport; 227 addr.s_addr = ssk->faddr; 228 SDP_RUNLOCK(ssk); 229 230 *nam = sdp_sockaddr(port, &addr); 231 return 0; 232 } 233 234 static void 235 sdp_pcbnotifyall(struct in_addr faddr, int errno, 236 struct sdp_sock *(*notify)(struct sdp_sock *, int)) 237 { 238 struct sdp_sock *ssk, *ssk_temp; 239 240 SDP_LIST_WLOCK(); 241 LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) { 242 SDP_WLOCK(ssk); 243 if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) { 244 SDP_WUNLOCK(ssk); 245 continue; 246 } 247 if ((ssk->flags & SDP_DESTROY) == 0) 248 if ((*notify)(ssk, errno)) 249 SDP_WUNLOCK(ssk); 250 } 251 SDP_LIST_WUNLOCK(); 252 } 253 254 #if 0 255 static void 256 sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg) 257 { 258 struct sdp_sock *ssk; 259 260 SDP_LIST_RLOCK(); 261 LIST_FOREACH(ssk, &sdp_list, list) { 262 SDP_WLOCK(ssk); 263 func(ssk, arg); 264 SDP_WUNLOCK(ssk); 265 } 266 SDP_LIST_RUNLOCK(); 267 } 268 #endif 269 270 static void 271 sdp_output_reset(struct sdp_sock *ssk) 272 { 273 struct rdma_cm_id *id; 274 275 SDP_WLOCK_ASSERT(ssk); 276 if (ssk->id) { 277 id = ssk->id; 278 ssk->qp_active = 0; 279 SDP_WUNLOCK(ssk); 280 rdma_disconnect(id); 281 SDP_WLOCK(ssk); 282 } 283 ssk->state = TCPS_CLOSED; 284 } 285 286 /* 287 * Attempt to close a SDP socket, marking it as dropped, and freeing 288 * the socket if we hold the only reference. 289 */ 290 static struct sdp_sock * 291 sdp_closed(struct sdp_sock *ssk) 292 { 293 struct socket *so; 294 295 SDP_WLOCK_ASSERT(ssk); 296 297 ssk->flags |= SDP_DROPPED; 298 so = ssk->socket; 299 soisdisconnected(so); 300 if (ssk->flags & SDP_SOCKREF) { 301 KASSERT(so->so_state & SS_PROTOREF, 302 ("sdp_closed: !SS_PROTOREF")); 303 ssk->flags &= ~SDP_SOCKREF; 304 SDP_WUNLOCK(ssk); 305 ACCEPT_LOCK(); 306 SOCK_LOCK(so); 307 so->so_state &= ~SS_PROTOREF; 308 sofree(so); 309 return (NULL); 310 } 311 return (ssk); 312 } 313 314 /* 315 * Perform timer based shutdowns which can not operate in 316 * callout context. 317 */ 318 static void 319 sdp_shutdown_task(void *data, int pending) 320 { 321 struct sdp_sock *ssk; 322 323 ssk = data; 324 SDP_WLOCK(ssk); 325 /* 326 * I don't think this can race with another call to pcbfree() 327 * because SDP_TIMEWAIT protects it. SDP_DESTROY may be redundant. 328 */ 329 if (ssk->flags & SDP_DESTROY) 330 panic("sdp_shutdown_task: Racing with pcbfree for ssk %p", 331 ssk); 332 if (ssk->flags & SDP_DISCON) 333 sdp_output_reset(ssk); 334 /* We have to clear this so sdp_detach() will call pcbfree(). */ 335 ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT); 336 if ((ssk->flags & SDP_DROPPED) == 0 && 337 sdp_closed(ssk) == NULL) 338 return; 339 if (ssk->socket == NULL) { 340 sdp_pcbfree(ssk); 341 return; 342 } 343 SDP_WUNLOCK(ssk); 344 } 345 346 /* 347 * 2msl has expired, schedule the shutdown task. 348 */ 349 static void 350 sdp_2msl_timeout(void *data) 351 { 352 struct sdp_sock *ssk; 353 354 ssk = data; 355 /* Callout canceled. */ 356 if (!callout_active(&ssk->keep2msl)) 357 goto out; 358 callout_deactivate(&ssk->keep2msl); 359 /* Should be impossible, defensive programming. */ 360 if ((ssk->flags & SDP_TIMEWAIT) == 0) 361 goto out; 362 taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task); 363 out: 364 SDP_WUNLOCK(ssk); 365 return; 366 } 367 368 /* 369 * Schedule the 2msl wait timer. 370 */ 371 static void 372 sdp_2msl_wait(struct sdp_sock *ssk) 373 { 374 375 SDP_WLOCK_ASSERT(ssk); 376 ssk->flags |= SDP_TIMEWAIT; 377 ssk->state = TCPS_TIME_WAIT; 378 soisdisconnected(ssk->socket); 379 callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk); 380 } 381 382 /* 383 * Timed out waiting for the final fin/ack from rdma_disconnect(). 384 */ 385 static void 386 sdp_dreq_timeout(void *data) 387 { 388 struct sdp_sock *ssk; 389 390 ssk = data; 391 /* Callout canceled. */ 392 if (!callout_active(&ssk->keep2msl)) 393 goto out; 394 /* Callout rescheduled, probably as a different timer. */ 395 if (callout_pending(&ssk->keep2msl)) 396 goto out; 397 callout_deactivate(&ssk->keep2msl); 398 if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK) 399 goto out; 400 if ((ssk->flags & SDP_DREQWAIT) == 0) 401 goto out; 402 ssk->flags &= ~SDP_DREQWAIT; 403 ssk->flags |= SDP_DISCON; 404 sdp_2msl_wait(ssk); 405 ssk->qp_active = 0; 406 out: 407 SDP_WUNLOCK(ssk); 408 } 409 410 /* 411 * Received the final fin/ack. Cancel the 2msl. 412 */ 413 void 414 sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk) 415 { 416 sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n"); 417 ssk->flags &= ~SDP_DREQWAIT; 418 sdp_2msl_wait(ssk); 419 } 420 421 static int 422 sdp_init_sock(struct socket *sk) 423 { 424 struct sdp_sock *ssk = sdp_sk(sk); 425 426 sdp_dbg(sk, "%s\n", __func__); 427 428 callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED); 429 TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk); 430 #ifdef SDP_ZCOPY 431 INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout); 432 ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */ 433 ssk->tx_ring.rdma_inflight = NULL; 434 #endif 435 atomic_set(&ssk->mseq_ack, 0); 436 sdp_rx_ring_init(ssk); 437 ssk->tx_ring.buffer = NULL; 438 439 return 0; 440 } 441 442 /* 443 * Allocate an sdp_sock for the socket and reserve socket buffer space. 444 */ 445 static int 446 sdp_attach(struct socket *so, int proto, struct thread *td) 447 { 448 struct sdp_sock *ssk; 449 int error; 450 451 ssk = sdp_sk(so); 452 KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so)); 453 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 454 error = soreserve(so, sdp_sendspace, sdp_recvspace); 455 if (error) 456 return (error); 457 } 458 so->so_rcv.sb_flags |= SB_AUTOSIZE; 459 so->so_snd.sb_flags |= SB_AUTOSIZE; 460 ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO); 461 if (ssk == NULL) 462 return (ENOBUFS); 463 rw_init(&ssk->lock, "sdpsock"); 464 ssk->socket = so; 465 ssk->cred = crhold(so->so_cred); 466 so->so_pcb = (caddr_t)ssk; 467 sdp_init_sock(so); 468 ssk->flags = 0; 469 ssk->qp_active = 0; 470 ssk->state = TCPS_CLOSED; 471 SDP_LIST_WLOCK(); 472 LIST_INSERT_HEAD(&sdp_list, ssk, list); 473 sdp_count++; 474 SDP_LIST_WUNLOCK(); 475 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 476 so->so_linger = TCP_LINGERTIME; 477 478 return (0); 479 } 480 481 /* 482 * Detach SDP from the socket, potentially leaving it around for the 483 * timewait to expire. 484 */ 485 static void 486 sdp_detach(struct socket *so) 487 { 488 struct sdp_sock *ssk; 489 490 ssk = sdp_sk(so); 491 SDP_WLOCK(ssk); 492 KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL")); 493 ssk->socket->so_pcb = NULL; 494 ssk->socket = NULL; 495 if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT)) 496 SDP_WUNLOCK(ssk); 497 else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT) 498 sdp_pcbfree(ssk); 499 else 500 panic("sdp_detach: Unexpected state, ssk %p.\n", ssk); 501 } 502 503 /* 504 * Allocate a local address for the socket. 505 */ 506 static int 507 sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 508 { 509 int error = 0; 510 struct sdp_sock *ssk; 511 struct sockaddr_in *sin; 512 513 sin = (struct sockaddr_in *)nam; 514 if (nam->sa_len != sizeof (*sin)) 515 return (EINVAL); 516 if (sin->sin_family != AF_INET) 517 return (EINVAL); 518 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 519 return (EAFNOSUPPORT); 520 521 ssk = sdp_sk(so); 522 SDP_WLOCK(ssk); 523 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 524 error = EINVAL; 525 goto out; 526 } 527 error = sdp_pcbbind(ssk, nam, td->td_ucred); 528 out: 529 SDP_WUNLOCK(ssk); 530 531 return (error); 532 } 533 534 /* 535 * Prepare to accept connections. 536 */ 537 static int 538 sdp_listen(struct socket *so, int backlog, struct thread *td) 539 { 540 int error = 0; 541 struct sdp_sock *ssk; 542 543 ssk = sdp_sk(so); 544 SDP_WLOCK(ssk); 545 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 546 error = EINVAL; 547 goto out; 548 } 549 if (error == 0 && ssk->lport == 0) 550 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 551 SOCK_LOCK(so); 552 if (error == 0) 553 error = solisten_proto_check(so); 554 if (error == 0) { 555 solisten_proto(so, backlog); 556 ssk->state = TCPS_LISTEN; 557 } 558 SOCK_UNLOCK(so); 559 560 out: 561 SDP_WUNLOCK(ssk); 562 if (error == 0) 563 error = -rdma_listen(ssk->id, backlog); 564 return (error); 565 } 566 567 /* 568 * Initiate a SDP connection to nam. 569 */ 570 static int 571 sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td) 572 { 573 struct sockaddr_in src; 574 struct socket *so; 575 int error; 576 577 so = ssk->socket; 578 579 SDP_WLOCK_ASSERT(ssk); 580 if (ssk->lport == 0) { 581 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 582 if (error) 583 return error; 584 } 585 src.sin_family = AF_INET; 586 src.sin_len = sizeof(src); 587 bzero(&src.sin_zero, sizeof(src.sin_zero)); 588 src.sin_port = ssk->lport; 589 src.sin_addr.s_addr = ssk->laddr; 590 soisconnecting(so); 591 SDP_WUNLOCK(ssk); 592 error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam, 593 SDP_RESOLVE_TIMEOUT); 594 SDP_WLOCK(ssk); 595 if (error == 0) 596 ssk->state = TCPS_SYN_SENT; 597 598 return 0; 599 } 600 601 /* 602 * Initiate SDP connection. 603 */ 604 static int 605 sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 606 { 607 int error = 0; 608 struct sdp_sock *ssk; 609 struct sockaddr_in *sin; 610 611 sin = (struct sockaddr_in *)nam; 612 if (nam->sa_len != sizeof (*sin)) 613 return (EINVAL); 614 if (sin->sin_family != AF_INET) 615 return (EINVAL); 616 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 617 return (EAFNOSUPPORT); 618 if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0) 619 return (error); 620 ssk = sdp_sk(so); 621 SDP_WLOCK(ssk); 622 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) 623 error = EINVAL; 624 else 625 error = sdp_start_connect(ssk, nam, td); 626 SDP_WUNLOCK(ssk); 627 return (error); 628 } 629 630 /* 631 * Drop a SDP socket, reporting 632 * the specified error. If connection is synchronized, 633 * then send a RST to peer. 634 */ 635 static struct sdp_sock * 636 sdp_drop(struct sdp_sock *ssk, int errno) 637 { 638 struct socket *so; 639 640 SDP_WLOCK_ASSERT(ssk); 641 so = ssk->socket; 642 if (TCPS_HAVERCVDSYN(ssk->state)) 643 sdp_output_reset(ssk); 644 if (errno == ETIMEDOUT && ssk->softerror) 645 errno = ssk->softerror; 646 so->so_error = errno; 647 return (sdp_closed(ssk)); 648 } 649 650 /* 651 * User issued close, and wish to trail through shutdown states: 652 * if never received SYN, just forget it. If got a SYN from peer, 653 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 654 * If already got a FIN from peer, then almost done; go to LAST_ACK 655 * state. In all other cases, have already sent FIN to peer (e.g. 656 * after PRU_SHUTDOWN), and just have to play tedious game waiting 657 * for peer to send FIN or not respond to keep-alives, etc. 658 * We can let the user exit from the close as soon as the FIN is acked. 659 */ 660 static void 661 sdp_usrclosed(struct sdp_sock *ssk) 662 { 663 664 SDP_WLOCK_ASSERT(ssk); 665 666 switch (ssk->state) { 667 case TCPS_LISTEN: 668 ssk->state = TCPS_CLOSED; 669 SDP_WUNLOCK(ssk); 670 sdp_destroy_cma(ssk); 671 SDP_WLOCK(ssk); 672 /* FALLTHROUGH */ 673 case TCPS_CLOSED: 674 ssk = sdp_closed(ssk); 675 /* 676 * sdp_closed() should never return NULL here as the socket is 677 * still open. 678 */ 679 KASSERT(ssk != NULL, 680 ("sdp_usrclosed: sdp_closed() returned NULL")); 681 break; 682 683 case TCPS_SYN_SENT: 684 /* FALLTHROUGH */ 685 case TCPS_SYN_RECEIVED: 686 ssk->flags |= SDP_NEEDFIN; 687 break; 688 689 case TCPS_ESTABLISHED: 690 ssk->flags |= SDP_NEEDFIN; 691 ssk->state = TCPS_FIN_WAIT_1; 692 break; 693 694 case TCPS_CLOSE_WAIT: 695 ssk->state = TCPS_LAST_ACK; 696 break; 697 } 698 if (ssk->state >= TCPS_FIN_WAIT_2) { 699 /* Prevent the connection hanging in FIN_WAIT_2 forever. */ 700 if (ssk->state == TCPS_FIN_WAIT_2) 701 sdp_2msl_wait(ssk); 702 else 703 soisdisconnected(ssk->socket); 704 } 705 } 706 707 static void 708 sdp_output_disconnect(struct sdp_sock *ssk) 709 { 710 711 SDP_WLOCK_ASSERT(ssk); 712 callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT, 713 sdp_dreq_timeout, ssk); 714 ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT; 715 sdp_post_sends(ssk, M_NOWAIT); 716 } 717 718 /* 719 * Initiate or continue a disconnect. 720 * If embryonic state, just send reset (once). 721 * If in ``let data drain'' option and linger null, just drop. 722 * Otherwise (hard), mark socket disconnecting and drop 723 * current input data; switch states based on user close, and 724 * send segment to peer (with FIN). 725 */ 726 static void 727 sdp_start_disconnect(struct sdp_sock *ssk) 728 { 729 struct socket *so; 730 int unread; 731 732 so = ssk->socket; 733 SDP_WLOCK_ASSERT(ssk); 734 sdp_stop_keepalive_timer(so); 735 /* 736 * Neither sdp_closed() nor sdp_drop() should return NULL, as the 737 * socket is still open. 738 */ 739 if (ssk->state < TCPS_ESTABLISHED) { 740 ssk = sdp_closed(ssk); 741 KASSERT(ssk != NULL, 742 ("sdp_start_disconnect: sdp_close() returned NULL")); 743 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 744 ssk = sdp_drop(ssk, 0); 745 KASSERT(ssk != NULL, 746 ("sdp_start_disconnect: sdp_drop() returned NULL")); 747 } else { 748 soisdisconnecting(so); 749 unread = so->so_rcv.sb_cc; 750 sbflush(&so->so_rcv); 751 sdp_usrclosed(ssk); 752 if (!(ssk->flags & SDP_DROPPED)) { 753 if (unread) 754 sdp_output_reset(ssk); 755 else 756 sdp_output_disconnect(ssk); 757 } 758 } 759 } 760 761 /* 762 * User initiated disconnect. 763 */ 764 static int 765 sdp_disconnect(struct socket *so) 766 { 767 struct sdp_sock *ssk; 768 int error = 0; 769 770 ssk = sdp_sk(so); 771 SDP_WLOCK(ssk); 772 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 773 error = ECONNRESET; 774 goto out; 775 } 776 sdp_start_disconnect(ssk); 777 out: 778 SDP_WUNLOCK(ssk); 779 return (error); 780 } 781 782 /* 783 * Accept a connection. Essentially all the work is done at higher levels; 784 * just return the address of the peer, storing through addr. 785 * 786 * 787 * XXX This is broken XXX 788 * 789 * The rationale for acquiring the sdp lock here is somewhat complicated, 790 * and is described in detail in the commit log entry for r175612. Acquiring 791 * it delays an accept(2) racing with sonewconn(), which inserts the socket 792 * before the address/port fields are initialized. A better fix would 793 * prevent the socket from being placed in the listen queue until all fields 794 * are fully initialized. 795 */ 796 static int 797 sdp_accept(struct socket *so, struct sockaddr **nam) 798 { 799 struct sdp_sock *ssk = NULL; 800 struct in_addr addr; 801 in_port_t port; 802 int error; 803 804 if (so->so_state & SS_ISDISCONNECTED) 805 return (ECONNABORTED); 806 807 port = 0; 808 addr.s_addr = 0; 809 error = 0; 810 ssk = sdp_sk(so); 811 SDP_WLOCK(ssk); 812 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 813 error = ECONNABORTED; 814 goto out; 815 } 816 port = ssk->fport; 817 addr.s_addr = ssk->faddr; 818 out: 819 SDP_WUNLOCK(ssk); 820 if (error == 0) 821 *nam = sdp_sockaddr(port, &addr); 822 return error; 823 } 824 825 /* 826 * Mark the connection as being incapable of further output. 827 */ 828 static int 829 sdp_shutdown(struct socket *so) 830 { 831 int error = 0; 832 struct sdp_sock *ssk; 833 834 ssk = sdp_sk(so); 835 SDP_WLOCK(ssk); 836 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 837 error = ECONNRESET; 838 goto out; 839 } 840 socantsendmore(so); 841 sdp_usrclosed(ssk); 842 if (!(ssk->flags & SDP_DROPPED)) 843 sdp_output_disconnect(ssk); 844 845 out: 846 SDP_WUNLOCK(ssk); 847 848 return (error); 849 } 850 851 static void 852 sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt) 853 { 854 struct mbuf *n; 855 int ncnt; 856 857 SOCKBUF_LOCK_ASSERT(sb); 858 SBLASTRECORDCHK(sb); 859 KASSERT(mb->m_flags & M_PKTHDR, 860 ("sdp_append: %p Missing packet header.\n", mb)); 861 n = sb->sb_lastrecord; 862 /* 863 * If the queue is empty just set all pointers and proceed. 864 */ 865 if (n == NULL) { 866 sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb; 867 for (; mb; mb = mb->m_next) { 868 sb->sb_mbtail = mb; 869 sballoc(sb, mb); 870 } 871 return; 872 } 873 /* 874 * Count the number of mbufs in the current tail. 875 */ 876 for (ncnt = 0; n->m_next; n = n->m_next) 877 ncnt++; 878 n = sb->sb_lastrecord; 879 /* 880 * If the two chains can fit in a single sdp packet and 881 * the last record has not been sent yet (WRITABLE) coalesce 882 * them. The lastrecord remains the same but we must strip the 883 * packet header and then let sbcompress do the hard part. 884 */ 885 if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES && 886 n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE < 887 ssk->xmit_size_goal) { 888 m_adj(mb, SDP_HEAD_SIZE); 889 n->m_pkthdr.len += mb->m_pkthdr.len; 890 n->m_flags |= mb->m_flags & (M_PUSH | M_URG); 891 m_demote(mb, 1); 892 sbcompress(sb, mb, sb->sb_mbtail); 893 return; 894 } 895 /* 896 * Not compressible, just append to the end and adjust counters. 897 */ 898 sb->sb_lastrecord->m_flags |= M_PUSH; 899 sb->sb_lastrecord->m_nextpkt = mb; 900 sb->sb_lastrecord = mb; 901 if (sb->sb_sndptr == NULL) 902 sb->sb_sndptr = mb; 903 for (; mb; mb = mb->m_next) { 904 sb->sb_mbtail = mb; 905 sballoc(sb, mb); 906 } 907 } 908 909 /* 910 * Do a send by putting data in output queue and updating urgent 911 * marker if URG set. Possibly send more data. Unlike the other 912 * pru_*() routines, the mbuf chains are our responsibility. We 913 * must either enqueue them or free them. The other pru_* routines 914 * generally are caller-frees. 915 * 916 * This comes from sendfile, normal sends will come from sdp_sosend(). 917 */ 918 static int 919 sdp_send(struct socket *so, int flags, struct mbuf *m, 920 struct sockaddr *nam, struct mbuf *control, struct thread *td) 921 { 922 struct sdp_sock *ssk; 923 struct mbuf *n; 924 int error; 925 int cnt; 926 927 error = 0; 928 ssk = sdp_sk(so); 929 KASSERT(m->m_flags & M_PKTHDR, 930 ("sdp_send: %p no packet header", m)); 931 M_PREPEND(m, SDP_HEAD_SIZE, M_WAIT); 932 mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA; 933 for (n = m, cnt = 0; n->m_next; n = n->m_next) 934 cnt++; 935 if (cnt > SDP_MAX_SEND_SGES) { 936 n = m_collapse(m, M_WAIT, SDP_MAX_SEND_SGES); 937 if (n == NULL) { 938 m_freem(m); 939 return (EMSGSIZE); 940 } 941 m = n; 942 for (cnt = 0; n->m_next; n = n->m_next) 943 cnt++; 944 } 945 SDP_WLOCK(ssk); 946 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 947 if (control) 948 m_freem(control); 949 if (m) 950 m_freem(m); 951 error = ECONNRESET; 952 goto out; 953 } 954 if (control) { 955 /* SDP doesn't support control messages. */ 956 if (control->m_len) { 957 m_freem(control); 958 if (m) 959 m_freem(m); 960 error = EINVAL; 961 goto out; 962 } 963 m_freem(control); /* empty control, just free it */ 964 } 965 if (!(flags & PRUS_OOB)) { 966 SOCKBUF_LOCK(&so->so_snd); 967 sdp_append(ssk, &so->so_snd, m, cnt); 968 SOCKBUF_UNLOCK(&so->so_snd); 969 if (nam && ssk->state < TCPS_SYN_SENT) { 970 /* 971 * Do implied connect if not yet connected. 972 */ 973 error = sdp_start_connect(ssk, nam, td); 974 if (error) 975 goto out; 976 } 977 if (flags & PRUS_EOF) { 978 /* 979 * Close the send side of the connection after 980 * the data is sent. 981 */ 982 socantsendmore(so); 983 sdp_usrclosed(ssk); 984 if (!(ssk->flags & SDP_DROPPED)) 985 sdp_output_disconnect(ssk); 986 } else if (!(ssk->flags & SDP_DROPPED) && 987 !(flags & PRUS_MORETOCOME)) 988 sdp_post_sends(ssk, M_NOWAIT); 989 SDP_WUNLOCK(ssk); 990 return (0); 991 } else { 992 SOCKBUF_LOCK(&so->so_snd); 993 if (sbspace(&so->so_snd) < -512) { 994 SOCKBUF_UNLOCK(&so->so_snd); 995 m_freem(m); 996 error = ENOBUFS; 997 goto out; 998 } 999 /* 1000 * According to RFC961 (Assigned Protocols), 1001 * the urgent pointer points to the last octet 1002 * of urgent data. We continue, however, 1003 * to consider it to indicate the first octet 1004 * of data past the urgent section. 1005 * Otherwise, snd_up should be one lower. 1006 */ 1007 m->m_flags |= M_URG | M_PUSH; 1008 sdp_append(ssk, &so->so_snd, m, cnt); 1009 SOCKBUF_UNLOCK(&so->so_snd); 1010 if (nam && ssk->state < TCPS_SYN_SENT) { 1011 /* 1012 * Do implied connect if not yet connected. 1013 */ 1014 error = sdp_start_connect(ssk, nam, td); 1015 if (error) 1016 goto out; 1017 } 1018 sdp_post_sends(ssk, M_NOWAIT); 1019 SDP_WUNLOCK(ssk); 1020 return (0); 1021 } 1022 out: 1023 SDP_WUNLOCK(ssk); 1024 return (error); 1025 } 1026 1027 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 1028 1029 /* 1030 * Send on a socket. If send must go all at once and message is larger than 1031 * send buffering, then hard error. Lock against other senders. If must go 1032 * all at once and not enough room now, then inform user that this would 1033 * block and do nothing. Otherwise, if nonblocking, send as much as 1034 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1035 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1036 * in mbuf chain must be small enough to send all at once. 1037 * 1038 * Returns nonzero on error, timeout or signal; callers must check for short 1039 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1040 * on return. 1041 */ 1042 static int 1043 sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1044 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1045 { 1046 struct sdp_sock *ssk; 1047 long space, resid; 1048 int atomic; 1049 int error; 1050 int copy; 1051 1052 if (uio != NULL) 1053 resid = uio->uio_resid; 1054 else 1055 resid = top->m_pkthdr.len; 1056 atomic = top != NULL; 1057 if (control != NULL) { 1058 if (control->m_len) { 1059 m_freem(control); 1060 if (top) 1061 m_freem(top); 1062 return (EINVAL); 1063 } 1064 m_freem(control); 1065 control = NULL; 1066 } 1067 /* 1068 * In theory resid should be unsigned. However, space must be 1069 * signed, as it might be less than 0 if we over-committed, and we 1070 * must use a signed comparison of space and resid. On the other 1071 * hand, a negative resid causes us to loop sending 0-length 1072 * segments to the protocol. 1073 * 1074 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1075 * type sockets since that's an error. 1076 */ 1077 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1078 error = EINVAL; 1079 goto out; 1080 } 1081 if (td != NULL) 1082 td->td_ru.ru_msgsnd++; 1083 1084 ssk = sdp_sk(so); 1085 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 1086 if (error) 1087 goto out; 1088 1089 restart: 1090 do { 1091 SOCKBUF_LOCK(&so->so_snd); 1092 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1093 SOCKBUF_UNLOCK(&so->so_snd); 1094 error = EPIPE; 1095 goto release; 1096 } 1097 if (so->so_error) { 1098 error = so->so_error; 1099 so->so_error = 0; 1100 SOCKBUF_UNLOCK(&so->so_snd); 1101 goto release; 1102 } 1103 if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) { 1104 SOCKBUF_UNLOCK(&so->so_snd); 1105 error = ENOTCONN; 1106 goto release; 1107 } 1108 space = sbspace(&so->so_snd); 1109 if (flags & MSG_OOB) 1110 space += 1024; 1111 if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) { 1112 SOCKBUF_UNLOCK(&so->so_snd); 1113 error = EMSGSIZE; 1114 goto release; 1115 } 1116 if (space < resid && 1117 (atomic || space < so->so_snd.sb_lowat)) { 1118 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { 1119 SOCKBUF_UNLOCK(&so->so_snd); 1120 error = EWOULDBLOCK; 1121 goto release; 1122 } 1123 error = sbwait(&so->so_snd); 1124 SOCKBUF_UNLOCK(&so->so_snd); 1125 if (error) 1126 goto release; 1127 goto restart; 1128 } 1129 SOCKBUF_UNLOCK(&so->so_snd); 1130 do { 1131 if (uio == NULL) { 1132 resid = 0; 1133 if (flags & MSG_EOR) 1134 top->m_flags |= M_EOR; 1135 } else { 1136 /* 1137 * Copy the data from userland into a mbuf 1138 * chain. If no data is to be copied in, 1139 * a single empty mbuf is returned. 1140 */ 1141 copy = min(space, 1142 ssk->xmit_size_goal - SDP_HEAD_SIZE); 1143 top = m_uiotombuf(uio, M_WAITOK, copy, 1144 0, M_PKTHDR | 1145 ((flags & MSG_EOR) ? M_EOR : 0)); 1146 if (top == NULL) { 1147 /* only possible error */ 1148 error = EFAULT; 1149 goto release; 1150 } 1151 space -= resid - uio->uio_resid; 1152 resid = uio->uio_resid; 1153 } 1154 /* 1155 * XXX all the SBS_CANTSENDMORE checks previously 1156 * done could be out of date after dropping the 1157 * socket lock. 1158 */ 1159 error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB : 1160 /* 1161 * Set EOF on the last send if the user specified 1162 * MSG_EOF. 1163 */ 1164 ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF : 1165 /* If there is more to send set PRUS_MORETOCOME. */ 1166 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 1167 top, addr, NULL, td); 1168 top = NULL; 1169 if (error) 1170 goto release; 1171 } while (resid && space > 0); 1172 } while (resid); 1173 1174 release: 1175 sbunlock(&so->so_snd); 1176 out: 1177 if (top != NULL) 1178 m_freem(top); 1179 return (error); 1180 } 1181 1182 /* 1183 * The part of soreceive() that implements reading non-inline out-of-band 1184 * data from a socket. For more complete comments, see soreceive(), from 1185 * which this code originated. 1186 * 1187 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1188 * unable to return an mbuf chain to the caller. 1189 */ 1190 static int 1191 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1192 { 1193 struct protosw *pr = so->so_proto; 1194 struct mbuf *m; 1195 int error; 1196 1197 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1198 1199 m = m_get(M_WAIT, MT_DATA); 1200 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 1201 if (error) 1202 goto bad; 1203 do { 1204 error = uiomove(mtod(m, void *), 1205 (int) min(uio->uio_resid, m->m_len), uio); 1206 m = m_free(m); 1207 } while (uio->uio_resid && error == 0 && m); 1208 bad: 1209 if (m != NULL) 1210 m_freem(m); 1211 return (error); 1212 } 1213 1214 /* 1215 * Optimized version of soreceive() for stream (TCP) sockets. 1216 */ 1217 static int 1218 sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio, 1219 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1220 { 1221 int len = 0, error = 0, flags, oresid; 1222 struct sockbuf *sb; 1223 struct mbuf *m, *n = NULL; 1224 struct sdp_sock *ssk; 1225 1226 /* We only do stream sockets. */ 1227 if (so->so_type != SOCK_STREAM) 1228 return (EINVAL); 1229 if (psa != NULL) 1230 *psa = NULL; 1231 if (controlp != NULL) 1232 return (EINVAL); 1233 if (flagsp != NULL) 1234 flags = *flagsp &~ MSG_EOR; 1235 else 1236 flags = 0; 1237 if (flags & MSG_OOB) 1238 return (soreceive_rcvoob(so, uio, flags)); 1239 if (mp0 != NULL) 1240 *mp0 = NULL; 1241 1242 sb = &so->so_rcv; 1243 ssk = sdp_sk(so); 1244 1245 /* Prevent other readers from entering the socket. */ 1246 error = sblock(sb, SBLOCKWAIT(flags)); 1247 if (error) 1248 goto out; 1249 SOCKBUF_LOCK(sb); 1250 1251 /* Easy one, no space to copyout anything. */ 1252 if (uio->uio_resid == 0) { 1253 error = EINVAL; 1254 goto out; 1255 } 1256 oresid = uio->uio_resid; 1257 1258 /* We will never ever get anything unless we are connected. */ 1259 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 1260 /* When disconnecting there may be still some data left. */ 1261 if (sb->sb_cc > 0) 1262 goto deliver; 1263 if (!(so->so_state & SS_ISDISCONNECTED)) 1264 error = ENOTCONN; 1265 goto out; 1266 } 1267 1268 /* Socket buffer is empty and we shall not block. */ 1269 if (sb->sb_cc == 0 && 1270 ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 1271 error = EAGAIN; 1272 goto out; 1273 } 1274 1275 restart: 1276 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1277 1278 /* Abort if socket has reported problems. */ 1279 if (so->so_error) { 1280 if (sb->sb_cc > 0) 1281 goto deliver; 1282 if (oresid > uio->uio_resid) 1283 goto out; 1284 error = so->so_error; 1285 if (!(flags & MSG_PEEK)) 1286 so->so_error = 0; 1287 goto out; 1288 } 1289 1290 /* Door is closed. Deliver what is left, if any. */ 1291 if (sb->sb_state & SBS_CANTRCVMORE) { 1292 if (sb->sb_cc > 0) 1293 goto deliver; 1294 else 1295 goto out; 1296 } 1297 1298 /* Socket buffer got some data that we shall deliver now. */ 1299 if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) && 1300 ((sb->sb_flags & SS_NBIO) || 1301 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 1302 sb->sb_cc >= sb->sb_lowat || 1303 sb->sb_cc >= uio->uio_resid || 1304 sb->sb_cc >= sb->sb_hiwat) ) { 1305 goto deliver; 1306 } 1307 1308 /* On MSG_WAITALL we must wait until all data or error arrives. */ 1309 if ((flags & MSG_WAITALL) && 1310 (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat)) 1311 goto deliver; 1312 1313 /* 1314 * Wait and block until (more) data comes in. 1315 * NB: Drops the sockbuf lock during wait. 1316 */ 1317 error = sbwait(sb); 1318 if (error) 1319 goto out; 1320 goto restart; 1321 1322 deliver: 1323 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1324 KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__)); 1325 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 1326 1327 /* Statistics. */ 1328 if (uio->uio_td) 1329 uio->uio_td->td_ru.ru_msgrcv++; 1330 1331 /* Fill uio until full or current end of socket buffer is reached. */ 1332 len = min(uio->uio_resid, sb->sb_cc); 1333 if (mp0 != NULL) { 1334 /* Dequeue as many mbufs as possible. */ 1335 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 1336 for (*mp0 = m = sb->sb_mb; 1337 m != NULL && m->m_len <= len; 1338 m = m->m_next) { 1339 len -= m->m_len; 1340 uio->uio_resid -= m->m_len; 1341 sbfree(sb, m); 1342 n = m; 1343 } 1344 sb->sb_mb = m; 1345 if (sb->sb_mb == NULL) 1346 SB_EMPTY_FIXUP(sb); 1347 n->m_next = NULL; 1348 } 1349 /* Copy the remainder. */ 1350 if (len > 0) { 1351 KASSERT(sb->sb_mb != NULL, 1352 ("%s: len > 0 && sb->sb_mb empty", __func__)); 1353 1354 m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT); 1355 if (m == NULL) 1356 len = 0; /* Don't flush data from sockbuf. */ 1357 else 1358 uio->uio_resid -= m->m_len; 1359 if (*mp0 != NULL) 1360 n->m_next = m; 1361 else 1362 *mp0 = m; 1363 if (*mp0 == NULL) { 1364 error = ENOBUFS; 1365 goto out; 1366 } 1367 } 1368 } else { 1369 /* NB: Must unlock socket buffer as uiomove may sleep. */ 1370 SOCKBUF_UNLOCK(sb); 1371 error = m_mbuftouio(uio, sb->sb_mb, len); 1372 SOCKBUF_LOCK(sb); 1373 if (error) 1374 goto out; 1375 } 1376 SBLASTRECORDCHK(sb); 1377 SBLASTMBUFCHK(sb); 1378 1379 /* 1380 * Remove the delivered data from the socket buffer unless we 1381 * were only peeking. 1382 */ 1383 if (!(flags & MSG_PEEK)) { 1384 if (len > 0) 1385 sbdrop_locked(sb, len); 1386 1387 /* Notify protocol that we drained some data. */ 1388 SOCKBUF_UNLOCK(sb); 1389 SDP_WLOCK(ssk); 1390 sdp_do_posts(ssk); 1391 SDP_WUNLOCK(ssk); 1392 SOCKBUF_LOCK(sb); 1393 } 1394 1395 /* 1396 * For MSG_WAITALL we may have to loop again and wait for 1397 * more data to come in. 1398 */ 1399 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 1400 goto restart; 1401 out: 1402 SOCKBUF_LOCK_ASSERT(sb); 1403 SBLASTRECORDCHK(sb); 1404 SBLASTMBUFCHK(sb); 1405 SOCKBUF_UNLOCK(sb); 1406 sbunlock(sb); 1407 return (error); 1408 } 1409 1410 /* 1411 * Abort is used to teardown a connection typically while sitting in 1412 * the accept queue. 1413 */ 1414 void 1415 sdp_abort(struct socket *so) 1416 { 1417 struct sdp_sock *ssk; 1418 1419 ssk = sdp_sk(so); 1420 SDP_WLOCK(ssk); 1421 /* 1422 * If we have not yet dropped, do it now. 1423 */ 1424 if (!(ssk->flags & SDP_TIMEWAIT) && 1425 !(ssk->flags & SDP_DROPPED)) 1426 sdp_drop(ssk, ECONNABORTED); 1427 KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X", 1428 ssk, ssk->flags)); 1429 SDP_WUNLOCK(ssk); 1430 } 1431 1432 /* 1433 * Close a SDP socket and initiate a friendly disconnect. 1434 */ 1435 static void 1436 sdp_close(struct socket *so) 1437 { 1438 struct sdp_sock *ssk; 1439 1440 ssk = sdp_sk(so); 1441 SDP_WLOCK(ssk); 1442 /* 1443 * If we have not yet dropped, do it now. 1444 */ 1445 if (!(ssk->flags & SDP_TIMEWAIT) && 1446 !(ssk->flags & SDP_DROPPED)) 1447 sdp_start_disconnect(ssk); 1448 1449 /* 1450 * If we've still not dropped let the socket layer know we're 1451 * holding on to the socket and pcb for a while. 1452 */ 1453 if (!(ssk->flags & SDP_DROPPED)) { 1454 SOCK_LOCK(so); 1455 so->so_state |= SS_PROTOREF; 1456 SOCK_UNLOCK(so); 1457 ssk->flags |= SDP_SOCKREF; 1458 } 1459 SDP_WUNLOCK(ssk); 1460 } 1461 1462 /* 1463 * User requests out-of-band data. 1464 */ 1465 static int 1466 sdp_rcvoob(struct socket *so, struct mbuf *m, int flags) 1467 { 1468 int error = 0; 1469 struct sdp_sock *ssk; 1470 1471 ssk = sdp_sk(so); 1472 SDP_WLOCK(ssk); 1473 if (!rx_ring_trylock(&ssk->rx_ring)) { 1474 SDP_WUNLOCK(ssk); 1475 return (ECONNRESET); 1476 } 1477 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1478 error = ECONNRESET; 1479 goto out; 1480 } 1481 if ((so->so_oobmark == 0 && 1482 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || 1483 so->so_options & SO_OOBINLINE || 1484 ssk->oobflags & SDP_HADOOB) { 1485 error = EINVAL; 1486 goto out; 1487 } 1488 if ((ssk->oobflags & SDP_HAVEOOB) == 0) { 1489 error = EWOULDBLOCK; 1490 goto out; 1491 } 1492 m->m_len = 1; 1493 *mtod(m, caddr_t) = ssk->iobc; 1494 if ((flags & MSG_PEEK) == 0) 1495 ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB); 1496 out: 1497 rx_ring_unlock(&ssk->rx_ring); 1498 SDP_WUNLOCK(ssk); 1499 return (error); 1500 } 1501 1502 void 1503 sdp_urg(struct sdp_sock *ssk, struct mbuf *mb) 1504 { 1505 struct mbuf *m; 1506 struct socket *so; 1507 1508 so = ssk->socket; 1509 if (so == NULL) 1510 return; 1511 1512 so->so_oobmark = so->so_rcv.sb_cc + mb->m_pkthdr.len - 1; 1513 sohasoutofband(so); 1514 ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB); 1515 if (!(so->so_options & SO_OOBINLINE)) { 1516 for (m = mb; m->m_next != NULL; m = m->m_next); 1517 ssk->iobc = *(mtod(m, char *) + m->m_len - 1); 1518 ssk->oobflags |= SDP_HAVEOOB; 1519 m->m_len--; 1520 mb->m_pkthdr.len--; 1521 } 1522 } 1523 1524 /* 1525 * Notify a sdp socket of an asynchronous error. 1526 * 1527 * Do not wake up user since there currently is no mechanism for 1528 * reporting soft errors (yet - a kqueue filter may be added). 1529 */ 1530 struct sdp_sock * 1531 sdp_notify(struct sdp_sock *ssk, int error) 1532 { 1533 1534 SDP_WLOCK_ASSERT(ssk); 1535 1536 if ((ssk->flags & SDP_TIMEWAIT) || 1537 (ssk->flags & SDP_DROPPED)) 1538 return (ssk); 1539 1540 /* 1541 * Ignore some errors if we are hooked up. 1542 */ 1543 if (ssk->state == TCPS_ESTABLISHED && 1544 (error == EHOSTUNREACH || error == ENETUNREACH || 1545 error == EHOSTDOWN)) 1546 return (ssk); 1547 ssk->softerror = error; 1548 return sdp_drop(ssk, error); 1549 } 1550 1551 static void 1552 sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip) 1553 { 1554 struct in_addr faddr; 1555 1556 faddr = ((struct sockaddr_in *)sa)->sin_addr; 1557 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1558 return; 1559 1560 sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify); 1561 } 1562 1563 static int 1564 sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, 1565 struct thread *td) 1566 { 1567 return (EOPNOTSUPP); 1568 } 1569 1570 static void 1571 sdp_keepalive_timeout(void *data) 1572 { 1573 struct sdp_sock *ssk; 1574 1575 ssk = data; 1576 /* Callout canceled. */ 1577 if (!callout_active(&ssk->keep2msl)) 1578 return; 1579 /* Callout rescheduled as a different kind of timer. */ 1580 if (callout_pending(&ssk->keep2msl)) 1581 goto out; 1582 callout_deactivate(&ssk->keep2msl); 1583 if (ssk->flags & SDP_DROPPED || 1584 (ssk->socket->so_options & SO_KEEPALIVE) == 0) 1585 goto out; 1586 sdp_post_keepalive(ssk); 1587 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, 1588 sdp_keepalive_timeout, ssk); 1589 out: 1590 SDP_WUNLOCK(ssk); 1591 } 1592 1593 1594 void 1595 sdp_start_keepalive_timer(struct socket *so) 1596 { 1597 struct sdp_sock *ssk; 1598 1599 ssk = sdp_sk(so); 1600 if (!callout_pending(&ssk->keep2msl)) 1601 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, 1602 sdp_keepalive_timeout, ssk); 1603 } 1604 1605 static void 1606 sdp_stop_keepalive_timer(struct socket *so) 1607 { 1608 struct sdp_sock *ssk; 1609 1610 ssk = sdp_sk(so); 1611 callout_stop(&ssk->keep2msl); 1612 } 1613 1614 /* 1615 * sdp_ctloutput() must drop the inpcb lock before performing copyin on 1616 * socket option arguments. When it re-acquires the lock after the copy, it 1617 * has to revalidate that the connection is still valid for the socket 1618 * option. 1619 */ 1620 #define SDP_WLOCK_RECHECK(inp) do { \ 1621 SDP_WLOCK(ssk); \ 1622 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \ 1623 SDP_WUNLOCK(ssk); \ 1624 return (ECONNRESET); \ 1625 } \ 1626 } while(0) 1627 1628 static int 1629 sdp_ctloutput(struct socket *so, struct sockopt *sopt) 1630 { 1631 int error, opt, optval; 1632 struct sdp_sock *ssk; 1633 1634 error = 0; 1635 ssk = sdp_sk(so); 1636 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) { 1637 SDP_WLOCK(ssk); 1638 if (so->so_options & SO_KEEPALIVE) 1639 sdp_start_keepalive_timer(so); 1640 else 1641 sdp_stop_keepalive_timer(so); 1642 SDP_WUNLOCK(ssk); 1643 } 1644 if (sopt->sopt_level != IPPROTO_TCP) 1645 return (error); 1646 1647 SDP_WLOCK(ssk); 1648 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1649 SDP_WUNLOCK(ssk); 1650 return (ECONNRESET); 1651 } 1652 1653 switch (sopt->sopt_dir) { 1654 case SOPT_SET: 1655 switch (sopt->sopt_name) { 1656 case TCP_NODELAY: 1657 SDP_WUNLOCK(ssk); 1658 error = sooptcopyin(sopt, &optval, sizeof optval, 1659 sizeof optval); 1660 if (error) 1661 return (error); 1662 1663 SDP_WLOCK_RECHECK(ssk); 1664 opt = SDP_NODELAY; 1665 if (optval) 1666 ssk->flags |= opt; 1667 else 1668 ssk->flags &= ~opt; 1669 sdp_do_posts(ssk); 1670 SDP_WUNLOCK(ssk); 1671 break; 1672 1673 default: 1674 SDP_WUNLOCK(ssk); 1675 error = ENOPROTOOPT; 1676 break; 1677 } 1678 break; 1679 1680 case SOPT_GET: 1681 switch (sopt->sopt_name) { 1682 case TCP_NODELAY: 1683 optval = ssk->flags & SDP_NODELAY; 1684 SDP_WUNLOCK(ssk); 1685 error = sooptcopyout(sopt, &optval, sizeof optval); 1686 break; 1687 default: 1688 SDP_WUNLOCK(ssk); 1689 error = ENOPROTOOPT; 1690 break; 1691 } 1692 break; 1693 } 1694 return (error); 1695 } 1696 #undef SDP_WLOCK_RECHECK 1697 1698 int sdp_mod_count = 0; 1699 int sdp_mod_usec = 0; 1700 1701 void 1702 sdp_set_default_moderation(struct sdp_sock *ssk) 1703 { 1704 if (sdp_mod_count <= 0 || sdp_mod_usec <= 0) 1705 return; 1706 ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec); 1707 } 1708 1709 1710 static void 1711 sdp_dev_add(struct ib_device *device) 1712 { 1713 struct ib_fmr_pool_param param; 1714 struct sdp_device *sdp_dev; 1715 1716 sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO); 1717 sdp_dev->pd = ib_alloc_pd(device); 1718 if (IS_ERR(sdp_dev->pd)) 1719 goto out_pd; 1720 sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE); 1721 if (IS_ERR(sdp_dev->mr)) 1722 goto out_mr; 1723 memset(¶m, 0, sizeof param); 1724 param.max_pages_per_fmr = SDP_FMR_SIZE; 1725 param.page_shift = PAGE_SHIFT; 1726 param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ); 1727 param.pool_size = SDP_FMR_POOL_SIZE; 1728 param.dirty_watermark = SDP_FMR_DIRTY_SIZE; 1729 param.cache = 1; 1730 sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, ¶m); 1731 if (IS_ERR(sdp_dev->fmr_pool)) 1732 goto out_fmr; 1733 ib_set_client_data(device, &sdp_client, sdp_dev); 1734 return; 1735 1736 out_fmr: 1737 ib_dereg_mr(sdp_dev->mr); 1738 out_mr: 1739 ib_dealloc_pd(sdp_dev->pd); 1740 out_pd: 1741 free(sdp_dev, M_SDP); 1742 } 1743 1744 static void 1745 sdp_dev_rem(struct ib_device *device) 1746 { 1747 struct sdp_device *sdp_dev; 1748 struct sdp_sock *ssk; 1749 1750 SDP_LIST_WLOCK(); 1751 LIST_FOREACH(ssk, &sdp_list, list) { 1752 if (ssk->ib_device != device) 1753 continue; 1754 SDP_WLOCK(ssk); 1755 if ((ssk->flags & SDP_DESTROY) == 0) 1756 ssk = sdp_notify(ssk, ECONNRESET); 1757 if (ssk) 1758 SDP_WUNLOCK(ssk); 1759 } 1760 SDP_LIST_WUNLOCK(); 1761 /* 1762 * XXX Do I need to wait between these two? 1763 */ 1764 sdp_dev = ib_get_client_data(device, &sdp_client); 1765 if (!sdp_dev) 1766 return; 1767 ib_flush_fmr_pool(sdp_dev->fmr_pool); 1768 ib_destroy_fmr_pool(sdp_dev->fmr_pool); 1769 ib_dereg_mr(sdp_dev->mr); 1770 ib_dealloc_pd(sdp_dev->pd); 1771 free(sdp_dev, M_SDP); 1772 } 1773 1774 struct ib_client sdp_client = 1775 { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem }; 1776 1777 1778 static int 1779 sdp_pcblist(SYSCTL_HANDLER_ARGS) 1780 { 1781 int error, n, i; 1782 struct sdp_sock *ssk; 1783 struct xinpgen xig; 1784 1785 /* 1786 * The process of preparing the TCB list is too time-consuming and 1787 * resource-intensive to repeat twice on every request. 1788 */ 1789 if (req->oldptr == NULL) { 1790 n = sdp_count; 1791 n += imax(n / 8, 10); 1792 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); 1793 return (0); 1794 } 1795 1796 if (req->newptr != NULL) 1797 return (EPERM); 1798 1799 /* 1800 * OK, now we're committed to doing something. 1801 */ 1802 SDP_LIST_RLOCK(); 1803 n = sdp_count; 1804 SDP_LIST_RUNLOCK(); 1805 1806 error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) 1807 + n * sizeof(struct xtcpcb)); 1808 if (error != 0) 1809 return (error); 1810 1811 xig.xig_len = sizeof xig; 1812 xig.xig_count = n; 1813 xig.xig_gen = 0; 1814 xig.xig_sogen = so_gencnt; 1815 error = SYSCTL_OUT(req, &xig, sizeof xig); 1816 if (error) 1817 return (error); 1818 1819 SDP_LIST_RLOCK(); 1820 for (ssk = LIST_FIRST(&sdp_list), i = 0; 1821 ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) { 1822 struct xtcpcb xt; 1823 1824 SDP_RLOCK(ssk); 1825 if (ssk->flags & SDP_TIMEWAIT) { 1826 if (ssk->cred != NULL) 1827 error = cr_cansee(req->td->td_ucred, 1828 ssk->cred); 1829 else 1830 error = EINVAL; /* Skip this inp. */ 1831 } else if (ssk->socket) 1832 error = cr_canseesocket(req->td->td_ucred, 1833 ssk->socket); 1834 else 1835 error = EINVAL; 1836 if (error) { 1837 error = 0; 1838 goto next; 1839 } 1840 1841 bzero(&xt, sizeof(xt)); 1842 xt.xt_len = sizeof xt; 1843 xt.xt_inp.inp_gencnt = 0; 1844 xt.xt_inp.inp_vflag = INP_IPV4; 1845 memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr)); 1846 xt.xt_inp.inp_lport = ssk->lport; 1847 memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr)); 1848 xt.xt_inp.inp_fport = ssk->fport; 1849 xt.xt_tp.t_state = ssk->state; 1850 if (ssk->socket != NULL) 1851 sotoxsocket(ssk->socket, &xt.xt_socket); 1852 else 1853 bzero(&xt.xt_socket, sizeof xt.xt_socket); 1854 xt.xt_socket.xso_protocol = IPPROTO_TCP; 1855 SDP_RUNLOCK(ssk); 1856 error = SYSCTL_OUT(req, &xt, sizeof xt); 1857 if (error) 1858 break; 1859 i++; 1860 continue; 1861 next: 1862 SDP_RUNLOCK(ssk); 1863 } 1864 if (!error) { 1865 /* 1866 * Give the user an updated idea of our state. 1867 * If the generation differs from what we told 1868 * her before, she knows that something happened 1869 * while we were processing this request, and it 1870 * might be necessary to retry. 1871 */ 1872 xig.xig_gen = 0; 1873 xig.xig_sogen = so_gencnt; 1874 xig.xig_count = sdp_count; 1875 error = SYSCTL_OUT(req, &xig, sizeof xig); 1876 } 1877 SDP_LIST_RUNLOCK(); 1878 return (error); 1879 } 1880 1881 static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP"); 1882 1883 SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist, 1884 CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb", 1885 "List of active SDP connections"); 1886 1887 static void 1888 sdp_zone_change(void *tag) 1889 { 1890 1891 uma_zone_set_max(sdp_zone, maxsockets); 1892 } 1893 1894 static void 1895 sdp_init(void) 1896 { 1897 1898 LIST_INIT(&sdp_list); 1899 sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock), 1900 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1901 uma_zone_set_max(sdp_zone, maxsockets); 1902 EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL, 1903 EVENTHANDLER_PRI_ANY); 1904 rx_comp_wq = create_singlethread_workqueue("rx_comp_wq"); 1905 ib_register_client(&sdp_client); 1906 } 1907 1908 extern struct domain sdpdomain; 1909 1910 struct pr_usrreqs sdp_usrreqs = { 1911 .pru_abort = sdp_abort, 1912 .pru_accept = sdp_accept, 1913 .pru_attach = sdp_attach, 1914 .pru_bind = sdp_bind, 1915 .pru_connect = sdp_connect, 1916 .pru_control = sdp_control, 1917 .pru_detach = sdp_detach, 1918 .pru_disconnect = sdp_disconnect, 1919 .pru_listen = sdp_listen, 1920 .pru_peeraddr = sdp_getpeeraddr, 1921 .pru_rcvoob = sdp_rcvoob, 1922 .pru_send = sdp_send, 1923 .pru_sosend = sdp_sosend, 1924 .pru_soreceive = sdp_sorecv, 1925 .pru_shutdown = sdp_shutdown, 1926 .pru_sockaddr = sdp_getsockaddr, 1927 .pru_close = sdp_close, 1928 }; 1929 1930 struct protosw sdpsw[] = { 1931 { 1932 .pr_type = SOCK_STREAM, 1933 .pr_domain = &sdpdomain, 1934 .pr_protocol = IPPROTO_IP, 1935 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, 1936 .pr_ctlinput = sdp_ctlinput, 1937 .pr_ctloutput = sdp_ctloutput, 1938 .pr_usrreqs = &sdp_usrreqs 1939 }, 1940 { 1941 .pr_type = SOCK_STREAM, 1942 .pr_domain = &sdpdomain, 1943 .pr_protocol = IPPROTO_TCP, 1944 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, 1945 .pr_ctlinput = sdp_ctlinput, 1946 .pr_ctloutput = sdp_ctloutput, 1947 .pr_usrreqs = &sdp_usrreqs 1948 }, 1949 }; 1950 1951 struct domain sdpdomain = { 1952 .dom_family = AF_INET_SDP, 1953 .dom_name = "SDP", 1954 .dom_init = sdp_init, 1955 .dom_protosw = sdpsw, 1956 .dom_protoswNPROTOSW = &sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])], 1957 }; 1958 1959 DOMAIN_SET(sdp); 1960 1961 int sdp_debug_level = 1; 1962 int sdp_data_debug_level = 0; 1963