1 2 /*- 3 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 4 * The Regents of the University of California. All rights reserved. 5 * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved. 6 * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c 33 */ 34 35 /* 36 * 37 * Copyright (c) 2010 Isilon Systems, Inc. 38 * Copyright (c) 2010 iX Systems, Inc. 
39 * Copyright (c) 2010 Panasas, Inc. 40 * All rights reserved. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice unmodified, this list of conditions, and the following 47 * disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 53 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 54 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 55 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 56 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 57 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 61 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
62 * 63 */ 64 #include <sys/cdefs.h> 65 __FBSDID("$FreeBSD$"); 66 67 #include "sdp.h" 68 69 #include <net/if.h> 70 #include <net/route.h> 71 #include <net/vnet.h> 72 #include <sys/sysctl.h> 73 74 uma_zone_t sdp_zone; 75 struct rwlock sdp_lock; 76 LIST_HEAD(, sdp_sock) sdp_list; 77 78 struct workqueue_struct *rx_comp_wq; 79 80 RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock"); 81 #define SDP_LIST_WLOCK() rw_wlock(&sdp_lock) 82 #define SDP_LIST_RLOCK() rw_rlock(&sdp_lock) 83 #define SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock) 84 #define SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock) 85 #define SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED) 86 #define SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED) 87 #define SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED) 88 89 static MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol"); 90 91 static void sdp_stop_keepalive_timer(struct socket *so); 92 93 /* 94 * SDP protocol interface to socket abstraction. 95 */ 96 /* 97 * sdp_sendspace and sdp_recvspace are the default send and receive window 98 * sizes, respectively. 99 */ 100 u_long sdp_sendspace = 1024*32; 101 u_long sdp_recvspace = 1024*64; 102 103 static int sdp_count; 104 105 /* 106 * Disable async. CMA events for sockets which are being torn down. 107 */ 108 static void 109 sdp_destroy_cma(struct sdp_sock *ssk) 110 { 111 112 if (ssk->id == NULL) 113 return; 114 rdma_destroy_id(ssk->id); 115 ssk->id = NULL; 116 } 117 118 static int 119 sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred) 120 { 121 struct sockaddr_in *sin; 122 struct sockaddr_in null; 123 int error; 124 125 SDP_WLOCK_ASSERT(ssk); 126 127 if (ssk->lport != 0 || ssk->laddr != INADDR_ANY) 128 return (EINVAL); 129 /* rdma_bind_addr handles bind races. 
*/ 130 SDP_WUNLOCK(ssk); 131 if (ssk->id == NULL) 132 ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP); 133 if (ssk->id == NULL) { 134 SDP_WLOCK(ssk); 135 return (ENOMEM); 136 } 137 if (nam == NULL) { 138 null.sin_family = AF_INET; 139 null.sin_len = sizeof(null); 140 null.sin_addr.s_addr = INADDR_ANY; 141 null.sin_port = 0; 142 bzero(&null.sin_zero, sizeof(null.sin_zero)); 143 nam = (struct sockaddr *)&null; 144 } 145 error = -rdma_bind_addr(ssk->id, nam); 146 SDP_WLOCK(ssk); 147 if (error == 0) { 148 sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr; 149 ssk->laddr = sin->sin_addr.s_addr; 150 ssk->lport = sin->sin_port; 151 } else 152 sdp_destroy_cma(ssk); 153 return (error); 154 } 155 156 static void 157 sdp_pcbfree(struct sdp_sock *ssk) 158 { 159 KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk)); 160 161 sdp_dbg(ssk->socket, "Freeing pcb"); 162 SDP_WLOCK_ASSERT(ssk); 163 ssk->flags |= SDP_DESTROY; 164 SDP_WUNLOCK(ssk); 165 SDP_LIST_WLOCK(); 166 sdp_count--; 167 LIST_REMOVE(ssk, list); 168 SDP_LIST_WUNLOCK(); 169 crfree(ssk->cred); 170 sdp_destroy_cma(ssk); 171 ssk->qp_active = 0; 172 if (ssk->qp) { 173 ib_destroy_qp(ssk->qp); 174 ssk->qp = NULL; 175 } 176 sdp_tx_ring_destroy(ssk); 177 sdp_rx_ring_destroy(ssk); 178 rw_destroy(&ssk->rx_ring.destroyed_lock); 179 uma_zfree(sdp_zone, ssk); 180 rw_destroy(&ssk->lock); 181 } 182 183 /* 184 * Common routines to return a socket address. 
185 */ 186 static struct sockaddr * 187 sdp_sockaddr(in_port_t port, struct in_addr *addr_p) 188 { 189 struct sockaddr_in *sin; 190 191 sin = malloc(sizeof *sin, M_SONAME, 192 M_WAITOK | M_ZERO); 193 sin->sin_family = AF_INET; 194 sin->sin_len = sizeof(*sin); 195 sin->sin_addr = *addr_p; 196 sin->sin_port = port; 197 198 return (struct sockaddr *)sin; 199 } 200 201 static int 202 sdp_getsockaddr(struct socket *so, struct sockaddr **nam) 203 { 204 struct sdp_sock *ssk; 205 struct in_addr addr; 206 in_port_t port; 207 208 ssk = sdp_sk(so); 209 SDP_RLOCK(ssk); 210 port = ssk->lport; 211 addr.s_addr = ssk->laddr; 212 SDP_RUNLOCK(ssk); 213 214 *nam = sdp_sockaddr(port, &addr); 215 return 0; 216 } 217 218 static int 219 sdp_getpeeraddr(struct socket *so, struct sockaddr **nam) 220 { 221 struct sdp_sock *ssk; 222 struct in_addr addr; 223 in_port_t port; 224 225 ssk = sdp_sk(so); 226 SDP_RLOCK(ssk); 227 port = ssk->fport; 228 addr.s_addr = ssk->faddr; 229 SDP_RUNLOCK(ssk); 230 231 *nam = sdp_sockaddr(port, &addr); 232 return 0; 233 } 234 235 static void 236 sdp_pcbnotifyall(struct in_addr faddr, int errno, 237 struct sdp_sock *(*notify)(struct sdp_sock *, int)) 238 { 239 struct sdp_sock *ssk, *ssk_temp; 240 241 SDP_LIST_WLOCK(); 242 LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) { 243 SDP_WLOCK(ssk); 244 if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) { 245 SDP_WUNLOCK(ssk); 246 continue; 247 } 248 if ((ssk->flags & SDP_DESTROY) == 0) 249 if ((*notify)(ssk, errno)) 250 SDP_WUNLOCK(ssk); 251 } 252 SDP_LIST_WUNLOCK(); 253 } 254 255 #if 0 256 static void 257 sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg) 258 { 259 struct sdp_sock *ssk; 260 261 SDP_LIST_RLOCK(); 262 LIST_FOREACH(ssk, &sdp_list, list) { 263 SDP_WLOCK(ssk); 264 func(ssk, arg); 265 SDP_WUNLOCK(ssk); 266 } 267 SDP_LIST_RUNLOCK(); 268 } 269 #endif 270 271 static void 272 sdp_output_reset(struct sdp_sock *ssk) 273 { 274 struct rdma_cm_id *id; 275 276 SDP_WLOCK_ASSERT(ssk); 277 if 
(ssk->id) { 278 id = ssk->id; 279 ssk->qp_active = 0; 280 SDP_WUNLOCK(ssk); 281 rdma_disconnect(id); 282 SDP_WLOCK(ssk); 283 } 284 ssk->state = TCPS_CLOSED; 285 } 286 287 /* 288 * Attempt to close a SDP socket, marking it as dropped, and freeing 289 * the socket if we hold the only reference. 290 */ 291 static struct sdp_sock * 292 sdp_closed(struct sdp_sock *ssk) 293 { 294 struct socket *so; 295 296 SDP_WLOCK_ASSERT(ssk); 297 298 ssk->flags |= SDP_DROPPED; 299 so = ssk->socket; 300 soisdisconnected(so); 301 if (ssk->flags & SDP_SOCKREF) { 302 KASSERT(so->so_state & SS_PROTOREF, 303 ("sdp_closed: !SS_PROTOREF")); 304 ssk->flags &= ~SDP_SOCKREF; 305 SDP_WUNLOCK(ssk); 306 ACCEPT_LOCK(); 307 SOCK_LOCK(so); 308 so->so_state &= ~SS_PROTOREF; 309 sofree(so); 310 return (NULL); 311 } 312 return (ssk); 313 } 314 315 /* 316 * Perform timer based shutdowns which can not operate in 317 * callout context. 318 */ 319 static void 320 sdp_shutdown_task(void *data, int pending) 321 { 322 struct sdp_sock *ssk; 323 324 ssk = data; 325 SDP_WLOCK(ssk); 326 /* 327 * I don't think this can race with another call to pcbfree() 328 * because SDP_TIMEWAIT protects it. SDP_DESTROY may be redundant. 329 */ 330 if (ssk->flags & SDP_DESTROY) 331 panic("sdp_shutdown_task: Racing with pcbfree for ssk %p", 332 ssk); 333 if (ssk->flags & SDP_DISCON) 334 sdp_output_reset(ssk); 335 /* We have to clear this so sdp_detach() will call pcbfree(). */ 336 ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT); 337 if ((ssk->flags & SDP_DROPPED) == 0 && 338 sdp_closed(ssk) == NULL) 339 return; 340 if (ssk->socket == NULL) { 341 sdp_pcbfree(ssk); 342 return; 343 } 344 SDP_WUNLOCK(ssk); 345 } 346 347 /* 348 * 2msl has expired, schedule the shutdown task. 349 */ 350 static void 351 sdp_2msl_timeout(void *data) 352 { 353 struct sdp_sock *ssk; 354 355 ssk = data; 356 /* Callout canceled. 
*/ 357 if (!callout_active(&ssk->keep2msl)) 358 goto out; 359 callout_deactivate(&ssk->keep2msl); 360 /* Should be impossible, defensive programming. */ 361 if ((ssk->flags & SDP_TIMEWAIT) == 0) 362 goto out; 363 taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task); 364 out: 365 SDP_WUNLOCK(ssk); 366 return; 367 } 368 369 /* 370 * Schedule the 2msl wait timer. 371 */ 372 static void 373 sdp_2msl_wait(struct sdp_sock *ssk) 374 { 375 376 SDP_WLOCK_ASSERT(ssk); 377 ssk->flags |= SDP_TIMEWAIT; 378 ssk->state = TCPS_TIME_WAIT; 379 soisdisconnected(ssk->socket); 380 callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk); 381 } 382 383 /* 384 * Timed out waiting for the final fin/ack from rdma_disconnect(). 385 */ 386 static void 387 sdp_dreq_timeout(void *data) 388 { 389 struct sdp_sock *ssk; 390 391 ssk = data; 392 /* Callout canceled. */ 393 if (!callout_active(&ssk->keep2msl)) 394 goto out; 395 /* Callout rescheduled, probably as a different timer. */ 396 if (callout_pending(&ssk->keep2msl)) 397 goto out; 398 callout_deactivate(&ssk->keep2msl); 399 if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK) 400 goto out; 401 if ((ssk->flags & SDP_DREQWAIT) == 0) 402 goto out; 403 ssk->flags &= ~SDP_DREQWAIT; 404 ssk->flags |= SDP_DISCON; 405 sdp_2msl_wait(ssk); 406 ssk->qp_active = 0; 407 out: 408 SDP_WUNLOCK(ssk); 409 } 410 411 /* 412 * Received the final fin/ack. Cancel the 2msl. 
413 */ 414 void 415 sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk) 416 { 417 sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n"); 418 ssk->flags &= ~SDP_DREQWAIT; 419 sdp_2msl_wait(ssk); 420 } 421 422 static int 423 sdp_init_sock(struct socket *sk) 424 { 425 struct sdp_sock *ssk = sdp_sk(sk); 426 427 sdp_dbg(sk, "%s\n", __func__); 428 429 callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED); 430 TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk); 431 #ifdef SDP_ZCOPY 432 INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout); 433 ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */ 434 ssk->tx_ring.rdma_inflight = NULL; 435 #endif 436 atomic_set(&ssk->mseq_ack, 0); 437 sdp_rx_ring_init(ssk); 438 ssk->tx_ring.buffer = NULL; 439 440 return 0; 441 } 442 443 /* 444 * Allocate an sdp_sock for the socket and reserve socket buffer space. 445 */ 446 static int 447 sdp_attach(struct socket *so, int proto, struct thread *td) 448 { 449 struct sdp_sock *ssk; 450 int error; 451 452 ssk = sdp_sk(so); 453 KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so)); 454 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 455 error = soreserve(so, sdp_sendspace, sdp_recvspace); 456 if (error) 457 return (error); 458 } 459 so->so_rcv.sb_flags |= SB_AUTOSIZE; 460 so->so_snd.sb_flags |= SB_AUTOSIZE; 461 ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO); 462 if (ssk == NULL) 463 return (ENOBUFS); 464 rw_init(&ssk->lock, "sdpsock"); 465 ssk->socket = so; 466 ssk->cred = crhold(so->so_cred); 467 so->so_pcb = (caddr_t)ssk; 468 sdp_init_sock(so); 469 ssk->flags = 0; 470 ssk->qp_active = 0; 471 ssk->state = TCPS_CLOSED; 472 SDP_LIST_WLOCK(); 473 LIST_INSERT_HEAD(&sdp_list, ssk, list); 474 sdp_count++; 475 SDP_LIST_WUNLOCK(); 476 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 477 so->so_linger = TCP_LINGERTIME; 478 479 return (0); 480 } 481 482 /* 483 * Detach SDP from the socket, potentially leaving it around for the 484 
* timewait to expire. 485 */ 486 static void 487 sdp_detach(struct socket *so) 488 { 489 struct sdp_sock *ssk; 490 491 ssk = sdp_sk(so); 492 SDP_WLOCK(ssk); 493 KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL")); 494 ssk->socket->so_pcb = NULL; 495 ssk->socket = NULL; 496 if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT)) 497 SDP_WUNLOCK(ssk); 498 else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT) 499 sdp_pcbfree(ssk); 500 else 501 panic("sdp_detach: Unexpected state, ssk %p.\n", ssk); 502 } 503 504 /* 505 * Allocate a local address for the socket. 506 */ 507 static int 508 sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 509 { 510 int error = 0; 511 struct sdp_sock *ssk; 512 struct sockaddr_in *sin; 513 514 sin = (struct sockaddr_in *)nam; 515 if (nam->sa_len != sizeof (*sin)) 516 return (EINVAL); 517 if (sin->sin_family != AF_INET) 518 return (EINVAL); 519 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 520 return (EAFNOSUPPORT); 521 522 ssk = sdp_sk(so); 523 SDP_WLOCK(ssk); 524 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 525 error = EINVAL; 526 goto out; 527 } 528 error = sdp_pcbbind(ssk, nam, td->td_ucred); 529 out: 530 SDP_WUNLOCK(ssk); 531 532 return (error); 533 } 534 535 /* 536 * Prepare to accept connections. 
537 */ 538 static int 539 sdp_listen(struct socket *so, int backlog, struct thread *td) 540 { 541 int error = 0; 542 struct sdp_sock *ssk; 543 544 ssk = sdp_sk(so); 545 SDP_WLOCK(ssk); 546 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 547 error = EINVAL; 548 goto out; 549 } 550 if (error == 0 && ssk->lport == 0) 551 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 552 SOCK_LOCK(so); 553 if (error == 0) 554 error = solisten_proto_check(so); 555 if (error == 0) { 556 solisten_proto(so, backlog); 557 ssk->state = TCPS_LISTEN; 558 } 559 SOCK_UNLOCK(so); 560 561 out: 562 SDP_WUNLOCK(ssk); 563 if (error == 0) 564 error = -rdma_listen(ssk->id, backlog); 565 return (error); 566 } 567 568 /* 569 * Initiate a SDP connection to nam. 570 */ 571 static int 572 sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td) 573 { 574 struct sockaddr_in src; 575 struct socket *so; 576 int error; 577 578 so = ssk->socket; 579 580 SDP_WLOCK_ASSERT(ssk); 581 if (ssk->lport == 0) { 582 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 583 if (error) 584 return error; 585 } 586 src.sin_family = AF_INET; 587 src.sin_len = sizeof(src); 588 bzero(&src.sin_zero, sizeof(src.sin_zero)); 589 src.sin_port = ssk->lport; 590 src.sin_addr.s_addr = ssk->laddr; 591 soisconnecting(so); 592 SDP_WUNLOCK(ssk); 593 error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam, 594 SDP_RESOLVE_TIMEOUT); 595 SDP_WLOCK(ssk); 596 if (error == 0) 597 ssk->state = TCPS_SYN_SENT; 598 599 return 0; 600 } 601 602 /* 603 * Initiate SDP connection. 
604 */ 605 static int 606 sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 607 { 608 int error = 0; 609 struct sdp_sock *ssk; 610 struct sockaddr_in *sin; 611 612 sin = (struct sockaddr_in *)nam; 613 if (nam->sa_len != sizeof (*sin)) 614 return (EINVAL); 615 if (sin->sin_family != AF_INET) 616 return (EINVAL); 617 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 618 return (EAFNOSUPPORT); 619 if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0) 620 return (error); 621 ssk = sdp_sk(so); 622 SDP_WLOCK(ssk); 623 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) 624 error = EINVAL; 625 else 626 error = sdp_start_connect(ssk, nam, td); 627 SDP_WUNLOCK(ssk); 628 return (error); 629 } 630 631 /* 632 * Drop a SDP socket, reporting 633 * the specified error. If connection is synchronized, 634 * then send a RST to peer. 635 */ 636 static struct sdp_sock * 637 sdp_drop(struct sdp_sock *ssk, int errno) 638 { 639 struct socket *so; 640 641 SDP_WLOCK_ASSERT(ssk); 642 so = ssk->socket; 643 if (TCPS_HAVERCVDSYN(ssk->state)) 644 sdp_output_reset(ssk); 645 if (errno == ETIMEDOUT && ssk->softerror) 646 errno = ssk->softerror; 647 so->so_error = errno; 648 return (sdp_closed(ssk)); 649 } 650 651 /* 652 * User issued close, and wish to trail through shutdown states: 653 * if never received SYN, just forget it. If got a SYN from peer, 654 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 655 * If already got a FIN from peer, then almost done; go to LAST_ACK 656 * state. In all other cases, have already sent FIN to peer (e.g. 657 * after PRU_SHUTDOWN), and just have to play tedious game waiting 658 * for peer to send FIN or not respond to keep-alives, etc. 659 * We can let the user exit from the close as soon as the FIN is acked. 
660 */ 661 static void 662 sdp_usrclosed(struct sdp_sock *ssk) 663 { 664 665 SDP_WLOCK_ASSERT(ssk); 666 667 switch (ssk->state) { 668 case TCPS_LISTEN: 669 ssk->state = TCPS_CLOSED; 670 SDP_WUNLOCK(ssk); 671 sdp_destroy_cma(ssk); 672 SDP_WLOCK(ssk); 673 /* FALLTHROUGH */ 674 case TCPS_CLOSED: 675 ssk = sdp_closed(ssk); 676 /* 677 * sdp_closed() should never return NULL here as the socket is 678 * still open. 679 */ 680 KASSERT(ssk != NULL, 681 ("sdp_usrclosed: sdp_closed() returned NULL")); 682 break; 683 684 case TCPS_SYN_SENT: 685 /* FALLTHROUGH */ 686 case TCPS_SYN_RECEIVED: 687 ssk->flags |= SDP_NEEDFIN; 688 break; 689 690 case TCPS_ESTABLISHED: 691 ssk->flags |= SDP_NEEDFIN; 692 ssk->state = TCPS_FIN_WAIT_1; 693 break; 694 695 case TCPS_CLOSE_WAIT: 696 ssk->state = TCPS_LAST_ACK; 697 break; 698 } 699 if (ssk->state >= TCPS_FIN_WAIT_2) { 700 /* Prevent the connection hanging in FIN_WAIT_2 forever. */ 701 if (ssk->state == TCPS_FIN_WAIT_2) 702 sdp_2msl_wait(ssk); 703 else 704 soisdisconnected(ssk->socket); 705 } 706 } 707 708 static void 709 sdp_output_disconnect(struct sdp_sock *ssk) 710 { 711 712 SDP_WLOCK_ASSERT(ssk); 713 callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT, 714 sdp_dreq_timeout, ssk); 715 ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT; 716 sdp_post_sends(ssk, M_NOWAIT); 717 } 718 719 /* 720 * Initiate or continue a disconnect. 721 * If embryonic state, just send reset (once). 722 * If in ``let data drain'' option and linger null, just drop. 723 * Otherwise (hard), mark socket disconnecting and drop 724 * current input data; switch states based on user close, and 725 * send segment to peer (with FIN). 726 */ 727 static void 728 sdp_start_disconnect(struct sdp_sock *ssk) 729 { 730 struct socket *so; 731 int unread; 732 733 so = ssk->socket; 734 SDP_WLOCK_ASSERT(ssk); 735 sdp_stop_keepalive_timer(so); 736 /* 737 * Neither sdp_closed() nor sdp_drop() should return NULL, as the 738 * socket is still open. 
739 */ 740 if (ssk->state < TCPS_ESTABLISHED) { 741 ssk = sdp_closed(ssk); 742 KASSERT(ssk != NULL, 743 ("sdp_start_disconnect: sdp_close() returned NULL")); 744 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 745 ssk = sdp_drop(ssk, 0); 746 KASSERT(ssk != NULL, 747 ("sdp_start_disconnect: sdp_drop() returned NULL")); 748 } else { 749 soisdisconnecting(so); 750 unread = sbused(&so->so_rcv); 751 sbflush(&so->so_rcv); 752 sdp_usrclosed(ssk); 753 if (!(ssk->flags & SDP_DROPPED)) { 754 if (unread) 755 sdp_output_reset(ssk); 756 else 757 sdp_output_disconnect(ssk); 758 } 759 } 760 } 761 762 /* 763 * User initiated disconnect. 764 */ 765 static int 766 sdp_disconnect(struct socket *so) 767 { 768 struct sdp_sock *ssk; 769 int error = 0; 770 771 ssk = sdp_sk(so); 772 SDP_WLOCK(ssk); 773 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 774 error = ECONNRESET; 775 goto out; 776 } 777 sdp_start_disconnect(ssk); 778 out: 779 SDP_WUNLOCK(ssk); 780 return (error); 781 } 782 783 /* 784 * Accept a connection. Essentially all the work is done at higher levels; 785 * just return the address of the peer, storing through addr. 786 * 787 * 788 * XXX This is broken XXX 789 * 790 * The rationale for acquiring the sdp lock here is somewhat complicated, 791 * and is described in detail in the commit log entry for r175612. Acquiring 792 * it delays an accept(2) racing with sonewconn(), which inserts the socket 793 * before the address/port fields are initialized. A better fix would 794 * prevent the socket from being placed in the listen queue until all fields 795 * are fully initialized. 
796 */ 797 static int 798 sdp_accept(struct socket *so, struct sockaddr **nam) 799 { 800 struct sdp_sock *ssk = NULL; 801 struct in_addr addr; 802 in_port_t port; 803 int error; 804 805 if (so->so_state & SS_ISDISCONNECTED) 806 return (ECONNABORTED); 807 808 port = 0; 809 addr.s_addr = 0; 810 error = 0; 811 ssk = sdp_sk(so); 812 SDP_WLOCK(ssk); 813 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 814 error = ECONNABORTED; 815 goto out; 816 } 817 port = ssk->fport; 818 addr.s_addr = ssk->faddr; 819 out: 820 SDP_WUNLOCK(ssk); 821 if (error == 0) 822 *nam = sdp_sockaddr(port, &addr); 823 return error; 824 } 825 826 /* 827 * Mark the connection as being incapable of further output. 828 */ 829 static int 830 sdp_shutdown(struct socket *so) 831 { 832 int error = 0; 833 struct sdp_sock *ssk; 834 835 ssk = sdp_sk(so); 836 SDP_WLOCK(ssk); 837 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 838 error = ECONNRESET; 839 goto out; 840 } 841 socantsendmore(so); 842 sdp_usrclosed(ssk); 843 if (!(ssk->flags & SDP_DROPPED)) 844 sdp_output_disconnect(ssk); 845 846 out: 847 SDP_WUNLOCK(ssk); 848 849 return (error); 850 } 851 852 static void 853 sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt) 854 { 855 struct mbuf *n; 856 int ncnt; 857 858 SOCKBUF_LOCK_ASSERT(sb); 859 SBLASTRECORDCHK(sb); 860 KASSERT(mb->m_flags & M_PKTHDR, 861 ("sdp_append: %p Missing packet header.\n", mb)); 862 n = sb->sb_lastrecord; 863 /* 864 * If the queue is empty just set all pointers and proceed. 865 */ 866 if (n == NULL) { 867 sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb; 868 for (; mb; mb = mb->m_next) { 869 sb->sb_mbtail = mb; 870 sballoc(sb, mb); 871 } 872 return; 873 } 874 /* 875 * Count the number of mbufs in the current tail. 876 */ 877 for (ncnt = 0; n->m_next; n = n->m_next) 878 ncnt++; 879 n = sb->sb_lastrecord; 880 /* 881 * If the two chains can fit in a single sdp packet and 882 * the last record has not been sent yet (WRITABLE) coalesce 883 * them. 
The lastrecord remains the same but we must strip the 884 * packet header and then let sbcompress do the hard part. 885 */ 886 if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES && 887 n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE < 888 ssk->xmit_size_goal) { 889 m_adj(mb, SDP_HEAD_SIZE); 890 n->m_pkthdr.len += mb->m_pkthdr.len; 891 n->m_flags |= mb->m_flags & (M_PUSH | M_URG); 892 m_demote(mb, 1); 893 sbcompress(sb, mb, sb->sb_mbtail); 894 return; 895 } 896 /* 897 * Not compressible, just append to the end and adjust counters. 898 */ 899 sb->sb_lastrecord->m_flags |= M_PUSH; 900 sb->sb_lastrecord->m_nextpkt = mb; 901 sb->sb_lastrecord = mb; 902 if (sb->sb_sndptr == NULL) 903 sb->sb_sndptr = mb; 904 for (; mb; mb = mb->m_next) { 905 sb->sb_mbtail = mb; 906 sballoc(sb, mb); 907 } 908 } 909 910 /* 911 * Do a send by putting data in output queue and updating urgent 912 * marker if URG set. Possibly send more data. Unlike the other 913 * pru_*() routines, the mbuf chains are our responsibility. We 914 * must either enqueue them or free them. The other pru_* routines 915 * generally are caller-frees. 916 * 917 * This comes from sendfile, normal sends will come from sdp_sosend(). 
918 */ 919 static int 920 sdp_send(struct socket *so, int flags, struct mbuf *m, 921 struct sockaddr *nam, struct mbuf *control, struct thread *td) 922 { 923 struct sdp_sock *ssk; 924 struct mbuf *n; 925 int error; 926 int cnt; 927 928 error = 0; 929 ssk = sdp_sk(so); 930 KASSERT(m->m_flags & M_PKTHDR, 931 ("sdp_send: %p no packet header", m)); 932 M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK); 933 mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA; 934 for (n = m, cnt = 0; n->m_next; n = n->m_next) 935 cnt++; 936 if (cnt > SDP_MAX_SEND_SGES) { 937 n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES); 938 if (n == NULL) { 939 m_freem(m); 940 return (EMSGSIZE); 941 } 942 m = n; 943 for (cnt = 0; n->m_next; n = n->m_next) 944 cnt++; 945 } 946 SDP_WLOCK(ssk); 947 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 948 if (control) 949 m_freem(control); 950 if (m) 951 m_freem(m); 952 error = ECONNRESET; 953 goto out; 954 } 955 if (control) { 956 /* SDP doesn't support control messages. */ 957 if (control->m_len) { 958 m_freem(control); 959 if (m) 960 m_freem(m); 961 error = EINVAL; 962 goto out; 963 } 964 m_freem(control); /* empty control, just free it */ 965 } 966 if (!(flags & PRUS_OOB)) { 967 SOCKBUF_LOCK(&so->so_snd); 968 sdp_append(ssk, &so->so_snd, m, cnt); 969 SOCKBUF_UNLOCK(&so->so_snd); 970 if (nam && ssk->state < TCPS_SYN_SENT) { 971 /* 972 * Do implied connect if not yet connected. 973 */ 974 error = sdp_start_connect(ssk, nam, td); 975 if (error) 976 goto out; 977 } 978 if (flags & PRUS_EOF) { 979 /* 980 * Close the send side of the connection after 981 * the data is sent. 
982 */ 983 socantsendmore(so); 984 sdp_usrclosed(ssk); 985 if (!(ssk->flags & SDP_DROPPED)) 986 sdp_output_disconnect(ssk); 987 } else if (!(ssk->flags & SDP_DROPPED) && 988 !(flags & PRUS_MORETOCOME)) 989 sdp_post_sends(ssk, M_NOWAIT); 990 SDP_WUNLOCK(ssk); 991 return (0); 992 } else { 993 SOCKBUF_LOCK(&so->so_snd); 994 if (sbspace(&so->so_snd) < -512) { 995 SOCKBUF_UNLOCK(&so->so_snd); 996 m_freem(m); 997 error = ENOBUFS; 998 goto out; 999 } 1000 /* 1001 * According to RFC961 (Assigned Protocols), 1002 * the urgent pointer points to the last octet 1003 * of urgent data. We continue, however, 1004 * to consider it to indicate the first octet 1005 * of data past the urgent section. 1006 * Otherwise, snd_up should be one lower. 1007 */ 1008 m->m_flags |= M_URG | M_PUSH; 1009 sdp_append(ssk, &so->so_snd, m, cnt); 1010 SOCKBUF_UNLOCK(&so->so_snd); 1011 if (nam && ssk->state < TCPS_SYN_SENT) { 1012 /* 1013 * Do implied connect if not yet connected. 1014 */ 1015 error = sdp_start_connect(ssk, nam, td); 1016 if (error) 1017 goto out; 1018 } 1019 sdp_post_sends(ssk, M_NOWAIT); 1020 SDP_WUNLOCK(ssk); 1021 return (0); 1022 } 1023 out: 1024 SDP_WUNLOCK(ssk); 1025 return (error); 1026 } 1027 1028 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 1029 1030 /* 1031 * Send on a socket. If send must go all at once and message is larger than 1032 * send buffering, then hard error. Lock against other senders. If must go 1033 * all at once and not enough room now, then inform user that this would 1034 * block and do nothing. Otherwise, if nonblocking, send as much as 1035 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1036 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1037 * in mbuf chain must be small enough to send all at once. 1038 * 1039 * Returns nonzero on error, timeout or signal; callers must check for short 1040 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1041 * on return. 
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	/* resid is the number of bytes still to be handed to sdp_send(). */
	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* A caller-supplied mbuf chain has to go out in one piece. */
	atomic = top != NULL;
	if (control != NULL) {
		/* Only an empty control chain is tolerated; it is discarded. */
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	/* Serialize against other senders; released at "release" below. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		/*
		 * Every exit from this locked section, whether by goto or by
		 * falling through, unlocks the send buffer first.
		 */
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			/* Report the pending socket error once and clear it. */
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		/* Out-of-band data gets 1024 bytes of extra headroom. */
		if (flags & MSG_OOB)
			space += 1024;
		/* An atomic send must fit in a single SDP packet. */
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			/* Not enough room: fail if non-blocking, else wait. */
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			/* Re-evaluate all of the checks above from scratch. */
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				/* The whole caller-supplied chain goes at once. */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				/* Charge what was just copied in against space. */
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			/* sdp_send() took the chain; never free it again here. */
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	/* Only reached with a non-NULL top when nothing consumed it. */
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 *
 * A fresh mbuf is handed to the protocol's pru_rcvoob method and whatever it
 * returns is copied out through uio, one mbuf at a time.  Returns 0 or the
 * first error from pru_rcvoob or uiomove().
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		/* m_free() releases this mbuf and yields the next one. */
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	/* Release whatever part of the chain was not consumed. */
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
1217 */ 1218 static int 1219 sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio, 1220 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1221 { 1222 int len = 0, error = 0, flags, oresid; 1223 struct sockbuf *sb; 1224 struct mbuf *m, *n = NULL; 1225 struct sdp_sock *ssk; 1226 1227 /* We only do stream sockets. */ 1228 if (so->so_type != SOCK_STREAM) 1229 return (EINVAL); 1230 if (psa != NULL) 1231 *psa = NULL; 1232 if (controlp != NULL) 1233 return (EINVAL); 1234 if (flagsp != NULL) 1235 flags = *flagsp &~ MSG_EOR; 1236 else 1237 flags = 0; 1238 if (flags & MSG_OOB) 1239 return (soreceive_rcvoob(so, uio, flags)); 1240 if (mp0 != NULL) 1241 *mp0 = NULL; 1242 1243 sb = &so->so_rcv; 1244 ssk = sdp_sk(so); 1245 1246 /* Prevent other readers from entering the socket. */ 1247 error = sblock(sb, SBLOCKWAIT(flags)); 1248 if (error) 1249 goto out; 1250 SOCKBUF_LOCK(sb); 1251 1252 /* Easy one, no space to copyout anything. */ 1253 if (uio->uio_resid == 0) { 1254 error = EINVAL; 1255 goto out; 1256 } 1257 oresid = uio->uio_resid; 1258 1259 /* We will never ever get anything unless we are connected. */ 1260 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 1261 /* When disconnecting there may be still some data left. */ 1262 if (sbavail(sb)) 1263 goto deliver; 1264 if (!(so->so_state & SS_ISDISCONNECTED)) 1265 error = ENOTCONN; 1266 goto out; 1267 } 1268 1269 /* Socket buffer is empty and we shall not block. */ 1270 if (sbavail(sb) == 0 && 1271 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 1272 error = EAGAIN; 1273 goto out; 1274 } 1275 1276 restart: 1277 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1278 1279 /* Abort if socket has reported problems. */ 1280 if (so->so_error) { 1281 if (sbavail(sb)) 1282 goto deliver; 1283 if (oresid > uio->uio_resid) 1284 goto out; 1285 error = so->so_error; 1286 if (!(flags & MSG_PEEK)) 1287 so->so_error = 0; 1288 goto out; 1289 } 1290 1291 /* Door is closed. Deliver what is left, if any. 
*/ 1292 if (sb->sb_state & SBS_CANTRCVMORE) { 1293 if (sbavail(sb)) 1294 goto deliver; 1295 else 1296 goto out; 1297 } 1298 1299 /* Socket buffer got some data that we shall deliver now. */ 1300 if (sbavail(sb) && !(flags & MSG_WAITALL) && 1301 ((so->so_state & SS_NBIO) || 1302 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 1303 sbavail(sb) >= sb->sb_lowat || 1304 sbavail(sb) >= uio->uio_resid || 1305 sbavail(sb) >= sb->sb_hiwat) ) { 1306 goto deliver; 1307 } 1308 1309 /* On MSG_WAITALL we must wait until all data or error arrives. */ 1310 if ((flags & MSG_WAITALL) && 1311 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat)) 1312 goto deliver; 1313 1314 /* 1315 * Wait and block until (more) data comes in. 1316 * NB: Drops the sockbuf lock during wait. 1317 */ 1318 error = sbwait(sb); 1319 if (error) 1320 goto out; 1321 goto restart; 1322 1323 deliver: 1324 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1325 KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__)); 1326 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 1327 1328 /* Statistics. */ 1329 if (uio->uio_td) 1330 uio->uio_td->td_ru.ru_msgrcv++; 1331 1332 /* Fill uio until full or current end of socket buffer is reached. */ 1333 len = min(uio->uio_resid, sbavail(sb)); 1334 if (mp0 != NULL) { 1335 /* Dequeue as many mbufs as possible. */ 1336 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 1337 for (*mp0 = m = sb->sb_mb; 1338 m != NULL && m->m_len <= len; 1339 m = m->m_next) { 1340 len -= m->m_len; 1341 uio->uio_resid -= m->m_len; 1342 sbfree(sb, m); 1343 n = m; 1344 } 1345 sb->sb_mb = m; 1346 if (sb->sb_mb == NULL) 1347 SB_EMPTY_FIXUP(sb); 1348 n->m_next = NULL; 1349 } 1350 /* Copy the remainder. */ 1351 if (len > 0) { 1352 KASSERT(sb->sb_mb != NULL, 1353 ("%s: len > 0 && sb->sb_mb empty", __func__)); 1354 1355 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 1356 if (m == NULL) 1357 len = 0; /* Don't flush data from sockbuf. 
*/ 1358 else 1359 uio->uio_resid -= m->m_len; 1360 if (*mp0 != NULL) 1361 n->m_next = m; 1362 else 1363 *mp0 = m; 1364 if (*mp0 == NULL) { 1365 error = ENOBUFS; 1366 goto out; 1367 } 1368 } 1369 } else { 1370 /* NB: Must unlock socket buffer as uiomove may sleep. */ 1371 SOCKBUF_UNLOCK(sb); 1372 error = m_mbuftouio(uio, sb->sb_mb, len); 1373 SOCKBUF_LOCK(sb); 1374 if (error) 1375 goto out; 1376 } 1377 SBLASTRECORDCHK(sb); 1378 SBLASTMBUFCHK(sb); 1379 1380 /* 1381 * Remove the delivered data from the socket buffer unless we 1382 * were only peeking. 1383 */ 1384 if (!(flags & MSG_PEEK)) { 1385 if (len > 0) 1386 sbdrop_locked(sb, len); 1387 1388 /* Notify protocol that we drained some data. */ 1389 SOCKBUF_UNLOCK(sb); 1390 SDP_WLOCK(ssk); 1391 sdp_do_posts(ssk); 1392 SDP_WUNLOCK(ssk); 1393 SOCKBUF_LOCK(sb); 1394 } 1395 1396 /* 1397 * For MSG_WAITALL we may have to loop again and wait for 1398 * more data to come in. 1399 */ 1400 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 1401 goto restart; 1402 out: 1403 SOCKBUF_LOCK_ASSERT(sb); 1404 SBLASTRECORDCHK(sb); 1405 SBLASTMBUFCHK(sb); 1406 SOCKBUF_UNLOCK(sb); 1407 sbunlock(sb); 1408 return (error); 1409 } 1410 1411 /* 1412 * Abort is used to teardown a connection typically while sitting in 1413 * the accept queue. 1414 */ 1415 void 1416 sdp_abort(struct socket *so) 1417 { 1418 struct sdp_sock *ssk; 1419 1420 ssk = sdp_sk(so); 1421 SDP_WLOCK(ssk); 1422 /* 1423 * If we have not yet dropped, do it now. 1424 */ 1425 if (!(ssk->flags & SDP_TIMEWAIT) && 1426 !(ssk->flags & SDP_DROPPED)) 1427 sdp_drop(ssk, ECONNABORTED); 1428 KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X", 1429 ssk, ssk->flags)); 1430 SDP_WUNLOCK(ssk); 1431 } 1432 1433 /* 1434 * Close a SDP socket and initiate a friendly disconnect. 1435 */ 1436 static void 1437 sdp_close(struct socket *so) 1438 { 1439 struct sdp_sock *ssk; 1440 1441 ssk = sdp_sk(so); 1442 SDP_WLOCK(ssk); 1443 /* 1444 * If we have not yet dropped, do it now. 
1445 */ 1446 if (!(ssk->flags & SDP_TIMEWAIT) && 1447 !(ssk->flags & SDP_DROPPED)) 1448 sdp_start_disconnect(ssk); 1449 1450 /* 1451 * If we've still not dropped let the socket layer know we're 1452 * holding on to the socket and pcb for a while. 1453 */ 1454 if (!(ssk->flags & SDP_DROPPED)) { 1455 SOCK_LOCK(so); 1456 so->so_state |= SS_PROTOREF; 1457 SOCK_UNLOCK(so); 1458 ssk->flags |= SDP_SOCKREF; 1459 } 1460 SDP_WUNLOCK(ssk); 1461 } 1462 1463 /* 1464 * User requests out-of-band data. 1465 */ 1466 static int 1467 sdp_rcvoob(struct socket *so, struct mbuf *m, int flags) 1468 { 1469 int error = 0; 1470 struct sdp_sock *ssk; 1471 1472 ssk = sdp_sk(so); 1473 SDP_WLOCK(ssk); 1474 if (!rx_ring_trylock(&ssk->rx_ring)) { 1475 SDP_WUNLOCK(ssk); 1476 return (ECONNRESET); 1477 } 1478 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1479 error = ECONNRESET; 1480 goto out; 1481 } 1482 if ((so->so_oobmark == 0 && 1483 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || 1484 so->so_options & SO_OOBINLINE || 1485 ssk->oobflags & SDP_HADOOB) { 1486 error = EINVAL; 1487 goto out; 1488 } 1489 if ((ssk->oobflags & SDP_HAVEOOB) == 0) { 1490 error = EWOULDBLOCK; 1491 goto out; 1492 } 1493 m->m_len = 1; 1494 *mtod(m, caddr_t) = ssk->iobc; 1495 if ((flags & MSG_PEEK) == 0) 1496 ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB); 1497 out: 1498 rx_ring_unlock(&ssk->rx_ring); 1499 SDP_WUNLOCK(ssk); 1500 return (error); 1501 } 1502 1503 void 1504 sdp_urg(struct sdp_sock *ssk, struct mbuf *mb) 1505 { 1506 struct mbuf *m; 1507 struct socket *so; 1508 1509 so = ssk->socket; 1510 if (so == NULL) 1511 return; 1512 1513 so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1; 1514 sohasoutofband(so); 1515 ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB); 1516 if (!(so->so_options & SO_OOBINLINE)) { 1517 for (m = mb; m->m_next != NULL; m = m->m_next); 1518 ssk->iobc = *(mtod(m, char *) + m->m_len - 1); 1519 ssk->oobflags |= SDP_HAVEOOB; 1520 m->m_len--; 1521 mb->m_pkthdr.len--; 1522 } 1523 } 1524 1525 /* 
1526 * Notify a sdp socket of an asynchronous error. 1527 * 1528 * Do not wake up user since there currently is no mechanism for 1529 * reporting soft errors (yet - a kqueue filter may be added). 1530 */ 1531 struct sdp_sock * 1532 sdp_notify(struct sdp_sock *ssk, int error) 1533 { 1534 1535 SDP_WLOCK_ASSERT(ssk); 1536 1537 if ((ssk->flags & SDP_TIMEWAIT) || 1538 (ssk->flags & SDP_DROPPED)) 1539 return (ssk); 1540 1541 /* 1542 * Ignore some errors if we are hooked up. 1543 */ 1544 if (ssk->state == TCPS_ESTABLISHED && 1545 (error == EHOSTUNREACH || error == ENETUNREACH || 1546 error == EHOSTDOWN)) 1547 return (ssk); 1548 ssk->softerror = error; 1549 return sdp_drop(ssk, error); 1550 } 1551 1552 static void 1553 sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip) 1554 { 1555 struct in_addr faddr; 1556 1557 faddr = ((struct sockaddr_in *)sa)->sin_addr; 1558 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1559 return; 1560 1561 sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify); 1562 } 1563 1564 static int 1565 sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, 1566 struct thread *td) 1567 { 1568 return (EOPNOTSUPP); 1569 } 1570 1571 static void 1572 sdp_keepalive_timeout(void *data) 1573 { 1574 struct sdp_sock *ssk; 1575 1576 ssk = data; 1577 /* Callout canceled. */ 1578 if (!callout_active(&ssk->keep2msl)) 1579 return; 1580 /* Callout rescheduled as a different kind of timer. 
*/ 1581 if (callout_pending(&ssk->keep2msl)) 1582 goto out; 1583 callout_deactivate(&ssk->keep2msl); 1584 if (ssk->flags & SDP_DROPPED || 1585 (ssk->socket->so_options & SO_KEEPALIVE) == 0) 1586 goto out; 1587 sdp_post_keepalive(ssk); 1588 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, 1589 sdp_keepalive_timeout, ssk); 1590 out: 1591 SDP_WUNLOCK(ssk); 1592 } 1593 1594 1595 void 1596 sdp_start_keepalive_timer(struct socket *so) 1597 { 1598 struct sdp_sock *ssk; 1599 1600 ssk = sdp_sk(so); 1601 if (!callout_pending(&ssk->keep2msl)) 1602 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, 1603 sdp_keepalive_timeout, ssk); 1604 } 1605 1606 static void 1607 sdp_stop_keepalive_timer(struct socket *so) 1608 { 1609 struct sdp_sock *ssk; 1610 1611 ssk = sdp_sk(so); 1612 callout_stop(&ssk->keep2msl); 1613 } 1614 1615 /* 1616 * sdp_ctloutput() must drop the inpcb lock before performing copyin on 1617 * socket option arguments. When it re-acquires the lock after the copy, it 1618 * has to revalidate that the connection is still valid for the socket 1619 * option. 
1620 */ 1621 #define SDP_WLOCK_RECHECK(inp) do { \ 1622 SDP_WLOCK(ssk); \ 1623 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \ 1624 SDP_WUNLOCK(ssk); \ 1625 return (ECONNRESET); \ 1626 } \ 1627 } while(0) 1628 1629 static int 1630 sdp_ctloutput(struct socket *so, struct sockopt *sopt) 1631 { 1632 int error, opt, optval; 1633 struct sdp_sock *ssk; 1634 1635 error = 0; 1636 ssk = sdp_sk(so); 1637 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) { 1638 SDP_WLOCK(ssk); 1639 if (so->so_options & SO_KEEPALIVE) 1640 sdp_start_keepalive_timer(so); 1641 else 1642 sdp_stop_keepalive_timer(so); 1643 SDP_WUNLOCK(ssk); 1644 } 1645 if (sopt->sopt_level != IPPROTO_TCP) 1646 return (error); 1647 1648 SDP_WLOCK(ssk); 1649 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1650 SDP_WUNLOCK(ssk); 1651 return (ECONNRESET); 1652 } 1653 1654 switch (sopt->sopt_dir) { 1655 case SOPT_SET: 1656 switch (sopt->sopt_name) { 1657 case TCP_NODELAY: 1658 SDP_WUNLOCK(ssk); 1659 error = sooptcopyin(sopt, &optval, sizeof optval, 1660 sizeof optval); 1661 if (error) 1662 return (error); 1663 1664 SDP_WLOCK_RECHECK(ssk); 1665 opt = SDP_NODELAY; 1666 if (optval) 1667 ssk->flags |= opt; 1668 else 1669 ssk->flags &= ~opt; 1670 sdp_do_posts(ssk); 1671 SDP_WUNLOCK(ssk); 1672 break; 1673 1674 default: 1675 SDP_WUNLOCK(ssk); 1676 error = ENOPROTOOPT; 1677 break; 1678 } 1679 break; 1680 1681 case SOPT_GET: 1682 switch (sopt->sopt_name) { 1683 case TCP_NODELAY: 1684 optval = ssk->flags & SDP_NODELAY; 1685 SDP_WUNLOCK(ssk); 1686 error = sooptcopyout(sopt, &optval, sizeof optval); 1687 break; 1688 default: 1689 SDP_WUNLOCK(ssk); 1690 error = ENOPROTOOPT; 1691 break; 1692 } 1693 break; 1694 } 1695 return (error); 1696 } 1697 #undef SDP_WLOCK_RECHECK 1698 1699 int sdp_mod_count = 0; 1700 int sdp_mod_usec = 0; 1701 1702 void 1703 sdp_set_default_moderation(struct sdp_sock *ssk) 1704 { 1705 if (sdp_mod_count <= 0 || sdp_mod_usec <= 0) 1706 return; 1707 ib_modify_cq(ssk->rx_ring.cq, 
sdp_mod_count, sdp_mod_usec); 1708 } 1709 1710 1711 static void 1712 sdp_dev_add(struct ib_device *device) 1713 { 1714 struct ib_fmr_pool_param param; 1715 struct sdp_device *sdp_dev; 1716 1717 sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO); 1718 sdp_dev->pd = ib_alloc_pd(device); 1719 if (IS_ERR(sdp_dev->pd)) 1720 goto out_pd; 1721 sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE); 1722 if (IS_ERR(sdp_dev->mr)) 1723 goto out_mr; 1724 memset(¶m, 0, sizeof param); 1725 param.max_pages_per_fmr = SDP_FMR_SIZE; 1726 param.page_shift = PAGE_SHIFT; 1727 param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ); 1728 param.pool_size = SDP_FMR_POOL_SIZE; 1729 param.dirty_watermark = SDP_FMR_DIRTY_SIZE; 1730 param.cache = 1; 1731 sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, ¶m); 1732 if (IS_ERR(sdp_dev->fmr_pool)) 1733 goto out_fmr; 1734 ib_set_client_data(device, &sdp_client, sdp_dev); 1735 return; 1736 1737 out_fmr: 1738 ib_dereg_mr(sdp_dev->mr); 1739 out_mr: 1740 ib_dealloc_pd(sdp_dev->pd); 1741 out_pd: 1742 free(sdp_dev, M_SDP); 1743 } 1744 1745 static void 1746 sdp_dev_rem(struct ib_device *device) 1747 { 1748 struct sdp_device *sdp_dev; 1749 struct sdp_sock *ssk; 1750 1751 SDP_LIST_WLOCK(); 1752 LIST_FOREACH(ssk, &sdp_list, list) { 1753 if (ssk->ib_device != device) 1754 continue; 1755 SDP_WLOCK(ssk); 1756 if ((ssk->flags & SDP_DESTROY) == 0) 1757 ssk = sdp_notify(ssk, ECONNRESET); 1758 if (ssk) 1759 SDP_WUNLOCK(ssk); 1760 } 1761 SDP_LIST_WUNLOCK(); 1762 /* 1763 * XXX Do I need to wait between these two? 
1764 */ 1765 sdp_dev = ib_get_client_data(device, &sdp_client); 1766 if (!sdp_dev) 1767 return; 1768 ib_flush_fmr_pool(sdp_dev->fmr_pool); 1769 ib_destroy_fmr_pool(sdp_dev->fmr_pool); 1770 ib_dereg_mr(sdp_dev->mr); 1771 ib_dealloc_pd(sdp_dev->pd); 1772 free(sdp_dev, M_SDP); 1773 } 1774 1775 struct ib_client sdp_client = 1776 { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem }; 1777 1778 1779 static int 1780 sdp_pcblist(SYSCTL_HANDLER_ARGS) 1781 { 1782 int error, n, i; 1783 struct sdp_sock *ssk; 1784 struct xinpgen xig; 1785 1786 /* 1787 * The process of preparing the TCB list is too time-consuming and 1788 * resource-intensive to repeat twice on every request. 1789 */ 1790 if (req->oldptr == NULL) { 1791 n = sdp_count; 1792 n += imax(n / 8, 10); 1793 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); 1794 return (0); 1795 } 1796 1797 if (req->newptr != NULL) 1798 return (EPERM); 1799 1800 /* 1801 * OK, now we're committed to doing something. 1802 */ 1803 SDP_LIST_RLOCK(); 1804 n = sdp_count; 1805 SDP_LIST_RUNLOCK(); 1806 1807 error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) 1808 + n * sizeof(struct xtcpcb)); 1809 if (error != 0) 1810 return (error); 1811 1812 xig.xig_len = sizeof xig; 1813 xig.xig_count = n; 1814 xig.xig_gen = 0; 1815 xig.xig_sogen = so_gencnt; 1816 error = SYSCTL_OUT(req, &xig, sizeof xig); 1817 if (error) 1818 return (error); 1819 1820 SDP_LIST_RLOCK(); 1821 for (ssk = LIST_FIRST(&sdp_list), i = 0; 1822 ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) { 1823 struct xtcpcb xt; 1824 1825 SDP_RLOCK(ssk); 1826 if (ssk->flags & SDP_TIMEWAIT) { 1827 if (ssk->cred != NULL) 1828 error = cr_cansee(req->td->td_ucred, 1829 ssk->cred); 1830 else 1831 error = EINVAL; /* Skip this inp. 
*/ 1832 } else if (ssk->socket) 1833 error = cr_canseesocket(req->td->td_ucred, 1834 ssk->socket); 1835 else 1836 error = EINVAL; 1837 if (error) { 1838 error = 0; 1839 goto next; 1840 } 1841 1842 bzero(&xt, sizeof(xt)); 1843 xt.xt_len = sizeof xt; 1844 xt.xt_inp.inp_gencnt = 0; 1845 xt.xt_inp.inp_vflag = INP_IPV4; 1846 memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr)); 1847 xt.xt_inp.inp_lport = ssk->lport; 1848 memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr)); 1849 xt.xt_inp.inp_fport = ssk->fport; 1850 xt.xt_tp.t_state = ssk->state; 1851 if (ssk->socket != NULL) 1852 sotoxsocket(ssk->socket, &xt.xt_socket); 1853 else 1854 bzero(&xt.xt_socket, sizeof xt.xt_socket); 1855 xt.xt_socket.xso_protocol = IPPROTO_TCP; 1856 SDP_RUNLOCK(ssk); 1857 error = SYSCTL_OUT(req, &xt, sizeof xt); 1858 if (error) 1859 break; 1860 i++; 1861 continue; 1862 next: 1863 SDP_RUNLOCK(ssk); 1864 } 1865 if (!error) { 1866 /* 1867 * Give the user an updated idea of our state. 1868 * If the generation differs from what we told 1869 * her before, she knows that something happened 1870 * while we were processing this request, and it 1871 * might be necessary to retry. 
1872 */ 1873 xig.xig_gen = 0; 1874 xig.xig_sogen = so_gencnt; 1875 xig.xig_count = sdp_count; 1876 error = SYSCTL_OUT(req, &xig, sizeof xig); 1877 } 1878 SDP_LIST_RUNLOCK(); 1879 return (error); 1880 } 1881 1882 static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP"); 1883 1884 SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist, 1885 CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb", 1886 "List of active SDP connections"); 1887 1888 static void 1889 sdp_zone_change(void *tag) 1890 { 1891 1892 uma_zone_set_max(sdp_zone, maxsockets); 1893 } 1894 1895 static void 1896 sdp_init(void) 1897 { 1898 1899 LIST_INIT(&sdp_list); 1900 sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock), 1901 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1902 uma_zone_set_max(sdp_zone, maxsockets); 1903 EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL, 1904 EVENTHANDLER_PRI_ANY); 1905 rx_comp_wq = create_singlethread_workqueue("rx_comp_wq"); 1906 ib_register_client(&sdp_client); 1907 } 1908 1909 extern struct domain sdpdomain; 1910 1911 struct pr_usrreqs sdp_usrreqs = { 1912 .pru_abort = sdp_abort, 1913 .pru_accept = sdp_accept, 1914 .pru_attach = sdp_attach, 1915 .pru_bind = sdp_bind, 1916 .pru_connect = sdp_connect, 1917 .pru_control = sdp_control, 1918 .pru_detach = sdp_detach, 1919 .pru_disconnect = sdp_disconnect, 1920 .pru_listen = sdp_listen, 1921 .pru_peeraddr = sdp_getpeeraddr, 1922 .pru_rcvoob = sdp_rcvoob, 1923 .pru_send = sdp_send, 1924 .pru_sosend = sdp_sosend, 1925 .pru_soreceive = sdp_sorecv, 1926 .pru_shutdown = sdp_shutdown, 1927 .pru_sockaddr = sdp_getsockaddr, 1928 .pru_close = sdp_close, 1929 }; 1930 1931 struct protosw sdpsw[] = { 1932 { 1933 .pr_type = SOCK_STREAM, 1934 .pr_domain = &sdpdomain, 1935 .pr_protocol = IPPROTO_IP, 1936 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, 1937 .pr_ctlinput = sdp_ctlinput, 1938 .pr_ctloutput = sdp_ctloutput, 1939 .pr_usrreqs = &sdp_usrreqs 1940 }, 1941 { 1942 .pr_type = SOCK_STREAM, 
1943 .pr_domain = &sdpdomain, 1944 .pr_protocol = IPPROTO_TCP, 1945 .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, 1946 .pr_ctlinput = sdp_ctlinput, 1947 .pr_ctloutput = sdp_ctloutput, 1948 .pr_usrreqs = &sdp_usrreqs 1949 }, 1950 }; 1951 1952 struct domain sdpdomain = { 1953 .dom_family = AF_INET_SDP, 1954 .dom_name = "SDP", 1955 .dom_init = sdp_init, 1956 .dom_protosw = sdpsw, 1957 .dom_protoswNPROTOSW = &sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])], 1958 }; 1959 1960 DOMAIN_SET(sdp); 1961 1962 int sdp_debug_level = 1; 1963 int sdp_data_debug_level = 0; 1964