1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 4 /* Fredy Neeser */ 5 /* Greg Joyce <greg@opengridcomputing.com> */ 6 /* Copyright (c) 2008-2019, IBM Corporation */ 7 /* Copyright (c) 2017, Open Grid Computing, Inc. */ 8 9 #include <linux/errno.h> 10 #include <linux/types.h> 11 #include <linux/net.h> 12 #include <linux/inetdevice.h> 13 #include <net/addrconf.h> 14 #include <linux/workqueue.h> 15 #include <net/sock.h> 16 #include <net/tcp.h> 17 #include <linux/inet.h> 18 #include <linux/tcp.h> 19 #include <trace/events/sock.h> 20 21 #include <rdma/iw_cm.h> 22 #include <rdma/ib_verbs.h> 23 #include <rdma/ib_user_verbs.h> 24 25 #include "siw.h" 26 #include "siw_cm.h" 27 28 /* 29 * Set to any combination of 30 * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR 31 */ 32 static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR; 33 static const bool relaxed_ird_negotiation = true; 34 35 static void siw_cm_llp_state_change(struct sock *s); 36 static void siw_cm_llp_data_ready(struct sock *s); 37 static void siw_cm_llp_write_space(struct sock *s); 38 static void siw_cm_llp_error_report(struct sock *s); 39 static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, 40 int status); 41 42 static void siw_sk_assign_cm_upcalls(struct sock *sk) 43 { 44 struct siw_cep *cep = sk_to_cep(sk); 45 46 write_lock_bh(&sk->sk_callback_lock); 47 cep->sk_state_change = sk->sk_state_change; 48 cep->sk_data_ready = sk->sk_data_ready; 49 cep->sk_write_space = sk->sk_write_space; 50 cep->sk_error_report = sk->sk_error_report; 51 52 sk->sk_state_change = siw_cm_llp_state_change; 53 sk->sk_data_ready = siw_cm_llp_data_ready; 54 sk->sk_write_space = siw_cm_llp_write_space; 55 sk->sk_error_report = siw_cm_llp_error_report; 56 write_unlock_bh(&sk->sk_callback_lock); 57 } 58 59 static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep) 60 { 61 sk->sk_state_change = 
cep->sk_state_change; 62 sk->sk_data_ready = cep->sk_data_ready; 63 sk->sk_write_space = cep->sk_write_space; 64 sk->sk_error_report = cep->sk_error_report; 65 sk->sk_user_data = NULL; 66 } 67 68 static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp) 69 { 70 struct socket *s = cep->sock; 71 struct sock *sk = s->sk; 72 73 write_lock_bh(&sk->sk_callback_lock); 74 75 qp->attrs.sk = s; 76 sk->sk_data_ready = siw_qp_llp_data_ready; 77 sk->sk_write_space = siw_qp_llp_write_space; 78 79 write_unlock_bh(&sk->sk_callback_lock); 80 } 81 82 static void siw_socket_disassoc(struct socket *s) 83 { 84 struct sock *sk = s->sk; 85 struct siw_cep *cep; 86 87 if (sk) { 88 write_lock_bh(&sk->sk_callback_lock); 89 cep = sk_to_cep(sk); 90 if (cep) { 91 siw_sk_restore_upcalls(sk, cep); 92 siw_cep_put(cep); 93 } else { 94 pr_warn("siw: cannot restore sk callbacks: no ep\n"); 95 } 96 write_unlock_bh(&sk->sk_callback_lock); 97 } else { 98 pr_warn("siw: cannot restore sk callbacks: no sk\n"); 99 } 100 } 101 102 static void siw_rtr_data_ready(struct sock *sk) 103 { 104 struct siw_cep *cep; 105 struct siw_qp *qp = NULL; 106 read_descriptor_t rd_desc; 107 108 trace_sk_data_ready(sk); 109 110 read_lock(&sk->sk_callback_lock); 111 112 cep = sk_to_cep(sk); 113 if (!cep) { 114 WARN(1, "No connection endpoint\n"); 115 goto out; 116 } 117 qp = sk_to_qp(sk); 118 119 memset(&rd_desc, 0, sizeof(rd_desc)); 120 rd_desc.arg.data = qp; 121 rd_desc.count = 1; 122 123 tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); 124 /* 125 * Check if first frame was successfully processed. 126 * Signal connection full establishment if yes. 127 * Failed data processing would have already scheduled 128 * connection drop. 
129 */ 130 if (!qp->rx_stream.rx_suspend) 131 siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); 132 out: 133 read_unlock(&sk->sk_callback_lock); 134 if (qp) 135 siw_qp_socket_assoc(cep, qp); 136 } 137 138 static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep) 139 { 140 struct sock *sk = cep->sock->sk; 141 142 write_lock_bh(&sk->sk_callback_lock); 143 sk->sk_data_ready = siw_rtr_data_ready; 144 sk->sk_write_space = siw_qp_llp_write_space; 145 write_unlock_bh(&sk->sk_callback_lock); 146 } 147 148 static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s) 149 { 150 cep->sock = s; 151 siw_cep_get(cep); 152 s->sk->sk_user_data = cep; 153 154 siw_sk_assign_cm_upcalls(s->sk); 155 } 156 157 static struct siw_cep *siw_cep_alloc(struct siw_device *sdev) 158 { 159 struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); 160 unsigned long flags; 161 162 if (!cep) 163 return NULL; 164 165 INIT_LIST_HEAD(&cep->listenq); 166 INIT_LIST_HEAD(&cep->devq); 167 INIT_LIST_HEAD(&cep->work_freelist); 168 169 kref_init(&cep->ref); 170 cep->state = SIW_EPSTATE_IDLE; 171 init_waitqueue_head(&cep->waitq); 172 spin_lock_init(&cep->lock); 173 cep->sdev = sdev; 174 cep->enhanced_rdma_conn_est = false; 175 176 spin_lock_irqsave(&sdev->lock, flags); 177 list_add_tail(&cep->devq, &sdev->cep_list); 178 spin_unlock_irqrestore(&sdev->lock, flags); 179 180 siw_dbg_cep(cep, "new endpoint\n"); 181 return cep; 182 } 183 184 static void siw_cm_free_work(struct siw_cep *cep) 185 { 186 struct list_head *w, *tmp; 187 struct siw_cm_work *work; 188 189 list_for_each_safe(w, tmp, &cep->work_freelist) { 190 work = list_entry(w, struct siw_cm_work, list); 191 list_del(&work->list); 192 kfree(work); 193 } 194 } 195 196 static void siw_cancel_mpatimer(struct siw_cep *cep) 197 { 198 spin_lock_bh(&cep->lock); 199 if (cep->mpa_timer) { 200 if (cancel_delayed_work(&cep->mpa_timer->work)) { 201 siw_cep_put(cep); 202 kfree(cep->mpa_timer); /* not needed again */ 203 } 204 cep->mpa_timer = NULL; 205 } 206 
spin_unlock_bh(&cep->lock); 207 } 208 209 static void siw_put_work(struct siw_cm_work *work) 210 { 211 INIT_LIST_HEAD(&work->list); 212 spin_lock_bh(&work->cep->lock); 213 list_add(&work->list, &work->cep->work_freelist); 214 spin_unlock_bh(&work->cep->lock); 215 } 216 217 static void siw_cep_set_inuse(struct siw_cep *cep) 218 { 219 unsigned long flags; 220 retry: 221 spin_lock_irqsave(&cep->lock, flags); 222 223 if (cep->in_use) { 224 spin_unlock_irqrestore(&cep->lock, flags); 225 wait_event_interruptible(cep->waitq, !cep->in_use); 226 if (signal_pending(current)) 227 flush_signals(current); 228 goto retry; 229 } else { 230 cep->in_use = 1; 231 spin_unlock_irqrestore(&cep->lock, flags); 232 } 233 } 234 235 static void siw_cep_set_free(struct siw_cep *cep) 236 { 237 unsigned long flags; 238 239 spin_lock_irqsave(&cep->lock, flags); 240 cep->in_use = 0; 241 spin_unlock_irqrestore(&cep->lock, flags); 242 243 wake_up(&cep->waitq); 244 } 245 246 static void __siw_cep_dealloc(struct kref *ref) 247 { 248 struct siw_cep *cep = container_of(ref, struct siw_cep, ref); 249 struct siw_device *sdev = cep->sdev; 250 unsigned long flags; 251 252 WARN_ON(cep->listen_cep); 253 254 /* kfree(NULL) is safe */ 255 kfree(cep->mpa.pdata); 256 spin_lock_bh(&cep->lock); 257 if (!list_empty(&cep->work_freelist)) 258 siw_cm_free_work(cep); 259 spin_unlock_bh(&cep->lock); 260 261 spin_lock_irqsave(&sdev->lock, flags); 262 list_del(&cep->devq); 263 spin_unlock_irqrestore(&sdev->lock, flags); 264 265 siw_dbg_cep(cep, "free endpoint\n"); 266 kfree(cep); 267 } 268 269 static struct siw_cm_work *siw_get_work(struct siw_cep *cep) 270 { 271 struct siw_cm_work *work = NULL; 272 273 spin_lock_bh(&cep->lock); 274 if (!list_empty(&cep->work_freelist)) { 275 work = list_entry(cep->work_freelist.next, struct siw_cm_work, 276 list); 277 list_del_init(&work->list); 278 } 279 spin_unlock_bh(&cep->lock); 280 return work; 281 } 282 283 static int siw_cm_alloc_work(struct siw_cep *cep, int num) 284 { 285 
struct siw_cm_work *work; 286 287 while (num--) { 288 work = kmalloc(sizeof(*work), GFP_KERNEL); 289 if (!work) { 290 if (!(list_empty(&cep->work_freelist))) 291 siw_cm_free_work(cep); 292 return -ENOMEM; 293 } 294 work->cep = cep; 295 INIT_LIST_HEAD(&work->list); 296 list_add(&work->list, &cep->work_freelist); 297 } 298 return 0; 299 } 300 301 /* 302 * siw_cm_upcall() 303 * 304 * Upcall to IWCM to inform about async connection events 305 */ 306 static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, 307 int status) 308 { 309 struct iw_cm_event event; 310 struct iw_cm_id *id; 311 312 memset(&event, 0, sizeof(event)); 313 event.status = status; 314 event.event = reason; 315 316 if (reason == IW_CM_EVENT_CONNECT_REQUEST) { 317 event.provider_data = cep; 318 id = cep->listen_cep->cm_id; 319 } else { 320 id = cep->cm_id; 321 } 322 /* Signal IRD and ORD */ 323 if (reason == IW_CM_EVENT_ESTABLISHED || 324 reason == IW_CM_EVENT_CONNECT_REPLY) { 325 /* Signal negotiated IRD/ORD values we will use */ 326 event.ird = cep->ird; 327 event.ord = cep->ord; 328 } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) { 329 event.ird = cep->ord; 330 event.ord = cep->ird; 331 } 332 /* Signal private data and address information */ 333 if (reason == IW_CM_EVENT_CONNECT_REQUEST || 334 reason == IW_CM_EVENT_CONNECT_REPLY) { 335 u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); 336 337 if (pd_len) { 338 /* 339 * hand over MPA private data 340 */ 341 event.private_data_len = pd_len; 342 event.private_data = cep->mpa.pdata; 343 344 /* Hide MPA V2 IRD/ORD control */ 345 if (cep->enhanced_rdma_conn_est) { 346 event.private_data_len -= 347 sizeof(struct mpa_v2_data); 348 event.private_data += 349 sizeof(struct mpa_v2_data); 350 } 351 } 352 getname_local(cep->sock, &event.local_addr); 353 getname_peer(cep->sock, &event.remote_addr); 354 } 355 siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n", 356 cep->qp ? 
qp_id(cep->qp) : UINT_MAX, reason, status); 357 358 return id->event_handler(id, &event); 359 } 360 361 static void siw_free_cm_id(struct siw_cep *cep) 362 { 363 if (!cep->cm_id) 364 return; 365 366 cep->cm_id->rem_ref(cep->cm_id); 367 cep->cm_id = NULL; 368 } 369 370 static void siw_destroy_cep_sock(struct siw_cep *cep) 371 { 372 if (cep->sock) { 373 siw_socket_disassoc(cep->sock); 374 sock_release(cep->sock); 375 cep->sock = NULL; 376 } 377 } 378 379 /* 380 * siw_qp_cm_drop() 381 * 382 * Drops established LLP connection if present and not already 383 * scheduled for dropping. Called from user context, SQ workqueue 384 * or receive IRQ. Caller signals if socket can be immediately 385 * closed (basically, if not in IRQ). 386 */ 387 void siw_qp_cm_drop(struct siw_qp *qp, int schedule) 388 { 389 struct siw_cep *cep = qp->cep; 390 391 qp->rx_stream.rx_suspend = 1; 392 qp->tx_ctx.tx_suspend = 1; 393 394 if (!qp->cep) 395 return; 396 397 if (schedule) { 398 siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP); 399 } else { 400 siw_cep_set_inuse(cep); 401 402 if (cep->state == SIW_EPSTATE_CLOSED) { 403 siw_dbg_cep(cep, "already closed\n"); 404 goto out; 405 } 406 siw_dbg_cep(cep, "immediate close, state %d\n", cep->state); 407 408 siw_send_terminate(qp); 409 410 if (cep->cm_id) { 411 switch (cep->state) { 412 case SIW_EPSTATE_AWAIT_MPAREP: 413 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 414 -EINVAL); 415 break; 416 417 case SIW_EPSTATE_RDMA_MODE: 418 siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); 419 break; 420 421 case SIW_EPSTATE_IDLE: 422 case SIW_EPSTATE_LISTENING: 423 case SIW_EPSTATE_CONNECTING: 424 case SIW_EPSTATE_AWAIT_MPAREQ: 425 case SIW_EPSTATE_RECVD_MPAREQ: 426 case SIW_EPSTATE_CLOSED: 427 default: 428 break; 429 } 430 siw_free_cm_id(cep); 431 siw_cep_put(cep); 432 } 433 cep->state = SIW_EPSTATE_CLOSED; 434 435 siw_destroy_cep_sock(cep); 436 if (cep->qp) { 437 cep->qp = NULL; 438 siw_qp_put(qp); 439 } 440 out: 441 siw_cep_set_free(cep); 442 } 443 } 444 445 void 
siw_cep_put(struct siw_cep *cep) 446 { 447 WARN_ON(kref_read(&cep->ref) < 1); 448 kref_put(&cep->ref, __siw_cep_dealloc); 449 } 450 451 static void siw_cep_set_free_and_put(struct siw_cep *cep) 452 { 453 siw_cep_set_free(cep); 454 siw_cep_put(cep); 455 } 456 457 void siw_cep_get(struct siw_cep *cep) 458 { 459 kref_get(&cep->ref); 460 } 461 462 /* 463 * Expects params->pd_len in host byte order 464 */ 465 static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len) 466 { 467 struct socket *s = cep->sock; 468 struct mpa_rr *rr = &cep->mpa.hdr; 469 struct kvec iov[3]; 470 struct msghdr msg; 471 int rv; 472 int iovec_num = 0; 473 int mpa_len; 474 475 memset(&msg, 0, sizeof(msg)); 476 477 iov[iovec_num].iov_base = rr; 478 iov[iovec_num].iov_len = sizeof(*rr); 479 mpa_len = sizeof(*rr); 480 481 if (cep->enhanced_rdma_conn_est) { 482 iovec_num++; 483 iov[iovec_num].iov_base = &cep->mpa.v2_ctrl; 484 iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl); 485 mpa_len += sizeof(cep->mpa.v2_ctrl); 486 } 487 if (pd_len) { 488 iovec_num++; 489 iov[iovec_num].iov_base = (char *)pdata; 490 iov[iovec_num].iov_len = pd_len; 491 mpa_len += pd_len; 492 } 493 if (cep->enhanced_rdma_conn_est) 494 pd_len += sizeof(cep->mpa.v2_ctrl); 495 496 rr->params.pd_len = cpu_to_be16(pd_len); 497 498 rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len); 499 500 return rv < 0 ? rv : 0; 501 } 502 503 /* 504 * Receive MPA Request/Reply header. 505 * 506 * Returns 0 if complete MPA Request/Reply header including 507 * eventual private data was received. Returns -EAGAIN if 508 * header was partially received or negative error code otherwise. 
509 * 510 * Context: May be called in process context only 511 */ 512 static int siw_recv_mpa_rr(struct siw_cep *cep) 513 { 514 struct mpa_rr *hdr = &cep->mpa.hdr; 515 struct socket *s = cep->sock; 516 u16 pd_len; 517 int rcvd, to_rcv; 518 519 if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { 520 rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd, 521 sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd, 522 0); 523 if (rcvd <= 0) 524 return -ECONNABORTED; 525 526 cep->mpa.bytes_rcvd += rcvd; 527 528 if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) 529 return -EAGAIN; 530 531 if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA) 532 return -EPROTO; 533 } 534 pd_len = be16_to_cpu(hdr->params.pd_len); 535 536 /* 537 * At least the MPA Request/Reply header (frame not including 538 * private data) has been received. 539 * Receive (or continue receiving) any private data. 540 */ 541 to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr)); 542 543 if (!to_rcv) { 544 /* 545 * We must have hdr->params.pd_len == 0 and thus received a 546 * complete MPA Request/Reply frame. 547 * Check against peer protocol violation. 548 */ 549 u32 word; 550 551 rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT); 552 if (rcvd == -EAGAIN) 553 return 0; 554 555 if (rcvd == 0) { 556 siw_dbg_cep(cep, "peer EOF\n"); 557 return -EPIPE; 558 } 559 if (rcvd < 0) { 560 siw_dbg_cep(cep, "error: %d\n", rcvd); 561 return rcvd; 562 } 563 siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd); 564 565 return -EPROTO; 566 } 567 568 /* 569 * At this point, we must have hdr->params.pd_len != 0. 570 * A private data buffer gets allocated if hdr->params.pd_len != 0. 
571 */ 572 if (!cep->mpa.pdata) { 573 cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); 574 if (!cep->mpa.pdata) 575 return -ENOMEM; 576 } 577 rcvd = ksock_recv( 578 s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), 579 to_rcv + 4, MSG_DONTWAIT); 580 581 if (rcvd < 0) 582 return rcvd; 583 584 if (rcvd > to_rcv) 585 return -EPROTO; 586 587 cep->mpa.bytes_rcvd += rcvd; 588 589 if (to_rcv == rcvd) { 590 siw_dbg_cep(cep, "%d bytes private data received\n", pd_len); 591 return 0; 592 } 593 return -EAGAIN; 594 } 595 596 /* 597 * siw_proc_mpareq() 598 * 599 * Read MPA Request from socket and signal new connection to IWCM 600 * if success. Caller must hold lock on corresponding listening CEP. 601 */ 602 static int siw_proc_mpareq(struct siw_cep *cep) 603 { 604 struct mpa_rr *req; 605 int version, rv; 606 u16 pd_len; 607 608 rv = siw_recv_mpa_rr(cep); 609 if (rv) 610 return rv; 611 612 req = &cep->mpa.hdr; 613 614 version = __mpa_rr_revision(req->params.bits); 615 pd_len = be16_to_cpu(req->params.pd_len); 616 617 if (version > MPA_REVISION_2) 618 /* allow for 0, 1, and 2 only */ 619 return -EPROTO; 620 621 if (memcmp(req->key, MPA_KEY_REQ, 16)) 622 return -EPROTO; 623 624 /* Prepare for sending MPA reply */ 625 memcpy(req->key, MPA_KEY_REP, 16); 626 627 if (version == MPA_REVISION_2 && 628 (req->params.bits & MPA_RR_FLAG_ENHANCED)) { 629 /* 630 * MPA version 2 must signal IRD/ORD values and P2P mode 631 * in private data if header flag MPA_RR_FLAG_ENHANCED 632 * is set. 633 */ 634 if (pd_len < sizeof(struct mpa_v2_data)) 635 goto reject_conn; 636 637 cep->enhanced_rdma_conn_est = true; 638 } 639 640 /* MPA Markers: currently not supported. Marker TX to be added. */ 641 if (req->params.bits & MPA_RR_FLAG_MARKERS) 642 goto reject_conn; 643 644 if (req->params.bits & MPA_RR_FLAG_CRC) { 645 /* 646 * RFC 5044, page 27: CRC MUST be used if peer requests it. 
647 * siw specific: 'mpa_crc_strict' parameter to reject 648 * connection with CRC if local CRC off enforced by 649 * 'mpa_crc_strict' module parameter. 650 */ 651 if (!mpa_crc_required && mpa_crc_strict) 652 goto reject_conn; 653 654 /* Enable CRC if requested by module parameter */ 655 if (mpa_crc_required) 656 req->params.bits |= MPA_RR_FLAG_CRC; 657 } 658 if (cep->enhanced_rdma_conn_est) { 659 struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata; 660 661 /* 662 * Peer requested ORD becomes requested local IRD, 663 * peer requested IRD becomes requested local ORD. 664 * IRD and ORD get limited by global maximum values. 665 */ 666 cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK; 667 cep->ord = min(cep->ord, SIW_MAX_ORD_QP); 668 cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK; 669 cep->ird = min(cep->ird, SIW_MAX_IRD_QP); 670 671 /* May get overwritten by locally negotiated values */ 672 cep->mpa.v2_ctrl.ird = htons(cep->ird); 673 cep->mpa.v2_ctrl.ord = htons(cep->ord); 674 675 /* 676 * Support for peer sent zero length Write or Read to 677 * let local side enter RTS. Writes are preferred. 678 * Sends would require pre-posting a Receive and are 679 * not supported. 680 * Propose zero length Write if none of Read and Write 681 * is indicated. 682 */ 683 if (v2->ird & MPA_V2_PEER_TO_PEER) { 684 cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; 685 686 if (v2->ord & MPA_V2_RDMA_WRITE_RTR) 687 cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; 688 else if (v2->ord & MPA_V2_RDMA_READ_RTR) 689 cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR; 690 else 691 cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; 692 } 693 } 694 695 cep->state = SIW_EPSTATE_RECVD_MPAREQ; 696 697 /* Keep reference until IWCM accepts/rejects */ 698 siw_cep_get(cep); 699 rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); 700 if (rv) 701 siw_cep_put(cep); 702 703 return rv; 704 705 reject_conn: 706 siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n", 707 req->params.bits & MPA_RR_FLAG_CRC ? 
1 : 0, 708 mpa_crc_required, mpa_crc_strict, 709 req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); 710 711 req->params.bits &= ~MPA_RR_FLAG_MARKERS; 712 req->params.bits |= MPA_RR_FLAG_REJECT; 713 714 if (!mpa_crc_required && mpa_crc_strict) 715 req->params.bits &= ~MPA_RR_FLAG_CRC; 716 717 if (pd_len) 718 kfree(cep->mpa.pdata); 719 720 cep->mpa.pdata = NULL; 721 722 siw_send_mpareqrep(cep, NULL, 0); 723 724 return -EOPNOTSUPP; 725 } 726 727 static int siw_proc_mpareply(struct siw_cep *cep) 728 { 729 struct siw_qp_attrs qp_attrs; 730 enum siw_qp_attr_mask qp_attr_mask; 731 struct siw_qp *qp = cep->qp; 732 struct mpa_rr *rep; 733 int rv; 734 u16 rep_ord; 735 u16 rep_ird; 736 bool ird_insufficient = false; 737 enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; 738 739 rv = siw_recv_mpa_rr(cep); 740 if (rv) 741 goto out_err; 742 743 siw_cancel_mpatimer(cep); 744 745 rep = &cep->mpa.hdr; 746 747 if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { 748 /* allow for 0, 1, and 2 only */ 749 rv = -EPROTO; 750 goto out_err; 751 } 752 if (memcmp(rep->key, MPA_KEY_REP, 16)) { 753 siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, 754 LLP_ECODE_INVALID_REQ_RESP, 0); 755 siw_send_terminate(qp); 756 rv = -EPROTO; 757 goto out_err; 758 } 759 if (rep->params.bits & MPA_RR_FLAG_REJECT) { 760 siw_dbg_cep(cep, "got mpa reject\n"); 761 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); 762 763 return -ECONNRESET; 764 } 765 if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) { 766 siw_dbg_cep(cep, "peer allows GSO on TX\n"); 767 qp->tx_ctx.gso_seg_limit = 0; 768 } 769 if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || 770 (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) || 771 (mpa_crc_strict && !mpa_crc_required && 772 (rep->params.bits & MPA_RR_FLAG_CRC))) { 773 siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n", 774 rep->params.bits & MPA_RR_FLAG_CRC ? 
1 : 0, 775 mpa_crc_required, mpa_crc_strict, 776 rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); 777 778 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); 779 780 return -EINVAL; 781 } 782 if (cep->enhanced_rdma_conn_est) { 783 struct mpa_v2_data *v2; 784 785 if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 || 786 !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) { 787 /* 788 * Protocol failure: The responder MUST reply with 789 * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED. 790 */ 791 siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n", 792 __mpa_rr_revision(rep->params.bits), 793 rep->params.bits & MPA_RR_FLAG_ENHANCED ? 794 1 : 795 0); 796 797 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 798 -ECONNRESET); 799 return -EINVAL; 800 } 801 v2 = (struct mpa_v2_data *)cep->mpa.pdata; 802 rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK; 803 rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK; 804 805 if (cep->ird < rep_ord && 806 (relaxed_ird_negotiation == false || 807 rep_ord > cep->sdev->attrs.max_ird)) { 808 siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n", 809 cep->ird, rep_ord, 810 cep->sdev->attrs.max_ord); 811 ird_insufficient = true; 812 } 813 if (cep->ord > rep_ird && relaxed_ird_negotiation == false) { 814 siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord, 815 rep_ird); 816 ird_insufficient = true; 817 } 818 /* 819 * Always report negotiated peer values to user, 820 * even if IRD/ORD negotiation failed 821 */ 822 cep->ird = rep_ord; 823 cep->ord = rep_ird; 824 825 if (ird_insufficient) { 826 /* 827 * If the initiator IRD is insuffient for the 828 * responder ORD, send a TERM. 
829 */ 830 siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, 831 LLP_ETYPE_MPA, 832 LLP_ECODE_INSUFFICIENT_IRD, 0); 833 siw_send_terminate(qp); 834 rv = -ENOMEM; 835 goto out_err; 836 } 837 if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER) 838 mpa_p2p_mode = 839 cep->mpa.v2_ctrl_req.ord & 840 (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); 841 842 /* 843 * Check if we requested P2P mode, and if peer agrees 844 */ 845 if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { 846 if ((mpa_p2p_mode & v2->ord) == 0) { 847 /* 848 * We requested RTR mode(s), but the peer 849 * did not pick any mode we support. 850 */ 851 siw_dbg_cep(cep, 852 "rtr mode: req %2x, got %2x\n", 853 mpa_p2p_mode, 854 v2->ord & (MPA_V2_RDMA_WRITE_RTR | 855 MPA_V2_RDMA_READ_RTR)); 856 857 siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, 858 LLP_ETYPE_MPA, 859 LLP_ECODE_NO_MATCHING_RTR, 860 0); 861 siw_send_terminate(qp); 862 rv = -EPROTO; 863 goto out_err; 864 } 865 mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR | 866 MPA_V2_RDMA_READ_RTR); 867 } 868 } 869 memset(&qp_attrs, 0, sizeof(qp_attrs)); 870 871 if (rep->params.bits & MPA_RR_FLAG_CRC) 872 qp_attrs.flags = SIW_MPA_CRC; 873 874 qp_attrs.irq_size = cep->ird; 875 qp_attrs.orq_size = cep->ord; 876 qp_attrs.sk = cep->sock; 877 qp_attrs.state = SIW_QP_STATE_RTS; 878 879 qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | 880 SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA; 881 882 /* Move socket RX/TX under QP control */ 883 down_write(&qp->state_lock); 884 if (qp->attrs.state > SIW_QP_STATE_RTR) { 885 rv = -EINVAL; 886 up_write(&qp->state_lock); 887 goto out_err; 888 } 889 rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask); 890 891 siw_qp_socket_assoc(cep, qp); 892 893 up_write(&qp->state_lock); 894 895 /* Send extra RDMA frame to trigger peer RTS if negotiated */ 896 if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { 897 rv = siw_qp_mpa_rts(qp, mpa_p2p_mode); 898 if (rv) 899 goto out_err; 900 } 901 if (!rv) { 902 rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 
0); 903 if (!rv) 904 cep->state = SIW_EPSTATE_RDMA_MODE; 905 906 return 0; 907 } 908 909 out_err: 910 if (rv != -EAGAIN) 911 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); 912 913 return rv; 914 } 915 916 /* 917 * siw_accept_newconn - accept an incoming pending connection 918 * 919 */ 920 static void siw_accept_newconn(struct siw_cep *cep) 921 { 922 struct socket *s = cep->sock; 923 struct socket *new_s = NULL; 924 struct siw_cep *new_cep = NULL; 925 int rv = 0; /* debug only. should disappear */ 926 927 if (cep->state != SIW_EPSTATE_LISTENING) 928 goto error; 929 930 new_cep = siw_cep_alloc(cep->sdev); 931 if (!new_cep) 932 goto error; 933 934 /* 935 * 4: Allocate a sufficient number of work elements 936 * to allow concurrent handling of local + peer close 937 * events, MPA header processing + MPA timeout. 938 */ 939 if (siw_cm_alloc_work(new_cep, 4) != 0) 940 goto error; 941 942 /* 943 * Copy saved socket callbacks from listening CEP 944 * and assign new socket with new CEP 945 */ 946 new_cep->sk_state_change = cep->sk_state_change; 947 new_cep->sk_data_ready = cep->sk_data_ready; 948 new_cep->sk_write_space = cep->sk_write_space; 949 new_cep->sk_error_report = cep->sk_error_report; 950 951 rv = kernel_accept(s, &new_s, O_NONBLOCK); 952 if (rv != 0) { 953 /* 954 * Connection already aborted by peer..? 955 */ 956 siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv); 957 goto error; 958 } 959 new_cep->sock = new_s; 960 siw_cep_get(new_cep); 961 new_s->sk->sk_user_data = new_cep; 962 963 if (siw_tcp_nagle == false) 964 tcp_sock_set_nodelay(new_s->sk); 965 new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; 966 967 rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); 968 if (rv) 969 goto error; 970 /* 971 * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep. 
972 */ 973 new_cep->listen_cep = cep; 974 siw_cep_get(cep); 975 976 if (atomic_read(&new_s->sk->sk_rmem_alloc)) { 977 /* 978 * MPA REQ already queued 979 */ 980 siw_dbg_cep(cep, "immediate mpa request\n"); 981 982 siw_cep_set_inuse(new_cep); 983 rv = siw_proc_mpareq(new_cep); 984 if (rv != -EAGAIN) { 985 siw_cep_put(cep); 986 new_cep->listen_cep = NULL; 987 if (rv) { 988 siw_cancel_mpatimer(new_cep); 989 siw_cep_set_free(new_cep); 990 goto error; 991 } 992 } 993 siw_cep_set_free(new_cep); 994 } 995 return; 996 997 error: 998 if (new_cep) 999 siw_cep_put(new_cep); 1000 1001 if (new_s) { 1002 siw_socket_disassoc(new_s); 1003 sock_release(new_s); 1004 new_cep->sock = NULL; 1005 } 1006 siw_dbg_cep(cep, "error %d\n", rv); 1007 } 1008 1009 static void siw_cm_work_handler(struct work_struct *w) 1010 { 1011 struct siw_cm_work *work; 1012 struct siw_cep *cep; 1013 int release_cep = 0, rv = 0; 1014 1015 work = container_of(w, struct siw_cm_work, work.work); 1016 cep = work->cep; 1017 1018 siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n", 1019 cep->qp ? qp_id(cep->qp) : UINT_MAX, 1020 work->type, cep->state); 1021 1022 siw_cep_set_inuse(cep); 1023 1024 switch (work->type) { 1025 case SIW_CM_WORK_ACCEPT: 1026 siw_accept_newconn(cep); 1027 break; 1028 1029 case SIW_CM_WORK_READ_MPAHDR: 1030 if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { 1031 if (cep->listen_cep) { 1032 siw_cep_set_inuse(cep->listen_cep); 1033 1034 if (cep->listen_cep->state == 1035 SIW_EPSTATE_LISTENING) 1036 rv = siw_proc_mpareq(cep); 1037 else 1038 rv = -EFAULT; 1039 1040 siw_cep_set_free(cep->listen_cep); 1041 1042 if (rv != -EAGAIN) { 1043 siw_cep_put(cep->listen_cep); 1044 cep->listen_cep = NULL; 1045 if (rv) 1046 siw_cep_put(cep); 1047 } 1048 } 1049 } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { 1050 rv = siw_proc_mpareply(cep); 1051 } else { 1052 /* 1053 * CEP already moved out of MPA handshake. 1054 * any connection management already done. 1055 * silently ignore the mpa packet. 
1056 */ 1057 if (cep->state == SIW_EPSTATE_RDMA_MODE) { 1058 cep->sock->sk->sk_data_ready(cep->sock->sk); 1059 siw_dbg_cep(cep, "already in RDMA mode"); 1060 } else { 1061 siw_dbg_cep(cep, "out of state: %d\n", 1062 cep->state); 1063 } 1064 } 1065 if (rv && rv != -EAGAIN) 1066 release_cep = 1; 1067 break; 1068 1069 case SIW_CM_WORK_CLOSE_LLP: 1070 /* 1071 * QP scheduled LLP close 1072 */ 1073 if (cep->qp) 1074 siw_send_terminate(cep->qp); 1075 1076 if (cep->cm_id) 1077 siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); 1078 1079 release_cep = 1; 1080 break; 1081 1082 case SIW_CM_WORK_PEER_CLOSE: 1083 if (cep->cm_id) { 1084 if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { 1085 /* 1086 * MPA reply not received, but connection drop 1087 */ 1088 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 1089 -ECONNRESET); 1090 } else if (cep->state == SIW_EPSTATE_RDMA_MODE) { 1091 /* 1092 * NOTE: IW_CM_EVENT_DISCONNECT is given just 1093 * to transition IWCM into CLOSING. 1094 */ 1095 siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); 1096 siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); 1097 } 1098 /* 1099 * for other states there is no connection 1100 * known to the IWCM. 1101 */ 1102 } else { 1103 if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) { 1104 /* 1105 * Wait for the ulp/CM to call accept/reject 1106 */ 1107 siw_dbg_cep(cep, 1108 "mpa req recvd, wait for ULP\n"); 1109 } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { 1110 /* 1111 * Socket close before MPA request received. 
1112 */ 1113 if (cep->listen_cep) { 1114 siw_dbg_cep(cep, 1115 "no mpareq: drop listener\n"); 1116 siw_cep_put(cep->listen_cep); 1117 cep->listen_cep = NULL; 1118 } 1119 } 1120 } 1121 release_cep = 1; 1122 break; 1123 1124 case SIW_CM_WORK_MPATIMEOUT: 1125 cep->mpa_timer = NULL; 1126 1127 if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { 1128 /* 1129 * MPA request timed out: 1130 * Hide any partially received private data and signal 1131 * timeout 1132 */ 1133 cep->mpa.hdr.params.pd_len = 0; 1134 1135 if (cep->cm_id) 1136 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 1137 -ETIMEDOUT); 1138 release_cep = 1; 1139 1140 } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { 1141 /* 1142 * No MPA request received after peer TCP stream setup. 1143 */ 1144 if (cep->listen_cep) { 1145 siw_cep_put(cep->listen_cep); 1146 cep->listen_cep = NULL; 1147 } 1148 release_cep = 1; 1149 } 1150 break; 1151 1152 default: 1153 WARN(1, "Undefined CM work type: %d\n", work->type); 1154 } 1155 if (release_cep) { 1156 siw_dbg_cep(cep, 1157 "release: timer=%s, QP[%u]\n", 1158 cep->mpa_timer ? "y" : "n", 1159 cep->qp ? 
qp_id(cep->qp) : UINT_MAX); 1160 1161 siw_cancel_mpatimer(cep); 1162 1163 cep->state = SIW_EPSTATE_CLOSED; 1164 1165 if (cep->qp) { 1166 struct siw_qp *qp = cep->qp; 1167 /* 1168 * Serialize a potential race with application 1169 * closing the QP and calling siw_qp_cm_drop() 1170 */ 1171 siw_qp_get(qp); 1172 siw_cep_set_free(cep); 1173 1174 siw_qp_llp_close(qp); 1175 siw_qp_put(qp); 1176 1177 siw_cep_set_inuse(cep); 1178 cep->qp = NULL; 1179 siw_qp_put(qp); 1180 } 1181 if (cep->sock) { 1182 siw_socket_disassoc(cep->sock); 1183 sock_release(cep->sock); 1184 cep->sock = NULL; 1185 } 1186 if (cep->cm_id) { 1187 siw_free_cm_id(cep); 1188 siw_cep_put(cep); 1189 } 1190 } 1191 siw_cep_set_free(cep); 1192 siw_put_work(work); 1193 siw_cep_put(cep); 1194 } 1195 1196 static struct workqueue_struct *siw_cm_wq; 1197 1198 int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type) 1199 { 1200 struct siw_cm_work *work = siw_get_work(cep); 1201 unsigned long delay = 0; 1202 1203 if (!work) { 1204 siw_dbg_cep(cep, "failed with no work available\n"); 1205 return -ENOMEM; 1206 } 1207 work->type = type; 1208 work->cep = cep; 1209 1210 siw_cep_get(cep); 1211 1212 INIT_DELAYED_WORK(&work->work, siw_cm_work_handler); 1213 1214 if (type == SIW_CM_WORK_MPATIMEOUT) { 1215 cep->mpa_timer = work; 1216 1217 if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) 1218 delay = MPAREQ_TIMEOUT; 1219 else 1220 delay = MPAREP_TIMEOUT; 1221 } 1222 siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n", 1223 cep->qp ? 
qp_id(cep->qp) : -1, type, delay); 1224 1225 queue_delayed_work(siw_cm_wq, &work->work, delay); 1226 1227 return 0; 1228 } 1229 1230 static void siw_cm_llp_data_ready(struct sock *sk) 1231 { 1232 struct siw_cep *cep; 1233 1234 trace_sk_data_ready(sk); 1235 1236 read_lock(&sk->sk_callback_lock); 1237 1238 cep = sk_to_cep(sk); 1239 if (!cep) 1240 goto out; 1241 1242 siw_dbg_cep(cep, "cep state: %d, socket state %d\n", 1243 cep->state, sk->sk_state); 1244 1245 if (sk->sk_state != TCP_ESTABLISHED) 1246 goto out; 1247 1248 switch (cep->state) { 1249 case SIW_EPSTATE_RDMA_MODE: 1250 case SIW_EPSTATE_LISTENING: 1251 break; 1252 1253 case SIW_EPSTATE_AWAIT_MPAREQ: 1254 case SIW_EPSTATE_AWAIT_MPAREP: 1255 siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR); 1256 break; 1257 1258 default: 1259 siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state); 1260 break; 1261 } 1262 out: 1263 read_unlock(&sk->sk_callback_lock); 1264 } 1265 1266 static void siw_cm_llp_write_space(struct sock *sk) 1267 { 1268 struct siw_cep *cep = sk_to_cep(sk); 1269 1270 if (cep) 1271 siw_dbg_cep(cep, "state: %d\n", cep->state); 1272 } 1273 1274 static void siw_cm_llp_error_report(struct sock *sk) 1275 { 1276 struct siw_cep *cep = sk_to_cep(sk); 1277 1278 if (cep) { 1279 siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n", 1280 sk->sk_err, sk->sk_state, cep->state); 1281 cep->sk_error_report(sk); 1282 } 1283 } 1284 1285 static void siw_cm_llp_state_change(struct sock *sk) 1286 { 1287 struct siw_cep *cep; 1288 void (*orig_state_change)(struct sock *s); 1289 1290 read_lock(&sk->sk_callback_lock); 1291 1292 cep = sk_to_cep(sk); 1293 if (!cep) { 1294 /* endpoint already disassociated */ 1295 read_unlock(&sk->sk_callback_lock); 1296 return; 1297 } 1298 orig_state_change = cep->sk_state_change; 1299 1300 siw_dbg_cep(cep, "state: %d\n", cep->state); 1301 1302 switch (sk->sk_state) { 1303 case TCP_ESTABLISHED: 1304 /* 1305 * handle accepting socket as special case where only 1306 * new connection is 
possible 1307 */ 1308 siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT); 1309 break; 1310 1311 case TCP_CLOSE: 1312 case TCP_CLOSE_WAIT: 1313 if (cep->qp) 1314 cep->qp->tx_ctx.tx_suspend = 1; 1315 siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE); 1316 break; 1317 1318 default: 1319 siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state); 1320 } 1321 read_unlock(&sk->sk_callback_lock); 1322 orig_state_change(sk); 1323 } 1324 1325 static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, 1326 struct sockaddr *raddr, bool afonly) 1327 { 1328 int rv, flags = 0; 1329 size_t size = laddr->sa_family == AF_INET ? 1330 sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); 1331 1332 /* 1333 * Make address available again asap. 1334 */ 1335 sock_set_reuseaddr(s->sk); 1336 1337 if (afonly) { 1338 rv = ip6_sock_set_v6only(s->sk); 1339 if (rv) 1340 return rv; 1341 } 1342 1343 rv = s->ops->bind(s, laddr, size); 1344 if (rv < 0) 1345 return rv; 1346 1347 rv = s->ops->connect(s, raddr, size, flags); 1348 1349 return rv < 0 ? rv : 0; 1350 } 1351 1352 int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) 1353 { 1354 struct siw_device *sdev = to_siw_dev(id->device); 1355 struct siw_qp *qp; 1356 struct siw_cep *cep = NULL; 1357 struct socket *s = NULL; 1358 struct sockaddr *laddr = (struct sockaddr *)&id->local_addr, 1359 *raddr = (struct sockaddr *)&id->remote_addr; 1360 bool p2p_mode = peer_to_peer, v4 = true; 1361 u16 pd_len = params->private_data_len; 1362 int version = mpa_version, rv; 1363 1364 if (pd_len > MPA_MAX_PRIVDATA) 1365 return -EINVAL; 1366 1367 if (params->ird > sdev->attrs.max_ird || 1368 params->ord > sdev->attrs.max_ord) 1369 return -ENOMEM; 1370 1371 if (laddr->sa_family == AF_INET6) 1372 v4 = false; 1373 else if (laddr->sa_family != AF_INET) 1374 return -EAFNOSUPPORT; 1375 1376 /* 1377 * Respect any iwarp port mapping: Use mapped remote address 1378 * if valid. 
/*
 * siw_connect - Active side: connect at TCP level and start the MPA
 *               handshake for IWCM id @id
 *
 * @id:		Connection management id, provides local and remote address
 * @params:	Connection parameters: QP number, IRD/ORD and private data
 *
 * Creates a TCP socket, binds and connects it, allocates a connection
 * endpoint (CEP) and links it with the QP, the IWCM id and the socket.
 * Then the MPA request is sent and the MPA timeout work is queued. On
 * success the endpoint is left in state SIW_EPSTATE_AWAIT_MPAREP.
 *
 * Returns 0 if the MPA request was sent and the timeout work is queued.
 * Returns -EINVAL for too much private data or an unknown QP, -ENOMEM if
 * IRD/ORD exceed the device limits or an allocation fails, -EAFNOSUPPORT
 * if the local address is neither AF_INET nor AF_INET6, or the error
 * reported by sock_create(), kernel_bindconnect(), siw_send_mpareqrep()
 * or siw_cm_queue_work().
 */
int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
{
	struct siw_device *sdev = to_siw_dev(id->device);
	struct siw_qp *qp;
	struct siw_cep *cep = NULL;
	struct socket *s = NULL;
	struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
			*raddr = (struct sockaddr *)&id->remote_addr;
	bool p2p_mode = peer_to_peer, v4 = true;
	u16 pd_len = params->private_data_len;
	int version = mpa_version, rv;

	/* Parameter checks: nothing is allocated yet, so return directly */
	if (pd_len > MPA_MAX_PRIVDATA)
		return -EINVAL;

	if (params->ird > sdev->attrs.max_ird ||
	    params->ord > sdev->attrs.max_ord)
		return -ENOMEM;

	if (laddr->sa_family == AF_INET6)
		v4 = false;
	else if (laddr->sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * Respect any iwarp port mapping: Use mapped remote address
	 * if valid. Local address must not be mapped, since siw
	 * uses kernel TCP stack.
	 */
	if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
	     to_sockaddr_in6(id->remote_addr).sin6_port != 0)
		raddr = (struct sockaddr *)&id->m_remote_addr;

	qp = siw_qp_id2obj(sdev, params->qpn);
	if (!qp) {
		WARN(1, "[QP %u] does not exist\n", params->qpn);
		rv = -EINVAL;
		goto error;
	}
	siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr,
		   raddr);

	rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
	if (rv < 0)
		goto error;

	/*
	 * NOTE: For simplification, connect() is called in blocking
	 * mode. Might be reconsidered for async connection setup at
	 * TCP level.
	 */
	rv = kernel_bindconnect(s, laddr, raddr, id->afonly);
	if (rv != 0) {
		siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
		goto error;
	}
	if (siw_tcp_nagle == false)
		tcp_sock_set_nodelay(s->sk);
	cep = siw_cep_alloc(sdev);
	if (!cep) {
		rv = -ENOMEM;
		goto error;
	}
	siw_cep_set_inuse(cep);

	/* Associate QP with CEP */
	siw_cep_get(cep);
	qp->cep = cep;

	/* siw_qp_get(qp) already done by QP lookup */
	cep->qp = qp;

	/* The CEP holds a reference on the IWCM id while cep->cm_id is set */
	id->add_ref(id);
	cep->cm_id = id;

	/*
	 * 4: Allocate a sufficient number of work elements
	 * to allow concurrent handling of local + peer close
	 * events, MPA header processing + MPA timeout.
	 */
	rv = siw_cm_alloc_work(cep, 4);
	if (rv != 0) {
		rv = -ENOMEM;
		goto error;
	}
	cep->ird = params->ird;
	cep->ord = params->ord;

	/* In peer-to-peer mode an ORD of zero is raised to one */
	if (p2p_mode && cep->ord == 0)
		cep->ord = 1;

	cep->state = SIW_EPSTATE_CONNECTING;

	/*
	 * Associate CEP with socket
	 */
	siw_cep_socket_assoc(cep, s);

	cep->state = SIW_EPSTATE_AWAIT_MPAREP;

	/*
	 * Set MPA Request bits: CRC if required, no MPA Markers,
	 * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
	 */
	cep->mpa.hdr.params.bits = 0;
	if (version > MPA_REVISION_2) {
		pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
		version = MPA_REVISION_2;
		/* Adjust also module parameter */
		mpa_version = MPA_REVISION_2;
	}
	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);

	if (try_gso)
		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;

	if (mpa_crc_required)
		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;

	/*
	 * If MPA version == 2:
	 * o Include ORD and IRD.
	 * o Indicate peer-to-peer mode, if required by module
	 *   parameter 'peer_to_peer'.
	 */
	if (version == MPA_REVISION_2) {
		cep->enhanced_rdma_conn_est = true;
		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;

		cep->mpa.v2_ctrl.ird = htons(cep->ird);
		cep->mpa.v2_ctrl.ord = htons(cep->ord);

		if (p2p_mode) {
			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
			cep->mpa.v2_ctrl.ord |= rtr_type;
		}
		/* Remember own P2P mode requested */
		cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
		cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
	}
	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);

	rv = siw_send_mpareqrep(cep, params->private_data, pd_len);
	/*
	 * Reset private data.
	 */
	cep->mpa.hdr.params.pd_len = 0;

	if (rv >= 0) {
		rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT);
		if (!rv) {
			siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp));
			siw_cep_set_free(cep);
			return 0;
		}
	}
error:
	siw_dbg(id->device, "failed: %d\n", rv);

	if (cep) {
		/*
		 * siw_socket_disassoc() restores the socket callbacks and
		 * puts the CEP only if the socket is associated with a CEP;
		 * otherwise it just prints a warning.
		 */
		siw_socket_disassoc(s);
		sock_release(s);
		cep->sock = NULL;

		cep->qp = NULL;

		/* Undo id->add_ref() done when cep->cm_id was set */
		cep->cm_id = NULL;
		id->rem_ref(id);

		/* Undo siw_cep_get() done when qp->cep was set */
		qp->cep = NULL;
		siw_cep_put(cep);

		cep->state = SIW_EPSTATE_CLOSED;

		siw_cep_set_free_and_put(cep);

	} else if (s) {
		/* Socket exists, but no CEP was allocated for it */
		sock_release(s);
	}
	/* Drop the QP reference obtained by the QP lookup */
	if (qp)
		siw_qp_put(qp);

	return rv;
}
1550 */ 1551 int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) 1552 { 1553 struct siw_device *sdev = to_siw_dev(id->device); 1554 struct siw_cep *cep = (struct siw_cep *)id->provider_data; 1555 struct siw_qp *qp; 1556 struct siw_qp_attrs qp_attrs; 1557 int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA; 1558 bool wait_for_peer_rts = false; 1559 1560 siw_cep_set_inuse(cep); 1561 siw_cep_put(cep); 1562 1563 /* Free lingering inbound private data */ 1564 if (cep->mpa.hdr.params.pd_len) { 1565 cep->mpa.hdr.params.pd_len = 0; 1566 kfree(cep->mpa.pdata); 1567 cep->mpa.pdata = NULL; 1568 } 1569 siw_cancel_mpatimer(cep); 1570 1571 if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { 1572 siw_dbg_cep(cep, "out of state\n"); 1573 rv = -ECONNRESET; 1574 goto free_cep; 1575 } 1576 qp = siw_qp_id2obj(sdev, params->qpn); 1577 if (!qp) { 1578 WARN(1, "[QP %d] does not exist\n", params->qpn); 1579 goto free_cep; 1580 } 1581 down_write(&qp->state_lock); 1582 if (qp->attrs.state > SIW_QP_STATE_RTR) 1583 goto error_unlock; 1584 siw_dbg_cep(cep, "[QP %d]\n", params->qpn); 1585 1586 if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) { 1587 siw_dbg_cep(cep, "peer allows GSO on TX\n"); 1588 qp->tx_ctx.gso_seg_limit = 0; 1589 } 1590 if (params->ord > sdev->attrs.max_ord || 1591 params->ird > sdev->attrs.max_ird) { 1592 siw_dbg_cep( 1593 cep, 1594 "[QP %u]: ord %d (max %d), ird %d (max %d)\n", 1595 qp_id(qp), params->ord, sdev->attrs.max_ord, 1596 params->ird, sdev->attrs.max_ird); 1597 goto error_unlock; 1598 } 1599 if (cep->enhanced_rdma_conn_est) 1600 max_priv_data -= sizeof(struct mpa_v2_data); 1601 1602 if (params->private_data_len > max_priv_data) { 1603 siw_dbg_cep( 1604 cep, 1605 "[QP %u]: private data length: %d (max %d)\n", 1606 qp_id(qp), params->private_data_len, max_priv_data); 1607 goto error_unlock; 1608 } 1609 if (cep->enhanced_rdma_conn_est) { 1610 if (params->ord > cep->ord) { 1611 if (relaxed_ird_negotiation) { 1612 params->ord = cep->ord; 1613 } 
else { 1614 cep->ird = params->ird; 1615 cep->ord = params->ord; 1616 goto error_unlock; 1617 } 1618 } 1619 if (params->ird < cep->ird) { 1620 if (relaxed_ird_negotiation && 1621 cep->ird <= sdev->attrs.max_ird) 1622 params->ird = cep->ird; 1623 else { 1624 rv = -ENOMEM; 1625 goto error_unlock; 1626 } 1627 } 1628 if (cep->mpa.v2_ctrl.ord & 1629 (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)) 1630 wait_for_peer_rts = true; 1631 /* 1632 * Signal back negotiated IRD and ORD values 1633 */ 1634 cep->mpa.v2_ctrl.ord = 1635 htons(params->ord & MPA_IRD_ORD_MASK) | 1636 (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD); 1637 cep->mpa.v2_ctrl.ird = 1638 htons(params->ird & MPA_IRD_ORD_MASK) | 1639 (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD); 1640 } 1641 cep->ird = params->ird; 1642 cep->ord = params->ord; 1643 1644 cep->cm_id = id; 1645 id->add_ref(id); 1646 1647 memset(&qp_attrs, 0, sizeof(qp_attrs)); 1648 qp_attrs.orq_size = cep->ord; 1649 qp_attrs.irq_size = cep->ird; 1650 qp_attrs.sk = cep->sock; 1651 if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC) 1652 qp_attrs.flags = SIW_MPA_CRC; 1653 qp_attrs.state = SIW_QP_STATE_RTS; 1654 1655 siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp)); 1656 1657 /* Associate QP with CEP */ 1658 siw_cep_get(cep); 1659 qp->cep = cep; 1660 1661 /* siw_qp_get(qp) already done by QP lookup */ 1662 cep->qp = qp; 1663 1664 cep->state = SIW_EPSTATE_RDMA_MODE; 1665 1666 /* Move socket RX/TX under QP control */ 1667 rv = siw_qp_modify(qp, &qp_attrs, 1668 SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | 1669 SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | 1670 SIW_QP_ATTR_MPA); 1671 up_write(&qp->state_lock); 1672 if (rv) 1673 goto error; 1674 1675 siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n", 1676 qp_id(qp), params->private_data_len); 1677 1678 rv = siw_send_mpareqrep(cep, params->private_data, 1679 params->private_data_len); 1680 if (rv != 0) 1681 goto error; 1682 1683 if (wait_for_peer_rts) { 1684 siw_sk_assign_rtr_upcalls(cep); 1685 } else { 
1686 siw_qp_socket_assoc(cep, qp); 1687 rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); 1688 if (rv) 1689 goto error; 1690 } 1691 siw_cep_set_free(cep); 1692 1693 return 0; 1694 1695 error_unlock: 1696 up_write(&qp->state_lock); 1697 error: 1698 siw_destroy_cep_sock(cep); 1699 1700 cep->state = SIW_EPSTATE_CLOSED; 1701 1702 siw_free_cm_id(cep); 1703 if (qp->cep) { 1704 siw_cep_put(cep); 1705 qp->cep = NULL; 1706 } 1707 cep->qp = NULL; 1708 siw_qp_put(qp); 1709 free_cep: 1710 siw_cep_set_free_and_put(cep); 1711 return rv; 1712 } 1713 1714 /* 1715 * siw_reject() 1716 * 1717 * Local connection reject case. Send private data back to peer, 1718 * close connection and dereference connection id. 1719 */ 1720 int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) 1721 { 1722 struct siw_cep *cep = (struct siw_cep *)id->provider_data; 1723 1724 siw_cep_set_inuse(cep); 1725 siw_cep_put(cep); 1726 1727 siw_cancel_mpatimer(cep); 1728 1729 if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { 1730 siw_dbg_cep(cep, "out of state\n"); 1731 1732 siw_cep_set_free_and_put(cep); /* put last reference */ 1733 1734 return -ECONNRESET; 1735 } 1736 siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state, 1737 pd_len); 1738 1739 if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { 1740 cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ 1741 siw_send_mpareqrep(cep, pdata, pd_len); 1742 } 1743 siw_destroy_cep_sock(cep); 1744 1745 cep->state = SIW_EPSTATE_CLOSED; 1746 1747 siw_cep_set_free_and_put(cep); 1748 1749 return 0; 1750 } 1751 1752 /* 1753 * siw_create_listen - Create resources for a listener's IWCM ID @id 1754 * 1755 * Starts listen on the socket address id->local_addr. 
/*
 * siw_create_listen - Create resources for a listener's IWCM ID @id
 *
 * @id:		Listener's IWCM id, provides the local address to listen on
 * @backlog:	Passed to the socket's listen operation; also the number
 *		of CM work elements allocated for the listening endpoint
 *
 * Starts listen on the socket address id->local_addr. For a wildcard
 * address, the socket is bound to the network device of the siw device.
 * The new listening endpoint is appended to the list of listening
 * endpoints kept at id->provider_data.
 *
 * Returns 0 on success. Returns -EAFNOSUPPORT for an address family
 * other than AF_INET or AF_INET6, -ENODEV if a wildcard address is
 * given but no network device is found, -ENOMEM if an allocation fails,
 * or the error of socket creation, bind, work allocation or listen.
 */
int siw_create_listen(struct iw_cm_id *id, int backlog)
{
	struct socket *s;
	struct siw_cep *cep = NULL;
	struct net_device *ndev = NULL;
	struct siw_device *sdev = to_siw_dev(id->device);
	int addr_family = id->local_addr.ss_family;
	int rv = 0;

	if (addr_family != AF_INET && addr_family != AF_INET6)
		return -EAFNOSUPPORT;

	rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
	if (rv < 0)
		return rv;

	/*
	 * Allow binding local port when still in TIME_WAIT from last close.
	 */
	sock_set_reuseaddr(s->sk);

	if (addr_family == AF_INET) {
		struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);

		/* For wildcard addr, limit binding to current device only */
		if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) {
			ndev = ib_device_get_netdev(id->device, SIW_PORT);
			if (ndev) {
				s->sk->sk_bound_dev_if = ndev->ifindex;
			} else {
				rv = -ENODEV;
				goto error;
			}
		}
		rv = s->ops->bind(s, (struct sockaddr *)laddr,
				  sizeof(struct sockaddr_in));
	} else {
		struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr);

		if (id->afonly) {
			rv = ip6_sock_set_v6only(s->sk);
			if (rv) {
				siw_dbg(id->device,
					"ip6_sock_set_v6only erro: %d\n", rv);
				goto error;
			}
		}

		/* For wildcard addr, limit binding to current device only */
		if (ipv6_addr_any(&laddr->sin6_addr)) {
			ndev = ib_device_get_netdev(id->device, SIW_PORT);
			if (ndev) {
				s->sk->sk_bound_dev_if = ndev->ifindex;
			} else {
				rv = -ENODEV;
				goto error;
			}
		}
		rv = s->ops->bind(s, (struct sockaddr *)laddr,
				  sizeof(struct sockaddr_in6));
	}
	/* Common check of the bind result of either address family */
	if (rv) {
		siw_dbg(id->device, "socket bind error: %d\n", rv);
		goto error;
	}
	cep = siw_cep_alloc(sdev);
	if (!cep) {
		rv = -ENOMEM;
		goto error;
	}
	siw_cep_socket_assoc(cep, s);

	rv = siw_cm_alloc_work(cep, backlog);
	if (rv) {
		siw_dbg(id->device,
			"alloc_work error %d, backlog %d\n",
			rv, backlog);
		goto error;
	}
	rv = s->ops->listen(s, backlog);
	if (rv) {
		siw_dbg(id->device, "listen error %d\n", rv);
		goto error;
	}
	/* The CEP holds a reference on the IWCM id while cep->cm_id is set */
	cep->cm_id = id;
	id->add_ref(id);

	/*
	 * In case of a wildcard rdma_listen on a multi-homed device,
	 * a listener's IWCM id is associated with more than one listening CEP.
	 *
	 * We currently use id->provider_data in three different ways:
	 *
	 * o For a listener's IWCM id, id->provider_data points to
	 *   the list_head of the list of listening CEPs.
	 *   Uses: siw_create_listen(), siw_destroy_listen()
	 *
	 * o For each accepted passive-side IWCM id, id->provider_data
	 *   points to the CEP itself. This is a consequence of
	 *   - siw_cm_upcall() setting event.provider_data = cep and
	 *   - the IWCM's cm_conn_req_handler() setting provider_data of the
	 *     new passive-side IWCM id equal to event.provider_data
	 *   Uses: siw_accept(), siw_reject()
	 *
	 * o For an active-side IWCM id, id->provider_data is not used at all.
	 *
	 */
	if (!id->provider_data) {
		/* First listening CEP of this id: create the list head */
		id->provider_data =
			kmalloc(sizeof(struct list_head), GFP_KERNEL);
		if (!id->provider_data) {
			rv = -ENOMEM;
			goto error;
		}
		INIT_LIST_HEAD((struct list_head *)id->provider_data);
	}
	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
	cep->state = SIW_EPSTATE_LISTENING;
	/*
	 * ndev is only set for a wildcard address.
	 * NOTE(review): relies on dev_put() accepting NULL - confirm.
	 */
	dev_put(ndev);

	siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr);

	return 0;

error:
	siw_dbg(id->device, "failed: %d\n", rv);

	if (cep) {
		siw_cep_set_inuse(cep);

		siw_free_cm_id(cep);
		cep->sock = NULL;
		/*
		 * Restores the socket callbacks and puts the CEP, if the
		 * socket is associated with a CEP.
		 */
		siw_socket_disassoc(s);
		cep->state = SIW_EPSTATE_CLOSED;

		siw_cep_set_free_and_put(cep);
	}
	sock_release(s);
	dev_put(ndev);

	return rv;
}
1908 */ 1909 list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { 1910 struct siw_cep *cep = list_entry(p, struct siw_cep, listenq); 1911 1912 list_del(p); 1913 1914 siw_dbg_cep(cep, "drop cep, state %d\n", cep->state); 1915 1916 siw_cep_set_inuse(cep); 1917 1918 siw_free_cm_id(cep); 1919 if (cep->sock) { 1920 siw_socket_disassoc(cep->sock); 1921 sock_release(cep->sock); 1922 cep->sock = NULL; 1923 } 1924 cep->state = SIW_EPSTATE_CLOSED; 1925 siw_cep_set_free_and_put(cep); 1926 } 1927 } 1928 1929 int siw_destroy_listen(struct iw_cm_id *id) 1930 { 1931 if (!id->provider_data) { 1932 siw_dbg(id->device, "no cep(s)\n"); 1933 return 0; 1934 } 1935 siw_drop_listeners(id); 1936 kfree(id->provider_data); 1937 id->provider_data = NULL; 1938 1939 return 0; 1940 } 1941 1942 int siw_cm_init(void) 1943 { 1944 /* 1945 * create_single_workqueue for strict ordering 1946 */ 1947 siw_cm_wq = create_singlethread_workqueue("siw_cm_wq"); 1948 if (!siw_cm_wq) 1949 return -ENOMEM; 1950 1951 return 0; 1952 } 1953 1954 void siw_cm_exit(void) 1955 { 1956 if (siw_cm_wq) 1957 destroy_workqueue(siw_cm_wq); 1958 } 1959