// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017, Microsoft Corporation.
 * Copyright (C) 2018, LG Electronics.
 *
 * Author(s): Long Li <longli@microsoft.com>,
 *            Hyunchul Lee <hyc.lee@gmail.com>
 */

#define SUBMOD_NAME	"smb_direct"

#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/highmem.h>
#include <linux/scatterlist.h>
#include <linux/string_choices.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#define __SMBDIRECT_SOCKET_DISCONNECT(__sc) smb_direct_disconnect_rdma_connection(__sc)

#include "glob.h"
#include "connection.h"
#include "smb_common.h"
#include "../common/smb2status.h"
#include "../common/smbdirect/smbdirect.h"
#include "../common/smbdirect/smbdirect_pdu.h"
#include "../common/smbdirect/smbdirect_socket.h"
#include "transport_rdma.h"

#define SMB_DIRECT_PORT_IWARP		5445
#define SMB_DIRECT_PORT_INFINIBAND	445

#define SMB_DIRECT_VERSION_LE		cpu_to_le16(SMBDIRECT_V1)

/* SMB_DIRECT negotiation timeout (for the server) in seconds */
#define SMB_DIRECT_NEGOTIATE_TIMEOUT		5

/* The interval in seconds after which a keepalive message is sent to the peer */
#define SMB_DIRECT_KEEPALIVE_SEND_INTERVAL	120

/* The timeout in seconds to wait for a keepalive response from the peer */
#define SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT	5

/*
 * Default maximum number of RDMA read/write operations outstanding on this
 * connection. This value may be decreased during QP creation, based on
 * hardware limits.
 */
#define SMB_DIRECT_CM_INITIATOR_DEPTH		8

/* Maximum number of retries on data transfer operations */
#define SMB_DIRECT_CM_RETRY			6
/* No need to retry on Receiver Not Ready since SMB_DIRECT manages credits */
#define SMB_DIRECT_CM_RNR_RETRY		0

/*
 * User configurable initial values per SMB_DIRECT transport connection
 * as defined in [MS-SMBD] 3.1.1.1.
 * Those may change after a SMB_DIRECT negotiation.
 */

/* Use port 445 as the SMB Direct port by default */
static int smb_direct_port = SMB_DIRECT_PORT_INFINIBAND;

/* The local peer's maximum number of credits to grant to the peer */
static int smb_direct_receive_credit_max = 255;

/* The number of send credits the local peer requests from the remote peer */
static int smb_direct_send_credit_target = 255;

/* The maximum size of a single message that can be sent to the remote peer */
static int smb_direct_max_send_size = 1364;

/* The maximum fragmented upper-layer payload receive size supported */
static int smb_direct_max_fragmented_recv_size = 1024 * 1024;

/* The maximum single-message size which can be received */
static int smb_direct_max_receive_size = 1364;

static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;

static LIST_HEAD(smb_direct_device_list);
static DEFINE_RWLOCK(smb_direct_device_lock);

struct smb_direct_device {
	struct ib_device	*ib_dev;
	struct list_head	list;
};

static struct smb_direct_listener {
	struct rdma_cm_id	*cm_id;
} smb_direct_listener;

static struct workqueue_struct *smb_direct_wq;

struct smb_direct_transport {
	struct ksmbd_transport	transport;

	struct smbdirect_socket socket;
};

#define KSMBD_TRANS(t)	(&(t)->transport)
#define SMBD_TRANS(t)	(container_of(t, \
			 struct smb_direct_transport, transport))

static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops;

void
init_smbd_max_io_size(unsigned int sz) 111 { 112 sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); 113 smb_direct_max_read_write_size = sz; 114 } 115 116 unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) 117 { 118 struct smb_direct_transport *t; 119 struct smbdirect_socket *sc; 120 struct smbdirect_socket_parameters *sp; 121 122 if (kt->ops != &ksmbd_smb_direct_transport_ops) 123 return 0; 124 125 t = SMBD_TRANS(kt); 126 sc = &t->socket; 127 sp = &sc->parameters; 128 129 return sp->max_read_write_size; 130 } 131 132 static inline int get_buf_page_count(void *buf, int size) 133 { 134 return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - 135 (uintptr_t)buf / PAGE_SIZE; 136 } 137 138 static void smb_direct_destroy_pools(struct smbdirect_socket *sc); 139 static void smb_direct_post_recv_credits(struct work_struct *work); 140 static int smb_direct_post_send_data(struct smbdirect_socket *sc, 141 struct smbdirect_send_batch *send_ctx, 142 struct kvec *iov, int niov, 143 int remaining_data_length); 144 145 static inline void 146 *smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg) 147 { 148 return (void *)recvmsg->packet; 149 } 150 151 static struct 152 smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc) 153 { 154 struct smbdirect_recv_io *recvmsg = NULL; 155 unsigned long flags; 156 157 spin_lock_irqsave(&sc->recv_io.free.lock, flags); 158 if (!list_empty(&sc->recv_io.free.list)) { 159 recvmsg = list_first_entry(&sc->recv_io.free.list, 160 struct smbdirect_recv_io, 161 list); 162 list_del(&recvmsg->list); 163 } 164 spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); 165 return recvmsg; 166 } 167 168 static void put_recvmsg(struct smbdirect_socket *sc, 169 struct smbdirect_recv_io *recvmsg) 170 { 171 unsigned long flags; 172 173 if (likely(recvmsg->sge.length != 0)) { 174 ib_dma_unmap_single(sc->ib.dev, 175 recvmsg->sge.addr, 176 recvmsg->sge.length, 177 DMA_FROM_DEVICE); 178 recvmsg->sge.length = 0; 179 } 180 181 spin_lock_irqsave(&sc->recv_io.free.lock, flags); 182 list_add(&recvmsg->list, &sc->recv_io.free.list); 183 spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); 184 185 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); 186 } 187 188 static void enqueue_reassembly(struct smbdirect_socket *sc, 189 struct smbdirect_recv_io *recvmsg, 190 int data_length) 191 { 192 unsigned long flags; 193 194 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 195 list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list); 196 sc->recv_io.reassembly.queue_length++; 197 /* 198 * Make sure reassembly_data_length is updated after list and 199 * reassembly_queue_length are updated. On the dequeue side 200 * reassembly_data_length is checked without a lock to determine 201 * if reassembly_queue_length and list is up to date 202 */ 203 virt_wmb(); 204 sc->recv_io.reassembly.data_length += data_length; 205 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 206 } 207 208 static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc) 209 { 210 if (!list_empty(&sc->recv_io.reassembly.list)) 211 return list_first_entry(&sc->recv_io.reassembly.list, 212 struct smbdirect_recv_io, list); 213 else 214 return NULL; 215 } 216 217 static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc) 218 { 219 /* 220 * Wake up all waiters in all wait queues 221 * in order to notice the broken connection. 
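	 *
	 * The waiters re-check sc->status (or the credit and pending
	 * counters) once they are woken, so they bail out instead of
	 * sleeping on a dead connection forever.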
222 */ 223 wake_up_all(&sc->status_wait); 224 wake_up_all(&sc->send_io.lcredits.wait_queue); 225 wake_up_all(&sc->send_io.credits.wait_queue); 226 wake_up_all(&sc->send_io.pending.zero_wait_queue); 227 wake_up_all(&sc->recv_io.reassembly.wait_queue); 228 wake_up_all(&sc->rw_io.credits.wait_queue); 229 } 230 231 static void smb_direct_disconnect_rdma_work(struct work_struct *work) 232 { 233 struct smbdirect_socket *sc = 234 container_of(work, struct smbdirect_socket, disconnect_work); 235 236 if (sc->first_error == 0) 237 sc->first_error = -ECONNABORTED; 238 239 /* 240 * make sure this and other work is not queued again 241 * but here we don't block and avoid 242 * disable[_delayed]_work_sync() 243 */ 244 disable_work(&sc->disconnect_work); 245 disable_work(&sc->connect.work); 246 disable_work(&sc->recv_io.posted.refill_work); 247 disable_delayed_work(&sc->idle.timer_work); 248 disable_work(&sc->idle.immediate_work); 249 250 switch (sc->status) { 251 case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: 252 case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: 253 case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: 254 case SMBDIRECT_SOCKET_CONNECTED: 255 case SMBDIRECT_SOCKET_ERROR: 256 sc->status = SMBDIRECT_SOCKET_DISCONNECTING; 257 rdma_disconnect(sc->rdma.cm_id); 258 break; 259 260 case SMBDIRECT_SOCKET_CREATED: 261 case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: 262 case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: 263 case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: 264 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: 265 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: 266 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: 267 case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: 268 case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: 269 case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: 270 /* 271 * rdma_accept() never reached 272 * RDMA_CM_EVENT_ESTABLISHED 273 */ 274 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 275 break; 276 277 case SMBDIRECT_SOCKET_DISCONNECTING: 278 case SMBDIRECT_SOCKET_DISCONNECTED: 279 case SMBDIRECT_SOCKET_DESTROYED: 280 break; 281 } 282 283 /* 284 * Wake up all waiters in all wait queues 285 * in order to notice the broken connection. 
286 */ 287 smb_direct_disconnect_wake_up_all(sc); 288 } 289 290 static void 291 smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) 292 { 293 if (sc->first_error == 0) 294 sc->first_error = -ECONNABORTED; 295 296 /* 297 * make sure other work (than disconnect_work) is 298 * not queued again but here we don't block and avoid 299 * disable[_delayed]_work_sync() 300 */ 301 disable_work(&sc->connect.work); 302 disable_work(&sc->recv_io.posted.refill_work); 303 disable_work(&sc->idle.immediate_work); 304 disable_delayed_work(&sc->idle.timer_work); 305 306 switch (sc->status) { 307 case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: 308 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: 309 case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: 310 case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: 311 case SMBDIRECT_SOCKET_ERROR: 312 case SMBDIRECT_SOCKET_DISCONNECTING: 313 case SMBDIRECT_SOCKET_DISCONNECTED: 314 case SMBDIRECT_SOCKET_DESTROYED: 315 /* 316 * Keep the current error status 317 */ 318 break; 319 320 case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: 321 case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: 322 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; 323 break; 324 325 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: 326 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: 327 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; 328 break; 329 330 case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: 331 case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: 332 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; 333 break; 334 335 case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: 336 case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: 337 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; 338 break; 339 340 case SMBDIRECT_SOCKET_CREATED: 341 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 342 break; 343 344 case SMBDIRECT_SOCKET_CONNECTED: 345 sc->status = SMBDIRECT_SOCKET_ERROR; 346 break; 347 } 348 349 /* 350 * Wake up all waiters in all wait queues 351 * in order to notice the broken connection. 
352 */ 353 smb_direct_disconnect_wake_up_all(sc); 354 355 queue_work(sc->workqueue, &sc->disconnect_work); 356 } 357 358 static void smb_direct_send_immediate_work(struct work_struct *work) 359 { 360 struct smbdirect_socket *sc = 361 container_of(work, struct smbdirect_socket, idle.immediate_work); 362 363 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 364 return; 365 366 smb_direct_post_send_data(sc, NULL, NULL, 0, 0); 367 } 368 369 static void smb_direct_idle_connection_timer(struct work_struct *work) 370 { 371 struct smbdirect_socket *sc = 372 container_of(work, struct smbdirect_socket, idle.timer_work.work); 373 struct smbdirect_socket_parameters *sp = &sc->parameters; 374 375 if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { 376 smb_direct_disconnect_rdma_connection(sc); 377 return; 378 } 379 380 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 381 return; 382 383 /* 384 * Now use the keepalive timeout (instead of keepalive interval) 385 * in order to wait for a response 386 */ 387 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; 388 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 389 msecs_to_jiffies(sp->keepalive_timeout_msec)); 390 queue_work(sc->workqueue, &sc->idle.immediate_work); 391 } 392 393 static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) 394 { 395 struct smb_direct_transport *t; 396 struct smbdirect_socket *sc; 397 struct smbdirect_socket_parameters *sp; 398 struct ksmbd_conn *conn; 399 400 t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP); 401 if (!t) 402 return NULL; 403 sc = &t->socket; 404 smbdirect_socket_init(sc); 405 sp = &sc->parameters; 406 407 sc->workqueue = smb_direct_wq; 408 409 INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); 410 411 sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000; 412 sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH; 413 sp->responder_resources = 1; 414 sp->recv_credit_max = smb_direct_receive_credit_max; 415 sp->send_credit_target = smb_direct_send_credit_target; 416 sp->max_send_size = smb_direct_max_send_size; 417 sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; 418 sp->max_recv_size = smb_direct_max_receive_size; 419 sp->max_read_write_size = smb_direct_max_read_write_size; 420 sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000; 421 sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000; 422 423 sc->rdma.cm_id = cm_id; 424 cm_id->context = sc; 425 426 sc->ib.dev = sc->rdma.cm_id->device; 427 428 INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer); 429 430 conn = ksmbd_conn_alloc(); 431 if (!conn) 432 goto err; 433 434 down_write(&conn_list_lock); 435 hash_add(conn_list, &conn->hlist, 0); 436 up_write(&conn_list_lock); 437 438 conn->transport = KSMBD_TRANS(t); 439 KSMBD_TRANS(t)->conn = conn; 440 KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops; 441 return t; 442 err: 443 kfree(t); 444 return NULL; 445 } 446 447 static void smb_direct_free_transport(struct ksmbd_transport *kt) 448 { 449 kfree(SMBD_TRANS(kt)); 450 } 451 452 static void free_transport(struct smb_direct_transport *t) 453 { 454 struct smbdirect_socket *sc = &t->socket; 455 struct smbdirect_recv_io *recvmsg; 456 457 disable_work_sync(&sc->disconnect_work); 458 if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) 459 smb_direct_disconnect_rdma_work(&sc->disconnect_work); 460 if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) 461 wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); 462 463 /* 464 * Wake up all waiters in 
all wait queues 465 * in order to notice the broken connection. 466 * 467 * Most likely this was already called via 468 * smb_direct_disconnect_rdma_work(), but call it again... 469 */ 470 smb_direct_disconnect_wake_up_all(sc); 471 472 disable_work_sync(&sc->connect.work); 473 disable_work_sync(&sc->recv_io.posted.refill_work); 474 disable_delayed_work_sync(&sc->idle.timer_work); 475 disable_work_sync(&sc->idle.immediate_work); 476 477 if (sc->rdma.cm_id) 478 rdma_lock_handler(sc->rdma.cm_id); 479 480 if (sc->ib.qp) { 481 ib_drain_qp(sc->ib.qp); 482 sc->ib.qp = NULL; 483 rdma_destroy_qp(sc->rdma.cm_id); 484 } 485 486 ksmbd_debug(RDMA, "drain the reassembly queue\n"); 487 do { 488 unsigned long flags; 489 490 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 491 recvmsg = get_first_reassembly(sc); 492 if (recvmsg) { 493 list_del(&recvmsg->list); 494 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 495 put_recvmsg(sc, recvmsg); 496 } else { 497 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 498 } 499 } while (recvmsg); 500 sc->recv_io.reassembly.data_length = 0; 501 502 if (sc->ib.send_cq) 503 ib_free_cq(sc->ib.send_cq); 504 if (sc->ib.recv_cq) 505 ib_free_cq(sc->ib.recv_cq); 506 if (sc->ib.pd) 507 ib_dealloc_pd(sc->ib.pd); 508 if (sc->rdma.cm_id) { 509 rdma_unlock_handler(sc->rdma.cm_id); 510 rdma_destroy_id(sc->rdma.cm_id); 511 } 512 513 smb_direct_destroy_pools(sc); 514 ksmbd_conn_free(KSMBD_TRANS(t)->conn); 515 } 516 517 static struct smbdirect_send_io 518 *smb_direct_alloc_sendmsg(struct smbdirect_socket *sc) 519 { 520 struct smbdirect_send_io *msg; 521 522 msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP); 523 if (!msg) 524 return ERR_PTR(-ENOMEM); 525 msg->socket = sc; 526 INIT_LIST_HEAD(&msg->sibling_list); 527 msg->num_sge = 0; 528 return msg; 529 } 530 531 static void smb_direct_free_sendmsg(struct smbdirect_socket *sc, 532 struct smbdirect_send_io *msg) 533 { 534 int i; 535 536 /* 537 * The list needs to be empty! 538 * The caller should take care of it. 
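	 * (send_done() and the flush error path both remove each
	 * sibling from the list before freeing it, so this warning
	 * only triggers on a programming error.)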
539 */ 540 WARN_ON_ONCE(!list_empty(&msg->sibling_list)); 541 542 if (msg->num_sge > 0) { 543 ib_dma_unmap_single(sc->ib.dev, 544 msg->sge[0].addr, msg->sge[0].length, 545 DMA_TO_DEVICE); 546 for (i = 1; i < msg->num_sge; i++) 547 ib_dma_unmap_page(sc->ib.dev, 548 msg->sge[i].addr, msg->sge[i].length, 549 DMA_TO_DEVICE); 550 } 551 mempool_free(msg, sc->send_io.mem.pool); 552 } 553 554 static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg) 555 { 556 struct smbdirect_socket *sc = recvmsg->socket; 557 558 switch (sc->recv_io.expected) { 559 case SMBDIRECT_EXPECT_DATA_TRANSFER: { 560 struct smbdirect_data_transfer *req = 561 (struct smbdirect_data_transfer *)recvmsg->packet; 562 struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet 563 + le32_to_cpu(req->data_offset)); 564 ksmbd_debug(RDMA, 565 "CreditGranted: %u, CreditRequested: %u, DataLength: %u, RemainingDataLength: %u, SMB: %x, Command: %u\n", 566 le16_to_cpu(req->credits_granted), 567 le16_to_cpu(req->credits_requested), 568 req->data_length, req->remaining_data_length, 569 hdr->ProtocolId, hdr->Command); 570 return 0; 571 } 572 case SMBDIRECT_EXPECT_NEGOTIATE_REQ: { 573 struct smbdirect_negotiate_req *req = 574 (struct smbdirect_negotiate_req *)recvmsg->packet; 575 ksmbd_debug(RDMA, 576 "MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n", 577 le16_to_cpu(req->min_version), 578 le16_to_cpu(req->max_version), 579 le16_to_cpu(req->credits_requested), 580 le32_to_cpu(req->preferred_send_size), 581 le32_to_cpu(req->max_receive_size), 582 le32_to_cpu(req->max_fragmented_size)); 583 if (le16_to_cpu(req->min_version) > 0x0100 || 584 le16_to_cpu(req->max_version) < 0x0100) 585 return -EOPNOTSUPP; 586 if (le16_to_cpu(req->credits_requested) <= 0 || 587 le32_to_cpu(req->max_receive_size) <= 128 || 588 le32_to_cpu(req->max_fragmented_size) <= 589 128 * 1024) 590 return -ECONNABORTED; 591 592 return 0; 593 } 594 case SMBDIRECT_EXPECT_NEGOTIATE_REP: 595 /* client only */ 596 break; 597 } 598 599 /* This is an internal error */ 600 return -EINVAL; 601 } 602 603 static void recv_done(struct ib_cq *cq, struct ib_wc *wc) 604 { 605 struct smbdirect_recv_io *recvmsg; 606 struct smbdirect_socket *sc; 607 struct smbdirect_socket_parameters *sp; 608 609 recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); 610 sc = recvmsg->socket; 611 sp = &sc->parameters; 612 613 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { 614 put_recvmsg(sc, recvmsg); 615 if (wc->status != IB_WC_WR_FLUSH_ERR) { 616 pr_err("Recv error. status='%s (%d)' opcode=%d\n", 617 ib_wc_status_msg(wc->status), wc->status, 618 wc->opcode); 619 smb_direct_disconnect_rdma_connection(sc); 620 } 621 return; 622 } 623 624 ksmbd_debug(RDMA, "Recv completed. status='%s (%d)', opcode=%d\n", 625 ib_wc_status_msg(wc->status), wc->status, 626 wc->opcode); 627 628 ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr, 629 recvmsg->sge.length, DMA_FROM_DEVICE); 630 631 /* 632 * Reset timer to the keepalive interval in 633 * order to trigger our next keepalive message. 
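	 * Any successful receive is proof that the peer is still alive,
	 * so the keepalive state drops back to SMBDIRECT_KEEPALIVE_NONE
	 * and the timer is re-armed with the send interval.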
634 */ 635 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; 636 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 637 msecs_to_jiffies(sp->keepalive_interval_msec)); 638 639 switch (sc->recv_io.expected) { 640 case SMBDIRECT_EXPECT_NEGOTIATE_REQ: 641 /* see smb_direct_negotiate_recv_done */ 642 break; 643 case SMBDIRECT_EXPECT_DATA_TRANSFER: { 644 struct smbdirect_data_transfer *data_transfer = 645 (struct smbdirect_data_transfer *)recvmsg->packet; 646 u32 remaining_data_length, data_offset, data_length; 647 u16 old_recv_credit_target; 648 649 if (wc->byte_len < 650 offsetof(struct smbdirect_data_transfer, padding)) { 651 put_recvmsg(sc, recvmsg); 652 smb_direct_disconnect_rdma_connection(sc); 653 return; 654 } 655 656 remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); 657 data_length = le32_to_cpu(data_transfer->data_length); 658 data_offset = le32_to_cpu(data_transfer->data_offset); 659 if (wc->byte_len < data_offset || 660 wc->byte_len < (u64)data_offset + data_length) { 661 put_recvmsg(sc, recvmsg); 662 smb_direct_disconnect_rdma_connection(sc); 663 return; 664 } 665 if (remaining_data_length > sp->max_fragmented_recv_size || 666 data_length > sp->max_fragmented_recv_size || 667 (u64)remaining_data_length + (u64)data_length > 668 (u64)sp->max_fragmented_recv_size) { 669 put_recvmsg(sc, recvmsg); 670 smb_direct_disconnect_rdma_connection(sc); 671 return; 672 } 673 674 if (data_length) { 675 if (sc->recv_io.reassembly.full_packet_received) 676 recvmsg->first_segment = true; 677 678 if (le32_to_cpu(data_transfer->remaining_data_length)) 679 sc->recv_io.reassembly.full_packet_received = false; 680 else 681 sc->recv_io.reassembly.full_packet_received = true; 682 } 683 684 atomic_dec(&sc->recv_io.posted.count); 685 atomic_dec(&sc->recv_io.credits.count); 686 687 old_recv_credit_target = sc->recv_io.credits.target; 688 sc->recv_io.credits.target = 689 le16_to_cpu(data_transfer->credits_requested); 690 sc->recv_io.credits.target = 691 min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); 692 sc->recv_io.credits.target = 693 max_t(u16, sc->recv_io.credits.target, 1); 694 atomic_add(le16_to_cpu(data_transfer->credits_granted), 695 &sc->send_io.credits.count); 696 697 if (le16_to_cpu(data_transfer->flags) & 698 SMBDIRECT_FLAG_RESPONSE_REQUESTED) 699 queue_work(sc->workqueue, &sc->idle.immediate_work); 700 701 if (atomic_read(&sc->send_io.credits.count) > 0) 702 wake_up(&sc->send_io.credits.wait_queue); 703 704 if (data_length) { 705 if (sc->recv_io.credits.target > old_recv_credit_target) 706 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); 707 708 enqueue_reassembly(sc, recvmsg, (int)data_length); 709 wake_up(&sc->recv_io.reassembly.wait_queue); 710 } else 711 put_recvmsg(sc, recvmsg); 712 713 return; 714 } 715 case SMBDIRECT_EXPECT_NEGOTIATE_REP: 716 /* client only */ 717 break; 718 } 719 720 /* 721 * This is an internal error! 722 */ 723 WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); 724 put_recvmsg(sc, recvmsg); 725 smb_direct_disconnect_rdma_connection(sc); 726 } 727 728 static void smb_direct_negotiate_recv_work(struct work_struct *work); 729 730 static void smb_direct_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc) 731 { 732 struct smbdirect_recv_io *recv_io = 733 container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); 734 struct smbdirect_socket *sc = recv_io->socket; 735 unsigned long flags; 736 737 /* 738 * reset the common recv_done for later reuse. 
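	 * The same buffer can then be re-posted later (by the refill
	 * work) with the regular data-transfer completion handler.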
739 */ 740 recv_io->cqe.done = recv_done; 741 742 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { 743 put_recvmsg(sc, recv_io); 744 if (wc->status != IB_WC_WR_FLUSH_ERR) { 745 pr_err("Negotiate Recv error. status='%s (%d)' opcode=%d\n", 746 ib_wc_status_msg(wc->status), wc->status, 747 wc->opcode); 748 smb_direct_disconnect_rdma_connection(sc); 749 } 750 return; 751 } 752 753 ksmbd_debug(RDMA, "Negotiate Recv completed. status='%s (%d)', opcode=%d\n", 754 ib_wc_status_msg(wc->status), wc->status, 755 wc->opcode); 756 757 ib_dma_sync_single_for_cpu(sc->ib.dev, 758 recv_io->sge.addr, 759 recv_io->sge.length, 760 DMA_FROM_DEVICE); 761 762 /* 763 * This is an internal error! 764 */ 765 if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REQ)) { 766 put_recvmsg(sc, recv_io); 767 smb_direct_disconnect_rdma_connection(sc); 768 return; 769 } 770 771 /* 772 * Don't reset timer to the keepalive interval in 773 * this will be done in smb_direct_negotiate_recv_work. 774 */ 775 776 /* 777 * Only remember the recv_io if it has enough bytes, 778 * this gives smb_direct_negotiate_recv_work enough 779 * information in order to disconnect if it was not 780 * valid. 781 */ 782 sc->recv_io.reassembly.full_packet_received = true; 783 if (wc->byte_len >= sizeof(struct smbdirect_negotiate_req)) 784 enqueue_reassembly(sc, recv_io, 0); 785 else 786 put_recvmsg(sc, recv_io); 787 788 /* 789 * Some drivers (at least mlx5_ib and irdma in roce mode) 790 * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED, 791 * we need to adjust our expectation in that case. 792 * 793 * So we defer further processing of the negotiation 794 * to smb_direct_negotiate_recv_work(). 795 * 796 * If we are already in SMBDIRECT_SOCKET_NEGOTIATE_NEEDED 797 * we queue the work directly otherwise 798 * smb_direct_cm_handler() will do it, when 799 * RDMA_CM_EVENT_ESTABLISHED arrived. 800 */ 801 spin_lock_irqsave(&sc->connect.lock, flags); 802 if (!sc->first_error) { 803 INIT_WORK(&sc->connect.work, smb_direct_negotiate_recv_work); 804 if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) 805 queue_work(sc->workqueue, &sc->connect.work); 806 } 807 spin_unlock_irqrestore(&sc->connect.lock, flags); 808 } 809 810 static void smb_direct_negotiate_recv_work(struct work_struct *work) 811 { 812 struct smbdirect_socket *sc = 813 container_of(work, struct smbdirect_socket, connect.work); 814 const struct smbdirect_socket_parameters *sp = &sc->parameters; 815 struct smbdirect_recv_io *recv_io; 816 817 if (sc->first_error) 818 return; 819 820 ksmbd_debug(RDMA, "Negotiate Recv Work running\n"); 821 822 /* 823 * Reset timer to the keepalive interval in 824 * order to trigger our next keepalive message. 825 */ 826 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; 827 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 828 msecs_to_jiffies(sp->keepalive_interval_msec)); 829 830 /* 831 * If smb_direct_negotiate_recv_done() detected an 832 * invalid request we want to disconnect. 
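	 * It only queued the message on the reassembly list if at least
	 * sizeof(struct smbdirect_negotiate_req) bytes were received,
	 * so an empty list here means the request was invalid.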
833 */ 834 recv_io = get_first_reassembly(sc); 835 if (!recv_io) { 836 smb_direct_disconnect_rdma_connection(sc); 837 return; 838 } 839 840 if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) { 841 smb_direct_disconnect_rdma_connection(sc); 842 return; 843 } 844 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; 845 wake_up(&sc->status_wait); 846 } 847 848 static int smb_direct_post_recv(struct smbdirect_socket *sc, 849 struct smbdirect_recv_io *recvmsg) 850 { 851 struct smbdirect_socket_parameters *sp = &sc->parameters; 852 struct ib_recv_wr wr; 853 int ret; 854 855 recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev, 856 recvmsg->packet, 857 sp->max_recv_size, 858 DMA_FROM_DEVICE); 859 ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr); 860 if (ret) 861 return ret; 862 recvmsg->sge.length = sp->max_recv_size; 863 recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey; 864 865 wr.wr_cqe = &recvmsg->cqe; 866 wr.next = NULL; 867 wr.sg_list = &recvmsg->sge; 868 wr.num_sge = 1; 869 870 ret = ib_post_recv(sc->ib.qp, &wr, NULL); 871 if (ret) { 872 pr_err("Can't post recv: %d\n", ret); 873 ib_dma_unmap_single(sc->ib.dev, 874 recvmsg->sge.addr, recvmsg->sge.length, 875 DMA_FROM_DEVICE); 876 recvmsg->sge.length = 0; 877 smb_direct_disconnect_rdma_connection(sc); 878 return ret; 879 } 880 return ret; 881 } 882 883 static int smb_direct_read(struct ksmbd_transport *t, char *buf, 884 unsigned int size, int unused) 885 { 886 struct smbdirect_recv_io *recvmsg; 887 struct smbdirect_data_transfer *data_transfer; 888 int to_copy, to_read, data_read, offset; 889 u32 data_length, remaining_data_length, data_offset; 890 int rc; 891 struct smb_direct_transport *st = SMBD_TRANS(t); 892 struct smbdirect_socket *sc = &st->socket; 893 894 again: 895 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { 896 pr_err("disconnected\n"); 897 return -ENOTCONN; 898 } 899 900 /* 901 * No need to hold the reassembly queue lock all the time as we are 902 * the only one reading from the front of the queue. The transport 903 * may add more entries to the back of the queue at the same time 904 */ 905 if (sc->recv_io.reassembly.data_length >= size) { 906 int queue_length; 907 int queue_removed = 0; 908 unsigned long flags; 909 910 /* 911 * Need to make sure reassembly_data_length is read before 912 * reading reassembly_queue_length and calling 913 * get_first_reassembly. This call is lock free 914 * as we never read at the end of the queue which are being 915 * updated in SOFTIRQ as more data is received 916 */ 917 virt_rmb(); 918 queue_length = sc->recv_io.reassembly.queue_length; 919 data_read = 0; 920 to_read = size; 921 offset = sc->recv_io.reassembly.first_entry_offset; 922 while (data_read < size) { 923 recvmsg = get_first_reassembly(sc); 924 data_transfer = smbdirect_recv_io_payload(recvmsg); 925 data_length = le32_to_cpu(data_transfer->data_length); 926 remaining_data_length = 927 le32_to_cpu(data_transfer->remaining_data_length); 928 data_offset = le32_to_cpu(data_transfer->data_offset); 929 930 /* 931 * The upper layer expects RFC1002 length at the 932 * beginning of the payload. Return it to indicate 933 * the total length of the packet. This minimize the 934 * change to upper layer packet processing logic. 
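			 * (The value returned is data_length plus
			 * remaining_data_length of the first fragment,
			 * i.e. the size of the complete SMB2 message.)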
This 935 * will be eventually remove when an intermediate 936 * transport layer is added 937 */ 938 if (recvmsg->first_segment && size == 4) { 939 unsigned int rfc1002_len = 940 data_length + remaining_data_length; 941 *((__be32 *)buf) = cpu_to_be32(rfc1002_len); 942 data_read = 4; 943 recvmsg->first_segment = false; 944 ksmbd_debug(RDMA, 945 "returning rfc1002 length %d\n", 946 rfc1002_len); 947 goto read_rfc1002_done; 948 } 949 950 to_copy = min_t(int, data_length - offset, to_read); 951 memcpy(buf + data_read, (char *)data_transfer + data_offset + offset, 952 to_copy); 953 954 /* move on to the next buffer? */ 955 if (to_copy == data_length - offset) { 956 queue_length--; 957 /* 958 * No need to lock if we are not at the 959 * end of the queue 960 */ 961 if (queue_length) { 962 list_del(&recvmsg->list); 963 } else { 964 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 965 list_del(&recvmsg->list); 966 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 967 } 968 queue_removed++; 969 put_recvmsg(sc, recvmsg); 970 offset = 0; 971 } else { 972 offset += to_copy; 973 } 974 975 to_read -= to_copy; 976 data_read += to_copy; 977 } 978 979 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 980 sc->recv_io.reassembly.data_length -= data_read; 981 sc->recv_io.reassembly.queue_length -= queue_removed; 982 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 983 984 sc->recv_io.reassembly.first_entry_offset = offset; 985 ksmbd_debug(RDMA, 986 "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", 987 data_read, sc->recv_io.reassembly.data_length, 988 sc->recv_io.reassembly.first_entry_offset); 989 read_rfc1002_done: 990 return data_read; 991 } 992 993 ksmbd_debug(RDMA, "wait_event on more data\n"); 994 rc = wait_event_interruptible(sc->recv_io.reassembly.wait_queue, 995 sc->recv_io.reassembly.data_length >= size || 996 sc->status != SMBDIRECT_SOCKET_CONNECTED); 997 if (rc) 998 return -EINTR; 999 1000 goto again; 1001 } 1002 1003 static void smb_direct_post_recv_credits(struct work_struct *work) 1004 { 1005 struct smbdirect_socket *sc = 1006 container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); 1007 struct smbdirect_recv_io *recvmsg; 1008 int credits = 0; 1009 int ret; 1010 1011 if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) { 1012 while (true) { 1013 recvmsg = get_free_recvmsg(sc); 1014 if (!recvmsg) 1015 break; 1016 1017 recvmsg->first_segment = false; 1018 1019 ret = smb_direct_post_recv(sc, recvmsg); 1020 if (ret) { 1021 pr_err("Can't post recv: %d\n", ret); 1022 put_recvmsg(sc, recvmsg); 1023 break; 1024 } 1025 credits++; 1026 1027 atomic_inc(&sc->recv_io.posted.count); 1028 } 1029 } 1030 1031 if (credits) 1032 queue_work(sc->workqueue, &sc->idle.immediate_work); 1033 } 1034 1035 static void send_done(struct ib_cq *cq, struct ib_wc *wc) 1036 { 1037 struct smbdirect_send_io *sendmsg, *sibling, *next; 1038 struct smbdirect_socket *sc; 1039 int lcredits = 0; 1040 1041 sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); 1042 sc = sendmsg->socket; 1043 1044 ksmbd_debug(RDMA, "Send completed. 
status='%s (%d)', opcode=%d\n", 1045 ib_wc_status_msg(wc->status), wc->status, 1046 wc->opcode); 1047 1048 /* 1049 * Free possible siblings and then the main send_io 1050 */ 1051 list_for_each_entry_safe(sibling, next, &sendmsg->sibling_list, sibling_list) { 1052 list_del_init(&sibling->sibling_list); 1053 smb_direct_free_sendmsg(sc, sibling); 1054 lcredits += 1; 1055 } 1056 /* Note this frees wc->wr_cqe, but not wc */ 1057 smb_direct_free_sendmsg(sc, sendmsg); 1058 lcredits += 1; 1059 1060 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { 1061 pr_err("Send error. status='%s (%d)', opcode=%d\n", 1062 ib_wc_status_msg(wc->status), wc->status, 1063 wc->opcode); 1064 smb_direct_disconnect_rdma_connection(sc); 1065 return; 1066 } 1067 1068 atomic_add(lcredits, &sc->send_io.lcredits.count); 1069 wake_up(&sc->send_io.lcredits.wait_queue); 1070 1071 if (atomic_dec_and_test(&sc->send_io.pending.count)) 1072 wake_up(&sc->send_io.pending.zero_wait_queue); 1073 } 1074 1075 static int manage_credits_prior_sending(struct smbdirect_socket *sc) 1076 { 1077 int new_credits; 1078 1079 if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) 1080 return 0; 1081 1082 new_credits = atomic_read(&sc->recv_io.posted.count); 1083 if (new_credits == 0) 1084 return 0; 1085 1086 new_credits -= atomic_read(&sc->recv_io.credits.count); 1087 if (new_credits <= 0) 1088 return 0; 1089 1090 atomic_add(new_credits, &sc->recv_io.credits.count); 1091 return new_credits; 1092 } 1093 1094 static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) 1095 { 1096 struct smbdirect_socket_parameters *sp = &sc->parameters; 1097 1098 if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { 1099 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; 1100 /* 1101 * Now use the keepalive timeout (instead of keepalive interval) 1102 * in order to wait for a response 1103 */ 1104 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 1105 msecs_to_jiffies(sp->keepalive_timeout_msec)); 1106 return 1; 1107 } 1108 return 0; 1109 } 1110 1111 static int smb_direct_post_send(struct smbdirect_socket *sc, 1112 struct ib_send_wr *wr) 1113 { 1114 int ret; 1115 1116 atomic_inc(&sc->send_io.pending.count); 1117 ret = ib_post_send(sc->ib.qp, wr, NULL); 1118 if (ret) { 1119 pr_err("failed to post send: %d\n", ret); 1120 smb_direct_disconnect_rdma_connection(sc); 1121 } 1122 return ret; 1123 } 1124 1125 static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx, 1126 bool need_invalidate_rkey, 1127 unsigned int remote_key) 1128 { 1129 INIT_LIST_HEAD(&send_ctx->msg_list); 1130 send_ctx->wr_cnt = 0; 1131 send_ctx->need_invalidate_rkey = need_invalidate_rkey; 1132 send_ctx->remote_key = remote_key; 1133 } 1134 1135 static int smb_direct_flush_send_list(struct smbdirect_socket *sc, 1136 struct smbdirect_send_batch *send_ctx, 1137 bool is_last) 1138 { 1139 struct smbdirect_send_io *first, *last; 1140 int ret; 1141 1142 if (list_empty(&send_ctx->msg_list)) 1143 return 0; 1144 1145 first = list_first_entry(&send_ctx->msg_list, 1146 struct smbdirect_send_io, 1147 sibling_list); 1148 last = list_last_entry(&send_ctx->msg_list, 1149 struct smbdirect_send_io, 1150 sibling_list); 1151 1152 if (send_ctx->need_invalidate_rkey) { 1153 first->wr.opcode = IB_WR_SEND_WITH_INV; 1154 first->wr.ex.invalidate_rkey = send_ctx->remote_key; 1155 send_ctx->need_invalidate_rkey = false; 1156 send_ctx->remote_key = 0; 1157 } 1158 1159 last->wr.send_flags = IB_SEND_SIGNALED; 1160 last->wr.wr_cqe = &last->cqe; 1161 1162 /* 1163 * 
Remove last from send_ctx->msg_list 1164 * and splice the rest of send_ctx->msg_list 1165 * to last->sibling_list. 1166 * 1167 * send_ctx->msg_list is a valid empty list 1168 * at the end. 1169 */ 1170 list_del_init(&last->sibling_list); 1171 list_splice_tail_init(&send_ctx->msg_list, &last->sibling_list); 1172 send_ctx->wr_cnt = 0; 1173 1174 ret = smb_direct_post_send(sc, &first->wr); 1175 if (ret) { 1176 struct smbdirect_send_io *sibling, *next; 1177 1178 list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) { 1179 list_del_init(&sibling->sibling_list); 1180 smb_direct_free_sendmsg(sc, sibling); 1181 } 1182 smb_direct_free_sendmsg(sc, last); 1183 } 1184 1185 return ret; 1186 } 1187 1188 static int wait_for_credits(struct smbdirect_socket *sc, 1189 wait_queue_head_t *waitq, atomic_t *total_credits, 1190 int needed) 1191 { 1192 int ret; 1193 1194 do { 1195 if (atomic_sub_return(needed, total_credits) >= 0) 1196 return 0; 1197 1198 atomic_add(needed, total_credits); 1199 ret = wait_event_interruptible(*waitq, 1200 atomic_read(total_credits) >= needed || 1201 sc->status != SMBDIRECT_SOCKET_CONNECTED); 1202 1203 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 1204 return -ENOTCONN; 1205 else if (ret < 0) 1206 return ret; 1207 } while (true); 1208 } 1209 1210 static int wait_for_send_lcredit(struct smbdirect_socket *sc, 1211 struct smbdirect_send_batch *send_ctx) 1212 { 1213 if (send_ctx && (atomic_read(&sc->send_io.lcredits.count) <= 1)) { 1214 int ret; 1215 1216 ret = smb_direct_flush_send_list(sc, send_ctx, false); 1217 if (ret) 1218 return ret; 1219 } 1220 1221 return wait_for_credits(sc, 1222 &sc->send_io.lcredits.wait_queue, 1223 &sc->send_io.lcredits.count, 1224 1); 1225 } 1226 1227 static int wait_for_send_credits(struct smbdirect_socket *sc, 1228 struct smbdirect_send_batch *send_ctx) 1229 { 1230 int ret; 1231 1232 if (send_ctx && 1233 (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) { 1234 ret = smb_direct_flush_send_list(sc, send_ctx, false); 1235 if (ret) 1236 return ret; 1237 } 1238 1239 return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1); 1240 } 1241 1242 static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits) 1243 { 1244 return wait_for_credits(sc, 1245 &sc->rw_io.credits.wait_queue, 1246 &sc->rw_io.credits.count, 1247 credits); 1248 } 1249 1250 static int calc_rw_credits(struct smbdirect_socket *sc, 1251 char *buf, unsigned int len) 1252 { 1253 return DIV_ROUND_UP(get_buf_page_count(buf, len), 1254 sc->rw_io.credits.num_pages); 1255 } 1256 1257 static int smb_direct_create_header(struct smbdirect_socket *sc, 1258 int size, int remaining_data_length, 1259 struct smbdirect_send_io **sendmsg_out) 1260 { 1261 struct smbdirect_socket_parameters *sp = &sc->parameters; 1262 struct smbdirect_send_io *sendmsg; 1263 struct smbdirect_data_transfer *packet; 1264 int header_length; 1265 int ret; 1266 1267 sendmsg = smb_direct_alloc_sendmsg(sc); 1268 if (IS_ERR(sendmsg)) 1269 return PTR_ERR(sendmsg); 1270 1271 /* Fill in the packet header */ 1272 packet = (struct smbdirect_data_transfer *)sendmsg->packet; 1273 packet->credits_requested = cpu_to_le16(sp->send_credit_target); 1274 packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); 1275 1276 packet->flags = 0; 1277 if (manage_keep_alive_before_sending(sc)) 1278 packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); 1279 1280 packet->reserved = 0; 1281 if (!size) 1282 packet->data_offset = 0; 1283 else 1284 
packet->data_offset = cpu_to_le32(24); 1285 packet->data_length = cpu_to_le32(size); 1286 packet->remaining_data_length = cpu_to_le32(remaining_data_length); 1287 packet->padding = 0; 1288 1289 ksmbd_debug(RDMA, 1290 "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n", 1291 le16_to_cpu(packet->credits_requested), 1292 le16_to_cpu(packet->credits_granted), 1293 le32_to_cpu(packet->data_offset), 1294 le32_to_cpu(packet->data_length), 1295 le32_to_cpu(packet->remaining_data_length)); 1296 1297 /* Map the packet to DMA */ 1298 header_length = sizeof(struct smbdirect_data_transfer); 1299 /* If this is a packet without payload, don't send padding */ 1300 if (!size) 1301 header_length = 1302 offsetof(struct smbdirect_data_transfer, padding); 1303 1304 sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, 1305 (void *)packet, 1306 header_length, 1307 DMA_TO_DEVICE); 1308 ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); 1309 if (ret) { 1310 smb_direct_free_sendmsg(sc, sendmsg); 1311 return ret; 1312 } 1313 1314 sendmsg->num_sge = 1; 1315 sendmsg->sge[0].length = header_length; 1316 sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; 1317 1318 *sendmsg_out = sendmsg; 1319 return 0; 1320 } 1321 1322 static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nentries) 1323 { 1324 bool high = is_vmalloc_addr(buf); 1325 struct page *page; 1326 int offset, len; 1327 int i = 0; 1328 1329 if (size <= 0 || nentries < get_buf_page_count(buf, size)) 1330 return -EINVAL; 1331 1332 offset = offset_in_page(buf); 1333 buf -= offset; 1334 while (size > 0) { 1335 len = min_t(int, PAGE_SIZE - offset, size); 1336 if (high) 1337 page = vmalloc_to_page(buf); 1338 else 1339 page = kmap_to_page(buf); 1340 1341 if (!sg_list) 1342 return -EINVAL; 1343 sg_set_page(sg_list, page, len, offset); 1344 sg_list = sg_next(sg_list); 1345 1346 buf += PAGE_SIZE; 1347 size -= len; 1348 offset = 0; 1349 i++; 1350 } 1351 return i; 1352 } 1353 1354 static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, 1355 struct scatterlist *sg_list, int nentries, 1356 enum dma_data_direction dir, int *npages) 1357 { 1358 *npages = get_sg_list(buf, size, sg_list, nentries); 1359 if (*npages < 0) 1360 return -EINVAL; 1361 return ib_dma_map_sg(device, sg_list, *npages, dir); 1362 } 1363 1364 static int post_sendmsg(struct smbdirect_socket *sc, 1365 struct smbdirect_send_batch *send_ctx, 1366 struct smbdirect_send_io *msg) 1367 { 1368 int i; 1369 1370 for (i = 0; i < msg->num_sge; i++) 1371 ib_dma_sync_single_for_device(sc->ib.dev, 1372 msg->sge[i].addr, msg->sge[i].length, 1373 DMA_TO_DEVICE); 1374 1375 msg->cqe.done = send_done; 1376 msg->wr.opcode = IB_WR_SEND; 1377 msg->wr.sg_list = &msg->sge[0]; 1378 msg->wr.num_sge = msg->num_sge; 1379 msg->wr.next = NULL; 1380 1381 if (send_ctx) { 1382 msg->wr.wr_cqe = NULL; 1383 msg->wr.send_flags = 0; 1384 if (!list_empty(&send_ctx->msg_list)) { 1385 struct smbdirect_send_io *last; 1386 1387 last = list_last_entry(&send_ctx->msg_list, 1388 struct smbdirect_send_io, 1389 sibling_list); 1390 last->wr.next = &msg->wr; 1391 } 1392 list_add_tail(&msg->sibling_list, &send_ctx->msg_list); 1393 send_ctx->wr_cnt++; 1394 return 0; 1395 } 1396 1397 msg->wr.wr_cqe = &msg->cqe; 1398 msg->wr.send_flags = IB_SEND_SIGNALED; 1399 return smb_direct_post_send(sc, &msg->wr); 1400 } 1401 1402 static int smb_direct_post_send_data(struct smbdirect_socket *sc, 1403 struct smbdirect_send_batch *send_ctx, 1404 struct kvec *iov, int niov, 1405 
int remaining_data_length) 1406 { 1407 int i, j, ret; 1408 struct smbdirect_send_io *msg; 1409 int data_length; 1410 struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1]; 1411 1412 ret = wait_for_send_lcredit(sc, send_ctx); 1413 if (ret) 1414 goto lcredit_failed; 1415 1416 ret = wait_for_send_credits(sc, send_ctx); 1417 if (ret) 1418 goto credit_failed; 1419 1420 data_length = 0; 1421 for (i = 0; i < niov; i++) 1422 data_length += iov[i].iov_len; 1423 1424 ret = smb_direct_create_header(sc, data_length, remaining_data_length, 1425 &msg); 1426 if (ret) 1427 goto header_failed; 1428 1429 for (i = 0; i < niov; i++) { 1430 struct ib_sge *sge; 1431 int sg_cnt; 1432 int npages; 1433 1434 sg_init_table(sg, SMBDIRECT_SEND_IO_MAX_SGE - 1); 1435 sg_cnt = get_mapped_sg_list(sc->ib.dev, 1436 iov[i].iov_base, iov[i].iov_len, 1437 sg, SMBDIRECT_SEND_IO_MAX_SGE - 1, 1438 DMA_TO_DEVICE, &npages); 1439 if (sg_cnt <= 0) { 1440 pr_err("failed to map buffer\n"); 1441 ret = -ENOMEM; 1442 goto err; 1443 } else if (sg_cnt + msg->num_sge > SMBDIRECT_SEND_IO_MAX_SGE) { 1444 pr_err("buffer not fitted into sges\n"); 1445 ret = -E2BIG; 1446 ib_dma_unmap_sg(sc->ib.dev, sg, npages, 1447 DMA_TO_DEVICE); 1448 goto err; 1449 } 1450 1451 for (j = 0; j < sg_cnt; j++) { 1452 sge = &msg->sge[msg->num_sge]; 1453 sge->addr = sg_dma_address(&sg[j]); 1454 sge->length = sg_dma_len(&sg[j]); 1455 sge->lkey = sc->ib.pd->local_dma_lkey; 1456 msg->num_sge++; 1457 } 1458 } 1459 1460 ret = post_sendmsg(sc, send_ctx, msg); 1461 if (ret) 1462 goto err; 1463 return 0; 1464 err: 1465 smb_direct_free_sendmsg(sc, msg); 1466 header_failed: 1467 atomic_inc(&sc->send_io.credits.count); 1468 credit_failed: 1469 atomic_inc(&sc->send_io.lcredits.count); 1470 lcredit_failed: 1471 return ret; 1472 } 1473 1474 static int smb_direct_writev(struct ksmbd_transport *t, 1475 struct kvec *iov, int niovs, int buflen, 1476 bool need_invalidate, unsigned int remote_key) 1477 { 1478 struct smb_direct_transport *st = SMBD_TRANS(t); 1479 struct smbdirect_socket *sc = &st->socket; 1480 struct smbdirect_socket_parameters *sp = &sc->parameters; 1481 size_t remaining_data_length; 1482 size_t iov_idx; 1483 size_t iov_ofs; 1484 size_t max_iov_size = sp->max_send_size - 1485 sizeof(struct smbdirect_data_transfer); 1486 int ret; 1487 struct smbdirect_send_batch send_ctx; 1488 int error = 0; 1489 1490 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 1491 return -ENOTCONN; 1492 1493 //FIXME: skip RFC1002 header.. 1494 if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4)) 1495 return -EINVAL; 1496 buflen -= 4; 1497 iov_idx = 1; 1498 iov_ofs = 0; 1499 1500 remaining_data_length = buflen; 1501 ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); 1502 1503 smb_direct_send_ctx_init(&send_ctx, need_invalidate, remote_key); 1504 while (remaining_data_length) { 1505 struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */ 1506 size_t possible_bytes = max_iov_size; 1507 size_t possible_vecs; 1508 size_t bytes = 0; 1509 size_t nvecs = 0; 1510 1511 /* 1512 * For the last message remaining_data_length should be 1513 * have been 0 already! 1514 */ 1515 if (WARN_ON_ONCE(iov_idx >= niovs)) { 1516 error = -EINVAL; 1517 goto done; 1518 } 1519 1520 /* 1521 * We have 2 factors which limit the arguments we pass 1522 * to smb_direct_post_send_data(): 1523 * 1524 * 1. The number of supported sges for the send, 1525 * while one is reserved for the smbdirect header. 1526 * And we currently need one SGE per page. 1527 * 2. The number of negotiated payload bytes per send. 
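		 *
		 * possible_vecs and possible_bytes below track how much of
		 * each limit is still available while vecs[] is filled from
		 * the remaining iov entries.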
		 */
		possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);

		while (iov_idx < niovs && possible_vecs && possible_bytes) {
			struct kvec *v = &vecs[nvecs];
			int page_count;

			v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
			v->iov_len = min_t(size_t,
					   iov[iov_idx].iov_len - iov_ofs,
					   possible_bytes);
			page_count = get_buf_page_count(v->iov_base, v->iov_len);
			if (page_count > possible_vecs) {
				/*
				 * If the number of pages in the buffer
				 * is too much (because we currently require
				 * one SGE per page), we need to limit the
				 * length.
				 *
				 * We know possible_vecs is at least 1,
				 * so we always keep the first page.
				 *
				 * We need to calculate the number of extra
				 * pages (epages) we can also keep.
				 *
				 * We calculate the number of bytes in the
				 * first page (fplen); this should never be
				 * larger than v->iov_len because page_count is
				 * at least 2, but adding a limitation feels
				 * better.
				 *
				 * Then we calculate the number of bytes (elen)
				 * we can keep for the extra pages.
				 */
				size_t epages = possible_vecs - 1;
				size_t fpofs = offset_in_page(v->iov_base);
				size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
				size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);

				v->iov_len = fplen + elen;
				page_count = get_buf_page_count(v->iov_base, v->iov_len);
				if (WARN_ON_ONCE(page_count > possible_vecs)) {
					/*
					 * Something went wrong in the above
					 * logic...
					 */
					error = -EINVAL;
					goto done;
				}
			}
			possible_vecs -= page_count;
			nvecs += 1;
			possible_bytes -= v->iov_len;
			bytes += v->iov_len;

			iov_ofs += v->iov_len;
			if (iov_ofs >= iov[iov_idx].iov_len) {
				iov_idx += 1;
				iov_ofs = 0;
			}
		}

		remaining_data_length -= bytes;

		ret = smb_direct_post_send_data(sc, &send_ctx,
						vecs, nvecs,
						remaining_data_length);
		if (unlikely(ret)) {
			error = ret;
			goto done;
		}
	}

done:
	ret = smb_direct_flush_send_list(sc, &send_ctx, true);
	if (unlikely(!ret && error))
		ret = error;

	/*
	 * As an optimization, we don't wait for individual I/O to finish
	 * before sending the next one.
	 * Send them all and wait for the pending send count to reach 0,
	 * which means all the I/Os have gone out and we are good to return.
	 */

	wait_event(sc->send_io.pending.zero_wait_queue,
		   atomic_read(&sc->send_io.pending.count) == 0 ||
		   sc->status != SMBDIRECT_SOCKET_CONNECTED);
	if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0)
		ret = -ENOTCONN;

	return ret;
}

static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
					struct smbdirect_rw_io *msg,
					enum dma_data_direction dir)
{
	struct smbdirect_socket *sc = &t->socket;

	rdma_rw_ctx_destroy(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
			    msg->sgt.sgl, msg->sgt.nents, dir);
	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
	kfree(msg);
}

static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
			    enum dma_data_direction dir)
{
	struct smbdirect_rw_io *msg =
		container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe);
	struct smbdirect_socket *sc = msg->socket;

	if (wc->status != IB_WC_SUCCESS) {
		msg->error = -EIO;
		pr_err("read/write error. 
opcode = %d, status = %s(%d)\n", 1644 wc->opcode, ib_wc_status_msg(wc->status), wc->status); 1645 if (wc->status != IB_WC_WR_FLUSH_ERR) 1646 smb_direct_disconnect_rdma_connection(sc); 1647 } 1648 1649 complete(msg->completion); 1650 } 1651 1652 static void read_done(struct ib_cq *cq, struct ib_wc *wc) 1653 { 1654 read_write_done(cq, wc, DMA_FROM_DEVICE); 1655 } 1656 1657 static void write_done(struct ib_cq *cq, struct ib_wc *wc) 1658 { 1659 read_write_done(cq, wc, DMA_TO_DEVICE); 1660 } 1661 1662 static int smb_direct_rdma_xmit(struct smb_direct_transport *t, 1663 void *buf, int buf_len, 1664 struct smbdirect_buffer_descriptor_v1 *desc, 1665 unsigned int desc_len, 1666 bool is_read) 1667 { 1668 struct smbdirect_socket *sc = &t->socket; 1669 struct smbdirect_socket_parameters *sp = &sc->parameters; 1670 struct smbdirect_rw_io *msg, *next_msg; 1671 int i, ret; 1672 DECLARE_COMPLETION_ONSTACK(completion); 1673 struct ib_send_wr *first_wr; 1674 LIST_HEAD(msg_list); 1675 char *desc_buf; 1676 int credits_needed; 1677 unsigned int desc_buf_len, desc_num = 0; 1678 1679 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 1680 return -ENOTCONN; 1681 1682 if (buf_len > sp->max_read_write_size) 1683 return -EINVAL; 1684 1685 /* calculate needed credits */ 1686 credits_needed = 0; 1687 desc_buf = buf; 1688 for (i = 0; i < desc_len / sizeof(*desc); i++) { 1689 if (!buf_len) 1690 break; 1691 1692 desc_buf_len = le32_to_cpu(desc[i].length); 1693 if (!desc_buf_len) 1694 return -EINVAL; 1695 1696 if (desc_buf_len > buf_len) { 1697 desc_buf_len = buf_len; 1698 desc[i].length = cpu_to_le32(desc_buf_len); 1699 buf_len = 0; 1700 } 1701 1702 credits_needed += calc_rw_credits(sc, desc_buf, desc_buf_len); 1703 desc_buf += desc_buf_len; 1704 buf_len -= desc_buf_len; 1705 desc_num++; 1706 } 1707 1708 ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", 1709 str_read_write(is_read), buf_len, credits_needed); 1710 1711 ret = wait_for_rw_credits(sc, credits_needed); 1712 if (ret < 0) 1713 return ret; 1714 1715 /* build rdma_rw_ctx for each descriptor */ 1716 desc_buf = buf; 1717 for (i = 0; i < desc_num; i++) { 1718 msg = kzalloc(struct_size(msg, sg_list, SG_CHUNK_SIZE), 1719 KSMBD_DEFAULT_GFP); 1720 if (!msg) { 1721 ret = -ENOMEM; 1722 goto out; 1723 } 1724 1725 desc_buf_len = le32_to_cpu(desc[i].length); 1726 1727 msg->socket = sc; 1728 msg->cqe.done = is_read ? read_done : write_done; 1729 msg->completion = &completion; 1730 1731 msg->sgt.sgl = &msg->sg_list[0]; 1732 ret = sg_alloc_table_chained(&msg->sgt, 1733 get_buf_page_count(desc_buf, desc_buf_len), 1734 msg->sg_list, SG_CHUNK_SIZE); 1735 if (ret) { 1736 ret = -ENOMEM; 1737 goto free_msg; 1738 } 1739 1740 ret = get_sg_list(desc_buf, desc_buf_len, 1741 msg->sgt.sgl, msg->sgt.orig_nents); 1742 if (ret < 0) 1743 goto free_table; 1744 1745 ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, 1746 msg->sgt.sgl, 1747 get_buf_page_count(desc_buf, desc_buf_len), 1748 0, 1749 le64_to_cpu(desc[i].offset), 1750 le32_to_cpu(desc[i].token), 1751 is_read ? 
				   DMA_FROM_DEVICE : DMA_TO_DEVICE);
		if (ret < 0) {
			pr_err("failed to init rdma_rw_ctx: %d\n", ret);
			goto free_table;
		}

		list_add_tail(&msg->list, &msg_list);
		desc_buf += desc_buf_len;
	}

	/* concatenate work requests of rdma_rw_ctxs */
	first_wr = NULL;
	list_for_each_entry_reverse(msg, &msg_list, list) {
		first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
					   &msg->cqe, first_wr);
	}

	ret = ib_post_send(sc->ib.qp, first_wr, NULL);
	if (ret) {
		pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
		goto out;
	}

	msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list);
	wait_for_completion(&completion);
	ret = msg->error;
out:
	list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
		list_del(&msg->list);
		smb_direct_free_rdma_rw_msg(t, msg,
					    is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	}
	atomic_add(credits_needed, &sc->rw_io.credits.count);
	wake_up(&sc->rw_io.credits.wait_queue);
	return ret;

free_table:
	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
free_msg:
	kfree(msg);
	goto out;
}

static int smb_direct_rdma_write(struct ksmbd_transport *t,
				 void *buf, unsigned int buflen,
				 struct smbdirect_buffer_descriptor_v1 *desc,
				 unsigned int desc_len)
{
	return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
				    desc, desc_len, false);
}

static int smb_direct_rdma_read(struct ksmbd_transport *t,
				void *buf, unsigned int buflen,
				struct smbdirect_buffer_descriptor_v1 *desc,
				unsigned int desc_len)
{
	return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
				    desc, desc_len, true);
}

static void smb_direct_disconnect(struct ksmbd_transport *t)
{
	struct smb_direct_transport *st = SMBD_TRANS(t);
	struct smbdirect_socket *sc = &st->socket;

	ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id);

	free_transport(st);
}

static void smb_direct_shutdown(struct ksmbd_transport *t)
{
	struct smb_direct_transport *st = SMBD_TRANS(t);
	struct smbdirect_socket *sc = &st->socket;

	ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id);

	smb_direct_disconnect_rdma_work(&sc->disconnect_work);
}

static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
				 struct rdma_cm_event *event)
{
	struct smbdirect_socket *sc = cm_id->context;
	unsigned long flags;

	ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n",
		    cm_id, rdma_event_msg(event->event), event->event);

	switch (event->event) {
	case RDMA_CM_EVENT_ESTABLISHED: {
		/*
		 * Some drivers (at least mlx5_ib and irdma in roce mode)
		 * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED,
		 * we need to adjust our expectation in that case.
		 *
		 * If smb_direct_negotiate_recv_done() was called first,
		 * it initialized sc->connect.work and left it for us
		 * to queue, so that we move to
		 * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED before
		 * smb_direct_negotiate_recv_work() runs.
		 *
		 * If smb_direct_negotiate_recv_done() hasn't happened
		 * yet, sc->connect.work is still disabled and
		 * queue_work() is a no-op.
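		 *
		 * Either way the negotiate work only runs once both the
		 * RDMA connection is established and the negotiate
		 * request has arrived.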
1857 */ 1858 if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING)) 1859 break; 1860 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; 1861 spin_lock_irqsave(&sc->connect.lock, flags); 1862 if (!sc->first_error) 1863 queue_work(sc->workqueue, &sc->connect.work); 1864 spin_unlock_irqrestore(&sc->connect.lock, flags); 1865 wake_up(&sc->status_wait); 1866 break; 1867 } 1868 case RDMA_CM_EVENT_DEVICE_REMOVAL: 1869 case RDMA_CM_EVENT_DISCONNECTED: { 1870 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 1871 smb_direct_disconnect_rdma_work(&sc->disconnect_work); 1872 if (sc->ib.qp) 1873 ib_drain_qp(sc->ib.qp); 1874 break; 1875 } 1876 case RDMA_CM_EVENT_CONNECT_ERROR: { 1877 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 1878 smb_direct_disconnect_rdma_work(&sc->disconnect_work); 1879 break; 1880 } 1881 default: 1882 pr_err("Unexpected RDMA CM event. cm_id=%p, event=%s (%d)\n", 1883 cm_id, rdma_event_msg(event->event), 1884 event->event); 1885 break; 1886 } 1887 return 0; 1888 } 1889 1890 static void smb_direct_qpair_handler(struct ib_event *event, void *context) 1891 { 1892 struct smbdirect_socket *sc = context; 1893 1894 ksmbd_debug(RDMA, "Received QP event. cm_id=%p, event=%s (%d)\n", 1895 sc->rdma.cm_id, ib_event_msg(event->event), event->event); 1896 1897 switch (event->event) { 1898 case IB_EVENT_CQ_ERR: 1899 case IB_EVENT_QP_FATAL: 1900 smb_direct_disconnect_rdma_connection(sc); 1901 break; 1902 default: 1903 break; 1904 } 1905 } 1906 1907 static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc, 1908 int failed) 1909 { 1910 struct smbdirect_socket_parameters *sp = &sc->parameters; 1911 struct smbdirect_send_io *sendmsg; 1912 struct smbdirect_negotiate_resp *resp; 1913 int ret; 1914 1915 sendmsg = smb_direct_alloc_sendmsg(sc); 1916 if (IS_ERR(sendmsg)) 1917 return -ENOMEM; 1918 1919 resp = (struct smbdirect_negotiate_resp *)sendmsg->packet; 1920 if (failed) { 1921 memset(resp, 0, sizeof(*resp)); 1922 resp->min_version = SMB_DIRECT_VERSION_LE; 1923 resp->max_version = SMB_DIRECT_VERSION_LE; 1924 resp->status = STATUS_NOT_SUPPORTED; 1925 1926 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; 1927 } else { 1928 resp->status = STATUS_SUCCESS; 1929 resp->min_version = SMB_DIRECT_VERSION_LE; 1930 resp->max_version = SMB_DIRECT_VERSION_LE; 1931 resp->negotiated_version = SMB_DIRECT_VERSION_LE; 1932 resp->reserved = 0; 1933 resp->credits_requested = 1934 cpu_to_le16(sp->send_credit_target); 1935 resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); 1936 resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size); 1937 resp->preferred_send_size = cpu_to_le32(sp->max_send_size); 1938 resp->max_receive_size = cpu_to_le32(sp->max_recv_size); 1939 resp->max_fragmented_size = 1940 cpu_to_le32(sp->max_fragmented_recv_size); 1941 1942 sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; 1943 sc->status = SMBDIRECT_SOCKET_CONNECTED; 1944 } 1945 1946 sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, 1947 (void *)resp, sizeof(*resp), 1948 DMA_TO_DEVICE); 1949 ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); 1950 if (ret) { 1951 smb_direct_free_sendmsg(sc, sendmsg); 1952 return ret; 1953 } 1954 1955 sendmsg->num_sge = 1; 1956 sendmsg->sge[0].length = sizeof(*resp); 1957 sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; 1958 1959 ret = post_sendmsg(sc, NULL, sendmsg); 1960 if (ret) { 1961 smb_direct_free_sendmsg(sc, sendmsg); 1962 return ret; 1963 } 1964 1965 wait_event(sc->send_io.pending.zero_wait_queue, 1966 
		   atomic_read(&sc->send_io.pending.count) == 0 ||
		   sc->status != SMBDIRECT_SOCKET_CONNECTED);
	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
		return -ENOTCONN;

	return 0;
}

static int smb_direct_accept_client(struct smbdirect_socket *sc)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	struct rdma_conn_param conn_param;
	__be32 ird_ord_hdr[2];
	int ret;

	/*
	 * smb_direct_handle_connect_request()
	 * already negotiated sp->initiator_depth
	 * and sp->responder_resources
	 */
	memset(&conn_param, 0, sizeof(conn_param));
	conn_param.initiator_depth = sp->initiator_depth;
	conn_param.responder_resources = sp->responder_resources;

	if (sc->rdma.legacy_iwarp) {
		ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
		ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
		conn_param.private_data = ird_ord_hdr;
		conn_param.private_data_len = sizeof(ird_ord_hdr);
	} else {
		conn_param.private_data = NULL;
		conn_param.private_data_len = 0;
	}
	conn_param.retry_count = SMB_DIRECT_CM_RETRY;
	conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY;
	conn_param.flow_control = 0;

	/*
	 * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
	 * so that the timer will cause a disconnect.
	 */
	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
	mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
			 msecs_to_jiffies(sp->negotiate_timeout_msec));

	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
	sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
	ret = rdma_accept(sc->rdma.cm_id, &conn_param);
	if (ret) {
		pr_err("error at rdma_accept: %d\n", ret);
		return ret;
	}
	return 0;
}

static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
{
	struct smbdirect_recv_io *recvmsg;
	bool recv_posted = false;
	int ret;

	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
	sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;

	sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ;

	recvmsg = get_free_recvmsg(sc);
	if (!recvmsg)
		return -ENOMEM;
	recvmsg->cqe.done = smb_direct_negotiate_recv_done;

	ret = smb_direct_post_recv(sc, recvmsg);
	if (ret) {
		pr_err("Can't post recv: %d\n", ret);
		goto out_err;
	}
	recv_posted = true;

	ret = smb_direct_accept_client(sc);
	if (ret) {
		pr_err("Can't accept client\n");
		goto out_err;
	}

	return 0;
out_err:
	/*
	 * If the recv was never posted, return it to the free list.
	 * If it was posted, leave it alone so disconnect teardown can
	 * drain the QP and complete it (flush) and the completion path
	 * will unmap it exactly once.
	 */
	if (!recv_posted)
		put_recvmsg(sc, recvmsg);
	return ret;
}

static int smb_direct_init_params(struct smbdirect_socket *sc)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	int max_send_sges;
	unsigned int maxpages;

	/* We need 3 more SGEs, because the SMB_DIRECT header, the SMB2
	 * header and the SMB2 response could each be mapped separately.
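	 *
	 * As a rough worked example (illustrative only, the actual values
	 * depend on the negotiated parameters and on PAGE_SIZE): with the
	 * default max_send_size of 1364 and a 4 KiB PAGE_SIZE this is
	 * DIV_ROUND_UP(1364, 4096) + 3 = 1 + 3 = 4 SGEs, which must not
	 * exceed SMBDIRECT_SEND_IO_MAX_SGE (checked below).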
2071 */ 2072 max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3; 2073 if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) { 2074 pr_err("max_send_size %d is too large\n", sp->max_send_size); 2075 return -EINVAL; 2076 } 2077 2078 atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target); 2079 2080 maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE); 2081 sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev, 2082 sc->rdma.cm_id->port_num, 2083 maxpages); 2084 sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max); 2085 /* add one extra in order to handle unaligned pages */ 2086 sc->rw_io.credits.max += 1; 2087 2088 sc->recv_io.credits.target = 1; 2089 2090 atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); 2091 2092 return 0; 2093 } 2094 2095 static void smb_direct_destroy_pools(struct smbdirect_socket *sc) 2096 { 2097 struct smbdirect_recv_io *recvmsg; 2098 2099 while ((recvmsg = get_free_recvmsg(sc))) 2100 mempool_free(recvmsg, sc->recv_io.mem.pool); 2101 2102 mempool_destroy(sc->recv_io.mem.pool); 2103 sc->recv_io.mem.pool = NULL; 2104 2105 kmem_cache_destroy(sc->recv_io.mem.cache); 2106 sc->recv_io.mem.cache = NULL; 2107 2108 mempool_destroy(sc->send_io.mem.pool); 2109 sc->send_io.mem.pool = NULL; 2110 2111 kmem_cache_destroy(sc->send_io.mem.cache); 2112 sc->send_io.mem.cache = NULL; 2113 } 2114 2115 static int smb_direct_create_pools(struct smbdirect_socket *sc) 2116 { 2117 struct smbdirect_socket_parameters *sp = &sc->parameters; 2118 char name[80]; 2119 int i; 2120 struct smbdirect_recv_io *recvmsg; 2121 2122 snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", sc); 2123 sc->send_io.mem.cache = kmem_cache_create(name, 2124 sizeof(struct smbdirect_send_io) + 2125 sizeof(struct smbdirect_negotiate_resp), 2126 0, SLAB_HWCACHE_ALIGN, NULL); 2127 if (!sc->send_io.mem.cache) 2128 return -ENOMEM; 2129 2130 sc->send_io.mem.pool = mempool_create(sp->send_credit_target, 2131 mempool_alloc_slab, mempool_free_slab, 2132 sc->send_io.mem.cache); 2133 if (!sc->send_io.mem.pool) 2134 goto err; 2135 2136 snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc); 2137 sc->recv_io.mem.cache = kmem_cache_create(name, 2138 sizeof(struct smbdirect_recv_io) + 2139 sp->max_recv_size, 2140 0, SLAB_HWCACHE_ALIGN, NULL); 2141 if (!sc->recv_io.mem.cache) 2142 goto err; 2143 2144 sc->recv_io.mem.pool = 2145 mempool_create(sp->recv_credit_max, mempool_alloc_slab, 2146 mempool_free_slab, sc->recv_io.mem.cache); 2147 if (!sc->recv_io.mem.pool) 2148 goto err; 2149 2150 for (i = 0; i < sp->recv_credit_max; i++) { 2151 recvmsg = mempool_alloc(sc->recv_io.mem.pool, KSMBD_DEFAULT_GFP); 2152 if (!recvmsg) 2153 goto err; 2154 recvmsg->socket = sc; 2155 recvmsg->sge.length = 0; 2156 list_add(&recvmsg->list, &sc->recv_io.free.list); 2157 } 2158 2159 return 0; 2160 err: 2161 smb_direct_destroy_pools(sc); 2162 return -ENOMEM; 2163 } 2164 2165 static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, const struct ib_qp_init_attr *attr) 2166 { 2167 /* 2168 * This could be split out of rdma_rw_init_qp() 2169 * and be a helper function next to rdma_rw_mr_factor() 2170 * 2171 * We can't check unlikely(rdma_rw_force_mr) here, 2172 * but that is most likely 0 anyway. 2173 */ 2174 u32 factor; 2175 2176 WARN_ON_ONCE(attr->port_num == 0); 2177 2178 /* 2179 * Each context needs at least one RDMA READ or WRITE WR. 2180 * 2181 * For some hardware we might need more, eventually we should ask the 2182 * HCA driver for a multiplier here. 
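	 *
	 * As an illustrative example (not a guarantee for any particular
	 * HCA): with the base factor of 1, max_rdma_ctxs = 16 would add
	 * 16 send WRs; on iWarp, or when max_sgl_rd is set, the reg/inv
	 * MRs accounted for below raise the factor to 3, i.e. 48 send WRs.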
2183 */ 2184 factor = 1; 2185 2186 /* 2187 * If the device needs MRs to perform RDMA READ or WRITE operations, 2188 * we'll need two additional MRs for the registrations and the 2189 * invalidation. 2190 */ 2191 if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd) 2192 factor += 2; /* inv + reg */ 2193 2194 return factor * attr->cap.max_rdma_ctxs; 2195 } 2196 2197 static int smb_direct_create_qpair(struct smbdirect_socket *sc) 2198 { 2199 struct smbdirect_socket_parameters *sp = &sc->parameters; 2200 int ret; 2201 struct ib_qp_cap qp_cap; 2202 struct ib_qp_init_attr qp_attr; 2203 u32 max_send_wr; 2204 u32 rdma_send_wr; 2205 2206 /* 2207 * Note that {rdma,ib}_create_qp() will call 2208 * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0. 2209 * It will adjust cap->max_send_wr to the required 2210 * number of additional WRs for the RDMA RW operations. 2211 * It will cap cap->max_send_wr to the device limit. 2212 * 2213 * +1 for ib_drain_qp 2214 */ 2215 qp_cap.max_send_wr = sp->send_credit_target + 1; 2216 qp_cap.max_recv_wr = sp->recv_credit_max + 1; 2217 qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; 2218 qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; 2219 qp_cap.max_inline_data = 0; 2220 qp_cap.max_rdma_ctxs = sc->rw_io.credits.max; 2221 2222 /* 2223 * Find out the number of max_send_wr 2224 * after rdma_rw_init_qp() adjusted it. 2225 * 2226 * We only do it on a temporary variable, 2227 * as rdma_create_qp() will trigger 2228 * rdma_rw_init_qp() again. 2229 */ 2230 memset(&qp_attr, 0, sizeof(qp_attr)); 2231 qp_attr.cap = qp_cap; 2232 qp_attr.port_num = sc->rdma.cm_id->port_num; 2233 rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr); 2234 max_send_wr = qp_cap.max_send_wr + rdma_send_wr; 2235 2236 if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe || 2237 qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) { 2238 pr_err("Possible CQE overrun: max_send_wr %d\n", 2239 qp_cap.max_send_wr); 2240 pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", 2241 IB_DEVICE_NAME_MAX, 2242 sc->ib.dev->name, 2243 sc->ib.dev->attrs.max_cqe, 2244 sc->ib.dev->attrs.max_qp_wr); 2245 pr_err("consider lowering send_credit_target = %d\n", 2246 sp->send_credit_target); 2247 return -EINVAL; 2248 } 2249 2250 if (qp_cap.max_rdma_ctxs && 2251 (max_send_wr >= sc->ib.dev->attrs.max_cqe || 2252 max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) { 2253 pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n", 2254 rdma_send_wr, qp_cap.max_send_wr, max_send_wr); 2255 pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", 2256 IB_DEVICE_NAME_MAX, 2257 sc->ib.dev->name, 2258 sc->ib.dev->attrs.max_cqe, 2259 sc->ib.dev->attrs.max_qp_wr); 2260 pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n", 2261 sp->send_credit_target, qp_cap.max_rdma_ctxs); 2262 return -EINVAL; 2263 } 2264 2265 if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe || 2266 qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) { 2267 pr_err("Possible CQE overrun: max_recv_wr %d\n", 2268 qp_cap.max_recv_wr); 2269 pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", 2270 IB_DEVICE_NAME_MAX, 2271 sc->ib.dev->name, 2272 sc->ib.dev->attrs.max_cqe, 2273 sc->ib.dev->attrs.max_qp_wr); 2274 pr_err("consider lowering receive_credit_max = %d\n", 2275 sp->recv_credit_max); 2276 return -EINVAL; 2277 } 2278 2279 if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge || 2280 qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) { 2281 pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", 
2282 IB_DEVICE_NAME_MAX, 2283 sc->ib.dev->name, 2284 sc->ib.dev->attrs.max_send_sge, 2285 sc->ib.dev->attrs.max_recv_sge); 2286 return -EINVAL; 2287 } 2288 2289 sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); 2290 if (IS_ERR(sc->ib.pd)) { 2291 pr_err("Can't create RDMA PD\n"); 2292 ret = PTR_ERR(sc->ib.pd); 2293 sc->ib.pd = NULL; 2294 return ret; 2295 } 2296 2297 sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, 2298 max_send_wr, 2299 IB_POLL_WORKQUEUE); 2300 if (IS_ERR(sc->ib.send_cq)) { 2301 pr_err("Can't create RDMA send CQ\n"); 2302 ret = PTR_ERR(sc->ib.send_cq); 2303 sc->ib.send_cq = NULL; 2304 goto err; 2305 } 2306 2307 sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc, 2308 qp_cap.max_recv_wr, 2309 IB_POLL_WORKQUEUE); 2310 if (IS_ERR(sc->ib.recv_cq)) { 2311 pr_err("Can't create RDMA recv CQ\n"); 2312 ret = PTR_ERR(sc->ib.recv_cq); 2313 sc->ib.recv_cq = NULL; 2314 goto err; 2315 } 2316 2317 /* 2318 * We reset completely here! 2319 * As the above use was just temporary 2320 * to calc max_send_wr and rdma_send_wr. 2321 * 2322 * rdma_create_qp() will trigger rdma_rw_init_qp() 2323 * again if max_rdma_ctxs is not 0. 2324 */ 2325 memset(&qp_attr, 0, sizeof(qp_attr)); 2326 qp_attr.event_handler = smb_direct_qpair_handler; 2327 qp_attr.qp_context = sc; 2328 qp_attr.cap = qp_cap; 2329 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 2330 qp_attr.qp_type = IB_QPT_RC; 2331 qp_attr.send_cq = sc->ib.send_cq; 2332 qp_attr.recv_cq = sc->ib.recv_cq; 2333 qp_attr.port_num = ~0; 2334 2335 ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr); 2336 if (ret) { 2337 pr_err("Can't create RDMA QP: %d\n", ret); 2338 goto err; 2339 } 2340 2341 sc->ib.qp = sc->rdma.cm_id->qp; 2342 sc->rdma.cm_id->event_handler = smb_direct_cm_handler; 2343 2344 return 0; 2345 err: 2346 if (sc->ib.qp) { 2347 sc->ib.qp = NULL; 2348 rdma_destroy_qp(sc->rdma.cm_id); 2349 } 2350 if (sc->ib.recv_cq) { 2351 ib_destroy_cq(sc->ib.recv_cq); 2352 sc->ib.recv_cq = NULL; 2353 } 2354 if (sc->ib.send_cq) { 2355 ib_destroy_cq(sc->ib.send_cq); 2356 sc->ib.send_cq = NULL; 2357 } 2358 if (sc->ib.pd) { 2359 ib_dealloc_pd(sc->ib.pd); 2360 sc->ib.pd = NULL; 2361 } 2362 return ret; 2363 } 2364 2365 static int smb_direct_prepare(struct ksmbd_transport *t) 2366 { 2367 struct smb_direct_transport *st = SMBD_TRANS(t); 2368 struct smbdirect_socket *sc = &st->socket; 2369 struct smbdirect_socket_parameters *sp = &sc->parameters; 2370 struct smbdirect_recv_io *recvmsg; 2371 struct smbdirect_negotiate_req *req; 2372 unsigned long flags; 2373 int ret; 2374 2375 /* 2376 * We are waiting to pass the following states: 2377 * 2378 * SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED 2379 * SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING 2380 * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED 2381 * 2382 * To finally get to SMBDIRECT_SOCKET_NEGOTIATE_RUNNING 2383 * in order to continue below. 2384 * 2385 * Everything else is unexpected and an error. 2386 */ 2387 ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); 2388 ret = wait_event_interruptible_timeout(sc->status_wait, 2389 sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED && 2390 sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING && 2391 sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, 2392 msecs_to_jiffies(sp->negotiate_timeout_msec)); 2393 if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING) 2394 return ret < 0 ? 
ret : -ETIMEDOUT; 2395 2396 recvmsg = get_first_reassembly(sc); 2397 if (!recvmsg) 2398 return -ECONNABORTED; 2399 2400 ret = smb_direct_check_recvmsg(recvmsg); 2401 if (ret) 2402 goto put; 2403 2404 req = (struct smbdirect_negotiate_req *)recvmsg->packet; 2405 sp->max_recv_size = min_t(int, sp->max_recv_size, 2406 le32_to_cpu(req->preferred_send_size)); 2407 sp->max_send_size = min_t(int, sp->max_send_size, 2408 le32_to_cpu(req->max_receive_size)); 2409 sp->max_fragmented_send_size = 2410 le32_to_cpu(req->max_fragmented_size); 2411 sp->max_fragmented_recv_size = 2412 (sp->recv_credit_max * sp->max_recv_size) / 2; 2413 sc->recv_io.credits.target = le16_to_cpu(req->credits_requested); 2414 sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); 2415 sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1); 2416 2417 put: 2418 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 2419 sc->recv_io.reassembly.queue_length--; 2420 list_del(&recvmsg->list); 2421 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 2422 put_recvmsg(sc, recvmsg); 2423 2424 if (ret == -ECONNABORTED) 2425 return ret; 2426 2427 if (ret) 2428 goto respond; 2429 2430 /* 2431 * We negotiated with success, so we need to refill the recv queue. 2432 * We do that with sc->idle.immediate_work still being disabled 2433 * via smbdirect_socket_init(), so that queue_work(sc->workqueue, 2434 * &sc->idle.immediate_work) in smb_direct_post_recv_credits() 2435 * is a no-op. 2436 * 2437 * The message that grants the credits to the client is 2438 * the negotiate response. 2439 */ 2440 INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits); 2441 smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); 2442 if (unlikely(sc->first_error)) 2443 return sc->first_error; 2444 INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); 2445 2446 respond: 2447 ret = smb_direct_send_negotiate_response(sc, ret); 2448 2449 return ret; 2450 } 2451 2452 static int smb_direct_connect(struct smbdirect_socket *sc) 2453 { 2454 struct smbdirect_recv_io *recv_io; 2455 int ret; 2456 2457 ret = smb_direct_init_params(sc); 2458 if (ret) { 2459 pr_err("Can't configure RDMA parameters\n"); 2460 return ret; 2461 } 2462 2463 ret = smb_direct_create_pools(sc); 2464 if (ret) { 2465 pr_err("Can't init RDMA pool: %d\n", ret); 2466 return ret; 2467 } 2468 2469 list_for_each_entry(recv_io, &sc->recv_io.free.list, list) 2470 recv_io->cqe.done = recv_done; 2471 2472 ret = smb_direct_create_qpair(sc); 2473 if (ret) { 2474 pr_err("Can't accept RDMA client: %d\n", ret); 2475 return ret; 2476 } 2477 2478 ret = smb_direct_prepare_negotiation(sc); 2479 if (ret) { 2480 pr_err("Can't negotiate: %d\n", ret); 2481 return ret; 2482 } 2483 return 0; 2484 } 2485 2486 static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) 2487 { 2488 if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) 2489 return false; 2490 if (attrs->max_fast_reg_page_list_len == 0) 2491 return false; 2492 return true; 2493 } 2494 2495 static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, 2496 struct rdma_cm_event *event) 2497 { 2498 struct smb_direct_transport *t; 2499 struct smbdirect_socket *sc; 2500 struct smbdirect_socket_parameters *sp; 2501 struct task_struct *handler; 2502 u8 peer_initiator_depth; 2503 u8 peer_responder_resources; 2504 int ret; 2505 2506 if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { 2507 ksmbd_debug(RDMA, 2508 "Fast Registration Work Requests is 
not supported. device capabilities=%llx\n",
			    new_cm_id->device->attrs.device_cap_flags);
		return -EPROTONOSUPPORT;
	}

	t = alloc_transport(new_cm_id);
	if (!t)
		return -ENOMEM;
	sc = &t->socket;
	sp = &sc->parameters;

	peer_initiator_depth = event->param.conn.initiator_depth;
	peer_responder_resources = event->param.conn.responder_resources;
	if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) &&
	    event->param.conn.private_data_len == 8) {
		/*
		 * Legacy clients with only iWarp MPA v1 support
		 * need a private blob in order to negotiate
		 * the IRD/ORD values.
		 */
		const __be32 *ird_ord_hdr = event->param.conn.private_data;
		u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
		u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);

		/*
		 * cifs.ko sends the legacy IRD/ORD negotiation
		 * blob even if iWarp MPA v2 was used.
		 *
		 * Here we check that the values match and only
		 * mark the client as legacy if they don't match.
		 */
		if ((u32)event->param.conn.initiator_depth != ird32 ||
		    (u32)event->param.conn.responder_resources != ord32) {
			/*
			 * There are broken clients (old cifs.ko)
			 * using little endian and also
			 * struct rdma_conn_param only uses u8
			 * for initiator_depth and responder_resources,
			 * so we truncate the value to U8_MAX.
			 *
			 * smb_direct_accept_client() will then
			 * do the real negotiation in order to
			 * select the minimum between client and
			 * server.
			 */
			ird32 = min_t(u32, ird32, U8_MAX);
			ord32 = min_t(u32, ord32, U8_MAX);

			sc->rdma.legacy_iwarp = true;
			peer_initiator_depth = (u8)ird32;
			peer_responder_resources = (u8)ord32;
		}
	}

	/*
	 * First set what we as the server are able to support
	 */
	sp->initiator_depth = min_t(u8, sp->initiator_depth,
				    new_cm_id->device->attrs.max_qp_rd_atom);

	/*
	 * Negotiate the value by using the minimum
	 * between client and server if the client provided
	 * non-zero values.
	 */
	if (peer_initiator_depth != 0)
		sp->initiator_depth = min_t(u8, sp->initiator_depth,
					    peer_initiator_depth);
	if (peer_responder_resources != 0)
		sp->responder_resources = min_t(u8, sp->responder_resources,
						peer_responder_resources);

	ret = smb_direct_connect(sc);
	if (ret)
		goto out_err;

	handler = kthread_run(ksmbd_conn_handler_loop,
			      KSMBD_TRANS(t)->conn, "ksmbd:r%u",
			      smb_direct_port);
	if (IS_ERR(handler)) {
		ret = PTR_ERR(handler);
		pr_err("Can't start thread\n");
		goto out_err;
	}

	return 0;
out_err:
	free_transport(t);
	return ret;
}

static int smb_direct_listen_handler(struct rdma_cm_id *cm_id,
				     struct rdma_cm_event *event)
{
	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST: {
		int ret = smb_direct_handle_connect_request(cm_id, event);

		if (ret) {
			pr_err("Can't create transport: %d\n", ret);
			return ret;
		}

		ksmbd_debug(RDMA, "Received connection request. cm_id=%p\n",
			    cm_id);
		break;
	}
	default:
		pr_err("Unexpected listen event. cm_id=%p, event=%s (%d)\n",
		       cm_id, rdma_event_msg(event->event), event->event);
		break;
	}
	return 0;
}

static int smb_direct_listen(int port)
{
	int ret;
	struct rdma_cm_id *cm_id;
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port = htons(port),
	};

	cm_id = rdma_create_id(&init_net, smb_direct_listen_handler,
			       &smb_direct_listener, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("Can't create cm id: %ld\n", PTR_ERR(cm_id));
		return PTR_ERR(cm_id);
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
	if (ret) {
		pr_err("Can't bind: %d\n", ret);
		goto err;
	}

	smb_direct_listener.cm_id = cm_id;

	ret = rdma_listen(cm_id, 10);
	if (ret) {
		pr_err("Can't listen: %d\n", ret);
		goto err;
	}
	return 0;
err:
	smb_direct_listener.cm_id = NULL;
	rdma_destroy_id(cm_id);
	return ret;
}

static int smb_direct_ib_client_add(struct ib_device *ib_dev)
{
	struct smb_direct_device *smb_dev;

	/* Set 5445 port if device type is iWARP (no IB) */
	if (ib_dev->node_type != RDMA_NODE_IB_CA)
		smb_direct_port = SMB_DIRECT_PORT_IWARP;

	if (!rdma_frwr_is_supported(&ib_dev->attrs))
		return 0;

	smb_dev = kzalloc(sizeof(*smb_dev), KSMBD_DEFAULT_GFP);
	if (!smb_dev)
		return -ENOMEM;
	smb_dev->ib_dev = ib_dev;

	write_lock(&smb_direct_device_lock);
	list_add(&smb_dev->list, &smb_direct_device_list);
	write_unlock(&smb_direct_device_lock);

	ksmbd_debug(RDMA, "ib device added: name %s\n", ib_dev->name);
	return 0;
}

static void smb_direct_ib_client_remove(struct ib_device *ib_dev,
					void *client_data)
{
	struct smb_direct_device *smb_dev, *tmp;

	write_lock(&smb_direct_device_lock);
	list_for_each_entry_safe(smb_dev, tmp, &smb_direct_device_list, list) {
		if (smb_dev->ib_dev == ib_dev) {
			list_del(&smb_dev->list);
			kfree(smb_dev);
			break;
		}
	}
	write_unlock(&smb_direct_device_lock);
}

static struct ib_client smb_direct_ib_client = {
	.name = "ksmbd_smb_direct_ib",
	.add = smb_direct_ib_client_add,
	.remove = smb_direct_ib_client_remove,
};

int ksmbd_rdma_init(void)
{
	int ret;

	smb_direct_port = SMB_DIRECT_PORT_INFINIBAND;
	smb_direct_listener.cm_id = NULL;

	ret = ib_register_client(&smb_direct_ib_client);
	if (ret) {
		pr_err("failed to ib_register_client\n");
		return ret;
	}

	/* When a client is running out of send credits, the credits are
	 * granted by the server sending a packet using this queue.
	 * This avoids the situation where a client cannot send packets
	 * for lack of credits.
	 */
	smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq",
					WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU,
					0);
	if (!smb_direct_wq)
		return -ENOMEM;

	ret = smb_direct_listen(smb_direct_port);
	if (ret) {
		destroy_workqueue(smb_direct_wq);
		smb_direct_wq = NULL;
		pr_err("Can't listen: %d\n", ret);
		return ret;
	}

	ksmbd_debug(RDMA, "init RDMA listener.
cm_id=%p\n", 2739 smb_direct_listener.cm_id); 2740 return 0; 2741 } 2742 2743 void ksmbd_rdma_stop_listening(void) 2744 { 2745 if (!smb_direct_listener.cm_id) 2746 return; 2747 2748 ib_unregister_client(&smb_direct_ib_client); 2749 rdma_destroy_id(smb_direct_listener.cm_id); 2750 2751 smb_direct_listener.cm_id = NULL; 2752 } 2753 2754 void ksmbd_rdma_destroy(void) 2755 { 2756 if (smb_direct_wq) { 2757 destroy_workqueue(smb_direct_wq); 2758 smb_direct_wq = NULL; 2759 } 2760 } 2761 2762 static bool ksmbd_find_rdma_capable_netdev(struct net_device *netdev) 2763 { 2764 struct smb_direct_device *smb_dev; 2765 int i; 2766 bool rdma_capable = false; 2767 2768 read_lock(&smb_direct_device_lock); 2769 list_for_each_entry(smb_dev, &smb_direct_device_list, list) { 2770 for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) { 2771 struct net_device *ndev; 2772 2773 ndev = ib_device_get_netdev(smb_dev->ib_dev, i + 1); 2774 if (!ndev) 2775 continue; 2776 2777 if (ndev == netdev) { 2778 dev_put(ndev); 2779 rdma_capable = true; 2780 goto out; 2781 } 2782 dev_put(ndev); 2783 } 2784 } 2785 out: 2786 read_unlock(&smb_direct_device_lock); 2787 2788 if (rdma_capable == false) { 2789 struct ib_device *ibdev; 2790 2791 ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); 2792 if (ibdev) { 2793 rdma_capable = rdma_frwr_is_supported(&ibdev->attrs); 2794 ib_device_put(ibdev); 2795 } 2796 } 2797 2798 ksmbd_debug(RDMA, "netdev(%s) rdma capable : %s\n", 2799 netdev->name, str_true_false(rdma_capable)); 2800 2801 return rdma_capable; 2802 } 2803 2804 bool ksmbd_rdma_capable_netdev(struct net_device *netdev) 2805 { 2806 struct net_device *lower_dev; 2807 struct list_head *iter; 2808 2809 if (ksmbd_find_rdma_capable_netdev(netdev)) 2810 return true; 2811 2812 /* check if netdev is bridge or VLAN */ 2813 if (netif_is_bridge_master(netdev) || 2814 netdev->priv_flags & IFF_802_1Q_VLAN) 2815 netdev_for_each_lower_dev(netdev, lower_dev, iter) 2816 if (ksmbd_find_rdma_capable_netdev(lower_dev)) 2817 return true; 2818 2819 /* check if netdev is IPoIB safely without layer violation */ 2820 if (netdev->type == ARPHRD_INFINIBAND) 2821 return true; 2822 2823 return false; 2824 } 2825 2826 static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { 2827 .prepare = smb_direct_prepare, 2828 .disconnect = smb_direct_disconnect, 2829 .shutdown = smb_direct_shutdown, 2830 .writev = smb_direct_writev, 2831 .read = smb_direct_read, 2832 .rdma_read = smb_direct_rdma_read, 2833 .rdma_write = smb_direct_rdma_write, 2834 .free_transport = smb_direct_free_transport, 2835 }; 2836