1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2017, Microsoft Corporation. 4 * Copyright (C) 2018, LG Electronics. 5 * 6 * Author(s): Long Li <longli@microsoft.com>, 7 * Hyunchul Lee <hyc.lee@gmail.com> 8 */ 9 10 #define SUBMOD_NAME "smb_direct" 11 12 #include <linux/kthread.h> 13 #include <linux/list.h> 14 #include <linux/mempool.h> 15 #include <linux/highmem.h> 16 #include <linux/scatterlist.h> 17 #include <linux/string_choices.h> 18 #include <rdma/ib_verbs.h> 19 #include <rdma/rdma_cm.h> 20 #include <rdma/rw.h> 21 22 #define __SMBDIRECT_SOCKET_DISCONNECT(__sc) smb_direct_disconnect_rdma_connection(__sc) 23 24 #include "glob.h" 25 #include "connection.h" 26 #include "smb_common.h" 27 #include "../common/smb2status.h" 28 #include "../common/smbdirect/smbdirect.h" 29 #include "../common/smbdirect/smbdirect_pdu.h" 30 #include "../common/smbdirect/smbdirect_socket.h" 31 #include "transport_rdma.h" 32 33 #define SMB_DIRECT_PORT_IWARP 5445 34 #define SMB_DIRECT_PORT_INFINIBAND 445 35 36 #define SMB_DIRECT_VERSION_LE cpu_to_le16(SMBDIRECT_V1) 37 38 /* SMB_DIRECT negotiation timeout (for the server) in seconds */ 39 #define SMB_DIRECT_NEGOTIATE_TIMEOUT 5 40 41 /* The timeout to wait for a keepalive message from peer in seconds */ 42 #define SMB_DIRECT_KEEPALIVE_SEND_INTERVAL 120 43 44 /* The timeout to wait for a keepalive message from peer in seconds */ 45 #define SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT 5 46 47 /* 48 * Default maximum number of RDMA read/write outstanding on this connection 49 * This value is possibly decreased during QP creation on hardware limit 50 */ 51 #define SMB_DIRECT_CM_INITIATOR_DEPTH 8 52 53 /* Maximum number of retries on data transfer operations */ 54 #define SMB_DIRECT_CM_RETRY 6 55 /* No need to retry on Receiver Not Ready since SMB_DIRECT manages credits */ 56 #define SMB_DIRECT_CM_RNR_RETRY 0 57 58 /* 59 * User configurable initial values per SMB_DIRECT transport connection 60 * as defined in [MS-SMBD] 3.1.1.1 61 * Those may change after a SMB_DIRECT negotiation 62 */ 63 64 /* Set 445 port to SMB Direct port by default */ 65 static int smb_direct_port = SMB_DIRECT_PORT_INFINIBAND; 66 67 /* The local peer's maximum number of credits to grant to the peer */ 68 static int smb_direct_receive_credit_max = 255; 69 70 /* The remote peer's credit request of local peer */ 71 static int smb_direct_send_credit_target = 255; 72 73 /* The maximum single message size can be sent to remote peer */ 74 static int smb_direct_max_send_size = 1364; 75 76 /* The maximum fragmented upper-layer payload receive size supported */ 77 static int smb_direct_max_fragmented_recv_size = 1024 * 1024; 78 79 /* The maximum single-message size which can be received */ 80 static int smb_direct_max_receive_size = 1364; 81 82 static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE; 83 84 static LIST_HEAD(smb_direct_device_list); 85 static DEFINE_RWLOCK(smb_direct_device_lock); 86 87 struct smb_direct_device { 88 struct ib_device *ib_dev; 89 struct list_head list; 90 }; 91 92 static struct smb_direct_listener { 93 struct rdma_cm_id *cm_id; 94 } smb_direct_listener; 95 96 static struct workqueue_struct *smb_direct_wq; 97 98 struct smb_direct_transport { 99 struct ksmbd_transport transport; 100 101 struct smbdirect_socket socket; 102 }; 103 104 #define KSMBD_TRANS(t) (&(t)->transport) 105 #define SMBD_TRANS(t) (container_of(t, \ 106 struct smb_direct_transport, transport)) 107 108 static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops; 109 110 void init_smbd_max_io_size(unsigned int sz) 111 { 112 sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); 113 smb_direct_max_read_write_size = sz; 114 } 115 116 unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) 117 { 118 struct smb_direct_transport *t; 119 struct smbdirect_socket *sc; 120 struct smbdirect_socket_parameters *sp; 121 122 if (kt->ops != &ksmbd_smb_direct_transport_ops) 123 return 0; 124 125 t = SMBD_TRANS(kt); 126 sc = &t->socket; 127 sp = &sc->parameters; 128 129 return sp->max_read_write_size; 130 } 131 132 static inline int get_buf_page_count(void *buf, int size) 133 { 134 return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - 135 (uintptr_t)buf / PAGE_SIZE; 136 } 137 138 static void smb_direct_destroy_pools(struct smbdirect_socket *sc); 139 static void smb_direct_post_recv_credits(struct work_struct *work); 140 static int smb_direct_post_send_data(struct smbdirect_socket *sc, 141 struct smbdirect_send_batch *send_ctx, 142 struct kvec *iov, int niov, 143 int remaining_data_length); 144 145 static inline void 146 *smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg) 147 { 148 return (void *)recvmsg->packet; 149 } 150 151 static struct 152 smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc) 153 { 154 struct smbdirect_recv_io *recvmsg = NULL; 155 unsigned long flags; 156 157 spin_lock_irqsave(&sc->recv_io.free.lock, flags); 158 if (!list_empty(&sc->recv_io.free.list)) { 159 recvmsg = list_first_entry(&sc->recv_io.free.list, 160 struct smbdirect_recv_io, 161 list); 162 list_del(&recvmsg->list); 163 } 164 spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); 165 return recvmsg; 166 } 167 168 static void put_recvmsg(struct smbdirect_socket *sc, 169 struct smbdirect_recv_io *recvmsg) 170 { 171 unsigned long flags; 172 173 if (likely(recvmsg->sge.length != 0)) { 174 ib_dma_unmap_single(sc->ib.dev, 175 recvmsg->sge.addr, 176 recvmsg->sge.length, 177 DMA_FROM_DEVICE); 178 recvmsg->sge.length = 0; 179 } 180 181 spin_lock_irqsave(&sc->recv_io.free.lock, flags); 182 list_add(&recvmsg->list, &sc->recv_io.free.list); 183 spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); 184 185 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); 186 } 187 188 static void enqueue_reassembly(struct smbdirect_socket *sc, 189 struct smbdirect_recv_io *recvmsg, 190 int data_length) 191 { 192 unsigned long flags; 193 194 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 195 list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list); 196 sc->recv_io.reassembly.queue_length++; 197 /* 198 * Make sure reassembly_data_length is updated after list and 199 * reassembly_queue_length are updated. On the dequeue side 200 * reassembly_data_length is checked without a lock to determine 201 * if reassembly_queue_length and list is up to date 202 */ 203 virt_wmb(); 204 sc->recv_io.reassembly.data_length += data_length; 205 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 206 } 207 208 static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc) 209 { 210 if (!list_empty(&sc->recv_io.reassembly.list)) 211 return list_first_entry(&sc->recv_io.reassembly.list, 212 struct smbdirect_recv_io, list); 213 else 214 return NULL; 215 } 216 217 static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc) 218 { 219 /* 220 * Wake up all waiters in all wait queues 221 * in order to notice the broken connection. 222 */ 223 wake_up_all(&sc->status_wait); 224 wake_up_all(&sc->send_io.lcredits.wait_queue); 225 wake_up_all(&sc->send_io.credits.wait_queue); 226 wake_up_all(&sc->send_io.pending.zero_wait_queue); 227 wake_up_all(&sc->recv_io.reassembly.wait_queue); 228 wake_up_all(&sc->rw_io.credits.wait_queue); 229 } 230 231 static void smb_direct_disconnect_rdma_work(struct work_struct *work) 232 { 233 struct smbdirect_socket *sc = 234 container_of(work, struct smbdirect_socket, disconnect_work); 235 236 if (sc->first_error == 0) 237 sc->first_error = -ECONNABORTED; 238 239 /* 240 * make sure this and other work is not queued again 241 * but here we don't block and avoid 242 * disable[_delayed]_work_sync() 243 */ 244 disable_work(&sc->disconnect_work); 245 disable_work(&sc->connect.work); 246 disable_work(&sc->recv_io.posted.refill_work); 247 disable_delayed_work(&sc->idle.timer_work); 248 disable_work(&sc->idle.immediate_work); 249 250 switch (sc->status) { 251 case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: 252 case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: 253 case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: 254 case SMBDIRECT_SOCKET_CONNECTED: 255 case SMBDIRECT_SOCKET_ERROR: 256 sc->status = SMBDIRECT_SOCKET_DISCONNECTING; 257 rdma_disconnect(sc->rdma.cm_id); 258 break; 259 260 case SMBDIRECT_SOCKET_CREATED: 261 case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: 262 case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: 263 case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: 264 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: 265 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: 266 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: 267 case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: 268 case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: 269 case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: 270 /* 271 * rdma_accept() never reached 272 * RDMA_CM_EVENT_ESTABLISHED 273 */ 274 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 275 break; 276 277 case SMBDIRECT_SOCKET_DISCONNECTING: 278 case SMBDIRECT_SOCKET_DISCONNECTED: 279 case SMBDIRECT_SOCKET_DESTROYED: 280 break; 281 } 282 283 /* 284 * Wake up all waiters in all wait queues 285 * in order to notice the broken connection. 286 */ 287 smb_direct_disconnect_wake_up_all(sc); 288 } 289 290 static void 291 smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) 292 { 293 if (sc->first_error == 0) 294 sc->first_error = -ECONNABORTED; 295 296 /* 297 * make sure other work (than disconnect_work) is 298 * not queued again but here we don't block and avoid 299 * disable[_delayed]_work_sync() 300 */ 301 disable_work(&sc->connect.work); 302 disable_work(&sc->recv_io.posted.refill_work); 303 disable_work(&sc->idle.immediate_work); 304 disable_delayed_work(&sc->idle.timer_work); 305 306 switch (sc->status) { 307 case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: 308 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: 309 case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: 310 case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: 311 case SMBDIRECT_SOCKET_ERROR: 312 case SMBDIRECT_SOCKET_DISCONNECTING: 313 case SMBDIRECT_SOCKET_DISCONNECTED: 314 case SMBDIRECT_SOCKET_DESTROYED: 315 /* 316 * Keep the current error status 317 */ 318 break; 319 320 case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: 321 case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: 322 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; 323 break; 324 325 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: 326 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: 327 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; 328 break; 329 330 case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: 331 case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: 332 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; 333 break; 334 335 case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: 336 case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: 337 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; 338 break; 339 340 case SMBDIRECT_SOCKET_CREATED: 341 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 342 break; 343 344 case SMBDIRECT_SOCKET_CONNECTED: 345 sc->status = SMBDIRECT_SOCKET_ERROR; 346 break; 347 } 348 349 /* 350 * Wake up all waiters in all wait queues 351 * in order to notice the broken connection. 352 */ 353 smb_direct_disconnect_wake_up_all(sc); 354 355 queue_work(sc->workqueue, &sc->disconnect_work); 356 } 357 358 static void smb_direct_send_immediate_work(struct work_struct *work) 359 { 360 struct smbdirect_socket *sc = 361 container_of(work, struct smbdirect_socket, idle.immediate_work); 362 363 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 364 return; 365 366 smb_direct_post_send_data(sc, NULL, NULL, 0, 0); 367 } 368 369 static void smb_direct_idle_connection_timer(struct work_struct *work) 370 { 371 struct smbdirect_socket *sc = 372 container_of(work, struct smbdirect_socket, idle.timer_work.work); 373 struct smbdirect_socket_parameters *sp = &sc->parameters; 374 375 if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { 376 smb_direct_disconnect_rdma_connection(sc); 377 return; 378 } 379 380 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 381 return; 382 383 /* 384 * Now use the keepalive timeout (instead of keepalive interval) 385 * in order to wait for a response 386 */ 387 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; 388 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 389 msecs_to_jiffies(sp->keepalive_timeout_msec)); 390 queue_work(sc->workqueue, &sc->idle.immediate_work); 391 } 392 393 static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) 394 { 395 struct smb_direct_transport *t; 396 struct smbdirect_socket *sc; 397 struct smbdirect_socket_parameters *sp; 398 struct ksmbd_conn *conn; 399 400 t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP); 401 if (!t) 402 return NULL; 403 sc = &t->socket; 404 smbdirect_socket_init(sc); 405 sp = &sc->parameters; 406 407 sc->workqueue = smb_direct_wq; 408 409 INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); 410 411 sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000; 412 sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH; 413 sp->responder_resources = 1; 414 sp->recv_credit_max = smb_direct_receive_credit_max; 415 sp->send_credit_target = smb_direct_send_credit_target; 416 sp->max_send_size = smb_direct_max_send_size; 417 sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; 418 sp->max_recv_size = smb_direct_max_receive_size; 419 sp->max_read_write_size = smb_direct_max_read_write_size; 420 sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000; 421 sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000; 422 423 sc->rdma.cm_id = cm_id; 424 cm_id->context = sc; 425 426 sc->ib.dev = sc->rdma.cm_id->device; 427 428 INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer); 429 430 conn = ksmbd_conn_alloc(); 431 if (!conn) 432 goto err; 433 434 down_write(&conn_list_lock); 435 hash_add(conn_list, &conn->hlist, 0); 436 up_write(&conn_list_lock); 437 438 conn->transport = KSMBD_TRANS(t); 439 KSMBD_TRANS(t)->conn = conn; 440 KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops; 441 return t; 442 err: 443 kfree(t); 444 return NULL; 445 } 446 447 static void smb_direct_free_transport(struct ksmbd_transport *kt) 448 { 449 kfree(SMBD_TRANS(kt)); 450 } 451 452 static void free_transport(struct smb_direct_transport *t) 453 { 454 struct smbdirect_socket *sc = &t->socket; 455 struct smbdirect_recv_io *recvmsg; 456 457 disable_work_sync(&sc->disconnect_work); 458 if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) 459 smb_direct_disconnect_rdma_work(&sc->disconnect_work); 460 if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) 461 wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); 462 463 /* 464 * Wake up all waiters in all wait queues 465 * in order to notice the broken connection. 466 * 467 * Most likely this was already called via 468 * smb_direct_disconnect_rdma_work(), but call it again... 469 */ 470 smb_direct_disconnect_wake_up_all(sc); 471 472 disable_work_sync(&sc->connect.work); 473 disable_work_sync(&sc->recv_io.posted.refill_work); 474 disable_delayed_work_sync(&sc->idle.timer_work); 475 disable_work_sync(&sc->idle.immediate_work); 476 477 if (sc->rdma.cm_id) 478 rdma_lock_handler(sc->rdma.cm_id); 479 480 if (sc->ib.qp) { 481 ib_drain_qp(sc->ib.qp); 482 sc->ib.qp = NULL; 483 rdma_destroy_qp(sc->rdma.cm_id); 484 } 485 486 ksmbd_debug(RDMA, "drain the reassembly queue\n"); 487 do { 488 unsigned long flags; 489 490 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 491 recvmsg = get_first_reassembly(sc); 492 if (recvmsg) { 493 list_del(&recvmsg->list); 494 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 495 put_recvmsg(sc, recvmsg); 496 } else { 497 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 498 } 499 } while (recvmsg); 500 sc->recv_io.reassembly.data_length = 0; 501 502 if (sc->ib.send_cq) 503 ib_free_cq(sc->ib.send_cq); 504 if (sc->ib.recv_cq) 505 ib_free_cq(sc->ib.recv_cq); 506 if (sc->ib.pd) 507 ib_dealloc_pd(sc->ib.pd); 508 if (sc->rdma.cm_id) { 509 rdma_unlock_handler(sc->rdma.cm_id); 510 rdma_destroy_id(sc->rdma.cm_id); 511 } 512 513 smb_direct_destroy_pools(sc); 514 ksmbd_conn_free(KSMBD_TRANS(t)->conn); 515 } 516 517 static struct smbdirect_send_io 518 *smb_direct_alloc_sendmsg(struct smbdirect_socket *sc) 519 { 520 struct smbdirect_send_io *msg; 521 522 msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP); 523 if (!msg) 524 return ERR_PTR(-ENOMEM); 525 msg->socket = sc; 526 INIT_LIST_HEAD(&msg->sibling_list); 527 msg->num_sge = 0; 528 return msg; 529 } 530 531 static void smb_direct_free_sendmsg(struct smbdirect_socket *sc, 532 struct smbdirect_send_io *msg) 533 { 534 int i; 535 536 /* 537 * The list needs to be empty! 538 * The caller should take care of it. 539 */ 540 WARN_ON_ONCE(!list_empty(&msg->sibling_list)); 541 542 if (msg->num_sge > 0) { 543 ib_dma_unmap_single(sc->ib.dev, 544 msg->sge[0].addr, msg->sge[0].length, 545 DMA_TO_DEVICE); 546 for (i = 1; i < msg->num_sge; i++) 547 ib_dma_unmap_page(sc->ib.dev, 548 msg->sge[i].addr, msg->sge[i].length, 549 DMA_TO_DEVICE); 550 } 551 mempool_free(msg, sc->send_io.mem.pool); 552 } 553 554 static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg) 555 { 556 struct smbdirect_socket *sc = recvmsg->socket; 557 558 switch (sc->recv_io.expected) { 559 case SMBDIRECT_EXPECT_DATA_TRANSFER: { 560 struct smbdirect_data_transfer *req = 561 (struct smbdirect_data_transfer *)recvmsg->packet; 562 struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet 563 + le32_to_cpu(req->data_offset)); 564 ksmbd_debug(RDMA, 565 "CreditGranted: %u, CreditRequested: %u, DataLength: %u, RemainingDataLength: %u, SMB: %x, Command: %u\n", 566 le16_to_cpu(req->credits_granted), 567 le16_to_cpu(req->credits_requested), 568 req->data_length, req->remaining_data_length, 569 hdr->ProtocolId, hdr->Command); 570 return 0; 571 } 572 case SMBDIRECT_EXPECT_NEGOTIATE_REQ: { 573 struct smbdirect_negotiate_req *req = 574 (struct smbdirect_negotiate_req *)recvmsg->packet; 575 ksmbd_debug(RDMA, 576 "MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n", 577 le16_to_cpu(req->min_version), 578 le16_to_cpu(req->max_version), 579 le16_to_cpu(req->credits_requested), 580 le32_to_cpu(req->preferred_send_size), 581 le32_to_cpu(req->max_receive_size), 582 le32_to_cpu(req->max_fragmented_size)); 583 if (le16_to_cpu(req->min_version) > 0x0100 || 584 le16_to_cpu(req->max_version) < 0x0100) 585 return -EOPNOTSUPP; 586 if (le16_to_cpu(req->credits_requested) <= 0 || 587 le32_to_cpu(req->max_receive_size) <= 128 || 588 le32_to_cpu(req->max_fragmented_size) <= 589 128 * 1024) 590 return -ECONNABORTED; 591 592 return 0; 593 } 594 case SMBDIRECT_EXPECT_NEGOTIATE_REP: 595 /* client only */ 596 break; 597 } 598 599 /* This is an internal error */ 600 return -EINVAL; 601 } 602 603 static void recv_done(struct ib_cq *cq, struct ib_wc *wc) 604 { 605 struct smbdirect_recv_io *recvmsg; 606 struct smbdirect_socket *sc; 607 struct smbdirect_socket_parameters *sp; 608 609 recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); 610 sc = recvmsg->socket; 611 sp = &sc->parameters; 612 613 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { 614 put_recvmsg(sc, recvmsg); 615 if (wc->status != IB_WC_WR_FLUSH_ERR) { 616 pr_err("Recv error. status='%s (%d)' opcode=%d\n", 617 ib_wc_status_msg(wc->status), wc->status, 618 wc->opcode); 619 smb_direct_disconnect_rdma_connection(sc); 620 } 621 return; 622 } 623 624 ksmbd_debug(RDMA, "Recv completed. status='%s (%d)', opcode=%d\n", 625 ib_wc_status_msg(wc->status), wc->status, 626 wc->opcode); 627 628 ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr, 629 recvmsg->sge.length, DMA_FROM_DEVICE); 630 631 /* 632 * Reset timer to the keepalive interval in 633 * order to trigger our next keepalive message. 634 */ 635 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; 636 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 637 msecs_to_jiffies(sp->keepalive_interval_msec)); 638 639 switch (sc->recv_io.expected) { 640 case SMBDIRECT_EXPECT_NEGOTIATE_REQ: 641 /* see smb_direct_negotiate_recv_done */ 642 break; 643 case SMBDIRECT_EXPECT_DATA_TRANSFER: { 644 struct smbdirect_data_transfer *data_transfer = 645 (struct smbdirect_data_transfer *)recvmsg->packet; 646 u32 remaining_data_length, data_offset, data_length; 647 u16 old_recv_credit_target; 648 649 if (wc->byte_len < 650 offsetof(struct smbdirect_data_transfer, padding)) { 651 put_recvmsg(sc, recvmsg); 652 smb_direct_disconnect_rdma_connection(sc); 653 return; 654 } 655 656 remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); 657 data_length = le32_to_cpu(data_transfer->data_length); 658 data_offset = le32_to_cpu(data_transfer->data_offset); 659 if (wc->byte_len < data_offset || 660 wc->byte_len < (u64)data_offset + data_length) { 661 put_recvmsg(sc, recvmsg); 662 smb_direct_disconnect_rdma_connection(sc); 663 return; 664 } 665 if (remaining_data_length > sp->max_fragmented_recv_size || 666 data_length > sp->max_fragmented_recv_size || 667 (u64)remaining_data_length + (u64)data_length > 668 (u64)sp->max_fragmented_recv_size) { 669 put_recvmsg(sc, recvmsg); 670 smb_direct_disconnect_rdma_connection(sc); 671 return; 672 } 673 674 if (data_length) { 675 if (sc->recv_io.reassembly.full_packet_received) 676 recvmsg->first_segment = true; 677 678 if (le32_to_cpu(data_transfer->remaining_data_length)) 679 sc->recv_io.reassembly.full_packet_received = false; 680 else 681 sc->recv_io.reassembly.full_packet_received = true; 682 } 683 684 atomic_dec(&sc->recv_io.posted.count); 685 atomic_dec(&sc->recv_io.credits.count); 686 687 old_recv_credit_target = sc->recv_io.credits.target; 688 sc->recv_io.credits.target = 689 le16_to_cpu(data_transfer->credits_requested); 690 sc->recv_io.credits.target = 691 min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); 692 sc->recv_io.credits.target = 693 max_t(u16, sc->recv_io.credits.target, 1); 694 atomic_add(le16_to_cpu(data_transfer->credits_granted), 695 &sc->send_io.credits.count); 696 697 if (le16_to_cpu(data_transfer->flags) & 698 SMBDIRECT_FLAG_RESPONSE_REQUESTED) 699 queue_work(sc->workqueue, &sc->idle.immediate_work); 700 701 if (atomic_read(&sc->send_io.credits.count) > 0) 702 wake_up(&sc->send_io.credits.wait_queue); 703 704 if (data_length) { 705 if (sc->recv_io.credits.target > old_recv_credit_target) 706 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); 707 708 enqueue_reassembly(sc, recvmsg, (int)data_length); 709 wake_up(&sc->recv_io.reassembly.wait_queue); 710 } else 711 put_recvmsg(sc, recvmsg); 712 713 return; 714 } 715 case SMBDIRECT_EXPECT_NEGOTIATE_REP: 716 /* client only */ 717 break; 718 } 719 720 /* 721 * This is an internal error! 722 */ 723 WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); 724 put_recvmsg(sc, recvmsg); 725 smb_direct_disconnect_rdma_connection(sc); 726 } 727 728 static void smb_direct_negotiate_recv_work(struct work_struct *work); 729 730 static void smb_direct_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc) 731 { 732 struct smbdirect_recv_io *recv_io = 733 container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); 734 struct smbdirect_socket *sc = recv_io->socket; 735 unsigned long flags; 736 737 /* 738 * reset the common recv_done for later reuse. 739 */ 740 recv_io->cqe.done = recv_done; 741 742 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { 743 put_recvmsg(sc, recv_io); 744 if (wc->status != IB_WC_WR_FLUSH_ERR) { 745 pr_err("Negotiate Recv error. status='%s (%d)' opcode=%d\n", 746 ib_wc_status_msg(wc->status), wc->status, 747 wc->opcode); 748 smb_direct_disconnect_rdma_connection(sc); 749 } 750 return; 751 } 752 753 ksmbd_debug(RDMA, "Negotiate Recv completed. status='%s (%d)', opcode=%d\n", 754 ib_wc_status_msg(wc->status), wc->status, 755 wc->opcode); 756 757 ib_dma_sync_single_for_cpu(sc->ib.dev, 758 recv_io->sge.addr, 759 recv_io->sge.length, 760 DMA_FROM_DEVICE); 761 762 /* 763 * This is an internal error! 764 */ 765 if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REQ)) { 766 put_recvmsg(sc, recv_io); 767 smb_direct_disconnect_rdma_connection(sc); 768 return; 769 } 770 771 /* 772 * Don't reset timer to the keepalive interval in 773 * this will be done in smb_direct_negotiate_recv_work. 774 */ 775 776 /* 777 * Only remember the recv_io if it has enough bytes, 778 * this gives smb_direct_negotiate_recv_work enough 779 * information in order to disconnect if it was not 780 * valid. 781 */ 782 sc->recv_io.reassembly.full_packet_received = true; 783 if (wc->byte_len >= sizeof(struct smbdirect_negotiate_req)) 784 enqueue_reassembly(sc, recv_io, 0); 785 else 786 put_recvmsg(sc, recv_io); 787 788 /* 789 * Some drivers (at least mlx5_ib and irdma in roce mode) 790 * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED, 791 * we need to adjust our expectation in that case. 792 * 793 * So we defer further processing of the negotiation 794 * to smb_direct_negotiate_recv_work(). 795 * 796 * If we are already in SMBDIRECT_SOCKET_NEGOTIATE_NEEDED 797 * we queue the work directly otherwise 798 * smb_direct_cm_handler() will do it, when 799 * RDMA_CM_EVENT_ESTABLISHED arrived. 800 */ 801 spin_lock_irqsave(&sc->connect.lock, flags); 802 if (!sc->first_error) { 803 INIT_WORK(&sc->connect.work, smb_direct_negotiate_recv_work); 804 if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) 805 queue_work(sc->workqueue, &sc->connect.work); 806 } 807 spin_unlock_irqrestore(&sc->connect.lock, flags); 808 } 809 810 static void smb_direct_negotiate_recv_work(struct work_struct *work) 811 { 812 struct smbdirect_socket *sc = 813 container_of(work, struct smbdirect_socket, connect.work); 814 const struct smbdirect_socket_parameters *sp = &sc->parameters; 815 struct smbdirect_recv_io *recv_io; 816 817 if (sc->first_error) 818 return; 819 820 ksmbd_debug(RDMA, "Negotiate Recv Work running\n"); 821 822 /* 823 * Reset timer to the keepalive interval in 824 * order to trigger our next keepalive message. 825 */ 826 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; 827 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 828 msecs_to_jiffies(sp->keepalive_interval_msec)); 829 830 /* 831 * If smb_direct_negotiate_recv_done() detected an 832 * invalid request we want to disconnect. 833 */ 834 recv_io = get_first_reassembly(sc); 835 if (!recv_io) { 836 smb_direct_disconnect_rdma_connection(sc); 837 return; 838 } 839 840 if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) { 841 smb_direct_disconnect_rdma_connection(sc); 842 return; 843 } 844 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; 845 wake_up(&sc->status_wait); 846 } 847 848 static int smb_direct_post_recv(struct smbdirect_socket *sc, 849 struct smbdirect_recv_io *recvmsg) 850 { 851 struct smbdirect_socket_parameters *sp = &sc->parameters; 852 struct ib_recv_wr wr; 853 int ret; 854 855 recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev, 856 recvmsg->packet, 857 sp->max_recv_size, 858 DMA_FROM_DEVICE); 859 ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr); 860 if (ret) 861 return ret; 862 recvmsg->sge.length = sp->max_recv_size; 863 recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey; 864 865 wr.wr_cqe = &recvmsg->cqe; 866 wr.next = NULL; 867 wr.sg_list = &recvmsg->sge; 868 wr.num_sge = 1; 869 870 ret = ib_post_recv(sc->ib.qp, &wr, NULL); 871 if (ret) { 872 pr_err("Can't post recv: %d\n", ret); 873 ib_dma_unmap_single(sc->ib.dev, 874 recvmsg->sge.addr, recvmsg->sge.length, 875 DMA_FROM_DEVICE); 876 recvmsg->sge.length = 0; 877 smb_direct_disconnect_rdma_connection(sc); 878 return ret; 879 } 880 return ret; 881 } 882 883 static int smb_direct_read(struct ksmbd_transport *t, char *buf, 884 unsigned int size, int unused) 885 { 886 struct smbdirect_recv_io *recvmsg; 887 struct smbdirect_data_transfer *data_transfer; 888 int to_copy, to_read, data_read, offset; 889 u32 data_length, remaining_data_length, data_offset; 890 int rc; 891 struct smb_direct_transport *st = SMBD_TRANS(t); 892 struct smbdirect_socket *sc = &st->socket; 893 894 again: 895 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { 896 pr_err("disconnected\n"); 897 return -ENOTCONN; 898 } 899 900 /* 901 * No need to hold the reassembly queue lock all the time as we are 902 * the only one reading from the front of the queue. The transport 903 * may add more entries to the back of the queue at the same time 904 */ 905 if (sc->recv_io.reassembly.data_length >= size) { 906 int queue_length; 907 int queue_removed = 0; 908 unsigned long flags; 909 910 /* 911 * Need to make sure reassembly_data_length is read before 912 * reading reassembly_queue_length and calling 913 * get_first_reassembly. This call is lock free 914 * as we never read at the end of the queue which are being 915 * updated in SOFTIRQ as more data is received 916 */ 917 virt_rmb(); 918 queue_length = sc->recv_io.reassembly.queue_length; 919 data_read = 0; 920 to_read = size; 921 offset = sc->recv_io.reassembly.first_entry_offset; 922 while (data_read < size) { 923 recvmsg = get_first_reassembly(sc); 924 data_transfer = smbdirect_recv_io_payload(recvmsg); 925 data_length = le32_to_cpu(data_transfer->data_length); 926 remaining_data_length = 927 le32_to_cpu(data_transfer->remaining_data_length); 928 data_offset = le32_to_cpu(data_transfer->data_offset); 929 930 /* 931 * The upper layer expects RFC1002 length at the 932 * beginning of the payload. Return it to indicate 933 * the total length of the packet. This minimize the 934 * change to upper layer packet processing logic. This 935 * will be eventually remove when an intermediate 936 * transport layer is added 937 */ 938 if (recvmsg->first_segment && size == 4) { 939 unsigned int rfc1002_len = 940 data_length + remaining_data_length; 941 *((__be32 *)buf) = cpu_to_be32(rfc1002_len); 942 data_read = 4; 943 recvmsg->first_segment = false; 944 ksmbd_debug(RDMA, 945 "returning rfc1002 length %d\n", 946 rfc1002_len); 947 goto read_rfc1002_done; 948 } 949 950 to_copy = min_t(int, data_length - offset, to_read); 951 memcpy(buf + data_read, (char *)data_transfer + data_offset + offset, 952 to_copy); 953 954 /* move on to the next buffer? */ 955 if (to_copy == data_length - offset) { 956 queue_length--; 957 /* 958 * No need to lock if we are not at the 959 * end of the queue 960 */ 961 if (queue_length) { 962 list_del(&recvmsg->list); 963 } else { 964 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 965 list_del(&recvmsg->list); 966 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 967 } 968 queue_removed++; 969 put_recvmsg(sc, recvmsg); 970 offset = 0; 971 } else { 972 offset += to_copy; 973 } 974 975 to_read -= to_copy; 976 data_read += to_copy; 977 } 978 979 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 980 sc->recv_io.reassembly.data_length -= data_read; 981 sc->recv_io.reassembly.queue_length -= queue_removed; 982 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 983 984 sc->recv_io.reassembly.first_entry_offset = offset; 985 ksmbd_debug(RDMA, 986 "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", 987 data_read, sc->recv_io.reassembly.data_length, 988 sc->recv_io.reassembly.first_entry_offset); 989 read_rfc1002_done: 990 return data_read; 991 } 992 993 ksmbd_debug(RDMA, "wait_event on more data\n"); 994 rc = wait_event_interruptible(sc->recv_io.reassembly.wait_queue, 995 sc->recv_io.reassembly.data_length >= size || 996 sc->status != SMBDIRECT_SOCKET_CONNECTED); 997 if (rc) 998 return -EINTR; 999 1000 goto again; 1001 } 1002 1003 static void smb_direct_post_recv_credits(struct work_struct *work) 1004 { 1005 struct smbdirect_socket *sc = 1006 container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); 1007 struct smbdirect_recv_io *recvmsg; 1008 int credits = 0; 1009 int ret; 1010 1011 if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) { 1012 while (true) { 1013 recvmsg = get_free_recvmsg(sc); 1014 if (!recvmsg) 1015 break; 1016 1017 recvmsg->first_segment = false; 1018 1019 ret = smb_direct_post_recv(sc, recvmsg); 1020 if (ret) { 1021 pr_err("Can't post recv: %d\n", ret); 1022 put_recvmsg(sc, recvmsg); 1023 break; 1024 } 1025 credits++; 1026 1027 atomic_inc(&sc->recv_io.posted.count); 1028 } 1029 } 1030 1031 if (credits) 1032 queue_work(sc->workqueue, &sc->idle.immediate_work); 1033 } 1034 1035 static void send_done(struct ib_cq *cq, struct ib_wc *wc) 1036 { 1037 struct smbdirect_send_io *sendmsg, *sibling, *next; 1038 struct smbdirect_socket *sc; 1039 int lcredits = 0; 1040 1041 sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); 1042 sc = sendmsg->socket; 1043 1044 ksmbd_debug(RDMA, "Send completed. status='%s (%d)', opcode=%d\n", 1045 ib_wc_status_msg(wc->status), wc->status, 1046 wc->opcode); 1047 1048 /* 1049 * Free possible siblings and then the main send_io 1050 */ 1051 list_for_each_entry_safe(sibling, next, &sendmsg->sibling_list, sibling_list) { 1052 list_del_init(&sibling->sibling_list); 1053 smb_direct_free_sendmsg(sc, sibling); 1054 lcredits += 1; 1055 } 1056 /* Note this frees wc->wr_cqe, but not wc */ 1057 smb_direct_free_sendmsg(sc, sendmsg); 1058 lcredits += 1; 1059 1060 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { 1061 pr_err("Send error. status='%s (%d)', opcode=%d\n", 1062 ib_wc_status_msg(wc->status), wc->status, 1063 wc->opcode); 1064 smb_direct_disconnect_rdma_connection(sc); 1065 return; 1066 } 1067 1068 atomic_add(lcredits, &sc->send_io.lcredits.count); 1069 wake_up(&sc->send_io.lcredits.wait_queue); 1070 1071 if (atomic_dec_and_test(&sc->send_io.pending.count)) 1072 wake_up(&sc->send_io.pending.zero_wait_queue); 1073 } 1074 1075 static int manage_credits_prior_sending(struct smbdirect_socket *sc) 1076 { 1077 int new_credits; 1078 1079 if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) 1080 return 0; 1081 1082 new_credits = atomic_read(&sc->recv_io.posted.count); 1083 if (new_credits == 0) 1084 return 0; 1085 1086 new_credits -= atomic_read(&sc->recv_io.credits.count); 1087 if (new_credits <= 0) 1088 return 0; 1089 1090 atomic_add(new_credits, &sc->recv_io.credits.count); 1091 return new_credits; 1092 } 1093 1094 static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) 1095 { 1096 struct smbdirect_socket_parameters *sp = &sc->parameters; 1097 1098 if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { 1099 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; 1100 /* 1101 * Now use the keepalive timeout (instead of keepalive interval) 1102 * in order to wait for a response 1103 */ 1104 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 1105 msecs_to_jiffies(sp->keepalive_timeout_msec)); 1106 return 1; 1107 } 1108 return 0; 1109 } 1110 1111 static int smb_direct_post_send(struct smbdirect_socket *sc, 1112 struct ib_send_wr *wr) 1113 { 1114 int ret; 1115 1116 atomic_inc(&sc->send_io.pending.count); 1117 ret = ib_post_send(sc->ib.qp, wr, NULL); 1118 if (ret) { 1119 pr_err("failed to post send: %d\n", ret); 1120 smb_direct_disconnect_rdma_connection(sc); 1121 } 1122 return ret; 1123 } 1124 1125 static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx, 1126 bool need_invalidate_rkey, 1127 unsigned int remote_key) 1128 { 1129 INIT_LIST_HEAD(&send_ctx->msg_list); 1130 send_ctx->wr_cnt = 0; 1131 send_ctx->need_invalidate_rkey = need_invalidate_rkey; 1132 send_ctx->remote_key = remote_key; 1133 } 1134 1135 static int smb_direct_flush_send_list(struct smbdirect_socket *sc, 1136 struct smbdirect_send_batch *send_ctx, 1137 bool is_last) 1138 { 1139 struct smbdirect_send_io *first, *last; 1140 int ret; 1141 1142 if (list_empty(&send_ctx->msg_list)) 1143 return 0; 1144 1145 first = list_first_entry(&send_ctx->msg_list, 1146 struct smbdirect_send_io, 1147 sibling_list); 1148 last = list_last_entry(&send_ctx->msg_list, 1149 struct smbdirect_send_io, 1150 sibling_list); 1151 1152 if (send_ctx->need_invalidate_rkey) { 1153 first->wr.opcode = IB_WR_SEND_WITH_INV; 1154 first->wr.ex.invalidate_rkey = send_ctx->remote_key; 1155 send_ctx->need_invalidate_rkey = false; 1156 send_ctx->remote_key = 0; 1157 } 1158 1159 last->wr.send_flags = IB_SEND_SIGNALED; 1160 last->wr.wr_cqe = &last->cqe; 1161 1162 /* 1163 * Remove last from send_ctx->msg_list 1164 * and splice the rest of send_ctx->msg_list 1165 * to last->sibling_list. 1166 * 1167 * send_ctx->msg_list is a valid empty list 1168 * at the end. 1169 */ 1170 list_del_init(&last->sibling_list); 1171 list_splice_tail_init(&send_ctx->msg_list, &last->sibling_list); 1172 send_ctx->wr_cnt = 0; 1173 1174 ret = smb_direct_post_send(sc, &first->wr); 1175 if (ret) { 1176 struct smbdirect_send_io *sibling, *next; 1177 1178 list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) { 1179 list_del_init(&sibling->sibling_list); 1180 smb_direct_free_sendmsg(sc, sibling); 1181 } 1182 smb_direct_free_sendmsg(sc, last); 1183 } 1184 1185 return ret; 1186 } 1187 1188 static int wait_for_credits(struct smbdirect_socket *sc, 1189 wait_queue_head_t *waitq, atomic_t *total_credits, 1190 int needed) 1191 { 1192 int ret; 1193 1194 do { 1195 if (atomic_sub_return(needed, total_credits) >= 0) 1196 return 0; 1197 1198 atomic_add(needed, total_credits); 1199 ret = wait_event_interruptible(*waitq, 1200 atomic_read(total_credits) >= needed || 1201 sc->status != SMBDIRECT_SOCKET_CONNECTED); 1202 1203 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 1204 return -ENOTCONN; 1205 else if (ret < 0) 1206 return ret; 1207 } while (true); 1208 } 1209 1210 static int wait_for_send_lcredit(struct smbdirect_socket *sc, 1211 struct smbdirect_send_batch *send_ctx) 1212 { 1213 if (send_ctx && (atomic_read(&sc->send_io.lcredits.count) <= 1)) { 1214 int ret; 1215 1216 ret = smb_direct_flush_send_list(sc, send_ctx, false); 1217 if (ret) 1218 return ret; 1219 } 1220 1221 return wait_for_credits(sc, 1222 &sc->send_io.lcredits.wait_queue, 1223 &sc->send_io.lcredits.count, 1224 1); 1225 } 1226 1227 static int wait_for_send_credits(struct smbdirect_socket *sc, 1228 struct smbdirect_send_batch *send_ctx) 1229 { 1230 int ret; 1231 1232 if (send_ctx && 1233 (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) { 1234 ret = smb_direct_flush_send_list(sc, send_ctx, false); 1235 if (ret) 1236 return ret; 1237 } 1238 1239 return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1); 1240 } 1241 1242 static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits) 1243 { 1244 return wait_for_credits(sc, 1245 &sc->rw_io.credits.wait_queue, 1246 &sc->rw_io.credits.count, 1247 credits); 1248 } 1249 1250 static int calc_rw_credits(struct smbdirect_socket *sc, 1251 char *buf, unsigned int len) 1252 { 1253 return DIV_ROUND_UP(get_buf_page_count(buf, len), 1254 sc->rw_io.credits.num_pages); 1255 } 1256 1257 static int smb_direct_create_header(struct smbdirect_socket *sc, 1258 int size, int remaining_data_length, 1259 struct smbdirect_send_io **sendmsg_out) 1260 { 1261 struct smbdirect_socket_parameters *sp = &sc->parameters; 1262 struct smbdirect_send_io *sendmsg; 1263 struct smbdirect_data_transfer *packet; 1264 int header_length; 1265 int ret; 1266 1267 sendmsg = smb_direct_alloc_sendmsg(sc); 1268 if (IS_ERR(sendmsg)) 1269 return PTR_ERR(sendmsg); 1270 1271 /* Fill in the packet header */ 1272 packet = (struct smbdirect_data_transfer *)sendmsg->packet; 1273 packet->credits_requested = cpu_to_le16(sp->send_credit_target); 1274 packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); 1275 1276 packet->flags = 0; 1277 if (manage_keep_alive_before_sending(sc)) 1278 packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); 1279 1280 packet->reserved = 0; 1281 if (!size) 1282 packet->data_offset = 0; 1283 else 1284 packet->data_offset = cpu_to_le32(24); 1285 packet->data_length = cpu_to_le32(size); 1286 packet->remaining_data_length = cpu_to_le32(remaining_data_length); 1287 packet->padding = 0; 1288 1289 ksmbd_debug(RDMA, 1290 "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n", 1291 le16_to_cpu(packet->credits_requested), 1292 le16_to_cpu(packet->credits_granted), 1293 le32_to_cpu(packet->data_offset), 1294 le32_to_cpu(packet->data_length), 1295 le32_to_cpu(packet->remaining_data_length)); 1296 1297 /* Map the packet to DMA */ 1298 header_length = sizeof(struct smbdirect_data_transfer); 1299 /* If this is a packet without payload, don't send padding */ 1300 if (!size) 1301 header_length = 1302 offsetof(struct smbdirect_data_transfer, padding); 1303 1304 sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, 1305 (void *)packet, 1306 header_length, 1307 DMA_TO_DEVICE); 1308 ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); 1309 if (ret) { 1310 smb_direct_free_sendmsg(sc, sendmsg); 1311 return ret; 1312 } 1313 1314 sendmsg->num_sge = 1; 1315 sendmsg->sge[0].length = header_length; 1316 sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; 1317 1318 *sendmsg_out = sendmsg; 1319 return 0; 1320 } 1321 1322 static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nentries) 1323 { 1324 bool high = is_vmalloc_addr(buf); 1325 struct page *page; 1326 int offset, len; 1327 int i = 0; 1328 1329 if (size <= 0 || nentries < get_buf_page_count(buf, size)) 1330 return -EINVAL; 1331 1332 offset = offset_in_page(buf); 1333 buf -= offset; 1334 while (size > 0) { 1335 len = min_t(int, PAGE_SIZE - offset, size); 1336 if (high) 1337 page = vmalloc_to_page(buf); 1338 else 1339 page = kmap_to_page(buf); 1340 1341 if (!sg_list) 1342 return -EINVAL; 1343 sg_set_page(sg_list, page, len, offset); 1344 sg_list = sg_next(sg_list); 1345 1346 buf += PAGE_SIZE; 1347 size -= len; 1348 offset = 0; 1349 i++; 1350 } 1351 return i; 1352 } 1353 1354 static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, 1355 struct scatterlist *sg_list, int nentries, 1356 enum dma_data_direction dir) 1357 { 1358 int npages; 1359 1360 npages = get_sg_list(buf, size, sg_list, nentries); 1361 if (npages < 0) 1362 return -EINVAL; 1363 return ib_dma_map_sg(device, sg_list, npages, dir); 1364 } 1365 1366 static int post_sendmsg(struct smbdirect_socket *sc, 1367 struct smbdirect_send_batch *send_ctx, 1368 struct smbdirect_send_io *msg) 1369 { 1370 int i; 1371 1372 for (i = 0; i < msg->num_sge; i++) 1373 ib_dma_sync_single_for_device(sc->ib.dev, 1374 msg->sge[i].addr, msg->sge[i].length, 1375 DMA_TO_DEVICE); 1376 1377 msg->cqe.done = send_done; 1378 msg->wr.opcode = IB_WR_SEND; 1379 msg->wr.sg_list = &msg->sge[0]; 1380 msg->wr.num_sge = msg->num_sge; 1381 msg->wr.next = NULL; 1382 1383 if (send_ctx) { 1384 msg->wr.wr_cqe = NULL; 1385 msg->wr.send_flags = 0; 1386 if (!list_empty(&send_ctx->msg_list)) { 1387 struct smbdirect_send_io *last; 1388 1389 last = list_last_entry(&send_ctx->msg_list, 1390 struct smbdirect_send_io, 1391 sibling_list); 1392 last->wr.next = &msg->wr; 1393 } 1394 list_add_tail(&msg->sibling_list, &send_ctx->msg_list); 1395 send_ctx->wr_cnt++; 1396 return 0; 1397 } 1398 1399 msg->wr.wr_cqe = &msg->cqe; 1400 msg->wr.send_flags = IB_SEND_SIGNALED; 1401 return smb_direct_post_send(sc, &msg->wr); 1402 } 1403 1404 static int smb_direct_post_send_data(struct smbdirect_socket *sc, 1405 struct smbdirect_send_batch *send_ctx, 1406 struct kvec *iov, int niov, 1407 int remaining_data_length) 1408 { 1409 int i, j, ret; 1410 struct smbdirect_send_io *msg; 1411 int data_length; 1412 struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1]; 1413 1414 ret = wait_for_send_lcredit(sc, send_ctx); 1415 if (ret) 1416 goto lcredit_failed; 1417 1418 ret = wait_for_send_credits(sc, send_ctx); 1419 if (ret) 1420 goto credit_failed; 1421 1422 data_length = 0; 1423 for (i = 0; i < niov; i++) 1424 data_length += iov[i].iov_len; 1425 1426 ret = smb_direct_create_header(sc, data_length, remaining_data_length, 1427 &msg); 1428 if (ret) 1429 goto header_failed; 1430 1431 for (i = 0; i < niov; i++) { 1432 struct ib_sge *sge; 1433 int sg_cnt; 1434 1435 sg_init_table(sg, SMBDIRECT_SEND_IO_MAX_SGE - 1); 1436 sg_cnt = get_mapped_sg_list(sc->ib.dev, 1437 iov[i].iov_base, iov[i].iov_len, 1438 sg, SMBDIRECT_SEND_IO_MAX_SGE - 1, 1439 DMA_TO_DEVICE); 1440 if (sg_cnt <= 0) { 1441 pr_err("failed to map buffer\n"); 1442 ret = -ENOMEM; 1443 goto err; 1444 } else if (sg_cnt + msg->num_sge > SMBDIRECT_SEND_IO_MAX_SGE) { 1445 pr_err("buffer not fitted into sges\n"); 1446 ret = -E2BIG; 1447 ib_dma_unmap_sg(sc->ib.dev, sg, sg_cnt, 1448 DMA_TO_DEVICE); 1449 goto err; 1450 } 1451 1452 for (j = 0; j < sg_cnt; j++) { 1453 sge = &msg->sge[msg->num_sge]; 1454 sge->addr = sg_dma_address(&sg[j]); 1455 sge->length = sg_dma_len(&sg[j]); 1456 sge->lkey = sc->ib.pd->local_dma_lkey; 1457 msg->num_sge++; 1458 } 1459 } 1460 1461 ret = post_sendmsg(sc, send_ctx, msg); 1462 if (ret) 1463 goto err; 1464 return 0; 1465 err: 1466 smb_direct_free_sendmsg(sc, msg); 1467 header_failed: 1468 atomic_inc(&sc->send_io.credits.count); 1469 credit_failed: 1470 atomic_inc(&sc->send_io.lcredits.count); 1471 lcredit_failed: 1472 return ret; 1473 } 1474 1475 static int smb_direct_writev(struct ksmbd_transport *t, 1476 struct kvec *iov, int niovs, int buflen, 1477 bool need_invalidate, unsigned int remote_key) 1478 { 1479 struct smb_direct_transport *st = SMBD_TRANS(t); 1480 struct smbdirect_socket *sc = &st->socket; 1481 struct smbdirect_socket_parameters *sp = &sc->parameters; 1482 size_t remaining_data_length; 1483 size_t iov_idx; 1484 size_t iov_ofs; 1485 size_t max_iov_size = sp->max_send_size - 1486 sizeof(struct smbdirect_data_transfer); 1487 int ret; 1488 struct smbdirect_send_batch send_ctx; 1489 int error = 0; 1490 1491 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 1492 return -ENOTCONN; 1493 1494 //FIXME: skip RFC1002 header.. 1495 if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4)) 1496 return -EINVAL; 1497 buflen -= 4; 1498 iov_idx = 1; 1499 iov_ofs = 0; 1500 1501 remaining_data_length = buflen; 1502 ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); 1503 1504 smb_direct_send_ctx_init(&send_ctx, need_invalidate, remote_key); 1505 while (remaining_data_length) { 1506 struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */ 1507 size_t possible_bytes = max_iov_size; 1508 size_t possible_vecs; 1509 size_t bytes = 0; 1510 size_t nvecs = 0; 1511 1512 /* 1513 * For the last message remaining_data_length should be 1514 * have been 0 already! 1515 */ 1516 if (WARN_ON_ONCE(iov_idx >= niovs)) { 1517 error = -EINVAL; 1518 goto done; 1519 } 1520 1521 /* 1522 * We have 2 factors which limit the arguments we pass 1523 * to smb_direct_post_send_data(): 1524 * 1525 * 1. The number of supported sges for the send, 1526 * while one is reserved for the smbdirect header. 1527 * And we currently need one SGE per page. 1528 * 2. The number of negotiated payload bytes per send. 1529 */ 1530 possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx); 1531 1532 while (iov_idx < niovs && possible_vecs && possible_bytes) { 1533 struct kvec *v = &vecs[nvecs]; 1534 int page_count; 1535 1536 v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs; 1537 v->iov_len = min_t(size_t, 1538 iov[iov_idx].iov_len - iov_ofs, 1539 possible_bytes); 1540 page_count = get_buf_page_count(v->iov_base, v->iov_len); 1541 if (page_count > possible_vecs) { 1542 /* 1543 * If the number of pages in the buffer 1544 * is to much (because we currently require 1545 * one SGE per page), we need to limit the 1546 * length. 1547 * 1548 * We know possible_vecs is at least 1, 1549 * so we always keep the first page. 1550 * 1551 * We need to calculate the number extra 1552 * pages (epages) we can also keep. 1553 * 1554 * We calculate the number of bytes in the 1555 * first page (fplen), this should never be 1556 * larger than v->iov_len because page_count is 1557 * at least 2, but adding a limitation feels 1558 * better. 1559 * 1560 * Then we calculate the number of bytes (elen) 1561 * we can keep for the extra pages. 1562 */ 1563 size_t epages = possible_vecs - 1; 1564 size_t fpofs = offset_in_page(v->iov_base); 1565 size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len); 1566 size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE); 1567 1568 v->iov_len = fplen + elen; 1569 page_count = get_buf_page_count(v->iov_base, v->iov_len); 1570 if (WARN_ON_ONCE(page_count > possible_vecs)) { 1571 /* 1572 * Something went wrong in the above 1573 * logic... 1574 */ 1575 error = -EINVAL; 1576 goto done; 1577 } 1578 } 1579 possible_vecs -= page_count; 1580 nvecs += 1; 1581 possible_bytes -= v->iov_len; 1582 bytes += v->iov_len; 1583 1584 iov_ofs += v->iov_len; 1585 if (iov_ofs >= iov[iov_idx].iov_len) { 1586 iov_idx += 1; 1587 iov_ofs = 0; 1588 } 1589 } 1590 1591 remaining_data_length -= bytes; 1592 1593 ret = smb_direct_post_send_data(sc, &send_ctx, 1594 vecs, nvecs, 1595 remaining_data_length); 1596 if (unlikely(ret)) { 1597 error = ret; 1598 goto done; 1599 } 1600 } 1601 1602 done: 1603 ret = smb_direct_flush_send_list(sc, &send_ctx, true); 1604 if (unlikely(!ret && error)) 1605 ret = error; 1606 1607 /* 1608 * As an optimization, we don't wait for individual I/O to finish 1609 * before sending the next one. 1610 * Send them all and wait for pending send count to get to 0 1611 * that means all the I/Os have been out and we are good to return 1612 */ 1613 1614 wait_event(sc->send_io.pending.zero_wait_queue, 1615 atomic_read(&sc->send_io.pending.count) == 0 || 1616 sc->status != SMBDIRECT_SOCKET_CONNECTED); 1617 if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0) 1618 ret = -ENOTCONN; 1619 1620 return ret; 1621 } 1622 1623 static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, 1624 struct smbdirect_rw_io *msg, 1625 enum dma_data_direction dir) 1626 { 1627 struct smbdirect_socket *sc = &t->socket; 1628 1629 rdma_rw_ctx_destroy(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, 1630 msg->sgt.sgl, msg->sgt.nents, dir); 1631 sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); 1632 kfree(msg); 1633 } 1634 1635 static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, 1636 enum dma_data_direction dir) 1637 { 1638 struct smbdirect_rw_io *msg = 1639 container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe); 1640 struct smbdirect_socket *sc = msg->socket; 1641 1642 if (wc->status != IB_WC_SUCCESS) { 1643 msg->error = -EIO; 1644 pr_err("read/write error. opcode = %d, status = %s(%d)\n", 1645 wc->opcode, ib_wc_status_msg(wc->status), wc->status); 1646 if (wc->status != IB_WC_WR_FLUSH_ERR) 1647 smb_direct_disconnect_rdma_connection(sc); 1648 } 1649 1650 complete(msg->completion); 1651 } 1652 1653 static void read_done(struct ib_cq *cq, struct ib_wc *wc) 1654 { 1655 read_write_done(cq, wc, DMA_FROM_DEVICE); 1656 } 1657 1658 static void write_done(struct ib_cq *cq, struct ib_wc *wc) 1659 { 1660 read_write_done(cq, wc, DMA_TO_DEVICE); 1661 } 1662 1663 static int smb_direct_rdma_xmit(struct smb_direct_transport *t, 1664 void *buf, int buf_len, 1665 struct smbdirect_buffer_descriptor_v1 *desc, 1666 unsigned int desc_len, 1667 bool is_read) 1668 { 1669 struct smbdirect_socket *sc = &t->socket; 1670 struct smbdirect_socket_parameters *sp = &sc->parameters; 1671 struct smbdirect_rw_io *msg, *next_msg; 1672 int i, ret; 1673 DECLARE_COMPLETION_ONSTACK(completion); 1674 struct ib_send_wr *first_wr; 1675 LIST_HEAD(msg_list); 1676 char *desc_buf; 1677 int credits_needed; 1678 unsigned int desc_buf_len, desc_num = 0; 1679 1680 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 1681 return -ENOTCONN; 1682 1683 if (buf_len > sp->max_read_write_size) 1684 return -EINVAL; 1685 1686 /* calculate needed credits */ 1687 credits_needed = 0; 1688 desc_buf = buf; 1689 for (i = 0; i < desc_len / sizeof(*desc); i++) { 1690 if (!buf_len) 1691 break; 1692 1693 desc_buf_len = le32_to_cpu(desc[i].length); 1694 if (!desc_buf_len) 1695 return -EINVAL; 1696 1697 if (desc_buf_len > buf_len) { 1698 desc_buf_len = buf_len; 1699 desc[i].length = cpu_to_le32(desc_buf_len); 1700 buf_len = 0; 1701 } 1702 1703 credits_needed += calc_rw_credits(sc, desc_buf, desc_buf_len); 1704 desc_buf += desc_buf_len; 1705 buf_len -= desc_buf_len; 1706 desc_num++; 1707 } 1708 1709 ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", 1710 str_read_write(is_read), buf_len, credits_needed); 1711 1712 ret = wait_for_rw_credits(sc, credits_needed); 1713 if (ret < 0) 1714 return ret; 1715 1716 /* build rdma_rw_ctx for each descriptor */ 1717 desc_buf = buf; 1718 for (i = 0; i < desc_num; i++) { 1719 msg = kzalloc(struct_size(msg, sg_list, SG_CHUNK_SIZE), 1720 KSMBD_DEFAULT_GFP); 1721 if (!msg) { 1722 ret = -ENOMEM; 1723 goto out; 1724 } 1725 1726 desc_buf_len = le32_to_cpu(desc[i].length); 1727 1728 msg->socket = sc; 1729 msg->cqe.done = is_read ? read_done : write_done; 1730 msg->completion = &completion; 1731 1732 msg->sgt.sgl = &msg->sg_list[0]; 1733 ret = sg_alloc_table_chained(&msg->sgt, 1734 get_buf_page_count(desc_buf, desc_buf_len), 1735 msg->sg_list, SG_CHUNK_SIZE); 1736 if (ret) { 1737 ret = -ENOMEM; 1738 goto free_msg; 1739 } 1740 1741 ret = get_sg_list(desc_buf, desc_buf_len, 1742 msg->sgt.sgl, msg->sgt.orig_nents); 1743 if (ret < 0) 1744 goto free_table; 1745 1746 ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, 1747 msg->sgt.sgl, 1748 get_buf_page_count(desc_buf, desc_buf_len), 1749 0, 1750 le64_to_cpu(desc[i].offset), 1751 le32_to_cpu(desc[i].token), 1752 is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); 1753 if (ret < 0) { 1754 pr_err("failed to init rdma_rw_ctx: %d\n", ret); 1755 goto free_table; 1756 } 1757 1758 list_add_tail(&msg->list, &msg_list); 1759 desc_buf += desc_buf_len; 1760 } 1761 1762 /* concatenate work requests of rdma_rw_ctxs */ 1763 first_wr = NULL; 1764 list_for_each_entry_reverse(msg, &msg_list, list) { 1765 first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, 1766 &msg->cqe, first_wr); 1767 } 1768 1769 ret = ib_post_send(sc->ib.qp, first_wr, NULL); 1770 if (ret) { 1771 pr_err("failed to post send wr for RDMA R/W: %d\n", ret); 1772 goto out; 1773 } 1774 1775 msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list); 1776 wait_for_completion(&completion); 1777 ret = msg->error; 1778 out: 1779 list_for_each_entry_safe(msg, next_msg, &msg_list, list) { 1780 list_del(&msg->list); 1781 smb_direct_free_rdma_rw_msg(t, msg, 1782 is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); 1783 } 1784 atomic_add(credits_needed, &sc->rw_io.credits.count); 1785 wake_up(&sc->rw_io.credits.wait_queue); 1786 return ret; 1787 1788 free_table: 1789 sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); 1790 free_msg: 1791 kfree(msg); 1792 goto out; 1793 } 1794 1795 static int smb_direct_rdma_write(struct ksmbd_transport *t, 1796 void *buf, unsigned int buflen, 1797 struct smbdirect_buffer_descriptor_v1 *desc, 1798 unsigned int desc_len) 1799 { 1800 return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen, 1801 desc, desc_len, false); 1802 } 1803 1804 static int smb_direct_rdma_read(struct ksmbd_transport *t, 1805 void *buf, unsigned int buflen, 1806 struct smbdirect_buffer_descriptor_v1 *desc, 1807 unsigned int desc_len) 1808 { 1809 return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen, 1810 desc, desc_len, true); 1811 } 1812 1813 static void smb_direct_disconnect(struct ksmbd_transport *t) 1814 { 1815 struct smb_direct_transport *st = SMBD_TRANS(t); 1816 struct smbdirect_socket *sc = &st->socket; 1817 1818 ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id); 1819 1820 free_transport(st); 1821 } 1822 1823 static void smb_direct_shutdown(struct ksmbd_transport *t) 1824 { 1825 struct smb_direct_transport *st = SMBD_TRANS(t); 1826 struct smbdirect_socket *sc = &st->socket; 1827 1828 ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id); 1829 1830 smb_direct_disconnect_rdma_work(&sc->disconnect_work); 1831 } 1832 1833 static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, 1834 struct rdma_cm_event *event) 1835 { 1836 struct smbdirect_socket *sc = cm_id->context; 1837 unsigned long flags; 1838 1839 ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n", 1840 cm_id, rdma_event_msg(event->event), event->event); 1841 1842 switch (event->event) { 1843 case RDMA_CM_EVENT_ESTABLISHED: { 1844 /* 1845 * Some drivers (at least mlx5_ib and irdma in roce mode) 1846 * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED, 1847 * we need to adjust our expectation in that case. 1848 * 1849 * If smb_direct_negotiate_recv_done was called first 1850 * it initialized sc->connect.work only for us to 1851 * start, so that we turned into 1852 * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, before 1853 * smb_direct_negotiate_recv_work() runs. 1854 * 1855 * If smb_direct_negotiate_recv_done didn't happen 1856 * yet. sc->connect.work is still be disabled and 1857 * queue_work() is a no-op. 1858 */ 1859 if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING)) 1860 break; 1861 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; 1862 spin_lock_irqsave(&sc->connect.lock, flags); 1863 if (!sc->first_error) 1864 queue_work(sc->workqueue, &sc->connect.work); 1865 spin_unlock_irqrestore(&sc->connect.lock, flags); 1866 wake_up(&sc->status_wait); 1867 break; 1868 } 1869 case RDMA_CM_EVENT_DEVICE_REMOVAL: 1870 case RDMA_CM_EVENT_DISCONNECTED: { 1871 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 1872 smb_direct_disconnect_rdma_work(&sc->disconnect_work); 1873 if (sc->ib.qp) 1874 ib_drain_qp(sc->ib.qp); 1875 break; 1876 } 1877 case RDMA_CM_EVENT_CONNECT_ERROR: { 1878 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 1879 smb_direct_disconnect_rdma_work(&sc->disconnect_work); 1880 break; 1881 } 1882 default: 1883 pr_err("Unexpected RDMA CM event. cm_id=%p, event=%s (%d)\n", 1884 cm_id, rdma_event_msg(event->event), 1885 event->event); 1886 break; 1887 } 1888 return 0; 1889 } 1890 1891 static void smb_direct_qpair_handler(struct ib_event *event, void *context) 1892 { 1893 struct smbdirect_socket *sc = context; 1894 1895 ksmbd_debug(RDMA, "Received QP event. cm_id=%p, event=%s (%d)\n", 1896 sc->rdma.cm_id, ib_event_msg(event->event), event->event); 1897 1898 switch (event->event) { 1899 case IB_EVENT_CQ_ERR: 1900 case IB_EVENT_QP_FATAL: 1901 smb_direct_disconnect_rdma_connection(sc); 1902 break; 1903 default: 1904 break; 1905 } 1906 } 1907 1908 static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc, 1909 int failed) 1910 { 1911 struct smbdirect_socket_parameters *sp = &sc->parameters; 1912 struct smbdirect_send_io *sendmsg; 1913 struct smbdirect_negotiate_resp *resp; 1914 int ret; 1915 1916 sendmsg = smb_direct_alloc_sendmsg(sc); 1917 if (IS_ERR(sendmsg)) 1918 return -ENOMEM; 1919 1920 resp = (struct smbdirect_negotiate_resp *)sendmsg->packet; 1921 if (failed) { 1922 memset(resp, 0, sizeof(*resp)); 1923 resp->min_version = SMB_DIRECT_VERSION_LE; 1924 resp->max_version = SMB_DIRECT_VERSION_LE; 1925 resp->status = STATUS_NOT_SUPPORTED; 1926 1927 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; 1928 } else { 1929 resp->status = STATUS_SUCCESS; 1930 resp->min_version = SMB_DIRECT_VERSION_LE; 1931 resp->max_version = SMB_DIRECT_VERSION_LE; 1932 resp->negotiated_version = SMB_DIRECT_VERSION_LE; 1933 resp->reserved = 0; 1934 resp->credits_requested = 1935 cpu_to_le16(sp->send_credit_target); 1936 resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); 1937 resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size); 1938 resp->preferred_send_size = cpu_to_le32(sp->max_send_size); 1939 resp->max_receive_size = cpu_to_le32(sp->max_recv_size); 1940 resp->max_fragmented_size = 1941 cpu_to_le32(sp->max_fragmented_recv_size); 1942 1943 sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; 1944 sc->status = SMBDIRECT_SOCKET_CONNECTED; 1945 } 1946 1947 sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, 1948 (void *)resp, sizeof(*resp), 1949 DMA_TO_DEVICE); 1950 ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); 1951 if (ret) { 1952 smb_direct_free_sendmsg(sc, sendmsg); 1953 return ret; 1954 } 1955 1956 sendmsg->num_sge = 1; 1957 sendmsg->sge[0].length = sizeof(*resp); 1958 sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; 1959 1960 ret = post_sendmsg(sc, NULL, sendmsg); 1961 if (ret) { 1962 smb_direct_free_sendmsg(sc, sendmsg); 1963 return ret; 1964 } 1965 1966 wait_event(sc->send_io.pending.zero_wait_queue, 1967 atomic_read(&sc->send_io.pending.count) == 0 || 1968 sc->status != SMBDIRECT_SOCKET_CONNECTED); 1969 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 1970 return -ENOTCONN; 1971 1972 return 0; 1973 } 1974 1975 static int smb_direct_accept_client(struct smbdirect_socket *sc) 1976 { 1977 struct smbdirect_socket_parameters *sp = &sc->parameters; 1978 struct rdma_conn_param conn_param; 1979 __be32 ird_ord_hdr[2]; 1980 int ret; 1981 1982 /* 1983 * smb_direct_handle_connect_request() 1984 * already negotiated sp->initiator_depth 1985 * and sp->responder_resources 1986 */ 1987 memset(&conn_param, 0, sizeof(conn_param)); 1988 conn_param.initiator_depth = sp->initiator_depth; 1989 conn_param.responder_resources = sp->responder_resources; 1990 1991 if (sc->rdma.legacy_iwarp) { 1992 ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); 1993 ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); 1994 conn_param.private_data = ird_ord_hdr; 1995 conn_param.private_data_len = sizeof(ird_ord_hdr); 1996 } else { 1997 conn_param.private_data = NULL; 1998 conn_param.private_data_len = 0; 1999 } 2000 conn_param.retry_count = SMB_DIRECT_CM_RETRY; 2001 conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY; 2002 conn_param.flow_control = 0; 2003 2004 /* 2005 * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING 2006 * so that the timer will cause a disconnect. 2007 */ 2008 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; 2009 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 2010 msecs_to_jiffies(sp->negotiate_timeout_msec)); 2011 2012 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); 2013 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; 2014 ret = rdma_accept(sc->rdma.cm_id, &conn_param); 2015 if (ret) { 2016 pr_err("error at rdma_accept: %d\n", ret); 2017 return ret; 2018 } 2019 return 0; 2020 } 2021 2022 static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) 2023 { 2024 struct smbdirect_recv_io *recvmsg; 2025 bool recv_posted = false; 2026 int ret; 2027 2028 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); 2029 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; 2030 2031 sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ; 2032 2033 recvmsg = get_free_recvmsg(sc); 2034 if (!recvmsg) 2035 return -ENOMEM; 2036 recvmsg->cqe.done = smb_direct_negotiate_recv_done; 2037 2038 ret = smb_direct_post_recv(sc, recvmsg); 2039 if (ret) { 2040 pr_err("Can't post recv: %d\n", ret); 2041 goto out_err; 2042 } 2043 recv_posted = true; 2044 2045 ret = smb_direct_accept_client(sc); 2046 if (ret) { 2047 pr_err("Can't accept client\n"); 2048 goto out_err; 2049 } 2050 2051 return 0; 2052 out_err: 2053 /* 2054 * If the recv was never posted, return it to the free list. 2055 * If it was posted, leave it alone so disconnect teardown can 2056 * drain the QP and complete it (flush) and the completion path 2057 * will unmap it exactly once. 2058 */ 2059 if (!recv_posted) 2060 put_recvmsg(sc, recvmsg); 2061 return ret; 2062 } 2063 2064 static int smb_direct_init_params(struct smbdirect_socket *sc) 2065 { 2066 struct smbdirect_socket_parameters *sp = &sc->parameters; 2067 int max_send_sges; 2068 unsigned int maxpages; 2069 2070 /* need 3 more sge. because a SMB_DIRECT header, SMB2 header, 2071 * SMB2 response could be mapped. 2072 */ 2073 max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3; 2074 if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) { 2075 pr_err("max_send_size %d is too large\n", sp->max_send_size); 2076 return -EINVAL; 2077 } 2078 2079 atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target); 2080 2081 maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE); 2082 sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev, 2083 sc->rdma.cm_id->port_num, 2084 maxpages); 2085 sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max); 2086 /* add one extra in order to handle unaligned pages */ 2087 sc->rw_io.credits.max += 1; 2088 2089 sc->recv_io.credits.target = 1; 2090 2091 atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); 2092 2093 return 0; 2094 } 2095 2096 static void smb_direct_destroy_pools(struct smbdirect_socket *sc) 2097 { 2098 struct smbdirect_recv_io *recvmsg; 2099 2100 while ((recvmsg = get_free_recvmsg(sc))) 2101 mempool_free(recvmsg, sc->recv_io.mem.pool); 2102 2103 mempool_destroy(sc->recv_io.mem.pool); 2104 sc->recv_io.mem.pool = NULL; 2105 2106 kmem_cache_destroy(sc->recv_io.mem.cache); 2107 sc->recv_io.mem.cache = NULL; 2108 2109 mempool_destroy(sc->send_io.mem.pool); 2110 sc->send_io.mem.pool = NULL; 2111 2112 kmem_cache_destroy(sc->send_io.mem.cache); 2113 sc->send_io.mem.cache = NULL; 2114 } 2115 2116 static int smb_direct_create_pools(struct smbdirect_socket *sc) 2117 { 2118 struct smbdirect_socket_parameters *sp = &sc->parameters; 2119 char name[80]; 2120 int i; 2121 struct smbdirect_recv_io *recvmsg; 2122 2123 snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", sc); 2124 sc->send_io.mem.cache = kmem_cache_create(name, 2125 sizeof(struct smbdirect_send_io) + 2126 sizeof(struct smbdirect_negotiate_resp), 2127 0, SLAB_HWCACHE_ALIGN, NULL); 2128 if (!sc->send_io.mem.cache) 2129 return -ENOMEM; 2130 2131 sc->send_io.mem.pool = mempool_create(sp->send_credit_target, 2132 mempool_alloc_slab, mempool_free_slab, 2133 sc->send_io.mem.cache); 2134 if (!sc->send_io.mem.pool) 2135 goto err; 2136 2137 snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc); 2138 sc->recv_io.mem.cache = kmem_cache_create(name, 2139 sizeof(struct smbdirect_recv_io) + 2140 sp->max_recv_size, 2141 0, SLAB_HWCACHE_ALIGN, NULL); 2142 if (!sc->recv_io.mem.cache) 2143 goto err; 2144 2145 sc->recv_io.mem.pool = 2146 mempool_create(sp->recv_credit_max, mempool_alloc_slab, 2147 mempool_free_slab, sc->recv_io.mem.cache); 2148 if (!sc->recv_io.mem.pool) 2149 goto err; 2150 2151 for (i = 0; i < sp->recv_credit_max; i++) { 2152 recvmsg = mempool_alloc(sc->recv_io.mem.pool, KSMBD_DEFAULT_GFP); 2153 if (!recvmsg) 2154 goto err; 2155 recvmsg->socket = sc; 2156 recvmsg->sge.length = 0; 2157 list_add(&recvmsg->list, &sc->recv_io.free.list); 2158 } 2159 2160 return 0; 2161 err: 2162 smb_direct_destroy_pools(sc); 2163 return -ENOMEM; 2164 } 2165 2166 static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, const struct ib_qp_init_attr *attr) 2167 { 2168 /* 2169 * This could be split out of rdma_rw_init_qp() 2170 * and be a helper function next to rdma_rw_mr_factor() 2171 * 2172 * We can't check unlikely(rdma_rw_force_mr) here, 2173 * but that is most likely 0 anyway. 2174 */ 2175 u32 factor; 2176 2177 WARN_ON_ONCE(attr->port_num == 0); 2178 2179 /* 2180 * Each context needs at least one RDMA READ or WRITE WR. 2181 * 2182 * For some hardware we might need more, eventually we should ask the 2183 * HCA driver for a multiplier here. 2184 */ 2185 factor = 1; 2186 2187 /* 2188 * If the device needs MRs to perform RDMA READ or WRITE operations, 2189 * we'll need two additional MRs for the registrations and the 2190 * invalidation. 2191 */ 2192 if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd) 2193 factor += 2; /* inv + reg */ 2194 2195 return factor * attr->cap.max_rdma_ctxs; 2196 } 2197 2198 static int smb_direct_create_qpair(struct smbdirect_socket *sc) 2199 { 2200 struct smbdirect_socket_parameters *sp = &sc->parameters; 2201 int ret; 2202 struct ib_qp_cap qp_cap; 2203 struct ib_qp_init_attr qp_attr; 2204 u32 max_send_wr; 2205 u32 rdma_send_wr; 2206 2207 /* 2208 * Note that {rdma,ib}_create_qp() will call 2209 * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0. 2210 * It will adjust cap->max_send_wr to the required 2211 * number of additional WRs for the RDMA RW operations. 2212 * It will cap cap->max_send_wr to the device limit. 2213 * 2214 * +1 for ib_drain_qp 2215 */ 2216 qp_cap.max_send_wr = sp->send_credit_target + 1; 2217 qp_cap.max_recv_wr = sp->recv_credit_max + 1; 2218 qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; 2219 qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; 2220 qp_cap.max_inline_data = 0; 2221 qp_cap.max_rdma_ctxs = sc->rw_io.credits.max; 2222 2223 /* 2224 * Find out the number of max_send_wr 2225 * after rdma_rw_init_qp() adjusted it. 2226 * 2227 * We only do it on a temporary variable, 2228 * as rdma_create_qp() will trigger 2229 * rdma_rw_init_qp() again. 2230 */ 2231 memset(&qp_attr, 0, sizeof(qp_attr)); 2232 qp_attr.cap = qp_cap; 2233 qp_attr.port_num = sc->rdma.cm_id->port_num; 2234 rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr); 2235 max_send_wr = qp_cap.max_send_wr + rdma_send_wr; 2236 2237 if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe || 2238 qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) { 2239 pr_err("Possible CQE overrun: max_send_wr %d\n", 2240 qp_cap.max_send_wr); 2241 pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", 2242 IB_DEVICE_NAME_MAX, 2243 sc->ib.dev->name, 2244 sc->ib.dev->attrs.max_cqe, 2245 sc->ib.dev->attrs.max_qp_wr); 2246 pr_err("consider lowering send_credit_target = %d\n", 2247 sp->send_credit_target); 2248 return -EINVAL; 2249 } 2250 2251 if (qp_cap.max_rdma_ctxs && 2252 (max_send_wr >= sc->ib.dev->attrs.max_cqe || 2253 max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) { 2254 pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n", 2255 rdma_send_wr, qp_cap.max_send_wr, max_send_wr); 2256 pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", 2257 IB_DEVICE_NAME_MAX, 2258 sc->ib.dev->name, 2259 sc->ib.dev->attrs.max_cqe, 2260 sc->ib.dev->attrs.max_qp_wr); 2261 pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n", 2262 sp->send_credit_target, qp_cap.max_rdma_ctxs); 2263 return -EINVAL; 2264 } 2265 2266 if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe || 2267 qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) { 2268 pr_err("Possible CQE overrun: max_recv_wr %d\n", 2269 qp_cap.max_recv_wr); 2270 pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", 2271 IB_DEVICE_NAME_MAX, 2272 sc->ib.dev->name, 2273 sc->ib.dev->attrs.max_cqe, 2274 sc->ib.dev->attrs.max_qp_wr); 2275 pr_err("consider lowering receive_credit_max = %d\n", 2276 sp->recv_credit_max); 2277 return -EINVAL; 2278 } 2279 2280 if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge || 2281 qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) { 2282 pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", 2283 IB_DEVICE_NAME_MAX, 2284 sc->ib.dev->name, 2285 sc->ib.dev->attrs.max_send_sge, 2286 sc->ib.dev->attrs.max_recv_sge); 2287 return -EINVAL; 2288 } 2289 2290 sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); 2291 if (IS_ERR(sc->ib.pd)) { 2292 pr_err("Can't create RDMA PD\n"); 2293 ret = PTR_ERR(sc->ib.pd); 2294 sc->ib.pd = NULL; 2295 return ret; 2296 } 2297 2298 sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, 2299 max_send_wr, 2300 IB_POLL_WORKQUEUE); 2301 if (IS_ERR(sc->ib.send_cq)) { 2302 pr_err("Can't create RDMA send CQ\n"); 2303 ret = PTR_ERR(sc->ib.send_cq); 2304 sc->ib.send_cq = NULL; 2305 goto err; 2306 } 2307 2308 sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc, 2309 qp_cap.max_recv_wr, 2310 IB_POLL_WORKQUEUE); 2311 if (IS_ERR(sc->ib.recv_cq)) { 2312 pr_err("Can't create RDMA recv CQ\n"); 2313 ret = PTR_ERR(sc->ib.recv_cq); 2314 sc->ib.recv_cq = NULL; 2315 goto err; 2316 } 2317 2318 /* 2319 * We reset completely here! 2320 * As the above use was just temporary 2321 * to calc max_send_wr and rdma_send_wr. 2322 * 2323 * rdma_create_qp() will trigger rdma_rw_init_qp() 2324 * again if max_rdma_ctxs is not 0. 2325 */ 2326 memset(&qp_attr, 0, sizeof(qp_attr)); 2327 qp_attr.event_handler = smb_direct_qpair_handler; 2328 qp_attr.qp_context = sc; 2329 qp_attr.cap = qp_cap; 2330 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 2331 qp_attr.qp_type = IB_QPT_RC; 2332 qp_attr.send_cq = sc->ib.send_cq; 2333 qp_attr.recv_cq = sc->ib.recv_cq; 2334 qp_attr.port_num = ~0; 2335 2336 ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr); 2337 if (ret) { 2338 pr_err("Can't create RDMA QP: %d\n", ret); 2339 goto err; 2340 } 2341 2342 sc->ib.qp = sc->rdma.cm_id->qp; 2343 sc->rdma.cm_id->event_handler = smb_direct_cm_handler; 2344 2345 return 0; 2346 err: 2347 if (sc->ib.qp) { 2348 sc->ib.qp = NULL; 2349 rdma_destroy_qp(sc->rdma.cm_id); 2350 } 2351 if (sc->ib.recv_cq) { 2352 ib_destroy_cq(sc->ib.recv_cq); 2353 sc->ib.recv_cq = NULL; 2354 } 2355 if (sc->ib.send_cq) { 2356 ib_destroy_cq(sc->ib.send_cq); 2357 sc->ib.send_cq = NULL; 2358 } 2359 if (sc->ib.pd) { 2360 ib_dealloc_pd(sc->ib.pd); 2361 sc->ib.pd = NULL; 2362 } 2363 return ret; 2364 } 2365 2366 static int smb_direct_prepare(struct ksmbd_transport *t) 2367 { 2368 struct smb_direct_transport *st = SMBD_TRANS(t); 2369 struct smbdirect_socket *sc = &st->socket; 2370 struct smbdirect_socket_parameters *sp = &sc->parameters; 2371 struct smbdirect_recv_io *recvmsg; 2372 struct smbdirect_negotiate_req *req; 2373 unsigned long flags; 2374 int ret; 2375 2376 /* 2377 * We are waiting to pass the following states: 2378 * 2379 * SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED 2380 * SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING 2381 * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED 2382 * 2383 * To finally get to SMBDIRECT_SOCKET_NEGOTIATE_RUNNING 2384 * in order to continue below. 2385 * 2386 * Everything else is unexpected and an error. 2387 */ 2388 ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); 2389 ret = wait_event_interruptible_timeout(sc->status_wait, 2390 sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED && 2391 sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING && 2392 sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, 2393 msecs_to_jiffies(sp->negotiate_timeout_msec)); 2394 if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING) 2395 return ret < 0 ? ret : -ETIMEDOUT; 2396 2397 recvmsg = get_first_reassembly(sc); 2398 if (!recvmsg) 2399 return -ECONNABORTED; 2400 2401 ret = smb_direct_check_recvmsg(recvmsg); 2402 if (ret) 2403 goto put; 2404 2405 req = (struct smbdirect_negotiate_req *)recvmsg->packet; 2406 sp->max_recv_size = min_t(int, sp->max_recv_size, 2407 le32_to_cpu(req->preferred_send_size)); 2408 sp->max_send_size = min_t(int, sp->max_send_size, 2409 le32_to_cpu(req->max_receive_size)); 2410 sp->max_fragmented_send_size = 2411 le32_to_cpu(req->max_fragmented_size); 2412 sp->max_fragmented_recv_size = 2413 (sp->recv_credit_max * sp->max_recv_size) / 2; 2414 sc->recv_io.credits.target = le16_to_cpu(req->credits_requested); 2415 sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); 2416 sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1); 2417 2418 put: 2419 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 2420 sc->recv_io.reassembly.queue_length--; 2421 list_del(&recvmsg->list); 2422 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 2423 put_recvmsg(sc, recvmsg); 2424 2425 if (ret == -ECONNABORTED) 2426 return ret; 2427 2428 if (ret) 2429 goto respond; 2430 2431 /* 2432 * We negotiated with success, so we need to refill the recv queue. 2433 * We do that with sc->idle.immediate_work still being disabled 2434 * via smbdirect_socket_init(), so that queue_work(sc->workqueue, 2435 * &sc->idle.immediate_work) in smb_direct_post_recv_credits() 2436 * is a no-op. 2437 * 2438 * The message that grants the credits to the client is 2439 * the negotiate response. 2440 */ 2441 INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits); 2442 smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); 2443 if (unlikely(sc->first_error)) 2444 return sc->first_error; 2445 INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); 2446 2447 respond: 2448 ret = smb_direct_send_negotiate_response(sc, ret); 2449 2450 return ret; 2451 } 2452 2453 static int smb_direct_connect(struct smbdirect_socket *sc) 2454 { 2455 struct smbdirect_recv_io *recv_io; 2456 int ret; 2457 2458 ret = smb_direct_init_params(sc); 2459 if (ret) { 2460 pr_err("Can't configure RDMA parameters\n"); 2461 return ret; 2462 } 2463 2464 ret = smb_direct_create_pools(sc); 2465 if (ret) { 2466 pr_err("Can't init RDMA pool: %d\n", ret); 2467 return ret; 2468 } 2469 2470 list_for_each_entry(recv_io, &sc->recv_io.free.list, list) 2471 recv_io->cqe.done = recv_done; 2472 2473 ret = smb_direct_create_qpair(sc); 2474 if (ret) { 2475 pr_err("Can't accept RDMA client: %d\n", ret); 2476 return ret; 2477 } 2478 2479 ret = smb_direct_prepare_negotiation(sc); 2480 if (ret) { 2481 pr_err("Can't negotiate: %d\n", ret); 2482 return ret; 2483 } 2484 return 0; 2485 } 2486 2487 static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) 2488 { 2489 if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) 2490 return false; 2491 if (attrs->max_fast_reg_page_list_len == 0) 2492 return false; 2493 return true; 2494 } 2495 2496 static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, 2497 struct rdma_cm_event *event) 2498 { 2499 struct smb_direct_transport *t; 2500 struct smbdirect_socket *sc; 2501 struct smbdirect_socket_parameters *sp; 2502 struct task_struct *handler; 2503 u8 peer_initiator_depth; 2504 u8 peer_responder_resources; 2505 int ret; 2506 2507 if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { 2508 ksmbd_debug(RDMA, 2509 "Fast Registration Work Requests is not supported. device capabilities=%llx\n", 2510 new_cm_id->device->attrs.device_cap_flags); 2511 return -EPROTONOSUPPORT; 2512 } 2513 2514 t = alloc_transport(new_cm_id); 2515 if (!t) 2516 return -ENOMEM; 2517 sc = &t->socket; 2518 sp = &sc->parameters; 2519 2520 peer_initiator_depth = event->param.conn.initiator_depth; 2521 peer_responder_resources = event->param.conn.responder_resources; 2522 if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) && 2523 event->param.conn.private_data_len == 8) { 2524 /* 2525 * Legacy clients with only iWarp MPA v1 support 2526 * need a private blob in order to negotiate 2527 * the IRD/ORD values. 2528 */ 2529 const __be32 *ird_ord_hdr = event->param.conn.private_data; 2530 u32 ird32 = be32_to_cpu(ird_ord_hdr[0]); 2531 u32 ord32 = be32_to_cpu(ird_ord_hdr[1]); 2532 2533 /* 2534 * cifs.ko sends the legacy IRD/ORD negotiation 2535 * event if iWarp MPA v2 was used. 2536 * 2537 * Here we check that the values match and only 2538 * mark the client as legacy if they don't match. 2539 */ 2540 if ((u32)event->param.conn.initiator_depth != ird32 || 2541 (u32)event->param.conn.responder_resources != ord32) { 2542 /* 2543 * There are broken clients (old cifs.ko) 2544 * using little endian and also 2545 * struct rdma_conn_param only uses u8 2546 * for initiator_depth and responder_resources, 2547 * so we truncate the value to U8_MAX. 2548 * 2549 * smb_direct_accept_client() will then 2550 * do the real negotiation in order to 2551 * select the minimum between client and 2552 * server. 2553 */ 2554 ird32 = min_t(u32, ird32, U8_MAX); 2555 ord32 = min_t(u32, ord32, U8_MAX); 2556 2557 sc->rdma.legacy_iwarp = true; 2558 peer_initiator_depth = (u8)ird32; 2559 peer_responder_resources = (u8)ord32; 2560 } 2561 } 2562 2563 /* 2564 * First set what the we as server are able to support 2565 */ 2566 sp->initiator_depth = min_t(u8, sp->initiator_depth, 2567 new_cm_id->device->attrs.max_qp_rd_atom); 2568 2569 /* 2570 * negotiate the value by using the minimum 2571 * between client and server if the client provided 2572 * non 0 values. 2573 */ 2574 if (peer_initiator_depth != 0) 2575 sp->initiator_depth = min_t(u8, sp->initiator_depth, 2576 peer_initiator_depth); 2577 if (peer_responder_resources != 0) 2578 sp->responder_resources = min_t(u8, sp->responder_resources, 2579 peer_responder_resources); 2580 2581 ret = smb_direct_connect(sc); 2582 if (ret) 2583 goto out_err; 2584 2585 handler = kthread_run(ksmbd_conn_handler_loop, 2586 KSMBD_TRANS(t)->conn, "ksmbd:r%u", 2587 smb_direct_port); 2588 if (IS_ERR(handler)) { 2589 ret = PTR_ERR(handler); 2590 pr_err("Can't start thread\n"); 2591 goto out_err; 2592 } 2593 2594 return 0; 2595 out_err: 2596 free_transport(t); 2597 return ret; 2598 } 2599 2600 static int smb_direct_listen_handler(struct rdma_cm_id *cm_id, 2601 struct rdma_cm_event *event) 2602 { 2603 switch (event->event) { 2604 case RDMA_CM_EVENT_CONNECT_REQUEST: { 2605 int ret = smb_direct_handle_connect_request(cm_id, event); 2606 2607 if (ret) { 2608 pr_err("Can't create transport: %d\n", ret); 2609 return ret; 2610 } 2611 2612 ksmbd_debug(RDMA, "Received connection request. cm_id=%p\n", 2613 cm_id); 2614 break; 2615 } 2616 default: 2617 pr_err("Unexpected listen event. cm_id=%p, event=%s (%d)\n", 2618 cm_id, rdma_event_msg(event->event), event->event); 2619 break; 2620 } 2621 return 0; 2622 } 2623 2624 static int smb_direct_listen(int port) 2625 { 2626 int ret; 2627 struct rdma_cm_id *cm_id; 2628 struct sockaddr_in sin = { 2629 .sin_family = AF_INET, 2630 .sin_addr.s_addr = htonl(INADDR_ANY), 2631 .sin_port = htons(port), 2632 }; 2633 2634 cm_id = rdma_create_id(&init_net, smb_direct_listen_handler, 2635 &smb_direct_listener, RDMA_PS_TCP, IB_QPT_RC); 2636 if (IS_ERR(cm_id)) { 2637 pr_err("Can't create cm id: %ld\n", PTR_ERR(cm_id)); 2638 return PTR_ERR(cm_id); 2639 } 2640 2641 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 2642 if (ret) { 2643 pr_err("Can't bind: %d\n", ret); 2644 goto err; 2645 } 2646 2647 smb_direct_listener.cm_id = cm_id; 2648 2649 ret = rdma_listen(cm_id, 10); 2650 if (ret) { 2651 pr_err("Can't listen: %d\n", ret); 2652 goto err; 2653 } 2654 return 0; 2655 err: 2656 smb_direct_listener.cm_id = NULL; 2657 rdma_destroy_id(cm_id); 2658 return ret; 2659 } 2660 2661 static int smb_direct_ib_client_add(struct ib_device *ib_dev) 2662 { 2663 struct smb_direct_device *smb_dev; 2664 2665 /* Set 5445 port if device type is iWARP(No IB) */ 2666 if (ib_dev->node_type != RDMA_NODE_IB_CA) 2667 smb_direct_port = SMB_DIRECT_PORT_IWARP; 2668 2669 if (!rdma_frwr_is_supported(&ib_dev->attrs)) 2670 return 0; 2671 2672 smb_dev = kzalloc(sizeof(*smb_dev), KSMBD_DEFAULT_GFP); 2673 if (!smb_dev) 2674 return -ENOMEM; 2675 smb_dev->ib_dev = ib_dev; 2676 2677 write_lock(&smb_direct_device_lock); 2678 list_add(&smb_dev->list, &smb_direct_device_list); 2679 write_unlock(&smb_direct_device_lock); 2680 2681 ksmbd_debug(RDMA, "ib device added: name %s\n", ib_dev->name); 2682 return 0; 2683 } 2684 2685 static void smb_direct_ib_client_remove(struct ib_device *ib_dev, 2686 void *client_data) 2687 { 2688 struct smb_direct_device *smb_dev, *tmp; 2689 2690 write_lock(&smb_direct_device_lock); 2691 list_for_each_entry_safe(smb_dev, tmp, &smb_direct_device_list, list) { 2692 if (smb_dev->ib_dev == ib_dev) { 2693 list_del(&smb_dev->list); 2694 kfree(smb_dev); 2695 break; 2696 } 2697 } 2698 write_unlock(&smb_direct_device_lock); 2699 } 2700 2701 static struct ib_client smb_direct_ib_client = { 2702 .name = "ksmbd_smb_direct_ib", 2703 .add = smb_direct_ib_client_add, 2704 .remove = smb_direct_ib_client_remove, 2705 }; 2706 2707 int ksmbd_rdma_init(void) 2708 { 2709 int ret; 2710 2711 smb_direct_listener.cm_id = NULL; 2712 2713 ret = ib_register_client(&smb_direct_ib_client); 2714 if (ret) { 2715 pr_err("failed to ib_register_client\n"); 2716 return ret; 2717 } 2718 2719 /* When a client is running out of send credits, the credits are 2720 * granted by the server's sending a packet using this queue. 2721 * This avoids the situation that a clients cannot send packets 2722 * for lack of credits 2723 */ 2724 smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq", 2725 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU, 2726 0); 2727 if (!smb_direct_wq) 2728 return -ENOMEM; 2729 2730 ret = smb_direct_listen(smb_direct_port); 2731 if (ret) { 2732 destroy_workqueue(smb_direct_wq); 2733 smb_direct_wq = NULL; 2734 pr_err("Can't listen: %d\n", ret); 2735 return ret; 2736 } 2737 2738 ksmbd_debug(RDMA, "init RDMA listener. cm_id=%p\n", 2739 smb_direct_listener.cm_id); 2740 return 0; 2741 } 2742 2743 void ksmbd_rdma_stop_listening(void) 2744 { 2745 if (!smb_direct_listener.cm_id) 2746 return; 2747 2748 ib_unregister_client(&smb_direct_ib_client); 2749 rdma_destroy_id(smb_direct_listener.cm_id); 2750 2751 smb_direct_listener.cm_id = NULL; 2752 } 2753 2754 void ksmbd_rdma_destroy(void) 2755 { 2756 if (smb_direct_wq) { 2757 destroy_workqueue(smb_direct_wq); 2758 smb_direct_wq = NULL; 2759 } 2760 } 2761 2762 static bool ksmbd_find_rdma_capable_netdev(struct net_device *netdev) 2763 { 2764 struct smb_direct_device *smb_dev; 2765 int i; 2766 bool rdma_capable = false; 2767 2768 read_lock(&smb_direct_device_lock); 2769 list_for_each_entry(smb_dev, &smb_direct_device_list, list) { 2770 for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) { 2771 struct net_device *ndev; 2772 2773 ndev = ib_device_get_netdev(smb_dev->ib_dev, i + 1); 2774 if (!ndev) 2775 continue; 2776 2777 if (ndev == netdev) { 2778 dev_put(ndev); 2779 rdma_capable = true; 2780 goto out; 2781 } 2782 dev_put(ndev); 2783 } 2784 } 2785 out: 2786 read_unlock(&smb_direct_device_lock); 2787 2788 if (rdma_capable == false) { 2789 struct ib_device *ibdev; 2790 2791 ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); 2792 if (ibdev) { 2793 rdma_capable = rdma_frwr_is_supported(&ibdev->attrs); 2794 ib_device_put(ibdev); 2795 } 2796 } 2797 2798 ksmbd_debug(RDMA, "netdev(%s) rdma capable : %s\n", 2799 netdev->name, str_true_false(rdma_capable)); 2800 2801 return rdma_capable; 2802 } 2803 2804 bool ksmbd_rdma_capable_netdev(struct net_device *netdev) 2805 { 2806 struct net_device *lower_dev; 2807 struct list_head *iter; 2808 2809 if (ksmbd_find_rdma_capable_netdev(netdev)) 2810 return true; 2811 2812 /* check if netdev is bridge or VLAN */ 2813 if (netif_is_bridge_master(netdev) || 2814 netdev->priv_flags & IFF_802_1Q_VLAN) 2815 netdev_for_each_lower_dev(netdev, lower_dev, iter) 2816 if (ksmbd_find_rdma_capable_netdev(lower_dev)) 2817 return true; 2818 2819 /* check if netdev is IPoIB safely without layer violation */ 2820 if (netdev->type == ARPHRD_INFINIBAND) 2821 return true; 2822 2823 return false; 2824 } 2825 2826 static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { 2827 .prepare = smb_direct_prepare, 2828 .disconnect = smb_direct_disconnect, 2829 .shutdown = smb_direct_shutdown, 2830 .writev = smb_direct_writev, 2831 .read = smb_direct_read, 2832 .rdma_read = smb_direct_rdma_read, 2833 .rdma_write = smb_direct_rdma_write, 2834 .free_transport = smb_direct_free_transport, 2835 }; 2836