// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *   Copyright (C) 2017, Microsoft Corporation.
 *   Copyright (C) 2018, LG Electronics.
 *
 *   Author(s): Long Li <longli@microsoft.com>,
 *		Hyunchul Lee <hyc.lee@gmail.com>
 */

#define SUBMOD_NAME	"smb_direct"

#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/highmem.h>
#include <linux/scatterlist.h>
#include <linux/string_choices.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#define __SMBDIRECT_SOCKET_DISCONNECT(__sc) smb_direct_disconnect_rdma_connection(__sc)

#include "glob.h"
#include "connection.h"
#include "smb_common.h"
#include "../common/smb2status.h"
#include "../common/smbdirect/smbdirect.h"
#include "../common/smbdirect/smbdirect_pdu.h"
#include "../common/smbdirect/smbdirect_socket.h"
#include "transport_rdma.h"

#define SMB_DIRECT_PORT_IWARP		5445
#define SMB_DIRECT_PORT_INFINIBAND	445

#define SMB_DIRECT_VERSION_LE		cpu_to_le16(SMBDIRECT_V1)

/* SMB_DIRECT negotiation timeout (for the server) in seconds */
#define SMB_DIRECT_NEGOTIATE_TIMEOUT		5

/* The interval to send a keepalive message to the peer in seconds */
#define SMB_DIRECT_KEEPALIVE_SEND_INTERVAL	120

/* The timeout to wait for a keepalive message from the peer in seconds */
#define SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT	5

/*
 * Default maximum number of RDMA read/write outstanding on this connection
 * This value is possibly decreased during QP creation on hardware limit
 */
#define SMB_DIRECT_CM_INITIATOR_DEPTH		8

/* Maximum number of retries on data transfer operations */
#define SMB_DIRECT_CM_RETRY			6
/* No need to retry on Receiver Not Ready since SMB_DIRECT manages credits */
#define SMB_DIRECT_CM_RNR_RETRY		0

/*
 * User configurable initial values per SMB_DIRECT transport connection
 * as defined in [MS-SMBD] 3.1.1.1
 * Those may change after a SMB_DIRECT negotiation
 */

/* The local peer's maximum number of credits to grant to the peer */
static int smb_direct_receive_credit_max = 255;

/* The remote peer's credit request of local peer */
static int smb_direct_send_credit_target = 255;

/* The maximum single message size that can be sent to the remote peer */
static int smb_direct_max_send_size = 1364;

/*
 * The maximum fragmented upper-layer payload receive size supported
 *
 * Assume max_payload_per_credit is
 *   smb_direct_max_receive_size - 24 = 1340
 *
 * The maximum number would be
 *   smb_direct_receive_credit_max * max_payload_per_credit
 *
 *   1340 * 255 = 341700 (0x536C4)
 *
 * The minimum value from the spec is 131072 (0x20000)
 *
 * For now we use the logic we used before:
 *   (1364 * 255) / 2 = 173910 (0x2A756)
 */
static int smb_direct_max_fragmented_recv_size = (1364 * 255) / 2;

/* The maximum single-message size which can be received */
static int smb_direct_max_receive_size = 1364;

static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;

static LIST_HEAD(smb_direct_device_list);
static DEFINE_RWLOCK(smb_direct_device_lock);

struct smb_direct_device {
	struct ib_device	*ib_dev;
	struct list_head	list;
};

static struct smb_direct_listener {
	int			port;
	struct rdma_cm_id	*cm_id;
} smb_direct_ib_listener, smb_direct_iw_listener;

static struct workqueue_struct *smb_direct_wq;

struct
smb_direct_transport { 112 struct ksmbd_transport transport; 113 114 struct smbdirect_socket socket; 115 }; 116 117 #define KSMBD_TRANS(t) (&(t)->transport) 118 #define SMBD_TRANS(t) (container_of(t, \ 119 struct smb_direct_transport, transport)) 120 121 static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops; 122 123 void init_smbd_max_io_size(unsigned int sz) 124 { 125 sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); 126 smb_direct_max_read_write_size = sz; 127 } 128 129 unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) 130 { 131 struct smb_direct_transport *t; 132 struct smbdirect_socket *sc; 133 struct smbdirect_socket_parameters *sp; 134 135 if (kt->ops != &ksmbd_smb_direct_transport_ops) 136 return 0; 137 138 t = SMBD_TRANS(kt); 139 sc = &t->socket; 140 sp = &sc->parameters; 141 142 return sp->max_read_write_size; 143 } 144 145 static inline int get_buf_page_count(void *buf, int size) 146 { 147 return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - 148 (uintptr_t)buf / PAGE_SIZE; 149 } 150 151 static void smb_direct_destroy_pools(struct smbdirect_socket *sc); 152 static void smb_direct_post_recv_credits(struct work_struct *work); 153 static int smb_direct_post_send_data(struct smbdirect_socket *sc, 154 struct smbdirect_send_batch *send_ctx, 155 struct kvec *iov, int niov, 156 int remaining_data_length); 157 158 static inline void 159 *smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg) 160 { 161 return (void *)recvmsg->packet; 162 } 163 164 static struct 165 smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc) 166 { 167 struct smbdirect_recv_io *recvmsg = NULL; 168 unsigned long flags; 169 170 spin_lock_irqsave(&sc->recv_io.free.lock, flags); 171 if (!list_empty(&sc->recv_io.free.list)) { 172 recvmsg = list_first_entry(&sc->recv_io.free.list, 173 struct smbdirect_recv_io, 174 list); 175 list_del(&recvmsg->list); 176 } 177 spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); 178 return recvmsg; 179 } 180 181 static void put_recvmsg(struct smbdirect_socket *sc, 182 struct smbdirect_recv_io *recvmsg) 183 { 184 unsigned long flags; 185 186 if (likely(recvmsg->sge.length != 0)) { 187 ib_dma_unmap_single(sc->ib.dev, 188 recvmsg->sge.addr, 189 recvmsg->sge.length, 190 DMA_FROM_DEVICE); 191 recvmsg->sge.length = 0; 192 } 193 194 spin_lock_irqsave(&sc->recv_io.free.lock, flags); 195 list_add(&recvmsg->list, &sc->recv_io.free.list); 196 spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); 197 198 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); 199 } 200 201 static void enqueue_reassembly(struct smbdirect_socket *sc, 202 struct smbdirect_recv_io *recvmsg, 203 int data_length) 204 { 205 unsigned long flags; 206 207 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 208 list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list); 209 sc->recv_io.reassembly.queue_length++; 210 /* 211 * Make sure reassembly_data_length is updated after list and 212 * reassembly_queue_length are updated. 
On the dequeue side 213 * reassembly_data_length is checked without a lock to determine 214 * if reassembly_queue_length and list is up to date 215 */ 216 virt_wmb(); 217 sc->recv_io.reassembly.data_length += data_length; 218 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 219 } 220 221 static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc) 222 { 223 if (!list_empty(&sc->recv_io.reassembly.list)) 224 return list_first_entry(&sc->recv_io.reassembly.list, 225 struct smbdirect_recv_io, list); 226 else 227 return NULL; 228 } 229 230 static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc) 231 { 232 /* 233 * Wake up all waiters in all wait queues 234 * in order to notice the broken connection. 235 */ 236 wake_up_all(&sc->status_wait); 237 wake_up_all(&sc->send_io.bcredits.wait_queue); 238 wake_up_all(&sc->send_io.lcredits.wait_queue); 239 wake_up_all(&sc->send_io.credits.wait_queue); 240 wake_up_all(&sc->send_io.pending.zero_wait_queue); 241 wake_up_all(&sc->recv_io.reassembly.wait_queue); 242 wake_up_all(&sc->rw_io.credits.wait_queue); 243 } 244 245 static void smb_direct_disconnect_rdma_work(struct work_struct *work) 246 { 247 struct smbdirect_socket *sc = 248 container_of(work, struct smbdirect_socket, disconnect_work); 249 250 if (sc->first_error == 0) 251 sc->first_error = -ECONNABORTED; 252 253 /* 254 * make sure this and other work is not queued again 255 * but here we don't block and avoid 256 * disable[_delayed]_work_sync() 257 */ 258 disable_work(&sc->disconnect_work); 259 disable_work(&sc->connect.work); 260 disable_work(&sc->recv_io.posted.refill_work); 261 disable_delayed_work(&sc->idle.timer_work); 262 disable_work(&sc->idle.immediate_work); 263 264 switch (sc->status) { 265 case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: 266 case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: 267 case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: 268 case SMBDIRECT_SOCKET_CONNECTED: 269 case SMBDIRECT_SOCKET_ERROR: 270 sc->status = SMBDIRECT_SOCKET_DISCONNECTING; 271 rdma_disconnect(sc->rdma.cm_id); 272 break; 273 274 case SMBDIRECT_SOCKET_CREATED: 275 case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: 276 case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: 277 case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: 278 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: 279 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: 280 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: 281 case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: 282 case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: 283 case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: 284 /* 285 * rdma_accept() never reached 286 * RDMA_CM_EVENT_ESTABLISHED 287 */ 288 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 289 break; 290 291 case SMBDIRECT_SOCKET_DISCONNECTING: 292 case SMBDIRECT_SOCKET_DISCONNECTED: 293 case SMBDIRECT_SOCKET_DESTROYED: 294 break; 295 } 296 297 /* 298 * Wake up all waiters in all wait queues 299 * in order to notice the broken connection. 
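	 * This includes smb_direct_read() waiting on the reassembly
	 * queue as well as the send and RDMA read/write credit waiters,
	 * see smb_direct_disconnect_wake_up_all().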
300 */ 301 smb_direct_disconnect_wake_up_all(sc); 302 } 303 304 static void 305 smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) 306 { 307 if (sc->first_error == 0) 308 sc->first_error = -ECONNABORTED; 309 310 /* 311 * make sure other work (than disconnect_work) is 312 * not queued again but here we don't block and avoid 313 * disable[_delayed]_work_sync() 314 */ 315 disable_work(&sc->connect.work); 316 disable_work(&sc->recv_io.posted.refill_work); 317 disable_work(&sc->idle.immediate_work); 318 disable_delayed_work(&sc->idle.timer_work); 319 320 switch (sc->status) { 321 case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: 322 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: 323 case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: 324 case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: 325 case SMBDIRECT_SOCKET_ERROR: 326 case SMBDIRECT_SOCKET_DISCONNECTING: 327 case SMBDIRECT_SOCKET_DISCONNECTED: 328 case SMBDIRECT_SOCKET_DESTROYED: 329 /* 330 * Keep the current error status 331 */ 332 break; 333 334 case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: 335 case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: 336 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; 337 break; 338 339 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: 340 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: 341 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; 342 break; 343 344 case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: 345 case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: 346 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; 347 break; 348 349 case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: 350 case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: 351 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; 352 break; 353 354 case SMBDIRECT_SOCKET_CREATED: 355 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 356 break; 357 358 case SMBDIRECT_SOCKET_CONNECTED: 359 sc->status = SMBDIRECT_SOCKET_ERROR; 360 break; 361 } 362 363 /* 364 * Wake up all waiters in all wait queues 365 * in order to notice the broken connection. 
366 */ 367 smb_direct_disconnect_wake_up_all(sc); 368 369 queue_work(sc->workqueue, &sc->disconnect_work); 370 } 371 372 static void smb_direct_send_immediate_work(struct work_struct *work) 373 { 374 struct smbdirect_socket *sc = 375 container_of(work, struct smbdirect_socket, idle.immediate_work); 376 377 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 378 return; 379 380 smb_direct_post_send_data(sc, NULL, NULL, 0, 0); 381 } 382 383 static void smb_direct_idle_connection_timer(struct work_struct *work) 384 { 385 struct smbdirect_socket *sc = 386 container_of(work, struct smbdirect_socket, idle.timer_work.work); 387 struct smbdirect_socket_parameters *sp = &sc->parameters; 388 389 if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { 390 smb_direct_disconnect_rdma_connection(sc); 391 return; 392 } 393 394 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 395 return; 396 397 /* 398 * Now use the keepalive timeout (instead of keepalive interval) 399 * in order to wait for a response 400 */ 401 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; 402 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 403 msecs_to_jiffies(sp->keepalive_timeout_msec)); 404 queue_work(sc->workqueue, &sc->idle.immediate_work); 405 } 406 407 static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) 408 { 409 struct smb_direct_transport *t; 410 struct smbdirect_socket *sc; 411 struct smbdirect_socket_parameters *sp; 412 struct ksmbd_conn *conn; 413 414 t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP); 415 if (!t) 416 return NULL; 417 sc = &t->socket; 418 smbdirect_socket_init(sc); 419 sp = &sc->parameters; 420 421 sc->workqueue = smb_direct_wq; 422 423 INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); 424 425 sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000; 426 sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH; 427 sp->responder_resources = 1; 428 sp->recv_credit_max = smb_direct_receive_credit_max; 429 sp->send_credit_target = smb_direct_send_credit_target; 430 sp->max_send_size = smb_direct_max_send_size; 431 sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; 432 sp->max_recv_size = smb_direct_max_receive_size; 433 sp->max_read_write_size = smb_direct_max_read_write_size; 434 sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000; 435 sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000; 436 437 sc->rdma.cm_id = cm_id; 438 cm_id->context = sc; 439 440 sc->ib.dev = sc->rdma.cm_id->device; 441 442 INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer); 443 444 conn = ksmbd_conn_alloc(); 445 if (!conn) 446 goto err; 447 448 down_write(&conn_list_lock); 449 hash_add(conn_list, &conn->hlist, 0); 450 up_write(&conn_list_lock); 451 452 conn->transport = KSMBD_TRANS(t); 453 KSMBD_TRANS(t)->conn = conn; 454 KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops; 455 return t; 456 err: 457 kfree(t); 458 return NULL; 459 } 460 461 static void smb_direct_free_transport(struct ksmbd_transport *kt) 462 { 463 kfree(SMBD_TRANS(kt)); 464 } 465 466 static void free_transport(struct smb_direct_transport *t) 467 { 468 struct smbdirect_socket *sc = &t->socket; 469 struct smbdirect_recv_io *recvmsg; 470 471 disable_work_sync(&sc->disconnect_work); 472 if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) 473 smb_direct_disconnect_rdma_work(&sc->disconnect_work); 474 if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) 475 wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); 476 477 /* 478 * Wake up all waiters in 
all wait queues 479 * in order to notice the broken connection. 480 * 481 * Most likely this was already called via 482 * smb_direct_disconnect_rdma_work(), but call it again... 483 */ 484 smb_direct_disconnect_wake_up_all(sc); 485 486 disable_work_sync(&sc->connect.work); 487 disable_work_sync(&sc->recv_io.posted.refill_work); 488 disable_delayed_work_sync(&sc->idle.timer_work); 489 disable_work_sync(&sc->idle.immediate_work); 490 491 if (sc->rdma.cm_id) 492 rdma_lock_handler(sc->rdma.cm_id); 493 494 if (sc->ib.qp) { 495 ib_drain_qp(sc->ib.qp); 496 sc->ib.qp = NULL; 497 rdma_destroy_qp(sc->rdma.cm_id); 498 } 499 500 ksmbd_debug(RDMA, "drain the reassembly queue\n"); 501 do { 502 unsigned long flags; 503 504 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 505 recvmsg = get_first_reassembly(sc); 506 if (recvmsg) { 507 list_del(&recvmsg->list); 508 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 509 put_recvmsg(sc, recvmsg); 510 } else { 511 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 512 } 513 } while (recvmsg); 514 sc->recv_io.reassembly.data_length = 0; 515 516 if (sc->ib.send_cq) 517 ib_free_cq(sc->ib.send_cq); 518 if (sc->ib.recv_cq) 519 ib_free_cq(sc->ib.recv_cq); 520 if (sc->ib.pd) 521 ib_dealloc_pd(sc->ib.pd); 522 if (sc->rdma.cm_id) { 523 rdma_unlock_handler(sc->rdma.cm_id); 524 rdma_destroy_id(sc->rdma.cm_id); 525 } 526 527 smb_direct_destroy_pools(sc); 528 ksmbd_conn_free(KSMBD_TRANS(t)->conn); 529 } 530 531 static struct smbdirect_send_io 532 *smb_direct_alloc_sendmsg(struct smbdirect_socket *sc) 533 { 534 struct smbdirect_send_io *msg; 535 536 msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP); 537 if (!msg) 538 return ERR_PTR(-ENOMEM); 539 msg->socket = sc; 540 INIT_LIST_HEAD(&msg->sibling_list); 541 msg->num_sge = 0; 542 return msg; 543 } 544 545 static void smb_direct_free_sendmsg(struct smbdirect_socket *sc, 546 struct smbdirect_send_io *msg) 547 { 548 int i; 549 550 /* 551 * The list needs to be empty! 552 * The caller should take care of it. 
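	 * (Siblings are removed and freed individually in send_done()
	 * or in the error path of smb_direct_flush_send_list().)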
553 */ 554 WARN_ON_ONCE(!list_empty(&msg->sibling_list)); 555 556 if (msg->num_sge > 0) { 557 ib_dma_unmap_single(sc->ib.dev, 558 msg->sge[0].addr, msg->sge[0].length, 559 DMA_TO_DEVICE); 560 for (i = 1; i < msg->num_sge; i++) 561 ib_dma_unmap_page(sc->ib.dev, 562 msg->sge[i].addr, msg->sge[i].length, 563 DMA_TO_DEVICE); 564 } 565 mempool_free(msg, sc->send_io.mem.pool); 566 } 567 568 static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg) 569 { 570 struct smbdirect_socket *sc = recvmsg->socket; 571 572 switch (sc->recv_io.expected) { 573 case SMBDIRECT_EXPECT_DATA_TRANSFER: { 574 struct smbdirect_data_transfer *req = 575 (struct smbdirect_data_transfer *)recvmsg->packet; 576 struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet 577 + le32_to_cpu(req->data_offset)); 578 ksmbd_debug(RDMA, 579 "CreditGranted: %u, CreditRequested: %u, DataLength: %u, RemainingDataLength: %u, SMB: %x, Command: %u\n", 580 le16_to_cpu(req->credits_granted), 581 le16_to_cpu(req->credits_requested), 582 req->data_length, req->remaining_data_length, 583 hdr->ProtocolId, hdr->Command); 584 return 0; 585 } 586 case SMBDIRECT_EXPECT_NEGOTIATE_REQ: { 587 struct smbdirect_negotiate_req *req = 588 (struct smbdirect_negotiate_req *)recvmsg->packet; 589 ksmbd_debug(RDMA, 590 "MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n", 591 le16_to_cpu(req->min_version), 592 le16_to_cpu(req->max_version), 593 le16_to_cpu(req->credits_requested), 594 le32_to_cpu(req->preferred_send_size), 595 le32_to_cpu(req->max_receive_size), 596 le32_to_cpu(req->max_fragmented_size)); 597 if (le16_to_cpu(req->min_version) > 0x0100 || 598 le16_to_cpu(req->max_version) < 0x0100) 599 return -EOPNOTSUPP; 600 if (le16_to_cpu(req->credits_requested) <= 0 || 601 le32_to_cpu(req->max_receive_size) <= 128 || 602 le32_to_cpu(req->max_fragmented_size) <= 603 128 * 1024) 604 return -ECONNABORTED; 605 606 return 0; 607 } 608 case SMBDIRECT_EXPECT_NEGOTIATE_REP: 609 /* client only */ 610 break; 611 } 612 613 /* This is an internal error */ 614 return -EINVAL; 615 } 616 617 static void recv_done(struct ib_cq *cq, struct ib_wc *wc) 618 { 619 struct smbdirect_recv_io *recvmsg; 620 struct smbdirect_socket *sc; 621 struct smbdirect_socket_parameters *sp; 622 623 recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); 624 sc = recvmsg->socket; 625 sp = &sc->parameters; 626 627 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { 628 put_recvmsg(sc, recvmsg); 629 if (wc->status != IB_WC_WR_FLUSH_ERR) { 630 pr_err("Recv error. status='%s (%d)' opcode=%d\n", 631 ib_wc_status_msg(wc->status), wc->status, 632 wc->opcode); 633 smb_direct_disconnect_rdma_connection(sc); 634 } 635 return; 636 } 637 638 ksmbd_debug(RDMA, "Recv completed. status='%s (%d)', opcode=%d\n", 639 ib_wc_status_msg(wc->status), wc->status, 640 wc->opcode); 641 642 ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr, 643 recvmsg->sge.length, DMA_FROM_DEVICE); 644 645 /* 646 * Reset timer to the keepalive interval in 647 * order to trigger our next keepalive message. 
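	 * A message just arrived from the peer, so the keepalive state
	 * machine can go back to SMBDIRECT_KEEPALIVE_NONE.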
648 */ 649 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; 650 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 651 msecs_to_jiffies(sp->keepalive_interval_msec)); 652 653 switch (sc->recv_io.expected) { 654 case SMBDIRECT_EXPECT_NEGOTIATE_REQ: 655 /* see smb_direct_negotiate_recv_done */ 656 break; 657 case SMBDIRECT_EXPECT_DATA_TRANSFER: { 658 struct smbdirect_data_transfer *data_transfer = 659 (struct smbdirect_data_transfer *)recvmsg->packet; 660 u32 remaining_data_length, data_offset, data_length; 661 int current_recv_credits; 662 u16 old_recv_credit_target; 663 664 if (wc->byte_len < 665 offsetof(struct smbdirect_data_transfer, padding)) { 666 put_recvmsg(sc, recvmsg); 667 smb_direct_disconnect_rdma_connection(sc); 668 return; 669 } 670 671 remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); 672 data_length = le32_to_cpu(data_transfer->data_length); 673 data_offset = le32_to_cpu(data_transfer->data_offset); 674 if (wc->byte_len < data_offset || 675 wc->byte_len < (u64)data_offset + data_length) { 676 put_recvmsg(sc, recvmsg); 677 smb_direct_disconnect_rdma_connection(sc); 678 return; 679 } 680 if (remaining_data_length > sp->max_fragmented_recv_size || 681 data_length > sp->max_fragmented_recv_size || 682 (u64)remaining_data_length + (u64)data_length > 683 (u64)sp->max_fragmented_recv_size) { 684 put_recvmsg(sc, recvmsg); 685 smb_direct_disconnect_rdma_connection(sc); 686 return; 687 } 688 689 if (data_length) { 690 if (sc->recv_io.reassembly.full_packet_received) 691 recvmsg->first_segment = true; 692 693 if (le32_to_cpu(data_transfer->remaining_data_length)) 694 sc->recv_io.reassembly.full_packet_received = false; 695 else 696 sc->recv_io.reassembly.full_packet_received = true; 697 } 698 699 atomic_dec(&sc->recv_io.posted.count); 700 current_recv_credits = atomic_dec_return(&sc->recv_io.credits.count); 701 702 old_recv_credit_target = sc->recv_io.credits.target; 703 sc->recv_io.credits.target = 704 le16_to_cpu(data_transfer->credits_requested); 705 sc->recv_io.credits.target = 706 min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); 707 sc->recv_io.credits.target = 708 max_t(u16, sc->recv_io.credits.target, 1); 709 atomic_add(le16_to_cpu(data_transfer->credits_granted), 710 &sc->send_io.credits.count); 711 712 if (le16_to_cpu(data_transfer->flags) & 713 SMBDIRECT_FLAG_RESPONSE_REQUESTED) 714 queue_work(sc->workqueue, &sc->idle.immediate_work); 715 716 if (atomic_read(&sc->send_io.credits.count) > 0) 717 wake_up(&sc->send_io.credits.wait_queue); 718 719 if (data_length) { 720 if (current_recv_credits <= (sc->recv_io.credits.target / 4) || 721 sc->recv_io.credits.target > old_recv_credit_target) 722 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); 723 724 enqueue_reassembly(sc, recvmsg, (int)data_length); 725 wake_up(&sc->recv_io.reassembly.wait_queue); 726 } else 727 put_recvmsg(sc, recvmsg); 728 729 return; 730 } 731 case SMBDIRECT_EXPECT_NEGOTIATE_REP: 732 /* client only */ 733 break; 734 } 735 736 /* 737 * This is an internal error! 
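	 * Reaching this point means sc->recv_io.expected was
	 * SMBDIRECT_EXPECT_NEGOTIATE_REP, which is only used on the
	 * client side.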
 */
	WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
	put_recvmsg(sc, recvmsg);
	smb_direct_disconnect_rdma_connection(sc);
}

static void smb_direct_negotiate_recv_work(struct work_struct *work);

static void smb_direct_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbdirect_recv_io *recv_io =
		container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
	struct smbdirect_socket *sc = recv_io->socket;
	unsigned long flags;

	/*
	 * reset the common recv_done for later reuse.
	 */
	recv_io->cqe.done = recv_done;

	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
		put_recvmsg(sc, recv_io);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("Negotiate Recv error. status='%s (%d)' opcode=%d\n",
			       ib_wc_status_msg(wc->status), wc->status,
			       wc->opcode);
			smb_direct_disconnect_rdma_connection(sc);
		}
		return;
	}

	ksmbd_debug(RDMA, "Negotiate Recv completed. status='%s (%d)', opcode=%d\n",
		    ib_wc_status_msg(wc->status), wc->status,
		    wc->opcode);

	ib_dma_sync_single_for_cpu(sc->ib.dev,
				   recv_io->sge.addr,
				   recv_io->sge.length,
				   DMA_FROM_DEVICE);

	/*
	 * This is an internal error!
	 */
	if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REQ)) {
		put_recvmsg(sc, recv_io);
		smb_direct_disconnect_rdma_connection(sc);
		return;
	}

	/*
	 * Don't reset the timer to the keepalive interval here,
	 * this will be done in smb_direct_negotiate_recv_work.
	 */

	/*
	 * Only remember the recv_io if it has enough bytes,
	 * this gives smb_direct_negotiate_recv_work enough
	 * information in order to disconnect if it was not
	 * valid.
	 */
	sc->recv_io.reassembly.full_packet_received = true;
	if (wc->byte_len >= sizeof(struct smbdirect_negotiate_req))
		enqueue_reassembly(sc, recv_io, 0);
	else
		put_recvmsg(sc, recv_io);

	/*
	 * Some drivers (at least mlx5_ib and irdma in roce mode)
	 * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED,
	 * we need to adjust our expectation in that case.
	 *
	 * So we defer further processing of the negotiation
	 * to smb_direct_negotiate_recv_work().
	 *
	 * If we are already in SMBDIRECT_SOCKET_NEGOTIATE_NEEDED
	 * we queue the work directly, otherwise
	 * smb_direct_cm_handler() will do it when
	 * RDMA_CM_EVENT_ESTABLISHED arrives.
	 */
	spin_lock_irqsave(&sc->connect.lock, flags);
	if (!sc->first_error) {
		INIT_WORK(&sc->connect.work, smb_direct_negotiate_recv_work);
		if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)
			queue_work(sc->workqueue, &sc->connect.work);
	}
	spin_unlock_irqrestore(&sc->connect.lock, flags);
}

static void smb_direct_negotiate_recv_work(struct work_struct *work)
{
	struct smbdirect_socket *sc =
		container_of(work, struct smbdirect_socket, connect.work);
	const struct smbdirect_socket_parameters *sp = &sc->parameters;
	struct smbdirect_recv_io *recv_io;

	if (sc->first_error)
		return;

	ksmbd_debug(RDMA, "Negotiate Recv Work running\n");

	/*
	 * Reset timer to the keepalive interval in
	 * order to trigger our next keepalive message.
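	 * (This was intentionally skipped in
	 * smb_direct_negotiate_recv_done() and is done here instead.)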
 */
	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
	mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
			 msecs_to_jiffies(sp->keepalive_interval_msec));

	/*
	 * If smb_direct_negotiate_recv_done() detected an
	 * invalid request we want to disconnect.
	 */
	recv_io = get_first_reassembly(sc);
	if (!recv_io) {
		smb_direct_disconnect_rdma_connection(sc);
		return;
	}

	if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) {
		smb_direct_disconnect_rdma_connection(sc);
		return;
	}
	sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
	wake_up(&sc->status_wait);
}

static int smb_direct_post_recv(struct smbdirect_socket *sc,
				struct smbdirect_recv_io *recvmsg)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	struct ib_recv_wr wr;
	int ret;

	recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev,
					      recvmsg->packet,
					      sp->max_recv_size,
					      DMA_FROM_DEVICE);
	ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr);
	if (ret)
		return ret;
	recvmsg->sge.length = sp->max_recv_size;
	recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey;

	wr.wr_cqe = &recvmsg->cqe;
	wr.next = NULL;
	wr.sg_list = &recvmsg->sge;
	wr.num_sge = 1;

	ret = ib_post_recv(sc->ib.qp, &wr, NULL);
	if (ret) {
		pr_err("Can't post recv: %d\n", ret);
		ib_dma_unmap_single(sc->ib.dev,
				    recvmsg->sge.addr, recvmsg->sge.length,
				    DMA_FROM_DEVICE);
		recvmsg->sge.length = 0;
		smb_direct_disconnect_rdma_connection(sc);
		return ret;
	}
	return ret;
}

static int smb_direct_read(struct ksmbd_transport *t, char *buf,
			   unsigned int size, int unused)
{
	struct smbdirect_recv_io *recvmsg;
	struct smbdirect_data_transfer *data_transfer;
	int to_copy, to_read, data_read, offset;
	u32 data_length, remaining_data_length, data_offset;
	int rc;
	struct smb_direct_transport *st = SMBD_TRANS(t);
	struct smbdirect_socket *sc = &st->socket;

again:
	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
		pr_err("disconnected\n");
		return -ENOTCONN;
	}

	/*
	 * No need to hold the reassembly queue lock all the time as we are
	 * the only one reading from the front of the queue. The transport
	 * may add more entries to the back of the queue at the same time
	 */
	if (sc->recv_io.reassembly.data_length >= size) {
		int queue_length;
		int queue_removed = 0;
		unsigned long flags;

		/*
		 * Need to make sure reassembly_data_length is read before
		 * reading reassembly_queue_length and calling
		 * get_first_reassembly. This call is lock free
		 * as we never read at the end of the queue, which is being
		 * updated in SOFTIRQ as more data is received
		 */
		virt_rmb();
		queue_length = sc->recv_io.reassembly.queue_length;
		data_read = 0;
		to_read = size;
		offset = sc->recv_io.reassembly.first_entry_offset;
		while (data_read < size) {
			recvmsg = get_first_reassembly(sc);
			data_transfer = smbdirect_recv_io_payload(recvmsg);
			data_length = le32_to_cpu(data_transfer->data_length);
			remaining_data_length =
				le32_to_cpu(data_transfer->remaining_data_length);
			data_offset = le32_to_cpu(data_transfer->data_offset);

			/*
			 * The upper layer expects RFC1002 length at the
			 * beginning of the payload. Return it to indicate
			 * the total length of the packet. This minimizes the
			 * change to the upper layer packet processing logic.
This 951 * will be eventually remove when an intermediate 952 * transport layer is added 953 */ 954 if (recvmsg->first_segment && size == 4) { 955 unsigned int rfc1002_len = 956 data_length + remaining_data_length; 957 *((__be32 *)buf) = cpu_to_be32(rfc1002_len); 958 data_read = 4; 959 recvmsg->first_segment = false; 960 ksmbd_debug(RDMA, 961 "returning rfc1002 length %d\n", 962 rfc1002_len); 963 goto read_rfc1002_done; 964 } 965 966 to_copy = min_t(int, data_length - offset, to_read); 967 memcpy(buf + data_read, (char *)data_transfer + data_offset + offset, 968 to_copy); 969 970 /* move on to the next buffer? */ 971 if (to_copy == data_length - offset) { 972 queue_length--; 973 /* 974 * No need to lock if we are not at the 975 * end of the queue 976 */ 977 if (queue_length) { 978 list_del(&recvmsg->list); 979 } else { 980 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 981 list_del(&recvmsg->list); 982 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 983 } 984 queue_removed++; 985 put_recvmsg(sc, recvmsg); 986 offset = 0; 987 } else { 988 offset += to_copy; 989 } 990 991 to_read -= to_copy; 992 data_read += to_copy; 993 } 994 995 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 996 sc->recv_io.reassembly.data_length -= data_read; 997 sc->recv_io.reassembly.queue_length -= queue_removed; 998 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 999 1000 sc->recv_io.reassembly.first_entry_offset = offset; 1001 ksmbd_debug(RDMA, 1002 "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", 1003 data_read, sc->recv_io.reassembly.data_length, 1004 sc->recv_io.reassembly.first_entry_offset); 1005 read_rfc1002_done: 1006 return data_read; 1007 } 1008 1009 ksmbd_debug(RDMA, "wait_event on more data\n"); 1010 rc = wait_event_interruptible(sc->recv_io.reassembly.wait_queue, 1011 sc->recv_io.reassembly.data_length >= size || 1012 sc->status != SMBDIRECT_SOCKET_CONNECTED); 1013 if (rc) 1014 return -EINTR; 1015 1016 goto again; 1017 } 1018 1019 static void smb_direct_post_recv_credits(struct work_struct *work) 1020 { 1021 struct smbdirect_socket *sc = 1022 container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); 1023 struct smbdirect_recv_io *recvmsg; 1024 int credits = 0; 1025 int ret; 1026 1027 if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) { 1028 while (true) { 1029 recvmsg = get_free_recvmsg(sc); 1030 if (!recvmsg) 1031 break; 1032 1033 recvmsg->first_segment = false; 1034 1035 ret = smb_direct_post_recv(sc, recvmsg); 1036 if (ret) { 1037 pr_err("Can't post recv: %d\n", ret); 1038 put_recvmsg(sc, recvmsg); 1039 break; 1040 } 1041 credits++; 1042 1043 atomic_inc(&sc->recv_io.posted.count); 1044 } 1045 } 1046 1047 atomic_add(credits, &sc->recv_io.credits.available); 1048 1049 /* 1050 * If the last send credit is waiting for credits 1051 * it can grant we need to wake it up 1052 */ 1053 if (credits && 1054 atomic_read(&sc->send_io.bcredits.count) == 0 && 1055 atomic_read(&sc->send_io.credits.count) == 0) 1056 wake_up(&sc->send_io.credits.wait_queue); 1057 1058 if (credits) 1059 queue_work(sc->workqueue, &sc->idle.immediate_work); 1060 } 1061 1062 static void send_done(struct ib_cq *cq, struct ib_wc *wc) 1063 { 1064 struct smbdirect_send_io *sendmsg, *sibling, *next; 1065 struct smbdirect_socket *sc; 1066 int lcredits = 0; 1067 1068 sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); 1069 sc = sendmsg->socket; 1070 1071 ksmbd_debug(RDMA, "Send completed. 
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbdirect_send_io *sendmsg, *sibling, *next;
	struct smbdirect_socket *sc;
	int lcredits = 0;

	sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
	sc = sendmsg->socket;

	ksmbd_debug(RDMA, "Send completed. status='%s (%d)', opcode=%d\n",
		    ib_wc_status_msg(wc->status), wc->status,
		    wc->opcode);

	if (unlikely(!(sendmsg->wr.send_flags & IB_SEND_SIGNALED))) {
		/*
		 * This happens when smbdirect_send_io is a sibling
		 * before the final message, it is signaled on
		 * error anyway, so we need to skip
		 * smbdirect_connection_free_send_io here,
		 * otherwise it will destroy the memory
		 * of the siblings too, which will cause
		 * use after free problems for the others
		 * triggered from ib_drain_qp().
		 */
		if (wc->status != IB_WC_SUCCESS)
			goto skip_free;

		/*
		 * This should not happen!
		 * But we better just close the
		 * connection...
		 */
		pr_err("unexpected send completion wc->status=%s (%d) wc->opcode=%d\n",
		       ib_wc_status_msg(wc->status), wc->status, wc->opcode);
		smb_direct_disconnect_rdma_connection(sc);
		return;
	}

	/*
	 * Free possible siblings and then the main send_io
	 */
	list_for_each_entry_safe(sibling, next, &sendmsg->sibling_list, sibling_list) {
		list_del_init(&sibling->sibling_list);
		smb_direct_free_sendmsg(sc, sibling);
		lcredits += 1;
	}
	/* Note this frees wc->wr_cqe, but not wc */
	smb_direct_free_sendmsg(sc, sendmsg);
	lcredits += 1;

	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
skip_free:
		pr_err("Send error. status='%s (%d)', opcode=%d\n",
		       ib_wc_status_msg(wc->status), wc->status,
		       wc->opcode);
		smb_direct_disconnect_rdma_connection(sc);
		return;
	}

	atomic_add(lcredits, &sc->send_io.lcredits.count);
	wake_up(&sc->send_io.lcredits.wait_queue);

	if (atomic_dec_and_test(&sc->send_io.pending.count))
		wake_up(&sc->send_io.pending.zero_wait_queue);
}

static int manage_credits_prior_sending(struct smbdirect_socket *sc)
{
	int missing;
	int available;
	int new_credits;

	if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
		return 0;

	missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.credits.count);
	available = atomic_xchg(&sc->recv_io.credits.available, 0);
	new_credits = (u16)min3(U16_MAX, missing, available);
	if (new_credits <= 0) {
		/*
		 * If credits are available, but not granted
		 * we need to re-add them again.
		 */
		if (available)
			atomic_add(available, &sc->recv_io.credits.available);
		return 0;
	}

	if (new_credits < available) {
		/*
		 * Re-add the remaining available credits again.
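		 * They remain available and can be granted with a
		 * later send.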
1153 */ 1154 available -= new_credits; 1155 atomic_add(available, &sc->recv_io.credits.available); 1156 } 1157 1158 /* 1159 * Remember we granted the credits 1160 */ 1161 atomic_add(new_credits, &sc->recv_io.credits.count); 1162 return new_credits; 1163 } 1164 1165 static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) 1166 { 1167 struct smbdirect_socket_parameters *sp = &sc->parameters; 1168 1169 if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { 1170 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; 1171 /* 1172 * Now use the keepalive timeout (instead of keepalive interval) 1173 * in order to wait for a response 1174 */ 1175 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 1176 msecs_to_jiffies(sp->keepalive_timeout_msec)); 1177 return 1; 1178 } 1179 return 0; 1180 } 1181 1182 static int smb_direct_post_send(struct smbdirect_socket *sc, 1183 struct ib_send_wr *wr) 1184 { 1185 int ret; 1186 1187 atomic_inc(&sc->send_io.pending.count); 1188 ret = ib_post_send(sc->ib.qp, wr, NULL); 1189 if (ret) { 1190 pr_err("failed to post send: %d\n", ret); 1191 smb_direct_disconnect_rdma_connection(sc); 1192 } 1193 return ret; 1194 } 1195 1196 static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx, 1197 bool need_invalidate_rkey, 1198 unsigned int remote_key) 1199 { 1200 INIT_LIST_HEAD(&send_ctx->msg_list); 1201 send_ctx->wr_cnt = 0; 1202 send_ctx->need_invalidate_rkey = need_invalidate_rkey; 1203 send_ctx->remote_key = remote_key; 1204 send_ctx->credit = 0; 1205 } 1206 1207 static int smb_direct_flush_send_list(struct smbdirect_socket *sc, 1208 struct smbdirect_send_batch *send_ctx, 1209 bool is_last) 1210 { 1211 struct smbdirect_send_io *first, *last; 1212 int ret = 0; 1213 1214 if (list_empty(&send_ctx->msg_list)) 1215 goto release_credit; 1216 1217 first = list_first_entry(&send_ctx->msg_list, 1218 struct smbdirect_send_io, 1219 sibling_list); 1220 last = list_last_entry(&send_ctx->msg_list, 1221 struct smbdirect_send_io, 1222 sibling_list); 1223 1224 if (send_ctx->need_invalidate_rkey) { 1225 first->wr.opcode = IB_WR_SEND_WITH_INV; 1226 first->wr.ex.invalidate_rkey = send_ctx->remote_key; 1227 send_ctx->need_invalidate_rkey = false; 1228 send_ctx->remote_key = 0; 1229 } 1230 1231 last->wr.send_flags = IB_SEND_SIGNALED; 1232 last->wr.wr_cqe = &last->cqe; 1233 1234 /* 1235 * Remove last from send_ctx->msg_list 1236 * and splice the rest of send_ctx->msg_list 1237 * to last->sibling_list. 1238 * 1239 * send_ctx->msg_list is a valid empty list 1240 * at the end. 
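	 * That way the caller can keep using the same send_ctx
	 * for further messages.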
1241 */ 1242 list_del_init(&last->sibling_list); 1243 list_splice_tail_init(&send_ctx->msg_list, &last->sibling_list); 1244 send_ctx->wr_cnt = 0; 1245 1246 ret = smb_direct_post_send(sc, &first->wr); 1247 if (ret) { 1248 struct smbdirect_send_io *sibling, *next; 1249 1250 list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) { 1251 list_del_init(&sibling->sibling_list); 1252 smb_direct_free_sendmsg(sc, sibling); 1253 } 1254 smb_direct_free_sendmsg(sc, last); 1255 } 1256 1257 release_credit: 1258 if (is_last && !ret && send_ctx->credit) { 1259 atomic_add(send_ctx->credit, &sc->send_io.bcredits.count); 1260 send_ctx->credit = 0; 1261 wake_up(&sc->send_io.bcredits.wait_queue); 1262 } 1263 1264 return ret; 1265 } 1266 1267 static int wait_for_credits(struct smbdirect_socket *sc, 1268 wait_queue_head_t *waitq, atomic_t *total_credits, 1269 int needed) 1270 { 1271 int ret; 1272 1273 do { 1274 if (atomic_sub_return(needed, total_credits) >= 0) 1275 return 0; 1276 1277 atomic_add(needed, total_credits); 1278 ret = wait_event_interruptible(*waitq, 1279 atomic_read(total_credits) >= needed || 1280 sc->status != SMBDIRECT_SOCKET_CONNECTED); 1281 1282 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 1283 return -ENOTCONN; 1284 else if (ret < 0) 1285 return ret; 1286 } while (true); 1287 } 1288 1289 static int wait_for_send_bcredit(struct smbdirect_socket *sc, 1290 struct smbdirect_send_batch *send_ctx) 1291 { 1292 int ret; 1293 1294 if (send_ctx->credit) 1295 return 0; 1296 1297 ret = wait_for_credits(sc, 1298 &sc->send_io.bcredits.wait_queue, 1299 &sc->send_io.bcredits.count, 1300 1); 1301 if (ret) 1302 return ret; 1303 1304 send_ctx->credit = 1; 1305 return 0; 1306 } 1307 1308 static int wait_for_send_lcredit(struct smbdirect_socket *sc, 1309 struct smbdirect_send_batch *send_ctx) 1310 { 1311 if (send_ctx && (atomic_read(&sc->send_io.lcredits.count) <= 1)) { 1312 int ret; 1313 1314 ret = smb_direct_flush_send_list(sc, send_ctx, false); 1315 if (ret) 1316 return ret; 1317 } 1318 1319 return wait_for_credits(sc, 1320 &sc->send_io.lcredits.wait_queue, 1321 &sc->send_io.lcredits.count, 1322 1); 1323 } 1324 1325 static int wait_for_send_credits(struct smbdirect_socket *sc, 1326 struct smbdirect_send_batch *send_ctx) 1327 { 1328 int ret; 1329 1330 if (send_ctx && 1331 (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) { 1332 ret = smb_direct_flush_send_list(sc, send_ctx, false); 1333 if (ret) 1334 return ret; 1335 } 1336 1337 return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1); 1338 } 1339 1340 static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits) 1341 { 1342 return wait_for_credits(sc, 1343 &sc->rw_io.credits.wait_queue, 1344 &sc->rw_io.credits.count, 1345 credits); 1346 } 1347 1348 static int calc_rw_credits(struct smbdirect_socket *sc, 1349 char *buf, unsigned int len) 1350 { 1351 return DIV_ROUND_UP(get_buf_page_count(buf, len), 1352 sc->rw_io.credits.num_pages); 1353 } 1354 1355 static int smb_direct_create_header(struct smbdirect_socket *sc, 1356 int size, int remaining_data_length, 1357 int new_credits, 1358 struct smbdirect_send_io **sendmsg_out) 1359 { 1360 struct smbdirect_socket_parameters *sp = &sc->parameters; 1361 struct smbdirect_send_io *sendmsg; 1362 struct smbdirect_data_transfer *packet; 1363 int header_length; 1364 int ret; 1365 1366 sendmsg = smb_direct_alloc_sendmsg(sc); 1367 if (IS_ERR(sendmsg)) 1368 return PTR_ERR(sendmsg); 1369 1370 /* Fill in the packet header */ 1371 packet = 
(struct smbdirect_data_transfer *)sendmsg->packet; 1372 packet->credits_requested = cpu_to_le16(sp->send_credit_target); 1373 packet->credits_granted = cpu_to_le16(new_credits); 1374 1375 packet->flags = 0; 1376 if (manage_keep_alive_before_sending(sc)) 1377 packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); 1378 1379 packet->reserved = 0; 1380 if (!size) 1381 packet->data_offset = 0; 1382 else 1383 packet->data_offset = cpu_to_le32(24); 1384 packet->data_length = cpu_to_le32(size); 1385 packet->remaining_data_length = cpu_to_le32(remaining_data_length); 1386 packet->padding = 0; 1387 1388 ksmbd_debug(RDMA, 1389 "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n", 1390 le16_to_cpu(packet->credits_requested), 1391 le16_to_cpu(packet->credits_granted), 1392 le32_to_cpu(packet->data_offset), 1393 le32_to_cpu(packet->data_length), 1394 le32_to_cpu(packet->remaining_data_length)); 1395 1396 /* Map the packet to DMA */ 1397 header_length = sizeof(struct smbdirect_data_transfer); 1398 /* If this is a packet without payload, don't send padding */ 1399 if (!size) 1400 header_length = 1401 offsetof(struct smbdirect_data_transfer, padding); 1402 1403 sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, 1404 (void *)packet, 1405 header_length, 1406 DMA_TO_DEVICE); 1407 ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); 1408 if (ret) { 1409 smb_direct_free_sendmsg(sc, sendmsg); 1410 return ret; 1411 } 1412 1413 sendmsg->num_sge = 1; 1414 sendmsg->sge[0].length = header_length; 1415 sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; 1416 1417 *sendmsg_out = sendmsg; 1418 return 0; 1419 } 1420 1421 static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nentries) 1422 { 1423 bool high = is_vmalloc_addr(buf); 1424 struct page *page; 1425 int offset, len; 1426 int i = 0; 1427 1428 if (size <= 0 || nentries < get_buf_page_count(buf, size)) 1429 return -EINVAL; 1430 1431 offset = offset_in_page(buf); 1432 buf -= offset; 1433 while (size > 0) { 1434 len = min_t(int, PAGE_SIZE - offset, size); 1435 if (high) 1436 page = vmalloc_to_page(buf); 1437 else 1438 page = kmap_to_page(buf); 1439 1440 if (!sg_list) 1441 return -EINVAL; 1442 sg_set_page(sg_list, page, len, offset); 1443 sg_list = sg_next(sg_list); 1444 1445 buf += PAGE_SIZE; 1446 size -= len; 1447 offset = 0; 1448 i++; 1449 } 1450 return i; 1451 } 1452 1453 static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, 1454 struct scatterlist *sg_list, int nentries, 1455 enum dma_data_direction dir, int *npages) 1456 { 1457 *npages = get_sg_list(buf, size, sg_list, nentries); 1458 if (*npages < 0) 1459 return -EINVAL; 1460 return ib_dma_map_sg(device, sg_list, *npages, dir); 1461 } 1462 1463 static int post_sendmsg(struct smbdirect_socket *sc, 1464 struct smbdirect_send_batch *send_ctx, 1465 struct smbdirect_send_io *msg) 1466 { 1467 int i; 1468 1469 for (i = 0; i < msg->num_sge; i++) 1470 ib_dma_sync_single_for_device(sc->ib.dev, 1471 msg->sge[i].addr, msg->sge[i].length, 1472 DMA_TO_DEVICE); 1473 1474 msg->cqe.done = send_done; 1475 msg->wr.opcode = IB_WR_SEND; 1476 msg->wr.sg_list = &msg->sge[0]; 1477 msg->wr.num_sge = msg->num_sge; 1478 msg->wr.next = NULL; 1479 1480 if (send_ctx) { 1481 msg->wr.wr_cqe = NULL; 1482 msg->wr.send_flags = 0; 1483 if (!list_empty(&send_ctx->msg_list)) { 1484 struct smbdirect_send_io *last; 1485 1486 last = list_last_entry(&send_ctx->msg_list, 1487 struct smbdirect_send_io, 1488 sibling_list); 1489 last->wr.next = 
&msg->wr; 1490 } 1491 list_add_tail(&msg->sibling_list, &send_ctx->msg_list); 1492 send_ctx->wr_cnt++; 1493 return 0; 1494 } 1495 1496 msg->wr.wr_cqe = &msg->cqe; 1497 msg->wr.send_flags = IB_SEND_SIGNALED; 1498 return smb_direct_post_send(sc, &msg->wr); 1499 } 1500 1501 static int smb_direct_post_send_data(struct smbdirect_socket *sc, 1502 struct smbdirect_send_batch *send_ctx, 1503 struct kvec *iov, int niov, 1504 int remaining_data_length) 1505 { 1506 int i, j, ret; 1507 struct smbdirect_send_io *msg; 1508 int data_length; 1509 struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1]; 1510 struct smbdirect_send_batch _send_ctx; 1511 int new_credits; 1512 1513 if (!send_ctx) { 1514 smb_direct_send_ctx_init(&_send_ctx, false, 0); 1515 send_ctx = &_send_ctx; 1516 } 1517 1518 ret = wait_for_send_bcredit(sc, send_ctx); 1519 if (ret) 1520 goto bcredit_failed; 1521 1522 ret = wait_for_send_lcredit(sc, send_ctx); 1523 if (ret) 1524 goto lcredit_failed; 1525 1526 ret = wait_for_send_credits(sc, send_ctx); 1527 if (ret) 1528 goto credit_failed; 1529 1530 new_credits = manage_credits_prior_sending(sc); 1531 if (new_credits == 0 && 1532 atomic_read(&sc->send_io.credits.count) == 0 && 1533 atomic_read(&sc->recv_io.credits.count) == 0) { 1534 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); 1535 ret = wait_event_interruptible(sc->send_io.credits.wait_queue, 1536 atomic_read(&sc->send_io.credits.count) >= 1 || 1537 atomic_read(&sc->recv_io.credits.available) >= 1 || 1538 sc->status != SMBDIRECT_SOCKET_CONNECTED); 1539 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 1540 ret = -ENOTCONN; 1541 if (ret < 0) 1542 goto credit_failed; 1543 1544 new_credits = manage_credits_prior_sending(sc); 1545 } 1546 1547 data_length = 0; 1548 for (i = 0; i < niov; i++) 1549 data_length += iov[i].iov_len; 1550 1551 ret = smb_direct_create_header(sc, data_length, remaining_data_length, 1552 new_credits, &msg); 1553 if (ret) 1554 goto header_failed; 1555 1556 for (i = 0; i < niov; i++) { 1557 struct ib_sge *sge; 1558 int sg_cnt; 1559 int npages; 1560 1561 sg_init_table(sg, SMBDIRECT_SEND_IO_MAX_SGE - 1); 1562 sg_cnt = get_mapped_sg_list(sc->ib.dev, 1563 iov[i].iov_base, iov[i].iov_len, 1564 sg, SMBDIRECT_SEND_IO_MAX_SGE - 1, 1565 DMA_TO_DEVICE, &npages); 1566 if (sg_cnt <= 0) { 1567 pr_err("failed to map buffer\n"); 1568 ret = -ENOMEM; 1569 goto err; 1570 } else if (sg_cnt + msg->num_sge > SMBDIRECT_SEND_IO_MAX_SGE) { 1571 pr_err("buffer not fitted into sges\n"); 1572 ret = -E2BIG; 1573 ib_dma_unmap_sg(sc->ib.dev, sg, npages, 1574 DMA_TO_DEVICE); 1575 goto err; 1576 } 1577 1578 for (j = 0; j < sg_cnt; j++) { 1579 sge = &msg->sge[msg->num_sge]; 1580 sge->addr = sg_dma_address(&sg[j]); 1581 sge->length = sg_dma_len(&sg[j]); 1582 sge->lkey = sc->ib.pd->local_dma_lkey; 1583 msg->num_sge++; 1584 } 1585 } 1586 1587 ret = post_sendmsg(sc, send_ctx, msg); 1588 if (ret) 1589 goto err; 1590 1591 if (send_ctx == &_send_ctx) { 1592 ret = smb_direct_flush_send_list(sc, send_ctx, true); 1593 if (ret) 1594 goto err; 1595 } 1596 1597 return 0; 1598 err: 1599 smb_direct_free_sendmsg(sc, msg); 1600 header_failed: 1601 atomic_inc(&sc->send_io.credits.count); 1602 credit_failed: 1603 atomic_inc(&sc->send_io.lcredits.count); 1604 lcredit_failed: 1605 atomic_add(send_ctx->credit, &sc->send_io.bcredits.count); 1606 send_ctx->credit = 0; 1607 bcredit_failed: 1608 return ret; 1609 } 1610 1611 static int smb_direct_writev(struct ksmbd_transport *t, 1612 struct kvec *iov, int niovs, int buflen, 1613 bool need_invalidate, unsigned int 
			     remote_key)
{
	struct smb_direct_transport *st = SMBD_TRANS(t);
	struct smbdirect_socket *sc = &st->socket;
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	size_t remaining_data_length;
	size_t iov_idx;
	size_t iov_ofs;
	size_t max_iov_size = sp->max_send_size -
			sizeof(struct smbdirect_data_transfer);
	int ret;
	struct smbdirect_send_batch send_ctx;
	int error = 0;

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
		return -ENOTCONN;

	//FIXME: skip RFC1002 header..
	if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4))
		return -EINVAL;
	buflen -= 4;
	iov_idx = 1;
	iov_ofs = 0;

	remaining_data_length = buflen;
	ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);

	smb_direct_send_ctx_init(&send_ctx, need_invalidate, remote_key);
	while (remaining_data_length) {
		struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */
		size_t possible_bytes = max_iov_size;
		size_t possible_vecs;
		size_t bytes = 0;
		size_t nvecs = 0;

		/*
		 * For the last message remaining_data_length should
		 * have been 0 already!
		 */
		if (WARN_ON_ONCE(iov_idx >= niovs)) {
			error = -EINVAL;
			goto done;
		}

		/*
		 * We have 2 factors which limit the arguments we pass
		 * to smb_direct_post_send_data():
		 *
		 * 1. The number of supported sges for the send,
		 *    while one is reserved for the smbdirect header.
		 *    And we currently need one SGE per page.
		 * 2. The number of negotiated payload bytes per send.
		 */
		possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);

		while (iov_idx < niovs && possible_vecs && possible_bytes) {
			struct kvec *v = &vecs[nvecs];
			int page_count;

			v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
			v->iov_len = min_t(size_t,
					   iov[iov_idx].iov_len - iov_ofs,
					   possible_bytes);
			page_count = get_buf_page_count(v->iov_base, v->iov_len);
			if (page_count > possible_vecs) {
				/*
				 * If the number of pages in the buffer
				 * is too much (because we currently require
				 * one SGE per page), we need to limit the
				 * length.
				 *
				 * We know possible_vecs is at least 1,
				 * so we always keep the first page.
				 *
				 * We need to calculate the number of extra
				 * pages (epages) we can also keep.
				 *
				 * We calculate the number of bytes in the
				 * first page (fplen), this should never be
				 * larger than v->iov_len because page_count is
				 * at least 2, but adding a limitation feels
				 * better.
				 *
				 * Then we calculate the number of bytes (elen)
				 * we can keep for the extra pages.
				 */
				size_t epages = possible_vecs - 1;
				size_t fpofs = offset_in_page(v->iov_base);
				size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
				size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);

				v->iov_len = fplen + elen;
				page_count = get_buf_page_count(v->iov_base, v->iov_len);
				if (WARN_ON_ONCE(page_count > possible_vecs)) {
					/*
					 * Something went wrong in the above
					 * logic...
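					 * The recalculated page_count has to
					 * fit into possible_vecs.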
1710 */ 1711 error = -EINVAL; 1712 goto done; 1713 } 1714 } 1715 possible_vecs -= page_count; 1716 nvecs += 1; 1717 possible_bytes -= v->iov_len; 1718 bytes += v->iov_len; 1719 1720 iov_ofs += v->iov_len; 1721 if (iov_ofs >= iov[iov_idx].iov_len) { 1722 iov_idx += 1; 1723 iov_ofs = 0; 1724 } 1725 } 1726 1727 remaining_data_length -= bytes; 1728 1729 ret = smb_direct_post_send_data(sc, &send_ctx, 1730 vecs, nvecs, 1731 remaining_data_length); 1732 if (unlikely(ret)) { 1733 error = ret; 1734 goto done; 1735 } 1736 } 1737 1738 done: 1739 ret = smb_direct_flush_send_list(sc, &send_ctx, true); 1740 if (unlikely(!ret && error)) 1741 ret = error; 1742 1743 /* 1744 * As an optimization, we don't wait for individual I/O to finish 1745 * before sending the next one. 1746 * Send them all and wait for pending send count to get to 0 1747 * that means all the I/Os have been out and we are good to return 1748 */ 1749 1750 wait_event(sc->send_io.pending.zero_wait_queue, 1751 atomic_read(&sc->send_io.pending.count) == 0 || 1752 sc->status != SMBDIRECT_SOCKET_CONNECTED); 1753 if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0) 1754 ret = -ENOTCONN; 1755 1756 return ret; 1757 } 1758 1759 static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, 1760 struct smbdirect_rw_io *msg, 1761 enum dma_data_direction dir) 1762 { 1763 struct smbdirect_socket *sc = &t->socket; 1764 1765 rdma_rw_ctx_destroy(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, 1766 msg->sgt.sgl, msg->sgt.nents, dir); 1767 sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); 1768 kfree(msg); 1769 } 1770 1771 static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, 1772 enum dma_data_direction dir) 1773 { 1774 struct smbdirect_rw_io *msg = 1775 container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe); 1776 struct smbdirect_socket *sc = msg->socket; 1777 1778 if (wc->status != IB_WC_SUCCESS) { 1779 msg->error = -EIO; 1780 pr_err("read/write error. 
opcode = %d, status = %s(%d)\n", 1781 wc->opcode, ib_wc_status_msg(wc->status), wc->status); 1782 if (wc->status != IB_WC_WR_FLUSH_ERR) 1783 smb_direct_disconnect_rdma_connection(sc); 1784 } 1785 1786 complete(msg->completion); 1787 } 1788 1789 static void read_done(struct ib_cq *cq, struct ib_wc *wc) 1790 { 1791 read_write_done(cq, wc, DMA_FROM_DEVICE); 1792 } 1793 1794 static void write_done(struct ib_cq *cq, struct ib_wc *wc) 1795 { 1796 read_write_done(cq, wc, DMA_TO_DEVICE); 1797 } 1798 1799 static int smb_direct_rdma_xmit(struct smb_direct_transport *t, 1800 void *buf, int buf_len, 1801 struct smbdirect_buffer_descriptor_v1 *desc, 1802 unsigned int desc_len, 1803 bool is_read) 1804 { 1805 struct smbdirect_socket *sc = &t->socket; 1806 struct smbdirect_socket_parameters *sp = &sc->parameters; 1807 struct smbdirect_rw_io *msg, *next_msg; 1808 int i, ret; 1809 DECLARE_COMPLETION_ONSTACK(completion); 1810 struct ib_send_wr *first_wr; 1811 LIST_HEAD(msg_list); 1812 char *desc_buf; 1813 int credits_needed; 1814 unsigned int desc_buf_len, desc_num = 0; 1815 1816 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 1817 return -ENOTCONN; 1818 1819 if (buf_len > sp->max_read_write_size) 1820 return -EINVAL; 1821 1822 /* calculate needed credits */ 1823 credits_needed = 0; 1824 desc_buf = buf; 1825 for (i = 0; i < desc_len / sizeof(*desc); i++) { 1826 if (!buf_len) 1827 break; 1828 1829 desc_buf_len = le32_to_cpu(desc[i].length); 1830 if (!desc_buf_len) 1831 return -EINVAL; 1832 1833 if (desc_buf_len > buf_len) { 1834 desc_buf_len = buf_len; 1835 desc[i].length = cpu_to_le32(desc_buf_len); 1836 buf_len = 0; 1837 } 1838 1839 credits_needed += calc_rw_credits(sc, desc_buf, desc_buf_len); 1840 desc_buf += desc_buf_len; 1841 buf_len -= desc_buf_len; 1842 desc_num++; 1843 } 1844 1845 ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", 1846 str_read_write(is_read), buf_len, credits_needed); 1847 1848 ret = wait_for_rw_credits(sc, credits_needed); 1849 if (ret < 0) 1850 return ret; 1851 1852 /* build rdma_rw_ctx for each descriptor */ 1853 desc_buf = buf; 1854 for (i = 0; i < desc_num; i++) { 1855 msg = kzalloc(struct_size(msg, sg_list, SG_CHUNK_SIZE), 1856 KSMBD_DEFAULT_GFP); 1857 if (!msg) { 1858 ret = -ENOMEM; 1859 goto out; 1860 } 1861 1862 desc_buf_len = le32_to_cpu(desc[i].length); 1863 1864 msg->socket = sc; 1865 msg->cqe.done = is_read ? read_done : write_done; 1866 msg->completion = &completion; 1867 1868 msg->sgt.sgl = &msg->sg_list[0]; 1869 ret = sg_alloc_table_chained(&msg->sgt, 1870 get_buf_page_count(desc_buf, desc_buf_len), 1871 msg->sg_list, SG_CHUNK_SIZE); 1872 if (ret) { 1873 ret = -ENOMEM; 1874 goto free_msg; 1875 } 1876 1877 ret = get_sg_list(desc_buf, desc_buf_len, 1878 msg->sgt.sgl, msg->sgt.orig_nents); 1879 if (ret < 0) 1880 goto free_table; 1881 1882 ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, 1883 msg->sgt.sgl, 1884 get_buf_page_count(desc_buf, desc_buf_len), 1885 0, 1886 le64_to_cpu(desc[i].offset), 1887 le32_to_cpu(desc[i].token), 1888 is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE);
1889 if (ret < 0) {
1890 pr_err("failed to init rdma_rw_ctx: %d\n", ret);
1891 goto free_table;
1892 }
1893
1894 list_add_tail(&msg->list, &msg_list);
1895 desc_buf += desc_buf_len;
1896 }
1897
1898 /* concatenate work requests of rdma_rw_ctxs */
1899 first_wr = NULL;
1900 list_for_each_entry_reverse(msg, &msg_list, list) {
1901 first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
1902 &msg->cqe, first_wr);
1903 }
1904
1905 ret = ib_post_send(sc->ib.qp, first_wr, NULL);
1906 if (ret) {
1907 pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
1908 goto out;
1909 }
1910
1911 msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list);
1912 wait_for_completion(&completion);
1913 ret = msg->error;
1914 out:
1915 list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
1916 list_del(&msg->list);
1917 smb_direct_free_rdma_rw_msg(t, msg,
1918 is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
1919 }
1920 atomic_add(credits_needed, &sc->rw_io.credits.count);
1921 wake_up(&sc->rw_io.credits.wait_queue);
1922 return ret;
1923
1924 free_table:
1925 sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
1926 free_msg:
1927 kfree(msg);
1928 goto out;
1929 }
1930
1931 static int smb_direct_rdma_write(struct ksmbd_transport *t,
1932 void *buf, unsigned int buflen,
1933 struct smbdirect_buffer_descriptor_v1 *desc,
1934 unsigned int desc_len)
1935 {
1936 return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
1937 desc, desc_len, false);
1938 }
1939
1940 static int smb_direct_rdma_read(struct ksmbd_transport *t,
1941 void *buf, unsigned int buflen,
1942 struct smbdirect_buffer_descriptor_v1 *desc,
1943 unsigned int desc_len)
1944 {
1945 return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
1946 desc, desc_len, true);
1947 }
1948
1949 static void smb_direct_disconnect(struct ksmbd_transport *t)
1950 {
1951 struct smb_direct_transport *st = SMBD_TRANS(t);
1952 struct smbdirect_socket *sc = &st->socket;
1953
1954 ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id);
1955
1956 free_transport(st);
1957 }
1958
1959 static void smb_direct_shutdown(struct ksmbd_transport *t)
1960 {
1961 struct smb_direct_transport *st = SMBD_TRANS(t);
1962 struct smbdirect_socket *sc = &st->socket;
1963
1964 ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id);
1965
1966 smb_direct_disconnect_rdma_work(&sc->disconnect_work);
1967 }
1968
1969 static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
1970 struct rdma_cm_event *event)
1971 {
1972 struct smbdirect_socket *sc = cm_id->context;
1973 unsigned long flags;
1974
1975 ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n",
1976 cm_id, rdma_event_msg(event->event), event->event);
1977
1978 switch (event->event) {
1979 case RDMA_CM_EVENT_ESTABLISHED: {
1980 /*
1981 * Some drivers (at least mlx5_ib and irdma in roce mode)
1982 * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED,
1983 * so we need to adjust our expectation in that case.
1984 *
1985 * If smb_direct_negotiate_recv_done was called first,
1986 * it initialized sc->connect.work but left it for us to
1987 * start, so that we move to
1988 * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED before
1989 * smb_direct_negotiate_recv_work() runs.
1990 *
1991 * If smb_direct_negotiate_recv_done hasn't happened
1992 * yet, sc->connect.work is still disabled and
1993 * queue_work() is a no-op. 
1994 */ 1995 if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING)) 1996 break; 1997 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; 1998 spin_lock_irqsave(&sc->connect.lock, flags); 1999 if (!sc->first_error) 2000 queue_work(sc->workqueue, &sc->connect.work); 2001 spin_unlock_irqrestore(&sc->connect.lock, flags); 2002 wake_up(&sc->status_wait); 2003 break; 2004 } 2005 case RDMA_CM_EVENT_DEVICE_REMOVAL: 2006 case RDMA_CM_EVENT_DISCONNECTED: { 2007 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 2008 smb_direct_disconnect_rdma_work(&sc->disconnect_work); 2009 if (sc->ib.qp) 2010 ib_drain_qp(sc->ib.qp); 2011 break; 2012 } 2013 case RDMA_CM_EVENT_CONNECT_ERROR: { 2014 sc->status = SMBDIRECT_SOCKET_DISCONNECTED; 2015 smb_direct_disconnect_rdma_work(&sc->disconnect_work); 2016 break; 2017 } 2018 default: 2019 pr_err("Unexpected RDMA CM event. cm_id=%p, event=%s (%d)\n", 2020 cm_id, rdma_event_msg(event->event), 2021 event->event); 2022 break; 2023 } 2024 return 0; 2025 } 2026 2027 static void smb_direct_qpair_handler(struct ib_event *event, void *context) 2028 { 2029 struct smbdirect_socket *sc = context; 2030 2031 ksmbd_debug(RDMA, "Received QP event. cm_id=%p, event=%s (%d)\n", 2032 sc->rdma.cm_id, ib_event_msg(event->event), event->event); 2033 2034 switch (event->event) { 2035 case IB_EVENT_CQ_ERR: 2036 case IB_EVENT_QP_FATAL: 2037 smb_direct_disconnect_rdma_connection(sc); 2038 break; 2039 default: 2040 break; 2041 } 2042 } 2043 2044 static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc, 2045 int failed) 2046 { 2047 struct smbdirect_socket_parameters *sp = &sc->parameters; 2048 struct smbdirect_send_io *sendmsg; 2049 struct smbdirect_negotiate_resp *resp; 2050 int ret; 2051 2052 sendmsg = smb_direct_alloc_sendmsg(sc); 2053 if (IS_ERR(sendmsg)) 2054 return -ENOMEM; 2055 2056 resp = (struct smbdirect_negotiate_resp *)sendmsg->packet; 2057 if (failed) { 2058 memset(resp, 0, sizeof(*resp)); 2059 resp->min_version = SMB_DIRECT_VERSION_LE; 2060 resp->max_version = SMB_DIRECT_VERSION_LE; 2061 resp->status = STATUS_NOT_SUPPORTED; 2062 2063 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; 2064 } else { 2065 resp->status = STATUS_SUCCESS; 2066 resp->min_version = SMB_DIRECT_VERSION_LE; 2067 resp->max_version = SMB_DIRECT_VERSION_LE; 2068 resp->negotiated_version = SMB_DIRECT_VERSION_LE; 2069 resp->reserved = 0; 2070 resp->credits_requested = 2071 cpu_to_le16(sp->send_credit_target); 2072 resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); 2073 resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size); 2074 resp->preferred_send_size = cpu_to_le32(sp->max_send_size); 2075 resp->max_receive_size = cpu_to_le32(sp->max_recv_size); 2076 resp->max_fragmented_size = 2077 cpu_to_le32(sp->max_fragmented_recv_size); 2078 2079 atomic_set(&sc->send_io.bcredits.count, 1); 2080 sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; 2081 sc->status = SMBDIRECT_SOCKET_CONNECTED; 2082 } 2083 2084 sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, 2085 (void *)resp, sizeof(*resp), 2086 DMA_TO_DEVICE); 2087 ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); 2088 if (ret) { 2089 smb_direct_free_sendmsg(sc, sendmsg); 2090 return ret; 2091 } 2092 2093 sendmsg->num_sge = 1; 2094 sendmsg->sge[0].length = sizeof(*resp); 2095 sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; 2096 2097 ret = post_sendmsg(sc, NULL, sendmsg); 2098 if (ret) { 2099 smb_direct_free_sendmsg(sc, sendmsg); 2100 return ret; 2101 } 2102 2103 
wait_event(sc->send_io.pending.zero_wait_queue, 2104 atomic_read(&sc->send_io.pending.count) == 0 || 2105 sc->status != SMBDIRECT_SOCKET_CONNECTED); 2106 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) 2107 return -ENOTCONN; 2108 2109 return 0; 2110 } 2111 2112 static int smb_direct_accept_client(struct smbdirect_socket *sc) 2113 { 2114 struct smbdirect_socket_parameters *sp = &sc->parameters; 2115 struct rdma_conn_param conn_param; 2116 __be32 ird_ord_hdr[2]; 2117 int ret; 2118 2119 /* 2120 * smb_direct_handle_connect_request() 2121 * already negotiated sp->initiator_depth 2122 * and sp->responder_resources 2123 */ 2124 memset(&conn_param, 0, sizeof(conn_param)); 2125 conn_param.initiator_depth = sp->initiator_depth; 2126 conn_param.responder_resources = sp->responder_resources; 2127 2128 if (sc->rdma.legacy_iwarp) { 2129 ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); 2130 ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); 2131 conn_param.private_data = ird_ord_hdr; 2132 conn_param.private_data_len = sizeof(ird_ord_hdr); 2133 } else { 2134 conn_param.private_data = NULL; 2135 conn_param.private_data_len = 0; 2136 } 2137 conn_param.retry_count = SMB_DIRECT_CM_RETRY; 2138 conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY; 2139 conn_param.flow_control = 0; 2140 2141 /* 2142 * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING 2143 * so that the timer will cause a disconnect. 2144 */ 2145 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; 2146 mod_delayed_work(sc->workqueue, &sc->idle.timer_work, 2147 msecs_to_jiffies(sp->negotiate_timeout_msec)); 2148 2149 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); 2150 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; 2151 ret = rdma_accept(sc->rdma.cm_id, &conn_param); 2152 if (ret) { 2153 pr_err("error at rdma_accept: %d\n", ret); 2154 return ret; 2155 } 2156 return 0; 2157 } 2158 2159 static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) 2160 { 2161 struct smbdirect_recv_io *recvmsg; 2162 bool recv_posted = false; 2163 int ret; 2164 2165 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); 2166 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; 2167 2168 sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ; 2169 2170 recvmsg = get_free_recvmsg(sc); 2171 if (!recvmsg) 2172 return -ENOMEM; 2173 recvmsg->cqe.done = smb_direct_negotiate_recv_done; 2174 2175 ret = smb_direct_post_recv(sc, recvmsg); 2176 if (ret) { 2177 pr_err("Can't post recv: %d\n", ret); 2178 goto out_err; 2179 } 2180 recv_posted = true; 2181 2182 ret = smb_direct_accept_client(sc); 2183 if (ret) { 2184 pr_err("Can't accept client\n"); 2185 goto out_err; 2186 } 2187 2188 return 0; 2189 out_err: 2190 /* 2191 * If the recv was never posted, return it to the free list. 2192 * If it was posted, leave it alone so disconnect teardown can 2193 * drain the QP and complete it (flush) and the completion path 2194 * will unmap it exactly once. 2195 */ 2196 if (!recv_posted) 2197 put_recvmsg(sc, recvmsg); 2198 return ret; 2199 } 2200 2201 static int smb_direct_init_params(struct smbdirect_socket *sc) 2202 { 2203 struct smbdirect_socket_parameters *sp = &sc->parameters; 2204 int max_send_sges; 2205 unsigned int maxpages; 2206 2207 /* need 3 more sge. because a SMB_DIRECT header, SMB2 header, 2208 * SMB2 response could be mapped. 
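 * As a rough worked example (illustrative numbers only, not taken
 * from a real device): with a max_send_size of 1364 and a 4 KiB
 * PAGE_SIZE the formula below gives
 * DIV_ROUND_UP(1364, 4096) + 3 = 1 + 3 = 4 SGEs per send.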
2209 */ 2210 max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3; 2211 if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) { 2212 pr_err("max_send_size %d is too large\n", sp->max_send_size); 2213 return -EINVAL; 2214 } 2215 2216 atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target); 2217 2218 maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE); 2219 sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev, 2220 sc->rdma.cm_id->port_num, 2221 maxpages); 2222 sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max); 2223 /* add one extra in order to handle unaligned pages */ 2224 sc->rw_io.credits.max += 1; 2225 2226 sc->recv_io.credits.target = 1; 2227 2228 atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); 2229 2230 return 0; 2231 } 2232 2233 static void smb_direct_destroy_pools(struct smbdirect_socket *sc) 2234 { 2235 struct smbdirect_recv_io *recvmsg; 2236 2237 while ((recvmsg = get_free_recvmsg(sc))) 2238 mempool_free(recvmsg, sc->recv_io.mem.pool); 2239 2240 mempool_destroy(sc->recv_io.mem.pool); 2241 sc->recv_io.mem.pool = NULL; 2242 2243 kmem_cache_destroy(sc->recv_io.mem.cache); 2244 sc->recv_io.mem.cache = NULL; 2245 2246 mempool_destroy(sc->send_io.mem.pool); 2247 sc->send_io.mem.pool = NULL; 2248 2249 kmem_cache_destroy(sc->send_io.mem.cache); 2250 sc->send_io.mem.cache = NULL; 2251 } 2252 2253 static int smb_direct_create_pools(struct smbdirect_socket *sc) 2254 { 2255 struct smbdirect_socket_parameters *sp = &sc->parameters; 2256 char name[80]; 2257 int i; 2258 struct smbdirect_recv_io *recvmsg; 2259 2260 snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", sc); 2261 sc->send_io.mem.cache = kmem_cache_create(name, 2262 sizeof(struct smbdirect_send_io) + 2263 sizeof(struct smbdirect_negotiate_resp), 2264 0, SLAB_HWCACHE_ALIGN, NULL); 2265 if (!sc->send_io.mem.cache) 2266 return -ENOMEM; 2267 2268 sc->send_io.mem.pool = mempool_create(sp->send_credit_target, 2269 mempool_alloc_slab, mempool_free_slab, 2270 sc->send_io.mem.cache); 2271 if (!sc->send_io.mem.pool) 2272 goto err; 2273 2274 snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc); 2275 sc->recv_io.mem.cache = kmem_cache_create(name, 2276 sizeof(struct smbdirect_recv_io) + 2277 sp->max_recv_size, 2278 0, SLAB_HWCACHE_ALIGN, NULL); 2279 if (!sc->recv_io.mem.cache) 2280 goto err; 2281 2282 sc->recv_io.mem.pool = 2283 mempool_create(sp->recv_credit_max, mempool_alloc_slab, 2284 mempool_free_slab, sc->recv_io.mem.cache); 2285 if (!sc->recv_io.mem.pool) 2286 goto err; 2287 2288 for (i = 0; i < sp->recv_credit_max; i++) { 2289 recvmsg = mempool_alloc(sc->recv_io.mem.pool, KSMBD_DEFAULT_GFP); 2290 if (!recvmsg) 2291 goto err; 2292 recvmsg->socket = sc; 2293 recvmsg->sge.length = 0; 2294 list_add(&recvmsg->list, &sc->recv_io.free.list); 2295 } 2296 2297 return 0; 2298 err: 2299 smb_direct_destroy_pools(sc); 2300 return -ENOMEM; 2301 } 2302 2303 static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, const struct ib_qp_init_attr *attr) 2304 { 2305 /* 2306 * This could be split out of rdma_rw_init_qp() 2307 * and be a helper function next to rdma_rw_mr_factor() 2308 * 2309 * We can't check unlikely(rdma_rw_force_mr) here, 2310 * but that is most likely 0 anyway. 2311 */ 2312 u32 factor; 2313 2314 WARN_ON_ONCE(attr->port_num == 0); 2315 2316 /* 2317 * Each context needs at least one RDMA READ or WRITE WR. 2318 * 2319 * For some hardware we might need more, eventually we should ask the 2320 * HCA driver for a multiplier here. 
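 * Illustrative sketch of the resulting arithmetic (assumed values,
 * not derived from any specific HCA): with factor = 1 and, say,
 * max_rdma_ctxs = 9 this contributes 9 extra send WRs; an
 * MR-requiring device (the iWarp/max_sgl_rd check below) raises the
 * factor to 3, i.e. 27 extra send WRs.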
2321 */ 2322 factor = 1; 2323 2324 /* 2325 * If the device needs MRs to perform RDMA READ or WRITE operations, 2326 * we'll need two additional MRs for the registrations and the 2327 * invalidation. 2328 */ 2329 if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd) 2330 factor += 2; /* inv + reg */ 2331 2332 return factor * attr->cap.max_rdma_ctxs; 2333 } 2334 2335 static int smb_direct_create_qpair(struct smbdirect_socket *sc) 2336 { 2337 struct smbdirect_socket_parameters *sp = &sc->parameters; 2338 int ret; 2339 struct ib_qp_cap qp_cap; 2340 struct ib_qp_init_attr qp_attr; 2341 u32 max_send_wr; 2342 u32 rdma_send_wr; 2343 2344 /* 2345 * Note that {rdma,ib}_create_qp() will call 2346 * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0. 2347 * It will adjust cap->max_send_wr to the required 2348 * number of additional WRs for the RDMA RW operations. 2349 * It will cap cap->max_send_wr to the device limit. 2350 * 2351 * +1 for ib_drain_qp 2352 */ 2353 qp_cap.max_send_wr = sp->send_credit_target + 1; 2354 qp_cap.max_recv_wr = sp->recv_credit_max + 1; 2355 qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; 2356 qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; 2357 qp_cap.max_inline_data = 0; 2358 qp_cap.max_rdma_ctxs = sc->rw_io.credits.max; 2359 2360 /* 2361 * Find out the number of max_send_wr 2362 * after rdma_rw_init_qp() adjusted it. 2363 * 2364 * We only do it on a temporary variable, 2365 * as rdma_create_qp() will trigger 2366 * rdma_rw_init_qp() again. 2367 */ 2368 memset(&qp_attr, 0, sizeof(qp_attr)); 2369 qp_attr.cap = qp_cap; 2370 qp_attr.port_num = sc->rdma.cm_id->port_num; 2371 rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr); 2372 max_send_wr = qp_cap.max_send_wr + rdma_send_wr; 2373 2374 if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe || 2375 qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) { 2376 pr_err("Possible CQE overrun: max_send_wr %d\n", 2377 qp_cap.max_send_wr); 2378 pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", 2379 IB_DEVICE_NAME_MAX, 2380 sc->ib.dev->name, 2381 sc->ib.dev->attrs.max_cqe, 2382 sc->ib.dev->attrs.max_qp_wr); 2383 pr_err("consider lowering send_credit_target = %d\n", 2384 sp->send_credit_target); 2385 return -EINVAL; 2386 } 2387 2388 if (qp_cap.max_rdma_ctxs && 2389 (max_send_wr >= sc->ib.dev->attrs.max_cqe || 2390 max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) { 2391 pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n", 2392 rdma_send_wr, qp_cap.max_send_wr, max_send_wr); 2393 pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", 2394 IB_DEVICE_NAME_MAX, 2395 sc->ib.dev->name, 2396 sc->ib.dev->attrs.max_cqe, 2397 sc->ib.dev->attrs.max_qp_wr); 2398 pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n", 2399 sp->send_credit_target, qp_cap.max_rdma_ctxs); 2400 return -EINVAL; 2401 } 2402 2403 if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe || 2404 qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) { 2405 pr_err("Possible CQE overrun: max_recv_wr %d\n", 2406 qp_cap.max_recv_wr); 2407 pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", 2408 IB_DEVICE_NAME_MAX, 2409 sc->ib.dev->name, 2410 sc->ib.dev->attrs.max_cqe, 2411 sc->ib.dev->attrs.max_qp_wr); 2412 pr_err("consider lowering receive_credit_max = %d\n", 2413 sp->recv_credit_max); 2414 return -EINVAL; 2415 } 2416 2417 if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge || 2418 qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) { 2419 pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", 
2420 IB_DEVICE_NAME_MAX, 2421 sc->ib.dev->name, 2422 sc->ib.dev->attrs.max_send_sge, 2423 sc->ib.dev->attrs.max_recv_sge); 2424 return -EINVAL; 2425 } 2426 2427 sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); 2428 if (IS_ERR(sc->ib.pd)) { 2429 pr_err("Can't create RDMA PD\n"); 2430 ret = PTR_ERR(sc->ib.pd); 2431 sc->ib.pd = NULL; 2432 return ret; 2433 } 2434 2435 sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, 2436 max_send_wr, 2437 IB_POLL_WORKQUEUE); 2438 if (IS_ERR(sc->ib.send_cq)) { 2439 pr_err("Can't create RDMA send CQ\n"); 2440 ret = PTR_ERR(sc->ib.send_cq); 2441 sc->ib.send_cq = NULL; 2442 goto err; 2443 } 2444 2445 sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc, 2446 qp_cap.max_recv_wr, 2447 IB_POLL_WORKQUEUE); 2448 if (IS_ERR(sc->ib.recv_cq)) { 2449 pr_err("Can't create RDMA recv CQ\n"); 2450 ret = PTR_ERR(sc->ib.recv_cq); 2451 sc->ib.recv_cq = NULL; 2452 goto err; 2453 } 2454 2455 /* 2456 * We reset completely here! 2457 * As the above use was just temporary 2458 * to calc max_send_wr and rdma_send_wr. 2459 * 2460 * rdma_create_qp() will trigger rdma_rw_init_qp() 2461 * again if max_rdma_ctxs is not 0. 2462 */ 2463 memset(&qp_attr, 0, sizeof(qp_attr)); 2464 qp_attr.event_handler = smb_direct_qpair_handler; 2465 qp_attr.qp_context = sc; 2466 qp_attr.cap = qp_cap; 2467 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 2468 qp_attr.qp_type = IB_QPT_RC; 2469 qp_attr.send_cq = sc->ib.send_cq; 2470 qp_attr.recv_cq = sc->ib.recv_cq; 2471 qp_attr.port_num = ~0; 2472 2473 ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr); 2474 if (ret) { 2475 pr_err("Can't create RDMA QP: %d\n", ret); 2476 goto err; 2477 } 2478 2479 sc->ib.qp = sc->rdma.cm_id->qp; 2480 sc->rdma.cm_id->event_handler = smb_direct_cm_handler; 2481 2482 return 0; 2483 err: 2484 if (sc->ib.qp) { 2485 sc->ib.qp = NULL; 2486 rdma_destroy_qp(sc->rdma.cm_id); 2487 } 2488 if (sc->ib.recv_cq) { 2489 ib_destroy_cq(sc->ib.recv_cq); 2490 sc->ib.recv_cq = NULL; 2491 } 2492 if (sc->ib.send_cq) { 2493 ib_destroy_cq(sc->ib.send_cq); 2494 sc->ib.send_cq = NULL; 2495 } 2496 if (sc->ib.pd) { 2497 ib_dealloc_pd(sc->ib.pd); 2498 sc->ib.pd = NULL; 2499 } 2500 return ret; 2501 } 2502 2503 static int smb_direct_prepare(struct ksmbd_transport *t) 2504 { 2505 struct smb_direct_transport *st = SMBD_TRANS(t); 2506 struct smbdirect_socket *sc = &st->socket; 2507 struct smbdirect_socket_parameters *sp = &sc->parameters; 2508 struct smbdirect_recv_io *recvmsg; 2509 struct smbdirect_negotiate_req *req; 2510 unsigned long flags; 2511 int ret; 2512 2513 /* 2514 * We are waiting to pass the following states: 2515 * 2516 * SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED 2517 * SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING 2518 * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED 2519 * 2520 * To finally get to SMBDIRECT_SOCKET_NEGOTIATE_RUNNING 2521 * in order to continue below. 2522 * 2523 * Everything else is unexpected and an error. 2524 */ 2525 ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); 2526 ret = wait_event_interruptible_timeout(sc->status_wait, 2527 sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED && 2528 sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING && 2529 sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, 2530 msecs_to_jiffies(sp->negotiate_timeout_msec)); 2531 if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING) 2532 return ret < 0 ? 
ret : -ETIMEDOUT; 2533 2534 recvmsg = get_first_reassembly(sc); 2535 if (!recvmsg) 2536 return -ECONNABORTED; 2537 2538 ret = smb_direct_check_recvmsg(recvmsg); 2539 if (ret) 2540 goto put; 2541 2542 req = (struct smbdirect_negotiate_req *)recvmsg->packet; 2543 sp->max_recv_size = min_t(int, sp->max_recv_size, 2544 le32_to_cpu(req->preferred_send_size)); 2545 sp->max_send_size = min_t(int, sp->max_send_size, 2546 le32_to_cpu(req->max_receive_size)); 2547 sp->max_fragmented_send_size = 2548 le32_to_cpu(req->max_fragmented_size); 2549 /* 2550 * The maximum fragmented upper-layer payload receive size supported 2551 * 2552 * Assume max_payload_per_credit is 2553 * smb_direct_receive_credit_max - 24 = 1340 2554 * 2555 * The maximum number would be 2556 * smb_direct_receive_credit_max * max_payload_per_credit 2557 * 2558 * 1340 * 255 = 341700 (0x536C4) 2559 * 2560 * The minimum value from the spec is 131072 (0x20000) 2561 * 2562 * For now we use the logic we used before: 2563 * (1364 * 255) / 2 = 173910 (0x2A756) 2564 * 2565 * We need to adjust this here in case the peer 2566 * lowered sp->max_recv_size. 2567 * 2568 * TODO: instead of adjusting max_fragmented_recv_size 2569 * we should adjust the number of available buffers, 2570 * but for now we keep the current logic. 2571 */ 2572 sp->max_fragmented_recv_size = 2573 (sp->recv_credit_max * sp->max_recv_size) / 2; 2574 sc->recv_io.credits.target = le16_to_cpu(req->credits_requested); 2575 sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); 2576 sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1); 2577 2578 put: 2579 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); 2580 sc->recv_io.reassembly.queue_length--; 2581 list_del(&recvmsg->list); 2582 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); 2583 put_recvmsg(sc, recvmsg); 2584 2585 if (ret == -ECONNABORTED) 2586 return ret; 2587 2588 if (ret) 2589 goto respond; 2590 2591 /* 2592 * We negotiated with success, so we need to refill the recv queue. 2593 * We do that with sc->idle.immediate_work still being disabled 2594 * via smbdirect_socket_init(), so that queue_work(sc->workqueue, 2595 * &sc->idle.immediate_work) in smb_direct_post_recv_credits() 2596 * is a no-op. 2597 * 2598 * The message that grants the credits to the client is 2599 * the negotiate response. 
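 * (The grant value itself is computed by manage_credits_prior_sending()
 * and placed into resp->credits_granted when
 * smb_direct_send_negotiate_response() builds that response.)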
2600 */
2601 INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits);
2602 smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work);
2603 if (unlikely(sc->first_error))
2604 return sc->first_error;
2605 INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work);
2606
2607 respond:
2608 ret = smb_direct_send_negotiate_response(sc, ret);
2609
2610 return ret;
2611 }
2612
2613 static int smb_direct_connect(struct smbdirect_socket *sc)
2614 {
2615 struct smbdirect_recv_io *recv_io;
2616 int ret;
2617
2618 ret = smb_direct_init_params(sc);
2619 if (ret) {
2620 pr_err("Can't configure RDMA parameters\n");
2621 return ret;
2622 }
2623
2624 ret = smb_direct_create_pools(sc);
2625 if (ret) {
2626 pr_err("Can't init RDMA pool: %d\n", ret);
2627 return ret;
2628 }
2629
2630 list_for_each_entry(recv_io, &sc->recv_io.free.list, list)
2631 recv_io->cqe.done = recv_done;
2632
2633 ret = smb_direct_create_qpair(sc);
2634 if (ret) {
2635 pr_err("Can't accept RDMA client: %d\n", ret);
2636 return ret;
2637 }
2638
2639 ret = smb_direct_prepare_negotiation(sc);
2640 if (ret) {
2641 pr_err("Can't negotiate: %d\n", ret);
2642 return ret;
2643 }
2644 return 0;
2645 }
2646
2647 static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
2648 {
2649 if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
2650 return false;
2651 if (attrs->max_fast_reg_page_list_len == 0)
2652 return false;
2653 return true;
2654 }
2655
2656 static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id,
2657 struct rdma_cm_event *event)
2658 {
2659 struct smb_direct_listener *listener = new_cm_id->context;
2660 struct smb_direct_transport *t;
2661 struct smbdirect_socket *sc;
2662 struct smbdirect_socket_parameters *sp;
2663 struct task_struct *handler;
2664 u8 peer_initiator_depth;
2665 u8 peer_responder_resources;
2666 int ret;
2667
2668 if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
2669 ksmbd_debug(RDMA,
2670 "Fast Registration Work Requests are not supported. device capabilities=%llx\n",
2671 new_cm_id->device->attrs.device_cap_flags);
2672 return -EPROTONOSUPPORT;
2673 }
2674
2675 t = alloc_transport(new_cm_id);
2676 if (!t)
2677 return -ENOMEM;
2678 sc = &t->socket;
2679 sp = &sc->parameters;
2680
2681 peer_initiator_depth = event->param.conn.initiator_depth;
2682 peer_responder_resources = event->param.conn.responder_resources;
2683 if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) &&
2684 event->param.conn.private_data_len == 8) {
2685 /*
2686 * Legacy clients with only iWarp MPA v1 support
2687 * need a private blob in order to negotiate
2688 * the IRD/ORD values.
2689 */
2690 const __be32 *ird_ord_hdr = event->param.conn.private_data;
2691 u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
2692 u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
2693
2694 /*
2695 * cifs.ko sends the legacy IRD/ORD negotiation
2696 * even if iWarp MPA v2 was used.
2697 *
2698 * Here we check that the values match and only
2699 * mark the client as legacy if they don't match.
2700 */
2701 if ((u32)event->param.conn.initiator_depth != ird32 ||
2702 (u32)event->param.conn.responder_resources != ord32) {
2703 /*
2704 * There are broken clients (old cifs.ko)
2705 * using little endian, and
2706 * struct rdma_conn_param only uses u8
2707 * for initiator_depth and responder_resources,
2708 * so we clamp the values to U8_MAX.
2709 *
2710 * smb_direct_accept_client() will then
2711 * do the real negotiation in order to
2712 * select the minimum between client and
2713 * server. 
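 * For example (illustrative values only): a legacy client advertising
 * IRD/ORD of 512 in the private blob is clamped to 255 here, and
 * smb_direct_accept_client() then negotiates the minimum of 255 and
 * our own limits.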
2714 */
2715 ird32 = min_t(u32, ird32, U8_MAX);
2716 ord32 = min_t(u32, ord32, U8_MAX);
2717
2718 sc->rdma.legacy_iwarp = true;
2719 peer_initiator_depth = (u8)ird32;
2720 peer_responder_resources = (u8)ord32;
2721 }
2722 }
2723
2724 /*
2725 * First set what we as the server are able to support
2726 */
2727 sp->initiator_depth = min_t(u8, sp->initiator_depth,
2728 new_cm_id->device->attrs.max_qp_rd_atom);
2729
2730 /*
2731 * negotiate the values by using the minimum
2732 * between client and server if the client provided
2733 * non-zero values.
2734 */
2735 if (peer_initiator_depth != 0)
2736 sp->initiator_depth = min_t(u8, sp->initiator_depth,
2737 peer_initiator_depth);
2738 if (peer_responder_resources != 0)
2739 sp->responder_resources = min_t(u8, sp->responder_resources,
2740 peer_responder_resources);
2741
2742 ret = smb_direct_connect(sc);
2743 if (ret)
2744 goto out_err;
2745
2746 handler = kthread_run(ksmbd_conn_handler_loop,
2747 KSMBD_TRANS(t)->conn, "ksmbd:r%u",
2748 listener->port);
2749 if (IS_ERR(handler)) {
2750 ret = PTR_ERR(handler);
2751 pr_err("Can't start thread\n");
2752 goto out_err;
2753 }
2754
2755 return 0;
2756 out_err:
2757 free_transport(t);
2758 return ret;
2759 }
2760
2761 static int smb_direct_listen_handler(struct rdma_cm_id *cm_id,
2762 struct rdma_cm_event *event)
2763 {
2764 switch (event->event) {
2765 case RDMA_CM_EVENT_CONNECT_REQUEST: {
2766 int ret = smb_direct_handle_connect_request(cm_id, event);
2767
2768 if (ret) {
2769 pr_err("Can't create transport: %d\n", ret);
2770 return ret;
2771 }
2772
2773 ksmbd_debug(RDMA, "Received connection request. cm_id=%p\n",
2774 cm_id);
2775 break;
2776 }
2777 default:
2778 pr_err("Unexpected listen event. cm_id=%p, event=%s (%d)\n",
2779 cm_id, rdma_event_msg(event->event), event->event);
2780 break;
2781 }
2782 return 0;
2783 }
2784
2785 static int smb_direct_listen(struct smb_direct_listener *listener,
2786 int port)
2787 {
2788 int ret;
2789 struct rdma_cm_id *cm_id;
2790 u8 node_type = RDMA_NODE_UNSPECIFIED;
2791 struct sockaddr_in sin = {
2792 .sin_family = AF_INET,
2793 .sin_addr.s_addr = htonl(INADDR_ANY),
2794 .sin_port = htons(port),
2795 };
2796
2797 switch (port) {
2798 case SMB_DIRECT_PORT_IWARP:
2799 /*
2800 * only allow iWarp devices
2801 * for port 5445.
2802 */
2803 node_type = RDMA_NODE_RNIC;
2804 break;
2805 case SMB_DIRECT_PORT_INFINIBAND:
2806 /*
2807 * only allow InfiniBand, RoCEv1 or RoCEv2
2808 * devices for port 445. 
2809 * 2810 * (Basically don't allow iWarp devices) 2811 */ 2812 node_type = RDMA_NODE_IB_CA; 2813 break; 2814 default: 2815 pr_err("unsupported smbdirect port=%d!\n", port); 2816 return -ENODEV; 2817 } 2818 2819 cm_id = rdma_create_id(&init_net, smb_direct_listen_handler, 2820 listener, RDMA_PS_TCP, IB_QPT_RC); 2821 if (IS_ERR(cm_id)) { 2822 pr_err("Can't create cm id: %ld\n", PTR_ERR(cm_id)); 2823 return PTR_ERR(cm_id); 2824 } 2825 2826 ret = rdma_restrict_node_type(cm_id, node_type); 2827 if (ret) { 2828 pr_err("rdma_restrict_node_type(%u) failed %d\n", 2829 node_type, ret); 2830 goto err; 2831 } 2832 2833 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 2834 if (ret) { 2835 pr_err("Can't bind: %d\n", ret); 2836 goto err; 2837 } 2838 2839 ret = rdma_listen(cm_id, 10); 2840 if (ret) { 2841 pr_err("Can't listen: %d\n", ret); 2842 goto err; 2843 } 2844 2845 listener->port = port; 2846 listener->cm_id = cm_id; 2847 2848 return 0; 2849 err: 2850 listener->port = 0; 2851 listener->cm_id = NULL; 2852 rdma_destroy_id(cm_id); 2853 return ret; 2854 } 2855 2856 static int smb_direct_ib_client_add(struct ib_device *ib_dev) 2857 { 2858 struct smb_direct_device *smb_dev; 2859 2860 if (!rdma_frwr_is_supported(&ib_dev->attrs)) 2861 return 0; 2862 2863 smb_dev = kzalloc(sizeof(*smb_dev), KSMBD_DEFAULT_GFP); 2864 if (!smb_dev) 2865 return -ENOMEM; 2866 smb_dev->ib_dev = ib_dev; 2867 2868 write_lock(&smb_direct_device_lock); 2869 list_add(&smb_dev->list, &smb_direct_device_list); 2870 write_unlock(&smb_direct_device_lock); 2871 2872 ksmbd_debug(RDMA, "ib device added: name %s\n", ib_dev->name); 2873 return 0; 2874 } 2875 2876 static void smb_direct_ib_client_remove(struct ib_device *ib_dev, 2877 void *client_data) 2878 { 2879 struct smb_direct_device *smb_dev, *tmp; 2880 2881 write_lock(&smb_direct_device_lock); 2882 list_for_each_entry_safe(smb_dev, tmp, &smb_direct_device_list, list) { 2883 if (smb_dev->ib_dev == ib_dev) { 2884 list_del(&smb_dev->list); 2885 kfree(smb_dev); 2886 break; 2887 } 2888 } 2889 write_unlock(&smb_direct_device_lock); 2890 } 2891 2892 static struct ib_client smb_direct_ib_client = { 2893 .name = "ksmbd_smb_direct_ib", 2894 .add = smb_direct_ib_client_add, 2895 .remove = smb_direct_ib_client_remove, 2896 }; 2897 2898 int ksmbd_rdma_init(void) 2899 { 2900 int ret; 2901 2902 smb_direct_ib_listener = smb_direct_iw_listener = (struct smb_direct_listener) { 2903 .cm_id = NULL, 2904 }; 2905 2906 ret = ib_register_client(&smb_direct_ib_client); 2907 if (ret) { 2908 pr_err("failed to ib_register_client\n"); 2909 return ret; 2910 } 2911 2912 /* When a client is running out of send credits, the credits are 2913 * granted by the server's sending a packet using this queue. 2914 * This avoids the situation that a clients cannot send packets 2915 * for lack of credits 2916 */ 2917 smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq", 2918 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU, 2919 0); 2920 if (!smb_direct_wq) { 2921 ret = -ENOMEM; 2922 goto err; 2923 } 2924 2925 ret = smb_direct_listen(&smb_direct_ib_listener, 2926 SMB_DIRECT_PORT_INFINIBAND); 2927 if (ret) { 2928 pr_err("Can't listen on InfiniBand/RoCEv1/RoCEv2: %d\n", ret); 2929 goto err; 2930 } 2931 2932 ksmbd_debug(RDMA, "InfiniBand/RoCEv1/RoCEv2 RDMA listener. cm_id=%p\n", 2933 smb_direct_ib_listener.cm_id); 2934 2935 ret = smb_direct_listen(&smb_direct_iw_listener, 2936 SMB_DIRECT_PORT_IWARP); 2937 if (ret) { 2938 pr_err("Can't listen on iWarp: %d\n", ret); 2939 goto err; 2940 } 2941 2942 ksmbd_debug(RDMA, "iWarp RDMA listener. 
cm_id=%p\n", 2943 smb_direct_iw_listener.cm_id); 2944 2945 return 0; 2946 err: 2947 ksmbd_rdma_stop_listening(); 2948 ksmbd_rdma_destroy(); 2949 return ret; 2950 } 2951 2952 void ksmbd_rdma_stop_listening(void) 2953 { 2954 if (!smb_direct_ib_listener.cm_id && !smb_direct_iw_listener.cm_id) 2955 return; 2956 2957 ib_unregister_client(&smb_direct_ib_client); 2958 2959 if (smb_direct_ib_listener.cm_id) 2960 rdma_destroy_id(smb_direct_ib_listener.cm_id); 2961 if (smb_direct_iw_listener.cm_id) 2962 rdma_destroy_id(smb_direct_iw_listener.cm_id); 2963 2964 smb_direct_ib_listener = smb_direct_iw_listener = (struct smb_direct_listener) { 2965 .cm_id = NULL, 2966 }; 2967 } 2968 2969 void ksmbd_rdma_destroy(void) 2970 { 2971 if (smb_direct_wq) { 2972 destroy_workqueue(smb_direct_wq); 2973 smb_direct_wq = NULL; 2974 } 2975 } 2976 2977 static bool ksmbd_find_rdma_capable_netdev(struct net_device *netdev) 2978 { 2979 struct smb_direct_device *smb_dev; 2980 int i; 2981 bool rdma_capable = false; 2982 2983 read_lock(&smb_direct_device_lock); 2984 list_for_each_entry(smb_dev, &smb_direct_device_list, list) { 2985 for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) { 2986 struct net_device *ndev; 2987 2988 ndev = ib_device_get_netdev(smb_dev->ib_dev, i + 1); 2989 if (!ndev) 2990 continue; 2991 2992 if (ndev == netdev) { 2993 dev_put(ndev); 2994 rdma_capable = true; 2995 goto out; 2996 } 2997 dev_put(ndev); 2998 } 2999 } 3000 out: 3001 read_unlock(&smb_direct_device_lock); 3002 3003 if (rdma_capable == false) { 3004 struct ib_device *ibdev; 3005 3006 ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); 3007 if (ibdev) { 3008 rdma_capable = rdma_frwr_is_supported(&ibdev->attrs); 3009 ib_device_put(ibdev); 3010 } 3011 } 3012 3013 ksmbd_debug(RDMA, "netdev(%s) rdma capable : %s\n", 3014 netdev->name, str_true_false(rdma_capable)); 3015 3016 return rdma_capable; 3017 } 3018 3019 bool ksmbd_rdma_capable_netdev(struct net_device *netdev) 3020 { 3021 struct net_device *lower_dev; 3022 struct list_head *iter; 3023 3024 if (ksmbd_find_rdma_capable_netdev(netdev)) 3025 return true; 3026 3027 /* check if netdev is bridge or VLAN */ 3028 if (netif_is_bridge_master(netdev) || 3029 netdev->priv_flags & IFF_802_1Q_VLAN) 3030 netdev_for_each_lower_dev(netdev, lower_dev, iter) 3031 if (ksmbd_find_rdma_capable_netdev(lower_dev)) 3032 return true; 3033 3034 /* check if netdev is IPoIB safely without layer violation */ 3035 if (netdev->type == ARPHRD_INFINIBAND) 3036 return true; 3037 3038 return false; 3039 } 3040 3041 static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { 3042 .prepare = smb_direct_prepare, 3043 .disconnect = smb_direct_disconnect, 3044 .shutdown = smb_direct_shutdown, 3045 .writev = smb_direct_writev, 3046 .read = smb_direct_read, 3047 .rdma_read = smb_direct_rdma_read, 3048 .rdma_write = smb_direct_rdma_write, 3049 .free_transport = smb_direct_free_transport, 3050 }; 3051