// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017, Microsoft Corporation.
 *
 * Author(s): Long Li <longli@microsoft.com>
 */
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/folio_queue.h>
#include "../common/smbdirect/smbdirect_pdu.h"
#include "smbdirect.h"
#include "cifs_debug.h"
#include "cifsproto.h"
#include "smb2proto.h"

const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
{
	struct smbdirect_socket *sc = &conn->socket;

	return &sc->parameters;
}

static struct smbdirect_recv_io *get_receive_buffer(
		struct smbdirect_socket *sc);
static void put_receive_buffer(
		struct smbdirect_socket *sc,
		struct smbdirect_recv_io *response);
static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf);
static void destroy_receive_buffers(struct smbdirect_socket *sc);

static void enqueue_reassembly(
		struct smbdirect_socket *sc,
		struct smbdirect_recv_io *response, int data_length);
static struct smbdirect_recv_io *_get_first_reassembly(
		struct smbdirect_socket *sc);

static int smbd_post_recv(
		struct smbdirect_socket *sc,
		struct smbdirect_recv_io *response);

static int smbd_post_send_empty(struct smbdirect_socket *sc);

static void destroy_mr_list(struct smbdirect_socket *sc);
static int allocate_mr_list(struct smbdirect_socket *sc);

struct smb_extract_to_rdma {
	struct ib_sge *sge;
	unsigned int nr_sge;
	unsigned int max_sge;
	struct ib_device *device;
	u32 local_dma_lkey;
	enum dma_data_direction direction;
};
static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
					struct smb_extract_to_rdma *rdma);

/* Port numbers for SMBD transport */
#define SMB_PORT	445
#define SMBD_PORT	5445

/* Address lookup and resolve timeout in ms */
#define RDMA_RESOLVE_TIMEOUT	5000

/* SMBD negotiation timeout in seconds */
#define SMBD_NEGOTIATE_TIMEOUT	120

/* The timeout to wait for a keepalive message from peer in seconds */
#define KEEPALIVE_RECV_TIMEOUT	5

/* SMBD minimum receive size and fragmented size defined in [MS-SMBD] */
#define SMBD_MIN_RECEIVE_SIZE		128
#define SMBD_MIN_FRAGMENTED_SIZE	131072

/*
 * Default maximum number of RDMA read/write outstanding on this connection
 * This value may be decreased during QP creation, based on hardware limits
 */
#define SMBD_CM_RESPONDER_RESOURCES	32

/* Maximum number of retries on data transfer operations */
#define SMBD_CM_RETRY			6
/* No need to retry on Receiver Not Ready since SMBD manages credits */
#define SMBD_CM_RNR_RETRY		0

/*
 * User configurable initial values per SMBD transport connection
 * as defined in [MS-SMBD] 3.1.1.1
 * Those may change after a SMBD negotiation
 */
/* The local peer's maximum number of credits to grant to the peer */
int smbd_receive_credit_max = 255;

/* The number of send credits requested from the remote peer */
int smbd_send_credit_target = 255;

/* The maximum single message size that can be sent to the remote peer */
int smbd_max_send_size = 1364;

/* The maximum fragmented upper-layer payload receive size supported */
int smbd_max_fragmented_recv_size = 1024 * 1024;

/* The maximum single-message size which can be received */
int smbd_max_receive_size = 1364;

/* The timeout to initiate send of a keepalive message on idle */
int smbd_keep_alive_interval = 120;

/*
 * User configurable initial values for RDMA transport
 * The actual values used may be lower and are limited to hardware capabilities
 */
/* Default maximum number of pages in a single RDMA write/read */
int smbd_max_frmr_depth = 2048;

/* If the payload is smaller than this many bytes, use RDMA send/recv, not read/write */
int rdma_readwrite_threshold = 4096;

/* Transport logging functions
 * Logging is organized into classes. They can be OR'ed to define the actual
 * logging level via the module parameter smbd_logging_class,
 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
 * log_rdma_event()
 */
#define LOG_OUTGOING			0x1
#define LOG_INCOMING			0x2
#define LOG_READ			0x4
#define LOG_WRITE			0x8
#define LOG_RDMA_SEND			0x10
#define LOG_RDMA_RECV			0x20
#define LOG_KEEP_ALIVE			0x40
#define LOG_RDMA_EVENT			0x80
#define LOG_RDMA_MR			0x100
static unsigned int smbd_logging_class;
module_param(smbd_logging_class, uint, 0644);
MODULE_PARM_DESC(smbd_logging_class,
	"Logging class for SMBD transport 0x0 to 0x100");

#define ERR		0x0
#define INFO		0x1
static unsigned int smbd_logging_level = ERR;
module_param(smbd_logging_level, uint, 0644);
MODULE_PARM_DESC(smbd_logging_level,
	"Logging level for SMBD transport, 0 (default): error, 1: info");

#define log_rdma(level, class, fmt, args...)				\
do {									\
	if (level <= smbd_logging_level || class & smbd_logging_class)	\
		cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
} while (0)

#define log_outgoing(level, fmt, args...) \
		log_rdma(level, LOG_OUTGOING, fmt, ##args)
#define log_incoming(level, fmt, args...) \
		log_rdma(level, LOG_INCOMING, fmt, ##args)
#define log_read(level, fmt, args...)	log_rdma(level, LOG_READ, fmt, ##args)
#define log_write(level, fmt, args...)	log_rdma(level, LOG_WRITE, fmt, ##args)
#define log_rdma_send(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
#define log_rdma_recv(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
#define log_keep_alive(level, fmt, args...) \
		log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
#define log_rdma_event(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
#define log_rdma_mr(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_MR, fmt, ##args)
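
/*
 * Usage note (informative): setting smbd_logging_class=0x30 enables all
 * log_rdma_send() and log_rdma_recv() messages regardless of level, while
 * smbd_logging_level=1 enables INFO messages for every class. Both module
 * parameters are writable at runtime via
 * /sys/module/cifs/parameters/smbd_logging_class and
 * /sys/module/cifs/parameters/smbd_logging_level.
 */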

static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc)
{
	/*
	 * Wake up all waiters in all wait queues
	 * in order to notice the broken connection.
	 */
	wake_up_all(&sc->status_wait);
	wake_up_all(&sc->send_io.lcredits.wait_queue);
	wake_up_all(&sc->send_io.credits.wait_queue);
	wake_up_all(&sc->send_io.pending.dec_wait_queue);
	wake_up_all(&sc->send_io.pending.zero_wait_queue);
	wake_up_all(&sc->recv_io.reassembly.wait_queue);
	wake_up_all(&sc->mr_io.ready.wait_queue);
	wake_up_all(&sc->mr_io.cleanup.wait_queue);
}

static void smbd_disconnect_rdma_work(struct work_struct *work)
{
	struct smbdirect_socket *sc =
		container_of(work, struct smbdirect_socket, disconnect_work);

	/*
	 * make sure this and other work is not queued again
	 * but here we don't block and avoid
	 * disable[_delayed]_work_sync()
	 */
	disable_work(&sc->disconnect_work);
	disable_work(&sc->recv_io.posted.refill_work);
	disable_work(&sc->mr_io.recovery_work);
	disable_work(&sc->idle.immediate_work);
	disable_delayed_work(&sc->idle.timer_work);

	if (sc->first_error == 0)
		sc->first_error = -ECONNABORTED;

	switch (sc->status) {
	case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
	case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
	case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
	case SMBDIRECT_SOCKET_CONNECTED:
	case SMBDIRECT_SOCKET_ERROR:
		sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
		rdma_disconnect(sc->rdma.cm_id);
		break;

	case SMBDIRECT_SOCKET_CREATED:
	case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
	case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
	case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
	case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
	case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
	case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
		/*
		 * rdma_connect() never reached
		 * RDMA_CM_EVENT_ESTABLISHED
		 */
		sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
		break;

	case SMBDIRECT_SOCKET_DISCONNECTING:
	case SMBDIRECT_SOCKET_DISCONNECTED:
	case SMBDIRECT_SOCKET_DESTROYED:
		break;
	}

	/*
	 * Wake up all waiters in all wait queues
	 * in order to notice the broken connection.
	 */
	smbd_disconnect_wake_up_all(sc);
}
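
/*
 * Note (informative): smbd_disconnect_rdma_work() above only calls
 * rdma_disconnect() for states that reached RDMA_CM_EVENT_ESTABLISHED
 * (negotiating, connected or error); for the earlier resolve/connect
 * states there is no established connection to tear down, so the socket
 * goes straight to SMBDIRECT_SOCKET_DISCONNECTED.
 */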

static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc)
{
	/*
	 * make sure other work (than disconnect_work) is
	 * not queued again but here we don't block and avoid
	 * disable[_delayed]_work_sync()
	 */
	disable_work(&sc->recv_io.posted.refill_work);
	disable_work(&sc->mr_io.recovery_work);
	disable_work(&sc->idle.immediate_work);
	disable_delayed_work(&sc->idle.timer_work);

	if (sc->first_error == 0)
		sc->first_error = -ECONNABORTED;

	switch (sc->status) {
	case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
	case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
	case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
	case SMBDIRECT_SOCKET_ERROR:
	case SMBDIRECT_SOCKET_DISCONNECTING:
	case SMBDIRECT_SOCKET_DISCONNECTED:
	case SMBDIRECT_SOCKET_DESTROYED:
		/*
		 * Keep the current error status
		 */
		break;

	case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
	case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
		sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
		break;

	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
		sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
		break;

	case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
	case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
		sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
		break;

	case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
	case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
		sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
		break;

	case SMBDIRECT_SOCKET_CREATED:
	case SMBDIRECT_SOCKET_CONNECTED:
		sc->status = SMBDIRECT_SOCKET_ERROR;
		break;
	}

	/*
	 * Wake up all waiters in all wait queues
	 * in order to notice the broken connection.
	 */
	smbd_disconnect_wake_up_all(sc);

	queue_work(sc->workqueue, &sc->disconnect_work);
}

/* Upcall from RDMA CM */
static int smbd_conn_upcall(
		struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct smbdirect_socket *sc = id->context;
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	const char *event_name = rdma_event_msg(event->event);
	u8 peer_initiator_depth;
	u8 peer_responder_resources;

	log_rdma_event(INFO, "event=%s status=%d\n",
		       event_name, event->status);

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
		sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED;
		wake_up(&sc->status_wait);
		break;

	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
		sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
		wake_up(&sc->status_wait);
		break;

	case RDMA_CM_EVENT_ADDR_ERROR:
		log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
		sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
		smbd_disconnect_rdma_work(&sc->disconnect_work);
		break;

	case RDMA_CM_EVENT_ROUTE_ERROR:
		log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
		sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
		smbd_disconnect_rdma_work(&sc->disconnect_work);
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		log_rdma_event(INFO, "connected event=%s\n", event_name);

		/*
		 * Here we work around an inconsistency between
		 * iWarp and other devices (at least rxe and irdma using RoCEv2)
		 */
		if (rdma_protocol_iwarp(id->device, id->port_num)) {
			/*
			 * iWarp devices report the peer's values
			 * with the perspective of the peer here.
			 * Tested with siw and irdma (in iwarp mode)
			 * We need to change to our perspective here,
			 * so we need to switch the values.
			 */
			peer_initiator_depth = event->param.conn.responder_resources;
			peer_responder_resources = event->param.conn.initiator_depth;
		} else {
			/*
			 * Non-iWarp devices report the peer's values
			 * already changed to our perspective here.
			 * Tested with rxe and irdma (in roce mode).
			 */
			peer_initiator_depth = event->param.conn.initiator_depth;
			peer_responder_resources = event->param.conn.responder_resources;
		}
		if (rdma_protocol_iwarp(id->device, id->port_num) &&
		    event->param.conn.private_data_len == 8) {
			/*
			 * Legacy clients with only iWarp MPA v1 support
			 * need a private blob in order to negotiate
			 * the IRD/ORD values.
			 */
			const __be32 *ird_ord_hdr = event->param.conn.private_data;
			u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
			u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);

			/*
			 * cifs.ko sends the legacy IRD/ORD negotiation
			 * even if iWarp MPA v2 was used.
			 *
			 * Here we check that the values match and only
			 * mark the connection as legacy if they don't match.
			 */
			if ((u32)event->param.conn.initiator_depth != ird32 ||
			    (u32)event->param.conn.responder_resources != ord32) {
				/*
				 * There are broken clients (old cifs.ko)
				 * using little endian and also
				 * struct rdma_conn_param only uses u8
				 * for initiator_depth and responder_resources,
				 * so we truncate the value to U8_MAX.
				 *
				 * smb_direct_accept_client() will then
				 * do the real negotiation in order to
				 * select the minimum between client and
				 * server.
				 */
				ird32 = min_t(u32, ird32, U8_MAX);
				ord32 = min_t(u32, ord32, U8_MAX);

				sc->rdma.legacy_iwarp = true;
				peer_initiator_depth = (u8)ird32;
				peer_responder_resources = (u8)ord32;
			}
		}

		/*
		 * negotiate the value by using the minimum
		 * between client and server if the client provided
		 * non 0 values.
		 */
		if (peer_initiator_depth != 0)
			sp->initiator_depth =
				min_t(u8, sp->initiator_depth,
				      peer_initiator_depth);
		if (peer_responder_resources != 0)
			sp->responder_resources =
				min_t(u8, sp->responder_resources,
				      peer_responder_resources);

		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
		sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
		wake_up(&sc->status_wait);
		break;

	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_REJECTED:
		log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
		sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
		smbd_disconnect_rdma_work(&sc->disconnect_work);
		break;

	case RDMA_CM_EVENT_DEVICE_REMOVAL:
	case RDMA_CM_EVENT_DISCONNECTED:
		/* This happens when we fail the negotiation */
		if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) {
			log_rdma_event(ERR, "event=%s during negotiation\n", event_name);
		}

		sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
		smbd_disconnect_rdma_work(&sc->disconnect_work);
		break;

	default:
		log_rdma_event(ERR, "unexpected event=%s status=%d\n",
			       event_name, event->status);
		break;
	}

	return 0;
}

/* Upcall from RDMA QP */
static void
smbd_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct smbdirect_socket *sc = context;

	log_rdma_event(ERR, "%s on device %s socket %p\n",
		       ib_event_msg(event->event), event->device->name, sc);

	switch (event->event) {
	case IB_EVENT_CQ_ERR:
	case IB_EVENT_QP_FATAL:
		smbd_disconnect_rdma_connection(sc);
		break;

	default:
		break;
	}
}

static inline void *smbdirect_send_io_payload(struct smbdirect_send_io *request)
{
	return (void *)request->packet;
}

static inline void *smbdirect_recv_io_payload(struct smbdirect_recv_io *response)
{
	return (void *)response->packet;
}

/* Called when an RDMA send is done */
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	int i;
	struct smbdirect_send_io *request =
		container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
	struct smbdirect_socket *sc = request->socket;
	int lcredits = 0;

	log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",
		      request, ib_wc_status_msg(wc->status));

	for (i = 0; i < request->num_sge; i++)
		ib_dma_unmap_single(sc->ib.dev,
				    request->sge[i].addr,
				    request->sge[i].length,
				    DMA_TO_DEVICE);
	mempool_free(request, sc->send_io.mem.pool);
	lcredits += 1;

	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",
				      ib_wc_status_msg(wc->status), wc->opcode);
		smbd_disconnect_rdma_connection(sc);
		return;
	}

	atomic_add(lcredits, &sc->send_io.lcredits.count);
	wake_up(&sc->send_io.lcredits.wait_queue);

	if (atomic_dec_and_test(&sc->send_io.pending.count))
		wake_up(&sc->send_io.pending.zero_wait_queue);

	wake_up(&sc->send_io.pending.dec_wait_queue);
}
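
/*
 * Send completion accounting (informative): each completed send WR in
 * send_done() above returns one local send credit (send_io.lcredits) and
 * drops send_io.pending; waiters on the lcredits and pending wait queues
 * are woken so that blocked senders can make progress.
 */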

static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp)
{
	log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n",
		       resp->min_version, resp->max_version,
		       resp->negotiated_version, resp->credits_requested,
		       resp->credits_granted, resp->status,
		       resp->max_readwrite_size, resp->preferred_send_size,
		       resp->max_receive_size, resp->max_fragmented_size);
}

/*
 * Process a negotiation response message, according to [MS-SMBD] 3.1.5.7
 * response, packet_length: the negotiation response message
 * return value: true if negotiation is a success, false if failed
 */
static bool process_negotiation_response(
		struct smbdirect_recv_io *response, int packet_length)
{
	struct smbdirect_socket *sc = response->socket;
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response);

	if (packet_length < sizeof(struct smbdirect_negotiate_resp)) {
		log_rdma_event(ERR,
			       "error: packet_length=%d\n", packet_length);
		return false;
	}

	if (le16_to_cpu(packet->negotiated_version) != SMBDIRECT_V1) {
		log_rdma_event(ERR, "error: negotiated_version=%x\n",
			       le16_to_cpu(packet->negotiated_version));
		return false;
	}

	if (packet->credits_requested == 0) {
		log_rdma_event(ERR, "error: credits_requested==0\n");
		return false;
	}
	sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested);
	sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);

	if (packet->credits_granted == 0) {
		log_rdma_event(ERR, "error: credits_granted==0\n");
		return false;
	}
	atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
	atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted));

	if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
		log_rdma_event(ERR, "error: preferred_send_size=%d\n",
			       le32_to_cpu(packet->preferred_send_size));
		return false;
	}
	sp->max_recv_size = le32_to_cpu(packet->preferred_send_size);

	if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
		log_rdma_event(ERR, "error: max_receive_size=%d\n",
			       le32_to_cpu(packet->max_receive_size));
		return false;
	}
	sp->max_send_size = min_t(u32, sp->max_send_size,
				  le32_to_cpu(packet->max_receive_size));

	if (le32_to_cpu(packet->max_fragmented_size) <
	    SMBD_MIN_FRAGMENTED_SIZE) {
		log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
			       le32_to_cpu(packet->max_fragmented_size));
		return false;
	}
	sp->max_fragmented_send_size =
		le32_to_cpu(packet->max_fragmented_size);

	sp->max_read_write_size = min_t(u32,
					le32_to_cpu(packet->max_readwrite_size),
					sp->max_frmr_depth * PAGE_SIZE);
	sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;

	sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
	return true;
}
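
/*
 * Informative example for process_negotiation_response() above (values
 * assumed, not taken from a real peer): if the peer requests 255 credits,
 * grants 255 credits and reports preferred_send_size=1364 and
 * max_receive_size=8192, then recv_io.credits.target becomes
 * min(255, recv_credit_max), max_recv_size is lowered to 1364 and
 * max_send_size is capped at min(max_send_size, 8192).
 */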

static void smbd_post_send_credits(struct work_struct *work)
{
	int rc;
	struct smbdirect_recv_io *response;
	struct smbdirect_socket *sc =
		container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
		return;
	}

	if (sc->recv_io.credits.target >
	    atomic_read(&sc->recv_io.credits.count)) {
		while (true) {
			response = get_receive_buffer(sc);
			if (!response)
				break;

			response->first_segment = false;
			rc = smbd_post_recv(sc, response);
			if (rc) {
				log_rdma_recv(ERR,
					      "post_recv failed rc=%d\n", rc);
				put_receive_buffer(sc, response);
				break;
			}

			atomic_inc(&sc->recv_io.posted.count);
		}
	}

	/* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
	if (atomic_read(&sc->recv_io.credits.count) <
	    sc->recv_io.credits.target - 1) {
		log_keep_alive(INFO, "schedule send of an empty message\n");
		queue_work(sc->workqueue, &sc->idle.immediate_work);
	}
}

/* Called from softirq, when recv is done */
static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbdirect_data_transfer *data_transfer;
	struct smbdirect_recv_io *response =
		container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
	struct smbdirect_socket *sc = response->socket;
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	u16 old_recv_credit_target;
	u32 data_offset = 0;
	u32 data_length = 0;
	u32 remaining_data_length = 0;
	bool negotiate_done = false;

	log_rdma_recv(INFO,
		      "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n",
		      response, sc->recv_io.expected,
		      ib_wc_status_msg(wc->status), wc->opcode,
		      wc->byte_len, wc->pkey_index);

	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			log_rdma_recv(ERR, "wc->status=%s opcode=%d\n",
				      ib_wc_status_msg(wc->status), wc->opcode);
		goto error;
	}

	ib_dma_sync_single_for_cpu(
		wc->qp->device,
		response->sge.addr,
		response->sge.length,
		DMA_FROM_DEVICE);

	/*
	 * Reset timer to the keepalive interval in
	 * order to trigger our next keepalive message.
	 */
	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
	mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
			 msecs_to_jiffies(sp->keepalive_interval_msec));

	switch (sc->recv_io.expected) {
	/* SMBD negotiation response */
	case SMBDIRECT_EXPECT_NEGOTIATE_REP:
		dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response));
		sc->recv_io.reassembly.full_packet_received = true;
		negotiate_done =
			process_negotiation_response(response, wc->byte_len);
		put_receive_buffer(sc, response);
		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING);
		if (!negotiate_done) {
			sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
			smbd_disconnect_rdma_connection(sc);
		} else {
			sc->status = SMBDIRECT_SOCKET_CONNECTED;
			wake_up(&sc->status_wait);
		}

		return;

	/* SMBD data transfer packet */
	case SMBDIRECT_EXPECT_DATA_TRANSFER:
		data_transfer = smbdirect_recv_io_payload(response);

		if (wc->byte_len <
		    offsetof(struct smbdirect_data_transfer, padding))
			goto error;

		remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
		data_offset = le32_to_cpu(data_transfer->data_offset);
		data_length = le32_to_cpu(data_transfer->data_length);
		if (wc->byte_len < data_offset ||
		    (u64)wc->byte_len < (u64)data_offset + data_length)
			goto error;

		if (remaining_data_length > sp->max_fragmented_recv_size ||
		    data_length > sp->max_fragmented_recv_size ||
		    (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)
			goto error;

		if (data_length) {
			if (sc->recv_io.reassembly.full_packet_received)
				response->first_segment = true;

			if (le32_to_cpu(data_transfer->remaining_data_length))
				sc->recv_io.reassembly.full_packet_received = false;
			else
				sc->recv_io.reassembly.full_packet_received = true;
		}

		atomic_dec(&sc->recv_io.posted.count);
		atomic_dec(&sc->recv_io.credits.count);
		old_recv_credit_target = sc->recv_io.credits.target;
		sc->recv_io.credits.target =
			le16_to_cpu(data_transfer->credits_requested);
		sc->recv_io.credits.target =
			min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
		sc->recv_io.credits.target =
			max_t(u16, sc->recv_io.credits.target, 1);
		if (le16_to_cpu(data_transfer->credits_granted)) {
			atomic_add(le16_to_cpu(data_transfer->credits_granted),
				   &sc->send_io.credits.count);
			/*
			 * We have new send credits granted from remote peer
			 * If any sender is waiting for credits, unblock it
			 */
			wake_up(&sc->send_io.credits.wait_queue);
		}

		log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n",
			     le16_to_cpu(data_transfer->flags),
			     le32_to_cpu(data_transfer->data_offset),
			     le32_to_cpu(data_transfer->data_length),
			     le32_to_cpu(data_transfer->remaining_data_length));

		/* Send an immediate response right away if requested */
		if (le16_to_cpu(data_transfer->flags) &
		    SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
			log_keep_alive(INFO, "schedule send of immediate response\n");
			queue_work(sc->workqueue, &sc->idle.immediate_work);
		}

		/*
		 * If this is a packet with a data payload, place the data in
		 * the reassembly queue and wake up the reading thread
		 */
		if (data_length) {
			if (sc->recv_io.credits.target > old_recv_credit_target)
				queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);

			enqueue_reassembly(sc, response, data_length);
			wake_up(&sc->recv_io.reassembly.wait_queue);
		} else
			put_receive_buffer(sc, response);

		return;

	case SMBDIRECT_EXPECT_NEGOTIATE_REQ:
		/* Only server... */
		break;
	}

	/*
	 * This is an internal error!
	 */
	log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected);
	WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
error:
	put_receive_buffer(sc, response);
	smbd_disconnect_rdma_connection(sc);
}

static struct rdma_cm_id *smbd_create_id(
		struct smbdirect_socket *sc,
		struct sockaddr *dstaddr, int port)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	struct rdma_cm_id *id;
	int rc;
	__be16 *sport;

	id = rdma_create_id(&init_net, smbd_conn_upcall, sc,
			    RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
		return id;
	}

	if (dstaddr->sa_family == AF_INET6)
		sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
	else
		sport = &((struct sockaddr_in *)dstaddr)->sin_port;

	*sport = htons(port);

	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED);
	sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING;
	rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
			       sp->resolve_addr_timeout_msec);
	if (rc) {
		log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
		goto out;
	}
	rc = wait_event_interruptible_timeout(
		sc->status_wait,
		sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
		msecs_to_jiffies(sp->resolve_addr_timeout_msec));
	/* e.g. if interrupted returns -ERESTARTSYS */
	if (rc < 0) {
		log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
		goto out;
	}
	if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) {
		rc = -ETIMEDOUT;
		log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
		goto out;
	}
	if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) {
		rc = -EHOSTUNREACH;
		log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
		goto out;
	}

	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED);
	sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING;
	rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec);
	if (rc) {
		log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
		goto out;
	}
	rc = wait_event_interruptible_timeout(
		sc->status_wait,
		sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING,
		msecs_to_jiffies(sp->resolve_route_timeout_msec));
	/* e.g. if interrupted returns -ERESTARTSYS */
	if (rc < 0) {
		log_rdma_event(ERR, "rdma_resolve_route timeout rc: %i\n", rc);
		goto out;
	}
	if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) {
		rc = -ETIMEDOUT;
		log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
		goto out;
	}
	if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) {
		rc = -ENETUNREACH;
		log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
		goto out;
	}

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Test if FRWR (Fast Registration Work Requests) is supported on the device
 * This implementation requires FRWR on RDMA read/write
 * return value: true if it is supported
 */
static bool frwr_is_supported(struct ib_device_attr *attrs)
{
	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
		return false;
	if (attrs->max_fast_reg_page_list_len == 0)
		return false;
	return true;
}

static int smbd_ia_open(
		struct smbdirect_socket *sc,
		struct sockaddr *dstaddr, int port)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	int rc;

	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
	sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED;

	sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port);
	if (IS_ERR(sc->rdma.cm_id)) {
		rc = PTR_ERR(sc->rdma.cm_id);
		goto out1;
	}
	sc->ib.dev = sc->rdma.cm_id->device;

	if (!frwr_is_supported(&sc->ib.dev->attrs)) {
		log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n");
		log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
			       sc->ib.dev->attrs.device_cap_flags,
			       sc->ib.dev->attrs.max_fast_reg_page_list_len);
		rc = -EPROTONOSUPPORT;
		goto out2;
	}
	sp->max_frmr_depth = min_t(u32,
				   sp->max_frmr_depth,
				   sc->ib.dev->attrs.max_fast_reg_page_list_len);
	sc->mr_io.type = IB_MR_TYPE_MEM_REG;
	if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
		sc->mr_io.type = IB_MR_TYPE_SG_GAPS;

	return 0;

out2:
	rdma_destroy_id(sc->rdma.cm_id);
	sc->rdma.cm_id = NULL;

out1:
	return rc;
}

/*
 * Send a negotiation request message to the peer
 * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
 * After negotiation, the transport is connected and ready for
 * carrying upper layer SMB payload
 */
static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	struct ib_send_wr send_wr;
	int rc = -ENOMEM;
	struct smbdirect_send_io *request;
	struct smbdirect_negotiate_req *packet;

	request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
	if (!request)
		return rc;

	request->socket = sc;

	packet = smbdirect_send_io_payload(request);
	packet->min_version = cpu_to_le16(SMBDIRECT_V1);
	packet->max_version = cpu_to_le16(SMBDIRECT_V1);
	packet->reserved = 0;
	packet->credits_requested = cpu_to_le16(sp->send_credit_target);
	packet->preferred_send_size = cpu_to_le32(sp->max_send_size);
	packet->max_receive_size = cpu_to_le32(sp->max_recv_size);
	packet->max_fragmented_size =
		cpu_to_le32(sp->max_fragmented_recv_size);

	request->num_sge = 1;
	request->sge[0].addr = ib_dma_map_single(
		sc->ib.dev, (void *)packet,
		sizeof(*packet), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
		rc = -EIO;
		goto dma_mapping_failed;
	}

	request->sge[0].length = sizeof(*packet);
	request->sge[0].lkey = sc->ib.pd->local_dma_lkey;

	ib_dma_sync_single_for_device(
		sc->ib.dev, request->sge[0].addr,
		request->sge[0].length, DMA_TO_DEVICE);

	request->cqe.done = send_done;

	send_wr.next = NULL;
	send_wr.wr_cqe = &request->cqe;
	send_wr.sg_list = request->sge;
	send_wr.num_sge = request->num_sge;
	send_wr.opcode = IB_WR_SEND;
	send_wr.send_flags = IB_SEND_SIGNALED;

	log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n",
		      request->sge[0].addr,
		      request->sge[0].length, request->sge[0].lkey);

	atomic_inc(&sc->send_io.pending.count);
	rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
	if (!rc)
		return 0;

	/* if we reach here, post send failed */
	log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
	atomic_dec(&sc->send_io.pending.count);
	ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr,
			    request->sge[0].length, DMA_TO_DEVICE);

	smbd_disconnect_rdma_connection(sc);

dma_mapping_failed:
	mempool_free(request, sc->send_io.mem.pool);
	return rc;
}

/*
 * Extend the credits to remote peer
 * This implements [MS-SMBD] 3.1.5.9
 * The idea is that we should extend credits to remote peer as quickly as
 * it's allowed, to maintain data flow. We allocate as much receive
 * buffer as possible, and extend the receive credits to remote peer
 * return value: the new credits being granted.
 */
static int manage_credits_prior_sending(struct smbdirect_socket *sc)
{
	int new_credits;

	if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
		return 0;

	new_credits = atomic_read(&sc->recv_io.posted.count);
	if (new_credits == 0)
		return 0;

	new_credits -= atomic_read(&sc->recv_io.credits.count);
	if (new_credits <= 0)
		return 0;

	return new_credits;
}
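
/*
 * Informative example (assumed values): with recv_io.credits.target == 10,
 * 4 receive credits currently outstanding and 16 receives posted,
 * manage_credits_prior_sending() above returns 16 - 4 = 12 new credits to
 * grant; if the outstanding count already meets the target, or nothing is
 * posted, it returns 0.
 */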

/*
 * Check if we need to send a KEEP_ALIVE message
 * The idle connection timer triggers a KEEP_ALIVE message when it expires.
 * SMBDIRECT_FLAG_RESPONSE_REQUESTED is set in the message flag to have the
 * peer send back a response.
 * return value:
 * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set
 * 0: otherwise
 */
static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;

	if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
		sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
		/*
		 * Now use the keepalive timeout (instead of keepalive interval)
		 * in order to wait for a response
		 */
		mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
				 msecs_to_jiffies(sp->keepalive_timeout_msec));
		return 1;
	}
	return 0;
}

/* Post the send request */
static int smbd_post_send(struct smbdirect_socket *sc,
			  struct smbdirect_send_io *request)
{
	struct ib_send_wr send_wr;
	int rc, i;

	for (i = 0; i < request->num_sge; i++) {
		log_rdma_send(INFO,
			      "rdma_request sge[%d] addr=0x%llx length=%u\n",
			      i, request->sge[i].addr, request->sge[i].length);
		ib_dma_sync_single_for_device(
			sc->ib.dev,
			request->sge[i].addr,
			request->sge[i].length,
			DMA_TO_DEVICE);
	}

	request->cqe.done = send_done;

	send_wr.next = NULL;
	send_wr.wr_cqe = &request->cqe;
	send_wr.sg_list = request->sge;
	send_wr.num_sge = request->num_sge;
	send_wr.opcode = IB_WR_SEND;
	send_wr.send_flags = IB_SEND_SIGNALED;

	rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
	if (rc) {
		log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
		smbd_disconnect_rdma_connection(sc);
		rc = -EAGAIN;
	}

	return rc;
}

static int smbd_post_send_iter(struct smbdirect_socket *sc,
			       struct iov_iter *iter,
			       int *_remaining_data_length)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	int i, rc;
	int header_length;
	int data_length;
	struct smbdirect_send_io *request;
	struct smbdirect_data_transfer *packet;
	int new_credits = 0;

wait_lcredit:
	/* Wait for local send credits */
	rc = wait_event_interruptible(sc->send_io.lcredits.wait_queue,
		atomic_read(&sc->send_io.lcredits.count) > 0 ||
		sc->status != SMBDIRECT_SOCKET_CONNECTED);
	if (rc)
		goto err_wait_lcredit;

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
		log_outgoing(ERR, "disconnected not sending on wait_credit\n");
		rc = -EAGAIN;
		goto err_wait_lcredit;
	}
	if (unlikely(atomic_dec_return(&sc->send_io.lcredits.count) < 0)) {
		atomic_inc(&sc->send_io.lcredits.count);
		goto wait_lcredit;
	}
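
	/*
	 * Note (informative): two separate counters gate a send.
	 * send_io.lcredits tracks our own local send slots (initialised to
	 * send_credit_target), while send_io.credits tracks SMBD send
	 * credits granted by the remote peer. Both must be available
	 * before the send below is posted.
	 */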

wait_credit:
	/* Wait for send credits. A SMBD packet needs one credit */
	rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
		atomic_read(&sc->send_io.credits.count) > 0 ||
		sc->status != SMBDIRECT_SOCKET_CONNECTED);
	if (rc)
		goto err_wait_credit;

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
		log_outgoing(ERR, "disconnected not sending on wait_credit\n");
		rc = -EAGAIN;
		goto err_wait_credit;
	}
	if (unlikely(atomic_dec_return(&sc->send_io.credits.count) < 0)) {
		atomic_inc(&sc->send_io.credits.count);
		goto wait_credit;
	}

	request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
	if (!request) {
		rc = -ENOMEM;
		goto err_alloc;
	}

	request->socket = sc;
	memset(request->sge, 0, sizeof(request->sge));

	/* Map the packet to DMA */
	header_length = sizeof(struct smbdirect_data_transfer);
	/* If this is a packet without payload, don't send padding */
	if (!iter)
		header_length = offsetof(struct smbdirect_data_transfer, padding);

	packet = smbdirect_send_io_payload(request);
	request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
						 (void *)packet,
						 header_length,
						 DMA_TO_DEVICE);
	if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
		rc = -EIO;
		goto err_dma;
	}

	request->sge[0].length = header_length;
	request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
	request->num_sge = 1;

	/* Fill in the data payload to find out how much data we can add */
	if (iter) {
		struct smb_extract_to_rdma extract = {
			.nr_sge = request->num_sge,
			.max_sge = SMBDIRECT_SEND_IO_MAX_SGE,
			.sge = request->sge,
			.device = sc->ib.dev,
			.local_dma_lkey = sc->ib.pd->local_dma_lkey,
			.direction = DMA_TO_DEVICE,
		};
		size_t payload_len = umin(*_remaining_data_length,
					  sp->max_send_size - sizeof(*packet));

		rc = smb_extract_iter_to_rdma(iter, payload_len,
					      &extract);
		if (rc < 0)
			goto err_dma;
		data_length = rc;
		request->num_sge = extract.nr_sge;
		*_remaining_data_length -= data_length;
	} else {
		data_length = 0;
	}

	/* Fill in the packet header */
	packet->credits_requested = cpu_to_le16(sp->send_credit_target);

	new_credits = manage_credits_prior_sending(sc);
	atomic_add(new_credits, &sc->recv_io.credits.count);
	packet->credits_granted = cpu_to_le16(new_credits);

	packet->flags = 0;
	if (manage_keep_alive_before_sending(sc))
		packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);

	packet->reserved = 0;
	if (!data_length)
		packet->data_offset = 0;
	else
		packet->data_offset = cpu_to_le32(24);
	packet->data_length = cpu_to_le32(data_length);
	packet->remaining_data_length = cpu_to_le32(*_remaining_data_length);
	packet->padding = 0;

	log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
		     le16_to_cpu(packet->credits_requested),
		     le16_to_cpu(packet->credits_granted),
		     le32_to_cpu(packet->data_offset),
		     le32_to_cpu(packet->data_length),
		     le32_to_cpu(packet->remaining_data_length));

	/*
	 * Now that we got a local and a remote credit
	 * we add us as pending
	 */
	atomic_inc(&sc->send_io.pending.count);

	rc = smbd_post_send(sc, request);
	if (!rc)
		return 0;

	if (atomic_dec_and_test(&sc->send_io.pending.count))
		wake_up(&sc->send_io.pending.zero_wait_queue);

	wake_up(&sc->send_io.pending.dec_wait_queue);

err_dma:
	for (i = 0; i < request->num_sge; i++)
		if (request->sge[i].addr)
			ib_dma_unmap_single(sc->ib.dev,
					    request->sge[i].addr,
					    request->sge[i].length,
					    DMA_TO_DEVICE);
	mempool_free(request, sc->send_io.mem.pool);

	/* roll back the granted receive credits */
	atomic_sub(new_credits, &sc->recv_io.credits.count);

err_alloc:
	atomic_inc(&sc->send_io.credits.count);
	wake_up(&sc->send_io.credits.wait_queue);

err_wait_credit:
	atomic_inc(&sc->send_io.lcredits.count);
	wake_up(&sc->send_io.lcredits.wait_queue);

err_wait_lcredit:
	return rc;
}

/*
 * Send an empty message
 * An empty message is used to extend credits to the peer for keepalive
 * while there is no upper-layer payload to send at the time
 */
static int smbd_post_send_empty(struct smbdirect_socket *sc)
{
	int remaining_data_length = 0;

	sc->statistics.send_empty++;
	return smbd_post_send_iter(sc, NULL, &remaining_data_length);
}

static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
				    struct iov_iter *iter,
				    int *_remaining_data_length)
{
	int rc = 0;

	/*
	 * smbd_post_send_iter() respects the
	 * negotiated max_send_size, so we need to
	 * loop until the full iter is posted
	 */

	while (iov_iter_count(iter) > 0) {
		rc = smbd_post_send_iter(sc, iter, _remaining_data_length);
		if (rc < 0)
			break;
	}

	return rc;
}

/*
 * Post a receive request to the transport
 * The remote peer can only send data when a receive request is posted
 * The interaction is controlled by send/receive credit system
 */
static int smbd_post_recv(
		struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	struct ib_recv_wr recv_wr;
	int rc = -EIO;

	response->sge.addr = ib_dma_map_single(
		sc->ib.dev, response->packet,
		sp->max_recv_size, DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(sc->ib.dev, response->sge.addr))
		return rc;

	response->sge.length = sp->max_recv_size;
	response->sge.lkey = sc->ib.pd->local_dma_lkey;

	response->cqe.done = recv_done;

	recv_wr.wr_cqe = &response->cqe;
	recv_wr.next = NULL;
	recv_wr.sg_list = &response->sge;
	recv_wr.num_sge = 1;

	rc = ib_post_recv(sc->ib.qp, &recv_wr, NULL);
	if (rc) {
		ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
				    response->sge.length, DMA_FROM_DEVICE);
		response->sge.length = 0;
		smbd_disconnect_rdma_connection(sc);
		log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
	}

	return rc;
}
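
/*
 * Note (informative): the DMA mapping set up in smbd_post_recv() above
 * stays in place until the buffer is handed back via put_receive_buffer(),
 * which unmaps it; the error path above unmaps immediately and clears
 * sge.length so a later put_receive_buffer() does not unmap twice.
 */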

/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
static int smbd_negotiate(struct smbdirect_socket *sc)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	int rc;
	struct smbdirect_recv_io *response = get_receive_buffer(sc);

	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED);
	sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;

	sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP;
	rc = smbd_post_recv(sc, response);
	log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
		       rc, response->sge.addr,
		       response->sge.length, response->sge.lkey);
	if (rc) {
		put_receive_buffer(sc, response);
		return rc;
	}

	rc = smbd_post_send_negotiate_req(sc);
	if (rc)
		return rc;

	rc = wait_event_interruptible_timeout(
		sc->status_wait,
		sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
		msecs_to_jiffies(sp->negotiate_timeout_msec));
	log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc);

	if (sc->status == SMBDIRECT_SOCKET_CONNECTED)
		return 0;

	if (rc == 0)
		rc = -ETIMEDOUT;
	else if (rc == -ERESTARTSYS)
		rc = -EINTR;
	else
		rc = -ENOTCONN;

	return rc;
}

/*
 * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
 * This is a queue for reassembling upper layer payload and presenting it to
 * the upper layer. All the incoming payload goes to the reassembly queue,
 * regardless of whether reassembly is required. The upper layer code reads
 * from the queue for all incoming payloads.
 * Put a received packet to the reassembly queue
 * response: the packet received
 * data_length: the size of payload in this packet
 */
static void enqueue_reassembly(
		struct smbdirect_socket *sc,
		struct smbdirect_recv_io *response,
		int data_length)
{
	unsigned long flags;

	spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
	list_add_tail(&response->list, &sc->recv_io.reassembly.list);
	sc->recv_io.reassembly.queue_length++;
	/*
	 * Make sure reassembly_data_length is updated after list and
	 * reassembly_queue_length are updated. On the dequeue side
	 * reassembly_data_length is checked without a lock to determine
	 * if reassembly_queue_length and list is up to date
	 */
	virt_wmb();
	sc->recv_io.reassembly.data_length += data_length;
	spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
	sc->statistics.enqueue_reassembly_queue++;
}

/*
 * Get the first entry at the front of reassembly queue
 * Caller is responsible for locking
 * return value: the first entry if any, NULL if queue is empty
 */
static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc)
{
	struct smbdirect_recv_io *ret = NULL;

	if (!list_empty(&sc->recv_io.reassembly.list)) {
		ret = list_first_entry(
			&sc->recv_io.reassembly.list,
			struct smbdirect_recv_io, list);
	}
	return ret;
}

/*
 * Get a receive buffer
 * For each remote send, we need to post a receive. The receive buffers are
 * pre-allocated in advance.
 * return value: the receive buffer, NULL if none is available
 */
static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc)
{
	struct smbdirect_recv_io *ret = NULL;
	unsigned long flags;

	spin_lock_irqsave(&sc->recv_io.free.lock, flags);
	if (!list_empty(&sc->recv_io.free.list)) {
		ret = list_first_entry(
			&sc->recv_io.free.list,
			struct smbdirect_recv_io, list);
		list_del(&ret->list);
		sc->statistics.get_receive_buffer++;
	}
	spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);

	return ret;
}
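
/*
 * Receive buffer life cycle (informative): buffers start on
 * recv_io.free.list, are taken via get_receive_buffer() and posted with
 * smbd_post_recv(); once consumed (or on error) they are handed back
 * through put_receive_buffer() below, which also schedules refill_work to
 * post new receives and extend credits to the peer.
 */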

/*
 * Return a receive buffer
 * Upon returning of a receive buffer, we can post new receive and extend
 * more receive credits to remote peer. This is done immediately after a
 * receive buffer is returned.
 */
static void put_receive_buffer(
		struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
{
	unsigned long flags;

	if (likely(response->sge.length != 0)) {
		ib_dma_unmap_single(sc->ib.dev,
				    response->sge.addr,
				    response->sge.length,
				    DMA_FROM_DEVICE);
		response->sge.length = 0;
	}

	spin_lock_irqsave(&sc->recv_io.free.lock, flags);
	list_add_tail(&response->list, &sc->recv_io.free.list);
	sc->statistics.put_receive_buffer++;
	spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);

	queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
}

/* Preallocate all receive buffers on transport establishment */
static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf)
{
	struct smbdirect_recv_io *response;
	int i;

	for (i = 0; i < num_buf; i++) {
		response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL);
		if (!response)
			goto allocate_failed;

		response->socket = sc;
		response->sge.length = 0;
		list_add_tail(&response->list, &sc->recv_io.free.list);
	}

	return 0;

allocate_failed:
	while (!list_empty(&sc->recv_io.free.list)) {
		response = list_first_entry(
			&sc->recv_io.free.list,
			struct smbdirect_recv_io, list);
		list_del(&response->list);

		mempool_free(response, sc->recv_io.mem.pool);
	}
	return -ENOMEM;
}

static void destroy_receive_buffers(struct smbdirect_socket *sc)
{
	struct smbdirect_recv_io *response;

	while ((response = get_receive_buffer(sc)))
		mempool_free(response, sc->recv_io.mem.pool);
}

static void send_immediate_empty_message(struct work_struct *work)
{
	struct smbdirect_socket *sc =
		container_of(work, struct smbdirect_socket, idle.immediate_work);

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
		return;

	log_keep_alive(INFO, "send an empty message\n");
	smbd_post_send_empty(sc);
}

/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
{
	struct smbdirect_socket *sc =
		container_of(work, struct smbdirect_socket, idle.timer_work.work);
	struct smbdirect_socket_parameters *sp = &sc->parameters;

	if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
		log_keep_alive(ERR,
			       "error status sc->idle.keepalive=%d\n",
			       sc->idle.keepalive);
		smbd_disconnect_rdma_connection(sc);
		return;
	}

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
		return;

	/*
	 * Now use the keepalive timeout (instead of keepalive interval)
	 * in order to wait for a response
	 */
	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
	mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
			 msecs_to_jiffies(sp->keepalive_timeout_msec));
	log_keep_alive(INFO, "schedule send of empty idle message\n");
	queue_work(sc->workqueue, &sc->idle.immediate_work);
}
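
/*
 * Keepalive state machine (informative): idle_connection_timer() above
 * moves idle.keepalive from NONE to PENDING and schedules an empty
 * message; manage_keep_alive_before_sending() turns PENDING into SENT and
 * sets SMBDIRECT_FLAG_RESPONSE_REQUESTED; any received message resets the
 * state to NONE in recv_done(). If the timer fires while the state is not
 * NONE, no message arrived within the keepalive timeout and the
 * connection is torn down.
 */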

/*
 * Destroy the transport and related RDMA and memory resources
 * Need to go through all the pending counters and make sure no one is using
 * the transport while it is destroyed
 */
void smbd_destroy(struct TCP_Server_Info *server)
{
	struct smbd_connection *info = server->smbd_conn;
	struct smbdirect_socket *sc;
	struct smbdirect_recv_io *response;
	unsigned long flags;

	if (!info) {
		log_rdma_event(INFO, "rdma session already destroyed\n");
		return;
	}
	sc = &info->socket;

	log_rdma_event(INFO, "cancelling and disabling disconnect_work\n");
	disable_work_sync(&sc->disconnect_work);

	log_rdma_event(INFO, "destroying rdma session\n");
	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
		smbd_disconnect_rdma_work(&sc->disconnect_work);
	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) {
		log_rdma_event(INFO, "wait for transport being disconnected\n");
		wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
		log_rdma_event(INFO, "waited for transport being disconnected\n");
	}

	/*
	 * Wake up all waiters in all wait queues
	 * in order to notice the broken connection.
	 *
	 * Most likely this was already called via
	 * smbd_disconnect_rdma_work(), but call it again...
	 */
	smbd_disconnect_wake_up_all(sc);

	log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n");
	disable_work_sync(&sc->recv_io.posted.refill_work);

	log_rdma_event(INFO, "destroying qp\n");
	ib_drain_qp(sc->ib.qp);
	rdma_destroy_qp(sc->rdma.cm_id);
	sc->ib.qp = NULL;

	log_rdma_event(INFO, "cancelling idle timer\n");
	disable_delayed_work_sync(&sc->idle.timer_work);
	log_rdma_event(INFO, "cancelling send immediate work\n");
	disable_work_sync(&sc->idle.immediate_work);

	/* It's not possible for upper layer to get to reassembly */
	log_rdma_event(INFO, "drain the reassembly queue\n");
	do {
		spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
		response = _get_first_reassembly(sc);
		if (response) {
			list_del(&response->list);
			spin_unlock_irqrestore(
				&sc->recv_io.reassembly.lock, flags);
			put_receive_buffer(sc, response);
		} else
			spin_unlock_irqrestore(
				&sc->recv_io.reassembly.lock, flags);
	} while (response);
	sc->recv_io.reassembly.data_length = 0;

	log_rdma_event(INFO, "free receive buffers\n");
	destroy_receive_buffers(sc);

	log_rdma_event(INFO, "freeing mr list\n");
	destroy_mr_list(sc);

	ib_free_cq(sc->ib.send_cq);
	ib_free_cq(sc->ib.recv_cq);
	ib_dealloc_pd(sc->ib.pd);
	rdma_destroy_id(sc->rdma.cm_id);

	/* free mempools */
	mempool_destroy(sc->send_io.mem.pool);
	kmem_cache_destroy(sc->send_io.mem.cache);

	mempool_destroy(sc->recv_io.mem.pool);
	kmem_cache_destroy(sc->recv_io.mem.cache);

	sc->status = SMBDIRECT_SOCKET_DESTROYED;

	destroy_workqueue(sc->workqueue);
	log_rdma_event(INFO, "rdma session destroyed\n");
	kfree(info);
	server->smbd_conn = NULL;
}
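
/*
 * Teardown order in smbd_destroy() above (informative): disable
 * disconnect_work, ensure the socket reaches DISCONNECTED, wake all
 * waiters, stop refill_work, drain and destroy the QP, stop the idle
 * timer and immediate work, flush the reassembly queue, then release
 * receive buffers, the MR list, CQs, PD, cm_id, the mempools and finally
 * the workqueue.
 */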

/*
 * Reconnect this SMBD connection, called from upper layer
 * return value: 0 on success, or actual error code
 */
int smbd_reconnect(struct TCP_Server_Info *server)
{
	log_rdma_event(INFO, "reconnecting rdma session\n");

	if (!server->smbd_conn) {
		log_rdma_event(INFO, "rdma session already destroyed\n");
		goto create_conn;
	}

	/*
	 * This is possible if transport is disconnected and we haven't received
	 * notification from RDMA, but upper layer has detected timeout
	 */
	if (server->smbd_conn->socket.status == SMBDIRECT_SOCKET_CONNECTED) {
		log_rdma_event(INFO, "disconnecting transport\n");
		smbd_destroy(server);
	}

create_conn:
	log_rdma_event(INFO, "creating rdma session\n");
	server->smbd_conn = smbd_get_connection(
		server, (struct sockaddr *) &server->dstaddr);

	if (server->smbd_conn) {
		cifs_dbg(VFS, "RDMA transport re-established\n");
		trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
		return 0;
	}
	trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
	return -ENOENT;
}

static void destroy_caches(struct smbdirect_socket *sc)
{
	destroy_receive_buffers(sc);
	mempool_destroy(sc->recv_io.mem.pool);
	kmem_cache_destroy(sc->recv_io.mem.cache);
	mempool_destroy(sc->send_io.mem.pool);
	kmem_cache_destroy(sc->send_io.mem.cache);
}

#define MAX_NAME_LEN	80
static int allocate_caches(struct smbdirect_socket *sc)
{
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	char name[MAX_NAME_LEN];
	int rc;

	if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer)))
		return -ENOMEM;

	scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc);
	sc->send_io.mem.cache =
		kmem_cache_create(
			name,
			sizeof(struct smbdirect_send_io) +
				sizeof(struct smbdirect_data_transfer),
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (!sc->send_io.mem.cache)
		return -ENOMEM;

	sc->send_io.mem.pool =
		mempool_create(sp->send_credit_target, mempool_alloc_slab,
			       mempool_free_slab, sc->send_io.mem.cache);
	if (!sc->send_io.mem.pool)
		goto out1;

	scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc);

	struct kmem_cache_args response_args = {
		.align = __alignof__(struct smbdirect_recv_io),
		.useroffset = (offsetof(struct smbdirect_recv_io, packet) +
			       sizeof(struct smbdirect_data_transfer)),
		.usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer),
	};
	sc->recv_io.mem.cache =
		kmem_cache_create(name,
				  sizeof(struct smbdirect_recv_io) + sp->max_recv_size,
				  &response_args, SLAB_HWCACHE_ALIGN);
	if (!sc->recv_io.mem.cache)
		goto out2;

	sc->recv_io.mem.pool =
		mempool_create(sp->recv_credit_max, mempool_alloc_slab,
			       mempool_free_slab, sc->recv_io.mem.cache);
	if (!sc->recv_io.mem.pool)
		goto out3;

	rc = allocate_receive_buffers(sc, sp->recv_credit_max);
	if (rc) {
		log_rdma_event(ERR, "failed to allocate receive buffers\n");
		goto out4;
	}

	return 0;

out4:
	mempool_destroy(sc->recv_io.mem.pool);
out3:
	kmem_cache_destroy(sc->recv_io.mem.cache);
out2:
	mempool_destroy(sc->send_io.mem.pool);
out1:
	kmem_cache_destroy(sc->send_io.mem.cache);
	return -ENOMEM;
}

/* Create a SMBD connection, called by upper layer */
static struct smbd_connection *_smbd_get_connection(
		struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
{
	int rc;
	struct smbd_connection *info;
	struct smbdirect_socket *sc;
	struct smbdirect_socket_parameters *sp;
	struct rdma_conn_param conn_param;
	struct ib_qp_cap qp_cap;
	struct ib_qp_init_attr qp_attr;
	struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
	struct ib_port_immutable port_immutable;
	__be32 ird_ord_hdr[2];
	char wq_name[80];
	struct workqueue_struct *workqueue;

	info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
	if (!info)
		return NULL;
	sc = &info->socket;
		  ARRAY_SIZE(wq_name), "smbd_%p", sc);
	workqueue = create_workqueue(wq_name);
	if (!workqueue)
		goto create_wq_failed;
	smbdirect_socket_init(sc);
	sc->workqueue = workqueue;
	sp = &sc->parameters;

	INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work);

	sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000;
	sp->initiator_depth = 1;
	sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;
	sp->recv_credit_max = smbd_receive_credit_max;
	sp->send_credit_target = smbd_send_credit_target;
	sp->max_send_size = smbd_max_send_size;
	sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
	sp->max_recv_size = smbd_max_receive_size;
	sp->max_frmr_depth = smbd_max_frmr_depth;
	sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
	sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;

	rc = smbd_ia_open(sc, dstaddr, port);
	if (rc) {
		log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
		goto create_id_failed;
	}

	if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe ||
	    sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
		log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
			       sp->send_credit_target,
			       sc->ib.dev->attrs.max_cqe,
			       sc->ib.dev->attrs.max_qp_wr);
		goto config_failed;
	}

	if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe ||
	    sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) {
		log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
			       sp->recv_credit_max,
			       sc->ib.dev->attrs.max_cqe,
			       sc->ib.dev->attrs.max_qp_wr);
		goto config_failed;
	}

	if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE ||
	    sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
		log_rdma_event(ERR,
			       "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
			       IB_DEVICE_NAME_MAX,
			       sc->ib.dev->name,
			       sc->ib.dev->attrs.max_send_sge,
			       sc->ib.dev->attrs.max_recv_sge);
		goto config_failed;
	}

	sp->responder_resources =
		min_t(u8, sp->responder_resources,
		      sc->ib.dev->attrs.max_qp_rd_atom);
	log_rdma_mr(INFO, "responder_resources=%d\n",
		    sp->responder_resources);

	/*
	 * We allocate sp->responder_resources * 2 MRs
	 * and each MR needs WRs for REG and INV, so
	 * we use '* 4'.
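	 *
	 * As a rough worked example (assuming the compiled-in defaults and a
	 * device that does not force responder_resources lower): with
	 * send_credit_target = 255 and responder_resources = 32, max_send_wr
	 * below works out to 255 + 32 * 4 + 1 = 384.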
1869 * 1870 * +1 for ib_drain_qp() 1871 */ 1872 memset(&qp_cap, 0, sizeof(qp_cap)); 1873 qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1; 1874 qp_cap.max_recv_wr = sp->recv_credit_max + 1; 1875 qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; 1876 qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; 1877 1878 sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); 1879 if (IS_ERR(sc->ib.pd)) { 1880 rc = PTR_ERR(sc->ib.pd); 1881 sc->ib.pd = NULL; 1882 log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); 1883 goto alloc_pd_failed; 1884 } 1885 1886 sc->ib.send_cq = 1887 ib_alloc_cq_any(sc->ib.dev, sc, 1888 qp_cap.max_send_wr, IB_POLL_SOFTIRQ); 1889 if (IS_ERR(sc->ib.send_cq)) { 1890 sc->ib.send_cq = NULL; 1891 goto alloc_cq_failed; 1892 } 1893 1894 sc->ib.recv_cq = 1895 ib_alloc_cq_any(sc->ib.dev, sc, 1896 qp_cap.max_recv_wr, IB_POLL_SOFTIRQ); 1897 if (IS_ERR(sc->ib.recv_cq)) { 1898 sc->ib.recv_cq = NULL; 1899 goto alloc_cq_failed; 1900 } 1901 1902 memset(&qp_attr, 0, sizeof(qp_attr)); 1903 qp_attr.event_handler = smbd_qp_async_error_upcall; 1904 qp_attr.qp_context = sc; 1905 qp_attr.cap = qp_cap; 1906 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 1907 qp_attr.qp_type = IB_QPT_RC; 1908 qp_attr.send_cq = sc->ib.send_cq; 1909 qp_attr.recv_cq = sc->ib.recv_cq; 1910 qp_attr.port_num = ~0; 1911 1912 rc = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr); 1913 if (rc) { 1914 log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc); 1915 goto create_qp_failed; 1916 } 1917 sc->ib.qp = sc->rdma.cm_id->qp; 1918 1919 memset(&conn_param, 0, sizeof(conn_param)); 1920 conn_param.initiator_depth = sp->initiator_depth; 1921 conn_param.responder_resources = sp->responder_resources; 1922 1923 /* Need to send IRD/ORD in private data for iWARP */ 1924 sc->ib.dev->ops.get_port_immutable( 1925 sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable); 1926 if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { 1927 ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); 1928 ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); 1929 conn_param.private_data = ird_ord_hdr; 1930 conn_param.private_data_len = sizeof(ird_ord_hdr); 1931 } else { 1932 conn_param.private_data = NULL; 1933 conn_param.private_data_len = 0; 1934 } 1935 1936 conn_param.retry_count = SMBD_CM_RETRY; 1937 conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY; 1938 conn_param.flow_control = 0; 1939 1940 log_rdma_event(INFO, "connecting to IP %pI4 port %d\n", 1941 &addr_in->sin_addr, port); 1942 1943 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); 1944 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; 1945 rc = rdma_connect(sc->rdma.cm_id, &conn_param); 1946 if (rc) { 1947 log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc); 1948 goto rdma_connect_failed; 1949 } 1950 1951 wait_event_interruptible_timeout( 1952 sc->status_wait, 1953 sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING, 1954 msecs_to_jiffies(sp->rdma_connect_timeout_msec)); 1955 1956 if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) { 1957 log_rdma_event(ERR, "rdma_connect failed port=%d\n", port); 1958 goto rdma_connect_failed; 1959 } 1960 1961 log_rdma_event(INFO, "rdma_connect connected\n"); 1962 1963 rc = allocate_caches(sc); 1964 if (rc) { 1965 log_rdma_event(ERR, "cache allocation failed\n"); 1966 goto allocate_cache_failed; 1967 } 1968 1969 INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message); 1970 INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer); 1971 /* 1972 * start with the negotiate timeout and 
SMBDIRECT_KEEPALIVE_PENDING
	 * so that the timer will cause a disconnect.
	 */
	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
	mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
			 msecs_to_jiffies(sp->negotiate_timeout_msec));

	INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits);

	rc = smbd_negotiate(sc);
	if (rc) {
		log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
		goto negotiation_failed;
	}

	rc = allocate_mr_list(sc);
	if (rc) {
		log_rdma_mr(ERR, "memory registration allocation failed\n");
		goto allocate_mr_failed;
	}

	return info;

allocate_mr_failed:
	/* At this point, we need a full transport shutdown */
	server->smbd_conn = info;
	smbd_destroy(server);
	return NULL;

negotiation_failed:
	disable_delayed_work_sync(&sc->idle.timer_work);
	destroy_caches(sc);
	sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
	rdma_disconnect(sc->rdma.cm_id);
	wait_event(sc->status_wait,
		   sc->status == SMBDIRECT_SOCKET_DISCONNECTED);

allocate_cache_failed:
rdma_connect_failed:
	rdma_destroy_qp(sc->rdma.cm_id);

create_qp_failed:
alloc_cq_failed:
	if (sc->ib.send_cq)
		ib_free_cq(sc->ib.send_cq);
	if (sc->ib.recv_cq)
		ib_free_cq(sc->ib.recv_cq);

	ib_dealloc_pd(sc->ib.pd);

alloc_pd_failed:
config_failed:
	rdma_destroy_id(sc->rdma.cm_id);

create_id_failed:
	destroy_workqueue(sc->workqueue);
create_wq_failed:
	kfree(info);
	return NULL;
}

struct smbd_connection *smbd_get_connection(
	struct TCP_Server_Info *server, struct sockaddr *dstaddr)
{
	struct smbd_connection *ret;
	const struct smbdirect_socket_parameters *sp;
	int port = SMBD_PORT;

try_again:
	ret = _smbd_get_connection(server, dstaddr, port);

	/* Try SMB_PORT if SMBD_PORT doesn't work */
	if (!ret && port == SMBD_PORT) {
		port = SMB_PORT;
		goto try_again;
	}
	if (!ret)
		return NULL;

	sp = &ret->socket.parameters;

	server->rdma_readwrite_threshold =
		rdma_readwrite_threshold > sp->max_fragmented_send_size ?
		sp->max_fragmented_send_size :
		rdma_readwrite_threshold;

	return ret;
}

/*
 * Receive data from the transport's receive reassembly queue
 * All the incoming data packets are placed in the reassembly queue
 * iter: the buffer to read data into
 * size: the length of data to read
 * return value: actual data read
 *
 * Note: this implementation copies the data from the reassembly queue to the
 * receive buffers used by the upper layer. This is not the optimal code path.
 * A better way to do it is to not have the upper layer allocate its receive
 * buffers but rather borrow the buffer from the reassembly queue, and return
 * it after the data is consumed. But this would require more changes to upper
 * layer code, and would also need to consider packet boundaries while they
 * are still being reassembled
 */
int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
{
	struct smbdirect_socket *sc = &info->socket;
	struct smbdirect_recv_io *response;
	struct smbdirect_data_transfer *data_transfer;
	size_t size = iov_iter_count(&msg->msg_iter);
	int to_copy, to_read, data_read, offset;
	u32 data_length, remaining_data_length, data_offset;
	int rc;

	if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE))
		return -EINVAL; /* It's a bug in upper layer to get there */

again:
	/*
	 * No need to hold the reassembly queue lock all the time as we are
	 * the only one reading from the front of the queue. The transport
	 * may add more entries to the back of the queue at the same time
	 */
	log_read(INFO, "size=%zd sc->recv_io.reassembly.data_length=%d\n", size,
		 sc->recv_io.reassembly.data_length);
	if (sc->recv_io.reassembly.data_length >= size) {
		int queue_length;
		int queue_removed = 0;
		unsigned long flags;

		/*
		 * Need to make sure reassembly_data_length is read before
		 * reading reassembly_queue_length and calling
		 * _get_first_reassembly. This call is lock free
		 * as we never read at the end of the queue, which is being
		 * updated in SOFTIRQ context as more data is received
		 */
		virt_rmb();
		queue_length = sc->recv_io.reassembly.queue_length;
		data_read = 0;
		to_read = size;
		offset = sc->recv_io.reassembly.first_entry_offset;
		while (data_read < size) {
			response = _get_first_reassembly(sc);
			data_transfer = smbdirect_recv_io_payload(response);
			data_length = le32_to_cpu(data_transfer->data_length);
			remaining_data_length =
				le32_to_cpu(
					data_transfer->remaining_data_length);
			data_offset = le32_to_cpu(data_transfer->data_offset);

			/*
			 * The upper layer expects the RFC1002 length at the
			 * beginning of the payload. Return it to indicate
			 * the total length of the packet. This minimizes the
			 * change to the upper layer packet processing logic.
			 * This will eventually be removed when an
			 * intermediate transport layer is added
			 */
			if (response->first_segment && size == 4) {
				unsigned int rfc1002_len =
					data_length + remaining_data_length;
				__be32 rfc1002_hdr = cpu_to_be32(rfc1002_len);
				if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr),
						 &msg->msg_iter) != sizeof(rfc1002_hdr))
					return -EFAULT;
				data_read = 4;
				response->first_segment = false;
				log_read(INFO, "returning rfc1002 length %d\n",
					 rfc1002_len);
				goto read_rfc1002_done;
			}

			to_copy = min_t(int, data_length - offset, to_read);
			if (copy_to_iter((char *)data_transfer + data_offset + offset,
					 to_copy, &msg->msg_iter) != to_copy)
				return -EFAULT;

			/* move on to the next buffer? */
			if (to_copy == data_length - offset) {
				queue_length--;
				/*
				 * No need to lock if we are not at the
				 * end of the queue
				 */
				if (queue_length)
					list_del(&response->list);
				else {
					spin_lock_irqsave(
						&sc->recv_io.reassembly.lock, flags);
					list_del(&response->list);
					spin_unlock_irqrestore(
						&sc->recv_io.reassembly.lock, flags);
				}
				queue_removed++;
				sc->statistics.dequeue_reassembly_queue++;
				put_receive_buffer(sc, response);
				offset = 0;
				log_read(INFO, "put_receive_buffer offset=0\n");
			} else
				offset += to_copy;

			to_read -= to_copy;
			data_read += to_copy;

			log_read(INFO, "_get_first_reassembly memcpy %d bytes data_transfer_length-offset=%d after that to_read=%d data_read=%d offset=%d\n",
				 to_copy, data_length - offset,
				 to_read, data_read, offset);
		}

		spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
		sc->recv_io.reassembly.data_length -= data_read;
		sc->recv_io.reassembly.queue_length -= queue_removed;
		spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);

		sc->recv_io.reassembly.first_entry_offset = offset;
		log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
			 data_read, sc->recv_io.reassembly.data_length,
			 sc->recv_io.reassembly.first_entry_offset);
read_rfc1002_done:
		return data_read;
	}

	log_read(INFO, "wait_event on more data\n");
	rc = wait_event_interruptible(
		sc->recv_io.reassembly.wait_queue,
		sc->recv_io.reassembly.data_length >= size ||
		sc->status != SMBDIRECT_SOCKET_CONNECTED);
	/* Don't return any data if interrupted */
	if (rc)
		return rc;

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
		log_read(ERR, "disconnected\n");
		return -ECONNABORTED;
	}

	goto again;
}

/*
 * Send data to the transport
 * Each rqst is transported as an SMBDirect payload
 * rqst: the data to write
 * return value: 0 if successfully written, otherwise error code
 */
int smbd_send(struct TCP_Server_Info *server,
	      int num_rqst, struct smb_rqst *rqst_array)
{
	struct smbd_connection *info = server->smbd_conn;
	struct smbdirect_socket *sc = &info->socket;
	struct smbdirect_socket_parameters *sp = &sc->parameters;
	struct smb_rqst *rqst;
	struct iov_iter iter;
	unsigned int remaining_data_length, klen;
	int rc, i, rqst_idx;

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
		return -EAGAIN;

	/*
	 * Add in the page array if there is one.
The caller needs to set 2233 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and 2234 * ends at page boundary 2235 */ 2236 remaining_data_length = 0; 2237 for (i = 0; i < num_rqst; i++) 2238 remaining_data_length += smb_rqst_len(server, &rqst_array[i]); 2239 2240 if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) { 2241 /* assertion: payload never exceeds negotiated maximum */ 2242 log_write(ERR, "payload size %d > max size %d\n", 2243 remaining_data_length, sp->max_fragmented_send_size); 2244 return -EINVAL; 2245 } 2246 2247 log_write(INFO, "num_rqst=%d total length=%u\n", 2248 num_rqst, remaining_data_length); 2249 2250 rqst_idx = 0; 2251 do { 2252 rqst = &rqst_array[rqst_idx]; 2253 2254 cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", 2255 rqst_idx, smb_rqst_len(server, rqst)); 2256 for (i = 0; i < rqst->rq_nvec; i++) 2257 dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len); 2258 2259 log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n", 2260 rqst_idx, rqst->rq_nvec, remaining_data_length, 2261 iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst)); 2262 2263 /* Send the metadata pages. */ 2264 klen = 0; 2265 for (i = 0; i < rqst->rq_nvec; i++) 2266 klen += rqst->rq_iov[i].iov_len; 2267 iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); 2268 2269 rc = smbd_post_send_full_iter(sc, &iter, &remaining_data_length); 2270 if (rc < 0) 2271 break; 2272 2273 if (iov_iter_count(&rqst->rq_iter) > 0) { 2274 /* And then the data pages if there are any */ 2275 rc = smbd_post_send_full_iter(sc, &rqst->rq_iter, 2276 &remaining_data_length); 2277 if (rc < 0) 2278 break; 2279 } 2280 2281 } while (++rqst_idx < num_rqst); 2282 2283 /* 2284 * As an optimization, we don't wait for individual I/O to finish 2285 * before sending the next one. 2286 * Send them all and wait for pending send count to get to 0 2287 * that means all the I/Os have been out and we are good to return 2288 */ 2289 2290 wait_event(sc->send_io.pending.zero_wait_queue, 2291 atomic_read(&sc->send_io.pending.count) == 0 || 2292 sc->status != SMBDIRECT_SOCKET_CONNECTED); 2293 2294 if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0) 2295 rc = -EAGAIN; 2296 2297 return rc; 2298 } 2299 2300 static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) 2301 { 2302 struct smbdirect_mr_io *mr = 2303 container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe); 2304 struct smbdirect_socket *sc = mr->socket; 2305 2306 if (wc->status) { 2307 log_rdma_mr(ERR, "status=%d\n", wc->status); 2308 smbd_disconnect_rdma_connection(sc); 2309 } 2310 } 2311 2312 /* 2313 * The work queue function that recovers MRs 2314 * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used 2315 * again. Both calls are slow, so finish them in a workqueue. This will not 2316 * block I/O path. 2317 * There is one workqueue that recovers MRs, there is no need to lock as the 2318 * I/O requests calling smbd_register_mr will never update the links in the 2319 * mr_list. 
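 * The main thing shared with the I/O path is the per-MR state field: this
 * work turns SMBDIRECT_MR_ERROR entries back into SMBDIRECT_MR_READY, while
 * get_mr() only moves SMBDIRECT_MR_READY entries to SMBDIRECT_MR_REGISTERED.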
2320 */ 2321 static void smbd_mr_recovery_work(struct work_struct *work) 2322 { 2323 struct smbdirect_socket *sc = 2324 container_of(work, struct smbdirect_socket, mr_io.recovery_work); 2325 struct smbdirect_socket_parameters *sp = &sc->parameters; 2326 struct smbdirect_mr_io *smbdirect_mr; 2327 int rc; 2328 2329 list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) { 2330 if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) { 2331 2332 /* recover this MR entry */ 2333 rc = ib_dereg_mr(smbdirect_mr->mr); 2334 if (rc) { 2335 log_rdma_mr(ERR, 2336 "ib_dereg_mr failed rc=%x\n", 2337 rc); 2338 smbd_disconnect_rdma_connection(sc); 2339 continue; 2340 } 2341 2342 smbdirect_mr->mr = ib_alloc_mr( 2343 sc->ib.pd, sc->mr_io.type, 2344 sp->max_frmr_depth); 2345 if (IS_ERR(smbdirect_mr->mr)) { 2346 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", 2347 sc->mr_io.type, 2348 sp->max_frmr_depth); 2349 smbd_disconnect_rdma_connection(sc); 2350 continue; 2351 } 2352 } else 2353 /* This MR is being used, don't recover it */ 2354 continue; 2355 2356 smbdirect_mr->state = SMBDIRECT_MR_READY; 2357 2358 /* smbdirect_mr->state is updated by this function 2359 * and is read and updated by I/O issuing CPUs trying 2360 * to get a MR, the call to atomic_inc_return 2361 * implicates a memory barrier and guarantees this 2362 * value is updated before waking up any calls to 2363 * get_mr() from the I/O issuing CPUs 2364 */ 2365 if (atomic_inc_return(&sc->mr_io.ready.count) == 1) 2366 wake_up(&sc->mr_io.ready.wait_queue); 2367 } 2368 } 2369 2370 static void smbd_mr_disable_locked(struct smbdirect_mr_io *mr) 2371 { 2372 struct smbdirect_socket *sc = mr->socket; 2373 2374 lockdep_assert_held(&mr->mutex); 2375 2376 if (mr->state == SMBDIRECT_MR_DISABLED) 2377 return; 2378 2379 if (mr->mr) 2380 ib_dereg_mr(mr->mr); 2381 if (mr->sgt.nents) 2382 ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); 2383 kfree(mr->sgt.sgl); 2384 2385 mr->mr = NULL; 2386 mr->sgt.sgl = NULL; 2387 mr->sgt.nents = 0; 2388 2389 mr->state = SMBDIRECT_MR_DISABLED; 2390 } 2391 2392 static void smbd_mr_free_locked(struct kref *kref) 2393 { 2394 struct smbdirect_mr_io *mr = 2395 container_of(kref, struct smbdirect_mr_io, kref); 2396 2397 lockdep_assert_held(&mr->mutex); 2398 2399 /* 2400 * smbd_mr_disable_locked() should already be called! 2401 */ 2402 if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED)) 2403 smbd_mr_disable_locked(mr); 2404 2405 mutex_unlock(&mr->mutex); 2406 mutex_destroy(&mr->mutex); 2407 kfree(mr); 2408 } 2409 2410 static void destroy_mr_list(struct smbdirect_socket *sc) 2411 { 2412 struct smbdirect_mr_io *mr, *tmp; 2413 LIST_HEAD(all_list); 2414 unsigned long flags; 2415 2416 disable_work_sync(&sc->mr_io.recovery_work); 2417 2418 spin_lock_irqsave(&sc->mr_io.all.lock, flags); 2419 list_splice_tail_init(&sc->mr_io.all.list, &all_list); 2420 spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); 2421 2422 list_for_each_entry_safe(mr, tmp, &all_list, list) { 2423 mutex_lock(&mr->mutex); 2424 2425 smbd_mr_disable_locked(mr); 2426 list_del(&mr->list); 2427 mr->socket = NULL; 2428 2429 /* 2430 * No kref_put_mutex() as it's already locked. 2431 * 2432 * If smbd_mr_free_locked() is called 2433 * and the mutex is unlocked and mr is gone, 2434 * in that case kref_put() returned 1. 2435 * 2436 * If kref_put() returned 0 we know that 2437 * smbd_mr_free_locked() didn't 2438 * run. Not by us nor by anyone else, as we 2439 * still hold the mutex, so we need to unlock. 
2440 * 2441 * If the mr is still registered it will 2442 * be dangling (detached from the connection 2443 * waiting for smbd_deregister_mr() to be 2444 * called in order to free the memory. 2445 */ 2446 if (!kref_put(&mr->kref, smbd_mr_free_locked)) 2447 mutex_unlock(&mr->mutex); 2448 } 2449 } 2450 2451 /* 2452 * Allocate MRs used for RDMA read/write 2453 * The number of MRs will not exceed hardware capability in responder_resources 2454 * All MRs are kept in mr_list. The MR can be recovered after it's used 2455 * Recovery is done in smbd_mr_recovery_work. The content of list entry changes 2456 * as MRs are used and recovered for I/O, but the list links will not change 2457 */ 2458 static int allocate_mr_list(struct smbdirect_socket *sc) 2459 { 2460 struct smbdirect_socket_parameters *sp = &sc->parameters; 2461 struct smbdirect_mr_io *mr; 2462 int ret; 2463 u32 i; 2464 2465 if (sp->responder_resources == 0) { 2466 log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); 2467 return -EINVAL; 2468 } 2469 2470 /* Allocate more MRs (2x) than hardware responder_resources */ 2471 for (i = 0; i < sp->responder_resources * 2; i++) { 2472 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 2473 if (!mr) { 2474 ret = -ENOMEM; 2475 goto kzalloc_mr_failed; 2476 } 2477 2478 kref_init(&mr->kref); 2479 mutex_init(&mr->mutex); 2480 2481 mr->mr = ib_alloc_mr(sc->ib.pd, 2482 sc->mr_io.type, 2483 sp->max_frmr_depth); 2484 if (IS_ERR(mr->mr)) { 2485 ret = PTR_ERR(mr->mr); 2486 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", 2487 sc->mr_io.type, sp->max_frmr_depth); 2488 goto ib_alloc_mr_failed; 2489 } 2490 2491 mr->sgt.sgl = kcalloc(sp->max_frmr_depth, 2492 sizeof(struct scatterlist), 2493 GFP_KERNEL); 2494 if (!mr->sgt.sgl) { 2495 ret = -ENOMEM; 2496 log_rdma_mr(ERR, "failed to allocate sgl\n"); 2497 goto kcalloc_sgl_failed; 2498 } 2499 mr->state = SMBDIRECT_MR_READY; 2500 mr->socket = sc; 2501 2502 list_add_tail(&mr->list, &sc->mr_io.all.list); 2503 atomic_inc(&sc->mr_io.ready.count); 2504 } 2505 2506 INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); 2507 2508 return 0; 2509 2510 kcalloc_sgl_failed: 2511 ib_dereg_mr(mr->mr); 2512 ib_alloc_mr_failed: 2513 mutex_destroy(&mr->mutex); 2514 kfree(mr); 2515 kzalloc_mr_failed: 2516 destroy_mr_list(sc); 2517 return ret; 2518 } 2519 2520 /* 2521 * Get a MR from mr_list. This function waits until there is at least one 2522 * MR available in the list. It may access the list while the 2523 * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock 2524 * as they never modify the same places. However, there may be several CPUs 2525 * issuing I/O trying to get MR at the same time, mr_list_lock is used to 2526 * protect this situation. 
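 * (In the current code that lock is sc->mr_io.all.lock.)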
2527 */ 2528 static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc) 2529 { 2530 struct smbdirect_mr_io *ret; 2531 unsigned long flags; 2532 int rc; 2533 again: 2534 rc = wait_event_interruptible(sc->mr_io.ready.wait_queue, 2535 atomic_read(&sc->mr_io.ready.count) || 2536 sc->status != SMBDIRECT_SOCKET_CONNECTED); 2537 if (rc) { 2538 log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc); 2539 return NULL; 2540 } 2541 2542 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { 2543 log_rdma_mr(ERR, "sc->status=%x\n", sc->status); 2544 return NULL; 2545 } 2546 2547 spin_lock_irqsave(&sc->mr_io.all.lock, flags); 2548 list_for_each_entry(ret, &sc->mr_io.all.list, list) { 2549 if (ret->state == SMBDIRECT_MR_READY) { 2550 ret->state = SMBDIRECT_MR_REGISTERED; 2551 kref_get(&ret->kref); 2552 spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); 2553 atomic_dec(&sc->mr_io.ready.count); 2554 atomic_inc(&sc->mr_io.used.count); 2555 return ret; 2556 } 2557 } 2558 2559 spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); 2560 /* 2561 * It is possible that we could fail to get MR because other processes may 2562 * try to acquire a MR at the same time. If this is the case, retry it. 2563 */ 2564 goto again; 2565 } 2566 2567 /* 2568 * Transcribe the pages from an iterator into an MR scatterlist. 2569 */ 2570 static int smbd_iter_to_mr(struct iov_iter *iter, 2571 struct sg_table *sgt, 2572 unsigned int max_sg) 2573 { 2574 int ret; 2575 2576 memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist)); 2577 2578 ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0); 2579 WARN_ON(ret < 0); 2580 if (sgt->nents > 0) 2581 sg_mark_end(&sgt->sgl[sgt->nents - 1]); 2582 return ret; 2583 } 2584 2585 /* 2586 * Register memory for RDMA read/write 2587 * iter: the buffer to register memory with 2588 * writing: true if this is a RDMA write (SMB read), false for RDMA read 2589 * need_invalidate: true if this MR needs to be locally invalidated after I/O 2590 * return value: the MR registered, NULL if failed. 2591 */ 2592 struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, 2593 struct iov_iter *iter, 2594 bool writing, bool need_invalidate) 2595 { 2596 struct smbdirect_socket *sc = &info->socket; 2597 struct smbdirect_socket_parameters *sp = &sc->parameters; 2598 struct smbdirect_mr_io *mr; 2599 int rc, num_pages; 2600 struct ib_reg_wr *reg_wr; 2601 2602 num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1); 2603 if (num_pages > sp->max_frmr_depth) { 2604 log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", 2605 num_pages, sp->max_frmr_depth); 2606 WARN_ON_ONCE(1); 2607 return NULL; 2608 } 2609 2610 mr = get_mr(sc); 2611 if (!mr) { 2612 log_rdma_mr(ERR, "get_mr returning NULL\n"); 2613 return NULL; 2614 } 2615 2616 mutex_lock(&mr->mutex); 2617 2618 mr->dir = writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE;
	mr->need_invalidate = need_invalidate;
	mr->sgt.nents = 0;
	mr->sgt.orig_nents = 0;

	log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
		    num_pages, iov_iter_count(iter), sp->max_frmr_depth);
	smbd_iter_to_mr(iter, &mr->sgt, sp->max_frmr_depth);

	rc = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
	if (!rc) {
		log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
			    num_pages, mr->dir, rc);
		goto dma_map_error;
	}

	rc = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
	if (rc != mr->sgt.nents) {
		log_rdma_mr(ERR,
			    "ib_map_mr_sg failed rc = %d nents = %x\n",
			    rc, mr->sgt.nents);
		goto map_mr_error;
	}

	ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
	reg_wr = &mr->wr;
	reg_wr->wr.opcode = IB_WR_REG_MR;
	mr->cqe.done = register_mr_done;
	reg_wr->wr.wr_cqe = &mr->cqe;
	reg_wr->wr.num_sge = 0;
	reg_wr->wr.send_flags = IB_SEND_SIGNALED;
	reg_wr->mr = mr->mr;
	reg_wr->key = mr->mr->rkey;
	reg_wr->access = writing ?
		IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
		IB_ACCESS_REMOTE_READ;

	/*
	 * There is no need to wait for completion of ib_post_send
	 * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
	 * on the next ib_post_send when we actually send I/O to the remote peer
	 */
	rc = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
	if (!rc) {
		/*
		 * get_mr() gave us a reference
		 * via kref_get(&mr->kref), we keep that and let
		 * the caller use smbd_deregister_mr()
		 * to remove it again.
		 */
		mutex_unlock(&mr->mutex);
		return mr;
	}

	log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
		    rc, reg_wr->key);

	/* If all failed, attempt to recover this MR by setting it to SMBDIRECT_MR_ERROR */
map_mr_error:
	ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);

dma_map_error:
	mr->sgt.nents = 0;
	mr->state = SMBDIRECT_MR_ERROR;
	if (atomic_dec_and_test(&sc->mr_io.used.count))
		wake_up(&sc->mr_io.cleanup.wait_queue);

	smbd_disconnect_rdma_connection(sc);

	/*
	 * get_mr() gave us a reference
	 * via kref_get(&mr->kref), we need to remove it again
	 * on error.
	 *
	 * No kref_put_mutex() as it's already locked.
	 *
	 * If smbd_mr_free_locked() is called
	 * and the mutex is unlocked and mr is gone,
	 * in that case kref_put() returned 1.
	 *
	 * If kref_put() returned 0 we know that
	 * smbd_mr_free_locked() didn't
	 * run. Not by us nor by anyone else, as we
	 * still hold the mutex, so we need to unlock.
	 */
	if (!kref_put(&mr->kref, smbd_mr_free_locked))
		mutex_unlock(&mr->mutex);

	return NULL;
}

static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbdirect_mr_io *smbdirect_mr;
	struct ib_cqe *cqe;

	cqe = wc->wr_cqe;
	smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe);
	smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED;
	if (wc->status != IB_WC_SUCCESS) {
		log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
		smbdirect_mr->state = SMBDIRECT_MR_ERROR;
	}
	complete(&smbdirect_mr->invalidate_done);
}

/*
 * Deregister a MR after I/O is done
 * This function may wait if remote invalidation is not used
 * and we have to locally invalidate the buffer to prevent the data from being
 * modified by the remote peer after the upper layer consumes it
 */
void smbd_deregister_mr(struct smbdirect_mr_io *mr)
{
	struct smbdirect_socket *sc = mr->socket;

	mutex_lock(&mr->mutex);
	if (mr->state == SMBDIRECT_MR_DISABLED)
		goto put_kref;

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
		smbd_mr_disable_locked(mr);
		goto put_kref;
	}

	if (mr->need_invalidate) {
		struct ib_send_wr *wr = &mr->inv_wr;
		int rc;

		/* Need to finish local invalidation before returning */
		wr->opcode = IB_WR_LOCAL_INV;
		mr->cqe.done = local_inv_done;
		wr->wr_cqe = &mr->cqe;
		wr->num_sge = 0;
		wr->ex.invalidate_rkey = mr->mr->rkey;
		wr->send_flags = IB_SEND_SIGNALED;

		init_completion(&mr->invalidate_done);
		rc = ib_post_send(sc->ib.qp, wr, NULL);
		if (rc) {
			log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
			smbd_mr_disable_locked(mr);
			smbd_disconnect_rdma_connection(sc);
			goto done;
		}
		wait_for_completion(&mr->invalidate_done);
		mr->need_invalidate = false;
	} else
		/*
		 * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED
		 * and defer to mr_recovery_work to recover the MR for next use
		 */
		mr->state = SMBDIRECT_MR_INVALIDATED;

	if (mr->sgt.nents) {
		ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
		mr->sgt.nents = 0;
	}

	if (mr->state == SMBDIRECT_MR_INVALIDATED) {
		mr->state = SMBDIRECT_MR_READY;
		if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
			wake_up(&sc->mr_io.ready.wait_queue);
	} else
		/*
		 * Schedule the work to do MR recovery for future I/Os. MR
		 * recovery is slow and we don't want it to block the current I/O
		 */
		queue_work(sc->workqueue, &sc->mr_io.recovery_work);

done:
	if (atomic_dec_and_test(&sc->mr_io.used.count))
		wake_up(&sc->mr_io.cleanup.wait_queue);

put_kref:
	/*
	 * No kref_put_mutex() as it's already locked.
	 *
	 * If smbd_mr_free_locked() is called
	 * and the mutex is unlocked and mr is gone,
	 * in that case kref_put() returned 1.
	 *
	 * If kref_put() returned 0 we know that
	 * smbd_mr_free_locked() didn't
	 * run. Not by us nor by anyone else, as we
	 * still hold the mutex, so we need to unlock
	 * and keep the mr in SMBDIRECT_MR_READY or
	 * SMBDIRECT_MR_ERROR state.
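	 * Either way the MR stays on sc->mr_io.all.list, so it can be
	 * handed out again by get_mr() or repaired by
	 * smbd_mr_recovery_work().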
2806 */ 2807 if (!kref_put(&mr->kref, smbd_mr_free_locked)) 2808 mutex_unlock(&mr->mutex); 2809 } 2810 2811 static bool smb_set_sge(struct smb_extract_to_rdma *rdma, 2812 struct page *lowest_page, size_t off, size_t len) 2813 { 2814 struct ib_sge *sge = &rdma->sge[rdma->nr_sge]; 2815 u64 addr; 2816 2817 addr = ib_dma_map_page(rdma->device, lowest_page, 2818 off, len, rdma->direction); 2819 if (ib_dma_mapping_error(rdma->device, addr)) 2820 return false; 2821 2822 sge->addr = addr; 2823 sge->length = len; 2824 sge->lkey = rdma->local_dma_lkey; 2825 rdma->nr_sge++; 2826 return true; 2827 } 2828 2829 /* 2830 * Extract page fragments from a BVEC-class iterator and add them to an RDMA 2831 * element list. The pages are not pinned. 2832 */ 2833 static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter, 2834 struct smb_extract_to_rdma *rdma, 2835 ssize_t maxsize) 2836 { 2837 const struct bio_vec *bv = iter->bvec; 2838 unsigned long start = iter->iov_offset; 2839 unsigned int i; 2840 ssize_t ret = 0; 2841 2842 for (i = 0; i < iter->nr_segs; i++) { 2843 size_t off, len; 2844 2845 len = bv[i].bv_len; 2846 if (start >= len) { 2847 start -= len; 2848 continue; 2849 } 2850 2851 len = min_t(size_t, maxsize, len - start); 2852 off = bv[i].bv_offset + start; 2853 2854 if (!smb_set_sge(rdma, bv[i].bv_page, off, len)) 2855 return -EIO; 2856 2857 ret += len; 2858 maxsize -= len; 2859 if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) 2860 break; 2861 start = 0; 2862 } 2863 2864 if (ret > 0) 2865 iov_iter_advance(iter, ret); 2866 return ret; 2867 } 2868 2869 /* 2870 * Extract fragments from a KVEC-class iterator and add them to an RDMA list. 2871 * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers. 2872 * The pages are not pinned. 2873 */ 2874 static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter, 2875 struct smb_extract_to_rdma *rdma, 2876 ssize_t maxsize) 2877 { 2878 const struct kvec *kv = iter->kvec; 2879 unsigned long start = iter->iov_offset; 2880 unsigned int i; 2881 ssize_t ret = 0; 2882 2883 for (i = 0; i < iter->nr_segs; i++) { 2884 struct page *page; 2885 unsigned long kaddr; 2886 size_t off, len, seg; 2887 2888 len = kv[i].iov_len; 2889 if (start >= len) { 2890 start -= len; 2891 continue; 2892 } 2893 2894 kaddr = (unsigned long)kv[i].iov_base + start; 2895 off = kaddr & ~PAGE_MASK; 2896 len = min_t(size_t, maxsize, len - start); 2897 kaddr &= PAGE_MASK; 2898 2899 maxsize -= len; 2900 do { 2901 seg = min_t(size_t, len, PAGE_SIZE - off); 2902 2903 if (is_vmalloc_or_module_addr((void *)kaddr)) 2904 page = vmalloc_to_page((void *)kaddr); 2905 else 2906 page = virt_to_page((void *)kaddr); 2907 2908 if (!smb_set_sge(rdma, page, off, seg)) 2909 return -EIO; 2910 2911 ret += seg; 2912 len -= seg; 2913 kaddr += PAGE_SIZE; 2914 off = 0; 2915 } while (len > 0 && rdma->nr_sge < rdma->max_sge); 2916 2917 if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) 2918 break; 2919 start = 0; 2920 } 2921 2922 if (ret > 0) 2923 iov_iter_advance(iter, ret); 2924 return ret; 2925 } 2926 2927 /* 2928 * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA 2929 * list. The folios are not pinned. 
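 * Unlike the BVEC/KVEC helpers above, this one advances the iterator by
 * updating iter->folioq, iter->folioq_slot, iter->iov_offset and iter->count
 * directly instead of calling iov_iter_advance().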
2930 */ 2931 static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, 2932 struct smb_extract_to_rdma *rdma, 2933 ssize_t maxsize) 2934 { 2935 const struct folio_queue *folioq = iter->folioq; 2936 unsigned int slot = iter->folioq_slot; 2937 ssize_t ret = 0; 2938 size_t offset = iter->iov_offset; 2939 2940 BUG_ON(!folioq); 2941 2942 if (slot >= folioq_nr_slots(folioq)) { 2943 folioq = folioq->next; 2944 if (WARN_ON_ONCE(!folioq)) 2945 return -EIO; 2946 slot = 0; 2947 } 2948 2949 do { 2950 struct folio *folio = folioq_folio(folioq, slot); 2951 size_t fsize = folioq_folio_size(folioq, slot); 2952 2953 if (offset < fsize) { 2954 size_t part = umin(maxsize, fsize - offset); 2955 2956 if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part)) 2957 return -EIO; 2958 2959 offset += part; 2960 ret += part; 2961 maxsize -= part; 2962 } 2963 2964 if (offset >= fsize) { 2965 offset = 0; 2966 slot++; 2967 if (slot >= folioq_nr_slots(folioq)) { 2968 if (!folioq->next) { 2969 WARN_ON_ONCE(ret < iter->count); 2970 break; 2971 } 2972 folioq = folioq->next; 2973 slot = 0; 2974 } 2975 } 2976 } while (rdma->nr_sge < rdma->max_sge && maxsize > 0); 2977 2978 iter->folioq = folioq; 2979 iter->folioq_slot = slot; 2980 iter->iov_offset = offset; 2981 iter->count -= ret; 2982 return ret; 2983 } 2984 2985 /* 2986 * Extract page fragments from up to the given amount of the source iterator 2987 * and build up an RDMA list that refers to all of those bits. The RDMA list 2988 * is appended to, up to the maximum number of elements set in the parameter 2989 * block. 2990 * 2991 * The extracted page fragments are not pinned or ref'd in any way; if an 2992 * IOVEC/UBUF-type iterator is to be used, it should be converted to a 2993 * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some 2994 * way. 2995 */ 2996 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, 2997 struct smb_extract_to_rdma *rdma) 2998 { 2999 ssize_t ret; 3000 int before = rdma->nr_sge; 3001 3002 switch (iov_iter_type(iter)) { 3003 case ITER_BVEC: 3004 ret = smb_extract_bvec_to_rdma(iter, rdma, len); 3005 break; 3006 case ITER_KVEC: 3007 ret = smb_extract_kvec_to_rdma(iter, rdma, len); 3008 break; 3009 case ITER_FOLIOQ: 3010 ret = smb_extract_folioq_to_rdma(iter, rdma, len); 3011 break; 3012 default: 3013 WARN_ON_ONCE(1); 3014 return -EIO; 3015 } 3016 3017 if (ret < 0) { 3018 while (rdma->nr_sge > before) { 3019 struct ib_sge *sge = &rdma->sge[rdma->nr_sge--]; 3020 3021 ib_dma_unmap_single(rdma->device, sge->addr, sge->length, 3022 rdma->direction); 3023 sge->addr = 0; 3024 } 3025 } 3026 3027 return ret; 3028 } 3029
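
/*
 * Illustrative sketch (not compiled in): how a caller is expected to drive
 * smb_extract_iter_to_rdma().  The sge array, iterator and length below are
 * placeholders for whatever the caller already owns.
 *
 *	struct ib_sge sges[SMBDIRECT_SEND_IO_MAX_SGE];
 *	struct smb_extract_to_rdma rdma = {
 *		.sge		= sges,
 *		.nr_sge		= 0,
 *		.max_sge	= ARRAY_SIZE(sges),
 *		.device		= sc->ib.dev,
 *		.local_dma_lkey	= sc->ib.pd->local_dma_lkey,
 *		.direction	= DMA_TO_DEVICE,
 *	};
 *	ssize_t n = smb_extract_iter_to_rdma(iter, len, &rdma);
 *
 * On success the first rdma.nr_sge entries of sges[] are DMA-mapped and can
 * be attached to a work request; on failure the helper has already unmapped
 * whatever it added during this call.
 */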