1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2017, Microsoft Corporation. 4 * 5 * Author(s): Long Li <longli@microsoft.com> 6 */ 7 8 #include "smbdirect.h" 9 #include "cifs_debug.h" 10 #include "cifsproto.h" 11 #include "smb2proto.h" 12 13 /* Port numbers for SMBD transport */ 14 #define SMB_PORT 445 15 #define SMBD_PORT 5445 16 17 /* Address lookup and resolve timeout in ms */ 18 #define RDMA_RESOLVE_TIMEOUT 5000 19 20 /* SMBD negotiation timeout in seconds */ 21 #define SMBD_NEGOTIATE_TIMEOUT 120 22 23 /* The timeout to wait for a keepalive message from peer in seconds */ 24 #define KEEPALIVE_RECV_TIMEOUT 5 25 26 /* 27 * Default maximum number of RDMA read/write outstanding on this connection 28 * This value is possibly decreased during QP creation on hardware limit 29 */ 30 #define SMBD_CM_RESPONDER_RESOURCES 32 31 32 /* 33 * User configurable initial values per SMBD transport connection 34 * as defined in [MS-SMBD] 3.1.1.1 35 * Those may change after a SMBD negotiation 36 */ 37 /* The local peer's maximum number of credits to grant to the peer */ 38 int smbd_receive_credit_max = 255; 39 40 /* The remote peer's credit request of local peer */ 41 int smbd_send_credit_target = 255; 42 43 /* The maximum single message size can be sent to remote peer */ 44 int smbd_max_send_size = 1364; 45 46 /* 47 * The maximum fragmented upper-layer payload receive size supported 48 * 49 * Assume max_payload_per_credit is 50 * smbd_max_receive_size - 24 = 1340 51 * 52 * The maximum number would be 53 * smbd_receive_credit_max * max_payload_per_credit 54 * 55 * 1340 * 255 = 341700 (0x536C4) 56 * 57 * The minimum value from the spec is 131072 (0x20000) 58 * 59 * For now we use the logic we used in ksmbd before: 60 * (1364 * 255) / 2 = 173910 (0x2A756) 61 */ 62 int smbd_max_fragmented_recv_size = (1364 * 255) / 2; 63 64 /* The maximum single-message size which can be received */ 65 int smbd_max_receive_size = 1364; 66 67 /* The timeout to initiate send of a keepalive message on idle */ 68 int smbd_keep_alive_interval = 120; 69 70 /* 71 * User configurable initial values for RDMA transport 72 * The actual values used may be lower and are limited to hardware capabilities 73 */ 74 /* Default maximum number of pages in a single RDMA write/read */ 75 int smbd_max_frmr_depth = 2048; 76 77 /* If payload is less than this byte, use RDMA send/recv not read/write */ 78 int rdma_readwrite_threshold = 4096; 79 80 /* Transport logging functions 81 * Logging are defined as classes. They can be OR'ed to define the actual 82 * logging level via module parameter smbd_logging_class 83 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and 84 * log_rdma_event() 85 */ 86 #define LOG_OUTGOING 0x1 87 #define LOG_INCOMING 0x2 88 #define LOG_READ 0x4 89 #define LOG_WRITE 0x8 90 #define LOG_RDMA_SEND 0x10 91 #define LOG_RDMA_RECV 0x20 92 #define LOG_KEEP_ALIVE 0x40 93 #define LOG_RDMA_EVENT 0x80 94 #define LOG_RDMA_MR 0x100 95 static unsigned int smbd_logging_class; 96 module_param(smbd_logging_class, uint, 0644); 97 MODULE_PARM_DESC(smbd_logging_class, 98 "Logging class for SMBD transport 0x0 to 0x100"); 99 100 #define ERR 0x0 101 #define INFO 0x1 102 static unsigned int smbd_logging_level = ERR; 103 module_param(smbd_logging_level, uint, 0644); 104 MODULE_PARM_DESC(smbd_logging_level, 105 "Logging level for SMBD transport, 0 (default): error, 1: info"); 106 107 static bool smbd_logging_needed(struct smbdirect_socket *sc, 108 void *private_ptr, 109 unsigned int lvl, 110 unsigned int cls) 111 { 112 #define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_LOG_ ##x) 113 BUILD_BUG_SAME(ERR); 114 BUILD_BUG_SAME(INFO); 115 #undef BUILD_BUG_SAME 116 #define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_ ##x) 117 BUILD_BUG_SAME(LOG_OUTGOING); 118 BUILD_BUG_SAME(LOG_INCOMING); 119 BUILD_BUG_SAME(LOG_READ); 120 BUILD_BUG_SAME(LOG_WRITE); 121 BUILD_BUG_SAME(LOG_RDMA_SEND); 122 BUILD_BUG_SAME(LOG_RDMA_RECV); 123 BUILD_BUG_SAME(LOG_KEEP_ALIVE); 124 BUILD_BUG_SAME(LOG_RDMA_EVENT); 125 BUILD_BUG_SAME(LOG_RDMA_MR); 126 #undef BUILD_BUG_SAME 127 128 if (lvl <= smbd_logging_level || cls & smbd_logging_class) 129 return true; 130 return false; 131 } 132 133 static void smbd_logging_vaprintf(struct smbdirect_socket *sc, 134 const char *func, 135 unsigned int line, 136 void *private_ptr, 137 unsigned int lvl, 138 unsigned int cls, 139 struct va_format *vaf) 140 { 141 cifs_dbg(VFS, "%s:%u %pV", func, line, vaf); 142 } 143 144 #define log_rdma(level, class, fmt, args...) \ 145 do { \ 146 if (level <= smbd_logging_level || class & smbd_logging_class) \ 147 cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\ 148 } while (0) 149 150 #define log_outgoing(level, fmt, args...) \ 151 log_rdma(level, LOG_OUTGOING, fmt, ##args) 152 #define log_incoming(level, fmt, args...) \ 153 log_rdma(level, LOG_INCOMING, fmt, ##args) 154 #define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args) 155 #define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args) 156 #define log_rdma_send(level, fmt, args...) \ 157 log_rdma(level, LOG_RDMA_SEND, fmt, ##args) 158 #define log_rdma_recv(level, fmt, args...) \ 159 log_rdma(level, LOG_RDMA_RECV, fmt, ##args) 160 #define log_keep_alive(level, fmt, args...) \ 161 log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args) 162 #define log_rdma_event(level, fmt, args...) \ 163 log_rdma(level, LOG_RDMA_EVENT, fmt, ##args) 164 #define log_rdma_mr(level, fmt, args...) \ 165 log_rdma(level, LOG_RDMA_MR, fmt, ##args) 166 167 static int smbd_post_send_full_iter(struct smbdirect_socket *sc, 168 struct smbdirect_send_batch *batch, 169 struct iov_iter *iter, 170 u32 remaining_data_length) 171 { 172 int bytes = 0; 173 174 /* 175 * smbdirect_connection_send_single_iter() respects the 176 * negotiated max_send_size, so we need to 177 * loop until the full iter is posted 178 */ 179 180 while (iov_iter_count(iter) > 0) { 181 int rc; 182 183 rc = smbdirect_connection_send_single_iter(sc, 184 batch, 185 iter, 186 0, /* flags */ 187 remaining_data_length); 188 if (rc < 0) 189 return rc; 190 remaining_data_length -= rc; 191 bytes += rc; 192 } 193 194 return bytes; 195 } 196 197 /* 198 * Destroy the transport and related RDMA and memory resources 199 * Need to go through all the pending counters and make sure on one is using 200 * the transport while it is destroyed 201 */ 202 void smbd_destroy(struct TCP_Server_Info *server) 203 { 204 struct smbd_connection *info = server->smbd_conn; 205 206 if (!info) { 207 log_rdma_event(INFO, "rdma session already destroyed\n"); 208 return; 209 } 210 211 smbdirect_socket_release(info->socket); 212 213 kfree(info); 214 server->smbd_conn = NULL; 215 } 216 217 /* 218 * Reconnect this SMBD connection, called from upper layer 219 * return value: 0 on success, or actual error code 220 */ 221 int smbd_reconnect(struct TCP_Server_Info *server) 222 { 223 log_rdma_event(INFO, "reconnecting rdma session\n"); 224 225 if (!server->smbd_conn) { 226 log_rdma_event(INFO, "rdma session already destroyed\n"); 227 goto create_conn; 228 } 229 230 /* 231 * This is possible if transport is disconnected and we haven't received 232 * notification from RDMA, but upper layer has detected timeout 233 */ 234 log_rdma_event(INFO, "disconnecting transport\n"); 235 smbd_destroy(server); 236 237 create_conn: 238 log_rdma_event(INFO, "creating rdma session\n"); 239 server->smbd_conn = smbd_get_connection( 240 server, (struct sockaddr *) &server->dstaddr); 241 242 if (server->smbd_conn) { 243 cifs_dbg(VFS, "RDMA transport re-established\n"); 244 trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr); 245 return 0; 246 } 247 trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr); 248 return -ENOENT; 249 } 250 251 /* Create a SMBD connection, called by upper layer */ 252 static struct smbd_connection *_smbd_get_connection( 253 struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port) 254 { 255 struct net *net = cifs_net_ns(server); 256 struct smbd_connection *info; 257 struct smbdirect_socket *sc; 258 struct smbdirect_socket_parameters init_params = {}; 259 struct smbdirect_socket_parameters *sp; 260 __be16 *sport; 261 u64 port_flags = 0; 262 int ret; 263 264 switch (port) { 265 case SMBD_PORT: 266 /* 267 * only allow iWarp devices 268 * for port 5445. 269 */ 270 port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW; 271 break; 272 case SMB_PORT: 273 /* 274 * only allow InfiniBand, RoCEv1 or RoCEv2 275 * devices for port 445. 276 * 277 * (Basically don't allow iWarp devices) 278 */ 279 port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB; 280 break; 281 } 282 283 /* 284 * Create the initial parameters 285 */ 286 sp = &init_params; 287 sp->flags = port_flags; 288 sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT; 289 sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT; 290 sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT; 291 sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000; 292 sp->initiator_depth = 1; 293 sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES; 294 sp->recv_credit_max = smbd_receive_credit_max; 295 sp->send_credit_target = smbd_send_credit_target; 296 sp->max_send_size = smbd_max_send_size; 297 sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; 298 sp->max_recv_size = smbd_max_receive_size; 299 sp->max_frmr_depth = smbd_max_frmr_depth; 300 sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; 301 sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000; 302 303 info = kzalloc_obj(*info); 304 if (!info) 305 return NULL; 306 ret = smbdirect_socket_create_kern(net, &sc); 307 if (ret) 308 goto socket_init_failed; 309 smbdirect_socket_set_logging(sc, NULL, smbd_logging_needed, smbd_logging_vaprintf); 310 ret = smbdirect_socket_set_initial_parameters(sc, sp); 311 if (ret) 312 goto set_params_failed; 313 ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_SOFTIRQ, GFP_KERNEL); 314 if (ret) 315 goto set_settings_failed; 316 317 if (dstaddr->sa_family == AF_INET6) 318 sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port; 319 else 320 sport = &((struct sockaddr_in *)dstaddr)->sin_port; 321 322 *sport = htons(port); 323 324 ret = smbdirect_connect_sync(sc, dstaddr); 325 if (ret) { 326 log_rdma_event(ERR, "connect to %pISpsfc failed: %1pe\n", 327 dstaddr, ERR_PTR(ret)); 328 goto connect_failed; 329 } 330 331 info->socket = sc; 332 return info; 333 334 connect_failed: 335 set_settings_failed: 336 set_params_failed: 337 smbdirect_socket_release(sc); 338 socket_init_failed: 339 kfree(info); 340 return NULL; 341 } 342 343 const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn) 344 { 345 if (unlikely(!conn->socket)) { 346 static const struct smbdirect_socket_parameters zero_params; 347 348 return &zero_params; 349 } 350 351 return smbdirect_socket_get_current_parameters(conn->socket); 352 } 353 354 struct smbd_connection *smbd_get_connection( 355 struct TCP_Server_Info *server, struct sockaddr *dstaddr) 356 { 357 struct smbd_connection *ret; 358 const struct smbdirect_socket_parameters *sp; 359 int port = SMBD_PORT; 360 361 try_again: 362 ret = _smbd_get_connection(server, dstaddr, port); 363 364 /* Try SMB_PORT if SMBD_PORT doesn't work */ 365 if (!ret && port == SMBD_PORT) { 366 port = SMB_PORT; 367 goto try_again; 368 } 369 if (!ret) 370 return NULL; 371 372 sp = smbd_get_parameters(ret); 373 374 server->rdma_readwrite_threshold = 375 rdma_readwrite_threshold > sp->max_fragmented_send_size ? 376 sp->max_fragmented_send_size : 377 rdma_readwrite_threshold; 378 379 return ret; 380 } 381 382 /* 383 * Receive data from the transport's receive reassembly queue 384 * All the incoming data packets are placed in reassembly queue 385 * iter: the buffer to read data into 386 * size: the length of data to read 387 * return value: actual data read 388 * 389 * Note: this implementation copies the data from reassembly queue to receive 390 * buffers used by upper layer. This is not the optimal code path. A better way 391 * to do it is to not have upper layer allocate its receive buffers but rather 392 * borrow the buffer from reassembly queue, and return it after data is 393 * consumed. But this will require more changes to upper layer code, and also 394 * need to consider packet boundaries while they still being reassembled. 395 */ 396 int smbd_recv(struct smbd_connection *info, struct msghdr *msg) 397 { 398 struct smbdirect_socket *sc = info->socket; 399 400 if (!smbdirect_connection_is_connected(sc)) 401 return -ENOTCONN; 402 403 return smbdirect_connection_recvmsg(sc, msg, 0); 404 } 405 406 /* 407 * Send data to transport 408 * Each rqst is transported as a SMBDirect payload 409 * rqst: the data to write 410 * return value: 0 if successfully write, otherwise error code 411 */ 412 int smbd_send(struct TCP_Server_Info *server, 413 int num_rqst, struct smb_rqst *rqst_array) 414 { 415 struct smbd_connection *info = server->smbd_conn; 416 struct smbdirect_socket *sc = info->socket; 417 const struct smbdirect_socket_parameters *sp = smbd_get_parameters(info); 418 struct smb_rqst *rqst; 419 struct iov_iter iter; 420 struct smbdirect_send_batch_storage bstorage; 421 struct smbdirect_send_batch *batch; 422 unsigned int remaining_data_length, klen; 423 int rc, i, rqst_idx; 424 int error = 0; 425 426 if (!smbdirect_connection_is_connected(sc)) 427 return -EAGAIN; 428 429 /* 430 * Add in the page array if there is one. The caller needs to set 431 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and 432 * ends at page boundary 433 */ 434 remaining_data_length = 0; 435 for (i = 0; i < num_rqst; i++) 436 remaining_data_length += smb_rqst_len(server, &rqst_array[i]); 437 438 if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) { 439 /* assertion: payload never exceeds negotiated maximum */ 440 log_write(ERR, "payload size %d > max size %d\n", 441 remaining_data_length, sp->max_fragmented_send_size); 442 return -EINVAL; 443 } 444 445 log_write(INFO, "num_rqst=%d total length=%u\n", 446 num_rqst, remaining_data_length); 447 448 rqst_idx = 0; 449 batch = smbdirect_init_send_batch_storage(&bstorage, false, 0); 450 do { 451 rqst = &rqst_array[rqst_idx]; 452 453 cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", 454 rqst_idx, smb_rqst_len(server, rqst)); 455 for (i = 0; i < rqst->rq_nvec; i++) 456 dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len); 457 458 log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n", 459 rqst_idx, rqst->rq_nvec, remaining_data_length, 460 iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst)); 461 462 /* Send the metadata pages. */ 463 klen = 0; 464 for (i = 0; i < rqst->rq_nvec; i++) 465 klen += rqst->rq_iov[i].iov_len; 466 iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); 467 468 rc = smbd_post_send_full_iter(sc, batch, &iter, remaining_data_length); 469 if (rc < 0) { 470 error = rc; 471 break; 472 } 473 remaining_data_length -= rc; 474 475 if (iov_iter_count(&rqst->rq_iter) > 0) { 476 /* And then the data pages if there are any */ 477 rc = smbd_post_send_full_iter(sc, batch, &rqst->rq_iter, 478 remaining_data_length); 479 if (rc < 0) { 480 error = rc; 481 break; 482 } 483 remaining_data_length -= rc; 484 } 485 486 } while (++rqst_idx < num_rqst); 487 488 rc = smbdirect_connection_send_batch_flush(sc, batch, true); 489 if (unlikely(!rc && error)) 490 rc = error; 491 492 /* 493 * As an optimization, we don't wait for individual I/O to finish 494 * before sending the next one. 495 * Send them all and wait for pending send count to get to 0 496 * that means all the I/Os have been out and we are good to return 497 */ 498 499 error = rc; 500 rc = smbdirect_connection_send_wait_zero_pending(sc); 501 if (unlikely(rc && !error)) 502 error = -EAGAIN; 503 504 if (unlikely(error)) 505 return error; 506 507 return 0; 508 } 509 510 /* 511 * Register memory for RDMA read/write 512 * iter: the buffer to register memory with 513 * writing: true if this is a RDMA write (SMB read), false for RDMA read 514 * need_invalidate: true if this MR needs to be locally invalidated after I/O 515 * return value: the MR registered, NULL if failed. 516 */ 517 struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, 518 struct iov_iter *iter, 519 bool writing, bool need_invalidate) 520 { 521 struct smbdirect_socket *sc = info->socket; 522 523 if (!smbdirect_connection_is_connected(sc)) 524 return NULL; 525 526 return smbdirect_connection_register_mr_io(sc, iter, writing, need_invalidate); 527 } 528 529 void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr, 530 struct smbdirect_buffer_descriptor_v1 *v1) 531 { 532 smbdirect_mr_io_fill_buffer_descriptor(mr, v1); 533 } 534 535 /* 536 * Deregister a MR after I/O is done 537 * This function may wait if remote invalidation is not used 538 * and we have to locally invalidate the buffer to prevent data is being 539 * modified by remote peer after upper layer consumes it 540 */ 541 void smbd_deregister_mr(struct smbdirect_mr_io *mr) 542 { 543 smbdirect_connection_deregister_mr_io(mr); 544 } 545 546 void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m) 547 { 548 if (!server->rdma) 549 return; 550 551 if (!server->smbd_conn) { 552 seq_puts(m, "\nSMBDirect transport not available"); 553 return; 554 } 555 556 smbdirect_connection_legacy_debug_proc_show(server->smbd_conn->socket, 557 server->rdma_readwrite_threshold, 558 m); 559 } 560 561 MODULE_IMPORT_NS("SMBDIRECT"); 562