// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017, Microsoft Corporation.
 *
 * Author(s): Long Li <longli@microsoft.com>
 */

#include "smbdirect.h"
#include "cifs_debug.h"
#include "cifsproto.h"
#include "smb2proto.h"
#include "../common/smbdirect/smbdirect_public.h"

/* Port numbers for SMBD transport */
#define SMB_PORT	445
#define SMBD_PORT	5445

/* Address lookup and resolve timeout in ms */
#define RDMA_RESOLVE_TIMEOUT	5000

/* SMBD negotiation timeout in seconds */
#define SMBD_NEGOTIATE_TIMEOUT	120

/* The timeout to wait for a keepalive message from peer in seconds */
#define KEEPALIVE_RECV_TIMEOUT	5

/*
 * Default maximum number of RDMA read/write outstanding on this connection
 * This value may be decreased during QP creation if the hardware imposes a
 * lower limit
 */
#define SMBD_CM_RESPONDER_RESOURCES	32

/*
 * User configurable initial values per SMBD transport connection
 * as defined in [MS-SMBD] 3.1.1.1
 * These may change after SMBD negotiation
 */
/* The local peer's maximum number of credits to grant to the peer */
int smbd_receive_credit_max = 255;

/* The remote peer's credit request of the local peer */
int smbd_send_credit_target = 255;

/* The maximum single-message size that can be sent to the remote peer */
int smbd_max_send_size = 1364;

/*
 * The maximum fragmented upper-layer payload receive size supported
 *
 * Assume max_payload_per_credit is
 *	smbd_max_receive_size - 24 = 1340
 *
 * The maximum number would be
 *	smbd_receive_credit_max * max_payload_per_credit
 *
 *	1340 * 255 = 341700 (0x536C4)
 *
 * The minimum value from the spec is 131072 (0x20000)
 *
 * For now we use the logic we used in ksmbd before:
 *	(1364 * 255) / 2 = 173910 (0x2A756)
 */
int smbd_max_fragmented_recv_size = (1364 * 255) / 2;

/* The maximum single-message size which can be received */
int smbd_max_receive_size = 1364;

/* The timeout to initiate send of a keepalive message on idle */
int smbd_keep_alive_interval = 120;

/*
 * User configurable initial values for RDMA transport
 * The actual values used may be lower and are limited to hardware capabilities
 */
/* Default maximum number of pages in a single RDMA write/read */
int smbd_max_frmr_depth = 2048;

/*
 * If the payload is smaller than this many bytes, use RDMA send/recv
 * instead of RDMA read/write
 */
int rdma_readwrite_threshold = 4096;

/* Transport logging functions
 * Logging is divided into classes, which can be OR'ed to select what gets
 * logged via the module parameter smbd_logging_class,
 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
 * log_rdma_event() messages
 */
#define LOG_OUTGOING			0x1
#define LOG_INCOMING			0x2
#define LOG_READ			0x4
#define LOG_WRITE			0x8
#define LOG_RDMA_SEND			0x10
#define LOG_RDMA_RECV			0x20
#define LOG_KEEP_ALIVE			0x40
#define LOG_RDMA_EVENT			0x80
#define LOG_RDMA_MR			0x100
static unsigned int smbd_logging_class;
module_param(smbd_logging_class, uint, 0644);
MODULE_PARM_DESC(smbd_logging_class,
	"Logging class for SMBD transport 0x0 to 0x100");

#define ERR		0x0
#define INFO		0x1
static unsigned int smbd_logging_level = ERR;
module_param(smbd_logging_level, uint, 0644);
MODULE_PARM_DESC(smbd_logging_level,
	"Logging level for SMBD transport, 0 (default): error, 1: info");
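/*
 * Worked example of combining the logging classes defined above
 * (illustrative only):
 *
 *	LOG_RDMA_RECV | LOG_RDMA_EVENT = 0x20 | 0x80 = 0xa0
 *
 * so booting with cifs.smbd_logging_class=0xa0, or writing 0xa0 to
 * /sys/module/cifs/parameters/smbd_logging_class at runtime (the usual
 * sysfs path for a 0644 module parameter, assuming this file is built
 * into cifs.ko as the cifs.-prefixed example above suggests), enables
 * log_rdma_recv() and log_rdma_event() messages at any logging level.
 */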
static bool smbd_logging_needed(struct smbdirect_socket *sc,
				void *private_ptr,
				unsigned int lvl,
				unsigned int cls)
{
#define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_LOG_ ##x)
	BUILD_BUG_SAME(ERR);
	BUILD_BUG_SAME(INFO);
#undef BUILD_BUG_SAME
#define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_ ##x)
	BUILD_BUG_SAME(LOG_OUTGOING);
	BUILD_BUG_SAME(LOG_INCOMING);
	BUILD_BUG_SAME(LOG_READ);
	BUILD_BUG_SAME(LOG_WRITE);
	BUILD_BUG_SAME(LOG_RDMA_SEND);
	BUILD_BUG_SAME(LOG_RDMA_RECV);
	BUILD_BUG_SAME(LOG_KEEP_ALIVE);
	BUILD_BUG_SAME(LOG_RDMA_EVENT);
	BUILD_BUG_SAME(LOG_RDMA_MR);
#undef BUILD_BUG_SAME

	if (lvl <= smbd_logging_level || cls & smbd_logging_class)
		return true;
	return false;
}

static void smbd_logging_vaprintf(struct smbdirect_socket *sc,
				  const char *func,
				  unsigned int line,
				  void *private_ptr,
				  unsigned int lvl,
				  unsigned int cls,
				  struct va_format *vaf)
{
	cifs_dbg(VFS, "%s:%u %pV", func, line, vaf);
}

#define log_rdma(level, class, fmt, args...)				\
do {									\
	if (level <= smbd_logging_level || class & smbd_logging_class)	\
		cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
} while (0)

#define log_outgoing(level, fmt, args...) \
	log_rdma(level, LOG_OUTGOING, fmt, ##args)
#define log_incoming(level, fmt, args...) \
	log_rdma(level, LOG_INCOMING, fmt, ##args)
#define log_read(level, fmt, args...)	log_rdma(level, LOG_READ, fmt, ##args)
#define log_write(level, fmt, args...)	log_rdma(level, LOG_WRITE, fmt, ##args)
#define log_rdma_send(level, fmt, args...) \
	log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
#define log_rdma_recv(level, fmt, args...) \
	log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
#define log_keep_alive(level, fmt, args...) \
	log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
#define log_rdma_event(level, fmt, args...) \
	log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
#define log_rdma_mr(level, fmt, args...) \
	log_rdma(level, LOG_RDMA_MR, fmt, ##args)

static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
				    struct smbdirect_send_batch *batch,
				    struct iov_iter *iter,
				    u32 remaining_data_length)
{
	int bytes = 0;

	/*
	 * smbdirect_connection_send_single_iter() respects the
	 * negotiated max_send_size, so we need to
	 * loop until the full iter is posted
	 */

	while (iov_iter_count(iter) > 0) {
		int rc;

		rc = smbdirect_connection_send_single_iter(sc,
							   batch,
							   iter,
							   0, /* flags */
							   remaining_data_length);
		if (rc < 0)
			return rc;
		remaining_data_length -= rc;
		bytes += rc;
	}

	return bytes;
}
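/*
 * Rough sizing example for the loop above (assuming the default
 * max_send_size of 1364 and the 24-byte data transfer header mentioned
 * in the smbd_max_fragmented_recv_size comment, i.e. ~1340 payload
 * bytes per send): posting a 64KiB iter takes ceil(65536 / 1340) = 49
 * calls to smbdirect_connection_send_single_iter(), each one reducing
 * remaining_data_length by its return value.
 */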
/*
 * Destroy the transport and related RDMA and memory resources
 * Need to go through all the pending counters and make sure no one is using
 * the transport while it is destroyed
 */
void smbd_destroy(struct TCP_Server_Info *server)
{
	struct smbd_connection *info = server->smbd_conn;

	if (!info) {
		log_rdma_event(INFO, "rdma session already destroyed\n");
		return;
	}

	smbdirect_socket_release(info->socket);

	kfree(info);
	server->smbd_conn = NULL;
}

/*
 * Reconnect this SMBD connection, called from upper layer
 * return value: 0 on success, or actual error code
 */
int smbd_reconnect(struct TCP_Server_Info *server)
{
	log_rdma_event(INFO, "reconnecting rdma session\n");

	if (!server->smbd_conn) {
		log_rdma_event(INFO, "rdma session already destroyed\n");
		goto create_conn;
	}

	/*
	 * This is possible if the transport is disconnected and we haven't
	 * received a notification from RDMA, but the upper layer has detected
	 * a timeout
	 */
	log_rdma_event(INFO, "disconnecting transport\n");
	smbd_destroy(server);

create_conn:
	log_rdma_event(INFO, "creating rdma session\n");
	server->smbd_conn = smbd_get_connection(
		server, (struct sockaddr *) &server->dstaddr);

	if (server->smbd_conn) {
		cifs_dbg(VFS, "RDMA transport re-established\n");
		trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
		return 0;
	}
	trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
	return -ENOENT;
}
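/*
 * For context (paraphrased, not a verbatim quote of the caller): the
 * upper layer is expected to drive this from its reconnect path,
 * roughly as
 *
 *	if (cifs_rdma_enabled(server))
 *		rc = smbd_reconnect(server);
 *
 * so both a transport-level disconnect and an upper-layer timeout end
 * up going through smbd_destroy() + smbd_get_connection() here.
 */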
/* Create a SMBD connection, called by upper layer */
static struct smbd_connection *_smbd_get_connection(
	struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
{
	struct net *net = cifs_net_ns(server);
	struct smbd_connection *info;
	struct smbdirect_socket *sc;
	struct smbdirect_socket_parameters init_params = {};
	struct smbdirect_socket_parameters *sp;
	__be16 *sport;
	u64 port_flags = 0;
	int ret;

	switch (port) {
	case SMBD_PORT:
		/*
		 * only allow iWarp devices
		 * for port 5445.
		 */
		port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW;
		break;
	case SMB_PORT:
		/*
		 * only allow InfiniBand, RoCEv1 or RoCEv2
		 * devices for port 445.
		 *
		 * (Basically don't allow iWarp devices)
		 */
		port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB;
		break;
	}

	/*
	 * Create the initial parameters
	 */
	sp = &init_params;
	sp->flags = port_flags;
	sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000;
	sp->initiator_depth = 1;
	sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;
	sp->recv_credit_max = smbd_receive_credit_max;
	sp->send_credit_target = smbd_send_credit_target;
	sp->max_send_size = smbd_max_send_size;
	sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
	sp->max_recv_size = smbd_max_receive_size;
	sp->max_frmr_depth = smbd_max_frmr_depth;
	sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
	sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return NULL;
	ret = smbdirect_socket_create_kern(net, &sc);
	if (ret)
		goto socket_init_failed;
	smbdirect_socket_set_logging(sc, NULL, smbd_logging_needed, smbd_logging_vaprintf);
	ret = smbdirect_socket_set_initial_parameters(sc, sp);
	if (ret)
		goto set_params_failed;
	ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_SOFTIRQ, GFP_KERNEL);
	if (ret)
		goto set_settings_failed;

	if (dstaddr->sa_family == AF_INET6)
		sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
	else
		sport = &((struct sockaddr_in *)dstaddr)->sin_port;

	*sport = htons(port);

	ret = smbdirect_connect_sync(sc, dstaddr);
	if (ret) {
		log_rdma_event(ERR, "connect to %pISpsfc failed: %1pe\n",
			       dstaddr, ERR_PTR(ret));
		goto connect_failed;
	}

	info->socket = sc;
	return info;

connect_failed:
set_settings_failed:
set_params_failed:
	smbdirect_socket_release(sc);
socket_init_failed:
	kfree(info);
	return NULL;
}

const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
{
	if (unlikely(!conn->socket)) {
		static const struct smbdirect_socket_parameters zero_params;

		return &zero_params;
	}

	return smbdirect_socket_get_current_parameters(conn->socket);
}

struct smbd_connection *smbd_get_connection(
	struct TCP_Server_Info *server, struct sockaddr *dstaddr)
{
	struct smbd_connection *ret;
	const struct smbdirect_socket_parameters *sp;
	int port = SMBD_PORT;

try_again:
	ret = _smbd_get_connection(server, dstaddr, port);

	/* Try SMB_PORT if SMBD_PORT doesn't work */
	if (!ret && port == SMBD_PORT) {
		port = SMB_PORT;
		goto try_again;
	}
	if (!ret)
		return NULL;

	sp = smbd_get_parameters(ret);

	server->rdma_readwrite_threshold =
		rdma_readwrite_threshold > sp->max_fragmented_send_size ?
		sp->max_fragmented_send_size :
		rdma_readwrite_threshold;

	return ret;
}
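/*
 * Illustration of how server->rdma_readwrite_threshold is meant to be
 * used (a sketch of the expected upper-layer behavior, not a verbatim
 * caller): with the default threshold of 4096 and a negotiated
 * max_fragmented_send_size well above it, a 512-byte payload goes out
 * inline via smbd_send(), while an 8KiB payload is expected to be
 * registered with smbd_register_mr() and transferred by the peer via
 * RDMA read/write instead.
 */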
/*
 * Receive data from the transport's receive reassembly queue
 * All the incoming data packets are placed in the reassembly queue
 * msg: the msghdr to read data into; its iterator bounds the read length
 * return value: actual bytes read
 *
 * Note: this implementation copies the data from the reassembly queue to the
 * receive buffers used by the upper layer. This is not the optimal code path.
 * A better way to do it is to not have the upper layer allocate its receive
 * buffers but rather borrow the buffer from the reassembly queue, and return
 * it after the data is consumed. But this would require more changes to the
 * upper layer code, and would also need to consider packet boundaries while
 * they are still being reassembled.
 */
int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
{
	struct smbdirect_socket *sc = info->socket;

	if (!smbdirect_connection_is_connected(sc))
		return -ENOTCONN;

	return smbdirect_connection_recvmsg(sc, msg, 0);
}

/*
 * Send data to the transport
 * Each rqst is transported as an SMBDirect payload
 * rqst: the data to write
 * return value: 0 on success, otherwise error code
 */
int smbd_send(struct TCP_Server_Info *server,
	int num_rqst, struct smb_rqst *rqst_array)
{
	struct smbd_connection *info = server->smbd_conn;
	struct smbdirect_socket *sc = info->socket;
	const struct smbdirect_socket_parameters *sp = smbd_get_parameters(info);
	struct smb_rqst *rqst;
	struct iov_iter iter;
	struct smbdirect_send_batch_storage bstorage;
	struct smbdirect_send_batch *batch;
	unsigned int remaining_data_length, klen;
	int rc, i, rqst_idx;
	int error = 0;

	if (!smbdirect_connection_is_connected(sc))
		return -EAGAIN;

	/* Add up the total payload length across all the requests */
	remaining_data_length = 0;
	for (i = 0; i < num_rqst; i++)
		remaining_data_length += smb_rqst_len(server, &rqst_array[i]);

	if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) {
		/* assertion: payload never exceeds negotiated maximum */
		log_write(ERR, "payload size %u > max size %u\n",
			remaining_data_length, sp->max_fragmented_send_size);
		return -EINVAL;
	}

	log_write(INFO, "num_rqst=%d total length=%u\n",
		num_rqst, remaining_data_length);

	rqst_idx = 0;
	batch = smbdirect_init_send_batch_storage(&bstorage, false, 0);
	do {
		rqst = &rqst_array[rqst_idx];

		cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
			 rqst_idx, smb_rqst_len(server, rqst));
		for (i = 0; i < rqst->rq_nvec; i++)
			dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len);

		log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n",
			  rqst_idx, rqst->rq_nvec, remaining_data_length,
			  iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst));

		/* Send the metadata pages. */
		klen = 0;
		for (i = 0; i < rqst->rq_nvec; i++)
			klen += rqst->rq_iov[i].iov_len;
		iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);

		rc = smbd_post_send_full_iter(sc, batch, &iter, remaining_data_length);
		if (rc < 0) {
			error = rc;
			break;
		}
		remaining_data_length -= rc;

		if (iov_iter_count(&rqst->rq_iter) > 0) {
			/* And then the data pages if there are any */
			rc = smbd_post_send_full_iter(sc, batch, &rqst->rq_iter,
						      remaining_data_length);
			if (rc < 0) {
				error = rc;
				break;
			}
			remaining_data_length -= rc;
		}

	} while (++rqst_idx < num_rqst);

	rc = smbdirect_connection_send_batch_flush(sc, batch, true);
	if (unlikely(!rc && error))
		rc = error;

	/*
	 * As an optimization, we don't wait for individual I/Os to finish
	 * before sending the next one.
	 * Send them all and wait for the pending send count to get to 0,
	 * which means all the I/Os have completed and we are good to return
	 */

	error = rc;
	rc = smbdirect_connection_send_wait_zero_pending(sc);
	if (unlikely(rc && !error))
		error = -EAGAIN;

	if (unlikely(error))
		return error;

	return 0;
}

/*
 * Register memory for RDMA read/write
 * iter: the buffer to register memory with
 * writing: true if this is an RDMA write (SMB read), false for RDMA read
 * need_invalidate: true if this MR needs to be locally invalidated after I/O
 * return value: the MR registered, NULL if failed.
 */
struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
					 struct iov_iter *iter,
					 bool writing, bool need_invalidate)
{
	struct smbdirect_socket *sc = info->socket;

	if (!smbdirect_connection_is_connected(sc))
		return NULL;

	return smbdirect_connection_register_mr_io(sc, iter, writing, need_invalidate);
}

void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
				    struct smbdirect_buffer_descriptor_v1 *v1)
{
	smbdirect_mr_io_fill_buffer_descriptor(mr, v1);
}

/*
 * Deregister an MR after I/O is done
 * This function may wait if remote invalidation is not used
 * and we have to locally invalidate the buffer to prevent the data from
 * being modified by the remote peer after the upper layer has consumed it
 */
void smbd_deregister_mr(struct smbdirect_mr_io *mr)
{
	smbdirect_connection_deregister_mr_io(mr);
}
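/*
 * Typical MR lifecycle for a direct transfer, as the upper layer is
 * expected to drive it (an illustrative sketch, not a verbatim caller;
 * "v1" stands for a smbdirect_buffer_descriptor_v1 owned by the caller):
 *
 *	mr = smbd_register_mr(info, &iter, writing, need_invalidate);
 *	if (!mr)
 *		// fall back to send/recv, or fail the I/O
 *	smbd_mr_fill_buffer_descriptor(mr, &v1);
 *	// send the request carrying v1; the remote peer performs the
 *	// RDMA read/write against the registered buffer
 *	smbd_deregister_mr(mr);	// may wait for local invalidation
 */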
void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m)
{
	if (!server->rdma)
		return;

	if (!server->smbd_conn) {
		seq_puts(m, "\nSMBDirect transport not available");
		return;
	}

	smbdirect_connection_legacy_debug_proc_show(server->smbd_conn->socket,
						    server->rdma_readwrite_threshold,
						    m);
}