1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2017, Microsoft Corporation.
4 *
5 * Author(s): Long Li <longli@microsoft.com>
6 */
7
8 #include "smbdirect.h"
9 #include "cifs_debug.h"
10 #include "cifsproto.h"
11 #include "smb2proto.h"
12
13 /* Port numbers for SMBD transport */
14 #define SMB_PORT 445
15 #define SMBD_PORT 5445
16
17 /* Address lookup and resolve timeout in ms */
18 #define RDMA_RESOLVE_TIMEOUT 5000
19
20 /* SMBD negotiation timeout in seconds */
21 #define SMBD_NEGOTIATE_TIMEOUT 120
22
23 /* The timeout to wait for a keepalive message from peer in seconds */
24 #define KEEPALIVE_RECV_TIMEOUT 5
25
26 /*
27 * Default maximum number of RDMA read/write outstanding on this connection
28 * This value is possibly decreased during QP creation on hardware limit
29 */
30 #define SMBD_CM_RESPONDER_RESOURCES 32
31
32 /*
33 * User configurable initial values per SMBD transport connection
34 * as defined in [MS-SMBD] 3.1.1.1
35 * Those may change after a SMBD negotiation
36 */
37 /* The local peer's maximum number of credits to grant to the peer */
38 int smbd_receive_credit_max = 255;
39
40 /* The remote peer's credit request of local peer */
41 int smbd_send_credit_target = 255;
42
43 /* The maximum single message size can be sent to remote peer */
44 int smbd_max_send_size = 1364;
45
46 /*
47 * The maximum fragmented upper-layer payload receive size supported
48 *
49 * Assume max_payload_per_credit is
50 * smbd_max_receive_size - 24 = 1340
51 *
52 * The maximum number would be
53 * smbd_receive_credit_max * max_payload_per_credit
54 *
55 * 1340 * 255 = 341700 (0x536C4)
56 *
57 * The minimum value from the spec is 131072 (0x20000)
58 *
59 * For now we use the logic we used in ksmbd before:
60 * (1364 * 255) / 2 = 173910 (0x2A756)
61 */
62 int smbd_max_fragmented_recv_size = (1364 * 255) / 2;
63
64 /* The maximum single-message size which can be received */
65 int smbd_max_receive_size = 1364;
66
67 /* The timeout to initiate send of a keepalive message on idle */
68 int smbd_keep_alive_interval = 120;
69
70 /*
71 * User configurable initial values for RDMA transport
72 * The actual values used may be lower and are limited to hardware capabilities
73 */
74 /* Default maximum number of pages in a single RDMA write/read */
75 int smbd_max_frmr_depth = 2048;
76
77 /* If payload is less than this byte, use RDMA send/recv not read/write */
78 int rdma_readwrite_threshold = 4096;
79
/* Transport logging functions
 * Logging is defined as classes. They can be OR'ed to define the actual
 * logging level via the module parameter smbd_logging_class,
 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
 * log_rdma_event() messages.
 */
86 #define LOG_OUTGOING 0x1
87 #define LOG_INCOMING 0x2
88 #define LOG_READ 0x4
89 #define LOG_WRITE 0x8
90 #define LOG_RDMA_SEND 0x10
91 #define LOG_RDMA_RECV 0x20
92 #define LOG_KEEP_ALIVE 0x40
93 #define LOG_RDMA_EVENT 0x80
94 #define LOG_RDMA_MR 0x100
95 static unsigned int smbd_logging_class;
96 module_param(smbd_logging_class, uint, 0644);
97 MODULE_PARM_DESC(smbd_logging_class,
98 "Logging class for SMBD transport 0x0 to 0x100");
99
100 #define ERR 0x0
101 #define INFO 0x1
102 static unsigned int smbd_logging_level = ERR;
103 module_param(smbd_logging_level, uint, 0644);
104 MODULE_PARM_DESC(smbd_logging_level,
105 "Logging level for SMBD transport, 0 (default): error, 1: info");
106
smbd_logging_needed(struct smbdirect_socket * sc,void * private_ptr,unsigned int lvl,unsigned int cls)107 static bool smbd_logging_needed(struct smbdirect_socket *sc,
108 void *private_ptr,
109 unsigned int lvl,
110 unsigned int cls)
111 {
112 #define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_LOG_ ##x)
113 BUILD_BUG_SAME(ERR);
114 BUILD_BUG_SAME(INFO);
115 #undef BUILD_BUG_SAME
116 #define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_ ##x)
117 BUILD_BUG_SAME(LOG_OUTGOING);
118 BUILD_BUG_SAME(LOG_INCOMING);
119 BUILD_BUG_SAME(LOG_READ);
120 BUILD_BUG_SAME(LOG_WRITE);
121 BUILD_BUG_SAME(LOG_RDMA_SEND);
122 BUILD_BUG_SAME(LOG_RDMA_RECV);
123 BUILD_BUG_SAME(LOG_KEEP_ALIVE);
124 BUILD_BUG_SAME(LOG_RDMA_EVENT);
125 BUILD_BUG_SAME(LOG_RDMA_MR);
126 #undef BUILD_BUG_SAME
127
128 if (lvl <= smbd_logging_level || cls & smbd_logging_class)
129 return true;
130 return false;
131 }
132
/*
 * Output callback for the common smbdirect layer: forward a
 * preformatted message (%pV) to cifs_dbg(), tagged with the original
 * call site's function name and line number.
 *
 * The socket, private pointer, level and class arguments are unused
 * here; filtering already happened in smbd_logging_needed().
 */
static void smbd_logging_vaprintf(struct smbdirect_socket *sc,
				  const char *func,
				  unsigned int line,
				  void *private_ptr,
				  unsigned int lvl,
				  unsigned int cls,
				  struct va_format *vaf)
{
	cifs_dbg(VFS, "%s:%u %pV", func, line, vaf);
}
143
144 #define log_rdma(level, class, fmt, args...) \
145 do { \
146 if (level <= smbd_logging_level || class & smbd_logging_class) \
147 cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
148 } while (0)
149
150 #define log_outgoing(level, fmt, args...) \
151 log_rdma(level, LOG_OUTGOING, fmt, ##args)
152 #define log_incoming(level, fmt, args...) \
153 log_rdma(level, LOG_INCOMING, fmt, ##args)
154 #define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args)
155 #define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args)
156 #define log_rdma_send(level, fmt, args...) \
157 log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
158 #define log_rdma_recv(level, fmt, args...) \
159 log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
160 #define log_keep_alive(level, fmt, args...) \
161 log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
162 #define log_rdma_event(level, fmt, args...) \
163 log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
164 #define log_rdma_mr(level, fmt, args...) \
165 log_rdma(level, LOG_RDMA_MR, fmt, ##args)
166
/*
 * Post the full contents of @iter as SMBDirect payload fragments.
 *
 * smbdirect_connection_send_single_iter() honours the negotiated
 * max_send_size, so each call may consume only part of the iterator;
 * keep posting until the iterator is drained.
 *
 * Returns the total number of bytes posted, or a negative errno.
 */
static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
				    struct smbdirect_send_batch *batch,
				    struct iov_iter *iter,
				    u32 remaining_data_length)
{
	int total = 0;

	while (iov_iter_count(iter) > 0) {
		int sent;

		sent = smbdirect_connection_send_single_iter(sc,
							     batch,
							     iter,
							     0, /* flags */
							     remaining_data_length);
		if (sent < 0)
			return sent;

		/* account for what was consumed in this fragment */
		remaining_data_length -= sent;
		total += sent;
	}

	return total;
}
196
/*
 * Destroy the transport and related RDMA and memory resources
 * Need to go through all the pending counters and make sure no one is using
 * the transport while it is destroyed
 */
smbd_destroy(struct TCP_Server_Info * server)202 void smbd_destroy(struct TCP_Server_Info *server)
203 {
204 struct smbd_connection *info = server->smbd_conn;
205
206 if (!info) {
207 log_rdma_event(INFO, "rdma session already destroyed\n");
208 return;
209 }
210
211 smbdirect_socket_release(info->socket);
212
213 kfree(info);
214 server->smbd_conn = NULL;
215 }
216
217 /*
218 * Reconnect this SMBD connection, called from upper layer
219 * return value: 0 on success, or actual error code
220 */
smbd_reconnect(struct TCP_Server_Info * server)221 int smbd_reconnect(struct TCP_Server_Info *server)
222 {
223 log_rdma_event(INFO, "reconnecting rdma session\n");
224
225 if (!server->smbd_conn) {
226 log_rdma_event(INFO, "rdma session already destroyed\n");
227 goto create_conn;
228 }
229
230 /*
231 * This is possible if transport is disconnected and we haven't received
232 * notification from RDMA, but upper layer has detected timeout
233 */
234 log_rdma_event(INFO, "disconnecting transport\n");
235 smbd_destroy(server);
236
237 create_conn:
238 log_rdma_event(INFO, "creating rdma session\n");
239 server->smbd_conn = smbd_get_connection(
240 server, (struct sockaddr *) &server->dstaddr);
241
242 if (server->smbd_conn) {
243 cifs_dbg(VFS, "RDMA transport re-established\n");
244 trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
245 return 0;
246 }
247 trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
248 return -ENOENT;
249 }
250
251 /* Create a SMBD connection, called by upper layer */
static struct smbd_connection *_smbd_get_connection(
	struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
{
	struct net *net = cifs_net_ns(server);
	struct smbd_connection *info;
	struct smbdirect_socket *sc;
	struct smbdirect_socket_parameters init_params = {};
	struct smbdirect_socket_parameters *sp;
	__be16 *sport;
	u64 port_flags = 0;
	int ret;

	/* Restrict the RDMA device class based on the requested port */
	switch (port) {
	case SMBD_PORT:
		/*
		 * only allow iWarp devices
		 * for port 5445.
		 */
		port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW;
		break;
	case SMB_PORT:
		/*
		 * only allow InfiniBand, RoCEv1 or RoCEv2
		 * devices for port 445.
		 *
		 * (Basically don't allow iWarp devices)
		 */
		port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB;
		break;
	}

	/*
	 * Create the initial parameters from the module-parameter
	 * defaults; the common layer may lower them during negotiation.
	 */
	sp = &init_params;
	sp->flags = port_flags;
	sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000;
	sp->initiator_depth = 1;
	sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;
	sp->recv_credit_max = smbd_receive_credit_max;
	sp->send_credit_target = smbd_send_credit_target;
	sp->max_send_size = smbd_max_send_size;
	sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
	sp->max_recv_size = smbd_max_receive_size;
	sp->max_frmr_depth = smbd_max_frmr_depth;
	sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
	sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;

	info = kzalloc_obj(*info);
	if (!info)
		return NULL;
	/* Create the common-layer socket in the server's network namespace */
	ret = smbdirect_socket_create_kern(net, &sc);
	if (ret)
		goto socket_init_failed;
	smbdirect_socket_set_logging(sc, NULL, smbd_logging_needed, smbd_logging_vaprintf);
	ret = smbdirect_socket_set_initial_parameters(sc, sp);
	if (ret)
		goto set_params_failed;
	ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_SOFTIRQ, GFP_KERNEL);
	if (ret)
		goto set_settings_failed;

	/*
	 * Patch the requested port into the caller's destination address
	 * before connecting (IPv4 and IPv6 keep it at different offsets).
	 */
	if (dstaddr->sa_family == AF_INET6)
		sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
	else
		sport = &((struct sockaddr_in *)dstaddr)->sin_port;

	*sport = htons(port);

	ret = smbdirect_connect_sync(sc, dstaddr);
	if (ret) {
		log_rdma_event(ERR, "connect to %pISpsfc failed: %1pe\n",
			       dstaddr, ERR_PTR(ret));
		goto connect_failed;
	}

	info->socket = sc;
	return info;

	/*
	 * Error unwind: once the socket exists, releasing it also cleans
	 * up any parameters/settings applied to it; only the wrapper
	 * allocation needs separate freeing.
	 */
connect_failed:
set_settings_failed:
set_params_failed:
	smbdirect_socket_release(sc);
socket_init_failed:
	kfree(info);
	return NULL;
}
342
smbd_get_parameters(struct smbd_connection * conn)343 const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
344 {
345 if (unlikely(!conn->socket)) {
346 static const struct smbdirect_socket_parameters zero_params;
347
348 return &zero_params;
349 }
350
351 return smbdirect_socket_get_current_parameters(conn->socket);
352 }
353
smbd_get_connection(struct TCP_Server_Info * server,struct sockaddr * dstaddr)354 struct smbd_connection *smbd_get_connection(
355 struct TCP_Server_Info *server, struct sockaddr *dstaddr)
356 {
357 struct smbd_connection *ret;
358 const struct smbdirect_socket_parameters *sp;
359 int port = SMBD_PORT;
360
361 try_again:
362 ret = _smbd_get_connection(server, dstaddr, port);
363
364 /* Try SMB_PORT if SMBD_PORT doesn't work */
365 if (!ret && port == SMBD_PORT) {
366 port = SMB_PORT;
367 goto try_again;
368 }
369 if (!ret)
370 return NULL;
371
372 sp = smbd_get_parameters(ret);
373
374 server->rdma_readwrite_threshold =
375 rdma_readwrite_threshold > sp->max_fragmented_send_size ?
376 sp->max_fragmented_send_size :
377 rdma_readwrite_threshold;
378
379 return ret;
380 }
381
382 /*
383 * Receive data from the transport's receive reassembly queue
384 * All the incoming data packets are placed in reassembly queue
385 * iter: the buffer to read data into
386 * size: the length of data to read
387 * return value: actual data read
388 *
389 * Note: this implementation copies the data from reassembly queue to receive
390 * buffers used by upper layer. This is not the optimal code path. A better way
391 * to do it is to not have upper layer allocate its receive buffers but rather
392 * borrow the buffer from reassembly queue, and return it after data is
393 * consumed. But this will require more changes to upper layer code, and also
394 * need to consider packet boundaries while they still being reassembled.
395 */
smbd_recv(struct smbd_connection * info,struct msghdr * msg)396 int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
397 {
398 struct smbdirect_socket *sc = info->socket;
399
400 if (!smbdirect_connection_is_connected(sc))
401 return -ENOTCONN;
402
403 return smbdirect_connection_recvmsg(sc, msg, 0);
404 }
405
406 /*
407 * Send data to transport
408 * Each rqst is transported as a SMBDirect payload
409 * rqst: the data to write
410 * return value: 0 if successfully write, otherwise error code
411 */
int smbd_send(struct TCP_Server_Info *server,
	int num_rqst, struct smb_rqst *rqst_array)
{
	struct smbd_connection *info = server->smbd_conn;
	struct smbdirect_socket *sc = info->socket;
	const struct smbdirect_socket_parameters *sp = smbd_get_parameters(info);
	struct smb_rqst *rqst;
	struct iov_iter iter;
	struct smbdirect_send_batch_storage bstorage;
	struct smbdirect_send_batch *batch;
	unsigned int remaining_data_length, klen;
	int rc, i, rqst_idx;
	int error = 0;	/* first posting error, reported after the flush */

	if (!smbdirect_connection_is_connected(sc))
		return -EAGAIN;

	/*
	 * Add in the page array if there is one. The caller needs to set
	 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
	 * ends at page boundary
	 */
	/* Total payload across all requests, for the SMBDirect headers */
	remaining_data_length = 0;
	for (i = 0; i < num_rqst; i++)
		remaining_data_length += smb_rqst_len(server, &rqst_array[i]);

	if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) {
		/* assertion: payload never exceeds negotiated maximum */
		log_write(ERR, "payload size %d > max size %d\n",
			remaining_data_length, sp->max_fragmented_send_size);
		return -EINVAL;
	}

	log_write(INFO, "num_rqst=%d total length=%u\n",
		num_rqst, remaining_data_length);

	rqst_idx = 0;
	batch = smbdirect_init_send_batch_storage(&bstorage, false, 0);
	do {
		rqst = &rqst_array[rqst_idx];

		cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
			 rqst_idx, smb_rqst_len(server, rqst));
		for (i = 0; i < rqst->rq_nvec; i++)
			dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len);

		log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n",
			  rqst_idx, rqst->rq_nvec, remaining_data_length,
			  iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst));

		/* Send the metadata pages. */
		klen = 0;
		for (i = 0; i < rqst->rq_nvec; i++)
			klen += rqst->rq_iov[i].iov_len;
		iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);

		rc = smbd_post_send_full_iter(sc, batch, &iter, remaining_data_length);
		if (rc < 0) {
			/* remember the error but still flush what was queued */
			error = rc;
			break;
		}
		remaining_data_length -= rc;

		if (iov_iter_count(&rqst->rq_iter) > 0) {
			/* And then the data pages if there are any */
			rc = smbd_post_send_full_iter(sc, batch, &rqst->rq_iter,
						      remaining_data_length);
			if (rc < 0) {
				error = rc;
				break;
			}
			remaining_data_length -= rc;
		}

	} while (++rqst_idx < num_rqst);

	/* Flush the batch; a posting error takes precedence over a clean flush */
	rc = smbdirect_connection_send_batch_flush(sc, batch, true);
	if (unlikely(!rc && error))
		rc = error;

	/*
	 * As an optimization, we don't wait for individual I/O to finish
	 * before sending the next one.
	 * Send them all and wait for pending send count to get to 0
	 * that means all the I/Os have been out and we are good to return
	 */

	error = rc;
	rc = smbdirect_connection_send_wait_zero_pending(sc);
	if (unlikely(rc && !error))
		error = -EAGAIN;

	if (unlikely(error))
		return error;

	return 0;
}
509
510 /*
511 * Register memory for RDMA read/write
512 * iter: the buffer to register memory with
513 * writing: true if this is a RDMA write (SMB read), false for RDMA read
514 * need_invalidate: true if this MR needs to be locally invalidated after I/O
515 * return value: the MR registered, NULL if failed.
516 */
struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
					 struct iov_iter *iter,
					 bool writing, bool need_invalidate)
{
	struct smbdirect_socket *sc = info->socket;

	/* Registration only makes sense on a live transport */
	if (smbdirect_connection_is_connected(sc))
		return smbdirect_connection_register_mr_io(sc, iter, writing,
							   need_invalidate);

	return NULL;
}
528
/*
 * Fill a v1 buffer descriptor from a registered MR; thin delegate
 * to the common smbdirect layer.
 */
void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
				    struct smbdirect_buffer_descriptor_v1 *v1)
{
	smbdirect_mr_io_fill_buffer_descriptor(mr, v1);
}
534
535 /*
536 * Deregister a MR after I/O is done
537 * This function may wait if remote invalidation is not used
538 * and we have to locally invalidate the buffer to prevent data is being
539 * modified by remote peer after upper layer consumes it
540 */
void smbd_deregister_mr(struct smbdirect_mr_io *mr)
{
	/* Thin delegate to the common smbdirect layer */
	smbdirect_connection_deregister_mr_io(mr);
}
545
smbd_debug_proc_show(struct TCP_Server_Info * server,struct seq_file * m)546 void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m)
547 {
548 if (!server->rdma)
549 return;
550
551 if (!server->smbd_conn) {
552 seq_puts(m, "\nSMBDirect transport not available");
553 return;
554 }
555
556 smbdirect_connection_legacy_debug_proc_show(server->smbd_conn->socket,
557 server->rdma_readwrite_threshold,
558 m);
559 }
560
561 MODULE_IMPORT_NS("SMBDIRECT");
562