xref: /linux/fs/smb/client/smbdirect.c (revision 73dc52d2942ccf4d4f680176c1e7f36aadba4ce8)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *   Copyright (C) 2017, Microsoft Corporation.
4  *
5  *   Author(s): Long Li <longli@microsoft.com>
6  */
7 
8 #include "smbdirect.h"
9 #include "cifs_debug.h"
10 #include "cifsproto.h"
11 #include "smb2proto.h"
12 #include "../common/smbdirect/smbdirect_public.h"
13 
/* Port numbers for SMBD transport */
#define SMB_PORT	445
#define SMBD_PORT	5445

/* Address lookup and resolve timeout in ms */
#define RDMA_RESOLVE_TIMEOUT	5000

/* SMBD negotiation timeout in seconds */
#define SMBD_NEGOTIATE_TIMEOUT	120

/* The timeout to wait for a keepalive message from peer in seconds */
#define KEEPALIVE_RECV_TIMEOUT 5

/*
 * Default maximum number of RDMA read/write outstanding on this connection
 * This value is possibly decreased during QP creation on hardware limit
 */
#define SMBD_CM_RESPONDER_RESOURCES	32

/*
 * User configurable initial values per SMBD transport connection
 * as defined in [MS-SMBD] 3.1.1.1
 * Those may change after a SMBD negotiation
 */
/* The local peer's maximum number of credits to grant to the peer */
int smbd_receive_credit_max = 255;

/* The remote peer's credit request of local peer */
int smbd_send_credit_target = 255;

/* The maximum single message size can be sent to remote peer */
int smbd_max_send_size = 1364;

/*
 * The maximum fragmented upper-layer payload receive size supported
 *
 * Assume max_payload_per_credit is
 * smbd_max_receive_size - 24 = 1340
 *
 * The maximum number would be
 * smbd_receive_credit_max * max_payload_per_credit
 *
 *                       1340 * 255 = 341700 (0x536C4)
 *
 * The minimum value from the spec is 131072 (0x20000)
 *
 * For now we use the logic we used in ksmbd before:
 *                 (1364 * 255) / 2 = 173910 (0x2A756)
 */
int smbd_max_fragmented_recv_size = (1364 * 255) / 2;

/*  The maximum single-message size which can be received */
int smbd_max_receive_size = 1364;

/* The timeout to initiate send of a keepalive message on idle, in seconds */
int smbd_keep_alive_interval = 120;

/*
 * User configurable initial values for RDMA transport
 * The actual values used may be lower and are limited to hardware capabilities
 */
/* Default maximum number of pages in a single RDMA write/read */
int smbd_max_frmr_depth = 2048;

/* If payload is less than this byte, use RDMA send/recv not read/write */
int rdma_readwrite_threshold = 4096;

/* Transport logging functions
 * Logging are defined as classes. They can be OR'ed to define the actual
 * logging level via module parameter smbd_logging_class
 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
 * log_rdma_event()
 *
 * These values must stay in sync with the SMBDIRECT_LOG_* constants in the
 * common smbdirect code; smbd_logging_needed() asserts this at build time.
 */
#define LOG_OUTGOING			0x1
#define LOG_INCOMING			0x2
#define LOG_READ			0x4
#define LOG_WRITE			0x8
#define LOG_RDMA_SEND			0x10
#define LOG_RDMA_RECV			0x20
#define LOG_KEEP_ALIVE			0x40
#define LOG_RDMA_EVENT			0x80
#define LOG_RDMA_MR			0x100
static unsigned int smbd_logging_class;
module_param(smbd_logging_class, uint, 0644);
MODULE_PARM_DESC(smbd_logging_class,
	"Logging class for SMBD transport 0x0 to 0x100");

/* Logging severities; mirror SMBDIRECT_LOG_ERR / SMBDIRECT_LOG_INFO */
#define ERR		0x0
#define INFO		0x1
static unsigned int smbd_logging_level = ERR;
module_param(smbd_logging_level, uint, 0644);
MODULE_PARM_DESC(smbd_logging_level,
	"Logging level for SMBD transport, 0 (default): error, 1: info");
107 
/*
 * Logging-filter callback handed to the smbdirect core.
 *
 * Returns true when a message of severity @lvl or class bitmask @cls
 * should be emitted, based on the smbd_logging_level and
 * smbd_logging_class module parameters. @sc and @private_ptr are part
 * of the callback signature and are not used here.
 *
 * The BUILD_BUG_ON()s pin this file's local ERR/INFO and LOG_* values
 * to the corresponding SMBDIRECT_* constants so the two numbering
 * schemes cannot silently diverge.
 */
static bool smbd_logging_needed(struct smbdirect_socket *sc,
				void *private_ptr,
				unsigned int lvl,
				unsigned int cls)
{
#define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_LOG_ ##x)
	BUILD_BUG_SAME(ERR);
	BUILD_BUG_SAME(INFO);
#undef BUILD_BUG_SAME
#define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_ ##x)
	BUILD_BUG_SAME(LOG_OUTGOING);
	BUILD_BUG_SAME(LOG_INCOMING);
	BUILD_BUG_SAME(LOG_READ);
	BUILD_BUG_SAME(LOG_WRITE);
	BUILD_BUG_SAME(LOG_RDMA_SEND);
	BUILD_BUG_SAME(LOG_RDMA_RECV);
	BUILD_BUG_SAME(LOG_KEEP_ALIVE);
	BUILD_BUG_SAME(LOG_RDMA_EVENT);
	BUILD_BUG_SAME(LOG_RDMA_MR);
#undef BUILD_BUG_SAME

	/* Log if the severity is enabled or the class is explicitly selected */
	if (lvl <= smbd_logging_level || cls & smbd_logging_class)
		return true;
	return false;
}
133 
/*
 * Logging-output callback handed to the smbdirect core.
 *
 * Prints the pre-formatted message via cifs_dbg(VFS), prefixed with the
 * caller's function name and line number. @sc, @private_ptr, @lvl and
 * @cls are unused here; filtering is presumably done beforehand via the
 * smbd_logging_needed() callback.
 */
static void smbd_logging_vaprintf(struct smbdirect_socket *sc,
				  const char *func,
				  unsigned int line,
				  void *private_ptr,
				  unsigned int lvl,
				  unsigned int cls,
				  struct va_format *vaf)
{
	cifs_dbg(VFS, "%s:%u %pV", func, line, vaf);
}
144 
/*
 * log_rdma() filters on the smbd_logging_level / smbd_logging_class
 * module parameters and prints via cifs_dbg(VFS) with the calling
 * function name and line number. The wrappers below bind each logging
 * class to a convenience macro.
 */
#define log_rdma(level, class, fmt, args...)				\
do {									\
	if (level <= smbd_logging_level || class & smbd_logging_class)	\
		cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
} while (0)

#define log_outgoing(level, fmt, args...) \
		log_rdma(level, LOG_OUTGOING, fmt, ##args)
#define log_incoming(level, fmt, args...) \
		log_rdma(level, LOG_INCOMING, fmt, ##args)
#define log_read(level, fmt, args...)	log_rdma(level, LOG_READ, fmt, ##args)
#define log_write(level, fmt, args...)	log_rdma(level, LOG_WRITE, fmt, ##args)
#define log_rdma_send(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
#define log_rdma_recv(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
#define log_keep_alive(level, fmt, args...) \
		log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
#define log_rdma_event(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
#define log_rdma_mr(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_MR, fmt, ##args)
167 
168 static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
169 				    struct smbdirect_send_batch *batch,
170 				    struct iov_iter *iter,
171 				    u32 remaining_data_length)
172 {
173 	int bytes = 0;
174 
175 	/*
176 	 * smbdirect_connection_send_single_iter() respects the
177 	 * negotiated max_send_size, so we need to
178 	 * loop until the full iter is posted
179 	 */
180 
181 	while (iov_iter_count(iter) > 0) {
182 		int rc;
183 
184 		rc = smbdirect_connection_send_single_iter(sc,
185 							   batch,
186 							   iter,
187 							   0, /* flags */
188 							   remaining_data_length);
189 		if (rc < 0)
190 			return rc;
191 		remaining_data_length -= rc;
192 		bytes += rc;
193 	}
194 
195 	return bytes;
196 }
197 
198 /*
199  * Destroy the transport and related RDMA and memory resources
200  * Need to go through all the pending counters and make sure on one is using
201  * the transport while it is destroyed
202  */
203 void smbd_destroy(struct TCP_Server_Info *server)
204 {
205 	struct smbd_connection *info = server->smbd_conn;
206 
207 	if (!info) {
208 		log_rdma_event(INFO, "rdma session already destroyed\n");
209 		return;
210 	}
211 
212 	smbdirect_socket_release(info->socket);
213 
214 	kfree(info);
215 	server->smbd_conn = NULL;
216 }
217 
218 /*
219  * Reconnect this SMBD connection, called from upper layer
220  * return value: 0 on success, or actual error code
221  */
222 int smbd_reconnect(struct TCP_Server_Info *server)
223 {
224 	log_rdma_event(INFO, "reconnecting rdma session\n");
225 
226 	if (!server->smbd_conn) {
227 		log_rdma_event(INFO, "rdma session already destroyed\n");
228 		goto create_conn;
229 	}
230 
231 	/*
232 	 * This is possible if transport is disconnected and we haven't received
233 	 * notification from RDMA, but upper layer has detected timeout
234 	 */
235 	log_rdma_event(INFO, "disconnecting transport\n");
236 	smbd_destroy(server);
237 
238 create_conn:
239 	log_rdma_event(INFO, "creating rdma session\n");
240 	server->smbd_conn = smbd_get_connection(
241 		server, (struct sockaddr *) &server->dstaddr);
242 
243 	if (server->smbd_conn) {
244 		cifs_dbg(VFS, "RDMA transport re-established\n");
245 		trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
246 		return 0;
247 	}
248 	trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
249 	return -ENOENT;
250 }
251 
/*
 * Create a SMBD connection, called by upper layer.
 *
 * Builds the initial socket parameters from the module tunables,
 * creates a kernel smbdirect socket in the server's network namespace,
 * forces @port into @dstaddr and connects synchronously.
 *
 * Returns the new connection wrapper, or NULL on any failure (errors
 * are logged; the caller may retry with a different port).
 */
static struct smbd_connection *_smbd_get_connection(
	struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
{
	struct net *net = cifs_net_ns(server);
	struct smbd_connection *info;
	struct smbdirect_socket *sc;
	struct smbdirect_socket_parameters init_params = {};
	struct smbdirect_socket_parameters *sp;
	__be16 *sport;
	u64 port_flags = 0;
	int ret;

	/* Restrict the RDMA device class based on the well-known port */
	switch (port) {
	case SMBD_PORT:
		/*
		 * only allow iWarp devices
		 * for port 5445.
		 */
		port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW;
		break;
	case SMB_PORT:
		/*
		 * only allow InfiniBand, RoCEv1 or RoCEv2
		 * devices for port 445.
		 *
		 * (Basically don't allow iWarp devices)
		 */
		port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB;
		break;
	}

	/*
	 * Create the initial parameters from the module tunables;
	 * the negotiated values may end up lower (hardware limits).
	 */
	sp = &init_params;
	sp->flags = port_flags;
	sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
	sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000;
	sp->initiator_depth = 1;
	sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;
	sp->recv_credit_max = smbd_receive_credit_max;
	sp->send_credit_target = smbd_send_credit_target;
	sp->max_send_size = smbd_max_send_size;
	sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
	sp->max_recv_size = smbd_max_receive_size;
	sp->max_frmr_depth = smbd_max_frmr_depth;
	sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
	sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;

	info = kzalloc_obj(*info);
	if (!info)
		return NULL;
	ret = smbdirect_socket_create_kern(net, &sc);
	if (ret)
		goto socket_init_failed;
	smbdirect_socket_set_logging(sc, NULL, smbd_logging_needed, smbd_logging_vaprintf);
	ret = smbdirect_socket_set_initial_parameters(sc, sp);
	if (ret)
		goto set_params_failed;
	ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_SOFTIRQ, GFP_KERNEL);
	if (ret)
		goto set_settings_failed;

	/* Overwrite the port in the caller's address with the chosen one */
	if (dstaddr->sa_family == AF_INET6)
		sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
	else
		sport = &((struct sockaddr_in *)dstaddr)->sin_port;

	*sport = htons(port);

	ret = smbdirect_connect_sync(sc, dstaddr);
	if (ret) {
		log_rdma_event(ERR, "connect to %pISpsfc failed: %1pe\n",
			       dstaddr, ERR_PTR(ret));
		goto connect_failed;
	}

	info->socket = sc;
	return info;

	/*
	 * Error unwind: once the socket exists it must be released;
	 * the wrapper allocation is always freed.
	 */
connect_failed:
set_settings_failed:
set_params_failed:
	smbdirect_socket_release(sc);
socket_init_failed:
	kfree(info);
	return NULL;
}
343 
344 const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
345 {
346 	if (unlikely(!conn->socket)) {
347 		static const struct smbdirect_socket_parameters zero_params;
348 
349 		return &zero_params;
350 	}
351 
352 	return smbdirect_socket_get_current_parameters(conn->socket);
353 }
354 
355 struct smbd_connection *smbd_get_connection(
356 	struct TCP_Server_Info *server, struct sockaddr *dstaddr)
357 {
358 	struct smbd_connection *ret;
359 	const struct smbdirect_socket_parameters *sp;
360 	int port = SMBD_PORT;
361 
362 try_again:
363 	ret = _smbd_get_connection(server, dstaddr, port);
364 
365 	/* Try SMB_PORT if SMBD_PORT doesn't work */
366 	if (!ret && port == SMBD_PORT) {
367 		port = SMB_PORT;
368 		goto try_again;
369 	}
370 	if (!ret)
371 		return NULL;
372 
373 	sp = smbd_get_parameters(ret);
374 
375 	server->rdma_readwrite_threshold =
376 		rdma_readwrite_threshold > sp->max_fragmented_send_size ?
377 		sp->max_fragmented_send_size :
378 		rdma_readwrite_threshold;
379 
380 	return ret;
381 }
382 
383 /*
384  * Receive data from the transport's receive reassembly queue
385  * All the incoming data packets are placed in reassembly queue
386  * iter: the buffer to read data into
387  * size: the length of data to read
388  * return value: actual data read
389  *
390  * Note: this implementation copies the data from reassembly queue to receive
391  * buffers used by upper layer. This is not the optimal code path. A better way
392  * to do it is to not have upper layer allocate its receive buffers but rather
393  * borrow the buffer from reassembly queue, and return it after data is
394  * consumed. But this will require more changes to upper layer code, and also
395  * need to consider packet boundaries while they still being reassembled.
396  */
397 int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
398 {
399 	struct smbdirect_socket *sc = info->socket;
400 
401 	if (!smbdirect_connection_is_connected(sc))
402 		return -ENOTCONN;
403 
404 	return smbdirect_connection_recvmsg(sc, msg, 0);
405 }
406 
407 /*
408  * Send data to transport
409  * Each rqst is transported as a SMBDirect payload
410  * rqst: the data to write
411  * return value: 0 if successfully write, otherwise error code
412  */
413 int smbd_send(struct TCP_Server_Info *server,
414 	int num_rqst, struct smb_rqst *rqst_array)
415 {
416 	struct smbd_connection *info = server->smbd_conn;
417 	struct smbdirect_socket *sc = info->socket;
418 	const struct smbdirect_socket_parameters *sp = smbd_get_parameters(info);
419 	struct smb_rqst *rqst;
420 	struct iov_iter iter;
421 	struct smbdirect_send_batch_storage bstorage;
422 	struct smbdirect_send_batch *batch;
423 	unsigned int remaining_data_length, klen;
424 	int rc, i, rqst_idx;
425 	int error = 0;
426 
427 	if (!smbdirect_connection_is_connected(sc))
428 		return -EAGAIN;
429 
430 	/*
431 	 * Add in the page array if there is one. The caller needs to set
432 	 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
433 	 * ends at page boundary
434 	 */
435 	remaining_data_length = 0;
436 	for (i = 0; i < num_rqst; i++)
437 		remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
438 
439 	if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) {
440 		/* assertion: payload never exceeds negotiated maximum */
441 		log_write(ERR, "payload size %d > max size %d\n",
442 			remaining_data_length, sp->max_fragmented_send_size);
443 		return -EINVAL;
444 	}
445 
446 	log_write(INFO, "num_rqst=%d total length=%u\n",
447 			num_rqst, remaining_data_length);
448 
449 	rqst_idx = 0;
450 	batch = smbdirect_init_send_batch_storage(&bstorage, false, 0);
451 	do {
452 		rqst = &rqst_array[rqst_idx];
453 
454 		cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
455 			 rqst_idx, smb_rqst_len(server, rqst));
456 		for (i = 0; i < rqst->rq_nvec; i++)
457 			dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len);
458 
459 		log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n",
460 			  rqst_idx, rqst->rq_nvec, remaining_data_length,
461 			  iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst));
462 
463 		/* Send the metadata pages. */
464 		klen = 0;
465 		for (i = 0; i < rqst->rq_nvec; i++)
466 			klen += rqst->rq_iov[i].iov_len;
467 		iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
468 
469 		rc = smbd_post_send_full_iter(sc, batch, &iter, remaining_data_length);
470 		if (rc < 0) {
471 			error = rc;
472 			break;
473 		}
474 		remaining_data_length -= rc;
475 
476 		if (iov_iter_count(&rqst->rq_iter) > 0) {
477 			/* And then the data pages if there are any */
478 			rc = smbd_post_send_full_iter(sc, batch, &rqst->rq_iter,
479 						      remaining_data_length);
480 			if (rc < 0) {
481 				error = rc;
482 				break;
483 			}
484 			remaining_data_length -= rc;
485 		}
486 
487 	} while (++rqst_idx < num_rqst);
488 
489 	rc = smbdirect_connection_send_batch_flush(sc, batch, true);
490 	if (unlikely(!rc && error))
491 		rc = error;
492 
493 	/*
494 	 * As an optimization, we don't wait for individual I/O to finish
495 	 * before sending the next one.
496 	 * Send them all and wait for pending send count to get to 0
497 	 * that means all the I/Os have been out and we are good to return
498 	 */
499 
500 	error = rc;
501 	rc = smbdirect_connection_send_wait_zero_pending(sc);
502 	if (unlikely(rc && !error))
503 		error = -EAGAIN;
504 
505 	if (unlikely(error))
506 		return error;
507 
508 	return 0;
509 }
510 
511 /*
512  * Register memory for RDMA read/write
513  * iter: the buffer to register memory with
514  * writing: true if this is a RDMA write (SMB read), false for RDMA read
515  * need_invalidate: true if this MR needs to be locally invalidated after I/O
516  * return value: the MR registered, NULL if failed.
517  */
518 struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
519 				 struct iov_iter *iter,
520 				 bool writing, bool need_invalidate)
521 {
522 	struct smbdirect_socket *sc = info->socket;
523 
524 	if (!smbdirect_connection_is_connected(sc))
525 		return NULL;
526 
527 	return smbdirect_connection_register_mr_io(sc, iter, writing, need_invalidate);
528 }
529 
/*
 * Fill a SMBDirect v1 buffer descriptor from a registered MR.
 * Thin wrapper around the smbdirect core helper.
 */
void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
				    struct smbdirect_buffer_descriptor_v1 *v1)
{
	smbdirect_mr_io_fill_buffer_descriptor(mr, v1);
}
535 
/*
 * Deregister a MR after I/O is done.
 * This function may wait if remote invalidation is not used
 * and we have to locally invalidate the buffer to prevent the data from
 * being modified by the remote peer after the upper layer consumes it.
 * Thin wrapper around the smbdirect core helper.
 */
void smbd_deregister_mr(struct smbdirect_mr_io *mr)
{
	smbdirect_connection_deregister_mr_io(mr);
}
546 
547 void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m)
548 {
549 	if (!server->rdma)
550 		return;
551 
552 	if (!server->smbd_conn) {
553 		seq_puts(m, "\nSMBDirect transport not available");
554 		return;
555 	}
556 
557 	smbdirect_connection_legacy_debug_proc_show(server->smbd_conn->socket,
558 						    server->rdma_readwrite_threshold,
559 						    m);
560 }
561