xref: /linux/fs/smb/client/smbdirect.c (revision 8ab992f815d6736b5c7a6f5fd7bfe7bc106bb3dc)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *   Copyright (C) 2017, Microsoft Corporation.
4  *
5  *   Author(s): Long Li <longli@microsoft.com>
6  */
7 
8 #include "smbdirect.h"
9 #include "cifs_debug.h"
10 #include "cifsproto.h"
11 #include "smb2proto.h"
12 
13 /* Port numbers for SMBD transport */
14 #define SMB_PORT	445
15 #define SMBD_PORT	5445
16 
17 /* Address lookup and resolve timeout in ms */
18 #define RDMA_RESOLVE_TIMEOUT	5000
19 
20 /* SMBD negotiation timeout in seconds */
21 #define SMBD_NEGOTIATE_TIMEOUT	120
22 
23 /* The timeout to wait for a keepalive message from peer in seconds */
24 #define KEEPALIVE_RECV_TIMEOUT 5
25 
26 /*
27  * Default maximum number of RDMA read/write outstanding on this connection
28  * This value is possibly decreased during QP creation on hardware limit
29  */
30 #define SMBD_CM_RESPONDER_RESOURCES	32
31 
32 /*
33  * User configurable initial values per SMBD transport connection
34  * as defined in [MS-SMBD] 3.1.1.1
35  * Those may change after a SMBD negotiation
36  */
37 /* The local peer's maximum number of credits to grant to the peer */
38 int smbd_receive_credit_max = 255;
39 
40 /* The remote peer's credit request of local peer */
41 int smbd_send_credit_target = 255;
42 
43 /* The maximum single message size can be sent to remote peer */
44 int smbd_max_send_size = 1364;
45 
46 /*
47  * The maximum fragmented upper-layer payload receive size supported
48  *
49  * Assume max_payload_per_credit is
50  * smbd_max_receive_size - 24 = 1340
51  *
52  * The maximum number would be
53  * smbd_receive_credit_max * max_payload_per_credit
54  *
55  *                       1340 * 255 = 341700 (0x536C4)
56  *
57  * The minimum value from the spec is 131072 (0x20000)
58  *
59  * For now we use the logic we used in ksmbd before:
60  *                 (1364 * 255) / 2 = 173910 (0x2A756)
61  */
62 int smbd_max_fragmented_recv_size = (1364 * 255) / 2;
63 
64 /*  The maximum single-message size which can be received */
65 int smbd_max_receive_size = 1364;
66 
67 /* The timeout to initiate send of a keepalive message on idle */
68 int smbd_keep_alive_interval = 120;
69 
70 /*
71  * User configurable initial values for RDMA transport
72  * The actual values used may be lower and are limited to hardware capabilities
73  */
74 /* Default maximum number of pages in a single RDMA write/read */
75 int smbd_max_frmr_depth = 2048;
76 
77 /* If payload is less than this byte, use RDMA send/recv not read/write */
78 int rdma_readwrite_threshold = 4096;
79 
80 /* Transport logging functions
81  * Logging are defined as classes. They can be OR'ed to define the actual
82  * logging level via module parameter smbd_logging_class
83  * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
84  * log_rdma_event()
85  */
86 #define LOG_OUTGOING			0x1
87 #define LOG_INCOMING			0x2
88 #define LOG_READ			0x4
89 #define LOG_WRITE			0x8
90 #define LOG_RDMA_SEND			0x10
91 #define LOG_RDMA_RECV			0x20
92 #define LOG_KEEP_ALIVE			0x40
93 #define LOG_RDMA_EVENT			0x80
94 #define LOG_RDMA_MR			0x100
95 static unsigned int smbd_logging_class;
96 module_param(smbd_logging_class, uint, 0644);
97 MODULE_PARM_DESC(smbd_logging_class,
98 	"Logging class for SMBD transport 0x0 to 0x100");
99 
100 #define ERR		0x0
101 #define INFO		0x1
102 static unsigned int smbd_logging_level = ERR;
103 module_param(smbd_logging_level, uint, 0644);
104 MODULE_PARM_DESC(smbd_logging_level,
105 	"Logging level for SMBD transport, 0 (default): error, 1: info");
106 
smbd_logging_needed(struct smbdirect_socket * sc,void * private_ptr,unsigned int lvl,unsigned int cls)107 static bool smbd_logging_needed(struct smbdirect_socket *sc,
108 				void *private_ptr,
109 				unsigned int lvl,
110 				unsigned int cls)
111 {
112 #define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_LOG_ ##x)
113 	BUILD_BUG_SAME(ERR);
114 	BUILD_BUG_SAME(INFO);
115 #undef BUILD_BUG_SAME
116 #define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_ ##x)
117 	BUILD_BUG_SAME(LOG_OUTGOING);
118 	BUILD_BUG_SAME(LOG_INCOMING);
119 	BUILD_BUG_SAME(LOG_READ);
120 	BUILD_BUG_SAME(LOG_WRITE);
121 	BUILD_BUG_SAME(LOG_RDMA_SEND);
122 	BUILD_BUG_SAME(LOG_RDMA_RECV);
123 	BUILD_BUG_SAME(LOG_KEEP_ALIVE);
124 	BUILD_BUG_SAME(LOG_RDMA_EVENT);
125 	BUILD_BUG_SAME(LOG_RDMA_MR);
126 #undef BUILD_BUG_SAME
127 
128 	if (lvl <= smbd_logging_level || cls & smbd_logging_class)
129 		return true;
130 	return false;
131 }
132 
smbd_logging_vaprintf(struct smbdirect_socket * sc,const char * func,unsigned int line,void * private_ptr,unsigned int lvl,unsigned int cls,struct va_format * vaf)133 static void smbd_logging_vaprintf(struct smbdirect_socket *sc,
134 				  const char *func,
135 				  unsigned int line,
136 				  void *private_ptr,
137 				  unsigned int lvl,
138 				  unsigned int cls,
139 				  struct va_format *vaf)
140 {
141 	cifs_dbg(VFS, "%s:%u %pV", func, line, vaf);
142 }
143 
/*
 * log_rdma() emits via cifs_dbg(VFS, ...) only when the message level
 * is at or below smbd_logging_level or the class bit is set in
 * smbd_logging_class; each wrapper below binds one logging class.
 */
144 #define log_rdma(level, class, fmt, args...)				\
145 do {									\
146 	if (level <= smbd_logging_level || class & smbd_logging_class)	\
147 		cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
148 } while (0)
149 
150 #define log_outgoing(level, fmt, args...) \
151 		log_rdma(level, LOG_OUTGOING, fmt, ##args)
152 #define log_incoming(level, fmt, args...) \
153 		log_rdma(level, LOG_INCOMING, fmt, ##args)
154 #define log_read(level, fmt, args...)	log_rdma(level, LOG_READ, fmt, ##args)
155 #define log_write(level, fmt, args...)	log_rdma(level, LOG_WRITE, fmt, ##args)
156 #define log_rdma_send(level, fmt, args...) \
157 		log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
158 #define log_rdma_recv(level, fmt, args...) \
159 		log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
160 #define log_keep_alive(level, fmt, args...) \
161 		log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
162 #define log_rdma_event(level, fmt, args...) \
163 		log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
164 #define log_rdma_mr(level, fmt, args...) \
165 		log_rdma(level, LOG_RDMA_MR, fmt, ##args)
166 
/*
 * Post an entire iov_iter as one or more SMBDirect send requests.
 *
 * smbdirect_connection_send_single_iter() honours the negotiated
 * max_send_size and may therefore consume only part of the iter per
 * call, so keep posting until nothing is left.
 *
 * Returns the total number of payload bytes posted, or the negative
 * error code of the first failing post.
 */
static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
				    struct smbdirect_send_batch *batch,
				    struct iov_iter *iter,
				    u32 remaining_data_length)
{
	int total = 0;

	for (;;) {
		int posted;

		if (!iov_iter_count(iter))
			break;

		posted = smbdirect_connection_send_single_iter(sc,
							       batch,
							       iter,
							       0, /* flags */
							       remaining_data_length);
		if (posted < 0)
			return posted;

		remaining_data_length -= posted;
		total += posted;
	}

	return total;
}
196 
197 /*
198  * Destroy the transport and related RDMA and memory resources
199  * Need to go through all the pending counters and make sure no one is using
200  * the transport while it is destroyed
201  */
smbd_destroy(struct TCP_Server_Info * server)202 void smbd_destroy(struct TCP_Server_Info *server)
203 {
204 	struct smbd_connection *info = server->smbd_conn;
205 
206 	if (!info) {
207 		log_rdma_event(INFO, "rdma session already destroyed\n");
208 		return;
209 	}
210 
211 	smbdirect_socket_release(info->socket);
212 
213 	kfree(info);
214 	server->smbd_conn = NULL;
215 }
216 
217 /*
218  * Reconnect this SMBD connection, called from upper layer
219  * return value: 0 on success, or actual error code
220  */
smbd_reconnect(struct TCP_Server_Info * server)221 int smbd_reconnect(struct TCP_Server_Info *server)
222 {
223 	log_rdma_event(INFO, "reconnecting rdma session\n");
224 
225 	if (!server->smbd_conn) {
226 		log_rdma_event(INFO, "rdma session already destroyed\n");
227 		goto create_conn;
228 	}
229 
230 	/*
231 	 * This is possible if transport is disconnected and we haven't received
232 	 * notification from RDMA, but upper layer has detected timeout
233 	 */
234 	log_rdma_event(INFO, "disconnecting transport\n");
235 	smbd_destroy(server);
236 
237 create_conn:
238 	log_rdma_event(INFO, "creating rdma session\n");
239 	server->smbd_conn = smbd_get_connection(
240 		server, (struct sockaddr *) &server->dstaddr);
241 
242 	if (server->smbd_conn) {
243 		cifs_dbg(VFS, "RDMA transport re-established\n");
244 		trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
245 		return 0;
246 	}
247 	trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
248 	return -ENOENT;
249 }
250 
251 /* Create a SMBD connection, called by upper layer */
_smbd_get_connection(struct TCP_Server_Info * server,struct sockaddr * dstaddr,int port)252 static struct smbd_connection *_smbd_get_connection(
253 	struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
254 {
255 	struct net *net = cifs_net_ns(server);
256 	struct smbd_connection *info;
257 	struct smbdirect_socket *sc;
258 	struct smbdirect_socket_parameters init_params = {};
259 	struct smbdirect_socket_parameters *sp;
260 	__be16 *sport;
261 	u64 port_flags = 0;
262 	int ret;
263 
264 	switch (port) {
265 	case SMBD_PORT:
266 		/*
267 		 * only allow iWarp devices
268 		 * for port 5445.
269 		 */
270 		port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW;
271 		break;
272 	case SMB_PORT:
273 		/*
274 		 * only allow InfiniBand, RoCEv1 or RoCEv2
275 		 * devices for port 445.
276 		 *
277 		 * (Basically don't allow iWarp devices)
278 		 */
279 		port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB;
280 		break;
281 	}
282 
283 	/*
284 	 * Create the initial parameters
285 	 */
286 	sp = &init_params;
287 	sp->flags = port_flags;
288 	sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
289 	sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
290 	sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
291 	sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000;
292 	sp->initiator_depth = 1;
293 	sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;
294 	sp->recv_credit_max = smbd_receive_credit_max;
295 	sp->send_credit_target = smbd_send_credit_target;
296 	sp->max_send_size = smbd_max_send_size;
297 	sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
298 	sp->max_recv_size = smbd_max_receive_size;
299 	sp->max_frmr_depth = smbd_max_frmr_depth;
300 	sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
301 	sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;
302 
303 	info = kzalloc_obj(*info);
304 	if (!info)
305 		return NULL;
306 	ret = smbdirect_socket_create_kern(net, &sc);
307 	if (ret)
308 		goto socket_init_failed;
309 	smbdirect_socket_set_logging(sc, NULL, smbd_logging_needed, smbd_logging_vaprintf);
310 	ret = smbdirect_socket_set_initial_parameters(sc, sp);
311 	if (ret)
312 		goto set_params_failed;
313 	ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_SOFTIRQ, GFP_KERNEL);
314 	if (ret)
315 		goto set_settings_failed;
316 
317 	if (dstaddr->sa_family == AF_INET6)
318 		sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
319 	else
320 		sport = &((struct sockaddr_in *)dstaddr)->sin_port;
321 
322 	*sport = htons(port);
323 
324 	ret = smbdirect_connect_sync(sc, dstaddr);
325 	if (ret) {
326 		log_rdma_event(ERR, "connect to %pISpsfc failed: %1pe\n",
327 			       dstaddr, ERR_PTR(ret));
328 		goto connect_failed;
329 	}
330 
331 	info->socket = sc;
332 	return info;
333 
334 connect_failed:
335 set_settings_failed:
336 set_params_failed:
337 	smbdirect_socket_release(sc);
338 socket_init_failed:
339 	kfree(info);
340 	return NULL;
341 }
342 
smbd_get_parameters(struct smbd_connection * conn)343 const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
344 {
345 	if (unlikely(!conn->socket)) {
346 		static const struct smbdirect_socket_parameters zero_params;
347 
348 		return &zero_params;
349 	}
350 
351 	return smbdirect_socket_get_current_parameters(conn->socket);
352 }
353 
smbd_get_connection(struct TCP_Server_Info * server,struct sockaddr * dstaddr)354 struct smbd_connection *smbd_get_connection(
355 	struct TCP_Server_Info *server, struct sockaddr *dstaddr)
356 {
357 	struct smbd_connection *ret;
358 	const struct smbdirect_socket_parameters *sp;
359 	int port = SMBD_PORT;
360 
361 try_again:
362 	ret = _smbd_get_connection(server, dstaddr, port);
363 
364 	/* Try SMB_PORT if SMBD_PORT doesn't work */
365 	if (!ret && port == SMBD_PORT) {
366 		port = SMB_PORT;
367 		goto try_again;
368 	}
369 	if (!ret)
370 		return NULL;
371 
372 	sp = smbd_get_parameters(ret);
373 
374 	server->rdma_readwrite_threshold =
375 		rdma_readwrite_threshold > sp->max_fragmented_send_size ?
376 		sp->max_fragmented_send_size :
377 		rdma_readwrite_threshold;
378 
379 	return ret;
380 }
381 
382 /*
383  * Receive data from the transport's receive reassembly queue
384  * All the incoming data packets are placed in reassembly queue
385  * iter: the buffer to read data into
386  * size: the length of data to read
387  * return value: actual data read
388  *
389  * Note: this implementation copies the data from reassembly queue to receive
390  * buffers used by upper layer. This is not the optimal code path. A better way
391  * to do it is to not have upper layer allocate its receive buffers but rather
392  * borrow the buffer from reassembly queue, and return it after data is
393  * consumed. But this will require more changes to upper layer code, and also
394  * need to consider packet boundaries while they still being reassembled.
395  */
smbd_recv(struct smbd_connection * info,struct msghdr * msg)396 int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
397 {
398 	struct smbdirect_socket *sc = info->socket;
399 
400 	if (!smbdirect_connection_is_connected(sc))
401 		return -ENOTCONN;
402 
403 	return smbdirect_connection_recvmsg(sc, msg, 0);
404 }
405 
406 /*
407  * Send data to transport
408  * Each rqst is transported as a SMBDirect payload
409  * rqst: the data to write
410  * return value: 0 if successfully write, otherwise error code
411  */
smbd_send(struct TCP_Server_Info * server,int num_rqst,struct smb_rqst * rqst_array)412 int smbd_send(struct TCP_Server_Info *server,
413 	int num_rqst, struct smb_rqst *rqst_array)
414 {
415 	struct smbd_connection *info = server->smbd_conn;
416 	struct smbdirect_socket *sc = info->socket;
417 	const struct smbdirect_socket_parameters *sp = smbd_get_parameters(info);
418 	struct smb_rqst *rqst;
419 	struct iov_iter iter;
420 	struct smbdirect_send_batch_storage bstorage;
421 	struct smbdirect_send_batch *batch;
422 	unsigned int remaining_data_length, klen;
423 	int rc, i, rqst_idx;
424 	int error = 0;
425 
426 	if (!smbdirect_connection_is_connected(sc))
427 		return -EAGAIN;
428 
429 	/*
430 	 * Add in the page array if there is one. The caller needs to set
431 	 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
432 	 * ends at page boundary
433 	 */
434 	remaining_data_length = 0;
435 	for (i = 0; i < num_rqst; i++)
436 		remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
437 
438 	if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) {
439 		/* assertion: payload never exceeds negotiated maximum */
440 		log_write(ERR, "payload size %d > max size %d\n",
441 			remaining_data_length, sp->max_fragmented_send_size);
442 		return -EINVAL;
443 	}
444 
445 	log_write(INFO, "num_rqst=%d total length=%u\n",
446 			num_rqst, remaining_data_length);
447 
448 	rqst_idx = 0;
449 	batch = smbdirect_init_send_batch_storage(&bstorage, false, 0);
450 	do {
451 		rqst = &rqst_array[rqst_idx];
452 
453 		cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
454 			 rqst_idx, smb_rqst_len(server, rqst));
455 		for (i = 0; i < rqst->rq_nvec; i++)
456 			dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len);
457 
458 		log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n",
459 			  rqst_idx, rqst->rq_nvec, remaining_data_length,
460 			  iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst));
461 
462 		/* Send the metadata pages. */
463 		klen = 0;
464 		for (i = 0; i < rqst->rq_nvec; i++)
465 			klen += rqst->rq_iov[i].iov_len;
466 		iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
467 
468 		rc = smbd_post_send_full_iter(sc, batch, &iter, remaining_data_length);
469 		if (rc < 0) {
470 			error = rc;
471 			break;
472 		}
473 		remaining_data_length -= rc;
474 
475 		if (iov_iter_count(&rqst->rq_iter) > 0) {
476 			/* And then the data pages if there are any */
477 			rc = smbd_post_send_full_iter(sc, batch, &rqst->rq_iter,
478 						      remaining_data_length);
479 			if (rc < 0) {
480 				error = rc;
481 				break;
482 			}
483 			remaining_data_length -= rc;
484 		}
485 
486 	} while (++rqst_idx < num_rqst);
487 
488 	rc = smbdirect_connection_send_batch_flush(sc, batch, true);
489 	if (unlikely(!rc && error))
490 		rc = error;
491 
492 	/*
493 	 * As an optimization, we don't wait for individual I/O to finish
494 	 * before sending the next one.
495 	 * Send them all and wait for pending send count to get to 0
496 	 * that means all the I/Os have been out and we are good to return
497 	 */
498 
499 	error = rc;
500 	rc = smbdirect_connection_send_wait_zero_pending(sc);
501 	if (unlikely(rc && !error))
502 		error = -EAGAIN;
503 
504 	if (unlikely(error))
505 		return error;
506 
507 	return 0;
508 }
509 
510 /*
511  * Register memory for RDMA read/write
512  * iter: the buffer to register memory with
513  * writing: true if this is a RDMA write (SMB read), false for RDMA read
514  * need_invalidate: true if this MR needs to be locally invalidated after I/O
515  * return value: the MR registered, NULL if failed.
516  */
struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
				 struct iov_iter *iter,
				 bool writing, bool need_invalidate)
{
	struct smbdirect_socket *sc = info->socket;

	/* Registration is impossible once the connection is gone */
	if (smbdirect_connection_is_connected(sc))
		return smbdirect_connection_register_mr_io(sc, iter,
							   writing, need_invalidate);

	return NULL;
}
528 
/*
 * Fill a v1 buffer descriptor from a registered MR.
 * Thin wrapper around the smbdirect helper.
 */
void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
				    struct smbdirect_buffer_descriptor_v1 *v1)
{
	smbdirect_mr_io_fill_buffer_descriptor(mr, v1);
}
534 
535 /*
536  * Deregister a MR after I/O is done
537  * This function may wait if remote invalidation is not used
538  * and we have to locally invalidate the buffer to prevent data is being
539  * modified by remote peer after upper layer consumes it
540  */
/* Thin wrapper: hand the MR back to the smbdirect layer for teardown */
void smbd_deregister_mr(struct smbdirect_mr_io *mr)
{
	smbdirect_connection_deregister_mr_io(mr);
}
545 
smbd_debug_proc_show(struct TCP_Server_Info * server,struct seq_file * m)546 void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m)
547 {
548 	if (!server->rdma)
549 		return;
550 
551 	if (!server->smbd_conn) {
552 		seq_puts(m, "\nSMBDirect transport not available");
553 		return;
554 	}
555 
556 	smbdirect_connection_legacy_debug_proc_show(server->smbd_conn->socket,
557 						    server->rdma_readwrite_threshold,
558 						    m);
559 }
560 
561 MODULE_IMPORT_NS("SMBDIRECT");
562