xref: /linux/fs/smb/client/smbdirect.c (revision 566771afc7a81e343da9939f0bd848d3622e2501)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *   Copyright (C) 2017, Microsoft Corporation.
4  *
5  *   Author(s): Long Li <longli@microsoft.com>
6  */
7 #include <linux/module.h>
8 #include <linux/highmem.h>
9 #include <linux/folio_queue.h>
10 #include "../common/smbdirect/smbdirect_pdu.h"
11 #include "smbdirect.h"
12 #include "cifs_debug.h"
13 #include "cifsproto.h"
14 #include "smb2proto.h"
15 
16 const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
17 {
18 	struct smbdirect_socket *sc = &conn->socket;
19 
20 	return &sc->parameters;
21 }
22 
23 static struct smbdirect_recv_io *get_receive_buffer(
24 		struct smbdirect_socket *sc);
25 static void put_receive_buffer(
26 		struct smbdirect_socket *sc,
27 		struct smbdirect_recv_io *response);
28 static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf);
29 static void destroy_receive_buffers(struct smbdirect_socket *sc);
30 
31 static void enqueue_reassembly(
32 		struct smbdirect_socket *sc,
33 		struct smbdirect_recv_io *response, int data_length);
34 static struct smbdirect_recv_io *_get_first_reassembly(
35 		struct smbdirect_socket *sc);
36 
37 static int smbd_post_recv(
38 		struct smbdirect_socket *sc,
39 		struct smbdirect_recv_io *response);
40 
41 static int smbd_post_send_empty(struct smbdirect_socket *sc);
42 
43 static void destroy_mr_list(struct smbdirect_socket *sc);
44 static int allocate_mr_list(struct smbdirect_socket *sc);
45 
46 struct smb_extract_to_rdma {
47 	struct ib_sge		*sge;
48 	unsigned int		nr_sge;
49 	unsigned int		max_sge;
50 	struct ib_device	*device;
51 	u32			local_dma_lkey;
52 	enum dma_data_direction	direction;
53 };
54 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
55 					struct smb_extract_to_rdma *rdma);
56 
57 /* Port numbers for SMBD transport */
58 #define SMB_PORT	445
59 #define SMBD_PORT	5445
60 
61 /* Address lookup and resolve timeout in ms */
62 #define RDMA_RESOLVE_TIMEOUT	5000
63 
64 /* SMBD negotiation timeout in seconds */
65 #define SMBD_NEGOTIATE_TIMEOUT	120
66 
67 /* The timeout to wait for a keepalive message from peer in seconds */
68 #define KEEPALIVE_RECV_TIMEOUT 5
69 
70 /* SMBD minimum receive size and fragmented size as defined in [MS-SMBD] */
71 #define SMBD_MIN_RECEIVE_SIZE		128
72 #define SMBD_MIN_FRAGMENTED_SIZE	131072
73 
74 /*
75  * Default maximum number of RDMA read/write operations outstanding on this connection
76  * This value may be decreased during QP creation if the hardware sets a lower limit
77  */
78 #define SMBD_CM_RESPONDER_RESOURCES	32
79 
80 /* Maximum number of retries on data transfer operations */
81 #define SMBD_CM_RETRY			6
82 /* No need to retry on Receiver Not Ready since SMBD manages credits */
83 #define SMBD_CM_RNR_RETRY		0
84 
85 /*
86  * User configurable initial values per SMBD transport connection
87  * as defined in [MS-SMBD] 3.1.1.1
88  * These may change after SMBD negotiation
89  */
90 /* The local peer's maximum number of credits to grant to the peer */
91 int smbd_receive_credit_max = 255;
92 
93 /* The remote peer's credit request of local peer */
94 int smbd_send_credit_target = 255;
95 
96 /* The maximum single-message size that can be sent to the remote peer */
97 int smbd_max_send_size = 1364;
98 
99 /*  The maximum fragmented upper-layer payload receive size supported */
100 int smbd_max_fragmented_recv_size = 1024 * 1024;
101 
102 /*  The maximum single-message size which can be received */
103 int smbd_max_receive_size = 1364;
104 
105 /* The timeout to initiate send of a keepalive message on idle */
106 int smbd_keep_alive_interval = 120;
107 
108 /*
109  * User configurable initial values for RDMA transport
110  * The actual values used may be lower and are limited to hardware capabilities
111  */
112 /* Default maximum number of pages in a single RDMA write/read */
113 int smbd_max_frmr_depth = 2048;
114 
115 /* If the payload is smaller than this many bytes, use RDMA send/recv instead of read/write */
116 int rdma_readwrite_threshold = 4096;
117 
118 /* Transport logging functions
119  * Logging is defined as classes. They can be OR'ed together to select what is
120  * logged via the module parameter smbd_logging_class,
121  * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
122  * log_rdma_event()
123  */
124 #define LOG_OUTGOING			0x1
125 #define LOG_INCOMING			0x2
126 #define LOG_READ			0x4
127 #define LOG_WRITE			0x8
128 #define LOG_RDMA_SEND			0x10
129 #define LOG_RDMA_RECV			0x20
130 #define LOG_KEEP_ALIVE			0x40
131 #define LOG_RDMA_EVENT			0x80
132 #define LOG_RDMA_MR			0x100
133 static unsigned int smbd_logging_class;
134 module_param(smbd_logging_class, uint, 0644);
135 MODULE_PARM_DESC(smbd_logging_class,
136 	"Logging class for SMBD transport 0x0 to 0x100");
137 
138 #define ERR		0x0
139 #define INFO		0x1
140 static unsigned int smbd_logging_level = ERR;
141 module_param(smbd_logging_level, uint, 0644);
142 MODULE_PARM_DESC(smbd_logging_level,
143 	"Logging level for SMBD transport, 0 (default): error, 1: info");
144 
145 #define log_rdma(level, class, fmt, args...)				\
146 do {									\
147 	if (level <= smbd_logging_level || class & smbd_logging_class)	\
148 		cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
149 } while (0)
150 
151 #define log_outgoing(level, fmt, args...) \
152 		log_rdma(level, LOG_OUTGOING, fmt, ##args)
153 #define log_incoming(level, fmt, args...) \
154 		log_rdma(level, LOG_INCOMING, fmt, ##args)
155 #define log_read(level, fmt, args...)	log_rdma(level, LOG_READ, fmt, ##args)
156 #define log_write(level, fmt, args...)	log_rdma(level, LOG_WRITE, fmt, ##args)
157 #define log_rdma_send(level, fmt, args...) \
158 		log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
159 #define log_rdma_recv(level, fmt, args...) \
160 		log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
161 #define log_keep_alive(level, fmt, args...) \
162 		log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
163 #define log_rdma_event(level, fmt, args...) \
164 		log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
165 #define log_rdma_mr(level, fmt, args...) \
166 		log_rdma(level, LOG_RDMA_MR, fmt, ##args)
167 
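/*
 * Editor's illustration (not part of the original source): the class bits
 * above are OR'ed together, so enabling both LOG_RDMA_SEND (0x10) and
 * LOG_RDMA_MR (0x100) means setting smbd_logging_class to 0x110, for
 * example at runtime (path assumes the parameter is built into cifs.ko):
 *
 *   echo 0x110 > /sys/module/cifs/parameters/smbd_logging_class
 */
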
168 static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc)
169 {
170 	/*
171 	 * Wake up all waiters in all wait queues
172 	 * in order to notice the broken connection.
173 	 */
174 	wake_up_all(&sc->status_wait);
175 	wake_up_all(&sc->send_io.lcredits.wait_queue);
176 	wake_up_all(&sc->send_io.credits.wait_queue);
177 	wake_up_all(&sc->send_io.pending.dec_wait_queue);
178 	wake_up_all(&sc->send_io.pending.zero_wait_queue);
179 	wake_up_all(&sc->recv_io.reassembly.wait_queue);
180 	wake_up_all(&sc->mr_io.ready.wait_queue);
181 	wake_up_all(&sc->mr_io.cleanup.wait_queue);
182 }
183 
184 static void smbd_disconnect_rdma_work(struct work_struct *work)
185 {
186 	struct smbdirect_socket *sc =
187 		container_of(work, struct smbdirect_socket, disconnect_work);
188 
189 	/*
190 	 * make sure this and other work is not queued again
191 	 * but here we don't block and avoid
192 	 * disable[_delayed]_work_sync()
193 	 */
194 	disable_work(&sc->disconnect_work);
195 	disable_work(&sc->recv_io.posted.refill_work);
196 	disable_work(&sc->mr_io.recovery_work);
197 	disable_work(&sc->idle.immediate_work);
198 	disable_delayed_work(&sc->idle.timer_work);
199 
200 	if (sc->first_error == 0)
201 		sc->first_error = -ECONNABORTED;
202 
203 	switch (sc->status) {
204 	case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
205 	case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
206 	case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
207 	case SMBDIRECT_SOCKET_CONNECTED:
208 	case SMBDIRECT_SOCKET_ERROR:
209 		sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
210 		rdma_disconnect(sc->rdma.cm_id);
211 		break;
212 
213 	case SMBDIRECT_SOCKET_CREATED:
214 	case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
215 	case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
216 	case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
217 	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
218 	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
219 	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
220 	case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
221 	case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
222 	case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
223 		/*
224 		 * rdma_connect() never reached
225 		 * RDMA_CM_EVENT_ESTABLISHED
226 		 */
227 		sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
228 		break;
229 
230 	case SMBDIRECT_SOCKET_DISCONNECTING:
231 	case SMBDIRECT_SOCKET_DISCONNECTED:
232 	case SMBDIRECT_SOCKET_DESTROYED:
233 		break;
234 	}
235 
236 	/*
237 	 * Wake up all waiters in all wait queues
238 	 * in order to notice the broken connection.
239 	 */
240 	smbd_disconnect_wake_up_all(sc);
241 }
242 
243 static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc)
244 {
245 	/*
246 	 * make sure other work (than disconnect_work) is
247 	 * not queued again but here we don't block and avoid
248 	 * disable[_delayed]_work_sync()
249 	 */
250 	disable_work(&sc->recv_io.posted.refill_work);
251 	disable_work(&sc->mr_io.recovery_work);
252 	disable_work(&sc->idle.immediate_work);
253 	disable_delayed_work(&sc->idle.timer_work);
254 
255 	if (sc->first_error == 0)
256 		sc->first_error = -ECONNABORTED;
257 
258 	switch (sc->status) {
259 	case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
260 	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
261 	case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
262 	case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
263 	case SMBDIRECT_SOCKET_ERROR:
264 	case SMBDIRECT_SOCKET_DISCONNECTING:
265 	case SMBDIRECT_SOCKET_DISCONNECTED:
266 	case SMBDIRECT_SOCKET_DESTROYED:
267 		/*
268 		 * Keep the current error status
269 		 */
270 		break;
271 
272 	case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
273 	case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
274 		sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
275 		break;
276 
277 	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
278 	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
279 		sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
280 		break;
281 
282 	case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
283 	case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
284 		sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
285 		break;
286 
287 	case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
288 	case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
289 		sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
290 		break;
291 
292 	case SMBDIRECT_SOCKET_CREATED:
293 	case SMBDIRECT_SOCKET_CONNECTED:
294 		sc->status = SMBDIRECT_SOCKET_ERROR;
295 		break;
296 	}
297 
298 	/*
299 	 * Wake up all waiters in all wait queues
300 	 * in order to notice the broken connection.
301 	 */
302 	smbd_disconnect_wake_up_all(sc);
303 
304 	queue_work(sc->workqueue, &sc->disconnect_work);
305 }
306 
307 /* Upcall from RDMA CM */
308 static int smbd_conn_upcall(
309 		struct rdma_cm_id *id, struct rdma_cm_event *event)
310 {
311 	struct smbdirect_socket *sc = id->context;
312 	struct smbdirect_socket_parameters *sp = &sc->parameters;
313 	const char *event_name = rdma_event_msg(event->event);
314 	u8 peer_initiator_depth;
315 	u8 peer_responder_resources;
316 
317 	log_rdma_event(INFO, "event=%s status=%d\n",
318 		event_name, event->status);
319 
320 	switch (event->event) {
321 	case RDMA_CM_EVENT_ADDR_RESOLVED:
322 		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
323 		sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED;
324 		wake_up(&sc->status_wait);
325 		break;
326 
327 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
328 		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
329 		sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
330 		wake_up(&sc->status_wait);
331 		break;
332 
333 	case RDMA_CM_EVENT_ADDR_ERROR:
334 		log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
335 		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
336 		sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
337 		smbd_disconnect_rdma_work(&sc->disconnect_work);
338 		break;
339 
340 	case RDMA_CM_EVENT_ROUTE_ERROR:
341 		log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
342 		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
343 		sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
344 		smbd_disconnect_rdma_work(&sc->disconnect_work);
345 		break;
346 
347 	case RDMA_CM_EVENT_ESTABLISHED:
348 		log_rdma_event(INFO, "connected event=%s\n", event_name);
349 
350 		/*
351 		 * Here we work around an inconsistency between
352 		 * iWarp and other devices (at least rxe and irdma using RoCEv2)
353 		 */
354 		if (rdma_protocol_iwarp(id->device, id->port_num)) {
355 			/*
356 			 * iWarp devices report the peer's values
357 			 * with the perspective of the peer here.
358 			 * Tested with siw and irdma (in iwarp mode)
359 			 * We need to change to our perspective here,
360 			 * so we need to switch the values.
361 			 */
362 			peer_initiator_depth = event->param.conn.responder_resources;
363 			peer_responder_resources = event->param.conn.initiator_depth;
364 		} else {
365 			/*
366 			 * Non iWarp devices report the peer's values
367 			 * already changed to our perspective here.
368 			 * Tested with rxe and irdma (in roce mode).
369 			 */
370 			peer_initiator_depth = event->param.conn.initiator_depth;
371 			peer_responder_resources = event->param.conn.responder_resources;
372 		}
373 		if (rdma_protocol_iwarp(id->device, id->port_num) &&
374 		    event->param.conn.private_data_len == 8) {
375 			/*
376 			 * Legacy clients with only iWarp MPA v1 support
377 			 * need a private blob in order to negotiate
378 			 * the IRD/ORD values.
379 			 */
380 			const __be32 *ird_ord_hdr = event->param.conn.private_data;
381 			u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
382 			u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
383 
384 			/*
385 			 * cifs.ko sends the legacy IRD/ORD negotiation
386 			 * event if iWarp MPA v2 was used.
387 			 *
388 			 * Here we check that the values match and only
389 			 * mark the client as legacy if they don't match.
390 			 */
391 			if ((u32)event->param.conn.initiator_depth != ird32 ||
392 			    (u32)event->param.conn.responder_resources != ord32) {
393 				/*
394 				 * There are broken clients (old cifs.ko)
395 				 * using little endian and also
396 				 * struct rdma_conn_param only uses u8
397 				 * for initiator_depth and responder_resources,
398 				 * so we truncate the value to U8_MAX.
399 				 *
400 				 * smb_direct_accept_client() will then
401 				 * do the real negotiation in order to
402 				 * select the minimum between client and
403 				 * server.
404 				 */
405 				ird32 = min_t(u32, ird32, U8_MAX);
406 				ord32 = min_t(u32, ord32, U8_MAX);
407 
408 				sc->rdma.legacy_iwarp = true;
409 				peer_initiator_depth = (u8)ird32;
410 				peer_responder_resources = (u8)ord32;
411 			}
412 		}
413 
414 		/*
415 		 * negotiate the value by using the minimum
416 		 * between client and server if the client provided
417 	 * non-zero values.
418 		 */
419 		if (peer_initiator_depth != 0)
420 			sp->initiator_depth =
421 					min_t(u8, sp->initiator_depth,
422 					      peer_initiator_depth);
423 		if (peer_responder_resources != 0)
424 			sp->responder_resources =
425 					min_t(u8, sp->responder_resources,
426 					      peer_responder_resources);
427 
428 		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
429 		sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
430 		wake_up(&sc->status_wait);
431 		break;
432 
433 	case RDMA_CM_EVENT_CONNECT_ERROR:
434 	case RDMA_CM_EVENT_UNREACHABLE:
435 	case RDMA_CM_EVENT_REJECTED:
436 		log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
437 		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
438 		sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
439 		smbd_disconnect_rdma_work(&sc->disconnect_work);
440 		break;
441 
442 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
443 	case RDMA_CM_EVENT_DISCONNECTED:
444 		/* This happens when we fail the negotiation */
445 		if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) {
446 			log_rdma_event(ERR, "event=%s during negotiation\n", event_name);
447 		}
448 
449 		sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
450 		smbd_disconnect_rdma_work(&sc->disconnect_work);
451 		break;
452 
453 	default:
454 		log_rdma_event(ERR, "unexpected event=%s status=%d\n",
455 			       event_name, event->status);
456 		break;
457 	}
458 
459 	return 0;
460 }
461 
462 /* Upcall from RDMA QP */
463 static void
464 smbd_qp_async_error_upcall(struct ib_event *event, void *context)
465 {
466 	struct smbdirect_socket *sc = context;
467 
468 	log_rdma_event(ERR, "%s on device %s socket %p\n",
469 		ib_event_msg(event->event), event->device->name, sc);
470 
471 	switch (event->event) {
472 	case IB_EVENT_CQ_ERR:
473 	case IB_EVENT_QP_FATAL:
474 		smbd_disconnect_rdma_connection(sc);
475 		break;
476 
477 	default:
478 		break;
479 	}
480 }
481 
482 static inline void *smbdirect_send_io_payload(struct smbdirect_send_io *request)
483 {
484 	return (void *)request->packet;
485 }
486 
487 static inline void *smbdirect_recv_io_payload(struct smbdirect_recv_io *response)
488 {
489 	return (void *)response->packet;
490 }
491 
492 /* Called when a RDMA send is done */
493 static void send_done(struct ib_cq *cq, struct ib_wc *wc)
494 {
495 	int i;
496 	struct smbdirect_send_io *request =
497 		container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
498 	struct smbdirect_socket *sc = request->socket;
499 	int lcredits = 0;
500 
501 	log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",
502 		request, ib_wc_status_msg(wc->status));
503 
504 	for (i = 0; i < request->num_sge; i++)
505 		ib_dma_unmap_single(sc->ib.dev,
506 			request->sge[i].addr,
507 			request->sge[i].length,
508 			DMA_TO_DEVICE);
509 	mempool_free(request, sc->send_io.mem.pool);
510 	lcredits += 1;
511 
512 	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
513 		if (wc->status != IB_WC_WR_FLUSH_ERR)
514 			log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",
515 				ib_wc_status_msg(wc->status), wc->opcode);
516 		smbd_disconnect_rdma_connection(sc);
517 		return;
518 	}
519 
520 	atomic_add(lcredits, &sc->send_io.lcredits.count);
521 	wake_up(&sc->send_io.lcredits.wait_queue);
522 
523 	if (atomic_dec_and_test(&sc->send_io.pending.count))
524 		wake_up(&sc->send_io.pending.zero_wait_queue);
525 
526 	wake_up(&sc->send_io.pending.dec_wait_queue);
527 }
528 
529 static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp)
530 {
531 	log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n",
532 		       resp->min_version, resp->max_version,
533 		       resp->negotiated_version, resp->credits_requested,
534 		       resp->credits_granted, resp->status,
535 		       resp->max_readwrite_size, resp->preferred_send_size,
536 		       resp->max_receive_size, resp->max_fragmented_size);
537 }
538 
539 /*
540  * Process a negotiation response message, according to [MS-SMBD]3.1.5.7
541  * response, packet_length: the negotiation response message
542  * return value: true if negotiation is a success, false if failed
543  */
544 static bool process_negotiation_response(
545 		struct smbdirect_recv_io *response, int packet_length)
546 {
547 	struct smbdirect_socket *sc = response->socket;
548 	struct smbdirect_socket_parameters *sp = &sc->parameters;
549 	struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response);
550 
551 	if (packet_length < sizeof(struct smbdirect_negotiate_resp)) {
552 		log_rdma_event(ERR,
553 			"error: packet_length=%d\n", packet_length);
554 		return false;
555 	}
556 
557 	if (le16_to_cpu(packet->negotiated_version) != SMBDIRECT_V1) {
558 		log_rdma_event(ERR, "error: negotiated_version=%x\n",
559 			le16_to_cpu(packet->negotiated_version));
560 		return false;
561 	}
562 
563 	if (packet->credits_requested == 0) {
564 		log_rdma_event(ERR, "error: credits_requested==0\n");
565 		return false;
566 	}
567 	sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested);
568 	sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
569 
570 	if (packet->credits_granted == 0) {
571 		log_rdma_event(ERR, "error: credits_granted==0\n");
572 		return false;
573 	}
574 	atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
575 	atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted));
576 
577 	if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
578 		log_rdma_event(ERR, "error: preferred_send_size=%d\n",
579 			le32_to_cpu(packet->preferred_send_size));
580 		return false;
581 	}
582 	sp->max_recv_size = le32_to_cpu(packet->preferred_send_size);
583 
584 	if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
585 		log_rdma_event(ERR, "error: max_receive_size=%d\n",
586 			le32_to_cpu(packet->max_receive_size));
587 		return false;
588 	}
589 	sp->max_send_size = min_t(u32, sp->max_send_size,
590 				  le32_to_cpu(packet->max_receive_size));
591 
592 	if (le32_to_cpu(packet->max_fragmented_size) <
593 			SMBD_MIN_FRAGMENTED_SIZE) {
594 		log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
595 			le32_to_cpu(packet->max_fragmented_size));
596 		return false;
597 	}
598 	sp->max_fragmented_send_size =
599 		le32_to_cpu(packet->max_fragmented_size);
600 
601 
602 	sp->max_read_write_size = min_t(u32,
603 			le32_to_cpu(packet->max_readwrite_size),
604 			sp->max_frmr_depth * PAGE_SIZE);
605 	sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
606 
607 	sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
608 	return true;
609 }
610 
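/*
 * Editor's worked example (illustrative, not part of the original source;
 * it assumes sp was initialized from the module defaults above, i.e.
 * max_send_size = 1364, max_recv_size = 1364, max_frmr_depth = 2048, and
 * that PAGE_SIZE = 4096): for a peer reporting preferred_send_size = 1364,
 * max_receive_size = 8192, max_fragmented_size = 1048576 and
 * max_readwrite_size = 1048576, the function above leaves
 * sp->max_recv_size = 1364, sp->max_send_size = min(1364, 8192) = 1364,
 * sp->max_fragmented_send_size = 1048576, sp->max_read_write_size =
 * min(1048576, 2048 * 4096) = 1048576 and sp->max_frmr_depth =
 * 1048576 / 4096 = 256.
 */
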
611 static void smbd_post_send_credits(struct work_struct *work)
612 {
613 	int rc;
614 	struct smbdirect_recv_io *response;
615 	struct smbdirect_socket *sc =
616 		container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
617 
618 	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
619 		return;
620 	}
621 
622 	if (sc->recv_io.credits.target >
623 		atomic_read(&sc->recv_io.credits.count)) {
624 		while (true) {
625 			response = get_receive_buffer(sc);
626 			if (!response)
627 				break;
628 
629 			response->first_segment = false;
630 			rc = smbd_post_recv(sc, response);
631 			if (rc) {
632 				log_rdma_recv(ERR,
633 					"post_recv failed rc=%d\n", rc);
634 				put_receive_buffer(sc, response);
635 				break;
636 			}
637 
638 			atomic_inc(&sc->recv_io.posted.count);
639 		}
640 	}
641 
642 	/* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
643 	if (atomic_read(&sc->recv_io.credits.count) <
644 		sc->recv_io.credits.target - 1) {
645 		log_keep_alive(INFO, "schedule send of an empty message\n");
646 		queue_work(sc->workqueue, &sc->idle.immediate_work);
647 	}
648 }
649 
650 /* Called from softirq, when recv is done */
651 static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
652 {
653 	struct smbdirect_data_transfer *data_transfer;
654 	struct smbdirect_recv_io *response =
655 		container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
656 	struct smbdirect_socket *sc = response->socket;
657 	struct smbdirect_socket_parameters *sp = &sc->parameters;
658 	u16 old_recv_credit_target;
659 	u32 data_offset = 0;
660 	u32 data_length = 0;
661 	u32 remaining_data_length = 0;
662 	bool negotiate_done = false;
663 
664 	log_rdma_recv(INFO,
665 		      "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n",
666 		      response, sc->recv_io.expected,
667 		      ib_wc_status_msg(wc->status), wc->opcode,
668 		      wc->byte_len, wc->pkey_index);
669 
670 	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
671 		if (wc->status != IB_WC_WR_FLUSH_ERR)
672 			log_rdma_recv(ERR, "wc->status=%s opcode=%d\n",
673 				ib_wc_status_msg(wc->status), wc->opcode);
674 		goto error;
675 	}
676 
677 	ib_dma_sync_single_for_cpu(
678 		wc->qp->device,
679 		response->sge.addr,
680 		response->sge.length,
681 		DMA_FROM_DEVICE);
682 
683 	/*
684 	 * Reset timer to the keepalive interval in
685 	 * order to trigger our next keepalive message.
686 	 */
687 	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
688 	mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
689 			 msecs_to_jiffies(sp->keepalive_interval_msec));
690 
691 	switch (sc->recv_io.expected) {
692 	/* SMBD negotiation response */
693 	case SMBDIRECT_EXPECT_NEGOTIATE_REP:
694 		dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response));
695 		sc->recv_io.reassembly.full_packet_received = true;
696 		negotiate_done =
697 			process_negotiation_response(response, wc->byte_len);
698 		put_receive_buffer(sc, response);
699 		WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING);
700 		if (!negotiate_done) {
701 			sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
702 			smbd_disconnect_rdma_connection(sc);
703 		} else {
704 			sc->status = SMBDIRECT_SOCKET_CONNECTED;
705 			wake_up(&sc->status_wait);
706 		}
707 
708 		return;
709 
710 	/* SMBD data transfer packet */
711 	case SMBDIRECT_EXPECT_DATA_TRANSFER:
712 		data_transfer = smbdirect_recv_io_payload(response);
713 
714 		if (wc->byte_len <
715 		    offsetof(struct smbdirect_data_transfer, padding))
716 			goto error;
717 
718 		remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
719 		data_offset = le32_to_cpu(data_transfer->data_offset);
720 		data_length = le32_to_cpu(data_transfer->data_length);
721 		if (wc->byte_len < data_offset ||
722 		    (u64)wc->byte_len < (u64)data_offset + data_length)
723 			goto error;
724 
725 		if (remaining_data_length > sp->max_fragmented_recv_size ||
726 		    data_length > sp->max_fragmented_recv_size ||
727 		    (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)
728 			goto error;
729 
730 		if (data_length) {
731 			if (sc->recv_io.reassembly.full_packet_received)
732 				response->first_segment = true;
733 
734 			if (le32_to_cpu(data_transfer->remaining_data_length))
735 				sc->recv_io.reassembly.full_packet_received = false;
736 			else
737 				sc->recv_io.reassembly.full_packet_received = true;
738 		}
739 
740 		atomic_dec(&sc->recv_io.posted.count);
741 		atomic_dec(&sc->recv_io.credits.count);
742 		old_recv_credit_target = sc->recv_io.credits.target;
743 		sc->recv_io.credits.target =
744 			le16_to_cpu(data_transfer->credits_requested);
745 		sc->recv_io.credits.target =
746 			min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
747 		sc->recv_io.credits.target =
748 			max_t(u16, sc->recv_io.credits.target, 1);
749 		if (le16_to_cpu(data_transfer->credits_granted)) {
750 			atomic_add(le16_to_cpu(data_transfer->credits_granted),
751 				&sc->send_io.credits.count);
752 			/*
753 			 * We have new send credits granted from remote peer
754 			 * If any sender is waiting for credits, unblock it
755 			 */
756 			wake_up(&sc->send_io.credits.wait_queue);
757 		}
758 
759 		log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n",
760 			     le16_to_cpu(data_transfer->flags),
761 			     le32_to_cpu(data_transfer->data_offset),
762 			     le32_to_cpu(data_transfer->data_length),
763 			     le32_to_cpu(data_transfer->remaining_data_length));
764 
765 		/* Send an immediate response right away if requested */
766 		if (le16_to_cpu(data_transfer->flags) &
767 				SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
768 			log_keep_alive(INFO, "schedule send of immediate response\n");
769 			queue_work(sc->workqueue, &sc->idle.immediate_work);
770 		}
771 
772 		/*
773 	 * If this is a packet with a data payload, place the data in the
774 		 * reassembly queue and wake up the reading thread
775 		 */
776 		if (data_length) {
777 			if (sc->recv_io.credits.target > old_recv_credit_target)
778 				queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
779 
780 			enqueue_reassembly(sc, response, data_length);
781 			wake_up(&sc->recv_io.reassembly.wait_queue);
782 		} else
783 			put_receive_buffer(sc, response);
784 
785 		return;
786 
787 	case SMBDIRECT_EXPECT_NEGOTIATE_REQ:
788 		/* Only server... */
789 		break;
790 	}
791 
792 	/*
793 	 * This is an internal error!
794 	 */
795 	log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected);
796 	WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
797 error:
798 	put_receive_buffer(sc, response);
799 	smbd_disconnect_rdma_connection(sc);
800 }
801 
802 static struct rdma_cm_id *smbd_create_id(
803 		struct smbdirect_socket *sc,
804 		struct sockaddr *dstaddr, int port)
805 {
806 	struct smbdirect_socket_parameters *sp = &sc->parameters;
807 	struct rdma_cm_id *id;
808 	int rc;
809 	__be16 *sport;
810 
811 	id = rdma_create_id(&init_net, smbd_conn_upcall, sc,
812 		RDMA_PS_TCP, IB_QPT_RC);
813 	if (IS_ERR(id)) {
814 		rc = PTR_ERR(id);
815 		log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
816 		return id;
817 	}
818 
819 	if (dstaddr->sa_family == AF_INET6)
820 		sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
821 	else
822 		sport = &((struct sockaddr_in *)dstaddr)->sin_port;
823 
824 	*sport = htons(port);
825 
826 	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED);
827 	sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING;
828 	rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
829 		sp->resolve_addr_timeout_msec);
830 	if (rc) {
831 		log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
832 		goto out;
833 	}
834 	rc = wait_event_interruptible_timeout(
835 		sc->status_wait,
836 		sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
837 		msecs_to_jiffies(sp->resolve_addr_timeout_msec));
838 	/* e.g. if interrupted returns -ERESTARTSYS */
839 	if (rc < 0) {
840 		log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
841 		goto out;
842 	}
843 	if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) {
844 		rc = -ETIMEDOUT;
845 		log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
846 		goto out;
847 	}
848 	if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) {
849 		rc = -EHOSTUNREACH;
850 		log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
851 		goto out;
852 	}
853 
854 	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED);
855 	sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING;
856 	rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec);
857 	if (rc) {
858 		log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
859 		goto out;
860 	}
861 	rc = wait_event_interruptible_timeout(
862 		sc->status_wait,
863 		sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING,
864 		msecs_to_jiffies(sp->resolve_route_timeout_msec));
865 	/* e.g. if interrupted returns -ERESTARTSYS */
866 	if (rc < 0)  {
867 		log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
868 		goto out;
869 	}
870 	if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) {
871 		rc = -ETIMEDOUT;
872 		log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
873 		goto out;
874 	}
875 	if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) {
876 		rc = -ENETUNREACH;
877 		log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
878 		goto out;
879 	}
880 
881 	return id;
882 
883 out:
884 	rdma_destroy_id(id);
885 	return ERR_PTR(rc);
886 }
887 
888 /*
889  * Test if FRWR (Fast Registration Work Requests) is supported on the device
890  * This implementation requires FRWR on RDMA read/write
891  * return value: true if it is supported
892  */
893 static bool frwr_is_supported(struct ib_device_attr *attrs)
894 {
895 	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
896 		return false;
897 	if (attrs->max_fast_reg_page_list_len == 0)
898 		return false;
899 	return true;
900 }
901 
902 static int smbd_ia_open(
903 		struct smbdirect_socket *sc,
904 		struct sockaddr *dstaddr, int port)
905 {
906 	struct smbdirect_socket_parameters *sp = &sc->parameters;
907 	int rc;
908 
909 	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
910 	sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED;
911 
912 	sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port);
913 	if (IS_ERR(sc->rdma.cm_id)) {
914 		rc = PTR_ERR(sc->rdma.cm_id);
915 		goto out1;
916 	}
917 	sc->ib.dev = sc->rdma.cm_id->device;
918 
919 	if (!frwr_is_supported(&sc->ib.dev->attrs)) {
920 		log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n");
921 		log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
922 			       sc->ib.dev->attrs.device_cap_flags,
923 			       sc->ib.dev->attrs.max_fast_reg_page_list_len);
924 		rc = -EPROTONOSUPPORT;
925 		goto out2;
926 	}
927 	sp->max_frmr_depth = min_t(u32,
928 		sp->max_frmr_depth,
929 		sc->ib.dev->attrs.max_fast_reg_page_list_len);
930 	sc->mr_io.type = IB_MR_TYPE_MEM_REG;
931 	if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
932 		sc->mr_io.type = IB_MR_TYPE_SG_GAPS;
933 
934 	return 0;
935 
936 out2:
937 	rdma_destroy_id(sc->rdma.cm_id);
938 	sc->rdma.cm_id = NULL;
939 
940 out1:
941 	return rc;
942 }
943 
944 /*
945  * Send a negotiation request message to the peer
946  * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
947  * After negotiation, the transport is connected and ready for
948  * carrying upper layer SMB payload
949  */
950 static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
951 {
952 	struct smbdirect_socket_parameters *sp = &sc->parameters;
953 	struct ib_send_wr send_wr;
954 	int rc = -ENOMEM;
955 	struct smbdirect_send_io *request;
956 	struct smbdirect_negotiate_req *packet;
957 
958 	request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
959 	if (!request)
960 		return rc;
961 
962 	request->socket = sc;
963 
964 	packet = smbdirect_send_io_payload(request);
965 	packet->min_version = cpu_to_le16(SMBDIRECT_V1);
966 	packet->max_version = cpu_to_le16(SMBDIRECT_V1);
967 	packet->reserved = 0;
968 	packet->credits_requested = cpu_to_le16(sp->send_credit_target);
969 	packet->preferred_send_size = cpu_to_le32(sp->max_send_size);
970 	packet->max_receive_size = cpu_to_le32(sp->max_recv_size);
971 	packet->max_fragmented_size =
972 		cpu_to_le32(sp->max_fragmented_recv_size);
973 
974 	request->num_sge = 1;
975 	request->sge[0].addr = ib_dma_map_single(
976 				sc->ib.dev, (void *)packet,
977 				sizeof(*packet), DMA_TO_DEVICE);
978 	if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
979 		rc = -EIO;
980 		goto dma_mapping_failed;
981 	}
982 
983 	request->sge[0].length = sizeof(*packet);
984 	request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
985 
986 	ib_dma_sync_single_for_device(
987 		sc->ib.dev, request->sge[0].addr,
988 		request->sge[0].length, DMA_TO_DEVICE);
989 
990 	request->cqe.done = send_done;
991 
992 	send_wr.next = NULL;
993 	send_wr.wr_cqe = &request->cqe;
994 	send_wr.sg_list = request->sge;
995 	send_wr.num_sge = request->num_sge;
996 	send_wr.opcode = IB_WR_SEND;
997 	send_wr.send_flags = IB_SEND_SIGNALED;
998 
999 	log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n",
1000 		request->sge[0].addr,
1001 		request->sge[0].length, request->sge[0].lkey);
1002 
1003 	atomic_inc(&sc->send_io.pending.count);
1004 	rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
1005 	if (!rc)
1006 		return 0;
1007 
1008 	/* if we reach here, post send failed */
1009 	log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
1010 	atomic_dec(&sc->send_io.pending.count);
1011 	ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr,
1012 		request->sge[0].length, DMA_TO_DEVICE);
1013 
1014 	smbd_disconnect_rdma_connection(sc);
1015 
1016 dma_mapping_failed:
1017 	mempool_free(request, sc->send_io.mem.pool);
1018 	return rc;
1019 }
1020 
1021 /*
1022  * Extend the credits to remote peer
1023  * This implements [MS-SMBD] 3.1.5.9
1024  * The idea is that we should extend credits to remote peer as quickly as
1025  * it is allowed, to maintain data flow. We allocate as many receive
1026  * buffers as possible, and extend the receive credits to the remote peer
1027  * return value: the new credits being granted.
1028  */
1029 static int manage_credits_prior_sending(struct smbdirect_socket *sc)
1030 {
1031 	int new_credits;
1032 
1033 	if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
1034 		return 0;
1035 
1036 	new_credits = atomic_read(&sc->recv_io.posted.count);
1037 	if (new_credits == 0)
1038 		return 0;
1039 
1040 	new_credits -= atomic_read(&sc->recv_io.credits.count);
1041 	if (new_credits <= 0)
1042 		return 0;
1043 
1044 	return new_credits;
1045 }
1046 
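/*
 * Editor's worked example (illustrative, not part of the original source):
 * with recv_io.credits.target = 10, recv_io.credits.count = 4 and
 * recv_io.posted.count = 9, the function above returns 9 - 4 = 5, i.e.
 * five more receive credits can be granted to the peer; the caller
 * (smbd_post_send_iter() below) puts that value into credits_granted.
 */
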
1047 /*
1048  * Check if we need to send a KEEP_ALIVE message
1049  * The idle connection timer triggers a KEEP_ALIVE message when it expires.
1050  * SMBDIRECT_FLAG_RESPONSE_REQUESTED is set in the message flags to have the
1051  * peer send back a response.
1052  * return value:
1053  * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set
1054  * 0: otherwise
1055  */
1056 static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
1057 {
1058 	struct smbdirect_socket_parameters *sp = &sc->parameters;
1059 
1060 	if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
1061 		sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
1062 		/*
1063 		 * Now use the keepalive timeout (instead of keepalive interval)
1064 		 * in order to wait for a response
1065 		 */
1066 		mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
1067 				 msecs_to_jiffies(sp->keepalive_timeout_msec));
1068 		return 1;
1069 	}
1070 	return 0;
1071 }
1072 
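/*
 * Editor's summary (illustrative, based only on the code in this excerpt):
 * idle.keepalive moves NONE -> PENDING in idle_connection_timer() when the
 * keepalive interval expires, PENDING -> SENT here when the next outgoing
 * message carries SMBDIRECT_FLAG_RESPONSE_REQUESTED, and back to NONE in
 * recv_done() whenever a message arrives from the peer.
 */
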
1073 /* Post the send request */
1074 static int smbd_post_send(struct smbdirect_socket *sc,
1075 		struct smbdirect_send_io *request)
1076 {
1077 	struct ib_send_wr send_wr;
1078 	int rc, i;
1079 
1080 	for (i = 0; i < request->num_sge; i++) {
1081 		log_rdma_send(INFO,
1082 			"rdma_request sge[%d] addr=0x%llx length=%u\n",
1083 			i, request->sge[i].addr, request->sge[i].length);
1084 		ib_dma_sync_single_for_device(
1085 			sc->ib.dev,
1086 			request->sge[i].addr,
1087 			request->sge[i].length,
1088 			DMA_TO_DEVICE);
1089 	}
1090 
1091 	request->cqe.done = send_done;
1092 
1093 	send_wr.next = NULL;
1094 	send_wr.wr_cqe = &request->cqe;
1095 	send_wr.sg_list = request->sge;
1096 	send_wr.num_sge = request->num_sge;
1097 	send_wr.opcode = IB_WR_SEND;
1098 	send_wr.send_flags = IB_SEND_SIGNALED;
1099 
1100 	rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
1101 	if (rc) {
1102 		log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
1103 		smbd_disconnect_rdma_connection(sc);
1104 		rc = -EAGAIN;
1105 	}
1106 
1107 	return rc;
1108 }
1109 
1110 static int smbd_post_send_iter(struct smbdirect_socket *sc,
1111 			       struct iov_iter *iter,
1112 			       int *_remaining_data_length)
1113 {
1114 	struct smbdirect_socket_parameters *sp = &sc->parameters;
1115 	int i, rc;
1116 	int header_length;
1117 	int data_length;
1118 	struct smbdirect_send_io *request;
1119 	struct smbdirect_data_transfer *packet;
1120 	int new_credits = 0;
1121 
1122 wait_lcredit:
1123 	/* Wait for local send credits */
1124 	rc = wait_event_interruptible(sc->send_io.lcredits.wait_queue,
1125 		atomic_read(&sc->send_io.lcredits.count) > 0 ||
1126 		sc->status != SMBDIRECT_SOCKET_CONNECTED);
1127 	if (rc)
1128 		goto err_wait_lcredit;
1129 
1130 	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
1131 		log_outgoing(ERR, "disconnected not sending on wait_credit\n");
1132 		rc = -EAGAIN;
1133 		goto err_wait_lcredit;
1134 	}
1135 	if (unlikely(atomic_dec_return(&sc->send_io.lcredits.count) < 0)) {
1136 		atomic_inc(&sc->send_io.lcredits.count);
1137 		goto wait_lcredit;
1138 	}
1139 
1140 wait_credit:
1141 	/* Wait for send credits. A SMBD packet needs one credit */
1142 	rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
1143 		atomic_read(&sc->send_io.credits.count) > 0 ||
1144 		sc->status != SMBDIRECT_SOCKET_CONNECTED);
1145 	if (rc)
1146 		goto err_wait_credit;
1147 
1148 	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
1149 		log_outgoing(ERR, "disconnected not sending on wait_credit\n");
1150 		rc = -EAGAIN;
1151 		goto err_wait_credit;
1152 	}
1153 	if (unlikely(atomic_dec_return(&sc->send_io.credits.count) < 0)) {
1154 		atomic_inc(&sc->send_io.credits.count);
1155 		goto wait_credit;
1156 	}
1157 
1158 	request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
1159 	if (!request) {
1160 		rc = -ENOMEM;
1161 		goto err_alloc;
1162 	}
1163 
1164 	request->socket = sc;
1165 	memset(request->sge, 0, sizeof(request->sge));
1166 
1167 	/* Map the packet to DMA */
1168 	header_length = sizeof(struct smbdirect_data_transfer);
1169 	/* If this is a packet without payload, don't send padding */
1170 	if (!iter)
1171 		header_length = offsetof(struct smbdirect_data_transfer, padding);
1172 
1173 	packet = smbdirect_send_io_payload(request);
1174 	request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
1175 						 (void *)packet,
1176 						 header_length,
1177 						 DMA_TO_DEVICE);
1178 	if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
1179 		rc = -EIO;
1180 		goto err_dma;
1181 	}
1182 
1183 	request->sge[0].length = header_length;
1184 	request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
1185 	request->num_sge = 1;
1186 
1187 	/* Fill in the data payload to find out how much data we can add */
1188 	if (iter) {
1189 		struct smb_extract_to_rdma extract = {
1190 			.nr_sge		= request->num_sge,
1191 			.max_sge	= SMBDIRECT_SEND_IO_MAX_SGE,
1192 			.sge		= request->sge,
1193 			.device		= sc->ib.dev,
1194 			.local_dma_lkey	= sc->ib.pd->local_dma_lkey,
1195 			.direction	= DMA_TO_DEVICE,
1196 		};
1197 		size_t payload_len = umin(*_remaining_data_length,
1198 					  sp->max_send_size - sizeof(*packet));
1199 
1200 		rc = smb_extract_iter_to_rdma(iter, payload_len,
1201 					      &extract);
1202 		if (rc < 0)
1203 			goto err_dma;
1204 		data_length = rc;
1205 		request->num_sge = extract.nr_sge;
1206 		*_remaining_data_length -= data_length;
1207 	} else {
1208 		data_length = 0;
1209 	}
1210 
1211 	/* Fill in the packet header */
1212 	packet->credits_requested = cpu_to_le16(sp->send_credit_target);
1213 
1214 	new_credits = manage_credits_prior_sending(sc);
1215 	atomic_add(new_credits, &sc->recv_io.credits.count);
1216 	packet->credits_granted = cpu_to_le16(new_credits);
1217 
1218 	packet->flags = 0;
1219 	if (manage_keep_alive_before_sending(sc))
1220 		packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
1221 
1222 	packet->reserved = 0;
1223 	if (!data_length)
1224 		packet->data_offset = 0;
1225 	else
1226 		packet->data_offset = cpu_to_le32(24);
1227 	packet->data_length = cpu_to_le32(data_length);
1228 	packet->remaining_data_length = cpu_to_le32(*_remaining_data_length);
1229 	packet->padding = 0;
1230 
1231 	log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
1232 		     le16_to_cpu(packet->credits_requested),
1233 		     le16_to_cpu(packet->credits_granted),
1234 		     le32_to_cpu(packet->data_offset),
1235 		     le32_to_cpu(packet->data_length),
1236 		     le32_to_cpu(packet->remaining_data_length));
1237 
1238 	/*
1239 	 * Now that we have a local and a remote credit,
1240 	 * count this request as pending
1241 	 */
1242 	atomic_inc(&sc->send_io.pending.count);
1243 
1244 	rc = smbd_post_send(sc, request);
1245 	if (!rc)
1246 		return 0;
1247 
1248 	if (atomic_dec_and_test(&sc->send_io.pending.count))
1249 		wake_up(&sc->send_io.pending.zero_wait_queue);
1250 
1251 	wake_up(&sc->send_io.pending.dec_wait_queue);
1252 
1253 err_dma:
1254 	for (i = 0; i < request->num_sge; i++)
1255 		if (request->sge[i].addr)
1256 			ib_dma_unmap_single(sc->ib.dev,
1257 					    request->sge[i].addr,
1258 					    request->sge[i].length,
1259 					    DMA_TO_DEVICE);
1260 	mempool_free(request, sc->send_io.mem.pool);
1261 
1262 	/* roll back the granted receive credits */
1263 	atomic_sub(new_credits, &sc->recv_io.credits.count);
1264 
1265 err_alloc:
1266 	atomic_inc(&sc->send_io.credits.count);
1267 	wake_up(&sc->send_io.credits.wait_queue);
1268 
1269 err_wait_credit:
1270 	atomic_inc(&sc->send_io.lcredits.count);
1271 	wake_up(&sc->send_io.lcredits.wait_queue);
1272 
1273 err_wait_lcredit:
1274 	return rc;
1275 }
1276 
1277 /*
1278  * Send an empty message
1279  * An empty message is used to extend credits to the peer for keepalive
1280  * while there is no upper-layer payload to send at the time
1281  */
1282 static int smbd_post_send_empty(struct smbdirect_socket *sc)
1283 {
1284 	int remaining_data_length = 0;
1285 
1286 	sc->statistics.send_empty++;
1287 	return smbd_post_send_iter(sc, NULL, &remaining_data_length);
1288 }
1289 
1290 static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
1291 				    struct iov_iter *iter,
1292 				    int *_remaining_data_length)
1293 {
1294 	int rc = 0;
1295 
1296 	/*
1297 	 * smbd_post_send_iter() respects the
1298 	 * negotiated max_send_size, so we need to
1299 	 * loop until the full iter is posted
1300 	 */
1301 
1302 	while (iov_iter_count(iter) > 0) {
1303 		rc = smbd_post_send_iter(sc, iter, _remaining_data_length);
1304 		if (rc < 0)
1305 			break;
1306 	}
1307 
1308 	return rc;
1309 }
1310 
1311 /*
1312  * Post a receive request to the transport
1313  * The remote peer can only send data when a receive request is posted
1314  * The interaction is controlled by the send/receive credit system
1315  */
1316 static int smbd_post_recv(
1317 		struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
1318 {
1319 	struct smbdirect_socket_parameters *sp = &sc->parameters;
1320 	struct ib_recv_wr recv_wr;
1321 	int rc = -EIO;
1322 
1323 	response->sge.addr = ib_dma_map_single(
1324 				sc->ib.dev, response->packet,
1325 				sp->max_recv_size, DMA_FROM_DEVICE);
1326 	if (ib_dma_mapping_error(sc->ib.dev, response->sge.addr))
1327 		return rc;
1328 
1329 	response->sge.length = sp->max_recv_size;
1330 	response->sge.lkey = sc->ib.pd->local_dma_lkey;
1331 
1332 	response->cqe.done = recv_done;
1333 
1334 	recv_wr.wr_cqe = &response->cqe;
1335 	recv_wr.next = NULL;
1336 	recv_wr.sg_list = &response->sge;
1337 	recv_wr.num_sge = 1;
1338 
1339 	rc = ib_post_recv(sc->ib.qp, &recv_wr, NULL);
1340 	if (rc) {
1341 		ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
1342 				    response->sge.length, DMA_FROM_DEVICE);
1343 		response->sge.length = 0;
1344 		smbd_disconnect_rdma_connection(sc);
1345 		log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
1346 	}
1347 
1348 	return rc;
1349 }
1350 
1351 /* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
1352 static int smbd_negotiate(struct smbdirect_socket *sc)
1353 {
1354 	struct smbdirect_socket_parameters *sp = &sc->parameters;
1355 	int rc;
1356 	struct smbdirect_recv_io *response = get_receive_buffer(sc);
1357 
1358 	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED);
1359 	sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
1360 
1361 	sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP;
1362 	rc = smbd_post_recv(sc, response);
1363 	log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
1364 		       rc, response->sge.addr,
1365 		       response->sge.length, response->sge.lkey);
1366 	if (rc) {
1367 		put_receive_buffer(sc, response);
1368 		return rc;
1369 	}
1370 
1371 	rc = smbd_post_send_negotiate_req(sc);
1372 	if (rc)
1373 		return rc;
1374 
1375 	rc = wait_event_interruptible_timeout(
1376 		sc->status_wait,
1377 		sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
1378 		msecs_to_jiffies(sp->negotiate_timeout_msec));
1379 	log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc);
1380 
1381 	if (sc->status == SMBDIRECT_SOCKET_CONNECTED)
1382 		return 0;
1383 
1384 	if (rc == 0)
1385 		rc = -ETIMEDOUT;
1386 	else if (rc == -ERESTARTSYS)
1387 		rc = -EINTR;
1388 	else
1389 		rc = -ENOTCONN;
1390 
1391 	return rc;
1392 }
1393 
1394 /*
1395  * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
1396  * This is a queue for reassembling upper layer payload and presenting it to the
1397  * upper layer. All incoming payloads go to the reassembly queue, regardless of
1398  * whether reassembly is required. The upper layer code reads from the queue for all
1399  * incoming payloads.
1400  * Put a received packet to the reassembly queue
1401  * response: the packet received
1402  * data_length: the size of payload in this packet
1403  */
1404 static void enqueue_reassembly(
1405 	struct smbdirect_socket *sc,
1406 	struct smbdirect_recv_io *response,
1407 	int data_length)
1408 {
1409 	unsigned long flags;
1410 
1411 	spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
1412 	list_add_tail(&response->list, &sc->recv_io.reassembly.list);
1413 	sc->recv_io.reassembly.queue_length++;
1414 	/*
1415 	 * Make sure reassembly_data_length is updated after list and
1416 	 * reassembly_queue_length are updated. On the dequeue side
1417 	 * reassembly_data_length is checked without a lock to determine
1418 	 * if reassembly_queue_length and list is up to date
1419 	 * if reassembly_queue_length and the list are up to date
1420 	virt_wmb();
1421 	sc->recv_io.reassembly.data_length += data_length;
1422 	spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
1423 	sc->statistics.enqueue_reassembly_queue++;
1424 }
1425 
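/*
 * Editor's sketch (illustrative, not part of the original source): the
 * dequeue side hinted at by the barrier comment above. The real consumer
 * lives in smbd_recv(), outside this excerpt; this minimal reader only
 * shows how the lockless check of reassembly.data_length pairs with
 * virt_rmb() before the list is trusted, and how an entry is released
 * with put_receive_buffer(). Field and helper names follow this file.
 */
static int example_reassembly_pop_one(struct smbdirect_socket *sc)
{
	struct smbdirect_recv_io *response;
	unsigned long flags;

	/* Lockless check: is any reassembled data queued at all? */
	if (sc->recv_io.reassembly.data_length == 0)
		return 0;

	/* Pairs with virt_wmb() in enqueue_reassembly() above. */
	virt_rmb();

	spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
	response = _get_first_reassembly(sc);
	if (response) {
		list_del(&response->list);
		sc->recv_io.reassembly.queue_length--;
	}
	spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);

	if (!response)
		return 0;

	/*
	 * A real reader would copy the payload out and adjust
	 * reassembly.data_length before releasing the buffer.
	 */
	put_receive_buffer(sc, response);
	return 1;
}
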
1426 /*
1427  * Get the first entry at the front of reassembly queue
1428  * Caller is responsible for locking
1429  * return value: the first entry if any, NULL if queue is empty
1430  */
1431 static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc)
1432 {
1433 	struct smbdirect_recv_io *ret = NULL;
1434 
1435 	if (!list_empty(&sc->recv_io.reassembly.list)) {
1436 		ret = list_first_entry(
1437 			&sc->recv_io.reassembly.list,
1438 			struct smbdirect_recv_io, list);
1439 	}
1440 	return ret;
1441 }
1442 
1443 /*
1444  * Get a receive buffer
1445  * For each remote send, we need to post a receive. The receive buffers are
1446  * pre-allocated in advance.
1447  * return value: the receive buffer, NULL if none is available
1448  */
1449 static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc)
1450 {
1451 	struct smbdirect_recv_io *ret = NULL;
1452 	unsigned long flags;
1453 
1454 	spin_lock_irqsave(&sc->recv_io.free.lock, flags);
1455 	if (!list_empty(&sc->recv_io.free.list)) {
1456 		ret = list_first_entry(
1457 			&sc->recv_io.free.list,
1458 			struct smbdirect_recv_io, list);
1459 		list_del(&ret->list);
1460 		sc->statistics.get_receive_buffer++;
1461 	}
1462 	spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
1463 
1464 	return ret;
1465 }
1466 
1467 /*
1468  * Return a receive buffer
1469  * Upon returning of a receive buffer, we can post new receive and extend
1470  * more receive credits to remote peer. This is done immediately after a
1471  * receive buffer is returned.
1472  */
1473 static void put_receive_buffer(
1474 	struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
1475 {
1476 	unsigned long flags;
1477 
1478 	if (likely(response->sge.length != 0)) {
1479 		ib_dma_unmap_single(sc->ib.dev,
1480 				    response->sge.addr,
1481 				    response->sge.length,
1482 				    DMA_FROM_DEVICE);
1483 		response->sge.length = 0;
1484 	}
1485 
1486 	spin_lock_irqsave(&sc->recv_io.free.lock, flags);
1487 	list_add_tail(&response->list, &sc->recv_io.free.list);
1488 	sc->statistics.put_receive_buffer++;
1489 	spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
1490 
1491 	queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
1492 }
1493 
1494 /* Preallocate all receive buffers on transport establishment */
1495 static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf)
1496 {
1497 	struct smbdirect_recv_io *response;
1498 	int i;
1499 
1500 	for (i = 0; i < num_buf; i++) {
1501 		response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL);
1502 		if (!response)
1503 			goto allocate_failed;
1504 
1505 		response->socket = sc;
1506 		response->sge.length = 0;
1507 		list_add_tail(&response->list, &sc->recv_io.free.list);
1508 	}
1509 
1510 	return 0;
1511 
1512 allocate_failed:
1513 	while (!list_empty(&sc->recv_io.free.list)) {
1514 		response = list_first_entry(
1515 				&sc->recv_io.free.list,
1516 				struct smbdirect_recv_io, list);
1517 		list_del(&response->list);
1518 
1519 		mempool_free(response, sc->recv_io.mem.pool);
1520 	}
1521 	return -ENOMEM;
1522 }
1523 
1524 static void destroy_receive_buffers(struct smbdirect_socket *sc)
1525 {
1526 	struct smbdirect_recv_io *response;
1527 
1528 	while ((response = get_receive_buffer(sc)))
1529 		mempool_free(response, sc->recv_io.mem.pool);
1530 }
1531 
1532 static void send_immediate_empty_message(struct work_struct *work)
1533 {
1534 	struct smbdirect_socket *sc =
1535 		container_of(work, struct smbdirect_socket, idle.immediate_work);
1536 
1537 	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
1538 		return;
1539 
1540 	log_keep_alive(INFO, "send an empty message\n");
1541 	smbd_post_send_empty(sc);
1542 }
1543 
1544 /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
1545 static void idle_connection_timer(struct work_struct *work)
1546 {
1547 	struct smbdirect_socket *sc =
1548 		container_of(work, struct smbdirect_socket, idle.timer_work.work);
1549 	struct smbdirect_socket_parameters *sp = &sc->parameters;
1550 
1551 	if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
1552 		log_keep_alive(ERR,
1553 			"error status sc->idle.keepalive=%d\n",
1554 			sc->idle.keepalive);
1555 		smbd_disconnect_rdma_connection(sc);
1556 		return;
1557 	}
1558 
1559 	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
1560 		return;
1561 
1562 	/*
1563 	 * Now use the keepalive timeout (instead of keepalive interval)
1564 	 * in order to wait for a response
1565 	 */
1566 	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
1567 	mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
1568 			 msecs_to_jiffies(sp->keepalive_timeout_msec));
1569 	log_keep_alive(INFO, "schedule send of empty idle message\n");
1570 	queue_work(sc->workqueue, &sc->idle.immediate_work);
1571 }
1572 
1573 /*
1574  * Destroy the transport and related RDMA and memory resources
1575  * Need to go through all the pending counters and make sure on one is using
1576  * the transport while it is destroyed
1577  */
1578 void smbd_destroy(struct TCP_Server_Info *server)
1579 {
1580 	struct smbd_connection *info = server->smbd_conn;
1581 	struct smbdirect_socket *sc;
1582 	struct smbdirect_recv_io *response;
1583 	unsigned long flags;
1584 
1585 	if (!info) {
1586 		log_rdma_event(INFO, "rdma session already destroyed\n");
1587 		return;
1588 	}
1589 	sc = &info->socket;
1590 
1591 	log_rdma_event(INFO, "cancelling and disabling disconnect_work\n");
1592 	disable_work_sync(&sc->disconnect_work);
1593 
1594 	log_rdma_event(INFO, "destroying rdma session\n");
1595 	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
1596 		smbd_disconnect_rdma_work(&sc->disconnect_work);
1597 	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) {
1598 		log_rdma_event(INFO, "wait for transport being disconnected\n");
1599 		wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
1600 		log_rdma_event(INFO, "waited for transport being disconnected\n");
1601 	}
1602 
1603 	/*
1604 	 * Wake up all waiters in all wait queues
1605 	 * in order to notice the broken connection.
1606 	 *
1607 	 * Most likely this was already called via
1608 	 * smbd_disconnect_rdma_work(), but call it again...
1609 	 */
1610 	smbd_disconnect_wake_up_all(sc);
1611 
1612 	log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n");
1613 	disable_work_sync(&sc->recv_io.posted.refill_work);
1614 
1615 	log_rdma_event(INFO, "destroying qp\n");
1616 	ib_drain_qp(sc->ib.qp);
1617 	rdma_destroy_qp(sc->rdma.cm_id);
1618 	sc->ib.qp = NULL;
1619 
1620 	log_rdma_event(INFO, "cancelling idle timer\n");
1621 	disable_delayed_work_sync(&sc->idle.timer_work);
1622 	log_rdma_event(INFO, "cancelling send immediate work\n");
1623 	disable_work_sync(&sc->idle.immediate_work);
1624 
1625 	/* It's not possible for upper layer to get to reassembly */
1626 	log_rdma_event(INFO, "drain the reassembly queue\n");
1627 	do {
1628 		spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
1629 		response = _get_first_reassembly(sc);
1630 		if (response) {
1631 			list_del(&response->list);
1632 			spin_unlock_irqrestore(
1633 				&sc->recv_io.reassembly.lock, flags);
1634 			put_receive_buffer(sc, response);
1635 		} else
1636 			spin_unlock_irqrestore(
1637 				&sc->recv_io.reassembly.lock, flags);
1638 	} while (response);
1639 	sc->recv_io.reassembly.data_length = 0;
1640 
1641 	log_rdma_event(INFO, "free receive buffers\n");
1642 	destroy_receive_buffers(sc);
1643 
1644 	log_rdma_event(INFO, "freeing mr list\n");
1645 	destroy_mr_list(sc);
1646 
1647 	ib_free_cq(sc->ib.send_cq);
1648 	ib_free_cq(sc->ib.recv_cq);
1649 	ib_dealloc_pd(sc->ib.pd);
1650 	rdma_destroy_id(sc->rdma.cm_id);
1651 
1652 	/* free mempools */
1653 	mempool_destroy(sc->send_io.mem.pool);
1654 	kmem_cache_destroy(sc->send_io.mem.cache);
1655 
1656 	mempool_destroy(sc->recv_io.mem.pool);
1657 	kmem_cache_destroy(sc->recv_io.mem.cache);
1658 
1659 	sc->status = SMBDIRECT_SOCKET_DESTROYED;
1660 
1661 	destroy_workqueue(sc->workqueue);
1662 	log_rdma_event(INFO,  "rdma session destroyed\n");
1663 	kfree(info);
1664 	server->smbd_conn = NULL;
1665 }
1666 
1667 /*
1668  * Reconnect this SMBD connection, called from upper layer
1669  * return value: 0 on success, or actual error code
1670  */
1671 int smbd_reconnect(struct TCP_Server_Info *server)
1672 {
1673 	log_rdma_event(INFO, "reconnecting rdma session\n");
1674 
1675 	if (!server->smbd_conn) {
1676 		log_rdma_event(INFO, "rdma session already destroyed\n");
1677 		goto create_conn;
1678 	}
1679 
1680 	/*
1681 	 * This is possible if transport is disconnected and we haven't received
1682 	 * notification from RDMA, but upper layer has detected timeout
1683 	 */
1684 	if (server->smbd_conn->socket.status == SMBDIRECT_SOCKET_CONNECTED) {
1685 		log_rdma_event(INFO, "disconnecting transport\n");
1686 		smbd_destroy(server);
1687 	}
1688 
1689 create_conn:
1690 	log_rdma_event(INFO, "creating rdma session\n");
1691 	server->smbd_conn = smbd_get_connection(
1692 		server, (struct sockaddr *) &server->dstaddr);
1693 
1694 	if (server->smbd_conn) {
1695 		cifs_dbg(VFS, "RDMA transport re-established\n");
1696 		trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
1697 		return 0;
1698 	}
1699 	trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
1700 	return -ENOENT;
1701 }
1702 
1703 static void destroy_caches(struct smbdirect_socket *sc)
1704 {
1705 	destroy_receive_buffers(sc);
1706 	mempool_destroy(sc->recv_io.mem.pool);
1707 	kmem_cache_destroy(sc->recv_io.mem.cache);
1708 	mempool_destroy(sc->send_io.mem.pool);
1709 	kmem_cache_destroy(sc->send_io.mem.cache);
1710 }
1711 
1712 #define MAX_NAME_LEN	80
1713 static int allocate_caches(struct smbdirect_socket *sc)
1714 {
1715 	struct smbdirect_socket_parameters *sp = &sc->parameters;
1716 	char name[MAX_NAME_LEN];
1717 	int rc;
1718 
1719 	if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer)))
1720 		return -ENOMEM;
1721 
1722 	scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc);
1723 	sc->send_io.mem.cache =
1724 		kmem_cache_create(
1725 			name,
1726 			sizeof(struct smbdirect_send_io) +
1727 				sizeof(struct smbdirect_data_transfer),
1728 			0, SLAB_HWCACHE_ALIGN, NULL);
1729 	if (!sc->send_io.mem.cache)
1730 		return -ENOMEM;
1731 
1732 	sc->send_io.mem.pool =
1733 		mempool_create(sp->send_credit_target, mempool_alloc_slab,
1734 			mempool_free_slab, sc->send_io.mem.cache);
1735 	if (!sc->send_io.mem.pool)
1736 		goto out1;
1737 
1738 	scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc);
1739 
1740 	struct kmem_cache_args response_args = {
1741 		.align		= __alignof__(struct smbdirect_recv_io),
1742 		.useroffset	= (offsetof(struct smbdirect_recv_io, packet) +
1743 				   sizeof(struct smbdirect_data_transfer)),
1744 		.usersize	= sp->max_recv_size - sizeof(struct smbdirect_data_transfer),
1745 	};
1746 	sc->recv_io.mem.cache =
1747 		kmem_cache_create(name,
1748 				  sizeof(struct smbdirect_recv_io) + sp->max_recv_size,
1749 				  &response_args, SLAB_HWCACHE_ALIGN);
1750 	if (!sc->recv_io.mem.cache)
1751 		goto out2;
1752 
1753 	sc->recv_io.mem.pool =
1754 		mempool_create(sp->recv_credit_max, mempool_alloc_slab,
1755 		       mempool_free_slab, sc->recv_io.mem.cache);
1756 	if (!sc->recv_io.mem.pool)
1757 		goto out3;
1758 
1759 	rc = allocate_receive_buffers(sc, sp->recv_credit_max);
1760 	if (rc) {
1761 		log_rdma_event(ERR, "failed to allocate receive buffers\n");
1762 		goto out4;
1763 	}
1764 
1765 	return 0;
1766 
1767 out4:
1768 	mempool_destroy(sc->recv_io.mem.pool);
1769 out3:
1770 	kmem_cache_destroy(sc->recv_io.mem.cache);
1771 out2:
1772 	mempool_destroy(sc->send_io.mem.pool);
1773 out1:
1774 	kmem_cache_destroy(sc->send_io.mem.cache);
1775 	return -ENOMEM;
1776 }
1777 
1778 /* Create a SMBD connection, called by upper layer */
1779 static struct smbd_connection *_smbd_get_connection(
1780 	struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
1781 {
1782 	int rc;
1783 	struct smbd_connection *info;
1784 	struct smbdirect_socket *sc;
1785 	struct smbdirect_socket_parameters *sp;
1786 	struct rdma_conn_param conn_param;
1787 	struct ib_qp_cap qp_cap;
1788 	struct ib_qp_init_attr qp_attr;
1789 	struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
1790 	struct ib_port_immutable port_immutable;
1791 	__be32 ird_ord_hdr[2];
1792 	char wq_name[80];
1793 	struct workqueue_struct *workqueue;
1794 
1795 	info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
1796 	if (!info)
1797 		return NULL;
1798 	sc = &info->socket;
1799 	scnprintf(wq_name, ARRAY_SIZE(wq_name), "smbd_%p", sc);
1800 	workqueue = create_workqueue(wq_name);
1801 	if (!workqueue)
1802 		goto create_wq_failed;
1803 	smbdirect_socket_init(sc);
1804 	sc->workqueue = workqueue;
1805 	sp = &sc->parameters;
1806 
1807 	INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work);
1808 
1809 	sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
1810 	sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
1811 	sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
1812 	sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000;
1813 	sp->initiator_depth = 1;
1814 	sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;
1815 	sp->recv_credit_max = smbd_receive_credit_max;
1816 	sp->send_credit_target = smbd_send_credit_target;
1817 	sp->max_send_size = smbd_max_send_size;
1818 	sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
1819 	sp->max_recv_size = smbd_max_receive_size;
1820 	sp->max_frmr_depth = smbd_max_frmr_depth;
1821 	sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
1822 	sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;
1823 
1824 	rc = smbd_ia_open(sc, dstaddr, port);
1825 	if (rc) {
1826 		log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
1827 		goto create_id_failed;
1828 	}
1829 
1830 	if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe ||
1831 	    sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
1832 		log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
1833 			       sp->send_credit_target,
1834 			       sc->ib.dev->attrs.max_cqe,
1835 			       sc->ib.dev->attrs.max_qp_wr);
1836 		goto config_failed;
1837 	}
1838 
1839 	if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe ||
1840 	    sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) {
1841 		log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
1842 			       sp->recv_credit_max,
1843 			       sc->ib.dev->attrs.max_cqe,
1844 			       sc->ib.dev->attrs.max_qp_wr);
1845 		goto config_failed;
1846 	}
1847 
1848 	if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE ||
1849 	    sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
1850 		log_rdma_event(ERR,
1851 			"device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
1852 			IB_DEVICE_NAME_MAX,
1853 			sc->ib.dev->name,
1854 			sc->ib.dev->attrs.max_send_sge,
1855 			sc->ib.dev->attrs.max_recv_sge);
1856 		goto config_failed;
1857 	}
1858 
1859 	sp->responder_resources =
1860 		min_t(u8, sp->responder_resources,
1861 		      sc->ib.dev->attrs.max_qp_rd_atom);
1862 	log_rdma_mr(INFO, "responder_resources=%d\n",
1863 		sp->responder_resources);
1864 
1865 	/*
1866 	 * We allocate sp->responder_resources * 2 MRs
1867 	 * and each MR needs WRs for REG and INV, so
1868 	 * we use '* 4'.
1869 	 *
1870 	 * +1 for ib_drain_qp()
1871 	 */
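	/*
	 * Worked example (sketch, assuming the defaults above and that the
	 * device does not clamp responder_resources): send_credit_target =
	 * 255 and responder_resources = 32 give
	 * max_send_wr = 255 + 32 * 4 + 1 = 384, and
	 * max_recv_wr = recv_credit_max + 1 = 255 + 1 = 256.
	 */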
1872 	memset(&qp_cap, 0, sizeof(qp_cap));
1873 	qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1;
1874 	qp_cap.max_recv_wr = sp->recv_credit_max + 1;
1875 	qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
1876 	qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
1877 
1878 	sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
1879 	if (IS_ERR(sc->ib.pd)) {
1880 		rc = PTR_ERR(sc->ib.pd);
1881 		sc->ib.pd = NULL;
1882 		log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
1883 		goto alloc_pd_failed;
1884 	}
1885 
1886 	sc->ib.send_cq =
1887 		ib_alloc_cq_any(sc->ib.dev, sc,
1888 				qp_cap.max_send_wr, IB_POLL_SOFTIRQ);
1889 	if (IS_ERR(sc->ib.send_cq)) {
1890 		sc->ib.send_cq = NULL;
1891 		goto alloc_cq_failed;
1892 	}
1893 
1894 	sc->ib.recv_cq =
1895 		ib_alloc_cq_any(sc->ib.dev, sc,
1896 				qp_cap.max_recv_wr, IB_POLL_SOFTIRQ);
1897 	if (IS_ERR(sc->ib.recv_cq)) {
1898 		sc->ib.recv_cq = NULL;
1899 		goto alloc_cq_failed;
1900 	}
1901 
1902 	memset(&qp_attr, 0, sizeof(qp_attr));
1903 	qp_attr.event_handler = smbd_qp_async_error_upcall;
1904 	qp_attr.qp_context = sc;
1905 	qp_attr.cap = qp_cap;
1906 	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1907 	qp_attr.qp_type = IB_QPT_RC;
1908 	qp_attr.send_cq = sc->ib.send_cq;
1909 	qp_attr.recv_cq = sc->ib.recv_cq;
1910 	qp_attr.port_num = ~0;
1911 
1912 	rc = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
1913 	if (rc) {
1914 		log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
1915 		goto create_qp_failed;
1916 	}
1917 	sc->ib.qp = sc->rdma.cm_id->qp;
1918 
1919 	memset(&conn_param, 0, sizeof(conn_param));
1920 	conn_param.initiator_depth = sp->initiator_depth;
1921 	conn_param.responder_resources = sp->responder_resources;
1922 
1923 	/* Need to send IRD/ORD in private data for iWARP */
1924 	sc->ib.dev->ops.get_port_immutable(
1925 		sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable);
1926 	if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
1927 		ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
1928 		ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
1929 		conn_param.private_data = ird_ord_hdr;
1930 		conn_param.private_data_len = sizeof(ird_ord_hdr);
1931 	} else {
1932 		conn_param.private_data = NULL;
1933 		conn_param.private_data_len = 0;
1934 	}
1935 
1936 	conn_param.retry_count = SMBD_CM_RETRY;
1937 	conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
1938 	conn_param.flow_control = 0;
1939 
1940 	log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
1941 		&addr_in->sin_addr, port);
1942 
1943 	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
1944 	sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
1945 	rc = rdma_connect(sc->rdma.cm_id, &conn_param);
1946 	if (rc) {
1947 		log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
1948 		goto rdma_connect_failed;
1949 	}
1950 
1951 	wait_event_interruptible_timeout(
1952 		sc->status_wait,
1953 		sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING,
1954 		msecs_to_jiffies(sp->rdma_connect_timeout_msec));
1955 
1956 	if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) {
1957 		log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
1958 		goto rdma_connect_failed;
1959 	}
1960 
1961 	log_rdma_event(INFO, "rdma_connect connected\n");
1962 
1963 	rc = allocate_caches(sc);
1964 	if (rc) {
1965 		log_rdma_event(ERR, "cache allocation failed\n");
1966 		goto allocate_cache_failed;
1967 	}
1968 
1969 	INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message);
1970 	INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer);
1971 	/*
1972 	 * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
1973 	 * so that the timer will cause a disconnect.
1974 	 */
1975 	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
1976 	mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
1977 			 msecs_to_jiffies(sp->negotiate_timeout_msec));
1978 
1979 	INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits);
1980 
1981 	rc = smbd_negotiate(sc);
1982 	if (rc) {
1983 		log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
1984 		goto negotiation_failed;
1985 	}
1986 
1987 	rc = allocate_mr_list(sc);
1988 	if (rc) {
1989 		log_rdma_mr(ERR, "memory registration allocation failed\n");
1990 		goto allocate_mr_failed;
1991 	}
1992 
1993 	return info;
1994 
1995 allocate_mr_failed:
1996 	/* At this point, we need a full transport shutdown */
1997 	server->smbd_conn = info;
1998 	smbd_destroy(server);
1999 	return NULL;
2000 
2001 negotiation_failed:
2002 	disable_delayed_work_sync(&sc->idle.timer_work);
2003 	destroy_caches(sc);
2004 	sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
2005 	rdma_disconnect(sc->rdma.cm_id);
2006 	wait_event(sc->status_wait,
2007 		sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
2008 
2009 allocate_cache_failed:
2010 rdma_connect_failed:
2011 	rdma_destroy_qp(sc->rdma.cm_id);
2012 
2013 create_qp_failed:
2014 alloc_cq_failed:
2015 	if (sc->ib.send_cq)
2016 		ib_free_cq(sc->ib.send_cq);
2017 	if (sc->ib.recv_cq)
2018 		ib_free_cq(sc->ib.recv_cq);
2019 
2020 	ib_dealloc_pd(sc->ib.pd);
2021 
2022 alloc_pd_failed:
2023 config_failed:
2024 	rdma_destroy_id(sc->rdma.cm_id);
2025 
2026 create_id_failed:
2027 	destroy_workqueue(sc->workqueue);
2028 create_wq_failed:
2029 	kfree(info);
2030 	return NULL;
2031 }
2032 
2033 struct smbd_connection *smbd_get_connection(
2034 	struct TCP_Server_Info *server, struct sockaddr *dstaddr)
2035 {
2036 	struct smbd_connection *ret;
2037 	const struct smbdirect_socket_parameters *sp;
2038 	int port = SMBD_PORT;
2039 
2040 try_again:
2041 	ret = _smbd_get_connection(server, dstaddr, port);
2042 
2043 	/* Try SMB_PORT if SMBD_PORT doesn't work */
2044 	if (!ret && port == SMBD_PORT) {
2045 		port = SMB_PORT;
2046 		goto try_again;
2047 	}
2048 	if (!ret)
2049 		return NULL;
2050 
2051 	sp = &ret->socket.parameters;
2052 
2053 	server->rdma_readwrite_threshold =
2054 		rdma_readwrite_threshold > sp->max_fragmented_send_size ?
2055 		sp->max_fragmented_send_size :
2056 		rdma_readwrite_threshold;
2057 
2058 	return ret;
2059 }
2060 
2061 /*
2062  * Receive data from the transport's receive reassembly queue
2063  * All the incoming data packets are placed in reassembly queue
2064  * iter: the buffer to read data into
2065  * size: the length of data to read
2066  * return value: actual data read
2067  *
2068  * Note: this implementation copies the data from the reassembly queue to the
2069  * receive buffers used by the upper layer. This is not the optimal code path.
2070  * A better way would be to have the upper layer borrow the buffer from the
2071  * reassembly queue instead of allocating its own, and return it after the
2072  * data is consumed. But this would require more changes to upper layer code,
2073  * and it also has to handle packet boundaries while messages are still being reassembled.
2074  */
2075 int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
2076 {
2077 	struct smbdirect_socket *sc = &info->socket;
2078 	struct smbdirect_recv_io *response;
2079 	struct smbdirect_data_transfer *data_transfer;
2080 	size_t size = iov_iter_count(&msg->msg_iter);
2081 	int to_copy, to_read, data_read, offset;
2082 	u32 data_length, remaining_data_length, data_offset;
2083 	int rc;
2084 
2085 	if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE))
2086 		return -EINVAL; /* It's a bug in upper layer to get there */
2087 
2088 again:
2089 	/*
2090 	 * No need to hold the reassembly queue lock all the time as we are
2091 	 * the only one reading from the front of the queue. The transport
2092 	 * may add more entries to the back of the queue at the same time
2093 	 */
2094 	log_read(INFO, "size=%zd sc->recv_io.reassembly.data_length=%d\n", size,
2095 		sc->recv_io.reassembly.data_length);
2096 	if (sc->recv_io.reassembly.data_length >= size) {
2097 		int queue_length;
2098 		int queue_removed = 0;
2099 		unsigned long flags;
2100 
2101 		/*
2102 		 * Need to make sure reassembly_data_length is read before
2103 		 * reading reassembly_queue_length and calling
2104 		 * _get_first_reassembly. This call is lock free
2105 		 * as we never read at the end of the queue, which is being
2106 		 * updated in SOFTIRQ as more data is received
2107 		 */
2108 		virt_rmb();
2109 		queue_length = sc->recv_io.reassembly.queue_length;
2110 		data_read = 0;
2111 		to_read = size;
2112 		offset = sc->recv_io.reassembly.first_entry_offset;
2113 		while (data_read < size) {
2114 			response = _get_first_reassembly(sc);
2115 			data_transfer = smbdirect_recv_io_payload(response);
2116 			data_length = le32_to_cpu(data_transfer->data_length);
2117 			remaining_data_length =
2118 				le32_to_cpu(
2119 					data_transfer->remaining_data_length);
2120 			data_offset = le32_to_cpu(data_transfer->data_offset);
2121 
2122 			/*
2123 			 * The upper layer expects RFC1002 length at the
2124 			 * beginning of the payload. Return it to indicate
2125 			 * the total length of the packet. This minimizes the
2126 			 * change to upper layer packet processing logic. This
2127 			 * will eventually be removed when an intermediate
2128 			 * transport layer is added
2129 			 */
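			/*
			 * Worked example (sketch): for a 64 KiB upper-layer
			 * PDU split across SMBDirect fragments, the first
			 * segment might carry data_length = 1364 and
			 * remaining_data_length = 64172, so the synthesized
			 * RFC1002 length is 1364 + 64172 = 65536.
			 */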
2130 			if (response->first_segment && size == 4) {
2131 				unsigned int rfc1002_len =
2132 					data_length + remaining_data_length;
2133 				__be32 rfc1002_hdr = cpu_to_be32(rfc1002_len);
2134 				if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr),
2135 						 &msg->msg_iter) != sizeof(rfc1002_hdr))
2136 					return -EFAULT;
2137 				data_read = 4;
2138 				response->first_segment = false;
2139 				log_read(INFO, "returning rfc1002 length %d\n",
2140 					rfc1002_len);
2141 				goto read_rfc1002_done;
2142 			}
2143 
2144 			to_copy = min_t(int, data_length - offset, to_read);
2145 			if (copy_to_iter((char *)data_transfer + data_offset + offset,
2146 					 to_copy, &msg->msg_iter) != to_copy)
2147 				return -EFAULT;
2148 
2149 			/* move on to the next buffer? */
2150 			if (to_copy == data_length - offset) {
2151 				queue_length--;
2152 				/*
2153 				 * No need to lock if we are not at the
2154 				 * end of the queue
2155 				 */
2156 				if (queue_length)
2157 					list_del(&response->list);
2158 				else {
2159 					spin_lock_irqsave(
2160 						&sc->recv_io.reassembly.lock, flags);
2161 					list_del(&response->list);
2162 					spin_unlock_irqrestore(
2163 						&sc->recv_io.reassembly.lock, flags);
2164 				}
2165 				queue_removed++;
2166 				sc->statistics.dequeue_reassembly_queue++;
2167 				put_receive_buffer(sc, response);
2168 				offset = 0;
2169 				log_read(INFO, "put_receive_buffer offset=0\n");
2170 			} else
2171 				offset += to_copy;
2172 
2173 			to_read -= to_copy;
2174 			data_read += to_copy;
2175 
2176 			log_read(INFO, "_get_first_reassembly memcpy %d bytes data_transfer_length-offset=%d after that to_read=%d data_read=%d offset=%d\n",
2177 				 to_copy, data_length - offset,
2178 				 to_read, data_read, offset);
2179 		}
2180 
2181 		spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
2182 		sc->recv_io.reassembly.data_length -= data_read;
2183 		sc->recv_io.reassembly.queue_length -= queue_removed;
2184 		spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
2185 
2186 		sc->recv_io.reassembly.first_entry_offset = offset;
2187 		log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
2188 			 data_read, sc->recv_io.reassembly.data_length,
2189 			 sc->recv_io.reassembly.first_entry_offset);
2190 read_rfc1002_done:
2191 		return data_read;
2192 	}
2193 
2194 	log_read(INFO, "wait_event on more data\n");
2195 	rc = wait_event_interruptible(
2196 		sc->recv_io.reassembly.wait_queue,
2197 		sc->recv_io.reassembly.data_length >= size ||
2198 			sc->status != SMBDIRECT_SOCKET_CONNECTED);
2199 	/* Don't return any data if interrupted */
2200 	if (rc)
2201 		return rc;
2202 
2203 	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
2204 		log_read(ERR, "disconnected\n");
2205 		return -ECONNABORTED;
2206 	}
2207 
2208 	goto again;
2209 }
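
/*
 * Illustrative sketch (not built): the calling convention smbd_recv()
 * expects from the upper layer. The first 4-byte read returns the
 * synthesized RFC1002 length, the second read returns the payload.
 * The helper name and buffer parameters are made up.
 */
#if 0
static int example_read_one_pdu(struct smbd_connection *conn,
				void *buf, size_t buflen)
{
	struct kvec iov;
	struct msghdr msg = {};
	__be32 rfc1002_hdr;
	u32 pdu_len;
	int rc;

	/* Read the RFC1002 header synthesized from the first segment. */
	iov.iov_base = &rfc1002_hdr;
	iov.iov_len = sizeof(rfc1002_hdr);
	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, iov.iov_len);
	rc = smbd_recv(conn, &msg);
	if (rc != sizeof(rfc1002_hdr))
		return rc < 0 ? rc : -EIO;

	pdu_len = be32_to_cpu(rfc1002_hdr);
	if (pdu_len > buflen)
		return -EMSGSIZE;

	/* Now read the reassembled payload itself. */
	iov.iov_base = buf;
	iov.iov_len = pdu_len;
	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, iov.iov_len);
	return smbd_recv(conn, &msg);
}
#endif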
2210 
2211 /*
2212  * Send data to transport
2213  * Each rqst is transported as a SMBDirect payload
2214  * rqst: the data to write
2215  * return value: 0 on successful write, otherwise error code
2216  */
2217 int smbd_send(struct TCP_Server_Info *server,
2218 	int num_rqst, struct smb_rqst *rqst_array)
2219 {
2220 	struct smbd_connection *info = server->smbd_conn;
2221 	struct smbdirect_socket *sc = &info->socket;
2222 	struct smbdirect_socket_parameters *sp = &sc->parameters;
2223 	struct smb_rqst *rqst;
2224 	struct iov_iter iter;
2225 	unsigned int remaining_data_length, klen;
2226 	int rc, i, rqst_idx;
2227 
2228 	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
2229 		return -EAGAIN;
2230 
2231 	/*
2232 	 * Add in the page array if there is one. The caller needs to set
2233 	 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
2234 	 * ends at page boundary
2235 	 */
2236 	remaining_data_length = 0;
2237 	for (i = 0; i < num_rqst; i++)
2238 		remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
2239 
2240 	if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) {
2241 		/* assertion: payload never exceeds negotiated maximum */
2242 		log_write(ERR, "payload size %d > max size %d\n",
2243 			remaining_data_length, sp->max_fragmented_send_size);
2244 		return -EINVAL;
2245 	}
2246 
2247 	log_write(INFO, "num_rqst=%d total length=%u\n",
2248 			num_rqst, remaining_data_length);
2249 
2250 	rqst_idx = 0;
2251 	do {
2252 		rqst = &rqst_array[rqst_idx];
2253 
2254 		cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
2255 			 rqst_idx, smb_rqst_len(server, rqst));
2256 		for (i = 0; i < rqst->rq_nvec; i++)
2257 			dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len);
2258 
2259 		log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n",
2260 			  rqst_idx, rqst->rq_nvec, remaining_data_length,
2261 			  iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst));
2262 
2263 		/* Send the metadata pages. */
2264 		klen = 0;
2265 		for (i = 0; i < rqst->rq_nvec; i++)
2266 			klen += rqst->rq_iov[i].iov_len;
2267 		iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
2268 
2269 		rc = smbd_post_send_full_iter(sc, &iter, &remaining_data_length);
2270 		if (rc < 0)
2271 			break;
2272 
2273 		if (iov_iter_count(&rqst->rq_iter) > 0) {
2274 			/* And then the data pages if there are any */
2275 			rc = smbd_post_send_full_iter(sc, &rqst->rq_iter,
2276 						      &remaining_data_length);
2277 			if (rc < 0)
2278 				break;
2279 		}
2280 
2281 	} while (++rqst_idx < num_rqst);
2282 
2283 	/*
2284 	 * As an optimization, we don't wait for individual I/O to finish
2285 	 * before sending the next one.
2286 	 * Send them all and wait for the pending send count to get to 0,
2287 	 * which means all the I/Os have gone out and we are good to return
2288 	 */
2289 
2290 	wait_event(sc->send_io.pending.zero_wait_queue,
2291 		atomic_read(&sc->send_io.pending.count) == 0 ||
2292 		sc->status != SMBDIRECT_SOCKET_CONNECTED);
2293 
2294 	if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0)
2295 		rc = -EAGAIN;
2296 
2297 	return rc;
2298 }
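
/*
 * Illustrative sketch (not built): how a caller hands a request to
 * smbd_send(). Field names follow struct smb_rqst as used above; the
 * helper name and buffer are made up.
 */
#if 0
static int example_send_one(struct TCP_Server_Info *server,
			    void *hdr, size_t hdr_len)
{
	struct kvec iov = {
		.iov_base = hdr,
		.iov_len  = hdr_len,
	};
	struct smb_rqst rqst = {
		.rq_iov  = &iov,
		.rq_nvec = 1,
		/* .rq_iter may carry additional data pages */
	};

	/* Each rqst in the array becomes one fragmented SMBDirect payload. */
	return smbd_send(server, 1, &rqst);
}
#endif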
2299 
2300 static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
2301 {
2302 	struct smbdirect_mr_io *mr =
2303 		container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
2304 	struct smbdirect_socket *sc = mr->socket;
2305 
2306 	if (wc->status) {
2307 		log_rdma_mr(ERR, "status=%d\n", wc->status);
2308 		smbd_disconnect_rdma_connection(sc);
2309 	}
2310 }
2311 
2312 /*
2313  * The work queue function that recovers MRs
2314  * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
2315  * again. Both calls are slow, so finish them in a workqueue. This will not
2316  * block the I/O path.
2317  * There is one workqueue that recovers MRs, so there is no need to lock as the
2318  * I/O requests calling smbd_register_mr will never update the links in the
2319  * mr_list.
2320  */
2321 static void smbd_mr_recovery_work(struct work_struct *work)
2322 {
2323 	struct smbdirect_socket *sc =
2324 		container_of(work, struct smbdirect_socket, mr_io.recovery_work);
2325 	struct smbdirect_socket_parameters *sp = &sc->parameters;
2326 	struct smbdirect_mr_io *smbdirect_mr;
2327 	int rc;
2328 
2329 	list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) {
2330 		if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) {
2331 
2332 			/* recover this MR entry */
2333 			rc = ib_dereg_mr(smbdirect_mr->mr);
2334 			if (rc) {
2335 				log_rdma_mr(ERR,
2336 					"ib_dereg_mr failed rc=%x\n",
2337 					rc);
2338 				smbd_disconnect_rdma_connection(sc);
2339 				continue;
2340 			}
2341 
2342 			smbdirect_mr->mr = ib_alloc_mr(
2343 				sc->ib.pd, sc->mr_io.type,
2344 				sp->max_frmr_depth);
2345 			if (IS_ERR(smbdirect_mr->mr)) {
2346 				log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
2347 					    sc->mr_io.type,
2348 					    sp->max_frmr_depth);
2349 				smbd_disconnect_rdma_connection(sc);
2350 				continue;
2351 			}
2352 		} else
2353 			/* This MR is being used, don't recover it */
2354 			continue;
2355 
2356 		smbdirect_mr->state = SMBDIRECT_MR_READY;
2357 
2358 		/* smbdirect_mr->state is updated by this function
2359 		 * and is read and updated by I/O issuing CPUs trying
2360 		 * to get an MR; the call to atomic_inc_return
2361 		 * implies a memory barrier and guarantees this
2362 		 * value is updated before waking up any calls to
2363 		 * get_mr() from the I/O issuing CPUs
2364 		 */
2365 		if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
2366 			wake_up(&sc->mr_io.ready.wait_queue);
2367 	}
2368 }
2369 
2370 static void smbd_mr_disable_locked(struct smbdirect_mr_io *mr)
2371 {
2372 	struct smbdirect_socket *sc = mr->socket;
2373 
2374 	lockdep_assert_held(&mr->mutex);
2375 
2376 	if (mr->state == SMBDIRECT_MR_DISABLED)
2377 		return;
2378 
2379 	if (mr->mr)
2380 		ib_dereg_mr(mr->mr);
2381 	if (mr->sgt.nents)
2382 		ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2383 	kfree(mr->sgt.sgl);
2384 
2385 	mr->mr = NULL;
2386 	mr->sgt.sgl = NULL;
2387 	mr->sgt.nents = 0;
2388 
2389 	mr->state = SMBDIRECT_MR_DISABLED;
2390 }
2391 
2392 static void smbd_mr_free_locked(struct kref *kref)
2393 {
2394 	struct smbdirect_mr_io *mr =
2395 		container_of(kref, struct smbdirect_mr_io, kref);
2396 
2397 	lockdep_assert_held(&mr->mutex);
2398 
2399 	/*
2400 	 * smbd_mr_disable_locked() should already be called!
2401 	 */
2402 	if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED))
2403 		smbd_mr_disable_locked(mr);
2404 
2405 	mutex_unlock(&mr->mutex);
2406 	mutex_destroy(&mr->mutex);
2407 	kfree(mr);
2408 }
2409 
2410 static void destroy_mr_list(struct smbdirect_socket *sc)
2411 {
2412 	struct smbdirect_mr_io *mr, *tmp;
2413 	LIST_HEAD(all_list);
2414 	unsigned long flags;
2415 
2416 	disable_work_sync(&sc->mr_io.recovery_work);
2417 
2418 	spin_lock_irqsave(&sc->mr_io.all.lock, flags);
2419 	list_splice_tail_init(&sc->mr_io.all.list, &all_list);
2420 	spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
2421 
2422 	list_for_each_entry_safe(mr, tmp, &all_list, list) {
2423 		mutex_lock(&mr->mutex);
2424 
2425 		smbd_mr_disable_locked(mr);
2426 		list_del(&mr->list);
2427 		mr->socket = NULL;
2428 
2429 		/*
2430 		 * No kref_put_mutex() as it's already locked.
2431 		 *
2432 		 * If kref_put() returned 1, smbd_mr_free_locked()
2433 		 * was called, the mutex was unlocked and
2434 		 * the mr is gone.
2435 		 *
2436 		 * If kref_put() returned 0 we know that
2437 		 * smbd_mr_free_locked() didn't
2438 		 * run. Not by us nor by anyone else, as we
2439 		 * still hold the mutex, so we need to unlock.
2440 		 *
2441 		 * If the mr is still registered it will
2442 		 * be dangling (detached from the connection),
2443 		 * waiting for smbd_deregister_mr() to be
2444 		 * called in order to free the memory.
2445 		 */
2446 		if (!kref_put(&mr->kref, smbd_mr_free_locked))
2447 			mutex_unlock(&mr->mutex);
2448 	}
2449 }
2450 
2451 /*
2452  * Allocate MRs used for RDMA read/write
2453  * The number of MRs will not exceed hardware capability in responder_resources
2454  * All MRs are kept in mr_list. The MR can be recovered after it's used
2455  * Recovery is done in smbd_mr_recovery_work. The content of list entry changes
2456  * as MRs are used and recovered for I/O, but the list links will not change
2457  */
2458 static int allocate_mr_list(struct smbdirect_socket *sc)
2459 {
2460 	struct smbdirect_socket_parameters *sp = &sc->parameters;
2461 	struct smbdirect_mr_io *mr;
2462 	int ret;
2463 	u32 i;
2464 
2465 	if (sp->responder_resources == 0) {
2466 		log_rdma_mr(ERR, "responder_resources negotiated as 0\n");
2467 		return -EINVAL;
2468 	}
2469 
2470 	/* Allocate more MRs (2x) than hardware responder_resources */
2471 	for (i = 0; i < sp->responder_resources * 2; i++) {
2472 		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2473 		if (!mr) {
2474 			ret = -ENOMEM;
2475 			goto kzalloc_mr_failed;
2476 		}
2477 
2478 		kref_init(&mr->kref);
2479 		mutex_init(&mr->mutex);
2480 
2481 		mr->mr = ib_alloc_mr(sc->ib.pd,
2482 				     sc->mr_io.type,
2483 				     sp->max_frmr_depth);
2484 		if (IS_ERR(mr->mr)) {
2485 			ret = PTR_ERR(mr->mr);
2486 			log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
2487 				    sc->mr_io.type, sp->max_frmr_depth);
2488 			goto ib_alloc_mr_failed;
2489 		}
2490 
2491 		mr->sgt.sgl = kcalloc(sp->max_frmr_depth,
2492 				      sizeof(struct scatterlist),
2493 				      GFP_KERNEL);
2494 		if (!mr->sgt.sgl) {
2495 			ret = -ENOMEM;
2496 			log_rdma_mr(ERR, "failed to allocate sgl\n");
2497 			goto kcalloc_sgl_failed;
2498 		}
2499 		mr->state = SMBDIRECT_MR_READY;
2500 		mr->socket = sc;
2501 
2502 		list_add_tail(&mr->list, &sc->mr_io.all.list);
2503 		atomic_inc(&sc->mr_io.ready.count);
2504 	}
2505 
2506 	INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work);
2507 
2508 	return 0;
2509 
2510 kcalloc_sgl_failed:
2511 	ib_dereg_mr(mr->mr);
2512 ib_alloc_mr_failed:
2513 	mutex_destroy(&mr->mutex);
2514 	kfree(mr);
2515 kzalloc_mr_failed:
2516 	destroy_mr_list(sc);
2517 	return ret;
2518 }
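
/*
 * Sizing example (sketch, using the defaults above): with
 * responder_resources = 32 the list holds 32 * 2 = 64 MRs, and each MR can
 * map up to sp->max_frmr_depth pages, so with 4 KiB pages one registration
 * covers at most max_frmr_depth * 4 KiB of payload.
 */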
2519 
2520 /*
2521  * Get a MR from mr_list. This function waits until there is at least one
2522  * MR available in the list. It may access the list while the
2523  * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
2524  * as they never modify the same places. However, there may be several CPUs
2525  * issuing I/O and trying to get an MR at the same time, so sc->mr_io.all.lock
2526  * is used to protect against this situation.
2527  */
2528 static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc)
2529 {
2530 	struct smbdirect_mr_io *ret;
2531 	unsigned long flags;
2532 	int rc;
2533 again:
2534 	rc = wait_event_interruptible(sc->mr_io.ready.wait_queue,
2535 		atomic_read(&sc->mr_io.ready.count) ||
2536 		sc->status != SMBDIRECT_SOCKET_CONNECTED);
2537 	if (rc) {
2538 		log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
2539 		return NULL;
2540 	}
2541 
2542 	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
2543 		log_rdma_mr(ERR, "sc->status=%x\n", sc->status);
2544 		return NULL;
2545 	}
2546 
2547 	spin_lock_irqsave(&sc->mr_io.all.lock, flags);
2548 	list_for_each_entry(ret, &sc->mr_io.all.list, list) {
2549 		if (ret->state == SMBDIRECT_MR_READY) {
2550 			ret->state = SMBDIRECT_MR_REGISTERED;
2551 			kref_get(&ret->kref);
2552 			spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
2553 			atomic_dec(&sc->mr_io.ready.count);
2554 			atomic_inc(&sc->mr_io.used.count);
2555 			return ret;
2556 		}
2557 	}
2558 
2559 	spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
2560 	/*
2561 	 * It is possible that we could fail to get MR because other processes may
2562 	 * try to acquire a MR at the same time. If this is the case, retry it.
2563 	 */
2564 	goto again;
2565 }
2566 
2567 /*
2568  * Transcribe the pages from an iterator into an MR scatterlist.
2569  */
2570 static int smbd_iter_to_mr(struct iov_iter *iter,
2571 			   struct sg_table *sgt,
2572 			   unsigned int max_sg)
2573 {
2574 	int ret;
2575 
2576 	memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
2577 
2578 	ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
2579 	WARN_ON(ret < 0);
2580 	if (sgt->nents > 0)
2581 		sg_mark_end(&sgt->sgl[sgt->nents - 1]);
2582 	return ret;
2583 }
2584 
2585 /*
2586  * Register memory for RDMA read/write
2587  * iter: the buffer to register memory with
2588  * writing: true if this is a RDMA write (SMB read), false for RDMA read
2589  * need_invalidate: true if this MR needs to be locally invalidated after I/O
2590  * return value: the MR registered, NULL if failed.
2591  */
2592 struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
2593 				 struct iov_iter *iter,
2594 				 bool writing, bool need_invalidate)
2595 {
2596 	struct smbdirect_socket *sc = &info->socket;
2597 	struct smbdirect_socket_parameters *sp = &sc->parameters;
2598 	struct smbdirect_mr_io *mr;
2599 	int rc, num_pages;
2600 	struct ib_reg_wr *reg_wr;
2601 
2602 	num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
2603 	if (num_pages > sp->max_frmr_depth) {
2604 		log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
2605 			num_pages, sp->max_frmr_depth);
2606 		WARN_ON_ONCE(1);
2607 		return NULL;
2608 	}
2609 
2610 	mr = get_mr(sc);
2611 	if (!mr) {
2612 		log_rdma_mr(ERR, "get_mr returning NULL\n");
2613 		return NULL;
2614 	}
2615 
2616 	mutex_lock(&mr->mutex);
2617 
2618 	mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
2619 	mr->need_invalidate = need_invalidate;
2620 	mr->sgt.nents = 0;
2621 	mr->sgt.orig_nents = 0;
2622 
2623 	log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
2624 		    num_pages, iov_iter_count(iter), sp->max_frmr_depth);
2625 	smbd_iter_to_mr(iter, &mr->sgt, sp->max_frmr_depth);
2626 
2627 	rc = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2628 	if (!rc) {
2629 		log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
2630 			    num_pages, mr->dir, rc);
2631 		goto dma_map_error;
2632 	}
2633 
2634 	rc = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
2635 	if (rc != mr->sgt.nents) {
2636 		log_rdma_mr(ERR,
2637 			    "ib_map_mr_sg failed rc = %d nents = %x\n",
2638 			    rc, mr->sgt.nents);
2639 		goto map_mr_error;
2640 	}
2641 
2642 	ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
2643 	reg_wr = &mr->wr;
2644 	reg_wr->wr.opcode = IB_WR_REG_MR;
2645 	mr->cqe.done = register_mr_done;
2646 	reg_wr->wr.wr_cqe = &mr->cqe;
2647 	reg_wr->wr.num_sge = 0;
2648 	reg_wr->wr.send_flags = IB_SEND_SIGNALED;
2649 	reg_wr->mr = mr->mr;
2650 	reg_wr->key = mr->mr->rkey;
2651 	reg_wr->access = writing ?
2652 			IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
2653 			IB_ACCESS_REMOTE_READ;
2654 
2655 	/*
2656 	 * There is no need to wait for completion of ib_post_send
2657 	 * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
2658 	 * on the next ib_post_send when we actually send I/O to remote peer
2659 	 */
2660 	rc = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
2661 	if (!rc) {
2662 		/*
2663 		 * get_mr() gave us a reference
2664 		 * via kref_get(&mr->kref), we keep that and let
2665 		 * the caller use smbd_deregister_mr()
2666 		 * to remove it again.
2667 		 */
2668 		mutex_unlock(&mr->mutex);
2669 		return mr;
2670 	}
2671 
2672 	log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
2673 		rc, reg_wr->key);
2674 
2675 	/* If all failed, attempt to recover this MR by setting it to SMBDIRECT_MR_ERROR */
2676 map_mr_error:
2677 	ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2678 
2679 dma_map_error:
2680 	mr->sgt.nents = 0;
2681 	mr->state = SMBDIRECT_MR_ERROR;
2682 	if (atomic_dec_and_test(&sc->mr_io.used.count))
2683 		wake_up(&sc->mr_io.cleanup.wait_queue);
2684 
2685 	smbd_disconnect_rdma_connection(sc);
2686 
2687 	/*
2688 	 * get_mr() gave us a reference
2689 	 * via kref_get(&mr->kref), we need to remove it again
2690 	 * on error.
2691 	 *
2692 	 * No kref_put_mutex() as it's already locked.
2693 	 *
2694 	 * If smbd_mr_free_locked() is called
2695 	 * If kref_put() returned 1, smbd_mr_free_locked()
2696 	 * was called, the mutex was unlocked and
2697 	 * the mr is gone.
2698 	 * If kref_put() returned 0 we know that
2699 	 * smbd_mr_free_locked() didn't
2700 	 * run. Not by us nor by anyone else, as we
2701 	 * still hold the mutex, so we need to unlock.
2702 	 */
2703 	if (!kref_put(&mr->kref, smbd_mr_free_locked))
2704 		mutex_unlock(&mr->mutex);
2705 
2706 	return NULL;
2707 }
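
/*
 * Illustrative sketch (not built): the register/deregister pairing a caller
 * is expected to follow. The step that communicates mr->mr->rkey and the
 * length to the peer is only a placeholder; the helper name is made up.
 */
#if 0
static int example_rdma_write_buffer(struct smbd_connection *conn,
				     struct iov_iter *data)
{
	struct smbdirect_mr_io *mr;

	/*
	 * writing=true: the peer will RDMA-write into this buffer (an SMB
	 * read); need_invalidate=true: locally invalidate after the I/O.
	 */
	mr = smbd_register_mr(conn, data, true, true);
	if (!mr)
		return -EAGAIN;

	/* ... send mr->mr->rkey and the I/O length to the peer and wait
	 * for the response (placeholder) ... */

	/*
	 * Drops the reference taken in get_mr() and, if needed, posts the
	 * local invalidation before the buffer is handed back.
	 */
	smbd_deregister_mr(mr);
	return 0;
}
#endif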
2708 
2709 static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
2710 {
2711 	struct smbdirect_mr_io *smbdirect_mr;
2712 	struct ib_cqe *cqe;
2713 
2714 	cqe = wc->wr_cqe;
2715 	smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe);
2716 	smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED;
2717 	if (wc->status != IB_WC_SUCCESS) {
2718 		log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
2719 		smbdirect_mr->state = SMBDIRECT_MR_ERROR;
2720 	}
2721 	complete(&smbdirect_mr->invalidate_done);
2722 }
2723 
2724 /*
2725  * Deregister a MR after I/O is done
2726  * This function may wait if remote invalidation is not used
2727  * and we have to locally invalidate the buffer to prevent data from being
2728  * modified by the remote peer after the upper layer consumes it
2729  */
2730 void smbd_deregister_mr(struct smbdirect_mr_io *mr)
2731 {
2732 	struct smbdirect_socket *sc = mr->socket;
2733 
2734 	mutex_lock(&mr->mutex);
2735 	if (mr->state == SMBDIRECT_MR_DISABLED)
2736 		goto put_kref;
2737 
2738 	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
2739 		smbd_mr_disable_locked(mr);
2740 		goto put_kref;
2741 	}
2742 
2743 	if (mr->need_invalidate) {
2744 		struct ib_send_wr *wr = &mr->inv_wr;
2745 		int rc;
2746 
2747 		/* Need to finish local invalidation before returning */
2748 		wr->opcode = IB_WR_LOCAL_INV;
2749 		mr->cqe.done = local_inv_done;
2750 		wr->wr_cqe = &mr->cqe;
2751 		wr->num_sge = 0;
2752 		wr->ex.invalidate_rkey = mr->mr->rkey;
2753 		wr->send_flags = IB_SEND_SIGNALED;
2754 
2755 		init_completion(&mr->invalidate_done);
2756 		rc = ib_post_send(sc->ib.qp, wr, NULL);
2757 		if (rc) {
2758 			log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
2759 			smbd_mr_disable_locked(mr);
2760 			smbd_disconnect_rdma_connection(sc);
2761 			goto done;
2762 		}
2763 		wait_for_completion(&mr->invalidate_done);
2764 		mr->need_invalidate = false;
2765 	} else
2766 		/*
2767 		 * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED
2768 		 * and defer to mr_recovery_work to recover the MR for next use
2769 		 */
2770 		mr->state = SMBDIRECT_MR_INVALIDATED;
2771 
2772 	if (mr->sgt.nents) {
2773 		ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2774 		mr->sgt.nents = 0;
2775 	}
2776 
2777 	if (mr->state == SMBDIRECT_MR_INVALIDATED) {
2778 		mr->state = SMBDIRECT_MR_READY;
2779 		if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
2780 			wake_up(&sc->mr_io.ready.wait_queue);
2781 	} else
2782 		/*
2783 		 * Schedule the work to do MR recovery for future I/Os; MR
2784 		 * recovery is slow and we don't want it to block the current I/O
2785 		 */
2786 		queue_work(sc->workqueue, &sc->mr_io.recovery_work);
2787 
2788 done:
2789 	if (atomic_dec_and_test(&sc->mr_io.used.count))
2790 		wake_up(&sc->mr_io.cleanup.wait_queue);
2791 
2792 put_kref:
2793 	/*
2794 	 * No kref_put_mutex() as it's already locked.
2795 	 *
2796 	 * If kref_put() returned 1, smbd_mr_free_locked()
2797 	 * was called, the mutex was unlocked and
2798 	 * the mr is gone.
2799 	 *
2800 	 * If kref_put() returned 0 we know that
2801 	 * smbd_mr_free_locked() didn't
2802 	 * run. Not by us nor by anyone else, as we
2803 	 * still hold the mutex, so we need to unlock
2804 	 * and keep the mr in SMBDIRECT_MR_READY or
2805 	 * SMBDIRECT_MR_ERROR state.
2806 	 */
2807 	if (!kref_put(&mr->kref, smbd_mr_free_locked))
2808 		mutex_unlock(&mr->mutex);
2809 }
2810 
2811 static bool smb_set_sge(struct smb_extract_to_rdma *rdma,
2812 			struct page *lowest_page, size_t off, size_t len)
2813 {
2814 	struct ib_sge *sge = &rdma->sge[rdma->nr_sge];
2815 	u64 addr;
2816 
2817 	addr = ib_dma_map_page(rdma->device, lowest_page,
2818 			       off, len, rdma->direction);
2819 	if (ib_dma_mapping_error(rdma->device, addr))
2820 		return false;
2821 
2822 	sge->addr   = addr;
2823 	sge->length = len;
2824 	sge->lkey   = rdma->local_dma_lkey;
2825 	rdma->nr_sge++;
2826 	return true;
2827 }
2828 
2829 /*
2830  * Extract page fragments from a BVEC-class iterator and add them to an RDMA
2831  * element list.  The pages are not pinned.
2832  */
2833 static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter,
2834 					struct smb_extract_to_rdma *rdma,
2835 					ssize_t maxsize)
2836 {
2837 	const struct bio_vec *bv = iter->bvec;
2838 	unsigned long start = iter->iov_offset;
2839 	unsigned int i;
2840 	ssize_t ret = 0;
2841 
2842 	for (i = 0; i < iter->nr_segs; i++) {
2843 		size_t off, len;
2844 
2845 		len = bv[i].bv_len;
2846 		if (start >= len) {
2847 			start -= len;
2848 			continue;
2849 		}
2850 
2851 		len = min_t(size_t, maxsize, len - start);
2852 		off = bv[i].bv_offset + start;
2853 
2854 		if (!smb_set_sge(rdma, bv[i].bv_page, off, len))
2855 			return -EIO;
2856 
2857 		ret += len;
2858 		maxsize -= len;
2859 		if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
2860 			break;
2861 		start = 0;
2862 	}
2863 
2864 	if (ret > 0)
2865 		iov_iter_advance(iter, ret);
2866 	return ret;
2867 }
2868 
2869 /*
2870  * Extract fragments from a KVEC-class iterator and add them to an RDMA list.
2871  * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers.
2872  * The pages are not pinned.
2873  */
2874 static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter,
2875 					struct smb_extract_to_rdma *rdma,
2876 					ssize_t maxsize)
2877 {
2878 	const struct kvec *kv = iter->kvec;
2879 	unsigned long start = iter->iov_offset;
2880 	unsigned int i;
2881 	ssize_t ret = 0;
2882 
2883 	for (i = 0; i < iter->nr_segs; i++) {
2884 		struct page *page;
2885 		unsigned long kaddr;
2886 		size_t off, len, seg;
2887 
2888 		len = kv[i].iov_len;
2889 		if (start >= len) {
2890 			start -= len;
2891 			continue;
2892 		}
2893 
2894 		kaddr = (unsigned long)kv[i].iov_base + start;
2895 		off = kaddr & ~PAGE_MASK;
2896 		len = min_t(size_t, maxsize, len - start);
2897 		kaddr &= PAGE_MASK;
2898 
2899 		maxsize -= len;
2900 		do {
2901 			seg = min_t(size_t, len, PAGE_SIZE - off);
2902 
2903 			if (is_vmalloc_or_module_addr((void *)kaddr))
2904 				page = vmalloc_to_page((void *)kaddr);
2905 			else
2906 				page = virt_to_page((void *)kaddr);
2907 
2908 			if (!smb_set_sge(rdma, page, off, seg))
2909 				return -EIO;
2910 
2911 			ret += seg;
2912 			len -= seg;
2913 			kaddr += PAGE_SIZE;
2914 			off = 0;
2915 		} while (len > 0 && rdma->nr_sge < rdma->max_sge);
2916 
2917 		if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
2918 			break;
2919 		start = 0;
2920 	}
2921 
2922 	if (ret > 0)
2923 		iov_iter_advance(iter, ret);
2924 	return ret;
2925 }
2926 
2927 /*
2928  * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA
2929  * list.  The folios are not pinned.
2930  */
2931 static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
2932 					  struct smb_extract_to_rdma *rdma,
2933 					  ssize_t maxsize)
2934 {
2935 	const struct folio_queue *folioq = iter->folioq;
2936 	unsigned int slot = iter->folioq_slot;
2937 	ssize_t ret = 0;
2938 	size_t offset = iter->iov_offset;
2939 
2940 	BUG_ON(!folioq);
2941 
2942 	if (slot >= folioq_nr_slots(folioq)) {
2943 		folioq = folioq->next;
2944 		if (WARN_ON_ONCE(!folioq))
2945 			return -EIO;
2946 		slot = 0;
2947 	}
2948 
2949 	do {
2950 		struct folio *folio = folioq_folio(folioq, slot);
2951 		size_t fsize = folioq_folio_size(folioq, slot);
2952 
2953 		if (offset < fsize) {
2954 			size_t part = umin(maxsize, fsize - offset);
2955 
2956 			if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part))
2957 				return -EIO;
2958 
2959 			offset += part;
2960 			ret += part;
2961 			maxsize -= part;
2962 		}
2963 
2964 		if (offset >= fsize) {
2965 			offset = 0;
2966 			slot++;
2967 			if (slot >= folioq_nr_slots(folioq)) {
2968 				if (!folioq->next) {
2969 					WARN_ON_ONCE(ret < iter->count);
2970 					break;
2971 				}
2972 				folioq = folioq->next;
2973 				slot = 0;
2974 			}
2975 		}
2976 	} while (rdma->nr_sge < rdma->max_sge && maxsize > 0);
2977 
2978 	iter->folioq = folioq;
2979 	iter->folioq_slot = slot;
2980 	iter->iov_offset = offset;
2981 	iter->count -= ret;
2982 	return ret;
2983 }
2984 
2985 /*
2986  * Extract page fragments from up to the given amount of the source iterator
2987  * and build up an RDMA list that refers to all of those bits.  The RDMA list
2988  * is appended to, up to the maximum number of elements set in the parameter
2989  * block.
2990  *
2991  * The extracted page fragments are not pinned or ref'd in any way; if an
2992  * IOVEC/UBUF-type iterator is to be used, it should be converted to a
2993  * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some
2994  * way.
2995  */
2996 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
2997 					struct smb_extract_to_rdma *rdma)
2998 {
2999 	ssize_t ret;
3000 	int before = rdma->nr_sge;
3001 
3002 	switch (iov_iter_type(iter)) {
3003 	case ITER_BVEC:
3004 		ret = smb_extract_bvec_to_rdma(iter, rdma, len);
3005 		break;
3006 	case ITER_KVEC:
3007 		ret = smb_extract_kvec_to_rdma(iter, rdma, len);
3008 		break;
3009 	case ITER_FOLIOQ:
3010 		ret = smb_extract_folioq_to_rdma(iter, rdma, len);
3011 		break;
3012 	default:
3013 		WARN_ON_ONCE(1);
3014 		return -EIO;
3015 	}
3016 
3017 	if (ret < 0) {
3018 		while (rdma->nr_sge > before) {
3019 			struct ib_sge *sge = &rdma->sge[--rdma->nr_sge];
3020 
3021 			ib_dma_unmap_single(rdma->device, sge->addr, sge->length,
3022 					    rdma->direction);
3023 			sge->addr = 0;
3024 		}
3025 	}
3026 
3027 	return ret;
3028 }
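
/*
 * Illustrative sketch (not built): how the extraction helper above is
 * driven. The sge array size of 16 is arbitrary for the sketch and the
 * helper name is made up.
 */
#if 0
static int example_build_sge_list(struct smbdirect_socket *sc,
				  struct iov_iter *iter, size_t len)
{
	struct ib_sge sges[16];
	struct smb_extract_to_rdma rdma = {
		.sge		= sges,
		.nr_sge		= 0,
		.max_sge	= ARRAY_SIZE(sges),
		.device		= sc->ib.dev,
		.local_dma_lkey	= sc->ib.pd->local_dma_lkey,
		.direction	= DMA_TO_DEVICE,
	};
	ssize_t ret;

	ret = smb_extract_iter_to_rdma(iter, len, &rdma);
	if (ret < 0)
		return ret;	/* partially mapped SGEs were unmapped */

	/* rdma.nr_sge now holds the number of populated entries in sges[]. */
	return rdma.nr_sge;
}
#endif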
3029