1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2017, Microsoft Corporation.
4 *
5 * Author(s): Long Li <longli@microsoft.com>
6 */
7 #include <linux/module.h>
8 #include <linux/highmem.h>
9 #include <linux/folio_queue.h>
10 #include "../common/smbdirect/smbdirect_pdu.h"
11 #include "smbdirect.h"
12 #include "cifs_debug.h"
13 #include "cifsproto.h"
14 #include "smb2proto.h"
15
16 const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
17 {
18 struct smbdirect_socket *sc = &conn->socket;
19
20 return &sc->parameters;
21 }
22
23 static struct smbdirect_recv_io *get_receive_buffer(
24 struct smbdirect_socket *sc);
25 static void put_receive_buffer(
26 struct smbdirect_socket *sc,
27 struct smbdirect_recv_io *response);
28 static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf);
29 static void destroy_receive_buffers(struct smbdirect_socket *sc);
30
31 static void enqueue_reassembly(
32 struct smbdirect_socket *sc,
33 struct smbdirect_recv_io *response, int data_length);
34 static struct smbdirect_recv_io *_get_first_reassembly(
35 struct smbdirect_socket *sc);
36
37 static int smbd_post_recv(
38 struct smbdirect_socket *sc,
39 struct smbdirect_recv_io *response);
40
41 static int smbd_post_send_empty(struct smbdirect_socket *sc);
42
43 static void destroy_mr_list(struct smbdirect_socket *sc);
44 static int allocate_mr_list(struct smbdirect_socket *sc);
45
46 struct smb_extract_to_rdma {
47 struct ib_sge *sge;
48 unsigned int nr_sge;
49 unsigned int max_sge;
50 struct ib_device *device;
51 u32 local_dma_lkey;
52 enum dma_data_direction direction;
53 };
54 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
55 struct smb_extract_to_rdma *rdma);
56
57 /* Port numbers for SMBD transport */
58 #define SMB_PORT 445
59 #define SMBD_PORT 5445
60
61 /* Address lookup and resolve timeout in ms */
62 #define RDMA_RESOLVE_TIMEOUT 5000
63
64 /* SMBD negotiation timeout in seconds */
65 #define SMBD_NEGOTIATE_TIMEOUT 120
66
67 /* The timeout to wait for a keepalive message from peer in seconds */
68 #define KEEPALIVE_RECV_TIMEOUT 5
69
70 /* SMBD minimum receive size and fragmented size as defined in [MS-SMBD] */
71 #define SMBD_MIN_RECEIVE_SIZE 128
72 #define SMBD_MIN_FRAGMENTED_SIZE 131072
73
74 /*
75 * Default maximum number of outstanding RDMA read/write operations on this connection.
76 * This value may be decreased during QP creation, depending on hardware limits.
77 */
78 #define SMBD_CM_RESPONDER_RESOURCES 32
79
80 /* Maximum number of retries on data transfer operations */
81 #define SMBD_CM_RETRY 6
82 /* No need to retry on Receiver Not Ready since SMBD manages credits */
83 #define SMBD_CM_RNR_RETRY 0
84
85 /*
86 * User configurable initial values per SMBD transport connection
87 * as defined in [MS-SMBD] 3.1.1.1
88 * These may change after SMBD negotiation
89 */
90 /* The local peer's maximum number of credits to grant to the peer */
91 int smbd_receive_credit_max = 255;
92
93 /* The remote peer's credit request of local peer */
94 int smbd_send_credit_target = 255;
95
96 /* The maximum single-message size that can be sent to the remote peer */
97 int smbd_max_send_size = 1364;
98
99 /* The maximum fragmented upper-layer payload receive size supported */
100 int smbd_max_fragmented_recv_size = 1024 * 1024;
101
102 /* The maximum single-message size which can be received */
103 int smbd_max_receive_size = 1364;
104
105 /* The timeout to initiate send of a keepalive message on idle */
106 int smbd_keep_alive_interval = 120;
107
108 /*
109 * User configurable initial values for RDMA transport
110 * The actual values used may be lower and are limited to hardware capabilities
111 */
112 /* Default maximum number of pages in a single RDMA write/read */
113 int smbd_max_frmr_depth = 2048;
114
115 /* If the payload is smaller than this many bytes, use RDMA send/recv instead of read/write */
116 int rdma_readwrite_threshold = 4096;
117
118 /* Transport logging functions
119 * Logging is defined as classes. They can be OR'ed to define the actual
120 * logging level via the module parameter smbd_logging_class,
121 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
122 * log_rdma_event() messages.
123 */
124 #define LOG_OUTGOING 0x1
125 #define LOG_INCOMING 0x2
126 #define LOG_READ 0x4
127 #define LOG_WRITE 0x8
128 #define LOG_RDMA_SEND 0x10
129 #define LOG_RDMA_RECV 0x20
130 #define LOG_KEEP_ALIVE 0x40
131 #define LOG_RDMA_EVENT 0x80
132 #define LOG_RDMA_MR 0x100
133 static unsigned int smbd_logging_class;
134 module_param(smbd_logging_class, uint, 0644);
135 MODULE_PARM_DESC(smbd_logging_class,
136 "Logging class for SMBD transport 0x0 to 0x100");
137
138 #define ERR 0x0
139 #define INFO 0x1
140 static unsigned int smbd_logging_level = ERR;
141 module_param(smbd_logging_level, uint, 0644);
142 MODULE_PARM_DESC(smbd_logging_level,
143 "Logging level for SMBD transport, 0 (default): error, 1: info");
144
145 #define log_rdma(level, class, fmt, args...) \
146 do { \
147 if (level <= smbd_logging_level || class & smbd_logging_class) \
148 cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
149 } while (0)
150
151 #define log_outgoing(level, fmt, args...) \
152 log_rdma(level, LOG_OUTGOING, fmt, ##args)
153 #define log_incoming(level, fmt, args...) \
154 log_rdma(level, LOG_INCOMING, fmt, ##args)
155 #define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args)
156 #define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args)
157 #define log_rdma_send(level, fmt, args...) \
158 log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
159 #define log_rdma_recv(level, fmt, args...) \
160 log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
161 #define log_keep_alive(level, fmt, args...) \
162 log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
163 #define log_rdma_event(level, fmt, args...) \
164 log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
165 #define log_rdma_mr(level, fmt, args...) \
166 log_rdma(level, LOG_RDMA_MR, fmt, ##args)
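/*
 * Illustrative note on how the filter in log_rdma() combines the two
 * module parameters (values here are hypothetical): with
 * smbd_logging_level=0 (errors only) and smbd_logging_class=0x30
 * (LOG_RDMA_SEND | LOG_RDMA_RECV), a call such as
 *
 *     log_rdma_recv(INFO, "posted %d receive buffers\n", n);
 *
 * is still emitted because its class bit matches, while
 * log_keep_alive(INFO, ...) is suppressed since neither the level test
 * nor the class test passes.
 */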
167
168 static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc)
169 {
170 /*
171 * Wake up all waiters in all wait queues
172 * in order to notice the broken connection.
173 */
174 wake_up_all(&sc->status_wait);
175 wake_up_all(&sc->send_io.lcredits.wait_queue);
176 wake_up_all(&sc->send_io.credits.wait_queue);
177 wake_up_all(&sc->send_io.pending.dec_wait_queue);
178 wake_up_all(&sc->send_io.pending.zero_wait_queue);
179 wake_up_all(&sc->recv_io.reassembly.wait_queue);
180 wake_up_all(&sc->mr_io.ready.wait_queue);
181 wake_up_all(&sc->mr_io.cleanup.wait_queue);
182 }
183
184 static void smbd_disconnect_rdma_work(struct work_struct *work)
185 {
186 struct smbdirect_socket *sc =
187 container_of(work, struct smbdirect_socket, disconnect_work);
188
189 /*
190 * make sure this and other work is not queued again
191 * but here we don't block and avoid
192 * disable[_delayed]_work_sync()
193 */
194 disable_work(&sc->disconnect_work);
195 disable_work(&sc->recv_io.posted.refill_work);
196 disable_work(&sc->mr_io.recovery_work);
197 disable_work(&sc->idle.immediate_work);
198 disable_delayed_work(&sc->idle.timer_work);
199
200 if (sc->first_error == 0)
201 sc->first_error = -ECONNABORTED;
202
203 switch (sc->status) {
204 case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
205 case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
206 case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
207 case SMBDIRECT_SOCKET_CONNECTED:
208 case SMBDIRECT_SOCKET_ERROR:
209 sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
210 rdma_disconnect(sc->rdma.cm_id);
211 break;
212
213 case SMBDIRECT_SOCKET_CREATED:
214 case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
215 case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
216 case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
217 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
218 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
219 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
220 case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
221 case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
222 case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
223 /*
224 * rdma_connect() never reached
225 * RDMA_CM_EVENT_ESTABLISHED
226 */
227 sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
228 break;
229
230 case SMBDIRECT_SOCKET_DISCONNECTING:
231 case SMBDIRECT_SOCKET_DISCONNECTED:
232 case SMBDIRECT_SOCKET_DESTROYED:
233 break;
234 }
235
236 /*
237 * Wake up all waiters in all wait queues
238 * in order to notice the broken connection.
239 */
240 smbd_disconnect_wake_up_all(sc);
241 }
242
243 static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc)
244 {
245 /*
246 * make sure other work (than disconnect_work) is
247 * not queued again but here we don't block and avoid
248 * disable[_delayed]_work_sync()
249 */
250 disable_work(&sc->recv_io.posted.refill_work);
251 disable_work(&sc->mr_io.recovery_work);
252 disable_work(&sc->idle.immediate_work);
253 disable_delayed_work(&sc->idle.timer_work);
254
255 if (sc->first_error == 0)
256 sc->first_error = -ECONNABORTED;
257
258 switch (sc->status) {
259 case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
260 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
261 case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
262 case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
263 case SMBDIRECT_SOCKET_ERROR:
264 case SMBDIRECT_SOCKET_DISCONNECTING:
265 case SMBDIRECT_SOCKET_DISCONNECTED:
266 case SMBDIRECT_SOCKET_DESTROYED:
267 /*
268 * Keep the current error status
269 */
270 break;
271
272 case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
273 case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
274 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
275 break;
276
277 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
278 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
279 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
280 break;
281
282 case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
283 case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
284 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
285 break;
286
287 case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
288 case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
289 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
290 break;
291
292 case SMBDIRECT_SOCKET_CREATED:
293 case SMBDIRECT_SOCKET_CONNECTED:
294 sc->status = SMBDIRECT_SOCKET_ERROR;
295 break;
296 }
297
298 /*
299 * Wake up all waiters in all wait queues
300 * in order to notice the broken connection.
301 */
302 smbd_disconnect_wake_up_all(sc);
303
304 queue_work(sc->workqueue, &sc->disconnect_work);
305 }
306
307 /* Upcall from RDMA CM */
308 static int smbd_conn_upcall(
309 struct rdma_cm_id *id, struct rdma_cm_event *event)
310 {
311 struct smbdirect_socket *sc = id->context;
312 struct smbdirect_socket_parameters *sp = &sc->parameters;
313 const char *event_name = rdma_event_msg(event->event);
314 u8 peer_initiator_depth;
315 u8 peer_responder_resources;
316
317 log_rdma_event(INFO, "event=%s status=%d\n",
318 event_name, event->status);
319
320 switch (event->event) {
321 case RDMA_CM_EVENT_ADDR_RESOLVED:
322 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
323 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED;
324 wake_up(&sc->status_wait);
325 break;
326
327 case RDMA_CM_EVENT_ROUTE_RESOLVED:
328 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
329 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
330 wake_up(&sc->status_wait);
331 break;
332
333 case RDMA_CM_EVENT_ADDR_ERROR:
334 log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
335 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
336 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
337 smbd_disconnect_rdma_work(&sc->disconnect_work);
338 break;
339
340 case RDMA_CM_EVENT_ROUTE_ERROR:
341 log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
342 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
343 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
344 smbd_disconnect_rdma_work(&sc->disconnect_work);
345 break;
346
347 case RDMA_CM_EVENT_ESTABLISHED:
348 log_rdma_event(INFO, "connected event=%s\n", event_name);
349
350 /*
351 * Here we work around an inconsistency between
352 * iWarp and other devices (at least rxe and irdma using RoCEv2)
353 */
354 if (rdma_protocol_iwarp(id->device, id->port_num)) {
355 /*
356 * iWarp devices report the peer's values
357 * with the perspective of the peer here.
358 * Tested with siw and irdma (in iwarp mode)
359 * We need to change to our perspective here,
360 * so we need to switch the values.
361 */
362 peer_initiator_depth = event->param.conn.responder_resources;
363 peer_responder_resources = event->param.conn.initiator_depth;
364 } else {
365 /*
366 * Non iWarp devices report the peer's values
367 * already changed to our perspective here.
368 * Tested with rxe and irdma (in roce mode).
369 */
370 peer_initiator_depth = event->param.conn.initiator_depth;
371 peer_responder_resources = event->param.conn.responder_resources;
372 }
373 if (rdma_protocol_iwarp(id->device, id->port_num) &&
374 event->param.conn.private_data_len == 8) {
375 /*
376 * Legacy clients with only iWarp MPA v1 support
377 * need a private blob in order to negotiate
378 * the IRD/ORD values.
379 */
380 const __be32 *ird_ord_hdr = event->param.conn.private_data;
381 u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
382 u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
383
384 /*
385 * cifs.ko sends the legacy IRD/ORD negotiation
386 * even if iWarp MPA v2 was used.
387 *
388 * Here we check that the values match and only
389 * mark the client as legacy if they don't match.
390 */
391 if ((u32)event->param.conn.initiator_depth != ird32 ||
392 (u32)event->param.conn.responder_resources != ord32) {
393 /*
394 * There are broken clients (old cifs.ko)
395 * using little endian and also
396 * struct rdma_conn_param only uses u8
397 * for initiator_depth and responder_resources,
398 * so we truncate the value to U8_MAX.
399 *
400 * smb_direct_accept_client() will then
401 * do the real negotiation in order to
402 * select the minimum between client and
403 * server.
404 */
405 ird32 = min_t(u32, ird32, U8_MAX);
406 ord32 = min_t(u32, ord32, U8_MAX);
407
408 sc->rdma.legacy_iwarp = true;
409 peer_initiator_depth = (u8)ird32;
410 peer_responder_resources = (u8)ord32;
411 }
412 }
413
414 /*
415 * Negotiate the values by using the minimum
416 * between client and server if the client provided
417 * non-zero values.
418 */
419 if (peer_initiator_depth != 0)
420 sp->initiator_depth =
421 min_t(u8, sp->initiator_depth,
422 peer_initiator_depth);
423 if (peer_responder_resources != 0)
424 sp->responder_resources =
425 min_t(u8, sp->responder_resources,
426 peer_responder_resources);
427
428 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
429 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
430 wake_up(&sc->status_wait);
431 break;
432
433 case RDMA_CM_EVENT_CONNECT_ERROR:
434 case RDMA_CM_EVENT_UNREACHABLE:
435 case RDMA_CM_EVENT_REJECTED:
436 log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
437 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
438 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
439 smbd_disconnect_rdma_work(&sc->disconnect_work);
440 break;
441
442 case RDMA_CM_EVENT_DEVICE_REMOVAL:
443 case RDMA_CM_EVENT_DISCONNECTED:
444 /* This happens when we fail the negotiation */
445 if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) {
446 log_rdma_event(ERR, "event=%s during negotiation\n", event_name);
447 }
448
449 sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
450 smbd_disconnect_rdma_work(&sc->disconnect_work);
451 break;
452
453 default:
454 log_rdma_event(ERR, "unexpected event=%s status=%d\n",
455 event_name, event->status);
456 break;
457 }
458
459 return 0;
460 }
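/*
 * Worked example for the IRD/ORD handling above (all numbers are
 * hypothetical): if sp->initiator_depth is 16 and sp->responder_resources
 * is SMBD_CM_RESPONDER_RESOURCES (32), and the peer reports
 * initiator_depth=8 and responder_resources=4 already from our
 * perspective (i.e. a non-iWarp device), the connection ends up with
 *
 *     sp->initiator_depth     = min(16, 8) = 8;
 *     sp->responder_resources = min(32, 4) = 4;
 *
 * On iWarp the two reported values are swapped first, and a legacy
 * MPA v1 peer additionally carries them as two __be32 words of
 * private_data, clamped to U8_MAX.
 */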
461
462 /* Upcall from RDMA QP */
463 static void
464 smbd_qp_async_error_upcall(struct ib_event *event, void *context)
465 {
466 struct smbdirect_socket *sc = context;
467
468 log_rdma_event(ERR, "%s on device %s socket %p\n",
469 ib_event_msg(event->event), event->device->name, sc);
470
471 switch (event->event) {
472 case IB_EVENT_CQ_ERR:
473 case IB_EVENT_QP_FATAL:
474 smbd_disconnect_rdma_connection(sc);
475 break;
476
477 default:
478 break;
479 }
480 }
481
482 static inline void *smbdirect_send_io_payload(struct smbdirect_send_io *request)
483 {
484 return (void *)request->packet;
485 }
486
487 static inline void *smbdirect_recv_io_payload(struct smbdirect_recv_io *response)
488 {
489 return (void *)response->packet;
490 }
491
492 /* Called when a RDMA send is done */
493 static void send_done(struct ib_cq *cq, struct ib_wc *wc)
494 {
495 int i;
496 struct smbdirect_send_io *request =
497 container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
498 struct smbdirect_socket *sc = request->socket;
499 int lcredits = 0;
500
501 log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",
502 request, ib_wc_status_msg(wc->status));
503
504 for (i = 0; i < request->num_sge; i++)
505 ib_dma_unmap_single(sc->ib.dev,
506 request->sge[i].addr,
507 request->sge[i].length,
508 DMA_TO_DEVICE);
509 mempool_free(request, sc->send_io.mem.pool);
510 lcredits += 1;
511
512 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
513 if (wc->status != IB_WC_WR_FLUSH_ERR)
514 log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",
515 ib_wc_status_msg(wc->status), wc->opcode);
516 smbd_disconnect_rdma_connection(sc);
517 return;
518 }
519
520 atomic_add(lcredits, &sc->send_io.lcredits.count);
521 wake_up(&sc->send_io.lcredits.wait_queue);
522
523 if (atomic_dec_and_test(&sc->send_io.pending.count))
524 wake_up(&sc->send_io.pending.zero_wait_queue);
525
526 wake_up(&sc->send_io.pending.dec_wait_queue);
527 }
528
529 static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp)
530 {
531 log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n",
532 resp->min_version, resp->max_version,
533 resp->negotiated_version, resp->credits_requested,
534 resp->credits_granted, resp->status,
535 resp->max_readwrite_size, resp->preferred_send_size,
536 resp->max_receive_size, resp->max_fragmented_size);
537 }
538
539 /*
540 * Process a negotiation response message, according to [MS-SMBD] 3.1.5.7
541 * response, packet_length: the negotiation response message
542 * return value: true if negotiation is a success, false if failed
543 */
544 static bool process_negotiation_response(
545 struct smbdirect_recv_io *response, int packet_length)
546 {
547 struct smbdirect_socket *sc = response->socket;
548 struct smbdirect_socket_parameters *sp = &sc->parameters;
549 struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response);
550
551 if (packet_length < sizeof(struct smbdirect_negotiate_resp)) {
552 log_rdma_event(ERR,
553 "error: packet_length=%d\n", packet_length);
554 return false;
555 }
556
557 if (le16_to_cpu(packet->negotiated_version) != SMBDIRECT_V1) {
558 log_rdma_event(ERR, "error: negotiated_version=%x\n",
559 le16_to_cpu(packet->negotiated_version));
560 return false;
561 }
562
563 if (packet->credits_requested == 0) {
564 log_rdma_event(ERR, "error: credits_requested==0\n");
565 return false;
566 }
567 sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested);
568 sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
569
570 if (packet->credits_granted == 0) {
571 log_rdma_event(ERR, "error: credits_granted==0\n");
572 return false;
573 }
574 atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
575 atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted));
576
577 if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
578 log_rdma_event(ERR, "error: preferred_send_size=%d\n",
579 le32_to_cpu(packet->preferred_send_size));
580 return false;
581 }
582 sp->max_recv_size = le32_to_cpu(packet->preferred_send_size);
583
584 if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
585 log_rdma_event(ERR, "error: max_receive_size=%d\n",
586 le32_to_cpu(packet->max_receive_size));
587 return false;
588 }
589 sp->max_send_size = min_t(u32, sp->max_send_size,
590 le32_to_cpu(packet->max_receive_size));
591
592 if (le32_to_cpu(packet->max_fragmented_size) <
593 SMBD_MIN_FRAGMENTED_SIZE) {
594 log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
595 le32_to_cpu(packet->max_fragmented_size));
596 return false;
597 }
598 sp->max_fragmented_send_size =
599 le32_to_cpu(packet->max_fragmented_size);
600
601
602 sp->max_read_write_size = min_t(u32,
603 le32_to_cpu(packet->max_readwrite_size),
604 sp->max_frmr_depth * PAGE_SIZE);
605 sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
606
607 sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
608 return true;
609 }
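/*
 * Example of the clamping above with illustrative numbers only: if the
 * response carries preferred_send_size=1364, max_receive_size=8192 and
 * max_readwrite_size=1048576, while sp->max_send_size=1364,
 * sp->max_frmr_depth=2048 and PAGE_SIZE=4096, the result is
 *
 *     sp->max_recv_size       = 1364;
 *     sp->max_send_size       = min(1364, 8192)           = 1364;
 *     sp->max_read_write_size = min(1048576, 2048 * 4096) = 1048576;
 *     sp->max_frmr_depth      = 1048576 / 4096            = 256;
 */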
610
611 static void smbd_post_send_credits(struct work_struct *work)
612 {
613 int rc;
614 struct smbdirect_recv_io *response;
615 struct smbdirect_socket *sc =
616 container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
617
618 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
619 return;
620 }
621
622 if (sc->recv_io.credits.target >
623 atomic_read(&sc->recv_io.credits.count)) {
624 while (true) {
625 response = get_receive_buffer(sc);
626 if (!response)
627 break;
628
629 response->first_segment = false;
630 rc = smbd_post_recv(sc, response);
631 if (rc) {
632 log_rdma_recv(ERR,
633 "post_recv failed rc=%d\n", rc);
634 put_receive_buffer(sc, response);
635 break;
636 }
637
638 atomic_inc(&sc->recv_io.posted.count);
639 }
640 }
641
642 /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
643 if (atomic_read(&sc->recv_io.credits.count) <
644 sc->recv_io.credits.target - 1) {
645 log_keep_alive(INFO, "schedule send of an empty message\n");
646 queue_work(sc->workqueue, &sc->idle.immediate_work);
647 }
648 }
649
650 /* Called from softirq, when recv is done */
651 static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
652 {
653 struct smbdirect_data_transfer *data_transfer;
654 struct smbdirect_recv_io *response =
655 container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
656 struct smbdirect_socket *sc = response->socket;
657 struct smbdirect_socket_parameters *sp = &sc->parameters;
658 u16 old_recv_credit_target;
659 u32 data_offset = 0;
660 u32 data_length = 0;
661 u32 remaining_data_length = 0;
662 bool negotiate_done = false;
663
664 log_rdma_recv(INFO,
665 "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n",
666 response, sc->recv_io.expected,
667 ib_wc_status_msg(wc->status), wc->opcode,
668 wc->byte_len, wc->pkey_index);
669
670 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
671 if (wc->status != IB_WC_WR_FLUSH_ERR)
672 log_rdma_recv(ERR, "wc->status=%s opcode=%d\n",
673 ib_wc_status_msg(wc->status), wc->opcode);
674 goto error;
675 }
676
677 ib_dma_sync_single_for_cpu(
678 wc->qp->device,
679 response->sge.addr,
680 response->sge.length,
681 DMA_FROM_DEVICE);
682
683 /*
684 * Reset timer to the keepalive interval in
685 * order to trigger our next keepalive message.
686 */
687 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
688 mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
689 msecs_to_jiffies(sp->keepalive_interval_msec));
690
691 switch (sc->recv_io.expected) {
692 /* SMBD negotiation response */
693 case SMBDIRECT_EXPECT_NEGOTIATE_REP:
694 dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response));
695 sc->recv_io.reassembly.full_packet_received = true;
696 negotiate_done =
697 process_negotiation_response(response, wc->byte_len);
698 put_receive_buffer(sc, response);
699 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING);
700 if (!negotiate_done) {
701 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
702 smbd_disconnect_rdma_connection(sc);
703 } else {
704 sc->status = SMBDIRECT_SOCKET_CONNECTED;
705 wake_up(&sc->status_wait);
706 }
707
708 return;
709
710 /* SMBD data transfer packet */
711 case SMBDIRECT_EXPECT_DATA_TRANSFER:
712 data_transfer = smbdirect_recv_io_payload(response);
713
714 if (wc->byte_len <
715 offsetof(struct smbdirect_data_transfer, padding))
716 goto error;
717
718 remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
719 data_offset = le32_to_cpu(data_transfer->data_offset);
720 data_length = le32_to_cpu(data_transfer->data_length);
721 if (wc->byte_len < data_offset ||
722 (u64)wc->byte_len < (u64)data_offset + data_length)
723 goto error;
724
725 if (remaining_data_length > sp->max_fragmented_recv_size ||
726 data_length > sp->max_fragmented_recv_size ||
727 (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)
728 goto error;
729
730 if (data_length) {
731 if (sc->recv_io.reassembly.full_packet_received)
732 response->first_segment = true;
733
734 if (le32_to_cpu(data_transfer->remaining_data_length))
735 sc->recv_io.reassembly.full_packet_received = false;
736 else
737 sc->recv_io.reassembly.full_packet_received = true;
738 }
739
740 atomic_dec(&sc->recv_io.posted.count);
741 atomic_dec(&sc->recv_io.credits.count);
742 old_recv_credit_target = sc->recv_io.credits.target;
743 sc->recv_io.credits.target =
744 le16_to_cpu(data_transfer->credits_requested);
745 sc->recv_io.credits.target =
746 min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
747 sc->recv_io.credits.target =
748 max_t(u16, sc->recv_io.credits.target, 1);
749 if (le16_to_cpu(data_transfer->credits_granted)) {
750 atomic_add(le16_to_cpu(data_transfer->credits_granted),
751 &sc->send_io.credits.count);
752 /*
753 * We have new send credits granted from remote peer
754 * If any sender is waiting for credits, unblock it
755 */
756 wake_up(&sc->send_io.credits.wait_queue);
757 }
758
759 log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n",
760 le16_to_cpu(data_transfer->flags),
761 le32_to_cpu(data_transfer->data_offset),
762 le32_to_cpu(data_transfer->data_length),
763 le32_to_cpu(data_transfer->remaining_data_length));
764
765 /* Send an immediate response right away if requested */
766 if (le16_to_cpu(data_transfer->flags) &
767 SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
768 log_keep_alive(INFO, "schedule send of immediate response\n");
769 queue_work(sc->workqueue, &sc->idle.immediate_work);
770 }
771
772 /*
773 * If this is a packet with a data payload, place the data in the
774 * reassembly queue and wake up the reading thread
775 */
776 if (data_length) {
777 if (sc->recv_io.credits.target > old_recv_credit_target)
778 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
779
780 enqueue_reassembly(sc, response, data_length);
781 wake_up(&sc->recv_io.reassembly.wait_queue);
782 } else
783 put_receive_buffer(sc, response);
784
785 return;
786
787 case SMBDIRECT_EXPECT_NEGOTIATE_REQ:
788 /* Only server... */
789 break;
790 }
791
792 /*
793 * This is an internal error!
794 */
795 log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected);
796 WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
797 error:
798 put_receive_buffer(sc, response);
799 smbd_disconnect_rdma_connection(sc);
800 }
801
802 static struct rdma_cm_id *smbd_create_id(
803 struct smbdirect_socket *sc,
804 struct sockaddr *dstaddr, int port)
805 {
806 struct smbdirect_socket_parameters *sp = &sc->parameters;
807 struct rdma_cm_id *id;
808 int rc;
809 __be16 *sport;
810
811 id = rdma_create_id(&init_net, smbd_conn_upcall, sc,
812 RDMA_PS_TCP, IB_QPT_RC);
813 if (IS_ERR(id)) {
814 rc = PTR_ERR(id);
815 log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
816 return id;
817 }
818
819 if (dstaddr->sa_family == AF_INET6)
820 sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
821 else
822 sport = &((struct sockaddr_in *)dstaddr)->sin_port;
823
824 *sport = htons(port);
825
826 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED);
827 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING;
828 rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
829 sp->resolve_addr_timeout_msec);
830 if (rc) {
831 log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
832 goto out;
833 }
834 rc = wait_event_interruptible_timeout(
835 sc->status_wait,
836 sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
837 msecs_to_jiffies(sp->resolve_addr_timeout_msec));
838 /* e.g. if interrupted returns -ERESTARTSYS */
839 if (rc < 0) {
840 log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
841 goto out;
842 }
843 if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) {
844 rc = -ETIMEDOUT;
845 log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
846 goto out;
847 }
848 if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) {
849 rc = -EHOSTUNREACH;
850 log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
851 goto out;
852 }
853
854 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED);
855 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING;
856 rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec);
857 if (rc) {
858 log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
859 goto out;
860 }
861 rc = wait_event_interruptible_timeout(
862 sc->status_wait,
863 sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING,
864 msecs_to_jiffies(sp->resolve_route_timeout_msec));
865 /* e.g. if interrupted returns -ERESTARTSYS */
866 if (rc < 0) {
867 log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
868 goto out;
869 }
870 if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) {
871 rc = -ETIMEDOUT;
872 log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
873 goto out;
874 }
875 if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) {
876 rc = -ENETUNREACH;
877 log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
878 goto out;
879 }
880
881 return id;
882
883 out:
884 rdma_destroy_id(id);
885 return ERR_PTR(rc);
886 }
887
888 /*
889 * Test if FRWR (Fast Registration Work Requests) is supported on the device
890 * This implementation requires FRWR on RDMA read/write
891 * return value: true if it is supported
892 */
893 static bool frwr_is_supported(struct ib_device_attr *attrs)
894 {
895 if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
896 return false;
897 if (attrs->max_fast_reg_page_list_len == 0)
898 return false;
899 return true;
900 }
901
902 static int smbd_ia_open(
903 struct smbdirect_socket *sc,
904 struct sockaddr *dstaddr, int port)
905 {
906 struct smbdirect_socket_parameters *sp = &sc->parameters;
907 int rc;
908
909 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
910 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED;
911
912 sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port);
913 if (IS_ERR(sc->rdma.cm_id)) {
914 rc = PTR_ERR(sc->rdma.cm_id);
915 goto out1;
916 }
917 sc->ib.dev = sc->rdma.cm_id->device;
918
919 if (!frwr_is_supported(&sc->ib.dev->attrs)) {
920 log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n");
921 log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
922 sc->ib.dev->attrs.device_cap_flags,
923 sc->ib.dev->attrs.max_fast_reg_page_list_len);
924 rc = -EPROTONOSUPPORT;
925 goto out2;
926 }
927 sp->max_frmr_depth = min_t(u32,
928 sp->max_frmr_depth,
929 sc->ib.dev->attrs.max_fast_reg_page_list_len);
930 sc->mr_io.type = IB_MR_TYPE_MEM_REG;
931 if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
932 sc->mr_io.type = IB_MR_TYPE_SG_GAPS;
933
934 return 0;
935
936 out2:
937 rdma_destroy_id(sc->rdma.cm_id);
938 sc->rdma.cm_id = NULL;
939
940 out1:
941 return rc;
942 }
943
944 /*
945 * Send a negotiation request message to the peer
946 * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
947 * After negotiation, the transport is connected and ready for
948 * carrying upper layer SMB payload
949 */
950 static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
951 {
952 struct smbdirect_socket_parameters *sp = &sc->parameters;
953 struct ib_send_wr send_wr;
954 int rc = -ENOMEM;
955 struct smbdirect_send_io *request;
956 struct smbdirect_negotiate_req *packet;
957
958 request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
959 if (!request)
960 return rc;
961
962 request->socket = sc;
963
964 packet = smbdirect_send_io_payload(request);
965 packet->min_version = cpu_to_le16(SMBDIRECT_V1);
966 packet->max_version = cpu_to_le16(SMBDIRECT_V1);
967 packet->reserved = 0;
968 packet->credits_requested = cpu_to_le16(sp->send_credit_target);
969 packet->preferred_send_size = cpu_to_le32(sp->max_send_size);
970 packet->max_receive_size = cpu_to_le32(sp->max_recv_size);
971 packet->max_fragmented_size =
972 cpu_to_le32(sp->max_fragmented_recv_size);
973
974 request->num_sge = 1;
975 request->sge[0].addr = ib_dma_map_single(
976 sc->ib.dev, (void *)packet,
977 sizeof(*packet), DMA_TO_DEVICE);
978 if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
979 rc = -EIO;
980 goto dma_mapping_failed;
981 }
982
983 request->sge[0].length = sizeof(*packet);
984 request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
985
986 ib_dma_sync_single_for_device(
987 sc->ib.dev, request->sge[0].addr,
988 request->sge[0].length, DMA_TO_DEVICE);
989
990 request->cqe.done = send_done;
991
992 send_wr.next = NULL;
993 send_wr.wr_cqe = &request->cqe;
994 send_wr.sg_list = request->sge;
995 send_wr.num_sge = request->num_sge;
996 send_wr.opcode = IB_WR_SEND;
997 send_wr.send_flags = IB_SEND_SIGNALED;
998
999 log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n",
1000 request->sge[0].addr,
1001 request->sge[0].length, request->sge[0].lkey);
1002
1003 atomic_inc(&sc->send_io.pending.count);
1004 rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
1005 if (!rc)
1006 return 0;
1007
1008 /* if we reach here, post send failed */
1009 log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
1010 atomic_dec(&sc->send_io.pending.count);
1011 ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr,
1012 request->sge[0].length, DMA_TO_DEVICE);
1013
1014 smbd_disconnect_rdma_connection(sc);
1015
1016 dma_mapping_failed:
1017 mempool_free(request, sc->send_io.mem.pool);
1018 return rc;
1019 }
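/*
 * For illustration, assuming sp was initialized from the module defaults
 * above (send_credit_target=255, max_send_size=1364, max_recv_size=1364,
 * max_fragmented_recv_size=1048576), the negotiate request built here is:
 *
 *     min_version = max_version = SMBDIRECT_V1
 *     credits_requested   = 255
 *     preferred_send_size = 1364
 *     max_receive_size    = 1364
 *     max_fragmented_size = 1048576
 */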
1020
1021 /*
1022 * Extend the credits to remote peer
1023 * This implements [MS-SMBD] 3.1.5.9
1024 * The idea is that we should extend credits to remote peer as quickly as
1025 * it's allowed, to maintain data flow. We allocate as many receive
1026 * buffers as possible, and extend the receive credits to the remote peer.
1027 * return value: the new credits being granted.
1028 */
1029 static int manage_credits_prior_sending(struct smbdirect_socket *sc)
1030 {
1031 int new_credits;
1032
1033 if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
1034 return 0;
1035
1036 new_credits = atomic_read(&sc->recv_io.posted.count);
1037 if (new_credits == 0)
1038 return 0;
1039
1040 new_credits -= atomic_read(&sc->recv_io.credits.count);
1041 if (new_credits <= 0)
1042 return 0;
1043
1044 return new_credits;
1045 }
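/*
 * Example with illustrative counts: with recv_io.credits.target=255,
 * recv_io.credits.count=10 and recv_io.posted.count=64, the function
 * above grants 64 - 10 = 54 new credits; if the current credit count
 * already meets the target, or nothing is posted, it grants none.
 */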
1046
1047 /*
1048 * Check if we need to send a KEEP_ALIVE message
1049 * The idle connection timer triggers a KEEP_ALIVE message when it expires.
1050 * SMBDIRECT_FLAG_RESPONSE_REQUESTED is set in the message flags to have the peer send
1051 * back a response.
1052 * return value:
1053 * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set
1054 * 0: otherwise
1055 */
1056 static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
1057 {
1058 struct smbdirect_socket_parameters *sp = &sc->parameters;
1059
1060 if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
1061 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
1062 /*
1063 * Now use the keepalive timeout (instead of keepalive interval)
1064 * in order to wait for a response
1065 */
1066 mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
1067 msecs_to_jiffies(sp->keepalive_timeout_msec));
1068 return 1;
1069 }
1070 return 0;
1071 }
1072
1073 /* Post the send request */
1074 static int smbd_post_send(struct smbdirect_socket *sc,
1075 struct smbdirect_send_io *request)
1076 {
1077 struct ib_send_wr send_wr;
1078 int rc, i;
1079
1080 for (i = 0; i < request->num_sge; i++) {
1081 log_rdma_send(INFO,
1082 "rdma_request sge[%d] addr=0x%llx length=%u\n",
1083 i, request->sge[i].addr, request->sge[i].length);
1084 ib_dma_sync_single_for_device(
1085 sc->ib.dev,
1086 request->sge[i].addr,
1087 request->sge[i].length,
1088 DMA_TO_DEVICE);
1089 }
1090
1091 request->cqe.done = send_done;
1092
1093 send_wr.next = NULL;
1094 send_wr.wr_cqe = &request->cqe;
1095 send_wr.sg_list = request->sge;
1096 send_wr.num_sge = request->num_sge;
1097 send_wr.opcode = IB_WR_SEND;
1098 send_wr.send_flags = IB_SEND_SIGNALED;
1099
1100 rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
1101 if (rc) {
1102 log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
1103 smbd_disconnect_rdma_connection(sc);
1104 rc = -EAGAIN;
1105 }
1106
1107 return rc;
1108 }
1109
1110 static int smbd_post_send_iter(struct smbdirect_socket *sc,
1111 struct iov_iter *iter,
1112 int *_remaining_data_length)
1113 {
1114 struct smbdirect_socket_parameters *sp = &sc->parameters;
1115 int i, rc;
1116 int header_length;
1117 int data_length;
1118 struct smbdirect_send_io *request;
1119 struct smbdirect_data_transfer *packet;
1120 int new_credits = 0;
1121
1122 wait_lcredit:
1123 /* Wait for local send credits */
1124 rc = wait_event_interruptible(sc->send_io.lcredits.wait_queue,
1125 atomic_read(&sc->send_io.lcredits.count) > 0 ||
1126 sc->status != SMBDIRECT_SOCKET_CONNECTED);
1127 if (rc)
1128 goto err_wait_lcredit;
1129
1130 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
1131 log_outgoing(ERR, "disconnected not sending on wait_credit\n");
1132 rc = -EAGAIN;
1133 goto err_wait_lcredit;
1134 }
1135 if (unlikely(atomic_dec_return(&sc->send_io.lcredits.count) < 0)) {
1136 atomic_inc(&sc->send_io.lcredits.count);
1137 goto wait_lcredit;
1138 }
1139
1140 wait_credit:
1141 /* Wait for send credits. A SMBD packet needs one credit */
1142 rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
1143 atomic_read(&sc->send_io.credits.count) > 0 ||
1144 sc->status != SMBDIRECT_SOCKET_CONNECTED);
1145 if (rc)
1146 goto err_wait_credit;
1147
1148 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
1149 log_outgoing(ERR, "disconnected not sending on wait_credit\n");
1150 rc = -EAGAIN;
1151 goto err_wait_credit;
1152 }
1153 if (unlikely(atomic_dec_return(&sc->send_io.credits.count) < 0)) {
1154 atomic_inc(&sc->send_io.credits.count);
1155 goto wait_credit;
1156 }
1157
1158 request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
1159 if (!request) {
1160 rc = -ENOMEM;
1161 goto err_alloc;
1162 }
1163
1164 request->socket = sc;
1165 memset(request->sge, 0, sizeof(request->sge));
1166
1167 /* Map the packet to DMA */
1168 header_length = sizeof(struct smbdirect_data_transfer);
1169 /* If this is a packet without payload, don't send padding */
1170 if (!iter)
1171 header_length = offsetof(struct smbdirect_data_transfer, padding);
1172
1173 packet = smbdirect_send_io_payload(request);
1174 request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
1175 (void *)packet,
1176 header_length,
1177 DMA_TO_DEVICE);
1178 if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
1179 rc = -EIO;
1180 goto err_dma;
1181 }
1182
1183 request->sge[0].length = header_length;
1184 request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
1185 request->num_sge = 1;
1186
1187 /* Fill in the data payload to find out how much data we can add */
1188 if (iter) {
1189 struct smb_extract_to_rdma extract = {
1190 .nr_sge = request->num_sge,
1191 .max_sge = SMBDIRECT_SEND_IO_MAX_SGE,
1192 .sge = request->sge,
1193 .device = sc->ib.dev,
1194 .local_dma_lkey = sc->ib.pd->local_dma_lkey,
1195 .direction = DMA_TO_DEVICE,
1196 };
1197 size_t payload_len = umin(*_remaining_data_length,
1198 sp->max_send_size - sizeof(*packet));
1199
1200 rc = smb_extract_iter_to_rdma(iter, payload_len,
1201 &extract);
1202 if (rc < 0)
1203 goto err_dma;
1204 data_length = rc;
1205 request->num_sge = extract.nr_sge;
1206 *_remaining_data_length -= data_length;
1207 } else {
1208 data_length = 0;
1209 }
1210
1211 /* Fill in the packet header */
1212 packet->credits_requested = cpu_to_le16(sp->send_credit_target);
1213
1214 new_credits = manage_credits_prior_sending(sc);
1215 atomic_add(new_credits, &sc->recv_io.credits.count);
1216 packet->credits_granted = cpu_to_le16(new_credits);
1217
1218 packet->flags = 0;
1219 if (manage_keep_alive_before_sending(sc))
1220 packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
1221
1222 packet->reserved = 0;
1223 if (!data_length)
1224 packet->data_offset = 0;
1225 else
1226 packet->data_offset = cpu_to_le32(24);
1227 packet->data_length = cpu_to_le32(data_length);
1228 packet->remaining_data_length = cpu_to_le32(*_remaining_data_length);
1229 packet->padding = 0;
1230
1231 log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
1232 le16_to_cpu(packet->credits_requested),
1233 le16_to_cpu(packet->credits_granted),
1234 le32_to_cpu(packet->data_offset),
1235 le32_to_cpu(packet->data_length),
1236 le32_to_cpu(packet->remaining_data_length));
1237
1238 /*
1239 * Now that we have both a local and a remote credit
1240 * we add ourselves to the pending count
1241 */
1242 atomic_inc(&sc->send_io.pending.count);
1243
1244 rc = smbd_post_send(sc, request);
1245 if (!rc)
1246 return 0;
1247
1248 if (atomic_dec_and_test(&sc->send_io.pending.count))
1249 wake_up(&sc->send_io.pending.zero_wait_queue);
1250
1251 wake_up(&sc->send_io.pending.dec_wait_queue);
1252
1253 err_dma:
1254 for (i = 0; i < request->num_sge; i++)
1255 if (request->sge[i].addr)
1256 ib_dma_unmap_single(sc->ib.dev,
1257 request->sge[i].addr,
1258 request->sge[i].length,
1259 DMA_TO_DEVICE);
1260 mempool_free(request, sc->send_io.mem.pool);
1261
1262 /* roll back the granted receive credits */
1263 atomic_sub(new_credits, &sc->recv_io.credits.count);
1264
1265 err_alloc:
1266 atomic_inc(&sc->send_io.credits.count);
1267 wake_up(&sc->send_io.credits.wait_queue);
1268
1269 err_wait_credit:
1270 atomic_inc(&sc->send_io.lcredits.count);
1271 wake_up(&sc->send_io.lcredits.wait_queue);
1272
1273 err_wait_lcredit:
1274 return rc;
1275 }
1276
1277 /*
1278 * Send an empty message
1279 * An empty message is used to extend credits to the peer and to keep the
1280 * connection alive while there is no upper-layer payload to send at the time
1281 */
1282 static int smbd_post_send_empty(struct smbdirect_socket *sc)
1283 {
1284 int remaining_data_length = 0;
1285
1286 sc->statistics.send_empty++;
1287 return smbd_post_send_iter(sc, NULL, &remaining_data_length);
1288 }
1289
1290 static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
1291 struct iov_iter *iter,
1292 int *_remaining_data_length)
1293 {
1294 int rc = 0;
1295
1296 /*
1297 * smbd_post_send_iter() respects the
1298 * negotiated max_send_size, so we need to
1299 * loop until the full iter is posted
1300 */
1301
1302 while (iov_iter_count(iter) > 0) {
1303 rc = smbd_post_send_iter(sc, iter, _remaining_data_length);
1304 if (rc < 0)
1305 break;
1306 }
1307
1308 return rc;
1309 }
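/*
 * Sizing sketch (illustrative numbers): with max_send_size=1364 and a
 * 24-byte smbdirect_data_transfer header, each iteration of the loop
 * above carries at most 1340 payload bytes, so an 8000-byte iter is
 * posted as 6 messages, while *_remaining_data_length counts down the
 * bytes still to come after each message.
 */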
1310
1311 /*
1312 * Post a receive request to the transport
1313 * The remote peer can only send data when a receive request is posted
1314 * The interaction is controlled by the send/receive credit system
1315 */
1316 static int smbd_post_recv(
1317 struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
1318 {
1319 struct smbdirect_socket_parameters *sp = &sc->parameters;
1320 struct ib_recv_wr recv_wr;
1321 int rc = -EIO;
1322
1323 response->sge.addr = ib_dma_map_single(
1324 sc->ib.dev, response->packet,
1325 sp->max_recv_size, DMA_FROM_DEVICE);
1326 if (ib_dma_mapping_error(sc->ib.dev, response->sge.addr))
1327 return rc;
1328
1329 response->sge.length = sp->max_recv_size;
1330 response->sge.lkey = sc->ib.pd->local_dma_lkey;
1331
1332 response->cqe.done = recv_done;
1333
1334 recv_wr.wr_cqe = &response->cqe;
1335 recv_wr.next = NULL;
1336 recv_wr.sg_list = &response->sge;
1337 recv_wr.num_sge = 1;
1338
1339 rc = ib_post_recv(sc->ib.qp, &recv_wr, NULL);
1340 if (rc) {
1341 ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
1342 response->sge.length, DMA_FROM_DEVICE);
1343 response->sge.length = 0;
1344 smbd_disconnect_rdma_connection(sc);
1345 log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
1346 }
1347
1348 return rc;
1349 }
1350
1351 /* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
1352 static int smbd_negotiate(struct smbdirect_socket *sc)
1353 {
1354 struct smbdirect_socket_parameters *sp = &sc->parameters;
1355 int rc;
1356 struct smbdirect_recv_io *response = get_receive_buffer(sc);
1357
1358 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED);
1359 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
1360
1361 sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP;
1362 rc = smbd_post_recv(sc, response);
1363 log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
1364 rc, response->sge.addr,
1365 response->sge.length, response->sge.lkey);
1366 if (rc) {
1367 put_receive_buffer(sc, response);
1368 return rc;
1369 }
1370
1371 rc = smbd_post_send_negotiate_req(sc);
1372 if (rc)
1373 return rc;
1374
1375 rc = wait_event_interruptible_timeout(
1376 sc->status_wait,
1377 sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
1378 msecs_to_jiffies(sp->negotiate_timeout_msec));
1379 log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc);
1380
1381 if (sc->status == SMBDIRECT_SOCKET_CONNECTED)
1382 return 0;
1383
1384 if (rc == 0)
1385 rc = -ETIMEDOUT;
1386 else if (rc == -ERESTARTSYS)
1387 rc = -EINTR;
1388 else
1389 rc = -ENOTCONN;
1390
1391 return rc;
1392 }
1393
1394 /*
1395 * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
1396 * This is a queue for reassembling upper layer payload and presenting it to the
1397 * upper layer. All incoming payloads go to the reassembly queue, regardless of
1398 * whether reassembly is required. The upper layer code reads from the queue for
1399 * all incoming payloads.
1400 * Put a received packet to the reassembly queue
1401 * response: the packet received
1402 * data_length: the size of payload in this packet
1403 */
1404 static void enqueue_reassembly(
1405 struct smbdirect_socket *sc,
1406 struct smbdirect_recv_io *response,
1407 int data_length)
1408 {
1409 unsigned long flags;
1410
1411 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
1412 list_add_tail(&response->list, &sc->recv_io.reassembly.list);
1413 sc->recv_io.reassembly.queue_length++;
1414 /*
1415 * Make sure reassembly_data_length is updated after list and
1416 * reassembly_queue_length are updated. On the dequeue side
1417 * reassembly_data_length is checked without a lock to determine
1418 * if reassembly_queue_length and list is up to date
1419 */
1420 virt_wmb();
1421 sc->recv_io.reassembly.data_length += data_length;
1422 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
1423 sc->statistics.enqueue_reassembly_queue++;
1424 }
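/*
 * Sketch of the consumer side that the virt_wmb() above pairs with (the
 * actual reader lives in the receive path, not shown here): it checks
 * reassembly.data_length locklessly, issues the matching read barrier,
 * and only then walks the reassembly list, e.g.
 *
 *     if (sc->recv_io.reassembly.data_length >= size) {
 *             virt_rmb();
 *             response = _get_first_reassembly(sc);
 *             ...
 *     }
 */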
1425
1426 /*
1427 * Get the first entry at the front of reassembly queue
1428 * Caller is responsible for locking
1429 * return value: the first entry if any, NULL if queue is empty
1430 */
1431 static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc)
1432 {
1433 struct smbdirect_recv_io *ret = NULL;
1434
1435 if (!list_empty(&sc->recv_io.reassembly.list)) {
1436 ret = list_first_entry(
1437 &sc->recv_io.reassembly.list,
1438 struct smbdirect_recv_io, list);
1439 }
1440 return ret;
1441 }
1442
1443 /*
1444 * Get a receive buffer
1445 * For each remote send, we need to post a receive. The receive buffers are
1446 * pre-allocated in advance.
1447 * return value: the receive buffer, NULL if none is available
1448 */
1449 static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc)
1450 {
1451 struct smbdirect_recv_io *ret = NULL;
1452 unsigned long flags;
1453
1454 spin_lock_irqsave(&sc->recv_io.free.lock, flags);
1455 if (!list_empty(&sc->recv_io.free.list)) {
1456 ret = list_first_entry(
1457 &sc->recv_io.free.list,
1458 struct smbdirect_recv_io, list);
1459 list_del(&ret->list);
1460 sc->statistics.get_receive_buffer++;
1461 }
1462 spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
1463
1464 return ret;
1465 }
1466
1467 /*
1468 * Return a receive buffer
1469 * Upon returning of a receive buffer, we can post new receive and extend
1470 * more receive credits to remote peer. This is done immediately after a
1471 * receive buffer is returned.
1472 */
1473 static void put_receive_buffer(
1474 struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
1475 {
1476 unsigned long flags;
1477
1478 if (likely(response->sge.length != 0)) {
1479 ib_dma_unmap_single(sc->ib.dev,
1480 response->sge.addr,
1481 response->sge.length,
1482 DMA_FROM_DEVICE);
1483 response->sge.length = 0;
1484 }
1485
1486 spin_lock_irqsave(&sc->recv_io.free.lock, flags);
1487 list_add_tail(&response->list, &sc->recv_io.free.list);
1488 sc->statistics.put_receive_buffer++;
1489 spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
1490
1491 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
1492 }
1493
1494 /* Preallocate all receive buffers on transport establishment */
1495 static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf)
1496 {
1497 struct smbdirect_recv_io *response;
1498 int i;
1499
1500 for (i = 0; i < num_buf; i++) {
1501 response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL);
1502 if (!response)
1503 goto allocate_failed;
1504
1505 response->socket = sc;
1506 response->sge.length = 0;
1507 list_add_tail(&response->list, &sc->recv_io.free.list);
1508 }
1509
1510 return 0;
1511
1512 allocate_failed:
1513 while (!list_empty(&sc->recv_io.free.list)) {
1514 response = list_first_entry(
1515 &sc->recv_io.free.list,
1516 struct smbdirect_recv_io, list);
1517 list_del(&response->list);
1518
1519 mempool_free(response, sc->recv_io.mem.pool);
1520 }
1521 return -ENOMEM;
1522 }
1523
1524 static void destroy_receive_buffers(struct smbdirect_socket *sc)
1525 {
1526 struct smbdirect_recv_io *response;
1527
1528 while ((response = get_receive_buffer(sc)))
1529 mempool_free(response, sc->recv_io.mem.pool);
1530 }
1531
1532 static void send_immediate_empty_message(struct work_struct *work)
1533 {
1534 struct smbdirect_socket *sc =
1535 container_of(work, struct smbdirect_socket, idle.immediate_work);
1536
1537 if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
1538 return;
1539
1540 log_keep_alive(INFO, "send an empty message\n");
1541 smbd_post_send_empty(sc);
1542 }
1543
1544 /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
1545 static void idle_connection_timer(struct work_struct *work)
1546 {
1547 struct smbdirect_socket *sc =
1548 container_of(work, struct smbdirect_socket, idle.timer_work.work);
1549 struct smbdirect_socket_parameters *sp = &sc->parameters;
1550
1551 if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
1552 log_keep_alive(ERR,
1553 "error status sc->idle.keepalive=%d\n",
1554 sc->idle.keepalive);
1555 smbd_disconnect_rdma_connection(sc);
1556 return;
1557 }
1558
1559 if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
1560 return;
1561
1562 /*
1563 * Now use the keepalive timeout (instead of keepalive interval)
1564 * in order to wait for a response
1565 */
1566 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
1567 mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
1568 msecs_to_jiffies(sp->keepalive_timeout_msec));
1569 log_keep_alive(INFO, "schedule send of empty idle message\n");
1570 queue_work(sc->workqueue, &sc->idle.immediate_work);
1571 }
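/*
 * Keepalive flow, summarized (timings come from sp->keepalive_*_msec):
 * after an idle keepalive interval this timer fires with the state
 * SMBDIRECT_KEEPALIVE_NONE, moves it to PENDING and schedules an empty
 * message; manage_keep_alive_before_sending() then moves PENDING to SENT
 * and sets SMBDIRECT_FLAG_RESPONSE_REQUESTED; any completed receive in
 * recv_done() resets the state to NONE and re-arms the interval timer.
 * If this timer fires while the state is still PENDING or SENT, the
 * connection is torn down via smbd_disconnect_rdma_connection().
 */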
1572
1573 /*
1574 * Destroy the transport and related RDMA and memory resources
1575 * Need to go through all the pending counters and make sure no one is using
1576 * the transport while it is destroyed
1577 */
1578 void smbd_destroy(struct TCP_Server_Info *server)
1579 {
1580 struct smbd_connection *info = server->smbd_conn;
1581 struct smbdirect_socket *sc;
1582 struct smbdirect_recv_io *response;
1583 unsigned long flags;
1584
1585 if (!info) {
1586 log_rdma_event(INFO, "rdma session already destroyed\n");
1587 return;
1588 }
1589 sc = &info->socket;
1590
1591 log_rdma_event(INFO, "cancelling and disable disconnect_work\n");
1592 disable_work_sync(&sc->disconnect_work);
1593
1594 log_rdma_event(INFO, "destroying rdma session\n");
1595 if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
1596 smbd_disconnect_rdma_work(&sc->disconnect_work);
1597 if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) {
1598 log_rdma_event(INFO, "wait for transport being disconnected\n");
1599 wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
1600 log_rdma_event(INFO, "waited for transport being disconnected\n");
1601 }
1602
1603 /*
1604 * Wake up all waiters in all wait queues
1605 * in order to notice the broken connection.
1606 *
1607 * Most likely this was already called via
1608 * smbd_disconnect_rdma_work(), but call it again...
1609 */
1610 smbd_disconnect_wake_up_all(sc);
1611
1612 log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n");
1613 disable_work_sync(&sc->recv_io.posted.refill_work);
1614
1615 log_rdma_event(INFO, "destroying qp\n");
1616 ib_drain_qp(sc->ib.qp);
1617 rdma_destroy_qp(sc->rdma.cm_id);
1618 sc->ib.qp = NULL;
1619
1620 log_rdma_event(INFO, "cancelling idle timer\n");
1621 disable_delayed_work_sync(&sc->idle.timer_work);
1622 log_rdma_event(INFO, "cancelling send immediate work\n");
1623 disable_work_sync(&sc->idle.immediate_work);
1624
1625 /* It's not possible for upper layer to get to reassembly */
1626 log_rdma_event(INFO, "drain the reassembly queue\n");
1627 do {
1628 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
1629 response = _get_first_reassembly(sc);
1630 if (response) {
1631 list_del(&response->list);
1632 spin_unlock_irqrestore(
1633 &sc->recv_io.reassembly.lock, flags);
1634 put_receive_buffer(sc, response);
1635 } else
1636 spin_unlock_irqrestore(
1637 &sc->recv_io.reassembly.lock, flags);
1638 } while (response);
1639 sc->recv_io.reassembly.data_length = 0;
1640
1641 log_rdma_event(INFO, "free receive buffers\n");
1642 destroy_receive_buffers(sc);
1643
1644 log_rdma_event(INFO, "freeing mr list\n");
1645 destroy_mr_list(sc);
1646
1647 ib_free_cq(sc->ib.send_cq);
1648 ib_free_cq(sc->ib.recv_cq);
1649 ib_dealloc_pd(sc->ib.pd);
1650 rdma_destroy_id(sc->rdma.cm_id);
1651
1652 /* free mempools */
1653 mempool_destroy(sc->send_io.mem.pool);
1654 kmem_cache_destroy(sc->send_io.mem.cache);
1655
1656 mempool_destroy(sc->recv_io.mem.pool);
1657 kmem_cache_destroy(sc->recv_io.mem.cache);
1658
1659 sc->status = SMBDIRECT_SOCKET_DESTROYED;
1660
1661 destroy_workqueue(sc->workqueue);
1662 log_rdma_event(INFO, "rdma session destroyed\n");
1663 kfree(info);
1664 server->smbd_conn = NULL;
1665 }
1666
1667 /*
1668 * Reconnect this SMBD connection, called from upper layer
1669 * return value: 0 on success, or actual error code
1670 */
1671 int smbd_reconnect(struct TCP_Server_Info *server)
1672 {
1673 log_rdma_event(INFO, "reconnecting rdma session\n");
1674
1675 if (!server->smbd_conn) {
1676 log_rdma_event(INFO, "rdma session already destroyed\n");
1677 goto create_conn;
1678 }
1679
1680 /*
1681 * This is possible if transport is disconnected and we haven't received
1682 * notification from RDMA, but upper layer has detected timeout
1683 */
1684 if (server->smbd_conn->socket.status == SMBDIRECT_SOCKET_CONNECTED) {
1685 log_rdma_event(INFO, "disconnecting transport\n");
1686 smbd_destroy(server);
1687 }
1688
1689 create_conn:
1690 log_rdma_event(INFO, "creating rdma session\n");
1691 server->smbd_conn = smbd_get_connection(
1692 server, (struct sockaddr *) &server->dstaddr);
1693
1694 if (server->smbd_conn) {
1695 cifs_dbg(VFS, "RDMA transport re-established\n");
1696 trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
1697 return 0;
1698 }
1699 trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
1700 return -ENOENT;
1701 }
1702
1703 static void destroy_caches(struct smbdirect_socket *sc)
1704 {
1705 destroy_receive_buffers(sc);
1706 mempool_destroy(sc->recv_io.mem.pool);
1707 kmem_cache_destroy(sc->recv_io.mem.cache);
1708 mempool_destroy(sc->send_io.mem.pool);
1709 kmem_cache_destroy(sc->send_io.mem.cache);
1710 }
1711
1712 #define MAX_NAME_LEN 80
1713 static int allocate_caches(struct smbdirect_socket *sc)
1714 {
1715 struct smbdirect_socket_parameters *sp = &sc->parameters;
1716 char name[MAX_NAME_LEN];
1717 int rc;
1718
1719 if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer)))
1720 return -ENOMEM;
1721
1722 scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc);
1723 sc->send_io.mem.cache =
1724 kmem_cache_create(
1725 name,
1726 sizeof(struct smbdirect_send_io) +
1727 sizeof(struct smbdirect_data_transfer),
1728 0, SLAB_HWCACHE_ALIGN, NULL);
1729 if (!sc->send_io.mem.cache)
1730 return -ENOMEM;
1731
1732 sc->send_io.mem.pool =
1733 mempool_create(sp->send_credit_target, mempool_alloc_slab,
1734 mempool_free_slab, sc->send_io.mem.cache);
1735 if (!sc->send_io.mem.pool)
1736 goto out1;
1737
1738 scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc);
1739
1740 struct kmem_cache_args response_args = {
1741 .align = __alignof__(struct smbdirect_recv_io),
1742 .useroffset = (offsetof(struct smbdirect_recv_io, packet) +
1743 sizeof(struct smbdirect_data_transfer)),
1744 .usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer),
1745 };
1746 sc->recv_io.mem.cache =
1747 kmem_cache_create(name,
1748 sizeof(struct smbdirect_recv_io) + sp->max_recv_size,
1749 &response_args, SLAB_HWCACHE_ALIGN);
1750 if (!sc->recv_io.mem.cache)
1751 goto out2;
1752
1753 sc->recv_io.mem.pool =
1754 mempool_create(sp->recv_credit_max, mempool_alloc_slab,
1755 mempool_free_slab, sc->recv_io.mem.cache);
1756 if (!sc->recv_io.mem.pool)
1757 goto out3;
1758
1759 rc = allocate_receive_buffers(sc, sp->recv_credit_max);
1760 if (rc) {
1761 log_rdma_event(ERR, "failed to allocate receive buffers\n");
1762 goto out4;
1763 }
1764
1765 return 0;
1766
1767 out4:
1768 mempool_destroy(sc->recv_io.mem.pool);
1769 out3:
1770 kmem_cache_destroy(sc->recv_io.mem.cache);
1771 out2:
1772 mempool_destroy(sc->send_io.mem.pool);
1773 out1:
1774 kmem_cache_destroy(sc->send_io.mem.cache);
1775 return -ENOMEM;
1776 }
1777
1778 /* Create a SMBD connection, called by upper layer */
1779 static struct smbd_connection *_smbd_get_connection(
1780 struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
1781 {
1782 int rc;
1783 struct smbd_connection *info;
1784 struct smbdirect_socket *sc;
1785 struct smbdirect_socket_parameters *sp;
1786 struct rdma_conn_param conn_param;
1787 struct ib_qp_cap qp_cap;
1788 struct ib_qp_init_attr qp_attr;
1789 struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
1790 struct ib_port_immutable port_immutable;
1791 __be32 ird_ord_hdr[2];
1792 char wq_name[80];
1793 struct workqueue_struct *workqueue;
1794
1795 info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
1796 if (!info)
1797 return NULL;
1798 sc = &info->socket;
1799 scnprintf(wq_name, ARRAY_SIZE(wq_name), "smbd_%p", sc);
1800 workqueue = create_workqueue(wq_name);
1801 if (!workqueue)
1802 goto create_wq_failed;
1803 smbdirect_socket_init(sc);
1804 sc->workqueue = workqueue;
1805 sp = &sc->parameters;
1806
1807 INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work);
1808
1809 sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
1810 sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
1811 sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
1812 sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000;
1813 sp->initiator_depth = 1;
1814 sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;
1815 sp->recv_credit_max = smbd_receive_credit_max;
1816 sp->send_credit_target = smbd_send_credit_target;
1817 sp->max_send_size = smbd_max_send_size;
1818 sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
1819 sp->max_recv_size = smbd_max_receive_size;
1820 sp->max_frmr_depth = smbd_max_frmr_depth;
1821 sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
1822 sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;
1823
1824 rc = smbd_ia_open(sc, dstaddr, port);
1825 if (rc) {
1826 log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
1827 goto create_id_failed;
1828 }
1829
1830 if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe ||
1831 sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
1832 log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
1833 sp->send_credit_target,
1834 sc->ib.dev->attrs.max_cqe,
1835 sc->ib.dev->attrs.max_qp_wr);
1836 goto config_failed;
1837 }
1838
1839 if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe ||
1840 sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) {
1841 log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
1842 sp->recv_credit_max,
1843 sc->ib.dev->attrs.max_cqe,
1844 sc->ib.dev->attrs.max_qp_wr);
1845 goto config_failed;
1846 }
1847
1848 if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE ||
1849 sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
1850 log_rdma_event(ERR,
1851 "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
1852 IB_DEVICE_NAME_MAX,
1853 sc->ib.dev->name,
1854 sc->ib.dev->attrs.max_send_sge,
1855 sc->ib.dev->attrs.max_recv_sge);
1856 goto config_failed;
1857 }
1858
1859 sp->responder_resources =
1860 min_t(u8, sp->responder_resources,
1861 sc->ib.dev->attrs.max_qp_rd_atom);
1862 log_rdma_mr(INFO, "responder_resources=%d\n",
1863 sp->responder_resources);
1864
1865 /*
1866 * We allocate sp->responder_resources * 2 MRs
1867 * and each MR needs WRs for REG and INV, so
1868 * we use '* 4'.
1869 *
1870 * +1 for ib_drain_qp()
1871 */
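/*
 * A rough worked example of the sizing above (the numbers are purely
 * illustrative, not guaranteed negotiated values): with send_credit_target
 * of 255, responder_resources of 32 and recv_credit_max of 255 this gives
 * max_send_wr = 255 + 32 * 4 + 1 = 384 and max_recv_wr = 255 + 1 = 256.
 */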
1872 memset(&qp_cap, 0, sizeof(qp_cap));
1873 qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1;
1874 qp_cap.max_recv_wr = sp->recv_credit_max + 1;
1875 qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
1876 qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
1877
1878 sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
1879 if (IS_ERR(sc->ib.pd)) {
1880 rc = PTR_ERR(sc->ib.pd);
1881 sc->ib.pd = NULL;
1882 log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
1883 goto alloc_pd_failed;
1884 }
1885
1886 sc->ib.send_cq =
1887 ib_alloc_cq_any(sc->ib.dev, sc,
1888 qp_cap.max_send_wr, IB_POLL_SOFTIRQ);
1889 if (IS_ERR(sc->ib.send_cq)) {
1890 sc->ib.send_cq = NULL;
1891 goto alloc_cq_failed;
1892 }
1893
1894 sc->ib.recv_cq =
1895 ib_alloc_cq_any(sc->ib.dev, sc,
1896 qp_cap.max_recv_wr, IB_POLL_SOFTIRQ);
1897 if (IS_ERR(sc->ib.recv_cq)) {
1898 sc->ib.recv_cq = NULL;
1899 goto alloc_cq_failed;
1900 }
1901
1902 memset(&qp_attr, 0, sizeof(qp_attr));
1903 qp_attr.event_handler = smbd_qp_async_error_upcall;
1904 qp_attr.qp_context = sc;
1905 qp_attr.cap = qp_cap;
1906 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1907 qp_attr.qp_type = IB_QPT_RC;
1908 qp_attr.send_cq = sc->ib.send_cq;
1909 qp_attr.recv_cq = sc->ib.recv_cq;
1910 qp_attr.port_num = ~0;
1911
1912 rc = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
1913 if (rc) {
1914 log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
1915 goto create_qp_failed;
1916 }
1917 sc->ib.qp = sc->rdma.cm_id->qp;
1918
1919 memset(&conn_param, 0, sizeof(conn_param));
1920 conn_param.initiator_depth = sp->initiator_depth;
1921 conn_param.responder_resources = sp->responder_resources;
1922
1923 /* Need to send IRD/ORD in private data for iWARP */
1924 sc->ib.dev->ops.get_port_immutable(
1925 sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable);
1926 if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
1927 ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
1928 ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
1929 conn_param.private_data = ird_ord_hdr;
1930 conn_param.private_data_len = sizeof(ird_ord_hdr);
1931 } else {
1932 conn_param.private_data = NULL;
1933 conn_param.private_data_len = 0;
1934 }
1935
1936 conn_param.retry_count = SMBD_CM_RETRY;
1937 conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
1938 conn_param.flow_control = 0;
1939
1940 log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
1941 &addr_in->sin_addr, port);
1942
1943 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
1944 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
1945 rc = rdma_connect(sc->rdma.cm_id, &conn_param);
1946 if (rc) {
1947 log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
1948 goto rdma_connect_failed;
1949 }
1950
1951 wait_event_interruptible_timeout(
1952 sc->status_wait,
1953 sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING,
1954 msecs_to_jiffies(sp->rdma_connect_timeout_msec));
1955
1956 if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) {
1957 log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
1958 goto rdma_connect_failed;
1959 }
1960
1961 log_rdma_event(INFO, "rdma_connect connected\n");
1962
1963 rc = allocate_caches(sc);
1964 if (rc) {
1965 log_rdma_event(ERR, "cache allocation failed\n");
1966 goto allocate_cache_failed;
1967 }
1968
1969 INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message);
1970 INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer);
1971 /*
1972 * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
1973 * so that the timer will cause a disconnect.
1974 */
1975 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
1976 mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
1977 msecs_to_jiffies(sp->negotiate_timeout_msec));
1978
1979 INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits);
1980
1981 rc = smbd_negotiate(sc);
1982 if (rc) {
1983 log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
1984 goto negotiation_failed;
1985 }
1986
1987 rc = allocate_mr_list(sc);
1988 if (rc) {
1989 log_rdma_mr(ERR, "memory registration allocation failed\n");
1990 goto allocate_mr_failed;
1991 }
1992
1993 return info;
1994
1995 allocate_mr_failed:
1996 /* At this point, we need a full transport shutdown */
1997 server->smbd_conn = info;
1998 smbd_destroy(server);
1999 return NULL;
2000
2001 negotiation_failed:
2002 disable_delayed_work_sync(&sc->idle.timer_work);
2003 destroy_caches(sc);
2004 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
2005 rdma_disconnect(sc->rdma.cm_id);
2006 wait_event(sc->status_wait,
2007 sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
2008
2009 allocate_cache_failed:
2010 rdma_connect_failed:
2011 rdma_destroy_qp(sc->rdma.cm_id);
2012
2013 create_qp_failed:
2014 alloc_cq_failed:
2015 if (sc->ib.send_cq)
2016 ib_free_cq(sc->ib.send_cq);
2017 if (sc->ib.recv_cq)
2018 ib_free_cq(sc->ib.recv_cq);
2019
2020 ib_dealloc_pd(sc->ib.pd);
2021
2022 alloc_pd_failed:
2023 config_failed:
2024 rdma_destroy_id(sc->rdma.cm_id);
2025
2026 create_id_failed:
2027 destroy_workqueue(sc->workqueue);
2028 create_wq_failed:
2029 kfree(info);
2030 return NULL;
2031 }
2032
2033 struct smbd_connection *smbd_get_connection(
2034 struct TCP_Server_Info *server, struct sockaddr *dstaddr)
2035 {
2036 struct smbd_connection *ret;
2037 const struct smbdirect_socket_parameters *sp;
2038 int port = SMBD_PORT;
2039
2040 try_again:
2041 ret = _smbd_get_connection(server, dstaddr, port);
2042
2043 /* Try SMB_PORT if SMBD_PORT doesn't work */
2044 if (!ret && port == SMBD_PORT) {
2045 port = SMB_PORT;
2046 goto try_again;
2047 }
2048 if (!ret)
2049 return NULL;
2050
2051 sp = &ret->socket.parameters;
2052
2053 server->rdma_readwrite_threshold =
2054 rdma_readwrite_threshold > sp->max_fragmented_send_size ?
2055 sp->max_fragmented_send_size :
2056 rdma_readwrite_threshold;
2057
2058 return ret;
2059 }
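/*
 * A rough sketch of how the upper layer is expected to drive this transport,
 * based only on the entry points in this file (error handling omitted,
 * variable names illustrative):
 *
 *	server->smbd_conn = smbd_get_connection(server, dstaddr);
 *	...
 *	rc = smbd_send(server, num_rqst, rqst_array);	// requests out
 *	rc = smbd_recv(server->smbd_conn, &msg);	// responses in
 *	...
 *	rc = smbd_reconnect(server);			// on connection loss
 *	...
 *	smbd_destroy(server);				// final teardown
 */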
2060
2061 /*
2062 * Receive data from the transport's receive reassembly queue
2063 * All the incoming data packets are placed in reassembly queue
2064 * iter: the buffer to read data into
2065 * size: the length of data to read
2066 * return value: actual data read
2067 *
2068 * Note: this implementation copies the data from reassembly queue to receive
2069 * buffers used by upper layer. This is not the optimal code path. A better way
2070 * to do it is to not have upper layer allocate its receive buffers but rather
2071 * borrow the buffer from reassembly queue, and return it after data is
2072 * consumed. But this will require more changes to upper layer code, and also
2073 * needs to consider packet boundaries while they are still being reassembled.
2074 */
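/*
 * A minimal usage sketch (the kvec and length names are purely illustrative,
 * error handling omitted): a caller typically issues one 4-byte read to pick
 * up the RFC1002 length synthesized below, then reads the actual payload:
 *
 *	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &kv_len4, 1, 4);
 *	rc = smbd_recv(info, &msg);		// returns the 4-byte length
 *	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &kv_body, 1, pdu_len);
 *	rc = smbd_recv(info, &msg);		// returns the payload bytes
 */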
2075 int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
2076 {
2077 struct smbdirect_socket *sc = &info->socket;
2078 struct smbdirect_recv_io *response;
2079 struct smbdirect_data_transfer *data_transfer;
2080 size_t size = iov_iter_count(&msg->msg_iter);
2081 int to_copy, to_read, data_read, offset;
2082 u32 data_length, remaining_data_length, data_offset;
2083 int rc;
2084
2085 if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE))
2086 return -EINVAL; /* It's a bug in upper layer to get there */
2087
2088 again:
2089 /*
2090 * No need to hold the reassembly queue lock all the time as we are
2091 * the only one reading from the front of the queue. The transport
2092 * may add more entries to the back of the queue at the same time
2093 */
2094 log_read(INFO, "size=%zd sc->recv_io.reassembly.data_length=%d\n", size,
2095 sc->recv_io.reassembly.data_length);
2096 if (sc->recv_io.reassembly.data_length >= size) {
2097 int queue_length;
2098 int queue_removed = 0;
2099 unsigned long flags;
2100
2101 /*
2102 * Need to make sure reassembly_data_length is read before
2103 * reading reassembly_queue_length and calling
2104 * _get_first_reassembly. This call is lock free
2105 * as we never read at the end of the queue, which is being
2106 * updated in SOFTIRQ as more data is received
2107 */
2108 virt_rmb();
2109 queue_length = sc->recv_io.reassembly.queue_length;
2110 data_read = 0;
2111 to_read = size;
2112 offset = sc->recv_io.reassembly.first_entry_offset;
2113 while (data_read < size) {
2114 response = _get_first_reassembly(sc);
2115 data_transfer = smbdirect_recv_io_payload(response);
2116 data_length = le32_to_cpu(data_transfer->data_length);
2117 remaining_data_length =
2118 le32_to_cpu(
2119 data_transfer->remaining_data_length);
2120 data_offset = le32_to_cpu(data_transfer->data_offset);
2121
2122 /*
2123 * The upper layer expects RFC1002 length at the
2124 * beginning of the payload. Return it to indicate
2125 * the total length of the packet. This minimizes the
2126 * change to upper layer packet processing logic. This
2127 * will eventually be removed when an intermediate
2128 * transport layer is added
2129 */
2130 if (response->first_segment && size == 4) {
2131 unsigned int rfc1002_len =
2132 data_length + remaining_data_length;
2133 __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len);
2134 if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr),
2135 &msg->msg_iter) != sizeof(rfc1002_hdr))
2136 return -EFAULT;
2137 data_read = 4;
2138 response->first_segment = false;
2139 log_read(INFO, "returning rfc1002 length %d\n",
2140 rfc1002_len);
2141 goto read_rfc1002_done;
2142 }
2143
2144 to_copy = min_t(int, data_length - offset, to_read);
2145 if (copy_to_iter((char *)data_transfer + data_offset + offset,
2146 to_copy, &msg->msg_iter) != to_copy)
2147 return -EFAULT;
2148
2149 /* move on to the next buffer? */
2150 if (to_copy == data_length - offset) {
2151 queue_length--;
2152 /*
2153 * No need to lock if we are not at the
2154 * end of the queue
2155 */
2156 if (queue_length)
2157 list_del(&response->list);
2158 else {
2159 spin_lock_irqsave(
2160 &sc->recv_io.reassembly.lock, flags);
2161 list_del(&response->list);
2162 spin_unlock_irqrestore(
2163 &sc->recv_io.reassembly.lock, flags);
2164 }
2165 queue_removed++;
2166 sc->statistics.dequeue_reassembly_queue++;
2167 put_receive_buffer(sc, response);
2168 offset = 0;
2169 log_read(INFO, "put_receive_buffer offset=0\n");
2170 } else
2171 offset += to_copy;
2172
2173 to_read -= to_copy;
2174 data_read += to_copy;
2175
2176 log_read(INFO, "_get_first_reassembly memcpy %d bytes data_transfer_length-offset=%d after that to_read=%d data_read=%d offset=%d\n",
2177 to_copy, data_length - offset,
2178 to_read, data_read, offset);
2179 }
2180
2181 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
2182 sc->recv_io.reassembly.data_length -= data_read;
2183 sc->recv_io.reassembly.queue_length -= queue_removed;
2184 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
2185
2186 sc->recv_io.reassembly.first_entry_offset = offset;
2187 log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
2188 data_read, sc->recv_io.reassembly.data_length,
2189 sc->recv_io.reassembly.first_entry_offset);
2190 read_rfc1002_done:
2191 return data_read;
2192 }
2193
2194 log_read(INFO, "wait_event on more data\n");
2195 rc = wait_event_interruptible(
2196 sc->recv_io.reassembly.wait_queue,
2197 sc->recv_io.reassembly.data_length >= size ||
2198 sc->status != SMBDIRECT_SOCKET_CONNECTED);
2199 /* Don't return any data if interrupted */
2200 if (rc)
2201 return rc;
2202
2203 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
2204 log_read(ERR, "disconnected\n");
2205 return -ECONNABORTED;
2206 }
2207
2208 goto again;
2209 }
2210
2211 /*
2212 * Send data to transport
2213 * Each rqst is transported as a SMBDirect payload
2214 * rqst: the data to write
2215 * return value: 0 on successful write, otherwise error code
2216 */
2217 int smbd_send(struct TCP_Server_Info *server,
2218 int num_rqst, struct smb_rqst *rqst_array)
2219 {
2220 struct smbd_connection *info = server->smbd_conn;
2221 struct smbdirect_socket *sc = &info->socket;
2222 struct smbdirect_socket_parameters *sp = &sc->parameters;
2223 struct smb_rqst *rqst;
2224 struct iov_iter iter;
2225 unsigned int remaining_data_length, klen;
2226 int rc, i, rqst_idx;
2227
2228 if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
2229 return -EAGAIN;
2230
2231 /*
2232 * Add in the page array if there is one. The caller needs to set
2233 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
2234 * ends at page boundary
2235 */
2236 remaining_data_length = 0;
2237 for (i = 0; i < num_rqst; i++)
2238 remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
2239
2240 if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) {
2241 /* assertion: payload never exceeds negotiated maximum */
2242 log_write(ERR, "payload size %d > max size %d\n",
2243 remaining_data_length, sp->max_fragmented_send_size);
2244 return -EINVAL;
2245 }
2246
2247 log_write(INFO, "num_rqst=%d total length=%u\n",
2248 num_rqst, remaining_data_length);
2249
2250 rqst_idx = 0;
2251 do {
2252 rqst = &rqst_array[rqst_idx];
2253
2254 cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
2255 rqst_idx, smb_rqst_len(server, rqst));
2256 for (i = 0; i < rqst->rq_nvec; i++)
2257 dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len);
2258
2259 log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n",
2260 rqst_idx, rqst->rq_nvec, remaining_data_length,
2261 iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst));
2262
2263 /* Send the metadata pages. */
2264 klen = 0;
2265 for (i = 0; i < rqst->rq_nvec; i++)
2266 klen += rqst->rq_iov[i].iov_len;
2267 iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
2268
2269 rc = smbd_post_send_full_iter(sc, &iter, &remaining_data_length);
2270 if (rc < 0)
2271 break;
2272
2273 if (iov_iter_count(&rqst->rq_iter) > 0) {
2274 /* And then the data pages if there are any */
2275 rc = smbd_post_send_full_iter(sc, &rqst->rq_iter,
2276 &remaining_data_length);
2277 if (rc < 0)
2278 break;
2279 }
2280
2281 } while (++rqst_idx < num_rqst);
2282
2283 /*
2284 * As an optimization, we don't wait for individual I/O to finish
2285 * before sending the next one.
2286 * Send them all and wait for the pending send count to get to 0,
2287 * which means all the I/Os have been sent out and we are good to return
2288 */
2289
2290 wait_event(sc->send_io.pending.zero_wait_queue,
2291 atomic_read(&sc->send_io.pending.count) == 0 ||
2292 sc->status != SMBDIRECT_SOCKET_CONNECTED);
2293
2294 if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0)
2295 rc = -EAGAIN;
2296
2297 return rc;
2298 }
2299
2300 static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
2301 {
2302 struct smbdirect_mr_io *mr =
2303 container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
2304 struct smbdirect_socket *sc = mr->socket;
2305
2306 if (wc->status) {
2307 log_rdma_mr(ERR, "status=%d\n", wc->status);
2308 smbd_disconnect_rdma_connection(sc);
2309 }
2310 }
2311
2312 /*
2313 * The work queue function that recovers MRs
2314 * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
2315 * again. Both calls are slow, so finish them in a workqueue. This will not
2316 * block the I/O path.
2317 * There is one workqueue that recovers MRs, so there is no need to lock as the
2318 * I/O requests calling smbd_register_mr will never update the links in the
2319 * mr_list.
2320 */
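/*
 * Sketch of the MR life cycle as implemented in this file (state names as
 * used below): SMBDIRECT_MR_READY -> SMBDIRECT_MR_REGISTERED (get_mr() via
 * smbd_register_mr()) -> SMBDIRECT_MR_INVALIDATED or SMBDIRECT_MR_ERROR
 * (smbd_deregister_mr() or a failed work request) -> back to
 * SMBDIRECT_MR_READY (either directly or via this recovery work), and
 * finally SMBDIRECT_MR_DISABLED on teardown.
 */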
2321 static void smbd_mr_recovery_work(struct work_struct *work)
2322 {
2323 struct smbdirect_socket *sc =
2324 container_of(work, struct smbdirect_socket, mr_io.recovery_work);
2325 struct smbdirect_socket_parameters *sp = &sc->parameters;
2326 struct smbdirect_mr_io *smbdirect_mr;
2327 int rc;
2328
2329 list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) {
2330 if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) {
2331
2332 /* recover this MR entry */
2333 rc = ib_dereg_mr(smbdirect_mr->mr);
2334 if (rc) {
2335 log_rdma_mr(ERR,
2336 "ib_dereg_mr failed rc=%x\n",
2337 rc);
2338 smbd_disconnect_rdma_connection(sc);
2339 continue;
2340 }
2341
2342 smbdirect_mr->mr = ib_alloc_mr(
2343 sc->ib.pd, sc->mr_io.type,
2344 sp->max_frmr_depth);
2345 if (IS_ERR(smbdirect_mr->mr)) {
2346 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
2347 sc->mr_io.type,
2348 sp->max_frmr_depth);
2349 smbd_disconnect_rdma_connection(sc);
2350 continue;
2351 }
2352 } else
2353 /* This MR is being used, don't recover it */
2354 continue;
2355
2356 smbdirect_mr->state = SMBDIRECT_MR_READY;
2357
2358 /* smbdirect_mr->state is updated by this function
2359 * and is read and updated by I/O issuing CPUs trying
2360 * to get an MR, the call to atomic_inc_return
2361 * implies a memory barrier and guarantees this
2362 * value is updated before waking up any calls to
2363 * get_mr() from the I/O issuing CPUs
2364 */
2365 if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
2366 wake_up(&sc->mr_io.ready.wait_queue);
2367 }
2368 }
2369
2370 static void smbd_mr_disable_locked(struct smbdirect_mr_io *mr)
2371 {
2372 struct smbdirect_socket *sc = mr->socket;
2373
2374 lockdep_assert_held(&mr->mutex);
2375
2376 if (mr->state == SMBDIRECT_MR_DISABLED)
2377 return;
2378
2379 if (mr->mr)
2380 ib_dereg_mr(mr->mr);
2381 if (mr->sgt.nents)
2382 ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2383 kfree(mr->sgt.sgl);
2384
2385 mr->mr = NULL;
2386 mr->sgt.sgl = NULL;
2387 mr->sgt.nents = 0;
2388
2389 mr->state = SMBDIRECT_MR_DISABLED;
2390 }
2391
2392 static void smbd_mr_free_locked(struct kref *kref)
2393 {
2394 struct smbdirect_mr_io *mr =
2395 container_of(kref, struct smbdirect_mr_io, kref);
2396
2397 lockdep_assert_held(&mr->mutex);
2398
2399 /*
2400 * smbd_mr_disable_locked() should already be called!
2401 */
2402 if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED))
2403 smbd_mr_disable_locked(mr);
2404
2405 mutex_unlock(&mr->mutex);
2406 mutex_destroy(&mr->mutex);
2407 kfree(mr);
2408 }
2409
2410 static void destroy_mr_list(struct smbdirect_socket *sc)
2411 {
2412 struct smbdirect_mr_io *mr, *tmp;
2413 LIST_HEAD(all_list);
2414 unsigned long flags;
2415
2416 disable_work_sync(&sc->mr_io.recovery_work);
2417
2418 spin_lock_irqsave(&sc->mr_io.all.lock, flags);
2419 list_splice_tail_init(&sc->mr_io.all.list, &all_list);
2420 spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
2421
2422 list_for_each_entry_safe(mr, tmp, &all_list, list) {
2423 mutex_lock(&mr->mutex);
2424
2425 smbd_mr_disable_locked(mr);
2426 list_del(&mr->list);
2427 mr->socket = NULL;
2428
2429 /*
2430 * No kref_put_mutex() as it's already locked.
2431 *
2432 * If smbd_mr_free_locked() is called
2433 * and the mutex is unlocked and mr is gone,
2434 * in that case kref_put() returned 1.
2435 *
2436 * If kref_put() returned 0 we know that
2437 * smbd_mr_free_locked() didn't
2438 * run. Not by us nor by anyone else, as we
2439 * still hold the mutex, so we need to unlock.
2440 *
2441 * If the mr is still registered it will
2442 * be dangling (detached from the connection),
2443 * waiting for smbd_deregister_mr() to be
2444 * called in order to free the memory.
2445 */
2446 if (!kref_put(&mr->kref, smbd_mr_free_locked))
2447 mutex_unlock(&mr->mutex);
2448 }
2449 }
2450
2451 /*
2452 * Allocate MRs used for RDMA read/write
2453 * The number of MRs will not exceed hardware capability in responder_resources
2454 * All MRs are kept in mr_list. The MR can be recovered after it's used
2455 * Recovery is done in smbd_mr_recovery_work. The content of list entry changes
2456 * as MRs are used and recovered for I/O, but the list links will not change
2457 */
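/*
 * For example (assuming responder_resources was negotiated as 32, an
 * illustrative value): the loop below creates 32 * 2 = 64 MRs, each backed
 * by a scatterlist of up to max_frmr_depth entries.
 */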
2458 static int allocate_mr_list(struct smbdirect_socket *sc)
2459 {
2460 struct smbdirect_socket_parameters *sp = &sc->parameters;
2461 struct smbdirect_mr_io *mr;
2462 int ret;
2463 u32 i;
2464
2465 if (sp->responder_resources == 0) {
2466 log_rdma_mr(ERR, "responder_resources negotiated as 0\n");
2467 return -EINVAL;
2468 }
2469
2470 /* Allocate more MRs (2x) than hardware responder_resources */
2471 for (i = 0; i < sp->responder_resources * 2; i++) {
2472 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2473 if (!mr) {
2474 ret = -ENOMEM;
2475 goto kzalloc_mr_failed;
2476 }
2477
2478 kref_init(&mr->kref);
2479 mutex_init(&mr->mutex);
2480
2481 mr->mr = ib_alloc_mr(sc->ib.pd,
2482 sc->mr_io.type,
2483 sp->max_frmr_depth);
2484 if (IS_ERR(mr->mr)) {
2485 ret = PTR_ERR(mr->mr);
2486 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
2487 sc->mr_io.type, sp->max_frmr_depth);
2488 goto ib_alloc_mr_failed;
2489 }
2490
2491 mr->sgt.sgl = kcalloc(sp->max_frmr_depth,
2492 sizeof(struct scatterlist),
2493 GFP_KERNEL);
2494 if (!mr->sgt.sgl) {
2495 ret = -ENOMEM;
2496 log_rdma_mr(ERR, "failed to allocate sgl\n");
2497 goto kcalloc_sgl_failed;
2498 }
2499 mr->state = SMBDIRECT_MR_READY;
2500 mr->socket = sc;
2501
2502 list_add_tail(&mr->list, &sc->mr_io.all.list);
2503 atomic_inc(&sc->mr_io.ready.count);
2504 }
2505
2506 INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work);
2507
2508 return 0;
2509
2510 kcalloc_sgl_failed:
2511 ib_dereg_mr(mr->mr);
2512 ib_alloc_mr_failed:
2513 mutex_destroy(&mr->mutex);
2514 kfree(mr);
2515 kzalloc_mr_failed:
2516 destroy_mr_list(sc);
2517 return ret;
2518 }
2519
2520 /*
2521 * Get an MR from mr_list. This function waits until there is at least one
2522 * MR available in the list. It may access the list while the
2523 * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
2524 * as they never modify the same places. However, there may be several CPUs
2525 * issuing I/O trying to get an MR at the same time; mr_io.all.lock is used to
2526 * protect this situation.
2527 */
2528 static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc)
2529 {
2530 struct smbdirect_mr_io *ret;
2531 unsigned long flags;
2532 int rc;
2533 again:
2534 rc = wait_event_interruptible(sc->mr_io.ready.wait_queue,
2535 atomic_read(&sc->mr_io.ready.count) ||
2536 sc->status != SMBDIRECT_SOCKET_CONNECTED);
2537 if (rc) {
2538 log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
2539 return NULL;
2540 }
2541
2542 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
2543 log_rdma_mr(ERR, "sc->status=%x\n", sc->status);
2544 return NULL;
2545 }
2546
2547 spin_lock_irqsave(&sc->mr_io.all.lock, flags);
2548 list_for_each_entry(ret, &sc->mr_io.all.list, list) {
2549 if (ret->state == SMBDIRECT_MR_READY) {
2550 ret->state = SMBDIRECT_MR_REGISTERED;
2551 kref_get(&ret->kref);
2552 spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
2553 atomic_dec(&sc->mr_io.ready.count);
2554 atomic_inc(&sc->mr_io.used.count);
2555 return ret;
2556 }
2557 }
2558
2559 spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
2560 /*
2561 * It is possible that we could fail to get an MR because other processes may
2562 * try to acquire an MR at the same time. If this is the case, retry it.
2563 */
2564 goto again;
2565 }
2566
2567 /*
2568 * Transcribe the pages from an iterator into an MR scatterlist.
2569 */
2570 static int smbd_iter_to_mr(struct iov_iter *iter,
2571 struct sg_table *sgt,
2572 unsigned int max_sg)
2573 {
2574 int ret;
2575
2576 memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
2577
2578 ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
2579 WARN_ON(ret < 0);
2580 if (sgt->nents > 0)
2581 sg_mark_end(&sgt->sgl[sgt->nents - 1]);
2582 return ret;
2583 }
2584
2585 /*
2586 * Register memory for RDMA read/write
2587 * iter: the buffer to register memory with
2588 * writing: true if this is a RDMA write (SMB read), false for RDMA read
2589 * need_invalidate: true if this MR needs to be locally invalidated after I/O
2590 * return value: the MR registered, NULL if failed.
2591 */
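/*
 * A rough caller-side sketch (the SMB2 layer is the real consumer; the local
 * names here are illustrative and error handling is omitted):
 *
 *	mr = smbd_register_mr(server->smbd_conn, &rqst->rq_iter,
 *			      writing, need_invalidate);
 *	... advertise mr->mr->rkey and the registered range to the peer in
 *	... the request's RDMA channel descriptor, wait for the response ...
 *	smbd_deregister_mr(mr);
 */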
2592 struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
2593 struct iov_iter *iter,
2594 bool writing, bool need_invalidate)
2595 {
2596 struct smbdirect_socket *sc = &info->socket;
2597 struct smbdirect_socket_parameters *sp = &sc->parameters;
2598 struct smbdirect_mr_io *mr;
2599 int rc, num_pages;
2600 struct ib_reg_wr *reg_wr;
2601
2602 num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
2603 if (num_pages > sp->max_frmr_depth) {
2604 log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
2605 num_pages, sp->max_frmr_depth);
2606 WARN_ON_ONCE(1);
2607 return NULL;
2608 }
2609
2610 mr = get_mr(sc);
2611 if (!mr) {
2612 log_rdma_mr(ERR, "get_mr returning NULL\n");
2613 return NULL;
2614 }
2615
2616 mutex_lock(&mr->mutex);
2617
2618 mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
2619 mr->need_invalidate = need_invalidate;
2620 mr->sgt.nents = 0;
2621 mr->sgt.orig_nents = 0;
2622
2623 log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
2624 num_pages, iov_iter_count(iter), sp->max_frmr_depth);
2625 smbd_iter_to_mr(iter, &mr->sgt, sp->max_frmr_depth);
2626
2627 rc = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2628 if (!rc) {
2629 log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
2630 num_pages, mr->dir, rc);
2631 goto dma_map_error;
2632 }
2633
2634 rc = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
2635 if (rc != mr->sgt.nents) {
2636 log_rdma_mr(ERR,
2637 "ib_map_mr_sg failed rc = %d nents = %x\n",
2638 rc, mr->sgt.nents);
2639 goto map_mr_error;
2640 }
2641
2642 ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
2643 reg_wr = &mr->wr;
2644 reg_wr->wr.opcode = IB_WR_REG_MR;
2645 mr->cqe.done = register_mr_done;
2646 reg_wr->wr.wr_cqe = &mr->cqe;
2647 reg_wr->wr.num_sge = 0;
2648 reg_wr->wr.send_flags = IB_SEND_SIGNALED;
2649 reg_wr->mr = mr->mr;
2650 reg_wr->key = mr->mr->rkey;
2651 reg_wr->access = writing ?
2652 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
2653 IB_ACCESS_REMOTE_READ;
2654
2655 /*
2656 * There is no need to wait for completion of ib_post_send
2657 * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
2658 * on the next ib_post_send when we actually send I/O to remote peer
2659 */
2660 rc = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
2661 if (!rc) {
2662 /*
2663 * get_mr() gave us a reference
2664 * via kref_get(&mr->kref), we keep that and let
2665 * the caller use smbd_deregister_mr()
2666 * to remove it again.
2667 */
2668 mutex_unlock(&mr->mutex);
2669 return mr;
2670 }
2671
2672 log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
2673 rc, reg_wr->key);
2674
2675 /* If all failed, attempt to recover this MR by setting it to SMBDIRECT_MR_ERROR */
2676 map_mr_error:
2677 ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2678
2679 dma_map_error:
2680 mr->sgt.nents = 0;
2681 mr->state = SMBDIRECT_MR_ERROR;
2682 if (atomic_dec_and_test(&sc->mr_io.used.count))
2683 wake_up(&sc->mr_io.cleanup.wait_queue);
2684
2685 smbd_disconnect_rdma_connection(sc);
2686
2687 /*
2688 * get_mr() gave us a reference
2689 * via kref_get(&mr->kref), we need to remove it again
2690 * on error.
2691 *
2692 * No kref_put_mutex() as it's already locked.
2693 *
2694 * If smbd_mr_free_locked() is called
2695 * and the mutex is unlocked and mr is gone,
2696 * in that case kref_put() returned 1.
2697 *
2698 * If kref_put() returned 0 we know that
2699 * smbd_mr_free_locked() didn't
2700 * run. Not by us nor by anyone else, as we
2701 * still hold the mutex, so we need to unlock.
2702 */
2703 if (!kref_put(&mr->kref, smbd_mr_free_locked))
2704 mutex_unlock(&mr->mutex);
2705
2706 return NULL;
2707 }
2708
2709 static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
2710 {
2711 struct smbdirect_mr_io *smbdirect_mr;
2712 struct ib_cqe *cqe;
2713
2714 cqe = wc->wr_cqe;
2715 smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe);
2716 smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED;
2717 if (wc->status != IB_WC_SUCCESS) {
2718 log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
2719 smbdirect_mr->state = SMBDIRECT_MR_ERROR;
2720 }
2721 complete(&smbdirect_mr->invalidate_done);
2722 }
2723
2724 /*
2725 * Deregister a MR after I/O is done
2726 * This function may wait if remote invalidation is not used
2727 * and we have to locally invalidate the buffer to prevent the data from being
2728 * modified by the remote peer after the upper layer consumes it
2729 */
2730 void smbd_deregister_mr(struct smbdirect_mr_io *mr)
2731 {
2732 struct smbdirect_socket *sc = mr->socket;
2733
2734 mutex_lock(&mr->mutex);
2735 if (mr->state == SMBDIRECT_MR_DISABLED)
2736 goto put_kref;
2737
2738 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
2739 smbd_mr_disable_locked(mr);
2740 goto put_kref;
2741 }
2742
2743 if (mr->need_invalidate) {
2744 struct ib_send_wr *wr = &mr->inv_wr;
2745 int rc;
2746
2747 /* Need to finish local invalidation before returning */
2748 wr->opcode = IB_WR_LOCAL_INV;
2749 mr->cqe.done = local_inv_done;
2750 wr->wr_cqe = &mr->cqe;
2751 wr->num_sge = 0;
2752 wr->ex.invalidate_rkey = mr->mr->rkey;
2753 wr->send_flags = IB_SEND_SIGNALED;
2754
2755 init_completion(&mr->invalidate_done);
2756 rc = ib_post_send(sc->ib.qp, wr, NULL);
2757 if (rc) {
2758 log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
2759 smbd_mr_disable_locked(mr);
2760 smbd_disconnect_rdma_connection(sc);
2761 goto done;
2762 }
2763 wait_for_completion(&mr->invalidate_done);
2764 mr->need_invalidate = false;
2765 } else
2766 /*
2767 * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED
2768 * and defer to mr_recovery_work to recover the MR for next use
2769 */
2770 mr->state = SMBDIRECT_MR_INVALIDATED;
2771
2772 if (mr->sgt.nents) {
2773 ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2774 mr->sgt.nents = 0;
2775 }
2776
2777 if (mr->state == SMBDIRECT_MR_INVALIDATED) {
2778 mr->state = SMBDIRECT_MR_READY;
2779 if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
2780 wake_up(&sc->mr_io.ready.wait_queue);
2781 } else
2782 /*
2783 * Schedule the work to do MR recovery for future I/Os. MR
2784 * recovery is slow and we don't want it to block the current I/O
2785 */
2786 queue_work(sc->workqueue, &sc->mr_io.recovery_work);
2787
2788 done:
2789 if (atomic_dec_and_test(&sc->mr_io.used.count))
2790 wake_up(&sc->mr_io.cleanup.wait_queue);
2791
2792 put_kref:
2793 /*
2794 * No kref_put_mutex() as it's already locked.
2795 *
2796 * If smbd_mr_free_locked() is called
2797 * and the mutex is unlocked and mr is gone,
2798 * in that case kref_put() returned 1.
2799 *
2800 * If kref_put() returned 0 we know that
2801 * smbd_mr_free_locked() didn't
2802 * run. Not by us nor by anyone else, as we
2803 * still hold the mutex, so we need to unlock
2804 * and keep the mr in SMBDIRECT_MR_READY or
2805 * SMBDIRECT_MR_ERROR state.
2806 */
2807 if (!kref_put(&mr->kref, smbd_mr_free_locked))
2808 mutex_unlock(&mr->mutex);
2809 }
2810
2811 static bool smb_set_sge(struct smb_extract_to_rdma *rdma,
2812 struct page *lowest_page, size_t off, size_t len)
2813 {
2814 struct ib_sge *sge = &rdma->sge[rdma->nr_sge];
2815 u64 addr;
2816
2817 addr = ib_dma_map_page(rdma->device, lowest_page,
2818 off, len, rdma->direction);
2819 if (ib_dma_mapping_error(rdma->device, addr))
2820 return false;
2821
2822 sge->addr = addr;
2823 sge->length = len;
2824 sge->lkey = rdma->local_dma_lkey;
2825 rdma->nr_sge++;
2826 return true;
2827 }
2828
2829 /*
2830 * Extract page fragments from a BVEC-class iterator and add them to an RDMA
2831 * element list. The pages are not pinned.
2832 */
2833 static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter,
2834 struct smb_extract_to_rdma *rdma,
2835 ssize_t maxsize)
2836 {
2837 const struct bio_vec *bv = iter->bvec;
2838 unsigned long start = iter->iov_offset;
2839 unsigned int i;
2840 ssize_t ret = 0;
2841
2842 for (i = 0; i < iter->nr_segs; i++) {
2843 size_t off, len;
2844
2845 len = bv[i].bv_len;
2846 if (start >= len) {
2847 start -= len;
2848 continue;
2849 }
2850
2851 len = min_t(size_t, maxsize, len - start);
2852 off = bv[i].bv_offset + start;
2853
2854 if (!smb_set_sge(rdma, bv[i].bv_page, off, len))
2855 return -EIO;
2856
2857 ret += len;
2858 maxsize -= len;
2859 if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
2860 break;
2861 start = 0;
2862 }
2863
2864 if (ret > 0)
2865 iov_iter_advance(iter, ret);
2866 return ret;
2867 }
2868
2869 /*
2870 * Extract fragments from a KVEC-class iterator and add them to an RDMA list.
2871 * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers.
2872 * The pages are not pinned.
2873 */
2874 static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter,
2875 struct smb_extract_to_rdma *rdma,
2876 ssize_t maxsize)
2877 {
2878 const struct kvec *kv = iter->kvec;
2879 unsigned long start = iter->iov_offset;
2880 unsigned int i;
2881 ssize_t ret = 0;
2882
2883 for (i = 0; i < iter->nr_segs; i++) {
2884 struct page *page;
2885 unsigned long kaddr;
2886 size_t off, len, seg;
2887
2888 len = kv[i].iov_len;
2889 if (start >= len) {
2890 start -= len;
2891 continue;
2892 }
2893
2894 kaddr = (unsigned long)kv[i].iov_base + start;
2895 off = kaddr & ~PAGE_MASK;
2896 len = min_t(size_t, maxsize, len - start);
2897 kaddr &= PAGE_MASK;
2898
2899 maxsize -= len;
2900 do {
2901 seg = min_t(size_t, len, PAGE_SIZE - off);
2902
2903 if (is_vmalloc_or_module_addr((void *)kaddr))
2904 page = vmalloc_to_page((void *)kaddr);
2905 else
2906 page = virt_to_page((void *)kaddr);
2907
2908 if (!smb_set_sge(rdma, page, off, seg))
2909 return -EIO;
2910
2911 ret += seg;
2912 len -= seg;
2913 kaddr += PAGE_SIZE;
2914 off = 0;
2915 } while (len > 0 && rdma->nr_sge < rdma->max_sge);
2916
2917 if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
2918 break;
2919 start = 0;
2920 }
2921
2922 if (ret > 0)
2923 iov_iter_advance(iter, ret);
2924 return ret;
2925 }
2926
2927 /*
2928 * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA
2929 * list. The folios are not pinned.
2930 */
2931 static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
2932 struct smb_extract_to_rdma *rdma,
2933 ssize_t maxsize)
2934 {
2935 const struct folio_queue *folioq = iter->folioq;
2936 unsigned int slot = iter->folioq_slot;
2937 ssize_t ret = 0;
2938 size_t offset = iter->iov_offset;
2939
2940 BUG_ON(!folioq);
2941
2942 if (slot >= folioq_nr_slots(folioq)) {
2943 folioq = folioq->next;
2944 if (WARN_ON_ONCE(!folioq))
2945 return -EIO;
2946 slot = 0;
2947 }
2948
2949 do {
2950 struct folio *folio = folioq_folio(folioq, slot);
2951 size_t fsize = folioq_folio_size(folioq, slot);
2952
2953 if (offset < fsize) {
2954 size_t part = umin(maxsize, fsize - offset);
2955
2956 if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part))
2957 return -EIO;
2958
2959 offset += part;
2960 ret += part;
2961 maxsize -= part;
2962 }
2963
2964 if (offset >= fsize) {
2965 offset = 0;
2966 slot++;
2967 if (slot >= folioq_nr_slots(folioq)) {
2968 if (!folioq->next) {
2969 WARN_ON_ONCE(ret < iter->count);
2970 break;
2971 }
2972 folioq = folioq->next;
2973 slot = 0;
2974 }
2975 }
2976 } while (rdma->nr_sge < rdma->max_sge && maxsize > 0);
2977
2978 iter->folioq = folioq;
2979 iter->folioq_slot = slot;
2980 iter->iov_offset = offset;
2981 iter->count -= ret;
2982 return ret;
2983 }
2984
2985 /*
2986 * Extract page fragments from up to the given amount of the source iterator
2987 * and build up an RDMA list that refers to all of those bits. The RDMA list
2988 * is appended to, up to the maximum number of elements set in the parameter
2989 * block.
2990 *
2991 * The extracted page fragments are not pinned or ref'd in any way; if an
2992 * IOVEC/UBUF-type iterator is to be used, it should be converted to a
2993 * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some
2994 * way.
2995 */
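/*
 * A minimal usage sketch (the array size, direction and surrounding names
 * are illustrative; real callers size the sge[] array from the negotiated
 * limits):
 *
 *	struct ib_sge sges[SMBDIRECT_SEND_IO_MAX_SGE];
 *	struct smb_extract_to_rdma rdma = {
 *		.sge		= sges,
 *		.nr_sge		= 0,
 *		.max_sge	= ARRAY_SIZE(sges),
 *		.device		= sc->ib.dev,
 *		.local_dma_lkey	= sc->ib.pd->local_dma_lkey,
 *		.direction	= DMA_TO_DEVICE,
 *	};
 *	ssize_t n = smb_extract_iter_to_rdma(iter, len, &rdma);
 */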
2996 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
2997 struct smb_extract_to_rdma *rdma)
2998 {
2999 ssize_t ret;
3000 int before = rdma->nr_sge;
3001
3002 switch (iov_iter_type(iter)) {
3003 case ITER_BVEC:
3004 ret = smb_extract_bvec_to_rdma(iter, rdma, len);
3005 break;
3006 case ITER_KVEC:
3007 ret = smb_extract_kvec_to_rdma(iter, rdma, len);
3008 break;
3009 case ITER_FOLIOQ:
3010 ret = smb_extract_folioq_to_rdma(iter, rdma, len);
3011 break;
3012 default:
3013 WARN_ON_ONCE(1);
3014 return -EIO;
3015 }
3016
3017 if (ret < 0) {
3018 while (rdma->nr_sge > before) {
3019 struct ib_sge *sge = &rdma->sge[--rdma->nr_sge];
3020
3021 ib_dma_unmap_single(rdma->device, sge->addr, sge->length,
3022 rdma->direction);
3023 sge->addr = 0;
3024 }
3025 }
3026
3027 return ret;
3028 }
3029