1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2017, Microsoft Corporation.
4 *
5 * Author(s): Long Li <longli@microsoft.com>
6 */
7 #include <linux/module.h>
8 #include <linux/highmem.h>
9 #include <linux/folio_queue.h>
10 #include "../common/smbdirect/smbdirect_pdu.h"
11 #include "smbdirect.h"
12 #include "cifs_debug.h"
13 #include "cifsproto.h"
14 #include "smb2proto.h"
15
16 const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
17 {
18 struct smbdirect_socket *sc = &conn->socket;
19
20 return &sc->parameters;
21 }
22
23 static struct smbdirect_recv_io *get_receive_buffer(
24 struct smbdirect_socket *sc);
25 static void put_receive_buffer(
26 struct smbdirect_socket *sc,
27 struct smbdirect_recv_io *response);
28 static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf);
29 static void destroy_receive_buffers(struct smbdirect_socket *sc);
30
31 static void enqueue_reassembly(
32 struct smbdirect_socket *sc,
33 struct smbdirect_recv_io *response, int data_length);
34 static struct smbdirect_recv_io *_get_first_reassembly(
35 struct smbdirect_socket *sc);
36
37 static int smbd_post_recv(
38 struct smbdirect_socket *sc,
39 struct smbdirect_recv_io *response);
40
41 static int smbd_post_send_empty(struct smbdirect_socket *sc);
42
43 static void destroy_mr_list(struct smbdirect_socket *sc);
44 static int allocate_mr_list(struct smbdirect_socket *sc);
45
46 struct smb_extract_to_rdma {
47 struct ib_sge *sge;
48 unsigned int nr_sge;
49 unsigned int max_sge;
50 struct ib_device *device;
51 u32 local_dma_lkey;
52 enum dma_data_direction direction;
53 };
54 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
55 struct smb_extract_to_rdma *rdma);
56
57 /* Port numbers for SMBD transport */
58 #define SMB_PORT 445
59 #define SMBD_PORT 5445
60
61 /* Address lookup and resolve timeout in ms */
62 #define RDMA_RESOLVE_TIMEOUT 5000
63
64 /* SMBD negotiation timeout in seconds */
65 #define SMBD_NEGOTIATE_TIMEOUT 120
66
67 /* The timeout to wait for a keepalive message from peer in seconds */
68 #define KEEPALIVE_RECV_TIMEOUT 5
69
70 /* SMBD minimum receive size and fragmented size defined in [MS-SMBD] */
71 #define SMBD_MIN_RECEIVE_SIZE 128
72 #define SMBD_MIN_FRAGMENTED_SIZE 131072
73
74 /*
75 * Default maximum number of outstanding RDMA read/write operations on this connection.
76 * This value may be decreased during QP creation, subject to hardware limits.
77 */
78 #define SMBD_CM_RESPONDER_RESOURCES 32
79
80 /* Maximum number of retries on data transfer operations */
81 #define SMBD_CM_RETRY 6
82 /* No need to retry on Receiver Not Ready since SMBD manages credits */
83 #define SMBD_CM_RNR_RETRY 0
84
85 /*
86 * User configurable initial values per SMBD transport connection
87 * as defined in [MS-SMBD] 3.1.1.1
88 * Those may change after a SMBD negotiation
89 */
90 /* The local peer's maximum number of credits to grant to the peer */
91 int smbd_receive_credit_max = 255;
92
93 /* The number of send credits the local peer requests from the remote peer */
94 int smbd_send_credit_target = 255;
95
96 /* The maximum single-message size that can be sent to the remote peer */
97 int smbd_max_send_size = 1364;
98
99 /* The maximum fragmented upper-layer payload receive size supported */
100 int smbd_max_fragmented_recv_size = 1024 * 1024;
101
102 /* The maximum single-message size which can be received */
103 int smbd_max_receive_size = 1364;
104
105 /* The timeout to initiate send of a keepalive message on idle */
106 int smbd_keep_alive_interval = 120;
107
108 /*
109 * User configurable initial values for RDMA transport
110 * The actual values used may be lower and are limited to hardware capabilities
111 */
112 /* Default maximum number of pages in a single RDMA write/read */
113 int smbd_max_frmr_depth = 2048;
114
115 /* If the payload is smaller than this many bytes, use RDMA send/recv instead of RDMA read/write */
116 int rdma_readwrite_threshold = 4096;
117
118 /* Transport logging functions
119 * Logging is organized into classes. They can be OR'ed together to select the
120 * active logging classes via the module parameter smbd_logging_class
121 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
122 * log_rdma_event()
123 */
124 #define LOG_OUTGOING 0x1
125 #define LOG_INCOMING 0x2
126 #define LOG_READ 0x4
127 #define LOG_WRITE 0x8
128 #define LOG_RDMA_SEND 0x10
129 #define LOG_RDMA_RECV 0x20
130 #define LOG_KEEP_ALIVE 0x40
131 #define LOG_RDMA_EVENT 0x80
132 #define LOG_RDMA_MR 0x100
133 static unsigned int smbd_logging_class;
134 module_param(smbd_logging_class, uint, 0644);
135 MODULE_PARM_DESC(smbd_logging_class,
136 "Logging class for SMBD transport 0x0 to 0x100");
137
138 #define ERR 0x0
139 #define INFO 0x1
140 static unsigned int smbd_logging_level = ERR;
141 module_param(smbd_logging_level, uint, 0644);
142 MODULE_PARM_DESC(smbd_logging_level,
143 "Logging level for SMBD transport, 0 (default): error, 1: info");
144
145 #define log_rdma(level, class, fmt, args...) \
146 do { \
147 if (level <= smbd_logging_level || class & smbd_logging_class) \
148 cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
149 } while (0)
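/*
 * Note: a message is emitted when either its severity is at or below
 * smbd_logging_level, or its class bit is set in smbd_logging_class.
 */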
150
151 #define log_outgoing(level, fmt, args...) \
152 log_rdma(level, LOG_OUTGOING, fmt, ##args)
153 #define log_incoming(level, fmt, args...) \
154 log_rdma(level, LOG_INCOMING, fmt, ##args)
155 #define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args)
156 #define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args)
157 #define log_rdma_send(level, fmt, args...) \
158 log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
159 #define log_rdma_recv(level, fmt, args...) \
160 log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
161 #define log_keep_alive(level, fmt, args...) \
162 log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
163 #define log_rdma_event(level, fmt, args...) \
164 log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
165 #define log_rdma_mr(level, fmt, args...) \
166 log_rdma(level, LOG_RDMA_MR, fmt, ##args)
167
168 static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc)
169 {
170 /*
171 * Wake up all waiters in all wait queues
172 * in order to notice the broken connection.
173 */
174 wake_up_all(&sc->status_wait);
175 wake_up_all(&sc->send_io.lcredits.wait_queue);
176 wake_up_all(&sc->send_io.credits.wait_queue);
177 wake_up_all(&sc->send_io.pending.dec_wait_queue);
178 wake_up_all(&sc->send_io.pending.zero_wait_queue);
179 wake_up_all(&sc->recv_io.reassembly.wait_queue);
180 wake_up_all(&sc->mr_io.ready.wait_queue);
181 wake_up_all(&sc->mr_io.cleanup.wait_queue);
182 }
183
184 static void smbd_disconnect_rdma_work(struct work_struct *work)
185 {
186 struct smbdirect_socket *sc =
187 container_of(work, struct smbdirect_socket, disconnect_work);
188
189 /*
190 * make sure this and other work is not queued again
191 * but here we don't block and avoid
192 * disable[_delayed]_work_sync()
193 */
194 disable_work(&sc->disconnect_work);
195 disable_work(&sc->recv_io.posted.refill_work);
196 disable_work(&sc->mr_io.recovery_work);
197 disable_work(&sc->idle.immediate_work);
198 disable_delayed_work(&sc->idle.timer_work);
199
200 if (sc->first_error == 0)
201 sc->first_error = -ECONNABORTED;
202
203 switch (sc->status) {
204 case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
205 case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
206 case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
207 case SMBDIRECT_SOCKET_CONNECTED:
208 case SMBDIRECT_SOCKET_ERROR:
209 sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
210 rdma_disconnect(sc->rdma.cm_id);
211 break;
212
213 case SMBDIRECT_SOCKET_CREATED:
214 case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
215 case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
216 case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
217 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
218 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
219 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
220 case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
221 case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
222 case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
223 /*
224 * rdma_connect() never reached
225 * RDMA_CM_EVENT_ESTABLISHED
226 */
227 sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
228 break;
229
230 case SMBDIRECT_SOCKET_DISCONNECTING:
231 case SMBDIRECT_SOCKET_DISCONNECTED:
232 case SMBDIRECT_SOCKET_DESTROYED:
233 break;
234 }
235
236 /*
237 * Wake up all waiters in all wait queues
238 * in order to notice the broken connection.
239 */
240 smbd_disconnect_wake_up_all(sc);
241 }
242
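/*
 * Note: smbd_disconnect_rdma_connection() only records the error state and
 * wakes up waiters; the actual rdma_disconnect() happens asynchronously in
 * smbd_disconnect_rdma_work(), which is queued at the end of this function.
 */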
243 static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc)
244 {
245 /*
246 * make sure other work (than disconnect_work) is
247 * not queued again but here we don't block and avoid
248 * disable[_delayed]_work_sync()
249 */
250 disable_work(&sc->recv_io.posted.refill_work);
251 disable_work(&sc->mr_io.recovery_work);
252 disable_work(&sc->idle.immediate_work);
253 disable_delayed_work(&sc->idle.timer_work);
254
255 if (sc->first_error == 0)
256 sc->first_error = -ECONNABORTED;
257
258 switch (sc->status) {
259 case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
260 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
261 case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
262 case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
263 case SMBDIRECT_SOCKET_ERROR:
264 case SMBDIRECT_SOCKET_DISCONNECTING:
265 case SMBDIRECT_SOCKET_DISCONNECTED:
266 case SMBDIRECT_SOCKET_DESTROYED:
267 /*
268 * Keep the current error status
269 */
270 break;
271
272 case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
273 case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
274 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
275 break;
276
277 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
278 case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
279 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
280 break;
281
282 case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
283 case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
284 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
285 break;
286
287 case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
288 case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
289 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
290 break;
291
292 case SMBDIRECT_SOCKET_CREATED:
293 sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
294 break;
295
296 case SMBDIRECT_SOCKET_CONNECTED:
297 sc->status = SMBDIRECT_SOCKET_ERROR;
298 break;
299 }
300
301 /*
302 * Wake up all waiters in all wait queues
303 * in order to notice the broken connection.
304 */
305 smbd_disconnect_wake_up_all(sc);
306
307 queue_work(sc->workqueue, &sc->disconnect_work);
308 }
309
310 /* Upcall from RDMA CM */
311 static int smbd_conn_upcall(
312 struct rdma_cm_id *id, struct rdma_cm_event *event)
313 {
314 struct smbdirect_socket *sc = id->context;
315 struct smbdirect_socket_parameters *sp = &sc->parameters;
316 const char *event_name = rdma_event_msg(event->event);
317 u8 peer_initiator_depth;
318 u8 peer_responder_resources;
319
320 log_rdma_event(INFO, "event=%s status=%d\n",
321 event_name, event->status);
322
323 switch (event->event) {
324 case RDMA_CM_EVENT_ADDR_RESOLVED:
325 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
326 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED;
327 wake_up(&sc->status_wait);
328 break;
329
330 case RDMA_CM_EVENT_ROUTE_RESOLVED:
331 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
332 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
333 wake_up(&sc->status_wait);
334 break;
335
336 case RDMA_CM_EVENT_ADDR_ERROR:
337 log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
338 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
339 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
340 smbd_disconnect_rdma_work(&sc->disconnect_work);
341 break;
342
343 case RDMA_CM_EVENT_ROUTE_ERROR:
344 log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
345 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
346 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
347 smbd_disconnect_rdma_work(&sc->disconnect_work);
348 break;
349
350 case RDMA_CM_EVENT_ESTABLISHED:
351 log_rdma_event(INFO, "connected event=%s\n", event_name);
352
353 /*
354 * Here we work around an inconsistency between
355 * iWarp and other devices (at least rxe and irdma using RoCEv2)
356 */
357 if (rdma_protocol_iwarp(id->device, id->port_num)) {
358 /*
359 * iWarp devices report the peer's values
360 * with the perspective of the peer here.
361 * Tested with siw and irdma (in iwarp mode)
362 * We need to change to our perspective here,
363 * so we need to switch the values.
364 */
365 peer_initiator_depth = event->param.conn.responder_resources;
366 peer_responder_resources = event->param.conn.initiator_depth;
367 } else {
368 /*
369 * Non iWarp devices report the peer's values
370 * already changed to our perspective here.
371 * Tested with rxe and irdma (in roce mode).
372 */
373 peer_initiator_depth = event->param.conn.initiator_depth;
374 peer_responder_resources = event->param.conn.responder_resources;
375 }
376 if (rdma_protocol_iwarp(id->device, id->port_num) &&
377 event->param.conn.private_data_len == 8) {
378 /*
379 * Legacy clients with only iWarp MPA v1 support
380 * need a private blob in order to negotiate
381 * the IRD/ORD values.
382 */
383 const __be32 *ird_ord_hdr = event->param.conn.private_data;
384 u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
385 u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
386
387 /*
388 * cifs.ko sends the legacy IRD/ORD negotiation
389 * even if iWarp MPA v2 was used.
390 *
391 * Here we check that the values match and only
392 * mark the client as legacy if they don't match.
393 */
394 if ((u32)event->param.conn.initiator_depth != ird32 ||
395 (u32)event->param.conn.responder_resources != ord32) {
396 /*
397 * There are broken clients (old cifs.ko)
398 * using little endian and also
399 * struct rdma_conn_param only uses u8
400 * for initiator_depth and responder_resources,
401 * so we truncate the value to U8_MAX.
402 *
403 * smb_direct_accept_client() will then
404 * do the real negotiation in order to
405 * select the minimum between client and
406 * server.
407 */
408 ird32 = min_t(u32, ird32, U8_MAX);
409 ord32 = min_t(u32, ord32, U8_MAX);
410
411 sc->rdma.legacy_iwarp = true;
412 peer_initiator_depth = (u8)ird32;
413 peer_responder_resources = (u8)ord32;
414 }
415 }
416
417 /*
418 * negotiate the value by using the minimum
419 * between client and server if the client provided
420 * non 0 values.
421 */
422 if (peer_initiator_depth != 0)
423 sp->initiator_depth =
424 min_t(u8, sp->initiator_depth,
425 peer_initiator_depth);
426 if (peer_responder_resources != 0)
427 sp->responder_resources =
428 min_t(u8, sp->responder_resources,
429 peer_responder_resources);
430
431 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
432 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
433 wake_up(&sc->status_wait);
434 break;
435
436 case RDMA_CM_EVENT_CONNECT_ERROR:
437 case RDMA_CM_EVENT_UNREACHABLE:
438 case RDMA_CM_EVENT_REJECTED:
439 log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
440 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
441 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
442 smbd_disconnect_rdma_work(&sc->disconnect_work);
443 break;
444
445 case RDMA_CM_EVENT_DEVICE_REMOVAL:
446 case RDMA_CM_EVENT_DISCONNECTED:
447 /* This happens when we fail the negotiation */
448 if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) {
449 log_rdma_event(ERR, "event=%s during negotiation\n", event_name);
450 }
451
452 sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
453 smbd_disconnect_rdma_work(&sc->disconnect_work);
454 break;
455
456 default:
457 log_rdma_event(ERR, "unexpected event=%s status=%d\n",
458 event_name, event->status);
459 break;
460 }
461
462 return 0;
463 }
464
465 /* Upcall from RDMA QP */
466 static void
467 smbd_qp_async_error_upcall(struct ib_event *event, void *context)
468 {
469 struct smbdirect_socket *sc = context;
470
471 log_rdma_event(ERR, "%s on device %s socket %p\n",
472 ib_event_msg(event->event), event->device->name, sc);
473
474 switch (event->event) {
475 case IB_EVENT_CQ_ERR:
476 case IB_EVENT_QP_FATAL:
477 smbd_disconnect_rdma_connection(sc);
478 break;
479
480 default:
481 break;
482 }
483 }
484
485 static inline void *smbdirect_send_io_payload(struct smbdirect_send_io *request)
486 {
487 return (void *)request->packet;
488 }
489
490 static inline void *smbdirect_recv_io_payload(struct smbdirect_recv_io *response)
491 {
492 return (void *)response->packet;
493 }
494
495 /* Called when a RDMA send is done */
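/*
 * send_done() unmaps the SGEs and frees the smbdirect_send_io; on success it
 * returns one local send credit and wakes waiters, on completion failure it
 * tears down the RDMA connection instead.
 */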
496 static void send_done(struct ib_cq *cq, struct ib_wc *wc)
497 {
498 int i;
499 struct smbdirect_send_io *request =
500 container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
501 struct smbdirect_socket *sc = request->socket;
502 int lcredits = 0;
503
504 log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",
505 request, ib_wc_status_msg(wc->status));
506
507 for (i = 0; i < request->num_sge; i++)
508 ib_dma_unmap_single(sc->ib.dev,
509 request->sge[i].addr,
510 request->sge[i].length,
511 DMA_TO_DEVICE);
512 mempool_free(request, sc->send_io.mem.pool);
513 lcredits += 1;
514
515 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
516 if (wc->status != IB_WC_WR_FLUSH_ERR)
517 log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",
518 ib_wc_status_msg(wc->status), wc->opcode);
519 smbd_disconnect_rdma_connection(sc);
520 return;
521 }
522
523 atomic_add(lcredits, &sc->send_io.lcredits.count);
524 wake_up(&sc->send_io.lcredits.wait_queue);
525
526 if (atomic_dec_and_test(&sc->send_io.pending.count))
527 wake_up(&sc->send_io.pending.zero_wait_queue);
528
529 wake_up(&sc->send_io.pending.dec_wait_queue);
530 }
531
532 static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp)
533 {
534 log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n",
535 resp->min_version, resp->max_version,
536 resp->negotiated_version, resp->credits_requested,
537 resp->credits_granted, resp->status,
538 resp->max_readwrite_size, resp->preferred_send_size,
539 resp->max_receive_size, resp->max_fragmented_size);
540 }
541
542 /*
543 * Process a negotiation response message, according to [MS-SMBD] 3.1.5.7
544 * response, packet_length: the negotiation response message
545 * return value: true if negotiation is a success, false if failed
546 */
547 static bool process_negotiation_response(
548 struct smbdirect_recv_io *response, int packet_length)
549 {
550 struct smbdirect_socket *sc = response->socket;
551 struct smbdirect_socket_parameters *sp = &sc->parameters;
552 struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response);
553
554 if (packet_length < sizeof(struct smbdirect_negotiate_resp)) {
555 log_rdma_event(ERR,
556 "error: packet_length=%d\n", packet_length);
557 return false;
558 }
559
560 if (le16_to_cpu(packet->negotiated_version) != SMBDIRECT_V1) {
561 log_rdma_event(ERR, "error: negotiated_version=%x\n",
562 le16_to_cpu(packet->negotiated_version));
563 return false;
564 }
565
566 if (packet->credits_requested == 0) {
567 log_rdma_event(ERR, "error: credits_requested==0\n");
568 return false;
569 }
570 sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested);
571 sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
572
573 if (packet->credits_granted == 0) {
574 log_rdma_event(ERR, "error: credits_granted==0\n");
575 return false;
576 }
577 atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
578 atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted));
579
580 if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
581 log_rdma_event(ERR, "error: preferred_send_size=%d\n",
582 le32_to_cpu(packet->preferred_send_size));
583 return false;
584 }
585 sp->max_recv_size = le32_to_cpu(packet->preferred_send_size);
586
587 if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
588 log_rdma_event(ERR, "error: max_receive_size=%d\n",
589 le32_to_cpu(packet->max_receive_size));
590 return false;
591 }
592 sp->max_send_size = min_t(u32, sp->max_send_size,
593 le32_to_cpu(packet->max_receive_size));
594
595 if (le32_to_cpu(packet->max_fragmented_size) <
596 SMBD_MIN_FRAGMENTED_SIZE) {
597 log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
598 le32_to_cpu(packet->max_fragmented_size));
599 return false;
600 }
601 sp->max_fragmented_send_size =
602 le32_to_cpu(packet->max_fragmented_size);
603
604
605 sp->max_read_write_size = min_t(u32,
606 le32_to_cpu(packet->max_readwrite_size),
607 sp->max_frmr_depth * PAGE_SIZE);
608 sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
609
610 sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
611 return true;
612 }
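/*
 * Illustrative example (hypothetical numbers): with a local max_frmr_depth of
 * 2048 and a 4KiB PAGE_SIZE the local cap is 8MiB, so a peer advertising a
 * max_readwrite_size of 1MiB results in max_read_write_size = 1MiB and
 * max_frmr_depth being reduced to 256 pages.
 */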
613
614 static void smbd_post_send_credits(struct work_struct *work)
615 {
616 int rc;
617 struct smbdirect_recv_io *response;
618 struct smbdirect_socket *sc =
619 container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
620
621 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
622 return;
623 }
624
625 if (sc->recv_io.credits.target >
626 atomic_read(&sc->recv_io.credits.count)) {
627 while (true) {
628 response = get_receive_buffer(sc);
629 if (!response)
630 break;
631
632 response->first_segment = false;
633 rc = smbd_post_recv(sc, response);
634 if (rc) {
635 log_rdma_recv(ERR,
636 "post_recv failed rc=%d\n", rc);
637 put_receive_buffer(sc, response);
638 break;
639 }
640
641 atomic_inc(&sc->recv_io.posted.count);
642 }
643 }
644
645 /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
646 if (atomic_read(&sc->recv_io.credits.count) <
647 sc->recv_io.credits.target - 1) {
648 log_keep_alive(INFO, "schedule send of an empty message\n");
649 queue_work(sc->workqueue, &sc->idle.immediate_work);
650 }
651 }
652
653 /* Called from softirq, when recv is done */
654 static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
655 {
656 struct smbdirect_data_transfer *data_transfer;
657 struct smbdirect_recv_io *response =
658 container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
659 struct smbdirect_socket *sc = response->socket;
660 struct smbdirect_socket_parameters *sp = &sc->parameters;
661 u16 old_recv_credit_target;
662 u32 data_offset = 0;
663 u32 data_length = 0;
664 u32 remaining_data_length = 0;
665 bool negotiate_done = false;
666
667 log_rdma_recv(INFO,
668 "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n",
669 response, sc->recv_io.expected,
670 ib_wc_status_msg(wc->status), wc->opcode,
671 wc->byte_len, wc->pkey_index);
672
673 if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
674 if (wc->status != IB_WC_WR_FLUSH_ERR)
675 log_rdma_recv(ERR, "wc->status=%s opcode=%d\n",
676 ib_wc_status_msg(wc->status), wc->opcode);
677 goto error;
678 }
679
680 ib_dma_sync_single_for_cpu(
681 wc->qp->device,
682 response->sge.addr,
683 response->sge.length,
684 DMA_FROM_DEVICE);
685
686 /*
687 * Reset timer to the keepalive interval in
688 * order to trigger our next keepalive message.
689 */
690 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
691 mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
692 msecs_to_jiffies(sp->keepalive_interval_msec));
693
694 switch (sc->recv_io.expected) {
695 /* SMBD negotiation response */
696 case SMBDIRECT_EXPECT_NEGOTIATE_REP:
697 dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response));
698 sc->recv_io.reassembly.full_packet_received = true;
699 negotiate_done =
700 process_negotiation_response(response, wc->byte_len);
701 put_receive_buffer(sc, response);
702 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING);
703 if (!negotiate_done) {
704 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
705 smbd_disconnect_rdma_connection(sc);
706 } else {
707 sc->status = SMBDIRECT_SOCKET_CONNECTED;
708 wake_up(&sc->status_wait);
709 }
710
711 return;
712
713 /* SMBD data transfer packet */
714 case SMBDIRECT_EXPECT_DATA_TRANSFER:
715 data_transfer = smbdirect_recv_io_payload(response);
716
717 if (wc->byte_len <
718 offsetof(struct smbdirect_data_transfer, padding))
719 goto error;
720
721 remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
722 data_offset = le32_to_cpu(data_transfer->data_offset);
723 data_length = le32_to_cpu(data_transfer->data_length);
724 if (wc->byte_len < data_offset ||
725 (u64)wc->byte_len < (u64)data_offset + data_length)
726 goto error;
727
728 if (remaining_data_length > sp->max_fragmented_recv_size ||
729 data_length > sp->max_fragmented_recv_size ||
730 (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)
731 goto error;
732
733 if (data_length) {
734 if (sc->recv_io.reassembly.full_packet_received)
735 response->first_segment = true;
736
737 if (le32_to_cpu(data_transfer->remaining_data_length))
738 sc->recv_io.reassembly.full_packet_received = false;
739 else
740 sc->recv_io.reassembly.full_packet_received = true;
741 }
742
743 atomic_dec(&sc->recv_io.posted.count);
744 atomic_dec(&sc->recv_io.credits.count);
745 old_recv_credit_target = sc->recv_io.credits.target;
746 sc->recv_io.credits.target =
747 le16_to_cpu(data_transfer->credits_requested);
748 sc->recv_io.credits.target =
749 min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
750 sc->recv_io.credits.target =
751 max_t(u16, sc->recv_io.credits.target, 1);
752 if (le16_to_cpu(data_transfer->credits_granted)) {
753 atomic_add(le16_to_cpu(data_transfer->credits_granted),
754 &sc->send_io.credits.count);
755 /*
756 * We have new send credits granted from remote peer
757 * If any sender is waiting for credits, unblock it
758 */
759 wake_up(&sc->send_io.credits.wait_queue);
760 }
761
762 log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n",
763 le16_to_cpu(data_transfer->flags),
764 le32_to_cpu(data_transfer->data_offset),
765 le32_to_cpu(data_transfer->data_length),
766 le32_to_cpu(data_transfer->remaining_data_length));
767
768 /* Send an immediate response right away if requested */
769 if (le16_to_cpu(data_transfer->flags) &
770 SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
771 log_keep_alive(INFO, "schedule send of immediate response\n");
772 queue_work(sc->workqueue, &sc->idle.immediate_work);
773 }
774
775 /*
776 * If this is a packet with a data payload, place the data in the
777 * reassembly queue and wake up the reading thread
778 */
779 if (data_length) {
780 if (sc->recv_io.credits.target > old_recv_credit_target)
781 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
782
783 enqueue_reassembly(sc, response, data_length);
784 wake_up(&sc->recv_io.reassembly.wait_queue);
785 } else
786 put_receive_buffer(sc, response);
787
788 return;
789
790 case SMBDIRECT_EXPECT_NEGOTIATE_REQ:
791 /* Only server... */
792 break;
793 }
794
795 /*
796 * This is an internal error!
797 */
798 log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected);
799 WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
800 error:
801 put_receive_buffer(sc, response);
802 smbd_disconnect_rdma_connection(sc);
803 }
804
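/*
 * smbd_create_id() resolves the destination address and then the route.
 * Each step is driven by smbd_conn_upcall(), which advances sc->status;
 * this function waits for those transitions (or a timeout) before going on.
 */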
805 static struct rdma_cm_id *smbd_create_id(
806 struct smbdirect_socket *sc,
807 struct sockaddr *dstaddr, int port)
808 {
809 struct smbdirect_socket_parameters *sp = &sc->parameters;
810 struct rdma_cm_id *id;
811 int rc;
812 __be16 *sport;
813
814 id = rdma_create_id(&init_net, smbd_conn_upcall, sc,
815 RDMA_PS_TCP, IB_QPT_RC);
816 if (IS_ERR(id)) {
817 rc = PTR_ERR(id);
818 log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
819 return id;
820 }
821
822 if (dstaddr->sa_family == AF_INET6)
823 sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
824 else
825 sport = &((struct sockaddr_in *)dstaddr)->sin_port;
826
827 *sport = htons(port);
828
829 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED);
830 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING;
831 rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
832 sp->resolve_addr_timeout_msec);
833 if (rc) {
834 log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
835 goto out;
836 }
837 rc = wait_event_interruptible_timeout(
838 sc->status_wait,
839 sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
840 msecs_to_jiffies(sp->resolve_addr_timeout_msec));
841 /* e.g. if interrupted returns -ERESTARTSYS */
842 if (rc < 0) {
843 log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
844 goto out;
845 }
846 if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) {
847 rc = -ETIMEDOUT;
848 log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
849 goto out;
850 }
851 if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) {
852 rc = -EHOSTUNREACH;
853 log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
854 goto out;
855 }
856
857 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED);
858 sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING;
859 rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec);
860 if (rc) {
861 log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
862 goto out;
863 }
864 rc = wait_event_interruptible_timeout(
865 sc->status_wait,
866 sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING,
867 msecs_to_jiffies(sp->resolve_route_timeout_msec));
868 /* e.g. if interrupted returns -ERESTARTSYS */
869 if (rc < 0) {
870 log_rdma_event(ERR, "rdma_resolve_route timeout rc: %i\n", rc);
871 goto out;
872 }
873 if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) {
874 rc = -ETIMEDOUT;
875 log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
876 goto out;
877 }
878 if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) {
879 rc = -ENETUNREACH;
880 log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
881 goto out;
882 }
883
884 return id;
885
886 out:
887 rdma_destroy_id(id);
888 return ERR_PTR(rc);
889 }
890
891 /*
892 * Test if FRWR (Fast Registration Work Requests) is supported on the device
893 * This implementation requires FRWR for RDMA read/write
894 * return value: true if it is supported
895 */
896 static bool frwr_is_supported(struct ib_device_attr *attrs)
897 {
898 if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
899 return false;
900 if (attrs->max_fast_reg_page_list_len == 0)
901 return false;
902 return true;
903 }
904
905 static int smbd_ia_open(
906 struct smbdirect_socket *sc,
907 struct sockaddr *dstaddr, int port)
908 {
909 struct smbdirect_socket_parameters *sp = &sc->parameters;
910 int rc;
911
912 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
913 sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED;
914
915 sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port);
916 if (IS_ERR(sc->rdma.cm_id)) {
917 rc = PTR_ERR(sc->rdma.cm_id);
918 goto out1;
919 }
920 sc->ib.dev = sc->rdma.cm_id->device;
921
922 if (!frwr_is_supported(&sc->ib.dev->attrs)) {
923 log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n");
924 log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
925 sc->ib.dev->attrs.device_cap_flags,
926 sc->ib.dev->attrs.max_fast_reg_page_list_len);
927 rc = -EPROTONOSUPPORT;
928 goto out2;
929 }
930 sp->max_frmr_depth = min_t(u32,
931 sp->max_frmr_depth,
932 sc->ib.dev->attrs.max_fast_reg_page_list_len);
933 sc->mr_io.type = IB_MR_TYPE_MEM_REG;
934 if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
935 sc->mr_io.type = IB_MR_TYPE_SG_GAPS;
936
937 return 0;
938
939 out2:
940 rdma_destroy_id(sc->rdma.cm_id);
941 sc->rdma.cm_id = NULL;
942
943 out1:
944 return rc;
945 }
946
947 /*
948 * Send a negotiation request message to the peer
949 * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
950 * After negotiation, the transport is connected and ready for
951 * carrying upper layer SMB payload
952 */
953 static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
954 {
955 struct smbdirect_socket_parameters *sp = &sc->parameters;
956 struct ib_send_wr send_wr;
957 int rc = -ENOMEM;
958 struct smbdirect_send_io *request;
959 struct smbdirect_negotiate_req *packet;
960
961 request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
962 if (!request)
963 return rc;
964
965 request->socket = sc;
966
967 packet = smbdirect_send_io_payload(request);
968 packet->min_version = cpu_to_le16(SMBDIRECT_V1);
969 packet->max_version = cpu_to_le16(SMBDIRECT_V1);
970 packet->reserved = 0;
971 packet->credits_requested = cpu_to_le16(sp->send_credit_target);
972 packet->preferred_send_size = cpu_to_le32(sp->max_send_size);
973 packet->max_receive_size = cpu_to_le32(sp->max_recv_size);
974 packet->max_fragmented_size =
975 cpu_to_le32(sp->max_fragmented_recv_size);
976
977 request->num_sge = 1;
978 request->sge[0].addr = ib_dma_map_single(
979 sc->ib.dev, (void *)packet,
980 sizeof(*packet), DMA_TO_DEVICE);
981 if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
982 rc = -EIO;
983 goto dma_mapping_failed;
984 }
985
986 request->sge[0].length = sizeof(*packet);
987 request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
988
989 ib_dma_sync_single_for_device(
990 sc->ib.dev, request->sge[0].addr,
991 request->sge[0].length, DMA_TO_DEVICE);
992
993 request->cqe.done = send_done;
994
995 send_wr.next = NULL;
996 send_wr.wr_cqe = &request->cqe;
997 send_wr.sg_list = request->sge;
998 send_wr.num_sge = request->num_sge;
999 send_wr.opcode = IB_WR_SEND;
1000 send_wr.send_flags = IB_SEND_SIGNALED;
1001
1002 log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n",
1003 request->sge[0].addr,
1004 request->sge[0].length, request->sge[0].lkey);
1005
1006 atomic_inc(&sc->send_io.pending.count);
1007 rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
1008 if (!rc)
1009 return 0;
1010
1011 /* if we reach here, post send failed */
1012 log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
1013 atomic_dec(&sc->send_io.pending.count);
1014 ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr,
1015 request->sge[0].length, DMA_TO_DEVICE);
1016
1017 smbd_disconnect_rdma_connection(sc);
1018
1019 dma_mapping_failed:
1020 mempool_free(request, sc->send_io.mem.pool);
1021 return rc;
1022 }
1023
1024 /*
1025 * Extend the credits to remote peer
1026 * This implements [MS-SMBD] 3.1.5.9
1027 * The idea is that we should extend credits to the remote peer as quickly
1028 * as allowed, to maintain data flow. We allocate as many receive
1029 * buffers as possible, and extend the receive credits to the remote peer.
1030 * return value: the new credits being granted.
1031 */
1032 static int manage_credits_prior_sending(struct smbdirect_socket *sc)
1033 {
1034 int new_credits;
1035
1036 if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
1037 return 0;
1038
1039 new_credits = atomic_read(&sc->recv_io.posted.count);
1040 if (new_credits == 0)
1041 return 0;
1042
1043 new_credits -= atomic_read(&sc->recv_io.credits.count);
1044 if (new_credits <= 0)
1045 return 0;
1046
1047 return new_credits;
1048 }
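/*
 * Illustrative example (hypothetical numbers): with 16 receives posted
 * (recv_io.posted.count == 16), 10 receive credits still outstanding
 * (recv_io.credits.count == 10) and a target above 10, this grants
 * 16 - 10 = 6 new credits in the next send.
 */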
1049
1050 /*
1051 * Check if we need to send a KEEP_ALIVE message
1052 * The idle connection timer triggers a KEEP_ALIVE message when it expires.
1053 * SMBDIRECT_FLAG_RESPONSE_REQUESTED is set in the message flags to have the peer
1054 * send back a response.
1055 * return value:
1056 * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set
1057 * 0: otherwise
1058 */
1059 static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
1060 {
1061 struct smbdirect_socket_parameters *sp = &sc->parameters;
1062
1063 if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
1064 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
1065 /*
1066 * Now use the keepalive timeout (instead of keepalive interval)
1067 * in order to wait for a response
1068 */
1069 mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
1070 msecs_to_jiffies(sp->keepalive_timeout_msec));
1071 return 1;
1072 }
1073 return 0;
1074 }
1075
1076 /* Post the send request */
1077 static int smbd_post_send(struct smbdirect_socket *sc,
1078 struct smbdirect_send_io *request)
1079 {
1080 struct ib_send_wr send_wr;
1081 int rc, i;
1082
1083 for (i = 0; i < request->num_sge; i++) {
1084 log_rdma_send(INFO,
1085 "rdma_request sge[%d] addr=0x%llx length=%u\n",
1086 i, request->sge[i].addr, request->sge[i].length);
1087 ib_dma_sync_single_for_device(
1088 sc->ib.dev,
1089 request->sge[i].addr,
1090 request->sge[i].length,
1091 DMA_TO_DEVICE);
1092 }
1093
1094 request->cqe.done = send_done;
1095
1096 send_wr.next = NULL;
1097 send_wr.wr_cqe = &request->cqe;
1098 send_wr.sg_list = request->sge;
1099 send_wr.num_sge = request->num_sge;
1100 send_wr.opcode = IB_WR_SEND;
1101 send_wr.send_flags = IB_SEND_SIGNALED;
1102
1103 rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
1104 if (rc) {
1105 log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
1106 smbd_disconnect_rdma_connection(sc);
1107 rc = -EAGAIN;
1108 }
1109
1110 return rc;
1111 }
1112
1113 static int smbd_post_send_iter(struct smbdirect_socket *sc,
1114 struct iov_iter *iter,
1115 int *_remaining_data_length)
1116 {
1117 struct smbdirect_socket_parameters *sp = &sc->parameters;
1118 int i, rc;
1119 int header_length;
1120 int data_length;
1121 struct smbdirect_send_io *request;
1122 struct smbdirect_data_transfer *packet;
1123 int new_credits = 0;
1124
1125 wait_lcredit:
1126 /* Wait for local send credits */
1127 rc = wait_event_interruptible(sc->send_io.lcredits.wait_queue,
1128 atomic_read(&sc->send_io.lcredits.count) > 0 ||
1129 sc->status != SMBDIRECT_SOCKET_CONNECTED);
1130 if (rc)
1131 goto err_wait_lcredit;
1132
1133 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
1134 log_outgoing(ERR, "disconnected not sending on wait_credit\n");
1135 rc = -EAGAIN;
1136 goto err_wait_lcredit;
1137 }
1138 if (unlikely(atomic_dec_return(&sc->send_io.lcredits.count) < 0)) {
1139 atomic_inc(&sc->send_io.lcredits.count);
1140 goto wait_lcredit;
1141 }
1142
1143 wait_credit:
1144 /* Wait for send credits. A SMBD packet needs one credit */
1145 rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
1146 atomic_read(&sc->send_io.credits.count) > 0 ||
1147 sc->status != SMBDIRECT_SOCKET_CONNECTED);
1148 if (rc)
1149 goto err_wait_credit;
1150
1151 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
1152 log_outgoing(ERR, "disconnected not sending on wait_credit\n");
1153 rc = -EAGAIN;
1154 goto err_wait_credit;
1155 }
1156 if (unlikely(atomic_dec_return(&sc->send_io.credits.count) < 0)) {
1157 atomic_inc(&sc->send_io.credits.count);
1158 goto wait_credit;
1159 }
1160
1161 request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
1162 if (!request) {
1163 rc = -ENOMEM;
1164 goto err_alloc;
1165 }
1166
1167 request->socket = sc;
1168 memset(request->sge, 0, sizeof(request->sge));
1169
1170 /* Map the packet to DMA */
1171 header_length = sizeof(struct smbdirect_data_transfer);
1172 /* If this is a packet without payload, don't send padding */
1173 if (!iter)
1174 header_length = offsetof(struct smbdirect_data_transfer, padding);
1175
1176 packet = smbdirect_send_io_payload(request);
1177 request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
1178 (void *)packet,
1179 header_length,
1180 DMA_TO_DEVICE);
1181 if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
1182 rc = -EIO;
1183 goto err_dma;
1184 }
1185
1186 request->sge[0].length = header_length;
1187 request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
1188 request->num_sge = 1;
1189
1190 /* Fill in the data payload to find out how much data we can add */
1191 if (iter) {
1192 struct smb_extract_to_rdma extract = {
1193 .nr_sge = request->num_sge,
1194 .max_sge = SMBDIRECT_SEND_IO_MAX_SGE,
1195 .sge = request->sge,
1196 .device = sc->ib.dev,
1197 .local_dma_lkey = sc->ib.pd->local_dma_lkey,
1198 .direction = DMA_TO_DEVICE,
1199 };
1200 size_t payload_len = umin(*_remaining_data_length,
1201 sp->max_send_size - sizeof(*packet));
1202
1203 rc = smb_extract_iter_to_rdma(iter, payload_len,
1204 &extract);
1205 if (rc < 0)
1206 goto err_dma;
1207 data_length = rc;
1208 request->num_sge = extract.nr_sge;
1209 *_remaining_data_length -= data_length;
1210 } else {
1211 data_length = 0;
1212 }
1213
1214 /* Fill in the packet header */
1215 packet->credits_requested = cpu_to_le16(sp->send_credit_target);
1216
1217 new_credits = manage_credits_prior_sending(sc);
1218 atomic_add(new_credits, &sc->recv_io.credits.count);
1219 packet->credits_granted = cpu_to_le16(new_credits);
1220
1221 packet->flags = 0;
1222 if (manage_keep_alive_before_sending(sc))
1223 packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
1224
1225 packet->reserved = 0;
1226 if (!data_length)
1227 packet->data_offset = 0;
1228 else
1229 packet->data_offset = cpu_to_le32(24);
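/*
 * A data_offset of 24 bytes equals the size of the smbdirect_data_transfer
 * header, so the payload starts immediately after the header.
 */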
1230 packet->data_length = cpu_to_le32(data_length);
1231 packet->remaining_data_length = cpu_to_le32(*_remaining_data_length);
1232 packet->padding = 0;
1233
1234 log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
1235 le16_to_cpu(packet->credits_requested),
1236 le16_to_cpu(packet->credits_granted),
1237 le32_to_cpu(packet->data_offset),
1238 le32_to_cpu(packet->data_length),
1239 le32_to_cpu(packet->remaining_data_length));
1240
1241 /*
1242 * Now that we got a local and a remote credit
1243 * we add us as pending
1244 */
1245 atomic_inc(&sc->send_io.pending.count);
1246
1247 rc = smbd_post_send(sc, request);
1248 if (!rc)
1249 return 0;
1250
1251 if (atomic_dec_and_test(&sc->send_io.pending.count))
1252 wake_up(&sc->send_io.pending.zero_wait_queue);
1253
1254 wake_up(&sc->send_io.pending.dec_wait_queue);
1255
1256 err_dma:
1257 for (i = 0; i < request->num_sge; i++)
1258 if (request->sge[i].addr)
1259 ib_dma_unmap_single(sc->ib.dev,
1260 request->sge[i].addr,
1261 request->sge[i].length,
1262 DMA_TO_DEVICE);
1263 mempool_free(request, sc->send_io.mem.pool);
1264
1265 /* roll back the granted receive credits */
1266 atomic_sub(new_credits, &sc->recv_io.credits.count);
1267
1268 err_alloc:
1269 atomic_inc(&sc->send_io.credits.count);
1270 wake_up(&sc->send_io.credits.wait_queue);
1271
1272 err_wait_credit:
1273 atomic_inc(&sc->send_io.lcredits.count);
1274 wake_up(&sc->send_io.lcredits.wait_queue);
1275
1276 err_wait_lcredit:
1277 return rc;
1278 }
1279
1280 /*
1281 * Send an empty message
1282 * An empty message is used to extend credits to the peer and to keep the
1283 * connection alive while there is no upper-layer payload to send
1284 */
1285 static int smbd_post_send_empty(struct smbdirect_socket *sc)
1286 {
1287 int remaining_data_length = 0;
1288
1289 sc->statistics.send_empty++;
1290 return smbd_post_send_iter(sc, NULL, &remaining_data_length);
1291 }
1292
1293 static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
1294 struct iov_iter *iter,
1295 int *_remaining_data_length)
1296 {
1297 int rc = 0;
1298
1299 /*
1300 * smbd_post_send_iter() respects the
1301 * negotiated max_send_size, so we need to
1302 * loop until the full iter is posted
1303 */
1304
1305 while (iov_iter_count(iter) > 0) {
1306 rc = smbd_post_send_iter(sc, iter, _remaining_data_length);
1307 if (rc < 0)
1308 break;
1309 }
1310
1311 return rc;
1312 }
1313
1314 /*
1315 * Post a receive request to the transport
1316 * The remote peer can only send data when a receive request is posted
1317 * The interaction is controlled by the send/receive credit system
1318 */
1319 static int smbd_post_recv(
1320 struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
1321 {
1322 struct smbdirect_socket_parameters *sp = &sc->parameters;
1323 struct ib_recv_wr recv_wr;
1324 int rc = -EIO;
1325
1326 response->sge.addr = ib_dma_map_single(
1327 sc->ib.dev, response->packet,
1328 sp->max_recv_size, DMA_FROM_DEVICE);
1329 if (ib_dma_mapping_error(sc->ib.dev, response->sge.addr))
1330 return rc;
1331
1332 response->sge.length = sp->max_recv_size;
1333 response->sge.lkey = sc->ib.pd->local_dma_lkey;
1334
1335 response->cqe.done = recv_done;
1336
1337 recv_wr.wr_cqe = &response->cqe;
1338 recv_wr.next = NULL;
1339 recv_wr.sg_list = &response->sge;
1340 recv_wr.num_sge = 1;
1341
1342 rc = ib_post_recv(sc->ib.qp, &recv_wr, NULL);
1343 if (rc) {
1344 ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
1345 response->sge.length, DMA_FROM_DEVICE);
1346 response->sge.length = 0;
1347 smbd_disconnect_rdma_connection(sc);
1348 log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
1349 }
1350
1351 return rc;
1352 }
1353
1354 /* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
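/*
 * A receive is posted first so the negotiate response can land, then the
 * negotiate request is sent; recv_done()/process_negotiation_response()
 * move the socket to SMBDIRECT_SOCKET_CONNECTED on success.
 */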
1355 static int smbd_negotiate(struct smbdirect_socket *sc)
1356 {
1357 struct smbdirect_socket_parameters *sp = &sc->parameters;
1358 int rc;
1359 struct smbdirect_recv_io *response = get_receive_buffer(sc);
1360
1361 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED);
1362 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
1363
1364 sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP;
1365 rc = smbd_post_recv(sc, response);
1366 log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
1367 rc, response->sge.addr,
1368 response->sge.length, response->sge.lkey);
1369 if (rc) {
1370 put_receive_buffer(sc, response);
1371 return rc;
1372 }
1373
1374 rc = smbd_post_send_negotiate_req(sc);
1375 if (rc)
1376 return rc;
1377
1378 rc = wait_event_interruptible_timeout(
1379 sc->status_wait,
1380 sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
1381 msecs_to_jiffies(sp->negotiate_timeout_msec));
1382 log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc);
1383
1384 if (sc->status == SMBDIRECT_SOCKET_CONNECTED)
1385 return 0;
1386
1387 if (rc == 0)
1388 rc = -ETIMEDOUT;
1389 else if (rc == -ERESTARTSYS)
1390 rc = -EINTR;
1391 else
1392 rc = -ENOTCONN;
1393
1394 return rc;
1395 }
1396
1397 /*
1398 * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
1399 * This is a queue for reassembling upper-layer payload and presenting it to the
1400 * upper layer. All incoming payload goes to the reassembly queue, regardless of
1401 * whether reassembly is required. The upper-layer code reads from the queue for
1402 * all incoming payloads.
1403 * Put a received packet to the reassembly queue
1404 * response: the packet received
1405 * data_length: the size of payload in this packet
1406 */
1407 static void enqueue_reassembly(
1408 struct smbdirect_socket *sc,
1409 struct smbdirect_recv_io *response,
1410 int data_length)
1411 {
1412 unsigned long flags;
1413
1414 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
1415 list_add_tail(&response->list, &sc->recv_io.reassembly.list);
1416 sc->recv_io.reassembly.queue_length++;
1417 /*
1418 * Make sure reassembly_data_length is updated after list and
1419 * reassembly_queue_length are updated. On the dequeue side
1420 * reassembly_data_length is checked without a lock to determine
1421 * if reassembly_queue_length and the list are up to date
1422 */
1423 virt_wmb();
1424 sc->recv_io.reassembly.data_length += data_length;
1425 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
1426 sc->statistics.enqueue_reassembly_queue++;
1427 }
1428
1429 /*
1430 * Get the first entry at the front of reassembly queue
1431 * Caller is responsible for locking
1432 * return value: the first entry if any, NULL if queue is empty
1433 */
1434 static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc)
1435 {
1436 struct smbdirect_recv_io *ret = NULL;
1437
1438 if (!list_empty(&sc->recv_io.reassembly.list)) {
1439 ret = list_first_entry(
1440 &sc->recv_io.reassembly.list,
1441 struct smbdirect_recv_io, list);
1442 }
1443 return ret;
1444 }
1445
1446 /*
1447 * Get a receive buffer
1448 * For each remote send, we need to post a receive. The receive buffers are
1449 * pre-allocated when the transport is established.
1450 * return value: the receive buffer, NULL if none is available
1451 */
1452 static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc)
1453 {
1454 struct smbdirect_recv_io *ret = NULL;
1455 unsigned long flags;
1456
1457 spin_lock_irqsave(&sc->recv_io.free.lock, flags);
1458 if (!list_empty(&sc->recv_io.free.list)) {
1459 ret = list_first_entry(
1460 &sc->recv_io.free.list,
1461 struct smbdirect_recv_io, list);
1462 list_del(&ret->list);
1463 sc->statistics.get_receive_buffer++;
1464 }
1465 spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
1466
1467 return ret;
1468 }
1469
1470 /*
1471 * Return a receive buffer
1472 * When a receive buffer is returned, we can post a new receive and extend
1473 * more receive credits to the remote peer. This is done immediately after a
1474 * receive buffer is returned.
1475 */
1476 static void put_receive_buffer(
1477 struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
1478 {
1479 unsigned long flags;
1480
1481 if (likely(response->sge.length != 0)) {
1482 ib_dma_unmap_single(sc->ib.dev,
1483 response->sge.addr,
1484 response->sge.length,
1485 DMA_FROM_DEVICE);
1486 response->sge.length = 0;
1487 }
1488
1489 spin_lock_irqsave(&sc->recv_io.free.lock, flags);
1490 list_add_tail(&response->list, &sc->recv_io.free.list);
1491 sc->statistics.put_receive_buffer++;
1492 spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
1493
1494 queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
1495 }
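/*
 * Note: returning a buffer schedules recv_io.posted.refill_work, which
 * re-posts receives and may schedule an immediate (empty) message so the
 * new credits are granted to the peer promptly.
 */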
1496
1497 /* Preallocate all receive buffers on transport establishment */
1498 static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf)
1499 {
1500 struct smbdirect_recv_io *response;
1501 int i;
1502
1503 for (i = 0; i < num_buf; i++) {
1504 response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL);
1505 if (!response)
1506 goto allocate_failed;
1507
1508 response->socket = sc;
1509 response->sge.length = 0;
1510 list_add_tail(&response->list, &sc->recv_io.free.list);
1511 }
1512
1513 return 0;
1514
1515 allocate_failed:
1516 while (!list_empty(&sc->recv_io.free.list)) {
1517 response = list_first_entry(
1518 &sc->recv_io.free.list,
1519 struct smbdirect_recv_io, list);
1520 list_del(&response->list);
1521
1522 mempool_free(response, sc->recv_io.mem.pool);
1523 }
1524 return -ENOMEM;
1525 }
1526
1527 static void destroy_receive_buffers(struct smbdirect_socket *sc)
1528 {
1529 struct smbdirect_recv_io *response;
1530
1531 while ((response = get_receive_buffer(sc)))
1532 mempool_free(response, sc->recv_io.mem.pool);
1533 }
1534
1535 static void send_immediate_empty_message(struct work_struct *work)
1536 {
1537 struct smbdirect_socket *sc =
1538 container_of(work, struct smbdirect_socket, idle.immediate_work);
1539
1540 if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
1541 return;
1542
1543 log_keep_alive(INFO, "send an empty message\n");
1544 smbd_post_send_empty(sc);
1545 }
1546
1547 /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
1548 static void idle_connection_timer(struct work_struct *work)
1549 {
1550 struct smbdirect_socket *sc =
1551 container_of(work, struct smbdirect_socket, idle.timer_work.work);
1552 struct smbdirect_socket_parameters *sp = &sc->parameters;
1553
1554 if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
1555 log_keep_alive(ERR,
1556 "error status sc->idle.keepalive=%d\n",
1557 sc->idle.keepalive);
1558 smbd_disconnect_rdma_connection(sc);
1559 return;
1560 }
1561
1562 if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
1563 return;
1564
1565 /*
1566 * Now use the keepalive timeout (instead of keepalive interval)
1567 * in order to wait for a response
1568 */
1569 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
1570 mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
1571 msecs_to_jiffies(sp->keepalive_timeout_msec));
1572 log_keep_alive(INFO, "schedule send of empty idle message\n");
1573 queue_work(sc->workqueue, &sc->idle.immediate_work);
1574 }
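/*
 * Keepalive state machine: the idle timer moves NONE -> PENDING and schedules
 * an empty message; manage_keep_alive_before_sending() moves PENDING -> SENT
 * and requests a response; any completed receive resets the state to NONE.
 * If the timer fires while the state is not NONE, the connection is torn down.
 */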
1575
1576 /*
1577 * Destroy the transport and related RDMA and memory resources
1578 * Need to go through all the pending counters and make sure no one is using
1579 * the transport while it is destroyed
1580 */
1581 void smbd_destroy(struct TCP_Server_Info *server)
1582 {
1583 struct smbd_connection *info = server->smbd_conn;
1584 struct smbdirect_socket *sc;
1585 struct smbdirect_recv_io *response;
1586 unsigned long flags;
1587
1588 if (!info) {
1589 log_rdma_event(INFO, "rdma session already destroyed\n");
1590 return;
1591 }
1592 sc = &info->socket;
1593
1594 log_rdma_event(INFO, "cancelling and disable disconnect_work\n");
1595 disable_work_sync(&sc->disconnect_work);
1596
1597 log_rdma_event(INFO, "destroying rdma session\n");
1598 if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
1599 smbd_disconnect_rdma_work(&sc->disconnect_work);
1600 if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) {
1601 log_rdma_event(INFO, "wait for transport being disconnected\n");
1602 wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
1603 log_rdma_event(INFO, "waited for transport being disconnected\n");
1604 }
1605
1606 /*
1607 * Wake up all waiters in all wait queues
1608 * in order to notice the broken connection.
1609 *
1610 * Most likely this was already called via
1611 * smbd_disconnect_rdma_work(), but call it again...
1612 */
1613 smbd_disconnect_wake_up_all(sc);
1614
1615 log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n");
1616 disable_work_sync(&sc->recv_io.posted.refill_work);
1617
1618 log_rdma_event(INFO, "destroying qp\n");
1619 ib_drain_qp(sc->ib.qp);
1620 rdma_destroy_qp(sc->rdma.cm_id);
1621 sc->ib.qp = NULL;
1622
1623 log_rdma_event(INFO, "cancelling idle timer\n");
1624 disable_delayed_work_sync(&sc->idle.timer_work);
1625 log_rdma_event(INFO, "cancelling send immediate work\n");
1626 disable_work_sync(&sc->idle.immediate_work);
1627
1628 /* At this point the upper layer can no longer reach the reassembly queue */
1629 log_rdma_event(INFO, "drain the reassembly queue\n");
1630 do {
1631 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
1632 response = _get_first_reassembly(sc);
1633 if (response) {
1634 list_del(&response->list);
1635 spin_unlock_irqrestore(
1636 &sc->recv_io.reassembly.lock, flags);
1637 put_receive_buffer(sc, response);
1638 } else
1639 spin_unlock_irqrestore(
1640 &sc->recv_io.reassembly.lock, flags);
1641 } while (response);
1642 sc->recv_io.reassembly.data_length = 0;
1643
1644 log_rdma_event(INFO, "free receive buffers\n");
1645 destroy_receive_buffers(sc);
1646
1647 log_rdma_event(INFO, "freeing mr list\n");
1648 destroy_mr_list(sc);
1649
1650 ib_free_cq(sc->ib.send_cq);
1651 ib_free_cq(sc->ib.recv_cq);
1652 ib_dealloc_pd(sc->ib.pd);
1653 rdma_destroy_id(sc->rdma.cm_id);
1654
1655 /* free mempools */
1656 mempool_destroy(sc->send_io.mem.pool);
1657 kmem_cache_destroy(sc->send_io.mem.cache);
1658
1659 mempool_destroy(sc->recv_io.mem.pool);
1660 kmem_cache_destroy(sc->recv_io.mem.cache);
1661
1662 sc->status = SMBDIRECT_SOCKET_DESTROYED;
1663
1664 destroy_workqueue(sc->workqueue);
1665 log_rdma_event(INFO, "rdma session destroyed\n");
1666 kfree(info);
1667 server->smbd_conn = NULL;
1668 }
1669
1670 /*
1671 * Reconnect this SMBD connection, called from upper layer
1672 * return value: 0 on success, or actual error code
1673 */
1674 int smbd_reconnect(struct TCP_Server_Info *server)
1675 {
1676 log_rdma_event(INFO, "reconnecting rdma session\n");
1677
1678 if (!server->smbd_conn) {
1679 log_rdma_event(INFO, "rdma session already destroyed\n");
1680 goto create_conn;
1681 }
1682
1683 /*
1684 * This is possible if the transport is disconnected and we haven't received
1685 * a notification from RDMA, but the upper layer has detected a timeout
1686 */
1687 if (server->smbd_conn->socket.status == SMBDIRECT_SOCKET_CONNECTED) {
1688 log_rdma_event(INFO, "disconnecting transport\n");
1689 smbd_destroy(server);
1690 }
1691
1692 create_conn:
1693 log_rdma_event(INFO, "creating rdma session\n");
1694 server->smbd_conn = smbd_get_connection(
1695 server, (struct sockaddr *) &server->dstaddr);
1696
1697 if (server->smbd_conn) {
1698 cifs_dbg(VFS, "RDMA transport re-established\n");
1699 trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
1700 return 0;
1701 }
1702 trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
1703 return -ENOENT;
1704 }
1705
1706 static void destroy_caches(struct smbdirect_socket *sc)
1707 {
1708 destroy_receive_buffers(sc);
1709 mempool_destroy(sc->recv_io.mem.pool);
1710 kmem_cache_destroy(sc->recv_io.mem.cache);
1711 mempool_destroy(sc->send_io.mem.pool);
1712 kmem_cache_destroy(sc->send_io.mem.cache);
1713 }
1714
1715 #define MAX_NAME_LEN 80
1716 static int allocate_caches(struct smbdirect_socket *sc)
1717 {
1718 struct smbdirect_socket_parameters *sp = &sc->parameters;
1719 char name[MAX_NAME_LEN];
1720 int rc;
1721
1722 if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer)))
1723 return -ENOMEM;
1724
1725 scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc);
1726 sc->send_io.mem.cache =
1727 kmem_cache_create(
1728 name,
1729 sizeof(struct smbdirect_send_io) +
1730 sizeof(struct smbdirect_data_transfer),
1731 0, SLAB_HWCACHE_ALIGN, NULL);
1732 if (!sc->send_io.mem.cache)
1733 return -ENOMEM;
1734
1735 sc->send_io.mem.pool =
1736 mempool_create(sp->send_credit_target, mempool_alloc_slab,
1737 mempool_free_slab, sc->send_io.mem.cache);
1738 if (!sc->send_io.mem.pool)
1739 goto out1;
1740
1741 scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc);
1742
1743 struct kmem_cache_args response_args = {
1744 .align = __alignof__(struct smbdirect_recv_io),
1745 .useroffset = (offsetof(struct smbdirect_recv_io, packet) +
1746 sizeof(struct smbdirect_data_transfer)),
1747 .usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer),
1748 };
1749 sc->recv_io.mem.cache =
1750 kmem_cache_create(name,
1751 sizeof(struct smbdirect_recv_io) + sp->max_recv_size,
1752 &response_args, SLAB_HWCACHE_ALIGN);
1753 if (!sc->recv_io.mem.cache)
1754 goto out2;
1755
1756 sc->recv_io.mem.pool =
1757 mempool_create(sp->recv_credit_max, mempool_alloc_slab,
1758 mempool_free_slab, sc->recv_io.mem.cache);
1759 if (!sc->recv_io.mem.pool)
1760 goto out3;
1761
1762 rc = allocate_receive_buffers(sc, sp->recv_credit_max);
1763 if (rc) {
1764 log_rdma_event(ERR, "failed to allocate receive buffers\n");
1765 goto out4;
1766 }
1767
1768 return 0;
1769
1770 out4:
1771 mempool_destroy(sc->recv_io.mem.pool);
1772 out3:
1773 kmem_cache_destroy(sc->recv_io.mem.cache);
1774 out2:
1775 mempool_destroy(sc->send_io.mem.pool);
1776 out1:
1777 kmem_cache_destroy(sc->send_io.mem.cache);
1778 return -ENOMEM;
1779 }
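/*
 * Sizing sketch (illustrative only, assuming the module-parameter
 * defaults at the top of this file): every recv_io object carries its
 * receive payload inline, so each slab object is
 *
 *	sizeof(struct smbdirect_recv_io) + sp->max_recv_size
 *
 * bytes (max_recv_size is 1364 by default), and the mempool guarantees
 * at least sp->recv_credit_max (255 by default) of them. The send_io
 * objects only embed the smbdirect_data_transfer header; the send
 * payload is presumably described separately via SGEs when posting.
 */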
1780
1781 /* Create a SMBD connection, called by upper layer */
1782 static struct smbd_connection *_smbd_get_connection(
1783 struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
1784 {
1785 int rc;
1786 struct smbd_connection *info;
1787 struct smbdirect_socket *sc;
1788 struct smbdirect_socket_parameters *sp;
1789 struct rdma_conn_param conn_param;
1790 struct ib_qp_cap qp_cap;
1791 struct ib_qp_init_attr qp_attr;
1792 struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
1793 struct ib_port_immutable port_immutable;
1794 __be32 ird_ord_hdr[2];
1795 char wq_name[80];
1796 struct workqueue_struct *workqueue;
1797
1798 info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
1799 if (!info)
1800 return NULL;
1801 sc = &info->socket;
1802 scnprintf(wq_name, ARRAY_SIZE(wq_name), "smbd_%p", sc);
1803 workqueue = create_workqueue(wq_name);
1804 if (!workqueue)
1805 goto create_wq_failed;
1806 smbdirect_socket_init(sc);
1807 sc->workqueue = workqueue;
1808 sp = &sc->parameters;
1809
1810 INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work);
1811
1812 sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
1813 sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
1814 sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
1815 sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000;
1816 sp->initiator_depth = 1;
1817 sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;
1818 sp->recv_credit_max = smbd_receive_credit_max;
1819 sp->send_credit_target = smbd_send_credit_target;
1820 sp->max_send_size = smbd_max_send_size;
1821 sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
1822 sp->max_recv_size = smbd_max_receive_size;
1823 sp->max_frmr_depth = smbd_max_frmr_depth;
1824 sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
1825 sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;
1826
1827 rc = smbd_ia_open(sc, dstaddr, port);
1828 if (rc) {
1829 log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
1830 goto create_id_failed;
1831 }
1832
1833 if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe ||
1834 sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
1835 log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
1836 sp->send_credit_target,
1837 sc->ib.dev->attrs.max_cqe,
1838 sc->ib.dev->attrs.max_qp_wr);
1839 goto config_failed;
1840 }
1841
1842 if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe ||
1843 sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) {
1844 log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
1845 sp->recv_credit_max,
1846 sc->ib.dev->attrs.max_cqe,
1847 sc->ib.dev->attrs.max_qp_wr);
1848 goto config_failed;
1849 }
1850
1851 if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE ||
1852 sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
1853 log_rdma_event(ERR,
1854 "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
1855 IB_DEVICE_NAME_MAX,
1856 sc->ib.dev->name,
1857 sc->ib.dev->attrs.max_send_sge,
1858 sc->ib.dev->attrs.max_recv_sge);
1859 goto config_failed;
1860 }
1861
1862 sp->responder_resources =
1863 min_t(u8, sp->responder_resources,
1864 sc->ib.dev->attrs.max_qp_rd_atom);
1865 log_rdma_mr(INFO, "responder_resources=%d\n",
1866 sp->responder_resources);
1867
1868 /*
1869 * We allocate sp->responder_resources * 2 MRs,
1870 * and each MR needs WRs for REG and INV, so
1871 * we use '* 4'.
1872 *
1873 * +1 for ib_drain_qp()
1874 */
1875 memset(&qp_cap, 0, sizeof(qp_cap));
1876 qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1;
1877 qp_cap.max_recv_wr = sp->recv_credit_max + 1;
1878 qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
1879 qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
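/*
 * Worked example (illustrative, assuming the compiled-in defaults and a
 * device that doesn't clamp responder_resources below 32):
 *
 *	max_send_wr = 255 + 32 * 4 + 1 = 384
 *	max_recv_wr = 255 + 1 = 256
 *
 * i.e. room for the negotiated send credits, a REG and an INV work
 * request for each of the 2 * responder_resources MRs, and the extra
 * WR that ib_drain_qp() posts on each queue.
 */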
1880
1881 sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
1882 if (IS_ERR(sc->ib.pd)) {
1883 rc = PTR_ERR(sc->ib.pd);
1884 sc->ib.pd = NULL;
1885 log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
1886 goto alloc_pd_failed;
1887 }
1888
1889 sc->ib.send_cq =
1890 ib_alloc_cq_any(sc->ib.dev, sc,
1891 qp_cap.max_send_wr, IB_POLL_SOFTIRQ);
1892 if (IS_ERR(sc->ib.send_cq)) {
1893 sc->ib.send_cq = NULL;
1894 goto alloc_cq_failed;
1895 }
1896
1897 sc->ib.recv_cq =
1898 ib_alloc_cq_any(sc->ib.dev, sc,
1899 qp_cap.max_recv_wr, IB_POLL_SOFTIRQ);
1900 if (IS_ERR(sc->ib.recv_cq)) {
1901 sc->ib.recv_cq = NULL;
1902 goto alloc_cq_failed;
1903 }
1904
1905 memset(&qp_attr, 0, sizeof(qp_attr));
1906 qp_attr.event_handler = smbd_qp_async_error_upcall;
1907 qp_attr.qp_context = sc;
1908 qp_attr.cap = qp_cap;
1909 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1910 qp_attr.qp_type = IB_QPT_RC;
1911 qp_attr.send_cq = sc->ib.send_cq;
1912 qp_attr.recv_cq = sc->ib.recv_cq;
1913 qp_attr.port_num = ~0;
1914
1915 rc = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
1916 if (rc) {
1917 log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
1918 goto create_qp_failed;
1919 }
1920 sc->ib.qp = sc->rdma.cm_id->qp;
1921
1922 memset(&conn_param, 0, sizeof(conn_param));
1923 conn_param.initiator_depth = sp->initiator_depth;
1924 conn_param.responder_resources = sp->responder_resources;
1925
1926 /* Need to send IRD/ORD in private data for iWARP */
1927 sc->ib.dev->ops.get_port_immutable(
1928 sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable);
1929 if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
1930 ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
1931 ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
1932 conn_param.private_data = ird_ord_hdr;
1933 conn_param.private_data_len = sizeof(ird_ord_hdr);
1934 } else {
1935 conn_param.private_data = NULL;
1936 conn_param.private_data_len = 0;
1937 }
1938
1939 conn_param.retry_count = SMBD_CM_RETRY;
1940 conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
1941 conn_param.flow_control = 0;
1942
1943 log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
1944 &addr_in->sin_addr, port);
1945
1946 WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
1947 sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
1948 rc = rdma_connect(sc->rdma.cm_id, &conn_param);
1949 if (rc) {
1950 log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
1951 goto rdma_connect_failed;
1952 }
1953
1954 wait_event_interruptible_timeout(
1955 sc->status_wait,
1956 sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING,
1957 msecs_to_jiffies(sp->rdma_connect_timeout_msec));
1958
1959 if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) {
1960 log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
1961 goto rdma_connect_failed;
1962 }
1963
1964 log_rdma_event(INFO, "rdma_connect connected\n");
1965
1966 rc = allocate_caches(sc);
1967 if (rc) {
1968 log_rdma_event(ERR, "cache allocation failed\n");
1969 goto allocate_cache_failed;
1970 }
1971
1972 INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message);
1973 INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer);
1974 /*
1975 * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
1976 * so that the timer will cause a disconnect.
1977 */
1978 sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
1979 mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
1980 msecs_to_jiffies(sp->negotiate_timeout_msec));
1981
1982 INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits);
1983
1984 rc = smbd_negotiate(sc);
1985 if (rc) {
1986 log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
1987 goto negotiation_failed;
1988 }
1989
1990 rc = allocate_mr_list(sc);
1991 if (rc) {
1992 log_rdma_mr(ERR, "memory registration allocation failed\n");
1993 goto allocate_mr_failed;
1994 }
1995
1996 return info;
1997
1998 allocate_mr_failed:
1999 /* At this point, we need a full transport shutdown */
2000 server->smbd_conn = info;
2001 smbd_destroy(server);
2002 return NULL;
2003
2004 negotiation_failed:
2005 disable_delayed_work_sync(&sc->idle.timer_work);
2006 destroy_caches(sc);
2007 sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
2008 rdma_disconnect(sc->rdma.cm_id);
2009 wait_event(sc->status_wait,
2010 sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
2011
2012 allocate_cache_failed:
2013 rdma_connect_failed:
2014 rdma_destroy_qp(sc->rdma.cm_id);
2015
2016 create_qp_failed:
2017 alloc_cq_failed:
2018 if (sc->ib.send_cq)
2019 ib_free_cq(sc->ib.send_cq);
2020 if (sc->ib.recv_cq)
2021 ib_free_cq(sc->ib.recv_cq);
2022
2023 ib_dealloc_pd(sc->ib.pd);
2024
2025 alloc_pd_failed:
2026 config_failed:
2027 rdma_destroy_id(sc->rdma.cm_id);
2028
2029 create_id_failed:
2030 destroy_workqueue(sc->workqueue);
2031 create_wq_failed:
2032 kfree(info);
2033 return NULL;
2034 }
2035
2036 struct smbd_connection *smbd_get_connection(
2037 struct TCP_Server_Info *server, struct sockaddr *dstaddr)
2038 {
2039 struct smbd_connection *ret;
2040 const struct smbdirect_socket_parameters *sp;
2041 int port = SMBD_PORT;
2042
2043 try_again:
2044 ret = _smbd_get_connection(server, dstaddr, port);
2045
2046 /* Try SMB_PORT if SMBD_PORT doesn't work */
2047 if (!ret && port == SMBD_PORT) {
2048 port = SMB_PORT;
2049 goto try_again;
2050 }
2051 if (!ret)
2052 return NULL;
2053
2054 sp = &ret->socket.parameters;
2055
2056 server->rdma_readwrite_threshold =
2057 rdma_readwrite_threshold > sp->max_fragmented_send_size ?
2058 sp->max_fragmented_send_size :
2059 rdma_readwrite_threshold;
2060
2061 return ret;
2062 }
2063
2064 /*
2065 * Receive data from the transport's receive reassembly queue
2066 * All incoming data packets are placed in the reassembly queue
2067 * iter: the buffer to read data into
2068 * size: the length of data to read
2069 * return value: actual data read
2070 *
2071 * Note: this implementation copies the data from the reassembly queue to the
2072 * receive buffers used by the upper layer. This is not the optimal code path. A
2073 * better way would be to not have the upper layer allocate its own receive
2074 * buffers, but rather borrow the buffer from the reassembly queue and return it
2075 * after the data is consumed. But this would require more changes to the upper
2076 * layer code, and would also need to consider packet boundaries while they are still being reassembled.
2077 */
2078 int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
2079 {
2080 struct smbdirect_socket *sc = &info->socket;
2081 struct smbdirect_recv_io *response;
2082 struct smbdirect_data_transfer *data_transfer;
2083 size_t size = iov_iter_count(&msg->msg_iter);
2084 int to_copy, to_read, data_read, offset;
2085 u32 data_length, remaining_data_length, data_offset;
2086 int rc;
2087
2088 if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE))
2089 return -EINVAL; /* It's a bug in the upper layer to get here */
2090
2091 again:
2092 /*
2093 * No need to hold the reassembly queue lock all the time as we are
2094 * the only one reading from the front of the queue. The transport
2095 * may add more entries to the back of the queue at the same time
2096 */
2097 log_read(INFO, "size=%zd sc->recv_io.reassembly.data_length=%d\n", size,
2098 sc->recv_io.reassembly.data_length);
2099 if (sc->recv_io.reassembly.data_length >= size) {
2100 int queue_length;
2101 int queue_removed = 0;
2102 unsigned long flags;
2103
2104 /*
2105 * Need to make sure reassembly_data_length is read before
2106 * reading reassembly_queue_length and calling
2107 * _get_first_reassembly. This call is lock free
2108 * as we never read the entries at the end of the queue, which are being
2109 * updated in SOFTIRQ context as more data is received
2110 */
2111 virt_rmb();
2112 queue_length = sc->recv_io.reassembly.queue_length;
2113 data_read = 0;
2114 to_read = size;
2115 offset = sc->recv_io.reassembly.first_entry_offset;
2116 while (data_read < size) {
2117 response = _get_first_reassembly(sc);
2118 data_transfer = smbdirect_recv_io_payload(response);
2119 data_length = le32_to_cpu(data_transfer->data_length);
2120 remaining_data_length =
2121 le32_to_cpu(
2122 data_transfer->remaining_data_length);
2123 data_offset = le32_to_cpu(data_transfer->data_offset);
2124
2125 /*
2126 * The upper layer expects an RFC1002 length at the
2127 * beginning of the payload. Return it to indicate
2128 * the total length of the packet. This minimizes the
2129 * changes to the upper layer packet processing logic.
2130 * It will eventually be removed when an intermediate
2131 * transport layer is added
2132 */
2133 if (response->first_segment && size == 4) {
2134 unsigned int rfc1002_len =
2135 data_length + remaining_data_length;
2136 __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len);
2137 if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr),
2138 &msg->msg_iter) != sizeof(rfc1002_hdr))
2139 return -EFAULT;
2140 data_read = 4;
2141 response->first_segment = false;
2142 log_read(INFO, "returning rfc1002 length %d\n",
2143 rfc1002_len);
2144 goto read_rfc1002_done;
2145 }
2146
2147 to_copy = min_t(int, data_length - offset, to_read);
2148 if (copy_to_iter((char *)data_transfer + data_offset + offset,
2149 to_copy, &msg->msg_iter) != to_copy)
2150 return -EFAULT;
2151
2152 /* move on to the next buffer? */
2153 if (to_copy == data_length - offset) {
2154 queue_length--;
2155 /*
2156 * No need to lock if we are not at the
2157 * end of the queue
2158 */
2159 if (queue_length)
2160 list_del(&response->list);
2161 else {
2162 spin_lock_irqsave(
2163 &sc->recv_io.reassembly.lock, flags);
2164 list_del(&response->list);
2165 spin_unlock_irqrestore(
2166 &sc->recv_io.reassembly.lock, flags);
2167 }
2168 queue_removed++;
2169 sc->statistics.dequeue_reassembly_queue++;
2170 put_receive_buffer(sc, response);
2171 offset = 0;
2172 log_read(INFO, "put_receive_buffer offset=0\n");
2173 } else
2174 offset += to_copy;
2175
2176 to_read -= to_copy;
2177 data_read += to_copy;
2178
2179 log_read(INFO, "_get_first_reassembly memcpy %d bytes data_transfer_length-offset=%d after that to_read=%d data_read=%d offset=%d\n",
2180 to_copy, data_length - offset,
2181 to_read, data_read, offset);
2182 }
2183
2184 spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
2185 sc->recv_io.reassembly.data_length -= data_read;
2186 sc->recv_io.reassembly.queue_length -= queue_removed;
2187 spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
2188
2189 sc->recv_io.reassembly.first_entry_offset = offset;
2190 log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
2191 data_read, sc->recv_io.reassembly.data_length,
2192 sc->recv_io.reassembly.first_entry_offset);
2193 read_rfc1002_done:
2194 return data_read;
2195 }
2196
2197 log_read(INFO, "wait_event on more data\n");
2198 rc = wait_event_interruptible(
2199 sc->recv_io.reassembly.wait_queue,
2200 sc->recv_io.reassembly.data_length >= size ||
2201 sc->status != SMBDIRECT_SOCKET_CONNECTED);
2202 /* Don't return any data if interrupted */
2203 if (rc)
2204 return rc;
2205
2206 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
2207 log_read(ERR, "disconnected\n");
2208 return -ECONNABORTED;
2209 }
2210
2211 goto again;
2212 }
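/*
 * Usage sketch (illustrative only; the real caller is the generic SMB
 * receive path): the upper layer typically first reads the 4-byte
 * RFC1002 length and then the announced body, e.g.:
 *
 *	struct kvec kv = { .iov_base = buf, .iov_len = 4 };
 *	struct msghdr msg = {};
 *
 *	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &kv, 1, 4);
 *	rc = smbd_recv(server->smbd_conn, &msg);
 *
 * returns 4 with the length filled in, and a second call with an
 * iterator sized to that length returns the payload. Either call may
 * block in the wait_event_interruptible() above until the reassembly
 * queue holds enough data or the socket disconnects.
 */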
2213
2214 /*
2215 * Send data to transport
2216 * Each rqst is transported as a SMBDirect payload
2217 * rqst: the data to write
2218 * return value: 0 on successful write, otherwise error code
2219 */
2220 int smbd_send(struct TCP_Server_Info *server,
2221 int num_rqst, struct smb_rqst *rqst_array)
2222 {
2223 struct smbd_connection *info = server->smbd_conn;
2224 struct smbdirect_socket *sc = &info->socket;
2225 struct smbdirect_socket_parameters *sp = &sc->parameters;
2226 struct smb_rqst *rqst;
2227 struct iov_iter iter;
2228 unsigned int remaining_data_length, klen;
2229 int rc, i, rqst_idx;
2230
2231 if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
2232 return -EAGAIN;
2233
2234 /*
2235 * Add in the page array if there is one. The caller needs to set
2236 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
2237 * ends at page boundary
2238 */
2239 remaining_data_length = 0;
2240 for (i = 0; i < num_rqst; i++)
2241 remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
2242
2243 if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) {
2244 /* assertion: payload never exceeds negotiated maximum */
2245 log_write(ERR, "payload size %d > max size %d\n",
2246 remaining_data_length, sp->max_fragmented_send_size);
2247 return -EINVAL;
2248 }
2249
2250 log_write(INFO, "num_rqst=%d total length=%u\n",
2251 num_rqst, remaining_data_length);
2252
2253 rqst_idx = 0;
2254 do {
2255 rqst = &rqst_array[rqst_idx];
2256
2257 cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
2258 rqst_idx, smb_rqst_len(server, rqst));
2259 for (i = 0; i < rqst->rq_nvec; i++)
2260 dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len);
2261
2262 log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n",
2263 rqst_idx, rqst->rq_nvec, remaining_data_length,
2264 iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst));
2265
2266 /* Send the metadata pages. */
2267 klen = 0;
2268 for (i = 0; i < rqst->rq_nvec; i++)
2269 klen += rqst->rq_iov[i].iov_len;
2270 iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
2271
2272 rc = smbd_post_send_full_iter(sc, &iter, &remaining_data_length);
2273 if (rc < 0)
2274 break;
2275
2276 if (iov_iter_count(&rqst->rq_iter) > 0) {
2277 /* And then the data pages if there are any */
2278 rc = smbd_post_send_full_iter(sc, &rqst->rq_iter,
2279 &remaining_data_length);
2280 if (rc < 0)
2281 break;
2282 }
2283
2284 } while (++rqst_idx < num_rqst);
2285
2286 /*
2287 * As an optimization, we don't wait for each individual I/O to finish
2288 * before sending the next one.
2289 * Send them all and wait for the pending send count to reach 0,
2290 * which means all the I/Os have gone out and we are good to return
2291 */
2292
2293 wait_event(sc->send_io.pending.zero_wait_queue,
2294 atomic_read(&sc->send_io.pending.count) == 0 ||
2295 sc->status != SMBDIRECT_SOCKET_CONNECTED);
2296
2297 if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0)
2298 rc = -EAGAIN;
2299
2300 return rc;
2301 }
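/*
 * Usage sketch (illustrative only; the real callers live in the SMB2
 * request-sending path): each request is described by kvecs for the
 * headers plus an optional data iterator, and the whole array is handed
 * over in one call, e.g. for a single header-only request:
 *
 *	struct kvec iov = { .iov_base = hdr, .iov_len = hdr_len };
 *	struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
 *
 *	rc = smbd_send(server, 1, &rqst);
 *
 * The helpers above split the payload into individual SMBDirect send
 * messages, and smbd_send() only returns once every posted send has
 * completed or the socket has disconnected.
 */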
2302
2303 static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
2304 {
2305 struct smbdirect_mr_io *mr =
2306 container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
2307 struct smbdirect_socket *sc = mr->socket;
2308
2309 if (wc->status) {
2310 log_rdma_mr(ERR, "status=%d\n", wc->status);
2311 smbd_disconnect_rdma_connection(sc);
2312 }
2313 }
2314
2315 /*
2316 * The work queue function that recovers MRs
2317 * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
2318 * again. Both calls are slow, so finish them in a workqueue. This will not
2319 * block I/O path.
2320 * There is one workqueue that recovers MRs, there is no need to lock as the
2321 * I/O requests calling smbd_register_mr will never update the links in the
2322 * mr_list.
2323 */
2324 static void smbd_mr_recovery_work(struct work_struct *work)
2325 {
2326 struct smbdirect_socket *sc =
2327 container_of(work, struct smbdirect_socket, mr_io.recovery_work);
2328 struct smbdirect_socket_parameters *sp = &sc->parameters;
2329 struct smbdirect_mr_io *smbdirect_mr;
2330 int rc;
2331
2332 list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) {
2333 if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) {
2334
2335 /* recover this MR entry */
2336 rc = ib_dereg_mr(smbdirect_mr->mr);
2337 if (rc) {
2338 log_rdma_mr(ERR,
2339 "ib_dereg_mr failed rc=%x\n",
2340 rc);
2341 smbd_disconnect_rdma_connection(sc);
2342 continue;
2343 }
2344
2345 smbdirect_mr->mr = ib_alloc_mr(
2346 sc->ib.pd, sc->mr_io.type,
2347 sp->max_frmr_depth);
2348 if (IS_ERR(smbdirect_mr->mr)) {
2349 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
2350 sc->mr_io.type,
2351 sp->max_frmr_depth);
2352 smbd_disconnect_rdma_connection(sc);
2353 continue;
2354 }
2355 } else
2356 /* This MR is being used, don't recover it */
2357 continue;
2358
2359 smbdirect_mr->state = SMBDIRECT_MR_READY;
2360
2361 /* smbdirect_mr->state is updated by this function
2362 * and is read and updated by the I/O issuing CPUs trying
2363 * to get an MR. The call to atomic_inc_return
2364 * implies a memory barrier and guarantees this
2365 * value is updated before waking up any calls to
2366 * get_mr() from the I/O issuing CPUs
2367 */
2368 if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
2369 wake_up(&sc->mr_io.ready.wait_queue);
2370 }
2371 }
2372
2373 static void smbd_mr_disable_locked(struct smbdirect_mr_io *mr)
2374 {
2375 struct smbdirect_socket *sc = mr->socket;
2376
2377 lockdep_assert_held(&mr->mutex);
2378
2379 if (mr->state == SMBDIRECT_MR_DISABLED)
2380 return;
2381
2382 if (mr->mr)
2383 ib_dereg_mr(mr->mr);
2384 if (mr->sgt.nents)
2385 ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2386 kfree(mr->sgt.sgl);
2387
2388 mr->mr = NULL;
2389 mr->sgt.sgl = NULL;
2390 mr->sgt.nents = 0;
2391
2392 mr->state = SMBDIRECT_MR_DISABLED;
2393 }
2394
2395 static void smbd_mr_free_locked(struct kref *kref)
2396 {
2397 struct smbdirect_mr_io *mr =
2398 container_of(kref, struct smbdirect_mr_io, kref);
2399
2400 lockdep_assert_held(&mr->mutex);
2401
2402 /*
2403 * smbd_mr_disable_locked() should already be called!
2404 */
2405 if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED))
2406 smbd_mr_disable_locked(mr);
2407
2408 mutex_unlock(&mr->mutex);
2409 mutex_destroy(&mr->mutex);
2410 kfree(mr);
2411 }
2412
2413 static void destroy_mr_list(struct smbdirect_socket *sc)
2414 {
2415 struct smbdirect_mr_io *mr, *tmp;
2416 LIST_HEAD(all_list);
2417 unsigned long flags;
2418
2419 disable_work_sync(&sc->mr_io.recovery_work);
2420
2421 spin_lock_irqsave(&sc->mr_io.all.lock, flags);
2422 list_splice_tail_init(&sc->mr_io.all.list, &all_list);
2423 spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
2424
2425 list_for_each_entry_safe(mr, tmp, &all_list, list) {
2426 mutex_lock(&mr->mutex);
2427
2428 smbd_mr_disable_locked(mr);
2429 list_del(&mr->list);
2430 mr->socket = NULL;
2431
2432 /*
2433 * No kref_put_mutex() as it's already locked.
2434 *
2435 * If smbd_mr_free_locked() is called
2436 * and the mutex is unlocked and mr is gone,
2437 * in that case kref_put() returned 1.
2438 *
2439 * If kref_put() returned 0 we know that
2440 * smbd_mr_free_locked() didn't
2441 * run. Not by us nor by anyone else, as we
2442 * still hold the mutex, so we need to unlock.
2443 *
2444 * If the mr is still registered it will
2445 * be dangling (detached from the connection),
2446 * waiting for smbd_deregister_mr() to be
2447 * called in order to free the memory.
2448 */
2449 if (!kref_put(&mr->kref, smbd_mr_free_locked))
2450 mutex_unlock(&mr->mutex);
2451 }
2452 }
2453
2454 /*
2455 * Allocate MRs used for RDMA read/write
2456 * The number of MRs will not exceed the hardware capability reported in responder_resources
2457 * All MRs are kept in mr_list. An MR can be recovered after it's used
2458 * Recovery is done in smbd_mr_recovery_work. The content of a list entry changes
2459 * as MRs are used and recovered for I/O, but the list links will not change
2460 */
2461 static int allocate_mr_list(struct smbdirect_socket *sc)
2462 {
2463 struct smbdirect_socket_parameters *sp = &sc->parameters;
2464 struct smbdirect_mr_io *mr;
2465 int ret;
2466 u32 i;
2467
2468 if (sp->responder_resources == 0) {
2469 log_rdma_mr(ERR, "responder_resources negotiated as 0\n");
2470 return -EINVAL;
2471 }
2472
2473 /* Allocate more MRs (2x) than hardware responder_resources */
2474 for (i = 0; i < sp->responder_resources * 2; i++) {
2475 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2476 if (!mr) {
2477 ret = -ENOMEM;
2478 goto kzalloc_mr_failed;
2479 }
2480
2481 kref_init(&mr->kref);
2482 mutex_init(&mr->mutex);
2483
2484 mr->mr = ib_alloc_mr(sc->ib.pd,
2485 sc->mr_io.type,
2486 sp->max_frmr_depth);
2487 if (IS_ERR(mr->mr)) {
2488 ret = PTR_ERR(mr->mr);
2489 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
2490 sc->mr_io.type, sp->max_frmr_depth);
2491 goto ib_alloc_mr_failed;
2492 }
2493
2494 mr->sgt.sgl = kcalloc(sp->max_frmr_depth,
2495 sizeof(struct scatterlist),
2496 GFP_KERNEL);
2497 if (!mr->sgt.sgl) {
2498 ret = -ENOMEM;
2499 log_rdma_mr(ERR, "failed to allocate sgl\n");
2500 goto kcalloc_sgl_failed;
2501 }
2502 mr->state = SMBDIRECT_MR_READY;
2503 mr->socket = sc;
2504
2505 list_add_tail(&mr->list, &sc->mr_io.all.list);
2506 atomic_inc(&sc->mr_io.ready.count);
2507 }
2508
2509 INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work);
2510
2511 return 0;
2512
2513 kcalloc_sgl_failed:
2514 ib_dereg_mr(mr->mr);
2515 ib_alloc_mr_failed:
2516 mutex_destroy(&mr->mutex);
2517 kfree(mr);
2518 kzalloc_mr_failed:
2519 destroy_mr_list(sc);
2520 return ret;
2521 }
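/*
 * Worked example (illustrative): with responder_resources negotiated as
 * 32, this creates 64 MRs, each backed by an sgl of max_frmr_depth
 * scatterlist entries, and sc->mr_io.ready.count is bumped once per MR,
 * ending at 64. That matches the "* 4" work-request budget reserved in
 * _smbd_get_connection(), since every MR may need both a REG and an INV
 * work request in flight.
 */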
2522
2523 /*
2524 * Get an MR from mr_list. This function waits until there is at least one
2525 * MR available in the list. It may access the list while the
2526 * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
2527 * as they never modify the same places. However, several CPUs may be
2528 * issuing I/O and trying to get an MR at the same time, so mr_list_lock is
2529 * used to protect against this situation.
2530 */
2531 static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc)
2532 {
2533 struct smbdirect_mr_io *ret;
2534 unsigned long flags;
2535 int rc;
2536 again:
2537 rc = wait_event_interruptible(sc->mr_io.ready.wait_queue,
2538 atomic_read(&sc->mr_io.ready.count) ||
2539 sc->status != SMBDIRECT_SOCKET_CONNECTED);
2540 if (rc) {
2541 log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
2542 return NULL;
2543 }
2544
2545 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
2546 log_rdma_mr(ERR, "sc->status=%x\n", sc->status);
2547 return NULL;
2548 }
2549
2550 spin_lock_irqsave(&sc->mr_io.all.lock, flags);
2551 list_for_each_entry(ret, &sc->mr_io.all.list, list) {
2552 if (ret->state == SMBDIRECT_MR_READY) {
2553 ret->state = SMBDIRECT_MR_REGISTERED;
2554 kref_get(&ret->kref);
2555 spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
2556 atomic_dec(&sc->mr_io.ready.count);
2557 atomic_inc(&sc->mr_io.used.count);
2558 return ret;
2559 }
2560 }
2561
2562 spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
2563 /*
2564 * It is possible that we could fail to get an MR because other processes may
2565 * try to acquire an MR at the same time. If this is the case, retry.
2566 */
2567 goto again;
2568 }
2569
2570 /*
2571 * Transcribe the pages from an iterator into an MR scatterlist.
2572 */
2573 static int smbd_iter_to_mr(struct iov_iter *iter,
2574 struct sg_table *sgt,
2575 unsigned int max_sg)
2576 {
2577 int ret;
2578
2579 memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
2580
2581 ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
2582 WARN_ON(ret < 0);
2583 if (sgt->nents > 0)
2584 sg_mark_end(&sgt->sgl[sgt->nents - 1]);
2585 return ret;
2586 }
2587
2588 /*
2589 * Register memory for RDMA read/write
2590 * iter: the buffer to register memory with
2591 * writing: true if this is a RDMA write (SMB read), false for RDMA read
2592 * need_invalidate: true if this MR needs to be locally invalidated after I/O
2593 * return value: the MR registered, NULL if failed.
2594 */
2595 struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
2596 struct iov_iter *iter,
2597 bool writing, bool need_invalidate)
2598 {
2599 struct smbdirect_socket *sc = &info->socket;
2600 struct smbdirect_socket_parameters *sp = &sc->parameters;
2601 struct smbdirect_mr_io *mr;
2602 int rc, num_pages;
2603 struct ib_reg_wr *reg_wr;
2604
2605 num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
2606 if (num_pages > sp->max_frmr_depth) {
2607 log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
2608 num_pages, sp->max_frmr_depth);
2609 WARN_ON_ONCE(1);
2610 return NULL;
2611 }
2612
2613 mr = get_mr(sc);
2614 if (!mr) {
2615 log_rdma_mr(ERR, "get_mr returning NULL\n");
2616 return NULL;
2617 }
2618
2619 mutex_lock(&mr->mutex);
2620
2621 mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
2622 mr->need_invalidate = need_invalidate;
2623 mr->sgt.nents = 0;
2624 mr->sgt.orig_nents = 0;
2625
2626 log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
2627 num_pages, iov_iter_count(iter), sp->max_frmr_depth);
2628 smbd_iter_to_mr(iter, &mr->sgt, sp->max_frmr_depth);
2629
2630 rc = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2631 if (!rc) {
2632 log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
2633 num_pages, mr->dir, rc);
2634 goto dma_map_error;
2635 }
2636
2637 rc = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
2638 if (rc != mr->sgt.nents) {
2639 log_rdma_mr(ERR,
2640 "ib_map_mr_sg failed rc = %d nents = %x\n",
2641 rc, mr->sgt.nents);
2642 goto map_mr_error;
2643 }
2644
2645 ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
2646 reg_wr = &mr->wr;
2647 reg_wr->wr.opcode = IB_WR_REG_MR;
2648 mr->cqe.done = register_mr_done;
2649 reg_wr->wr.wr_cqe = &mr->cqe;
2650 reg_wr->wr.num_sge = 0;
2651 reg_wr->wr.send_flags = IB_SEND_SIGNALED;
2652 reg_wr->mr = mr->mr;
2653 reg_wr->key = mr->mr->rkey;
2654 reg_wr->access = writing ?
2655 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
2656 IB_ACCESS_REMOTE_READ;
2657
2658 /*
2659 * There is no need to wait for completion of ib_post_send
2660 * for IB_WR_REG_MR. Hardware enforces a barrier and ordering of execution
2661 * on the next ib_post_send when we actually send I/O to the remote peer
2662 */
2663 rc = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
2664 if (!rc) {
2665 /*
2666 * get_mr() gave us a reference
2667 * via kref_get(&mr->kref), we keep that and let
2668 * the caller use smbd_deregister_mr()
2669 * to remove it again.
2670 */
2671 mutex_unlock(&mr->mutex);
2672 return mr;
2673 }
2674
2675 log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
2676 rc, reg_wr->key);
2677
2678 /* If all failed, attempt to recover this MR by setting it to SMBDIRECT_MR_ERROR */
2679 map_mr_error:
2680 ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2681
2682 dma_map_error:
2683 mr->sgt.nents = 0;
2684 mr->state = SMBDIRECT_MR_ERROR;
2685 if (atomic_dec_and_test(&sc->mr_io.used.count))
2686 wake_up(&sc->mr_io.cleanup.wait_queue);
2687
2688 smbd_disconnect_rdma_connection(sc);
2689
2690 /*
2691 * get_mr() gave us a reference
2692 * via kref_get(&mr->kref), we need to remove it again
2693 * on error.
2694 *
2695 * No kref_put_mutex() as it's already locked.
2696 *
2697 * If smbd_mr_free_locked() is called
2698 * and the mutex is unlocked and mr is gone,
2699 * in that case kref_put() returned 1.
2700 *
2701 * If kref_put() returned 0 we know that
2702 * smbd_mr_free_locked() didn't
2703 * run. Not by us nor by anyone else, as we
2704 * still hold the mutex, so we need to unlock.
2705 */
2706 if (!kref_put(&mr->kref, smbd_mr_free_locked))
2707 mutex_unlock(&mr->mutex);
2708
2709 return NULL;
2710 }
2711
2712 static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
2713 {
2714 struct smbdirect_mr_io *smbdirect_mr;
2715 struct ib_cqe *cqe;
2716
2717 cqe = wc->wr_cqe;
2718 smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe);
2719 smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED;
2720 if (wc->status != IB_WC_SUCCESS) {
2721 log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
2722 smbdirect_mr->state = SMBDIRECT_MR_ERROR;
2723 }
2724 complete(&smbdirect_mr->invalidate_done);
2725 }
2726
2727 /*
2728 * Deregister an MR after I/O is done
2729 * This function may wait if remote invalidation is not used
2730 * and we have to locally invalidate the buffer to prevent data from being
2731 * modified by the remote peer after the upper layer consumes it
2732 */
2733 void smbd_deregister_mr(struct smbdirect_mr_io *mr)
2734 {
2735 struct smbdirect_socket *sc = mr->socket;
2736
2737 mutex_lock(&mr->mutex);
2738 if (mr->state == SMBDIRECT_MR_DISABLED)
2739 goto put_kref;
2740
2741 if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
2742 smbd_mr_disable_locked(mr);
2743 goto put_kref;
2744 }
2745
2746 if (mr->need_invalidate) {
2747 struct ib_send_wr *wr = &mr->inv_wr;
2748 int rc;
2749
2750 /* Need to finish local invalidation before returning */
2751 wr->opcode = IB_WR_LOCAL_INV;
2752 mr->cqe.done = local_inv_done;
2753 wr->wr_cqe = &mr->cqe;
2754 wr->num_sge = 0;
2755 wr->ex.invalidate_rkey = mr->mr->rkey;
2756 wr->send_flags = IB_SEND_SIGNALED;
2757
2758 init_completion(&mr->invalidate_done);
2759 rc = ib_post_send(sc->ib.qp, wr, NULL);
2760 if (rc) {
2761 log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
2762 smbd_mr_disable_locked(mr);
2763 smbd_disconnect_rdma_connection(sc);
2764 goto done;
2765 }
2766 wait_for_completion(&mr->invalidate_done);
2767 mr->need_invalidate = false;
2768 } else
2769 /*
2770 * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED
2771 * and defer to mr_recovery_work to recover the MR for next use
2772 */
2773 mr->state = SMBDIRECT_MR_INVALIDATED;
2774
2775 if (mr->sgt.nents) {
2776 ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
2777 mr->sgt.nents = 0;
2778 }
2779
2780 if (mr->state == SMBDIRECT_MR_INVALIDATED) {
2781 mr->state = SMBDIRECT_MR_READY;
2782 if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
2783 wake_up(&sc->mr_io.ready.wait_queue);
2784 } else
2785 /*
2786 * Schedule the work to do MR recovery for future I/Os.
2787 * MR recovery is slow and we don't want it to block the current I/O
2788 */
2789 queue_work(sc->workqueue, &sc->mr_io.recovery_work);
2790
2791 done:
2792 if (atomic_dec_and_test(&sc->mr_io.used.count))
2793 wake_up(&sc->mr_io.cleanup.wait_queue);
2794
2795 put_kref:
2796 /*
2797 * No kref_put_mutex() as it's already locked.
2798 *
2799 * If smbd_mr_free_locked() is called
2800 * and the mutex is unlocked and mr is gone,
2801 * in that case kref_put() returned 1.
2802 *
2803 * If kref_put() returned 0 we know that
2804 * smbd_mr_free_locked() didn't
2805 * run. Not by us nor by anyone else, as we
2806 * still hold the mutex, so we need to unlock
2807 * and keep the mr in SMBDIRECT_MR_READY or
2808 * SMBDIRECT_MR_ERROR state.
2809 */
2810 if (!kref_put(&mr->kref, smbd_mr_free_locked))
2811 mutex_unlock(&mr->mutex);
2812 }
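/*
 * Lifecycle sketch (illustrative only; the real users are the SMB2
 * read/write paths): register the pages the peer should RDMA to/from,
 * advertise the MR in the request, then deregister after the response:
 *
 *	mr = smbd_register_mr(server->smbd_conn, &rqst->rq_iter,
 *			      true, need_invalidate);
 *	if (!mr)
 *		return -EAGAIN;
 *	(advertise mr->mr->rkey, mr->mr->iova and mr->mr->length in the
 *	 request's channel info, send it, wait for the response)
 *	smbd_deregister_mr(mr);
 *
 * If need_invalidate was false (the peer invalidated the MR remotely),
 * deregistration is cheap; otherwise it posts IB_WR_LOCAL_INV and waits
 * for its completion before the buffer may be reused.
 */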
2813
2814 static bool smb_set_sge(struct smb_extract_to_rdma *rdma,
2815 struct page *lowest_page, size_t off, size_t len)
2816 {
2817 struct ib_sge *sge = &rdma->sge[rdma->nr_sge];
2818 u64 addr;
2819
2820 addr = ib_dma_map_page(rdma->device, lowest_page,
2821 off, len, rdma->direction);
2822 if (ib_dma_mapping_error(rdma->device, addr))
2823 return false;
2824
2825 sge->addr = addr;
2826 sge->length = len;
2827 sge->lkey = rdma->local_dma_lkey;
2828 rdma->nr_sge++;
2829 return true;
2830 }
2831
2832 /*
2833 * Extract page fragments from a BVEC-class iterator and add them to an RDMA
2834 * element list. The pages are not pinned.
2835 */
2836 static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter,
2837 struct smb_extract_to_rdma *rdma,
2838 ssize_t maxsize)
2839 {
2840 const struct bio_vec *bv = iter->bvec;
2841 unsigned long start = iter->iov_offset;
2842 unsigned int i;
2843 ssize_t ret = 0;
2844
2845 for (i = 0; i < iter->nr_segs; i++) {
2846 size_t off, len;
2847
2848 len = bv[i].bv_len;
2849 if (start >= len) {
2850 start -= len;
2851 continue;
2852 }
2853
2854 len = min_t(size_t, maxsize, len - start);
2855 off = bv[i].bv_offset + start;
2856
2857 if (!smb_set_sge(rdma, bv[i].bv_page, off, len))
2858 return -EIO;
2859
2860 ret += len;
2861 maxsize -= len;
2862 if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
2863 break;
2864 start = 0;
2865 }
2866
2867 if (ret > 0)
2868 iov_iter_advance(iter, ret);
2869 return ret;
2870 }
2871
2872 /*
2873 * Extract fragments from a KVEC-class iterator and add them to an RDMA list.
2874 * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers.
2875 * The pages are not pinned.
2876 */
2877 static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter,
2878 struct smb_extract_to_rdma *rdma,
2879 ssize_t maxsize)
2880 {
2881 const struct kvec *kv = iter->kvec;
2882 unsigned long start = iter->iov_offset;
2883 unsigned int i;
2884 ssize_t ret = 0;
2885
2886 for (i = 0; i < iter->nr_segs; i++) {
2887 struct page *page;
2888 unsigned long kaddr;
2889 size_t off, len, seg;
2890
2891 len = kv[i].iov_len;
2892 if (start >= len) {
2893 start -= len;
2894 continue;
2895 }
2896
2897 kaddr = (unsigned long)kv[i].iov_base + start;
2898 off = kaddr & ~PAGE_MASK;
2899 len = min_t(size_t, maxsize, len - start);
2900 kaddr &= PAGE_MASK;
2901
2902 maxsize -= len;
2903 do {
2904 seg = min_t(size_t, len, PAGE_SIZE - off);
2905
2906 if (is_vmalloc_or_module_addr((void *)kaddr))
2907 page = vmalloc_to_page((void *)kaddr);
2908 else
2909 page = virt_to_page((void *)kaddr);
2910
2911 if (!smb_set_sge(rdma, page, off, seg))
2912 return -EIO;
2913
2914 ret += seg;
2915 len -= seg;
2916 kaddr += PAGE_SIZE;
2917 off = 0;
2918 } while (len > 0 && rdma->nr_sge < rdma->max_sge);
2919
2920 if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
2921 break;
2922 start = 0;
2923 }
2924
2925 if (ret > 0)
2926 iov_iter_advance(iter, ret);
2927 return ret;
2928 }
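/*
 * Worked example (illustrative, assuming 4KiB pages): a kvec whose
 * iov_base starts 8 bytes before a page boundary with iov_len = 20
 * yields two SGEs, because seg = min(len, PAGE_SIZE - off):
 *
 *	1st pass: off = 4088, seg = 8  -> SGE of 8 bytes
 *	2nd pass: off = 0,    seg = 12 -> SGE of 12 bytes
 *
 * vmalloc'd buffers take the vmalloc_to_page() branch since their
 * kernel virtual addresses cannot be translated by virt_to_page().
 */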
2929
2930 /*
2931 * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA
2932 * list. The folios are not pinned.
2933 */
2934 static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
2935 struct smb_extract_to_rdma *rdma,
2936 ssize_t maxsize)
2937 {
2938 const struct folio_queue *folioq = iter->folioq;
2939 unsigned int slot = iter->folioq_slot;
2940 ssize_t ret = 0;
2941 size_t offset = iter->iov_offset;
2942
2943 BUG_ON(!folioq);
2944
2945 if (slot >= folioq_nr_slots(folioq)) {
2946 folioq = folioq->next;
2947 if (WARN_ON_ONCE(!folioq))
2948 return -EIO;
2949 slot = 0;
2950 }
2951
2952 do {
2953 struct folio *folio = folioq_folio(folioq, slot);
2954 size_t fsize = folioq_folio_size(folioq, slot);
2955
2956 if (offset < fsize) {
2957 size_t part = umin(maxsize, fsize - offset);
2958
2959 if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part))
2960 return -EIO;
2961
2962 offset += part;
2963 ret += part;
2964 maxsize -= part;
2965 }
2966
2967 if (offset >= fsize) {
2968 offset = 0;
2969 slot++;
2970 if (slot >= folioq_nr_slots(folioq)) {
2971 if (!folioq->next) {
2972 WARN_ON_ONCE(ret < iter->count);
2973 break;
2974 }
2975 folioq = folioq->next;
2976 slot = 0;
2977 }
2978 }
2979 } while (rdma->nr_sge < rdma->max_sge && maxsize > 0);
2980
2981 iter->folioq = folioq;
2982 iter->folioq_slot = slot;
2983 iter->iov_offset = offset;
2984 iter->count -= ret;
2985 return ret;
2986 }
2987
2988 /*
2989 * Extract page fragments from up to the given amount of the source iterator
2990 * and build up an RDMA list that refers to all of those bits. The RDMA list
2991 * is appended to, up to the maximum number of elements set in the parameter
2992 * block.
2993 *
2994 * The extracted page fragments are not pinned or ref'd in any way; if an
2995 * IOVEC/UBUF-type iterator is to be used, it should be converted to a
2996 * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some
2997 * way.
2998 */
2999 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
3000 struct smb_extract_to_rdma *rdma)
3001 {
3002 ssize_t ret;
3003 int before = rdma->nr_sge;
3004
3005 switch (iov_iter_type(iter)) {
3006 case ITER_BVEC:
3007 ret = smb_extract_bvec_to_rdma(iter, rdma, len);
3008 break;
3009 case ITER_KVEC:
3010 ret = smb_extract_kvec_to_rdma(iter, rdma, len);
3011 break;
3012 case ITER_FOLIOQ:
3013 ret = smb_extract_folioq_to_rdma(iter, rdma, len);
3014 break;
3015 default:
3016 WARN_ON_ONCE(1);
3017 return -EIO;
3018 }
3019
3020 if (ret < 0) {
3021 while (rdma->nr_sge > before) {
3022 struct ib_sge *sge = &rdma->sge[--rdma->nr_sge];
3023
3024 ib_dma_unmap_single(rdma->device, sge->addr, sge->length,
3025 rdma->direction);
3026 sge->addr = 0;
3027 }
3028 }
3029
3030 return ret;
3031 }
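/*
 * Usage sketch (illustrative only; the local variable names are made
 * up): callers fill in the parameter block with a pre-sized SGE array,
 * the IB device and the DMA direction, then let the extractor map pages
 * straight from an iterator:
 *
 *	struct ib_sge sges[SMBDIRECT_SEND_IO_MAX_SGE];
 *	struct smb_extract_to_rdma rdma = {
 *		.sge		= sges,
 *		.nr_sge		= 0,
 *		.max_sge	= ARRAY_SIZE(sges),
 *		.device		= sc->ib.dev,
 *		.local_dma_lkey	= sc->ib.pd->local_dma_lkey,
 *		.direction	= DMA_TO_DEVICE,
 *	};
 *	ssize_t n = smb_extract_iter_to_rdma(iter, len, &rdma);
 *
 * On success rdma.nr_sge entries are DMA-mapped and ready to hang off a
 * send WR; on error anything this call mapped is unmapped again.
 */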
3032