// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics TCP target.
 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <linux/nvme-keyring.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/tls.h>
#include <net/tls_prot.h>
#include <net/handshake.h>
#include <linux/inet.h>
#include <linux/llist.h>
#include <trace/events/sock.h>

#include "nvmet.h"

#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE)
#define NVMET_TCP_MAXH2CDATA 0x400000 /* 4M arbitrary limit */
#define NVMET_TCP_BACKLOG 128

static int param_store_val(const char *str, int *val, int min, int max)
{
	int ret, new_val;

	ret = kstrtoint(str, 10, &new_val);
	if (ret)
		return -EINVAL;

	if (new_val < min || new_val > max)
		return -EINVAL;

	*val = new_val;
	return 0;
}

static int set_params(const char *str, const struct kernel_param *kp)
{
	return param_store_val(str, kp->arg, 0, INT_MAX);
}

static const struct kernel_param_ops set_param_ops = {
	.set = set_params,
	.get = param_get_int,
};

/* Define the socket priority to use for connections where it is desirable
 * that the NIC consider performing optimized packet processing or filtering.
 * A non-zero value is sufficient to indicate general consideration of any
 * possible optimization. Making it a module param allows for alternative
 * values that may be unique for some NIC implementations.
 */
static int so_priority;
device_param_cb(so_priority, &set_param_ops, &so_priority, 0644);
MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority: Default 0");

/* Define a time period (in usecs) that io_work() shall sample an activated
 * queue before determining it to be idle. This optional module behavior
 * can enable NIC solutions that support socket optimized packet processing
 * using advanced interrupt moderation techniques.
 */
static int idle_poll_period_usecs;
device_param_cb(idle_poll_period_usecs, &set_param_ops,
		&idle_poll_period_usecs, 0644);
MODULE_PARM_DESC(idle_poll_period_usecs,
		"nvmet tcp io_work poll till idle time period in usecs: Default 0");

#ifdef CONFIG_NVME_TARGET_TCP_TLS
/*
 * TLS handshake timeout
 */
static int tls_handshake_timeout = 10;
module_param(tls_handshake_timeout, int, 0644);
MODULE_PARM_DESC(tls_handshake_timeout,
		 "nvme TLS handshake timeout in seconds (default 10)");
#endif

#define NVMET_TCP_RECV_BUDGET 8
#define NVMET_TCP_SEND_BUDGET 8
#define NVMET_TCP_IO_WORK_BUDGET 64

enum nvmet_tcp_send_state {
	NVMET_TCP_SEND_DATA_PDU,
	NVMET_TCP_SEND_DATA,
	NVMET_TCP_SEND_R2T,
	NVMET_TCP_SEND_DDGST,
	NVMET_TCP_SEND_RESPONSE
};

enum nvmet_tcp_recv_state {
	NVMET_TCP_RECV_PDU,
	NVMET_TCP_RECV_DATA,
	NVMET_TCP_RECV_DDGST,
	NVMET_TCP_RECV_ERR,
};

enum {
	NVMET_TCP_F_INIT_FAILED = (1 << 0),
};

struct nvmet_tcp_cmd {
	struct nvmet_tcp_queue *queue;
	struct nvmet_req req;

	struct nvme_tcp_cmd_pdu *cmd_pdu;
	struct nvme_tcp_rsp_pdu *rsp_pdu;
	struct nvme_tcp_data_pdu *data_pdu;
	struct nvme_tcp_r2t_pdu *r2t_pdu;

	u32 rbytes_done;
	u32 wbytes_done;

	u32 pdu_len;
	u32 pdu_recv;
	int sg_idx;
	char recv_cbuf[CMSG_LEN(sizeof(char))];
	struct msghdr recv_msg;
	struct bio_vec *iov;
	u32 flags;

	struct list_head entry;
	struct llist_node lentry;

	/* send state */
	u32 offset;
	struct scatterlist *cur_sg;
	enum nvmet_tcp_send_state state;

	__le32 exp_ddgst;
	__le32 recv_ddgst;
};

enum nvmet_tcp_queue_state {
	NVMET_TCP_Q_CONNECTING,
	NVMET_TCP_Q_TLS_HANDSHAKE,
	NVMET_TCP_Q_LIVE,
	NVMET_TCP_Q_DISCONNECTING,
	NVMET_TCP_Q_FAILED,
};

struct nvmet_tcp_queue {
	struct socket *sock;
	struct nvmet_tcp_port *port;
	struct work_struct io_work;
	struct nvmet_cq nvme_cq;
	struct nvmet_sq nvme_sq;
	struct kref kref;

	/* send state */
	struct nvmet_tcp_cmd *cmds;
	unsigned int nr_cmds;
	struct list_head free_list;
	struct llist_head resp_list;
	struct list_head resp_send_list;
	int send_list_len;
	struct nvmet_tcp_cmd *snd_cmd;

	/* recv state */
	int offset;
	int left;
	enum nvmet_tcp_recv_state rcv_state;
	struct nvmet_tcp_cmd *cmd;
	union nvme_tcp_pdu pdu;

	/* digest state */
	bool hdr_digest;
	bool data_digest;

	/* TLS state */
	key_serial_t tls_pskid;
	struct delayed_work tls_handshake_tmo_work;

	unsigned long poll_end;

	spinlock_t state_lock;
	enum nvmet_tcp_queue_state state;

	struct sockaddr_storage sockaddr;
	struct sockaddr_storage sockaddr_peer;
	struct work_struct release_work;

	int idx;
	struct list_head queue_list;

	struct nvmet_tcp_cmd connect;

	struct page_frag_cache pf_cache;

	void (*data_ready)(struct sock *);
	void (*state_change)(struct sock *);
	void (*write_space)(struct sock *);
};

struct nvmet_tcp_port {
	struct socket *sock;
	struct work_struct accept_work;
	struct nvmet_port *nport;
	struct sockaddr_storage addr;
	void (*data_ready)(struct sock *);
};

static DEFINE_IDA(nvmet_tcp_queue_ida);
static LIST_HEAD(nvmet_tcp_queue_list);
static DEFINE_MUTEX(nvmet_tcp_queue_mutex);

static struct workqueue_struct *nvmet_tcp_wq;
static const struct nvmet_fabrics_ops nvmet_tcp_ops;
static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);

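/*
 * The transfer tag (ttag) carried in R2T PDUs is simply the index of the
 * command in the queue's cmds array, so the value echoed back by the host
 * can be translated directly in nvmet_tcp_handle_h2c_data_pdu().
 */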
static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd)
{
	if (unlikely(!queue->nr_cmds)) {
		/* We didn't allocate cmds yet, send 0xffff */
		return USHRT_MAX;
	}

	return cmd - queue->cmds;
}

static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
{
	return nvme_is_write(cmd->req.cmd) &&
		cmd->rbytes_done < cmd->req.transfer_len;
}

static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
{
	return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
}

static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
{
	return !nvme_is_write(cmd->req.cmd) &&
		cmd->req.transfer_len > 0 &&
		!cmd->req.cqe->status;
}

static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
{
	return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
		!cmd->rbytes_done;
}

static inline struct nvmet_tcp_cmd *
nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd;

	cmd = list_first_entry_or_null(&queue->free_list,
			struct nvmet_tcp_cmd, entry);
	if (!cmd)
		return NULL;
	list_del_init(&cmd->entry);

	cmd->rbytes_done = cmd->wbytes_done = 0;
	cmd->pdu_len = 0;
	cmd->pdu_recv = 0;
	cmd->iov = NULL;
	cmd->flags = 0;
	return cmd;
}

static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
{
	if (unlikely(cmd == &cmd->queue->connect))
		return;

	list_add_tail(&cmd->entry, &cmd->queue->free_list);
}

static inline int queue_cpu(struct nvmet_tcp_queue *queue)
{
	return queue->sock->sk->sk_incoming_cpu;
}

static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
{
	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
{
	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline void nvmet_tcp_hdgst(void *pdu, size_t len)
{
	put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len);
}

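/*
 * Verify the received header digest by recomputing it in place:
 * nvmet_tcp_hdgst() overwrites the digest field at the end of the header
 * with the expected value, which is then compared against the value the
 * host sent.
 */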
static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
	void *pdu, size_t len)
{
	struct nvme_tcp_hdr *hdr = pdu;
	__le32 recv_digest;
	__le32 exp_digest;

	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
		pr_err("queue %d: header digest enabled but no header digest\n",
			queue->idx);
		return -EPROTO;
	}

	recv_digest = *(__le32 *)(pdu + hdr->hlen);
	nvmet_tcp_hdgst(pdu, len);
	exp_digest = *(__le32 *)(pdu + hdr->hlen);
	if (recv_digest != exp_digest) {
		pr_err("queue %d: header digest error: recv %#x expected %#x\n",
			queue->idx, le32_to_cpu(recv_digest),
			le32_to_cpu(exp_digest));
		return -EPROTO;
	}

	return 0;
}

static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
{
	struct nvme_tcp_hdr *hdr = pdu;
	u8 digest_len = nvmet_tcp_hdgst_len(queue);
	u32 len;

	len = le32_to_cpu(hdr->plen) - hdr->hlen -
		(hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);

	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
		pr_err("queue %d: data digest flag is cleared\n", queue->idx);
		return -EPROTO;
	}

	return 0;
}

/* If cmd buffers are NULL, no operation is performed */
static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd)
{
	kfree(cmd->iov);
	sgl_free(cmd->req.sg);
	cmd->iov = NULL;
	cmd->req.sg = NULL;
}

static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue);

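/*
 * Build a bio_vec array describing where the data portion of the current
 * PDU lands in the command's scatterlist, starting at cmd->rbytes_done,
 * and point cmd->recv_msg at it. Any mismatch between the PDU length and
 * the mapped scatterlist is treated as a fatal protocol error.
 */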
static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{
	struct bio_vec *iov = cmd->iov;
	struct scatterlist *sg;
	u32 length, offset, sg_offset;
	unsigned int sg_remaining;
	int nr_pages;

	length = cmd->pdu_len;
	nr_pages = DIV_ROUND_UP(length, PAGE_SIZE);
	offset = cmd->rbytes_done;
	cmd->sg_idx = offset / PAGE_SIZE;
	sg_offset = offset % PAGE_SIZE;
	if (!cmd->req.sg_cnt || cmd->sg_idx >= cmd->req.sg_cnt) {
		nvmet_tcp_fatal_error(cmd->queue);
		return;
	}
	sg = &cmd->req.sg[cmd->sg_idx];
	sg_remaining = cmd->req.sg_cnt - cmd->sg_idx;

	while (length) {
		if (!sg_remaining) {
			nvmet_tcp_fatal_error(cmd->queue);
			return;
		}
		if (!sg->length || sg->length <= sg_offset) {
			nvmet_tcp_fatal_error(cmd->queue);
			return;
		}
		u32 iov_len = min_t(u32, length, sg->length - sg_offset);

		bvec_set_page(iov, sg_page(sg), iov_len,
			sg->offset + sg_offset);

		length -= iov_len;
		sg = sg_next(sg);
		sg_remaining--;
		iov++;
		sg_offset = 0;
	}

	iov_iter_bvec(&cmd->recv_msg.msg_iter, ITER_DEST, cmd->iov,
		nr_pages, cmd->pdu_len);
}

static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
{
	queue->rcv_state = NVMET_TCP_RECV_ERR;
	if (queue->nvme_sq.ctrl)
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	else
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
}

static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
{
	queue->rcv_state = NVMET_TCP_RECV_ERR;
	if (status == -EPIPE || status == -ECONNRESET)
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	else
		nvmet_tcp_fatal_error(queue);
}

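/*
 * Parse the command SGL and allocate the scatterlist backing the transfer.
 * In-capsule (inline) write data is bounded by the port's inline_data_size;
 * host-to-controller transfers also get a bio_vec array so the data can be
 * received directly into the scatterlist pages.
 */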
static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
	u32 len = le32_to_cpu(sgl->length);

	if (!len)
		return 0;

	if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
			  NVME_SGL_FMT_OFFSET)) {
		if (!nvme_is_write(cmd->req.cmd))
			return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;

		if (len > cmd->req.port->inline_data_size)
			return NVME_SC_SGL_INVALID_OFFSET | NVME_STATUS_DNR;
		cmd->pdu_len = len;
	}
	cmd->req.transfer_len += len;

	cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
	if (!cmd->req.sg)
		return NVME_SC_INTERNAL;
	cmd->cur_sg = cmd->req.sg;

	if (nvmet_tcp_has_data_in(cmd)) {
		cmd->iov = kmalloc_array(cmd->req.sg_cnt,
				sizeof(*cmd->iov), GFP_KERNEL);
		if (!cmd->iov)
			goto err;
	}

	return 0;
err:
	nvmet_tcp_free_cmd_buffers(cmd);
	return NVME_SC_INTERNAL;
}

static void nvmet_tcp_calc_ddgst(struct nvmet_tcp_cmd *cmd)
{
	size_t total_len = cmd->req.transfer_len;
	struct scatterlist *sg = cmd->req.sg;
	u32 crc = ~0;

	while (total_len) {
		size_t len = min_t(size_t, total_len, sg->length);

		/*
		 * Note that the scatterlist does not contain any highmem pages,
		 * as it was allocated by sgl_alloc() with GFP_KERNEL.
		 */
		crc = crc32c(crc, sg_virt(sg), len);
		total_len -= len;
		sg = sg_next(sg);
	}
	cmd->exp_ddgst = cpu_to_le32(~crc);
}

static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_DATA_PDU;

	pdu->hdr.type = nvme_tcp_c2h_data;
	pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
						NVME_TCP_F_DATA_SUCCESS : 0);
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
	pdu->hdr.plen =
		cpu_to_le32(pdu->hdr.hlen + hdgst +
			cmd->req.transfer_len + ddgst);
	pdu->command_id = cmd->req.cqe->command_id;
	pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
	pdu->data_offset = cpu_to_le32(cmd->wbytes_done);

	if (queue->data_digest) {
		pdu->hdr.flags |= NVME_TCP_F_DDGST;
		nvmet_tcp_calc_ddgst(cmd);
	}

	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(pdu, sizeof(*pdu));
	}
}

static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_R2T;

	pdu->hdr.type = nvme_tcp_r2t;
	pdu->hdr.flags = 0;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = 0;
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);

	pdu->command_id = cmd->req.cmd->common.command_id;
	pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
	pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
	pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(pdu, sizeof(*pdu));
	}
}

static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_RESPONSE;

	pdu->hdr.type = nvme_tcp_rsp;
	pdu->hdr.flags = 0;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = 0;
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(pdu, sizeof(*pdu));
	}
}

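/*
 * Responses are queued from completion context onto the lockless
 * resp_list; drain it here (in io_work context) into the ordered
 * resp_send_list that the send path consumes.
 */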
static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
{
	struct llist_node *node;
	struct nvmet_tcp_cmd *cmd;

	for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
		cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
		list_add(&cmd->entry, &queue->resp_send_list);
		queue->send_list_len++;
	}
}

static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
{
	queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
			struct nvmet_tcp_cmd, entry);
	if (!queue->snd_cmd) {
		nvmet_tcp_process_resp_list(queue);
		queue->snd_cmd =
			list_first_entry_or_null(&queue->resp_send_list,
					struct nvmet_tcp_cmd, entry);
		if (unlikely(!queue->snd_cmd))
			return NULL;
	}

	list_del_init(&queue->snd_cmd->entry);
	queue->send_list_len--;

	if (nvmet_tcp_need_data_out(queue->snd_cmd))
		nvmet_setup_c2h_data_pdu(queue->snd_cmd);
	else if (nvmet_tcp_need_data_in(queue->snd_cmd))
		nvmet_setup_r2t_pdu(queue->snd_cmd);
	else
		nvmet_setup_response_pdu(queue->snd_cmd);

	return queue->snd_cmd;
}

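/*
 * Queue a completed request for transmission. If the command is still
 * expecting in-capsule data on the wire, hold the response back so the
 * receive path can drain that data first; otherwise add the command to
 * the lockless resp_list and kick io_work.
 */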
static void nvmet_tcp_queue_response(struct nvmet_req *req)
{
	struct nvmet_tcp_cmd *cmd =
		container_of(req, struct nvmet_tcp_cmd, req);
	struct nvmet_tcp_queue *queue = cmd->queue;
	enum nvmet_tcp_recv_state queue_state;
	struct nvmet_tcp_cmd *queue_cmd;
	struct nvme_sgl_desc *sgl;
	u32 len;

	/* Pairs with store_release in nvmet_prepare_receive_pdu() */
	queue_state = smp_load_acquire(&queue->rcv_state);
	queue_cmd = READ_ONCE(queue->cmd);

	if (unlikely(cmd == queue_cmd)) {
		sgl = &cmd->req.cmd->common.dptr.sgl;
		len = le32_to_cpu(sgl->length);

		/*
		 * Wait for inline data before processing the response.
		 * Avoid using helpers, this might happen before
		 * nvmet_req_init is completed.
		 */
		if (queue_state == NVMET_TCP_RECV_PDU &&
		    len && len <= cmd->req.port->inline_data_size &&
		    nvme_is_write(cmd->req.cmd))
			return;
	}

	llist_add(&cmd->lentry, &queue->resp_list);
	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
}

static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
{
	if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED))
		nvmet_tcp_queue_response(&cmd->req);
	else
		cmd->req.execute(&cmd->req);
}

static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct msghdr msg = {
		.msg_flags = MSG_DONTWAIT | MSG_MORE | MSG_SPLICE_PAGES,
	};
	struct bio_vec bvec;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
	int ret;

	bvec_set_virt(&bvec, (void *)cmd->data_pdu + cmd->offset, left);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
	ret = sock_sendmsg(cmd->queue->sock, &msg);
	if (ret <= 0)
		return ret;

	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	cmd->state = NVMET_TCP_SEND_DATA;
	cmd->offset = 0;
	return 1;
}

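/*
 * Push the command's data pages over the socket with MSG_SPLICE_PAGES.
 * MSG_MORE is kept set while more of this transfer (or more queued
 * responses) will follow, so segments can be coalesced. When SQ head
 * updates are disabled and no data digest is needed, the response PDU is
 * elided and the command is recycled here.
 */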
static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	struct nvmet_tcp_queue *queue = cmd->queue;
	int ret;

	while (cmd->cur_sg) {
		struct msghdr msg = {
			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
		};
		struct page *page = sg_page(cmd->cur_sg);
		struct bio_vec bvec;
		u32 left = cmd->cur_sg->length - cmd->offset;

		if ((!last_in_batch && cmd->queue->send_list_len) ||
		    cmd->wbytes_done + left < cmd->req.transfer_len ||
		    queue->data_digest || !queue->nvme_sq.sqhd_disabled)
			msg.msg_flags |= MSG_MORE;

		bvec_set_page(&bvec, page, left, cmd->offset);
		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
		ret = sock_sendmsg(cmd->queue->sock, &msg);
		if (ret <= 0)
			return ret;

		cmd->offset += ret;
		cmd->wbytes_done += ret;

		/* Done with sg? */
		if (cmd->offset == cmd->cur_sg->length) {
			cmd->cur_sg = sg_next(cmd->cur_sg);
			cmd->offset = 0;
		}
	}

	if (queue->data_digest) {
		cmd->state = NVMET_TCP_SEND_DDGST;
		cmd->offset = 0;
	} else {
		if (queue->nvme_sq.sqhd_disabled) {
			cmd->queue->snd_cmd = NULL;
			nvmet_tcp_put_cmd(cmd);
		} else {
			nvmet_setup_response_pdu(cmd);
		}
	}

	if (queue->nvme_sq.sqhd_disabled)
		nvmet_tcp_free_cmd_buffers(cmd);

	return 1;

}

static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
		bool last_in_batch)
{
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
	struct bio_vec bvec;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		msg.msg_flags |= MSG_MORE;
	else
		msg.msg_flags |= MSG_EOR;

	bvec_set_virt(&bvec, (void *)cmd->rsp_pdu + cmd->offset, left);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
	ret = sock_sendmsg(cmd->queue->sock, &msg);
	if (ret <= 0)
		return ret;
	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	nvmet_tcp_free_cmd_buffers(cmd);
	cmd->queue->snd_cmd = NULL;
	nvmet_tcp_put_cmd(cmd);
	return 1;
}

static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
	struct bio_vec bvec;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		msg.msg_flags |= MSG_MORE;
	else
		msg.msg_flags |= MSG_EOR;

	bvec_set_virt(&bvec, (void *)cmd->r2t_pdu + cmd->offset, left);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
	ret = sock_sendmsg(cmd->queue->sock, &msg);
	if (ret <= 0)
		return ret;
	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	cmd->queue->snd_cmd = NULL;
	return 1;
}

static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	struct nvmet_tcp_queue *queue = cmd->queue;
	int left = NVME_TCP_DIGEST_LENGTH - cmd->offset;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset,
		.iov_len = left
	};
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		msg.msg_flags |= MSG_MORE;
	else
		msg.msg_flags |= MSG_EOR;

	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (unlikely(ret <= 0))
		return ret;

	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	if (queue->nvme_sq.sqhd_disabled) {
		cmd->queue->snd_cmd = NULL;
		nvmet_tcp_put_cmd(cmd);
	} else {
		nvmet_setup_response_pdu(cmd);
	}
	return 1;
}

static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
		bool last_in_batch)
{
	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
	int ret = 0;

	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
		cmd = nvmet_tcp_fetch_cmd(queue);
		if (unlikely(!cmd))
			return 0;
	}

	if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
		ret = nvmet_try_send_data_pdu(cmd);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_DATA) {
		ret = nvmet_try_send_data(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_DDGST) {
		ret = nvmet_try_send_ddgst(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_R2T) {
		ret = nvmet_try_send_r2t(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_RESPONSE)
		ret = nvmet_try_send_response(cmd, last_in_batch);

done_send:
	if (ret < 0) {
		if (ret == -EAGAIN)
			return 0;
		return ret;
	}

	return 1;
}

static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
		int budget, int *sends)
{
	int i, ret = 0;

	for (i = 0; i < budget; i++) {
		ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
		if (unlikely(ret < 0)) {
			nvmet_tcp_socket_error(queue, ret);
			goto done;
		} else if (ret == 0) {
			break;
		}
		(*sends)++;
	}
done:
	return ret;
}

static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
{
	queue->offset = 0;
	queue->left = sizeof(struct nvme_tcp_hdr);
	WRITE_ONCE(queue->cmd, NULL);
	/* Ensure rcv_state is visible only after queue->cmd is set */
	smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU);
}

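/*
 * Handle the Initialize Connection Request: validate the PDU length,
 * format version and alignment fields, latch the negotiated digest
 * settings, and reply with an ICResp advertising our maximum H2CData
 * size, after which the queue goes live.
 */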
static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
	struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
	struct msghdr msg = {};
	struct kvec iov;
	int ret;

	if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
		pr_err("bad nvme-tcp pdu length (%d)\n",
			le32_to_cpu(icreq->hdr.plen));
		nvmet_tcp_fatal_error(queue);
		return -EPROTO;
	}

	if (icreq->pfv != NVME_TCP_PFV_1_0) {
		pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
		return -EPROTO;
	}

	if (icreq->hpda != 0) {
		pr_err("queue %d: unsupported hpda %d\n", queue->idx,
			icreq->hpda);
		return -EPROTO;
	}

	queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
	queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);

	memset(icresp, 0, sizeof(*icresp));
	icresp->hdr.type = nvme_tcp_icresp;
	icresp->hdr.hlen = sizeof(*icresp);
	icresp->hdr.pdo = 0;
	icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
	icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
	icresp->maxdata = cpu_to_le32(NVMET_TCP_MAXH2CDATA);
	icresp->cpda = 0;
	if (queue->hdr_digest)
		icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
	if (queue->data_digest)
		icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;

	iov.iov_base = icresp;
	iov.iov_len = sizeof(*icresp);
	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (ret < 0) {
		queue->state = NVMET_TCP_Q_FAILED;
		return ret; /* queue removal will cleanup */
	}

	queue->state = NVMET_TCP_Q_LIVE;
	nvmet_prepare_receive_pdu(queue);
	return 0;
}

static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
{
	size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
	int ret;

	/*
	 * This command has not been processed yet, hence we are trying to
	 * figure out if there is still pending data left to receive. If
	 * we don't, we can simply prepare for the next pdu and bail out,
	 * otherwise we will need to prepare a buffer and receive the
	 * stale data before continuing forward.
	 */
	if (!nvme_is_write(cmd->req.cmd) || !data_len ||
	    data_len > cmd->req.port->inline_data_size) {
		nvmet_prepare_receive_pdu(queue);
		return;
	}

	ret = nvmet_tcp_map_data(cmd);
	if (unlikely(ret)) {
		pr_err("queue %d: failed to map data\n", queue->idx);
		nvmet_tcp_fatal_error(queue);
		return;
	}

	queue->rcv_state = NVMET_TCP_RECV_DATA;
	nvmet_tcp_build_pdu_iovec(cmd);
	cmd->flags |= NVMET_TCP_F_INIT_FAILED;
}

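/*
 * An H2CData PDU carries write data solicited by an earlier R2T. Look the
 * command up by its transfer tag (falling back to the connect command
 * before the cmds array exists) and reject anything whose offset, length
 * or buffer state does not match what we expect before arming the data
 * receive.
 */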
static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_data_pdu *data = &queue->pdu.data;
	struct nvmet_tcp_cmd *cmd;
	unsigned int exp_data_len;

	if (likely(queue->nr_cmds)) {
		if (unlikely(data->ttag >= queue->nr_cmds)) {
			pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n",
				queue->idx, data->ttag, queue->nr_cmds);
			goto err_proto;
		}
		cmd = &queue->cmds[data->ttag];
	} else {
		cmd = &queue->connect;
	}

	if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
		pr_err("ttag %u unexpected data offset %u (expected %u)\n",
			data->ttag, le32_to_cpu(data->data_offset),
			cmd->rbytes_done);
		goto err_proto;
	}

	exp_data_len = le32_to_cpu(data->hdr.plen) -
			nvmet_tcp_hdgst_len(queue) -
			nvmet_tcp_ddgst_len(queue) -
			sizeof(*data);

	cmd->pdu_len = le32_to_cpu(data->data_length);
	if (unlikely(cmd->pdu_len != exp_data_len ||
		     cmd->pdu_len == 0 ||
		     cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) {
		pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len);
		goto err_proto;
	}
	/*
	 * Ensure command data structures are initialized. We must check both
	 * cmd->req.sg and cmd->iov because they can have different NULL states:
	 * - Uninitialized commands: both NULL
	 * - READ commands: cmd->req.sg allocated, cmd->iov NULL
	 * - WRITE commands: both allocated
	 */
	if (unlikely(!cmd->req.sg || !cmd->iov)) {
		pr_err("queue %d: H2CData PDU received for invalid command state (ttag %u)\n",
			queue->idx, data->ttag);
		goto err_proto;
	}
	cmd->pdu_recv = 0;
	nvmet_tcp_build_pdu_iovec(cmd);
	queue->cmd = cmd;
	queue->rcv_state = NVMET_TCP_RECV_DATA;

	return 0;

err_proto:
	/* FIXME: use proper transport errors */
	nvmet_tcp_fatal_error(queue);
	return -EPROTO;
}

static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
	struct nvmet_req *req;
	int ret;

	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
		if (hdr->type != nvme_tcp_icreq) {
			pr_err("unexpected pdu type (%d) before icreq\n",
				hdr->type);
			nvmet_tcp_fatal_error(queue);
			return -EPROTO;
		}
		return nvmet_tcp_handle_icreq(queue);
	}

	if (unlikely(hdr->type == nvme_tcp_icreq)) {
		pr_err("queue %d: received icreq pdu in state %d\n",
			queue->idx, queue->state);
		nvmet_tcp_fatal_error(queue);
		return -EPROTO;
	}

	if (hdr->type == nvme_tcp_h2c_data) {
		ret = nvmet_tcp_handle_h2c_data_pdu(queue);
		if (unlikely(ret))
			return ret;
		return 0;
	}

	queue->cmd = nvmet_tcp_get_cmd(queue);
	if (unlikely(!queue->cmd)) {
		/* This should never happen */
		pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
			queue->idx, queue->nr_cmds, queue->send_list_len,
			nvme_cmd->common.opcode);
		nvmet_tcp_fatal_error(queue);
		return -ENOMEM;
	}

	req = &queue->cmd->req;
	memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));

	if (unlikely(!nvmet_req_init(req, &queue->nvme_sq, &nvmet_tcp_ops))) {
		pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n",
			req->cmd, req->cmd->common.command_id,
			req->cmd->common.opcode,
			le32_to_cpu(req->cmd->common.dptr.sgl.length),
			le16_to_cpu(req->cqe->status));

		nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
		return 0;
	}

	ret = nvmet_tcp_map_data(queue->cmd);
	if (unlikely(ret)) {
		pr_err("queue %d: failed to map data\n", queue->idx);
		if (nvmet_tcp_has_inline_data(queue->cmd))
			nvmet_tcp_fatal_error(queue);
		else
			nvmet_req_complete(req, ret);
		ret = -EAGAIN;
		goto out;
	}

	if (nvmet_tcp_need_data_in(queue->cmd)) {
		if (nvmet_tcp_has_inline_data(queue->cmd)) {
			queue->rcv_state = NVMET_TCP_RECV_DATA;
			nvmet_tcp_build_pdu_iovec(queue->cmd);
			return 0;
		}
		/* send back R2T */
		nvmet_tcp_queue_response(&queue->cmd->req);
		goto out;
	}

	queue->cmd->req.execute(&queue->cmd->req);
out:
	nvmet_prepare_receive_pdu(queue);
	return ret;
}

static const u8 nvme_tcp_pdu_sizes[] = {
	[nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu),
	[nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu),
	[nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu),
};

static inline u8 nvmet_tcp_pdu_size(u8 type)
{
	size_t idx = type;

	return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
		nvme_tcp_pdu_sizes[idx]) ?
			nvme_tcp_pdu_sizes[idx] : 0;
}

static inline bool nvmet_tcp_pdu_valid(u8 type)
{
	switch (type) {
	case nvme_tcp_icreq:
	case nvme_tcp_cmd:
	case nvme_tcp_h2c_data:
		/* fallthru */
		return true;
	}

	return false;
}

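/*
 * Inspect the TLS record type delivered via the control message: data
 * records pass through, fatal alerts fail the connection, and non-fatal
 * alerts or other record types are skipped with -EAGAIN.
 */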
static int nvmet_tcp_tls_record_ok(struct nvmet_tcp_queue *queue,
		struct msghdr *msg, char *cbuf)
{
	struct cmsghdr *cmsg = (struct cmsghdr *)cbuf;
	u8 ctype, level, description;
	int ret = 0;

	ctype = tls_get_record_type(queue->sock->sk, cmsg);
	switch (ctype) {
	case 0:
		break;
	case TLS_RECORD_TYPE_DATA:
		break;
	case TLS_RECORD_TYPE_ALERT:
		tls_alert_recv(queue->sock->sk, msg, &level, &description);
		if (level == TLS_ALERT_LEVEL_FATAL) {
			pr_err("queue %d: TLS Alert desc %u\n",
				queue->idx, description);
			ret = -ENOTCONN;
		} else {
			pr_warn("queue %d: TLS Alert desc %u\n",
				queue->idx, description);
			ret = -EAGAIN;
		}
		break;
	default:
		/* discard this record type */
		pr_err("queue %d: TLS record %d unhandled\n",
			queue->idx, ctype);
		ret = -EAGAIN;
		break;
	}
	return ret;
}

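/*
 * Receive the next PDU in two passes: first the common header, which tells
 * us the type and full header length, then the remainder of the header
 * (plus header digest). Type, length and digest sanity checks happen here
 * before the PDU is dispatched to nvmet_tcp_done_recv_pdu().
 */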
static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	int len, ret;
	struct kvec iov;
	char cbuf[CMSG_LEN(sizeof(char))] = {};
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

recv:
	iov.iov_base = (void *)&queue->pdu + queue->offset;
	iov.iov_len = queue->left;
	if (queue->tls_pskid) {
		msg.msg_control = cbuf;
		msg.msg_controllen = sizeof(cbuf);
	}
	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(len < 0))
		return len;
	if (queue->tls_pskid) {
		ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf);
		if (ret < 0)
			return ret;
	}

	queue->offset += len;
	queue->left -= len;
	if (queue->left)
		return -EAGAIN;

	if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
		u8 hdgst = nvmet_tcp_hdgst_len(queue);

		if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
			pr_err("unexpected pdu type %d\n", hdr->type);
			nvmet_tcp_fatal_error(queue);
			return -EIO;
		}

		if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
			pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
			return -EIO;
		}

		queue->left = hdr->hlen - queue->offset + hdgst;
		goto recv;
	}

	if (queue->hdr_digest &&
	    nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
		nvmet_tcp_fatal_error(queue); /* fatal */
		return -EPROTO;
	}

	if (queue->data_digest &&
	    nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
		nvmet_tcp_fatal_error(queue); /* fatal */
		return -EPROTO;
	}

	return nvmet_tcp_done_recv_pdu(queue);
}

static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
{
	struct nvmet_tcp_queue *queue = cmd->queue;

	nvmet_tcp_calc_ddgst(cmd);
	queue->offset = 0;
	queue->left = NVME_TCP_DIGEST_LENGTH;
	queue->rcv_state = NVMET_TCP_RECV_DDGST;
}

static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmd;
	int len, ret;

	while (msg_data_left(&cmd->recv_msg)) {
		len = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
			cmd->recv_msg.msg_flags);
		if (len <= 0)
			return len;
		if (queue->tls_pskid) {
			ret = nvmet_tcp_tls_record_ok(cmd->queue,
					&cmd->recv_msg, cmd->recv_cbuf);
			if (ret < 0)
				return ret;
		}

		cmd->pdu_recv += len;
		cmd->rbytes_done += len;
	}

	if (queue->data_digest) {
		nvmet_tcp_prep_recv_ddgst(cmd);
		return 0;
	}

	if (cmd->rbytes_done == cmd->req.transfer_len)
		nvmet_tcp_execute_request(cmd);

	nvmet_prepare_receive_pdu(queue);
	return 0;
}

static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmd;
	int ret, len;
	char cbuf[CMSG_LEN(sizeof(char))] = {};
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = (void *)&cmd->recv_ddgst + queue->offset,
		.iov_len = queue->left
	};

	if (queue->tls_pskid) {
		msg.msg_control = cbuf;
		msg.msg_controllen = sizeof(cbuf);
	}
	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(len < 0))
		return len;
	if (queue->tls_pskid) {
		ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf);
		if (ret < 0)
			return ret;
	}

	queue->offset += len;
	queue->left -= len;
	if (queue->left)
		return -EAGAIN;

	if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
		pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
			queue->idx, cmd->req.cmd->common.command_id,
			queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
			le32_to_cpu(cmd->exp_ddgst));
		nvmet_req_uninit(&cmd->req);
		nvmet_tcp_free_cmd_buffers(cmd);
		nvmet_tcp_fatal_error(queue);
		ret = -EPROTO;
		goto out;
	}

	if (cmd->rbytes_done == cmd->req.transfer_len)
		nvmet_tcp_execute_request(cmd);

	ret = 0;
out:
	nvmet_prepare_receive_pdu(queue);
	return ret;
}

static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
{
	int result = 0;

	if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
		return 0;

	if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
		result = nvmet_tcp_try_recv_pdu(queue);
		if (result != 0)
			goto done_recv;
	}

	if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
		result = nvmet_tcp_try_recv_data(queue);
		if (result != 0)
			goto done_recv;
	}

	if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
		result = nvmet_tcp_try_recv_ddgst(queue);
		if (result != 0)
			goto done_recv;
	}

done_recv:
	if (result < 0) {
		if (result == -EAGAIN)
			return 0;
		return result;
	}
	return 1;
}

static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
		int budget, int *recvs)
{
	int i, ret = 0;

	for (i = 0; i < budget; i++) {
		ret = nvmet_tcp_try_recv_one(queue);
		if (unlikely(ret < 0)) {
			nvmet_tcp_socket_error(queue, ret);
			goto done;
		} else if (ret == 0) {
			break;
		}
		(*recvs)++;
	}
done:
	return ret;
}

static void nvmet_tcp_release_queue(struct kref *kref)
{
	struct nvmet_tcp_queue *queue =
		container_of(kref, struct nvmet_tcp_queue, kref);

	WARN_ON(queue->state != NVMET_TCP_Q_DISCONNECTING);
	queue_work(nvmet_wq, &queue->release_work);
}

static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
{
	spin_lock_bh(&queue->state_lock);
	if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) {
		/* Socket closed during handshake */
		tls_handshake_cancel(queue->sock->sk);
	}
	if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
		queue->state = NVMET_TCP_Q_DISCONNECTING;
		kref_put(&queue->kref, nvmet_tcp_release_queue);
	}
	spin_unlock_bh(&queue->state_lock);
}

static inline void nvmet_tcp_arm_queue_deadline(struct nvmet_tcp_queue *queue)
{
	queue->poll_end = jiffies + usecs_to_jiffies(idle_poll_period_usecs);
}

static bool nvmet_tcp_check_queue_deadline(struct nvmet_tcp_queue *queue,
		int ops)
{
	if (!idle_poll_period_usecs)
		return false;

	if (ops)
		nvmet_tcp_arm_queue_deadline(queue);

	return !time_after(jiffies, queue->poll_end);
}

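/*
 * Main per-queue worker: alternate between bounded receive and send passes
 * until nothing made progress or the overall budget is spent, then requeue
 * itself while work is pending or while the idle-poll deadline (if
 * configured) has not yet expired.
 */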
static void nvmet_tcp_io_work(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue =
		container_of(w, struct nvmet_tcp_queue, io_work);
	bool pending;
	int ret, ops = 0;

	do {
		pending = false;

		ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
		if (ret > 0)
			pending = true;
		else if (ret < 0)
			return;

		ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
		if (ret > 0)
			pending = true;
		else if (ret < 0)
			return;

	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);

	/*
	 * Requeue the worker if idle deadline period is in progress or any
	 * ops activity was recorded during the do-while loop above.
	 */
	if (nvmet_tcp_check_queue_deadline(queue, ops) || pending)
		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
}

static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *c)
{
	u8 hdgst = nvmet_tcp_hdgst_len(queue);

	c->queue = queue;
	c->req.port = queue->port->nport;

	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->cmd_pdu)
		return -ENOMEM;
	c->req.cmd = &c->cmd_pdu->cmd;

	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->rsp_pdu)
		goto out_free_cmd;
	c->req.cqe = &c->rsp_pdu->cqe;

	c->data_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->data_pdu)
		goto out_free_rsp;

	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->r2t_pdu)
		goto out_free_data;

	if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) {
		c->recv_msg.msg_control = c->recv_cbuf;
		c->recv_msg.msg_controllen = sizeof(c->recv_cbuf);
	}
	c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;

	list_add_tail(&c->entry, &queue->free_list);

	return 0;
out_free_data:
	page_frag_free(c->data_pdu);
out_free_rsp:
	page_frag_free(c->rsp_pdu);
out_free_cmd:
	page_frag_free(c->cmd_pdu);
	return -ENOMEM;
}

static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
{
	page_frag_free(c->r2t_pdu);
	page_frag_free(c->data_pdu);
	page_frag_free(c->rsp_pdu);
	page_frag_free(c->cmd_pdu);
}

static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmds;
	int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;

	cmds = kvcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
		if (ret)
			goto out_free;
	}

	queue->cmds = cmds;

	return 0;
out_free:
	while (--i >= 0)
		nvmet_tcp_free_cmd(cmds + i);
	kvfree(cmds);
out:
	return ret;
}

static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmds = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++)
		nvmet_tcp_free_cmd(cmds + i);

	nvmet_tcp_free_cmd(&queue->connect);
	kvfree(cmds);
}

static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
{
	struct socket *sock = queue->sock;

	if (!queue->state_change)
		return;

	write_lock_bh(&sock->sk->sk_callback_lock);
	sock->sk->sk_data_ready = queue->data_ready;
	sock->sk->sk_state_change = queue->state_change;
	sock->sk->sk_write_space = queue->write_space;
	sock->sk->sk_user_data = NULL;
	write_unlock_bh(&sock->sk->sk_callback_lock);
}

static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
		if (nvmet_tcp_need_data_in(cmd))
			nvmet_req_uninit(&cmd->req);
	}

	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
		/* failed in connect */
		nvmet_req_uninit(&queue->connect.req);
	}
}

static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++, cmd++)
		nvmet_tcp_free_cmd_buffers(cmd);
	nvmet_tcp_free_cmd_buffers(&queue->connect);
}

static void nvmet_tcp_release_queue_work(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue =
		container_of(w, struct nvmet_tcp_queue, release_work);

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);

	nvmet_tcp_restore_socket_callbacks(queue);
	cancel_delayed_work_sync(&queue->tls_handshake_tmo_work);
	cancel_work_sync(&queue->io_work);
	/* stop accepting incoming data */
	queue->rcv_state = NVMET_TCP_RECV_ERR;

	nvmet_sq_put_tls_key(&queue->nvme_sq);
	nvmet_tcp_uninit_data_in_cmds(queue);
	nvmet_sq_destroy(&queue->nvme_sq);
	nvmet_cq_put(&queue->nvme_cq);
	cancel_work_sync(&queue->io_work);
	nvmet_tcp_free_cmd_data_in_buffers(queue);
	/* ->sock will be released by fput() */
	fput(queue->sock->file);
	nvmet_tcp_free_cmds(queue);
	ida_free(&nvmet_tcp_queue_ida, queue->idx);
	page_frag_cache_drain(&queue->pf_cache);
	kfree(queue);
}

static void nvmet_tcp_data_ready(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	trace_sk_data_ready(sk);

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (likely(queue)) {
		if (queue->data_ready)
			queue->data_ready(sk);
		if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)
			queue_work_on(queue_cpu(queue), nvmet_tcp_wq,
				      &queue->io_work);
	}
	read_unlock_bh(&sk->sk_callback_lock);
}

static void nvmet_tcp_write_space(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (unlikely(!queue))
		goto out;

	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
		queue->write_space(sk);
		goto out;
	}

	if (sk_stream_is_writeable(sk)) {
		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
	}
out:
	read_unlock_bh(&sk->sk_callback_lock);
}

static void nvmet_tcp_state_change(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (!queue)
		goto done;

	switch (sk->sk_state) {
	case TCP_FIN_WAIT2:
	case TCP_LAST_ACK:
		break;
	case TCP_FIN_WAIT1:
	case TCP_CLOSE_WAIT:
	case TCP_CLOSE:
		/* FALLTHRU */
		nvmet_tcp_schedule_release_queue(queue);
		break;
	default:
		pr_warn("queue %d unhandled state %d\n",
			queue->idx, sk->sk_state);
	}
done:
	read_unlock_bh(&sk->sk_callback_lock);
}

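/*
 * Finish socket setup for a newly accepted (or freshly authenticated)
 * queue: record the local and peer addresses, tune socket options, and
 * install our data_ready/state_change/write_space callbacks, saving the
 * originals so they can be restored on release. Bail out if the socket
 * is no longer established.
 */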
static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
{
	struct socket *sock = queue->sock;
	struct inet_sock *inet = inet_sk(sock->sk);
	int ret;

	ret = kernel_getsockname(sock,
		(struct sockaddr *)&queue->sockaddr);
	if (ret < 0)
		return ret;

	ret = kernel_getpeername(sock,
		(struct sockaddr *)&queue->sockaddr_peer);
	if (ret < 0)
		return ret;

	/*
	 * Cleanup whatever is sitting in the TCP transmit queue on socket
	 * close. This is done to prevent stale data from being sent should
	 * the network connection be restored before TCP times out.
	 */
	sock_no_linger(sock->sk);

	if (so_priority > 0)
		sock_set_priority(sock->sk, so_priority);

	/* Set socket type of service */
	if (inet->rcv_tos > 0)
		ip_sock_set_tos(sock->sk, inet->rcv_tos);

	ret = 0;
	write_lock_bh(&sock->sk->sk_callback_lock);
	if (sock->sk->sk_state != TCP_ESTABLISHED) {
		/*
		 * If the socket is already closing, don't even start
		 * consuming it
		 */
		ret = -ENOTCONN;
	} else {
		sock->sk->sk_user_data = queue;
		queue->data_ready = sock->sk->sk_data_ready;
		sock->sk->sk_data_ready = nvmet_tcp_data_ready;
		queue->state_change = sock->sk->sk_state_change;
		sock->sk->sk_state_change = nvmet_tcp_state_change;
		queue->write_space = sock->sk->sk_write_space;
		sock->sk->sk_write_space = nvmet_tcp_write_space;
		if (idle_poll_period_usecs)
			nvmet_tcp_arm_queue_deadline(queue);
		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
	}
	write_unlock_bh(&sock->sk->sk_callback_lock);

	return ret;
}

#ifdef CONFIG_NVME_TARGET_TCP_TLS
static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	int len, ret;
	struct kvec iov = {
		.iov_base = (u8 *)&queue->pdu + queue->offset,
		.iov_len = sizeof(struct nvme_tcp_hdr),
	};
	char cbuf[CMSG_LEN(sizeof(char))] = {};
	struct msghdr msg = {
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
		.msg_flags = MSG_PEEK,
	};

	if (nvmet_port_secure_channel_required(queue->port->nport))
		return 0;

	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(len < 0)) {
		pr_debug("queue %d: peek error %d\n",
			queue->idx, len);
		return len;
	}

	ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf);
	if (ret < 0)
		return ret;

	if (len < sizeof(struct nvme_tcp_hdr)) {
		pr_debug("queue %d: short read, %d bytes missing\n",
			queue->idx, (int)iov.iov_len - len);
		return -EAGAIN;
	}
	pr_debug("queue %d: hdr type %d hlen %d plen %d size %d\n",
		queue->idx, hdr->type, hdr->hlen, hdr->plen,
		(int)sizeof(struct nvme_tcp_icreq_pdu));
	if (hdr->type == nvme_tcp_icreq &&
	    hdr->hlen == sizeof(struct nvme_tcp_icreq_pdu) &&
	    hdr->plen == cpu_to_le32(sizeof(struct nvme_tcp_icreq_pdu))) {
		pr_debug("queue %d: icreq detected\n",
			queue->idx);
		return len;
	}
	return 0;
}

static int nvmet_tcp_tls_key_lookup(struct nvmet_tcp_queue *queue,
		key_serial_t peerid)
{
	struct key *tls_key = nvme_tls_key_lookup(peerid);
	int status = 0;

	if (IS_ERR(tls_key)) {
		pr_warn("%s: queue %d failed to lookup key %x\n",
			__func__, queue->idx, peerid);
		spin_lock_bh(&queue->state_lock);
		queue->state = NVMET_TCP_Q_FAILED;
		spin_unlock_bh(&queue->state_lock);
		status = PTR_ERR(tls_key);
	} else {
		pr_debug("%s: queue %d using TLS PSK %x\n",
			__func__, queue->idx, peerid);
		queue->nvme_sq.tls_key = tls_key;
	}
	return status;
}

static void nvmet_tcp_tls_handshake_done(void *data, int status,
		key_serial_t peerid)
{
	struct nvmet_tcp_queue *queue = data;

	pr_debug("queue %d: TLS handshake done, key %x, status %d\n",
		 queue->idx, peerid, status);
	spin_lock_bh(&queue->state_lock);
	if (WARN_ON(queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)) {
		spin_unlock_bh(&queue->state_lock);
		return;
	}
	if (!status) {
		queue->tls_pskid = peerid;
		queue->state = NVMET_TCP_Q_CONNECTING;
	} else
		queue->state = NVMET_TCP_Q_FAILED;
	spin_unlock_bh(&queue->state_lock);

	cancel_delayed_work_sync(&queue->tls_handshake_tmo_work);

	if (!status)
		status = nvmet_tcp_tls_key_lookup(queue, peerid);

	if (status)
		nvmet_tcp_schedule_release_queue(queue);
	else
		nvmet_tcp_set_queue_sock(queue);
	kref_put(&queue->kref, nvmet_tcp_release_queue);
}

static void nvmet_tcp_tls_handshake_timeout(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue = container_of(to_delayed_work(w),
			struct nvmet_tcp_queue, tls_handshake_tmo_work);

	pr_warn("queue %d: TLS handshake timeout\n", queue->idx);
	/*
	 * If tls_handshake_cancel() fails we've lost the race with
	 * nvmet_tcp_tls_handshake_done() */
	if (!tls_handshake_cancel(queue->sock->sk))
		return;
	spin_lock_bh(&queue->state_lock);
	if (WARN_ON(queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)) {
		spin_unlock_bh(&queue->state_lock);
		return;
	}
	queue->state = NVMET_TCP_Q_FAILED;
	spin_unlock_bh(&queue->state_lock);
	nvmet_tcp_schedule_release_queue(queue);
	kref_put(&queue->kref, nvmet_tcp_release_queue);
}

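/*
 * Hand the socket to the TLS handshake upcall (net/handshake, served by
 * the userspace handshake agent). An extra queue reference is held for
 * the duration of the handshake and a delayed work item enforces the
 * configured timeout.
 */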
static int nvmet_tcp_tls_handshake(struct nvmet_tcp_queue *queue)
{
	int ret = -EOPNOTSUPP;
	struct tls_handshake_args args;

	if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE) {
		pr_warn("cannot start TLS in state %d\n", queue->state);
		return -EINVAL;
	}

	kref_get(&queue->kref);
	pr_debug("queue %d: TLS ServerHello\n", queue->idx);
	memset(&args, 0, sizeof(args));
	args.ta_sock = queue->sock;
	args.ta_done = nvmet_tcp_tls_handshake_done;
	args.ta_data = queue;
	args.ta_keyring = key_serial(queue->port->nport->keyring);
	args.ta_timeout_ms = tls_handshake_timeout * 1000;

	ret = tls_server_hello_psk(&args, GFP_KERNEL);
	if (ret) {
		kref_put(&queue->kref, nvmet_tcp_release_queue);
		pr_err("failed to start TLS, err=%d\n", ret);
	} else {
		queue_delayed_work(nvmet_wq, &queue->tls_handshake_tmo_work,
				   tls_handshake_timeout * HZ);
	}
	return ret;
}
#else
static void nvmet_tcp_tls_handshake_timeout(struct work_struct *w) {}
#endif

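/*
 * Set up a queue for a newly accepted socket: allocate the queue, wrap the
 * socket in a file, allocate the connect command and NVMe CQ/SQ, and either
 * kick off the TLS handshake (for a TLS-secured port) or install the socket
 * callbacks directly. A TLS port first peeks the socket for a cleartext
 * ICReq and falls back to normal processing if one is found. On failure the
 * accepted socket is released here.
 */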
static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
		struct socket *newsock)
{
	struct nvmet_tcp_queue *queue;
	struct file *sock_file = NULL;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = -ENOMEM;
		goto out_release;
	}

	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
	kref_init(&queue->kref);
	queue->sock = newsock;
	queue->port = port;
	queue->nr_cmds = 0;
	spin_lock_init(&queue->state_lock);
	if (queue->port->nport->disc_addr.tsas.tcp.sectype ==
	    NVMF_TCP_SECTYPE_TLS13)
		queue->state = NVMET_TCP_Q_TLS_HANDSHAKE;
	else
		queue->state = NVMET_TCP_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->free_list);
	init_llist_head(&queue->resp_list);
	INIT_LIST_HEAD(&queue->resp_send_list);

	sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL);
	if (IS_ERR(sock_file)) {
		ret = PTR_ERR(sock_file);
		goto out_free_queue;
	}

	queue->idx = ida_alloc(&nvmet_tcp_queue_ida, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = queue->idx;
		goto out_sock;
	}

	ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
	if (ret)
		goto out_ida_remove;

	nvmet_cq_init(&queue->nvme_cq);
	ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
	if (ret)
		goto out_free_connect;

	nvmet_prepare_receive_pdu(queue);

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);

	INIT_DELAYED_WORK(&queue->tls_handshake_tmo_work,
			  nvmet_tcp_tls_handshake_timeout);
#ifdef CONFIG_NVME_TARGET_TCP_TLS
	if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) {
		struct sock *sk = queue->sock->sk;

		/* Restore the default callbacks before starting upcall */
		write_lock_bh(&sk->sk_callback_lock);
		sk->sk_user_data = NULL;
		sk->sk_data_ready = port->data_ready;
		write_unlock_bh(&sk->sk_callback_lock);
		if (!nvmet_tcp_try_peek_pdu(queue)) {
			if (!nvmet_tcp_tls_handshake(queue))
				return;
			/* TLS handshake failed, terminate the connection */
			goto out_destroy_sq;
		}
		/* Not a TLS connection, continue with normal processing */
		queue->state = NVMET_TCP_Q_CONNECTING;
	}
#endif

	ret = nvmet_tcp_set_queue_sock(queue);
	if (ret)
		goto out_destroy_sq;

	return;
out_destroy_sq:
	mutex_lock(&nvmet_tcp_queue_mutex);
	list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_connect:
	nvmet_cq_put(&queue->nvme_cq);
	nvmet_tcp_free_cmd(&queue->connect);
out_ida_remove:
	ida_free(&nvmet_tcp_queue_ida, queue->idx);
out_sock:
	fput(queue->sock->file);
out_free_queue:
	kfree(queue);
out_release:
	pr_err("failed to allocate queue, error %d\n", ret);
	if (!sock_file)
		sock_release(newsock);
}

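/*
 * Drain the listening socket: accept connections until -EAGAIN and hand
 * each new socket to nvmet_tcp_alloc_queue().
 */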
static void nvmet_tcp_accept_work(struct work_struct *w)
{
	struct nvmet_tcp_port *port =
		container_of(w, struct nvmet_tcp_port, accept_work);
	struct socket *newsock;
	int ret;

	while (true) {
		ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
		if (ret < 0) {
			if (ret != -EAGAIN)
				pr_warn("failed to accept err=%d\n", ret);
			return;
		}
		nvmet_tcp_alloc_queue(port, newsock);
	}
}

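/* data_ready callback of the listening socket: schedule the accept work. */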
static void nvmet_tcp_listen_data_ready(struct sock *sk)
{
	struct nvmet_tcp_port *port;

	trace_sk_data_ready(sk);

	if (sk->sk_state != TCP_LISTEN)
		return;

	read_lock_bh(&sk->sk_callback_lock);
	port = sk->sk_user_data;
	if (port)
		queue_work(nvmet_wq, &port->accept_work);
	read_unlock_bh(&sk->sk_callback_lock);
}

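/*
 * Enable an nvmet port: resolve traddr/trsvcid into a socket address,
 * create the listening socket, install nvmet_tcp_listen_data_ready() as
 * its data_ready callback, then bind and listen.
 */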
static int nvmet_tcp_add_port(struct nvmet_port *nport)
{
	struct nvmet_tcp_port *port;
	__kernel_sa_family_t af;
	int ret;

	port = kzalloc(sizeof(*port), GFP_KERNEL);
	if (!port)
		return -ENOMEM;

	switch (nport->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
		       nport->disc_addr.adrfam);
		ret = -EINVAL;
		goto err_port;
	}

	ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
				   nport->disc_addr.trsvcid, &port->addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
		       nport->disc_addr.traddr, nport->disc_addr.trsvcid);
		goto err_port;
	}

	port->nport = nport;
	INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
	if (port->nport->inline_data_size < 0)
		port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;

	ret = sock_create(port->addr.ss_family, SOCK_STREAM,
			  IPPROTO_TCP, &port->sock);
	if (ret) {
		pr_err("failed to create a socket\n");
		goto err_port;
	}

	port->sock->sk->sk_user_data = port;
	port->data_ready = port->sock->sk->sk_data_ready;
	port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
	sock_set_reuseaddr(port->sock->sk);
	tcp_sock_set_nodelay(port->sock->sk);
	if (so_priority > 0)
		sock_set_priority(port->sock->sk, so_priority);

	ret = kernel_bind(port->sock, (struct sockaddr_unsized *)&port->addr,
			  sizeof(port->addr));
	if (ret) {
		pr_err("failed to bind port socket %d\n", ret);
		goto err_sock;
	}

	ret = kernel_listen(port->sock, NVMET_TCP_BACKLOG);
	if (ret) {
		pr_err("failed to listen %d on port sock\n", ret);
		goto err_sock;
	}

	nport->priv = port;
	pr_info("enabling port %d (%pISpc)\n",
		le16_to_cpu(nport->disc_addr.portid), &port->addr);

	return 0;

err_sock:
	sock_release(port->sock);
err_port:
	kfree(port);
	return ret;
}

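/*
 * Forcibly shut down all queues still attached to @port so they get
 * released before the listening socket goes away.
 */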
static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port)
{
	struct nvmet_tcp_queue *queue;

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
		if (queue->port == port)
			kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	mutex_unlock(&nvmet_tcp_queue_mutex);
}

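/*
 * Disable an nvmet port: restore the original listener callbacks, stop the
 * accept work, tear down queues that never reached a controller, and
 * release the listening socket.
 */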
static void nvmet_tcp_remove_port(struct nvmet_port *nport)
{
	struct nvmet_tcp_port *port = nport->priv;

	write_lock_bh(&port->sock->sk->sk_callback_lock);
	port->sock->sk->sk_data_ready = port->data_ready;
	port->sock->sk->sk_user_data = NULL;
	write_unlock_bh(&port->sock->sk->sk_callback_lock);
	cancel_work_sync(&port->accept_work);
	/*
	 * Destroy the remaining queues, which do not belong to any
	 * controller yet.
	 */
	nvmet_tcp_destroy_port_queues(port);

	sock_release(port->sock);
	kfree(port);
}

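/* Shut down the sockets of all queues owned by @ctrl. */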
static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_tcp_queue *queue;

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
		if (queue->nvme_sq.ctrl == ctrl)
			kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	mutex_unlock(&nvmet_tcp_queue_mutex);
}

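/*
 * Fabrics ->install_queue hook. For the admin queue, refuse the connect
 * while too many queues of this controller are still disconnecting; then
 * size the command array to twice the SQ depth.
 */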
static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
{
	struct nvmet_tcp_queue *queue =
		container_of(sq, struct nvmet_tcp_queue, nvme_sq);

	if (sq->qid == 0) {
		struct nvmet_tcp_queue *q;
		int pending = 0;

		/* Check for pending controller teardown */
		mutex_lock(&nvmet_tcp_queue_mutex);
		list_for_each_entry(q, &nvmet_tcp_queue_list, queue_list) {
			if (q->nvme_sq.ctrl == sq->ctrl &&
			    q->state == NVMET_TCP_Q_DISCONNECTING)
				pending++;
		}
		mutex_unlock(&nvmet_tcp_queue_mutex);
		if (pending > NVMET_TCP_BACKLOG)
			return NVME_SC_CONNECT_CTRL_BUSY;
	}

	queue->nr_cmds = sq->size * 2;
	if (nvmet_tcp_alloc_cmds(queue)) {
		queue->nr_cmds = 0;
		return NVME_SC_INTERNAL;
	}
	return 0;
}

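/*
 * Report the discovery log page traddr. If the port is bound to a wildcard
 * address, use the local address of the connection that issued the request
 * instead of the configured one.
 */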
static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *nport, char *traddr)
{
	struct nvmet_tcp_port *port = nport->priv;

	if (inet_addr_is_any(&port->addr)) {
		struct nvmet_tcp_cmd *cmd =
			container_of(req, struct nvmet_tcp_cmd, req);
		struct nvmet_tcp_queue *queue = cmd->queue;

		sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
	} else {
		memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

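/*
 * Report the address of the connected host, taken from the peer address of
 * the admin queue's socket.
 */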
static ssize_t nvmet_tcp_host_port_addr(struct nvmet_ctrl *ctrl,
		char *traddr, size_t traddr_len)
{
	struct nvmet_sq *sq = ctrl->sqs[0];
	struct nvmet_tcp_queue *queue =
		container_of(sq, struct nvmet_tcp_queue, nvme_sq);

	if (queue->sockaddr_peer.ss_family == AF_UNSPEC)
		return -EINVAL;
	return snprintf(traddr, traddr_len, "%pISc",
			(struct sockaddr *)&queue->sockaddr_peer);
}

static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_TCP,
	.msdbd			= 1,
	.add_port		= nvmet_tcp_add_port,
	.remove_port		= nvmet_tcp_remove_port,
	.queue_response		= nvmet_tcp_queue_response,
	.delete_ctrl		= nvmet_tcp_delete_ctrl,
	.install_queue		= nvmet_tcp_install_queue,
	.disc_traddr		= nvmet_tcp_disc_port_addr,
	.host_traddr		= nvmet_tcp_host_port_addr,
};

static int __init nvmet_tcp_init(void)
{
	int ret;

	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
				WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!nvmet_tcp_wq)
		return -ENOMEM;

	ret = nvmet_register_transport(&nvmet_tcp_ops);
	if (ret)
		goto err;

	return 0;
err:
	destroy_workqueue(nvmet_tcp_wq);
	return ret;
}

static void __exit nvmet_tcp_exit(void)
{
	struct nvmet_tcp_queue *queue;

	nvmet_unregister_transport(&nvmet_tcp_ops);

	flush_workqueue(nvmet_wq);
	mutex_lock(&nvmet_tcp_queue_mutex);
	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	mutex_unlock(&nvmet_tcp_queue_mutex);
	flush_workqueue(nvmet_wq);

	destroy_workqueue(nvmet_tcp_wq);
	ida_destroy(&nvmet_tcp_queue_ida);
}

module_init(nvmet_tcp_init);
module_exit(nvmet_tcp_exit);

MODULE_DESCRIPTION("NVMe target TCP transport driver");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */