// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics TCP host.
 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/key.h>
#include <linux/nvme-tcp.h>
#include <linux/nvme-keyring.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/tls.h>
#include <net/tls_prot.h>
#include <net/handshake.h>
#include <linux/blk-mq.h>
#include <crypto/hash.h>
#include <net/busy_poll.h>
#include <trace/events/sock.h>

#include "nvme.h"
#include "fabrics.h"

struct nvme_tcp_queue;

/* Define the socket priority to use for connections where it is desirable
 * that the NIC consider performing optimized packet processing or filtering.
 * A non-zero value is sufficient to indicate general consideration of any
 * possible optimization. Making it a module param allows for alternative
 * values that may be unique for some NIC implementations.
 */
static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");

/*
 * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu affinity
 * from sysfs.
 */
static bool wq_unbound;
module_param(wq_unbound, bool, 0644);
MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)");

/*
 * TLS handshake timeout
 */
static int tls_handshake_timeout = 10;
#ifdef CONFIG_NVME_TCP_TLS
module_param(tls_handshake_timeout, int, 0644);
MODULE_PARM_DESC(tls_handshake_timeout,
		"nvme TLS handshake timeout in seconds (default 10)");
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
 *   sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
 * because dependencies are tracked for both nvme-tcp and user contexts. Using
 * a separate class prevents lockdep from conflating nvme-tcp socket use with
 * user-space socket API use.
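 *
 * As a quick summary of the reclassification below (this only restates the
 * switch statement, it adds no behaviour): each address family gets its own
 * pair of lockdep keys,
 *
 *   AF_INET  -> "slock-AF_INET-NVME"  / "sk_lock-AF_INET-NVME"
 *   AF_INET6 -> "slock-AF_INET6-NVME" / "sk_lock-AF_INET6-NVME"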
63 */ 64 static struct lock_class_key nvme_tcp_sk_key[2]; 65 static struct lock_class_key nvme_tcp_slock_key[2]; 66 67 static void nvme_tcp_reclassify_socket(struct socket *sock) 68 { 69 struct sock *sk = sock->sk; 70 71 if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) 72 return; 73 74 switch (sk->sk_family) { 75 case AF_INET: 76 sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME", 77 &nvme_tcp_slock_key[0], 78 "sk_lock-AF_INET-NVME", 79 &nvme_tcp_sk_key[0]); 80 break; 81 case AF_INET6: 82 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME", 83 &nvme_tcp_slock_key[1], 84 "sk_lock-AF_INET6-NVME", 85 &nvme_tcp_sk_key[1]); 86 break; 87 default: 88 WARN_ON_ONCE(1); 89 } 90 } 91 #else 92 static void nvme_tcp_reclassify_socket(struct socket *sock) { } 93 #endif 94 95 enum nvme_tcp_send_state { 96 NVME_TCP_SEND_CMD_PDU = 0, 97 NVME_TCP_SEND_H2C_PDU, 98 NVME_TCP_SEND_DATA, 99 NVME_TCP_SEND_DDGST, 100 }; 101 102 struct nvme_tcp_request { 103 struct nvme_request req; 104 void *pdu; 105 struct nvme_tcp_queue *queue; 106 u32 data_len; 107 u32 pdu_len; 108 u32 pdu_sent; 109 u32 h2cdata_left; 110 u32 h2cdata_offset; 111 u16 ttag; 112 __le16 status; 113 struct list_head entry; 114 struct llist_node lentry; 115 __le32 ddgst; 116 117 struct bio *curr_bio; 118 struct iov_iter iter; 119 120 /* send state */ 121 size_t offset; 122 size_t data_sent; 123 enum nvme_tcp_send_state state; 124 }; 125 126 enum nvme_tcp_queue_flags { 127 NVME_TCP_Q_ALLOCATED = 0, 128 NVME_TCP_Q_LIVE = 1, 129 NVME_TCP_Q_POLLING = 2, 130 }; 131 132 enum nvme_tcp_recv_state { 133 NVME_TCP_RECV_PDU = 0, 134 NVME_TCP_RECV_DATA, 135 NVME_TCP_RECV_DDGST, 136 }; 137 138 struct nvme_tcp_ctrl; 139 struct nvme_tcp_queue { 140 struct socket *sock; 141 struct work_struct io_work; 142 int io_cpu; 143 144 struct mutex queue_lock; 145 struct mutex send_mutex; 146 struct llist_head req_list; 147 struct list_head send_list; 148 149 /* recv state */ 150 void *pdu; 151 int pdu_remaining; 152 int pdu_offset; 153 size_t data_remaining; 154 size_t ddgst_remaining; 155 unsigned int nr_cqe; 156 157 /* send state */ 158 struct nvme_tcp_request *request; 159 160 u32 maxh2cdata; 161 size_t cmnd_capsule_len; 162 struct nvme_tcp_ctrl *ctrl; 163 unsigned long flags; 164 bool rd_enabled; 165 166 bool hdr_digest; 167 bool data_digest; 168 struct ahash_request *rcv_hash; 169 struct ahash_request *snd_hash; 170 __le32 exp_ddgst; 171 __le32 recv_ddgst; 172 struct completion tls_complete; 173 int tls_err; 174 struct page_frag_cache pf_cache; 175 176 void (*state_change)(struct sock *); 177 void (*data_ready)(struct sock *); 178 void (*write_space)(struct sock *); 179 }; 180 181 struct nvme_tcp_ctrl { 182 /* read only in the hot path */ 183 struct nvme_tcp_queue *queues; 184 struct blk_mq_tag_set tag_set; 185 186 /* other member variables */ 187 struct list_head list; 188 struct blk_mq_tag_set admin_tag_set; 189 struct sockaddr_storage addr; 190 struct sockaddr_storage src_addr; 191 struct nvme_ctrl ctrl; 192 193 struct work_struct err_work; 194 struct delayed_work connect_work; 195 struct nvme_tcp_request async_req; 196 u32 io_queues[HCTX_MAX_TYPES]; 197 }; 198 199 static LIST_HEAD(nvme_tcp_ctrl_list); 200 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex); 201 static struct workqueue_struct *nvme_tcp_wq; 202 static const struct blk_mq_ops nvme_tcp_mq_ops; 203 static const struct blk_mq_ops nvme_tcp_admin_mq_ops; 204 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue); 205 206 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) 207 { 208 
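	/*
	 * The transport-private nvme_tcp_ctrl embeds the generic nvme_ctrl as
	 * its 'ctrl' member, so container_of() below simply walks back from
	 * that member to the enclosing structure; no reference is taken.
	 */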
return container_of(ctrl, struct nvme_tcp_ctrl, ctrl); 209 } 210 211 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) 212 { 213 return queue - queue->ctrl->queues; 214 } 215 216 static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl) 217 { 218 if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) 219 return 0; 220 221 return ctrl->opts->tls; 222 } 223 224 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) 225 { 226 u32 queue_idx = nvme_tcp_queue_id(queue); 227 228 if (queue_idx == 0) 229 return queue->ctrl->admin_tag_set.tags[queue_idx]; 230 return queue->ctrl->tag_set.tags[queue_idx - 1]; 231 } 232 233 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue) 234 { 235 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; 236 } 237 238 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) 239 { 240 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; 241 } 242 243 static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req) 244 { 245 return req->pdu; 246 } 247 248 static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req) 249 { 250 /* use the pdu space in the back for the data pdu */ 251 return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) - 252 sizeof(struct nvme_tcp_data_pdu); 253 } 254 255 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req) 256 { 257 if (nvme_is_fabrics(req->req.cmd)) 258 return NVME_TCP_ADMIN_CCSZ; 259 return req->queue->cmnd_capsule_len - sizeof(struct nvme_command); 260 } 261 262 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) 263 { 264 return req == &req->queue->ctrl->async_req; 265 } 266 267 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) 268 { 269 struct request *rq; 270 271 if (unlikely(nvme_tcp_async_req(req))) 272 return false; /* async events don't have a request */ 273 274 rq = blk_mq_rq_from_pdu(req); 275 276 return rq_data_dir(rq) == WRITE && req->data_len && 277 req->data_len <= nvme_tcp_inline_data_size(req); 278 } 279 280 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) 281 { 282 return req->iter.bvec->bv_page; 283 } 284 285 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req) 286 { 287 return req->iter.bvec->bv_offset + req->iter.iov_offset; 288 } 289 290 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req) 291 { 292 return min_t(size_t, iov_iter_single_seg_count(&req->iter), 293 req->pdu_len - req->pdu_sent); 294 } 295 296 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req) 297 { 298 return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ? 
299 req->pdu_len - req->pdu_sent : 0; 300 } 301 302 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, 303 int len) 304 { 305 return nvme_tcp_pdu_data_left(req) <= len; 306 } 307 308 static void nvme_tcp_init_iter(struct nvme_tcp_request *req, 309 unsigned int dir) 310 { 311 struct request *rq = blk_mq_rq_from_pdu(req); 312 struct bio_vec *vec; 313 unsigned int size; 314 int nr_bvec; 315 size_t offset; 316 317 if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { 318 vec = &rq->special_vec; 319 nr_bvec = 1; 320 size = blk_rq_payload_bytes(rq); 321 offset = 0; 322 } else { 323 struct bio *bio = req->curr_bio; 324 struct bvec_iter bi; 325 struct bio_vec bv; 326 327 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); 328 nr_bvec = 0; 329 bio_for_each_bvec(bv, bio, bi) { 330 nr_bvec++; 331 } 332 size = bio->bi_iter.bi_size; 333 offset = bio->bi_iter.bi_bvec_done; 334 } 335 336 iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size); 337 req->iter.iov_offset = offset; 338 } 339 340 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, 341 int len) 342 { 343 req->data_sent += len; 344 req->pdu_sent += len; 345 iov_iter_advance(&req->iter, len); 346 if (!iov_iter_count(&req->iter) && 347 req->data_sent < req->data_len) { 348 req->curr_bio = req->curr_bio->bi_next; 349 nvme_tcp_init_iter(req, ITER_SOURCE); 350 } 351 } 352 353 static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue) 354 { 355 int ret; 356 357 /* drain the send queue as much as we can... */ 358 do { 359 ret = nvme_tcp_try_send(queue); 360 } while (ret > 0); 361 } 362 363 static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue) 364 { 365 return !list_empty(&queue->send_list) || 366 !llist_empty(&queue->req_list); 367 } 368 369 static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) 370 { 371 return !nvme_tcp_tls(&queue->ctrl->ctrl) && 372 nvme_tcp_queue_has_pending(queue); 373 } 374 375 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, 376 bool sync, bool last) 377 { 378 struct nvme_tcp_queue *queue = req->queue; 379 bool empty; 380 381 empty = llist_add(&req->lentry, &queue->req_list) && 382 list_empty(&queue->send_list) && !queue->request; 383 384 /* 385 * if we're the first on the send_list and we can try to send 386 * directly, otherwise queue io_work. Also, only do that if we 387 * are on the same cpu, so we don't introduce contention. 
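 *
 * Roughly, the direct-send fast path only triggers when all of the following
 * hold (this mirrors the condition below rather than adding to it):
 *
 *   - the llist was empty (this is the only queued request),
 *   - send_list is empty and no request is currently being sent,
 *   - the caller allows synchronous sending (sync == true),
 *   - we are already running on queue->io_cpu,
 *   - send_mutex could be taken without blocking.
 *
 * Otherwise the request stays on req_list and io_work picks it up.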
388 */ 389 if (queue->io_cpu == raw_smp_processor_id() && 390 sync && empty && mutex_trylock(&queue->send_mutex)) { 391 nvme_tcp_send_all(queue); 392 mutex_unlock(&queue->send_mutex); 393 } 394 395 if (last && nvme_tcp_queue_has_pending(queue)) 396 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 397 } 398 399 static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue) 400 { 401 struct nvme_tcp_request *req; 402 struct llist_node *node; 403 404 for (node = llist_del_all(&queue->req_list); node; node = node->next) { 405 req = llist_entry(node, struct nvme_tcp_request, lentry); 406 list_add(&req->entry, &queue->send_list); 407 } 408 } 409 410 static inline struct nvme_tcp_request * 411 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) 412 { 413 struct nvme_tcp_request *req; 414 415 req = list_first_entry_or_null(&queue->send_list, 416 struct nvme_tcp_request, entry); 417 if (!req) { 418 nvme_tcp_process_req_list(queue); 419 req = list_first_entry_or_null(&queue->send_list, 420 struct nvme_tcp_request, entry); 421 if (unlikely(!req)) 422 return NULL; 423 } 424 425 list_del(&req->entry); 426 return req; 427 } 428 429 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, 430 __le32 *dgst) 431 { 432 ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); 433 crypto_ahash_final(hash); 434 } 435 436 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, 437 struct page *page, off_t off, size_t len) 438 { 439 struct scatterlist sg; 440 441 sg_init_table(&sg, 1); 442 sg_set_page(&sg, page, len, off); 443 ahash_request_set_crypt(hash, &sg, NULL, len); 444 crypto_ahash_update(hash); 445 } 446 447 static inline void nvme_tcp_hdgst(struct ahash_request *hash, 448 void *pdu, size_t len) 449 { 450 struct scatterlist sg; 451 452 sg_init_one(&sg, pdu, len); 453 ahash_request_set_crypt(hash, &sg, pdu + len, len); 454 crypto_ahash_digest(hash); 455 } 456 457 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, 458 void *pdu, size_t pdu_len) 459 { 460 struct nvme_tcp_hdr *hdr = pdu; 461 __le32 recv_digest; 462 __le32 exp_digest; 463 464 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { 465 dev_err(queue->ctrl->ctrl.device, 466 "queue %d: header digest flag is cleared\n", 467 nvme_tcp_queue_id(queue)); 468 return -EPROTO; 469 } 470 471 recv_digest = *(__le32 *)(pdu + hdr->hlen); 472 nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); 473 exp_digest = *(__le32 *)(pdu + hdr->hlen); 474 if (recv_digest != exp_digest) { 475 dev_err(queue->ctrl->ctrl.device, 476 "header digest error: recv %#x expected %#x\n", 477 le32_to_cpu(recv_digest), le32_to_cpu(exp_digest)); 478 return -EIO; 479 } 480 481 return 0; 482 } 483 484 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) 485 { 486 struct nvme_tcp_hdr *hdr = pdu; 487 u8 digest_len = nvme_tcp_hdgst_len(queue); 488 u32 len; 489 490 len = le32_to_cpu(hdr->plen) - hdr->hlen - 491 ((hdr->flags & NVME_TCP_F_HDGST) ? 
digest_len : 0); 492 493 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { 494 dev_err(queue->ctrl->ctrl.device, 495 "queue %d: data digest flag is cleared\n", 496 nvme_tcp_queue_id(queue)); 497 return -EPROTO; 498 } 499 crypto_ahash_init(queue->rcv_hash); 500 501 return 0; 502 } 503 504 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set, 505 struct request *rq, unsigned int hctx_idx) 506 { 507 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 508 509 page_frag_free(req->pdu); 510 } 511 512 static int nvme_tcp_init_request(struct blk_mq_tag_set *set, 513 struct request *rq, unsigned int hctx_idx, 514 unsigned int numa_node) 515 { 516 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data); 517 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 518 struct nvme_tcp_cmd_pdu *pdu; 519 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; 520 struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx]; 521 u8 hdgst = nvme_tcp_hdgst_len(queue); 522 523 req->pdu = page_frag_alloc(&queue->pf_cache, 524 sizeof(struct nvme_tcp_cmd_pdu) + hdgst, 525 GFP_KERNEL | __GFP_ZERO); 526 if (!req->pdu) 527 return -ENOMEM; 528 529 pdu = req->pdu; 530 req->queue = queue; 531 nvme_req(rq)->ctrl = &ctrl->ctrl; 532 nvme_req(rq)->cmd = &pdu->cmd; 533 534 return 0; 535 } 536 537 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 538 unsigned int hctx_idx) 539 { 540 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data); 541 struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1]; 542 543 hctx->driver_data = queue; 544 return 0; 545 } 546 547 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, 548 unsigned int hctx_idx) 549 { 550 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data); 551 struct nvme_tcp_queue *queue = &ctrl->queues[0]; 552 553 hctx->driver_data = queue; 554 return 0; 555 } 556 557 static enum nvme_tcp_recv_state 558 nvme_tcp_recv_state(struct nvme_tcp_queue *queue) 559 { 560 return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU : 561 (queue->ddgst_remaining) ? 
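	       /*
	        * The receive state is derived from the remaining-byte counters
	        * rather than stored explicitly:
	        *   pdu_remaining   != 0 -> still gathering the PDU header
	        *   ddgst_remaining != 0 -> gathering the trailing data digest
	        *   otherwise            -> gathering C2H data
	        */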
NVME_TCP_RECV_DDGST : 562 NVME_TCP_RECV_DATA; 563 } 564 565 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue) 566 { 567 queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) + 568 nvme_tcp_hdgst_len(queue); 569 queue->pdu_offset = 0; 570 queue->data_remaining = -1; 571 queue->ddgst_remaining = 0; 572 } 573 574 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) 575 { 576 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 577 return; 578 579 dev_warn(ctrl->device, "starting error recovery\n"); 580 queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); 581 } 582 583 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, 584 struct nvme_completion *cqe) 585 { 586 struct nvme_tcp_request *req; 587 struct request *rq; 588 589 rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id); 590 if (!rq) { 591 dev_err(queue->ctrl->ctrl.device, 592 "got bad cqe.command_id %#x on queue %d\n", 593 cqe->command_id, nvme_tcp_queue_id(queue)); 594 nvme_tcp_error_recovery(&queue->ctrl->ctrl); 595 return -EINVAL; 596 } 597 598 req = blk_mq_rq_to_pdu(rq); 599 if (req->status == cpu_to_le16(NVME_SC_SUCCESS)) 600 req->status = cqe->status; 601 602 if (!nvme_try_complete_req(rq, req->status, cqe->result)) 603 nvme_complete_rq(rq); 604 queue->nr_cqe++; 605 606 return 0; 607 } 608 609 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, 610 struct nvme_tcp_data_pdu *pdu) 611 { 612 struct request *rq; 613 614 rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); 615 if (!rq) { 616 dev_err(queue->ctrl->ctrl.device, 617 "got bad c2hdata.command_id %#x on queue %d\n", 618 pdu->command_id, nvme_tcp_queue_id(queue)); 619 return -ENOENT; 620 } 621 622 if (!blk_rq_payload_bytes(rq)) { 623 dev_err(queue->ctrl->ctrl.device, 624 "queue %d tag %#x unexpected data\n", 625 nvme_tcp_queue_id(queue), rq->tag); 626 return -EIO; 627 } 628 629 queue->data_remaining = le32_to_cpu(pdu->data_length); 630 631 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS && 632 unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) { 633 dev_err(queue->ctrl->ctrl.device, 634 "queue %d tag %#x SUCCESS set but not last PDU\n", 635 nvme_tcp_queue_id(queue), rq->tag); 636 nvme_tcp_error_recovery(&queue->ctrl->ctrl); 637 return -EPROTO; 638 } 639 640 return 0; 641 } 642 643 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, 644 struct nvme_tcp_rsp_pdu *pdu) 645 { 646 struct nvme_completion *cqe = &pdu->cqe; 647 int ret = 0; 648 649 /* 650 * AEN requests are special as they don't time out and can 651 * survive any kind of queue freeze and often don't respond to 652 * aborts. We don't even bother to allocate a struct request 653 * for them but rather special case them here. 
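 *
 * (In practice this means a command_id on the admin queue at or above the
 * admin blk-mq depth; nvme_is_aen_req() in the core encodes that check, so
 * it is only summarised here.)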
654 */ 655 if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue), 656 cqe->command_id))) 657 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, 658 &cqe->result); 659 else 660 ret = nvme_tcp_process_nvme_cqe(queue, cqe); 661 662 return ret; 663 } 664 665 static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req) 666 { 667 struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req); 668 struct nvme_tcp_queue *queue = req->queue; 669 struct request *rq = blk_mq_rq_from_pdu(req); 670 u32 h2cdata_sent = req->pdu_len; 671 u8 hdgst = nvme_tcp_hdgst_len(queue); 672 u8 ddgst = nvme_tcp_ddgst_len(queue); 673 674 req->state = NVME_TCP_SEND_H2C_PDU; 675 req->offset = 0; 676 req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata); 677 req->pdu_sent = 0; 678 req->h2cdata_left -= req->pdu_len; 679 req->h2cdata_offset += h2cdata_sent; 680 681 memset(data, 0, sizeof(*data)); 682 data->hdr.type = nvme_tcp_h2c_data; 683 if (!req->h2cdata_left) 684 data->hdr.flags = NVME_TCP_F_DATA_LAST; 685 if (queue->hdr_digest) 686 data->hdr.flags |= NVME_TCP_F_HDGST; 687 if (queue->data_digest) 688 data->hdr.flags |= NVME_TCP_F_DDGST; 689 data->hdr.hlen = sizeof(*data); 690 data->hdr.pdo = data->hdr.hlen + hdgst; 691 data->hdr.plen = 692 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); 693 data->ttag = req->ttag; 694 data->command_id = nvme_cid(rq); 695 data->data_offset = cpu_to_le32(req->h2cdata_offset); 696 data->data_length = cpu_to_le32(req->pdu_len); 697 } 698 699 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, 700 struct nvme_tcp_r2t_pdu *pdu) 701 { 702 struct nvme_tcp_request *req; 703 struct request *rq; 704 u32 r2t_length = le32_to_cpu(pdu->r2t_length); 705 u32 r2t_offset = le32_to_cpu(pdu->r2t_offset); 706 707 rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); 708 if (!rq) { 709 dev_err(queue->ctrl->ctrl.device, 710 "got bad r2t.command_id %#x on queue %d\n", 711 pdu->command_id, nvme_tcp_queue_id(queue)); 712 return -ENOENT; 713 } 714 req = blk_mq_rq_to_pdu(rq); 715 716 if (unlikely(!r2t_length)) { 717 dev_err(queue->ctrl->ctrl.device, 718 "req %d r2t len is %u, probably a bug...\n", 719 rq->tag, r2t_length); 720 return -EPROTO; 721 } 722 723 if (unlikely(req->data_sent + r2t_length > req->data_len)) { 724 dev_err(queue->ctrl->ctrl.device, 725 "req %d r2t len %u exceeded data len %u (%zu sent)\n", 726 rq->tag, r2t_length, req->data_len, req->data_sent); 727 return -EPROTO; 728 } 729 730 if (unlikely(r2t_offset < req->data_sent)) { 731 dev_err(queue->ctrl->ctrl.device, 732 "req %d unexpected r2t offset %u (expected %zu)\n", 733 rq->tag, r2t_offset, req->data_sent); 734 return -EPROTO; 735 } 736 737 req->pdu_len = 0; 738 req->h2cdata_left = r2t_length; 739 req->h2cdata_offset = r2t_offset; 740 req->ttag = pdu->ttag; 741 742 nvme_tcp_setup_h2c_data_pdu(req); 743 nvme_tcp_queue_request(req, false, true); 744 745 return 0; 746 } 747 748 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, 749 unsigned int *offset, size_t *len) 750 { 751 struct nvme_tcp_hdr *hdr; 752 char *pdu = queue->pdu; 753 size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining); 754 int ret; 755 756 ret = skb_copy_bits(skb, *offset, 757 &pdu[queue->pdu_offset], rcv_len); 758 if (unlikely(ret)) 759 return ret; 760 761 queue->pdu_remaining -= rcv_len; 762 queue->pdu_offset += rcv_len; 763 *offset += rcv_len; 764 *len -= rcv_len; 765 if (queue->pdu_remaining) 766 return 0; 767 768 hdr = queue->pdu; 769 if (queue->hdr_digest) { 770 ret = nvme_tcp_verify_hdgst(queue, 
queue->pdu, hdr->hlen); 771 if (unlikely(ret)) 772 return ret; 773 } 774 775 776 if (queue->data_digest) { 777 ret = nvme_tcp_check_ddgst(queue, queue->pdu); 778 if (unlikely(ret)) 779 return ret; 780 } 781 782 switch (hdr->type) { 783 case nvme_tcp_c2h_data: 784 return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); 785 case nvme_tcp_rsp: 786 nvme_tcp_init_recv_ctx(queue); 787 return nvme_tcp_handle_comp(queue, (void *)queue->pdu); 788 case nvme_tcp_r2t: 789 nvme_tcp_init_recv_ctx(queue); 790 return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); 791 default: 792 dev_err(queue->ctrl->ctrl.device, 793 "unsupported pdu type (%d)\n", hdr->type); 794 return -EINVAL; 795 } 796 } 797 798 static inline void nvme_tcp_end_request(struct request *rq, u16 status) 799 { 800 union nvme_result res = {}; 801 802 if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res)) 803 nvme_complete_rq(rq); 804 } 805 806 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, 807 unsigned int *offset, size_t *len) 808 { 809 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; 810 struct request *rq = 811 nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id); 812 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 813 814 while (true) { 815 int recv_len, ret; 816 817 recv_len = min_t(size_t, *len, queue->data_remaining); 818 if (!recv_len) 819 break; 820 821 if (!iov_iter_count(&req->iter)) { 822 req->curr_bio = req->curr_bio->bi_next; 823 824 /* 825 * If we don`t have any bios it means that controller 826 * sent more data than we requested, hence error 827 */ 828 if (!req->curr_bio) { 829 dev_err(queue->ctrl->ctrl.device, 830 "queue %d no space in request %#x", 831 nvme_tcp_queue_id(queue), rq->tag); 832 nvme_tcp_init_recv_ctx(queue); 833 return -EIO; 834 } 835 nvme_tcp_init_iter(req, ITER_DEST); 836 } 837 838 /* we can read only from what is left in this bio */ 839 recv_len = min_t(size_t, recv_len, 840 iov_iter_count(&req->iter)); 841 842 if (queue->data_digest) 843 ret = skb_copy_and_hash_datagram_iter(skb, *offset, 844 &req->iter, recv_len, queue->rcv_hash); 845 else 846 ret = skb_copy_datagram_iter(skb, *offset, 847 &req->iter, recv_len); 848 if (ret) { 849 dev_err(queue->ctrl->ctrl.device, 850 "queue %d failed to copy request %#x data", 851 nvme_tcp_queue_id(queue), rq->tag); 852 return ret; 853 } 854 855 *len -= recv_len; 856 *offset += recv_len; 857 queue->data_remaining -= recv_len; 858 } 859 860 if (!queue->data_remaining) { 861 if (queue->data_digest) { 862 nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); 863 queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; 864 } else { 865 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { 866 nvme_tcp_end_request(rq, 867 le16_to_cpu(req->status)); 868 queue->nr_cqe++; 869 } 870 nvme_tcp_init_recv_ctx(queue); 871 } 872 } 873 874 return 0; 875 } 876 877 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, 878 struct sk_buff *skb, unsigned int *offset, size_t *len) 879 { 880 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; 881 char *ddgst = (char *)&queue->recv_ddgst; 882 size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining); 883 off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining; 884 int ret; 885 886 ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len); 887 if (unlikely(ret)) 888 return ret; 889 890 queue->ddgst_remaining -= recv_len; 891 *offset += recv_len; 892 *len -= recv_len; 893 if (queue->ddgst_remaining) 894 return 0; 895 896 if (queue->recv_ddgst != queue->exp_ddgst) { 897 struct request 
*rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), 898 pdu->command_id); 899 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 900 901 req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR); 902 903 dev_err(queue->ctrl->ctrl.device, 904 "data digest error: recv %#x expected %#x\n", 905 le32_to_cpu(queue->recv_ddgst), 906 le32_to_cpu(queue->exp_ddgst)); 907 } 908 909 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { 910 struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), 911 pdu->command_id); 912 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 913 914 nvme_tcp_end_request(rq, le16_to_cpu(req->status)); 915 queue->nr_cqe++; 916 } 917 918 nvme_tcp_init_recv_ctx(queue); 919 return 0; 920 } 921 922 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, 923 unsigned int offset, size_t len) 924 { 925 struct nvme_tcp_queue *queue = desc->arg.data; 926 size_t consumed = len; 927 int result; 928 929 if (unlikely(!queue->rd_enabled)) 930 return -EFAULT; 931 932 while (len) { 933 switch (nvme_tcp_recv_state(queue)) { 934 case NVME_TCP_RECV_PDU: 935 result = nvme_tcp_recv_pdu(queue, skb, &offset, &len); 936 break; 937 case NVME_TCP_RECV_DATA: 938 result = nvme_tcp_recv_data(queue, skb, &offset, &len); 939 break; 940 case NVME_TCP_RECV_DDGST: 941 result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len); 942 break; 943 default: 944 result = -EFAULT; 945 } 946 if (result) { 947 dev_err(queue->ctrl->ctrl.device, 948 "receive failed: %d\n", result); 949 queue->rd_enabled = false; 950 nvme_tcp_error_recovery(&queue->ctrl->ctrl); 951 return result; 952 } 953 } 954 955 return consumed; 956 } 957 958 static void nvme_tcp_data_ready(struct sock *sk) 959 { 960 struct nvme_tcp_queue *queue; 961 962 trace_sk_data_ready(sk); 963 964 read_lock_bh(&sk->sk_callback_lock); 965 queue = sk->sk_user_data; 966 if (likely(queue && queue->rd_enabled) && 967 !test_bit(NVME_TCP_Q_POLLING, &queue->flags)) 968 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 969 read_unlock_bh(&sk->sk_callback_lock); 970 } 971 972 static void nvme_tcp_write_space(struct sock *sk) 973 { 974 struct nvme_tcp_queue *queue; 975 976 read_lock_bh(&sk->sk_callback_lock); 977 queue = sk->sk_user_data; 978 if (likely(queue && sk_stream_is_writeable(sk))) { 979 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 980 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 981 } 982 read_unlock_bh(&sk->sk_callback_lock); 983 } 984 985 static void nvme_tcp_state_change(struct sock *sk) 986 { 987 struct nvme_tcp_queue *queue; 988 989 read_lock_bh(&sk->sk_callback_lock); 990 queue = sk->sk_user_data; 991 if (!queue) 992 goto done; 993 994 switch (sk->sk_state) { 995 case TCP_CLOSE: 996 case TCP_CLOSE_WAIT: 997 case TCP_LAST_ACK: 998 case TCP_FIN_WAIT1: 999 case TCP_FIN_WAIT2: 1000 nvme_tcp_error_recovery(&queue->ctrl->ctrl); 1001 break; 1002 default: 1003 dev_info(queue->ctrl->ctrl.device, 1004 "queue %d socket state %d\n", 1005 nvme_tcp_queue_id(queue), sk->sk_state); 1006 } 1007 1008 queue->state_change(sk); 1009 done: 1010 read_unlock_bh(&sk->sk_callback_lock); 1011 } 1012 1013 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) 1014 { 1015 queue->request = NULL; 1016 } 1017 1018 static void nvme_tcp_fail_request(struct nvme_tcp_request *req) 1019 { 1020 if (nvme_tcp_async_req(req)) { 1021 union nvme_result res = {}; 1022 1023 nvme_complete_async_event(&req->queue->ctrl->ctrl, 1024 cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res); 1025 } else { 1026 nvme_tcp_end_request(blk_mq_rq_from_pdu(req), 1027 
NVME_SC_HOST_PATH_ERROR); 1028 } 1029 } 1030 1031 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) 1032 { 1033 struct nvme_tcp_queue *queue = req->queue; 1034 int req_data_len = req->data_len; 1035 u32 h2cdata_left = req->h2cdata_left; 1036 1037 while (true) { 1038 struct bio_vec bvec; 1039 struct msghdr msg = { 1040 .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, 1041 }; 1042 struct page *page = nvme_tcp_req_cur_page(req); 1043 size_t offset = nvme_tcp_req_cur_offset(req); 1044 size_t len = nvme_tcp_req_cur_length(req); 1045 bool last = nvme_tcp_pdu_last_send(req, len); 1046 int req_data_sent = req->data_sent; 1047 int ret; 1048 1049 if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) 1050 msg.msg_flags |= MSG_EOR; 1051 else 1052 msg.msg_flags |= MSG_MORE; 1053 1054 if (!sendpage_ok(page)) 1055 msg.msg_flags &= ~MSG_SPLICE_PAGES; 1056 1057 bvec_set_page(&bvec, page, len, offset); 1058 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); 1059 ret = sock_sendmsg(queue->sock, &msg); 1060 if (ret <= 0) 1061 return ret; 1062 1063 if (queue->data_digest) 1064 nvme_tcp_ddgst_update(queue->snd_hash, page, 1065 offset, ret); 1066 1067 /* 1068 * update the request iterator except for the last payload send 1069 * in the request where we don't want to modify it as we may 1070 * compete with the RX path completing the request. 1071 */ 1072 if (req_data_sent + ret < req_data_len) 1073 nvme_tcp_advance_req(req, ret); 1074 1075 /* fully successful last send in current PDU */ 1076 if (last && ret == len) { 1077 if (queue->data_digest) { 1078 nvme_tcp_ddgst_final(queue->snd_hash, 1079 &req->ddgst); 1080 req->state = NVME_TCP_SEND_DDGST; 1081 req->offset = 0; 1082 } else { 1083 if (h2cdata_left) 1084 nvme_tcp_setup_h2c_data_pdu(req); 1085 else 1086 nvme_tcp_done_send_req(queue); 1087 } 1088 return 1; 1089 } 1090 } 1091 return -EAGAIN; 1092 } 1093 1094 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) 1095 { 1096 struct nvme_tcp_queue *queue = req->queue; 1097 struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); 1098 struct bio_vec bvec; 1099 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; 1100 bool inline_data = nvme_tcp_has_inline_data(req); 1101 u8 hdgst = nvme_tcp_hdgst_len(queue); 1102 int len = sizeof(*pdu) + hdgst - req->offset; 1103 int ret; 1104 1105 if (inline_data || nvme_tcp_queue_more(queue)) 1106 msg.msg_flags |= MSG_MORE; 1107 else 1108 msg.msg_flags |= MSG_EOR; 1109 1110 if (queue->hdr_digest && !req->offset) 1111 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); 1112 1113 bvec_set_virt(&bvec, (void *)pdu + req->offset, len); 1114 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); 1115 ret = sock_sendmsg(queue->sock, &msg); 1116 if (unlikely(ret <= 0)) 1117 return ret; 1118 1119 len -= ret; 1120 if (!len) { 1121 if (inline_data) { 1122 req->state = NVME_TCP_SEND_DATA; 1123 if (queue->data_digest) 1124 crypto_ahash_init(queue->snd_hash); 1125 } else { 1126 nvme_tcp_done_send_req(queue); 1127 } 1128 return 1; 1129 } 1130 req->offset += ret; 1131 1132 return -EAGAIN; 1133 } 1134 1135 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) 1136 { 1137 struct nvme_tcp_queue *queue = req->queue; 1138 struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req); 1139 struct bio_vec bvec; 1140 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, }; 1141 u8 hdgst = nvme_tcp_hdgst_len(queue); 1142 int len = sizeof(*pdu) - req->offset + hdgst; 1143 int ret; 1144 1145 if (queue->hdr_digest && 
!req->offset) 1146 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); 1147 1148 if (!req->h2cdata_left) 1149 msg.msg_flags |= MSG_SPLICE_PAGES; 1150 1151 bvec_set_virt(&bvec, (void *)pdu + req->offset, len); 1152 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); 1153 ret = sock_sendmsg(queue->sock, &msg); 1154 if (unlikely(ret <= 0)) 1155 return ret; 1156 1157 len -= ret; 1158 if (!len) { 1159 req->state = NVME_TCP_SEND_DATA; 1160 if (queue->data_digest) 1161 crypto_ahash_init(queue->snd_hash); 1162 return 1; 1163 } 1164 req->offset += ret; 1165 1166 return -EAGAIN; 1167 } 1168 1169 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) 1170 { 1171 struct nvme_tcp_queue *queue = req->queue; 1172 size_t offset = req->offset; 1173 u32 h2cdata_left = req->h2cdata_left; 1174 int ret; 1175 struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; 1176 struct kvec iov = { 1177 .iov_base = (u8 *)&req->ddgst + req->offset, 1178 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset 1179 }; 1180 1181 if (nvme_tcp_queue_more(queue)) 1182 msg.msg_flags |= MSG_MORE; 1183 else 1184 msg.msg_flags |= MSG_EOR; 1185 1186 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); 1187 if (unlikely(ret <= 0)) 1188 return ret; 1189 1190 if (offset + ret == NVME_TCP_DIGEST_LENGTH) { 1191 if (h2cdata_left) 1192 nvme_tcp_setup_h2c_data_pdu(req); 1193 else 1194 nvme_tcp_done_send_req(queue); 1195 return 1; 1196 } 1197 1198 req->offset += ret; 1199 return -EAGAIN; 1200 } 1201 1202 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) 1203 { 1204 struct nvme_tcp_request *req; 1205 unsigned int noreclaim_flag; 1206 int ret = 1; 1207 1208 if (!queue->request) { 1209 queue->request = nvme_tcp_fetch_request(queue); 1210 if (!queue->request) 1211 return 0; 1212 } 1213 req = queue->request; 1214 1215 noreclaim_flag = memalloc_noreclaim_save(); 1216 if (req->state == NVME_TCP_SEND_CMD_PDU) { 1217 ret = nvme_tcp_try_send_cmd_pdu(req); 1218 if (ret <= 0) 1219 goto done; 1220 if (!nvme_tcp_has_inline_data(req)) 1221 goto out; 1222 } 1223 1224 if (req->state == NVME_TCP_SEND_H2C_PDU) { 1225 ret = nvme_tcp_try_send_data_pdu(req); 1226 if (ret <= 0) 1227 goto done; 1228 } 1229 1230 if (req->state == NVME_TCP_SEND_DATA) { 1231 ret = nvme_tcp_try_send_data(req); 1232 if (ret <= 0) 1233 goto done; 1234 } 1235 1236 if (req->state == NVME_TCP_SEND_DDGST) 1237 ret = nvme_tcp_try_send_ddgst(req); 1238 done: 1239 if (ret == -EAGAIN) { 1240 ret = 0; 1241 } else if (ret < 0) { 1242 dev_err(queue->ctrl->ctrl.device, 1243 "failed to send request %d\n", ret); 1244 nvme_tcp_fail_request(queue->request); 1245 nvme_tcp_done_send_req(queue); 1246 } 1247 out: 1248 memalloc_noreclaim_restore(noreclaim_flag); 1249 return ret; 1250 } 1251 1252 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) 1253 { 1254 struct socket *sock = queue->sock; 1255 struct sock *sk = sock->sk; 1256 read_descriptor_t rd_desc; 1257 int consumed; 1258 1259 rd_desc.arg.data = queue; 1260 rd_desc.count = 1; 1261 lock_sock(sk); 1262 queue->nr_cqe = 0; 1263 consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); 1264 release_sock(sk); 1265 return consumed; 1266 } 1267 1268 static void nvme_tcp_io_work(struct work_struct *w) 1269 { 1270 struct nvme_tcp_queue *queue = 1271 container_of(w, struct nvme_tcp_queue, io_work); 1272 unsigned long deadline = jiffies + msecs_to_jiffies(1); 1273 1274 do { 1275 bool pending = false; 1276 int result; 1277 1278 if (mutex_trylock(&queue->send_mutex)) { 1279 result = nvme_tcp_try_send(queue); 1280 
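			/*
			 * A positive return means some bytes were sent and
			 * more may be pending; a negative return is a hard
			 * send error, which terminates the polling loop below.
			 */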
mutex_unlock(&queue->send_mutex); 1281 if (result > 0) 1282 pending = true; 1283 else if (unlikely(result < 0)) 1284 break; 1285 } 1286 1287 result = nvme_tcp_try_recv(queue); 1288 if (result > 0) 1289 pending = true; 1290 else if (unlikely(result < 0)) 1291 return; 1292 1293 if (!pending || !queue->rd_enabled) 1294 return; 1295 1296 } while (!time_after(jiffies, deadline)); /* quota is exhausted */ 1297 1298 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 1299 } 1300 1301 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) 1302 { 1303 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); 1304 1305 ahash_request_free(queue->rcv_hash); 1306 ahash_request_free(queue->snd_hash); 1307 crypto_free_ahash(tfm); 1308 } 1309 1310 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) 1311 { 1312 struct crypto_ahash *tfm; 1313 1314 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); 1315 if (IS_ERR(tfm)) 1316 return PTR_ERR(tfm); 1317 1318 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); 1319 if (!queue->snd_hash) 1320 goto free_tfm; 1321 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); 1322 1323 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); 1324 if (!queue->rcv_hash) 1325 goto free_snd_hash; 1326 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); 1327 1328 return 0; 1329 free_snd_hash: 1330 ahash_request_free(queue->snd_hash); 1331 free_tfm: 1332 crypto_free_ahash(tfm); 1333 return -ENOMEM; 1334 } 1335 1336 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) 1337 { 1338 struct nvme_tcp_request *async = &ctrl->async_req; 1339 1340 page_frag_free(async->pdu); 1341 } 1342 1343 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl) 1344 { 1345 struct nvme_tcp_queue *queue = &ctrl->queues[0]; 1346 struct nvme_tcp_request *async = &ctrl->async_req; 1347 u8 hdgst = nvme_tcp_hdgst_len(queue); 1348 1349 async->pdu = page_frag_alloc(&queue->pf_cache, 1350 sizeof(struct nvme_tcp_cmd_pdu) + hdgst, 1351 GFP_KERNEL | __GFP_ZERO); 1352 if (!async->pdu) 1353 return -ENOMEM; 1354 1355 async->queue = &ctrl->queues[0]; 1356 return 0; 1357 } 1358 1359 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) 1360 { 1361 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1362 struct nvme_tcp_queue *queue = &ctrl->queues[qid]; 1363 unsigned int noreclaim_flag; 1364 1365 if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) 1366 return; 1367 1368 if (queue->hdr_digest || queue->data_digest) 1369 nvme_tcp_free_crypto(queue); 1370 1371 page_frag_cache_drain(&queue->pf_cache); 1372 1373 noreclaim_flag = memalloc_noreclaim_save(); 1374 /* ->sock will be released by fput() */ 1375 fput(queue->sock->file); 1376 queue->sock = NULL; 1377 memalloc_noreclaim_restore(noreclaim_flag); 1378 1379 kfree(queue->pdu); 1380 mutex_destroy(&queue->send_mutex); 1381 mutex_destroy(&queue->queue_lock); 1382 } 1383 1384 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) 1385 { 1386 struct nvme_tcp_icreq_pdu *icreq; 1387 struct nvme_tcp_icresp_pdu *icresp; 1388 char cbuf[CMSG_LEN(sizeof(char))] = {}; 1389 u8 ctype; 1390 struct msghdr msg = {}; 1391 struct kvec iov; 1392 bool ctrl_hdgst, ctrl_ddgst; 1393 u32 maxh2cdata; 1394 int ret; 1395 1396 icreq = kzalloc(sizeof(*icreq), GFP_KERNEL); 1397 if (!icreq) 1398 return -ENOMEM; 1399 1400 icresp = kzalloc(sizeof(*icresp), GFP_KERNEL); 1401 if (!icresp) { 1402 ret = -ENOMEM; 1403 goto free_icreq; 1404 } 1405 1406 icreq->hdr.type = nvme_tcp_icreq; 1407 icreq->hdr.hlen = 
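	/*
	 * Connection setup in brief: a single ICReq PDU advertising PFV 1.0,
	 * the digest options and HPDA/MAXR2T goes out first, and exactly one
	 * ICResp must come back (and is validated below) before any command
	 * capsule may be sent on this queue.
	 */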
sizeof(*icreq); 1408 icreq->hdr.pdo = 0; 1409 icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen); 1410 icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); 1411 icreq->maxr2t = 0; /* single inflight r2t supported */ 1412 icreq->hpda = 0; /* no alignment constraint */ 1413 if (queue->hdr_digest) 1414 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE; 1415 if (queue->data_digest) 1416 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE; 1417 1418 iov.iov_base = icreq; 1419 iov.iov_len = sizeof(*icreq); 1420 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); 1421 if (ret < 0) { 1422 pr_warn("queue %d: failed to send icreq, error %d\n", 1423 nvme_tcp_queue_id(queue), ret); 1424 goto free_icresp; 1425 } 1426 1427 memset(&msg, 0, sizeof(msg)); 1428 iov.iov_base = icresp; 1429 iov.iov_len = sizeof(*icresp); 1430 if (nvme_tcp_tls(&queue->ctrl->ctrl)) { 1431 msg.msg_control = cbuf; 1432 msg.msg_controllen = sizeof(cbuf); 1433 } 1434 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, 1435 iov.iov_len, msg.msg_flags); 1436 if (ret < 0) { 1437 pr_warn("queue %d: failed to receive icresp, error %d\n", 1438 nvme_tcp_queue_id(queue), ret); 1439 goto free_icresp; 1440 } 1441 ret = -ENOTCONN; 1442 if (nvme_tcp_tls(&queue->ctrl->ctrl)) { 1443 ctype = tls_get_record_type(queue->sock->sk, 1444 (struct cmsghdr *)cbuf); 1445 if (ctype != TLS_RECORD_TYPE_DATA) { 1446 pr_err("queue %d: unhandled TLS record %d\n", 1447 nvme_tcp_queue_id(queue), ctype); 1448 goto free_icresp; 1449 } 1450 } 1451 ret = -EINVAL; 1452 if (icresp->hdr.type != nvme_tcp_icresp) { 1453 pr_err("queue %d: bad type returned %d\n", 1454 nvme_tcp_queue_id(queue), icresp->hdr.type); 1455 goto free_icresp; 1456 } 1457 1458 if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) { 1459 pr_err("queue %d: bad pdu length returned %d\n", 1460 nvme_tcp_queue_id(queue), icresp->hdr.plen); 1461 goto free_icresp; 1462 } 1463 1464 if (icresp->pfv != NVME_TCP_PFV_1_0) { 1465 pr_err("queue %d: bad pfv returned %d\n", 1466 nvme_tcp_queue_id(queue), icresp->pfv); 1467 goto free_icresp; 1468 } 1469 1470 ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE); 1471 if ((queue->data_digest && !ctrl_ddgst) || 1472 (!queue->data_digest && ctrl_ddgst)) { 1473 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n", 1474 nvme_tcp_queue_id(queue), 1475 queue->data_digest ? "enabled" : "disabled", 1476 ctrl_ddgst ? "enabled" : "disabled"); 1477 goto free_icresp; 1478 } 1479 1480 ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE); 1481 if ((queue->hdr_digest && !ctrl_hdgst) || 1482 (!queue->hdr_digest && ctrl_hdgst)) { 1483 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n", 1484 nvme_tcp_queue_id(queue), 1485 queue->hdr_digest ? "enabled" : "disabled", 1486 ctrl_hdgst ? 
"enabled" : "disabled"); 1487 goto free_icresp; 1488 } 1489 1490 if (icresp->cpda != 0) { 1491 pr_err("queue %d: unsupported cpda returned %d\n", 1492 nvme_tcp_queue_id(queue), icresp->cpda); 1493 goto free_icresp; 1494 } 1495 1496 maxh2cdata = le32_to_cpu(icresp->maxdata); 1497 if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) { 1498 pr_err("queue %d: invalid maxh2cdata returned %u\n", 1499 nvme_tcp_queue_id(queue), maxh2cdata); 1500 goto free_icresp; 1501 } 1502 queue->maxh2cdata = maxh2cdata; 1503 1504 ret = 0; 1505 free_icresp: 1506 kfree(icresp); 1507 free_icreq: 1508 kfree(icreq); 1509 return ret; 1510 } 1511 1512 static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue) 1513 { 1514 return nvme_tcp_queue_id(queue) == 0; 1515 } 1516 1517 static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue) 1518 { 1519 struct nvme_tcp_ctrl *ctrl = queue->ctrl; 1520 int qid = nvme_tcp_queue_id(queue); 1521 1522 return !nvme_tcp_admin_queue(queue) && 1523 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT]; 1524 } 1525 1526 static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue) 1527 { 1528 struct nvme_tcp_ctrl *ctrl = queue->ctrl; 1529 int qid = nvme_tcp_queue_id(queue); 1530 1531 return !nvme_tcp_admin_queue(queue) && 1532 !nvme_tcp_default_queue(queue) && 1533 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + 1534 ctrl->io_queues[HCTX_TYPE_READ]; 1535 } 1536 1537 static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) 1538 { 1539 struct nvme_tcp_ctrl *ctrl = queue->ctrl; 1540 int qid = nvme_tcp_queue_id(queue); 1541 1542 return !nvme_tcp_admin_queue(queue) && 1543 !nvme_tcp_default_queue(queue) && 1544 !nvme_tcp_read_queue(queue) && 1545 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + 1546 ctrl->io_queues[HCTX_TYPE_READ] + 1547 ctrl->io_queues[HCTX_TYPE_POLL]; 1548 } 1549 1550 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) 1551 { 1552 struct nvme_tcp_ctrl *ctrl = queue->ctrl; 1553 int qid = nvme_tcp_queue_id(queue); 1554 int n = 0; 1555 1556 if (nvme_tcp_default_queue(queue)) 1557 n = qid - 1; 1558 else if (nvme_tcp_read_queue(queue)) 1559 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1; 1560 else if (nvme_tcp_poll_queue(queue)) 1561 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1562 ctrl->io_queues[HCTX_TYPE_READ] - 1; 1563 if (wq_unbound) 1564 queue->io_cpu = WORK_CPU_UNBOUND; 1565 else 1566 queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); 1567 } 1568 1569 static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) 1570 { 1571 struct nvme_tcp_queue *queue = data; 1572 struct nvme_tcp_ctrl *ctrl = queue->ctrl; 1573 int qid = nvme_tcp_queue_id(queue); 1574 struct key *tls_key; 1575 1576 dev_dbg(ctrl->ctrl.device, "queue %d: TLS handshake done, key %x, status %d\n", 1577 qid, pskid, status); 1578 1579 if (status) { 1580 queue->tls_err = -status; 1581 goto out_complete; 1582 } 1583 1584 tls_key = key_lookup(pskid); 1585 if (IS_ERR(tls_key)) { 1586 dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n", 1587 qid, pskid); 1588 queue->tls_err = -ENOKEY; 1589 } else { 1590 ctrl->ctrl.tls_key = tls_key; 1591 queue->tls_err = 0; 1592 } 1593 1594 out_complete: 1595 complete(&queue->tls_complete); 1596 } 1597 1598 static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, 1599 struct nvme_tcp_queue *queue, 1600 key_serial_t pskid) 1601 { 1602 int qid = nvme_tcp_queue_id(queue); 1603 int ret; 1604 struct tls_handshake_args args; 1605 unsigned long tmo = tls_handshake_timeout * HZ; 1606 key_serial_t keyring = 
nvme_keyring_id(); 1607 1608 dev_dbg(nctrl->device, "queue %d: start TLS with key %x\n", 1609 qid, pskid); 1610 memset(&args, 0, sizeof(args)); 1611 args.ta_sock = queue->sock; 1612 args.ta_done = nvme_tcp_tls_done; 1613 args.ta_data = queue; 1614 args.ta_my_peerids[0] = pskid; 1615 args.ta_num_peerids = 1; 1616 if (nctrl->opts->keyring) 1617 keyring = key_serial(nctrl->opts->keyring); 1618 args.ta_keyring = keyring; 1619 args.ta_timeout_ms = tls_handshake_timeout * 1000; 1620 queue->tls_err = -EOPNOTSUPP; 1621 init_completion(&queue->tls_complete); 1622 ret = tls_client_hello_psk(&args, GFP_KERNEL); 1623 if (ret) { 1624 dev_err(nctrl->device, "queue %d: failed to start TLS: %d\n", 1625 qid, ret); 1626 return ret; 1627 } 1628 ret = wait_for_completion_interruptible_timeout(&queue->tls_complete, tmo); 1629 if (ret <= 0) { 1630 if (ret == 0) 1631 ret = -ETIMEDOUT; 1632 1633 dev_err(nctrl->device, 1634 "queue %d: TLS handshake failed, error %d\n", 1635 qid, ret); 1636 tls_handshake_cancel(queue->sock->sk); 1637 } else { 1638 dev_dbg(nctrl->device, 1639 "queue %d: TLS handshake complete, error %d\n", 1640 qid, queue->tls_err); 1641 ret = queue->tls_err; 1642 } 1643 return ret; 1644 } 1645 1646 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, 1647 key_serial_t pskid) 1648 { 1649 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1650 struct nvme_tcp_queue *queue = &ctrl->queues[qid]; 1651 int ret, rcv_pdu_size; 1652 struct file *sock_file; 1653 1654 mutex_init(&queue->queue_lock); 1655 queue->ctrl = ctrl; 1656 init_llist_head(&queue->req_list); 1657 INIT_LIST_HEAD(&queue->send_list); 1658 mutex_init(&queue->send_mutex); 1659 INIT_WORK(&queue->io_work, nvme_tcp_io_work); 1660 1661 if (qid > 0) 1662 queue->cmnd_capsule_len = nctrl->ioccsz * 16; 1663 else 1664 queue->cmnd_capsule_len = sizeof(struct nvme_command) + 1665 NVME_TCP_ADMIN_CCSZ; 1666 1667 ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, 1668 IPPROTO_TCP, &queue->sock); 1669 if (ret) { 1670 dev_err(nctrl->device, 1671 "failed to create socket: %d\n", ret); 1672 goto err_destroy_mutex; 1673 } 1674 1675 sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL); 1676 if (IS_ERR(sock_file)) { 1677 ret = PTR_ERR(sock_file); 1678 goto err_destroy_mutex; 1679 } 1680 nvme_tcp_reclassify_socket(queue->sock); 1681 1682 /* Single syn retry */ 1683 tcp_sock_set_syncnt(queue->sock->sk, 1); 1684 1685 /* Set TCP no delay */ 1686 tcp_sock_set_nodelay(queue->sock->sk); 1687 1688 /* 1689 * Cleanup whatever is sitting in the TCP transmit queue on socket 1690 * close. This is done to prevent stale data from being sent should 1691 * the network connection be restored before TCP times out. 
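 *
 * (sock_no_linger() does this by enabling SO_LINGER with a zero timeout,
 * so a close aborts the connection rather than lingering on unsent data.)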
1692 */ 1693 sock_no_linger(queue->sock->sk); 1694 1695 if (so_priority > 0) 1696 sock_set_priority(queue->sock->sk, so_priority); 1697 1698 /* Set socket type of service */ 1699 if (nctrl->opts->tos >= 0) 1700 ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos); 1701 1702 /* Set 10 seconds timeout for icresp recvmsg */ 1703 queue->sock->sk->sk_rcvtimeo = 10 * HZ; 1704 1705 queue->sock->sk->sk_allocation = GFP_ATOMIC; 1706 queue->sock->sk->sk_use_task_frag = false; 1707 nvme_tcp_set_queue_io_cpu(queue); 1708 queue->request = NULL; 1709 queue->data_remaining = 0; 1710 queue->ddgst_remaining = 0; 1711 queue->pdu_remaining = 0; 1712 queue->pdu_offset = 0; 1713 sk_set_memalloc(queue->sock->sk); 1714 1715 if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) { 1716 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, 1717 sizeof(ctrl->src_addr)); 1718 if (ret) { 1719 dev_err(nctrl->device, 1720 "failed to bind queue %d socket %d\n", 1721 qid, ret); 1722 goto err_sock; 1723 } 1724 } 1725 1726 if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) { 1727 char *iface = nctrl->opts->host_iface; 1728 sockptr_t optval = KERNEL_SOCKPTR(iface); 1729 1730 ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE, 1731 optval, strlen(iface)); 1732 if (ret) { 1733 dev_err(nctrl->device, 1734 "failed to bind to interface %s queue %d err %d\n", 1735 iface, qid, ret); 1736 goto err_sock; 1737 } 1738 } 1739 1740 queue->hdr_digest = nctrl->opts->hdr_digest; 1741 queue->data_digest = nctrl->opts->data_digest; 1742 if (queue->hdr_digest || queue->data_digest) { 1743 ret = nvme_tcp_alloc_crypto(queue); 1744 if (ret) { 1745 dev_err(nctrl->device, 1746 "failed to allocate queue %d crypto\n", qid); 1747 goto err_sock; 1748 } 1749 } 1750 1751 rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + 1752 nvme_tcp_hdgst_len(queue); 1753 queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); 1754 if (!queue->pdu) { 1755 ret = -ENOMEM; 1756 goto err_crypto; 1757 } 1758 1759 dev_dbg(nctrl->device, "connecting queue %d\n", 1760 nvme_tcp_queue_id(queue)); 1761 1762 ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, 1763 sizeof(ctrl->addr), 0); 1764 if (ret) { 1765 dev_err(nctrl->device, 1766 "failed to connect socket: %d\n", ret); 1767 goto err_rcv_pdu; 1768 } 1769 1770 /* If PSKs are configured try to start TLS */ 1771 if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) { 1772 ret = nvme_tcp_start_tls(nctrl, queue, pskid); 1773 if (ret) 1774 goto err_init_connect; 1775 } 1776 1777 ret = nvme_tcp_init_connection(queue); 1778 if (ret) 1779 goto err_init_connect; 1780 1781 set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags); 1782 1783 return 0; 1784 1785 err_init_connect: 1786 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 1787 err_rcv_pdu: 1788 kfree(queue->pdu); 1789 err_crypto: 1790 if (queue->hdr_digest || queue->data_digest) 1791 nvme_tcp_free_crypto(queue); 1792 err_sock: 1793 /* ->sock will be released by fput() */ 1794 fput(queue->sock->file); 1795 queue->sock = NULL; 1796 err_destroy_mutex: 1797 mutex_destroy(&queue->send_mutex); 1798 mutex_destroy(&queue->queue_lock); 1799 return ret; 1800 } 1801 1802 static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue) 1803 { 1804 struct socket *sock = queue->sock; 1805 1806 write_lock_bh(&sock->sk->sk_callback_lock); 1807 sock->sk->sk_user_data = NULL; 1808 sock->sk->sk_data_ready = queue->data_ready; 1809 sock->sk->sk_state_change = queue->state_change; 1810 sock->sk->sk_write_space = queue->write_space; 1811 write_unlock_bh(&sock->sk->sk_callback_lock); 1812 } 1813 1814 static 
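/*
 * Stopping a queue is ordered so that no new work can slip in: shut the
 * socket down first, restore the original sk callbacks, and only then wait
 * for any io_work that is still running to finish.
 */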
void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) 1815 { 1816 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 1817 nvme_tcp_restore_sock_ops(queue); 1818 cancel_work_sync(&queue->io_work); 1819 } 1820 1821 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) 1822 { 1823 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1824 struct nvme_tcp_queue *queue = &ctrl->queues[qid]; 1825 1826 if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) 1827 return; 1828 1829 mutex_lock(&queue->queue_lock); 1830 if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) 1831 __nvme_tcp_stop_queue(queue); 1832 mutex_unlock(&queue->queue_lock); 1833 } 1834 1835 static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue) 1836 { 1837 write_lock_bh(&queue->sock->sk->sk_callback_lock); 1838 queue->sock->sk->sk_user_data = queue; 1839 queue->state_change = queue->sock->sk->sk_state_change; 1840 queue->data_ready = queue->sock->sk->sk_data_ready; 1841 queue->write_space = queue->sock->sk->sk_write_space; 1842 queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; 1843 queue->sock->sk->sk_state_change = nvme_tcp_state_change; 1844 queue->sock->sk->sk_write_space = nvme_tcp_write_space; 1845 #ifdef CONFIG_NET_RX_BUSY_POLL 1846 queue->sock->sk->sk_ll_usec = 1; 1847 #endif 1848 write_unlock_bh(&queue->sock->sk->sk_callback_lock); 1849 } 1850 1851 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) 1852 { 1853 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1854 struct nvme_tcp_queue *queue = &ctrl->queues[idx]; 1855 int ret; 1856 1857 queue->rd_enabled = true; 1858 nvme_tcp_init_recv_ctx(queue); 1859 nvme_tcp_setup_sock_ops(queue); 1860 1861 if (idx) 1862 ret = nvmf_connect_io_queue(nctrl, idx); 1863 else 1864 ret = nvmf_connect_admin_queue(nctrl); 1865 1866 if (!ret) { 1867 set_bit(NVME_TCP_Q_LIVE, &queue->flags); 1868 } else { 1869 if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) 1870 __nvme_tcp_stop_queue(queue); 1871 dev_err(nctrl->device, 1872 "failed to connect queue: %d ret=%d\n", idx, ret); 1873 } 1874 return ret; 1875 } 1876 1877 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) 1878 { 1879 if (to_tcp_ctrl(ctrl)->async_req.pdu) { 1880 cancel_work_sync(&ctrl->async_event_work); 1881 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl)); 1882 to_tcp_ctrl(ctrl)->async_req.pdu = NULL; 1883 } 1884 1885 nvme_tcp_free_queue(ctrl, 0); 1886 } 1887 1888 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl) 1889 { 1890 int i; 1891 1892 for (i = 1; i < ctrl->queue_count; i++) 1893 nvme_tcp_free_queue(ctrl, i); 1894 } 1895 1896 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) 1897 { 1898 int i; 1899 1900 for (i = 1; i < ctrl->queue_count; i++) 1901 nvme_tcp_stop_queue(ctrl, i); 1902 } 1903 1904 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl, 1905 int first, int last) 1906 { 1907 int i, ret; 1908 1909 for (i = first; i < last; i++) { 1910 ret = nvme_tcp_start_queue(ctrl, i); 1911 if (ret) 1912 goto out_stop_queues; 1913 } 1914 1915 return 0; 1916 1917 out_stop_queues: 1918 for (i--; i >= first; i--) 1919 nvme_tcp_stop_queue(ctrl, i); 1920 return ret; 1921 } 1922 1923 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) 1924 { 1925 int ret; 1926 key_serial_t pskid = 0; 1927 1928 if (nvme_tcp_tls(ctrl)) { 1929 if (ctrl->opts->tls_key) 1930 pskid = key_serial(ctrl->opts->tls_key); 1931 else 1932 pskid = nvme_tls_psk_default(ctrl->opts->keyring, 1933 ctrl->opts->host->nqn, 1934 ctrl->opts->subsysnqn); 1935 if (!pskid) { 1936 dev_err(ctrl->device, "no 
valid PSK found\n"); 1937 return -ENOKEY; 1938 } 1939 } 1940 1941 ret = nvme_tcp_alloc_queue(ctrl, 0, pskid); 1942 if (ret) 1943 return ret; 1944 1945 ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl)); 1946 if (ret) 1947 goto out_free_queue; 1948 1949 return 0; 1950 1951 out_free_queue: 1952 nvme_tcp_free_queue(ctrl, 0); 1953 return ret; 1954 } 1955 1956 static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) 1957 { 1958 int i, ret; 1959 1960 if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) { 1961 dev_err(ctrl->device, "no PSK negotiated\n"); 1962 return -ENOKEY; 1963 } 1964 for (i = 1; i < ctrl->queue_count; i++) { 1965 ret = nvme_tcp_alloc_queue(ctrl, i, 1966 key_serial(ctrl->tls_key)); 1967 if (ret) 1968 goto out_free_queues; 1969 } 1970 1971 return 0; 1972 1973 out_free_queues: 1974 for (i--; i >= 1; i--) 1975 nvme_tcp_free_queue(ctrl, i); 1976 1977 return ret; 1978 } 1979 1980 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) 1981 { 1982 unsigned int nr_io_queues; 1983 int ret; 1984 1985 nr_io_queues = nvmf_nr_io_queues(ctrl->opts); 1986 ret = nvme_set_queue_count(ctrl, &nr_io_queues); 1987 if (ret) 1988 return ret; 1989 1990 if (nr_io_queues == 0) { 1991 dev_err(ctrl->device, 1992 "unable to set any I/O queues\n"); 1993 return -ENOMEM; 1994 } 1995 1996 ctrl->queue_count = nr_io_queues + 1; 1997 dev_info(ctrl->device, 1998 "creating %d I/O queues.\n", nr_io_queues); 1999 2000 nvmf_set_io_queues(ctrl->opts, nr_io_queues, 2001 to_tcp_ctrl(ctrl)->io_queues); 2002 return __nvme_tcp_alloc_io_queues(ctrl); 2003 } 2004 2005 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) 2006 { 2007 nvme_tcp_stop_io_queues(ctrl); 2008 if (remove) 2009 nvme_remove_io_tag_set(ctrl); 2010 nvme_tcp_free_io_queues(ctrl); 2011 } 2012 2013 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) 2014 { 2015 int ret, nr_queues; 2016 2017 ret = nvme_tcp_alloc_io_queues(ctrl); 2018 if (ret) 2019 return ret; 2020 2021 if (new) { 2022 ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set, 2023 &nvme_tcp_mq_ops, 2024 ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2, 2025 sizeof(struct nvme_tcp_request)); 2026 if (ret) 2027 goto out_free_io_queues; 2028 } 2029 2030 /* 2031 * Only start IO queues for which we have allocated the tagset 2032 * and limitted it to the available queues. On reconnects, the 2033 * queue number might have changed. 2034 */ 2035 nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count); 2036 ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues); 2037 if (ret) 2038 goto out_cleanup_connect_q; 2039 2040 if (!new) { 2041 nvme_start_freeze(ctrl); 2042 nvme_unquiesce_io_queues(ctrl); 2043 if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { 2044 /* 2045 * If we timed out waiting for freeze we are likely to 2046 * be stuck. Fail the controller initialization just 2047 * to be safe. 2048 */ 2049 ret = -ENODEV; 2050 nvme_unfreeze(ctrl); 2051 goto out_wait_freeze_timed_out; 2052 } 2053 blk_mq_update_nr_hw_queues(ctrl->tagset, 2054 ctrl->queue_count - 1); 2055 nvme_unfreeze(ctrl); 2056 } 2057 2058 /* 2059 * If the number of queues has increased (reconnect case) 2060 * start all new queues now. 
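 *
 * Put differently: queues up to the previously advertised tagset size were
 * started above; any additional queues only become visible once
 * blk_mq_update_nr_hw_queues() has grown the tagset, so they are started
 * here.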
static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
{
        int ret, nr_queues;

        ret = nvme_tcp_alloc_io_queues(ctrl);
        if (ret)
                return ret;

        if (new) {
                ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
                                &nvme_tcp_mq_ops,
                                ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
                                sizeof(struct nvme_tcp_request));
                if (ret)
                        goto out_free_io_queues;
        }

        /*
         * Only start IO queues for which we have allocated the tagset
         * and limited it to the available queues. On reconnects, the
         * queue number might have changed.
         */
        nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
        ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues);
        if (ret)
                goto out_cleanup_connect_q;

        if (!new) {
                nvme_start_freeze(ctrl);
                nvme_unquiesce_io_queues(ctrl);
                if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
                        /*
                         * If we timed out waiting for freeze we are likely to
                         * be stuck.  Fail the controller initialization just
                         * to be safe.
                         */
                        ret = -ENODEV;
                        nvme_unfreeze(ctrl);
                        goto out_wait_freeze_timed_out;
                }
                blk_mq_update_nr_hw_queues(ctrl->tagset,
                                ctrl->queue_count - 1);
                nvme_unfreeze(ctrl);
        }

        /*
         * If the number of queues has increased (reconnect case)
         * start all new queues now.
         */
        ret = nvme_tcp_start_io_queues(ctrl, nr_queues,
                                       ctrl->tagset->nr_hw_queues + 1);
        if (ret)
                goto out_wait_freeze_timed_out;

        return 0;

out_wait_freeze_timed_out:
        nvme_quiesce_io_queues(ctrl);
        nvme_sync_io_queues(ctrl);
        nvme_tcp_stop_io_queues(ctrl);
out_cleanup_connect_q:
        nvme_cancel_tagset(ctrl);
        if (new)
                nvme_remove_io_tag_set(ctrl);
out_free_io_queues:
        nvme_tcp_free_io_queues(ctrl);
        return ret;
}

static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
{
        nvme_tcp_stop_queue(ctrl, 0);
        if (remove)
                nvme_remove_admin_tag_set(ctrl);
        nvme_tcp_free_admin_queue(ctrl);
}

static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
{
        int error;

        error = nvme_tcp_alloc_admin_queue(ctrl);
        if (error)
                return error;

        if (new) {
                error = nvme_alloc_admin_tag_set(ctrl,
                                &to_tcp_ctrl(ctrl)->admin_tag_set,
                                &nvme_tcp_admin_mq_ops,
                                sizeof(struct nvme_tcp_request));
                if (error)
                        goto out_free_queue;
        }

        error = nvme_tcp_start_queue(ctrl, 0);
        if (error)
                goto out_cleanup_tagset;

        error = nvme_enable_ctrl(ctrl);
        if (error)
                goto out_stop_queue;

        nvme_unquiesce_admin_queue(ctrl);

        error = nvme_init_ctrl_finish(ctrl, false);
        if (error)
                goto out_quiesce_queue;

        return 0;

out_quiesce_queue:
        nvme_quiesce_admin_queue(ctrl);
        blk_sync_queue(ctrl->admin_q);
out_stop_queue:
        nvme_tcp_stop_queue(ctrl, 0);
        nvme_cancel_admin_tagset(ctrl);
out_cleanup_tagset:
        if (new)
                nvme_remove_admin_tag_set(ctrl);
out_free_queue:
        nvme_tcp_free_admin_queue(ctrl);
        return error;
}

static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
                bool remove)
{
        nvme_quiesce_admin_queue(ctrl);
        blk_sync_queue(ctrl->admin_q);
        nvme_tcp_stop_queue(ctrl, 0);
        nvme_cancel_admin_tagset(ctrl);
        if (remove)
                nvme_unquiesce_admin_queue(ctrl);
        nvme_tcp_destroy_admin_queue(ctrl, remove);
}

static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
                bool remove)
{
        if (ctrl->queue_count <= 1)
                return;
        nvme_quiesce_admin_queue(ctrl);
        nvme_quiesce_io_queues(ctrl);
        nvme_sync_io_queues(ctrl);
        nvme_tcp_stop_io_queues(ctrl);
        nvme_cancel_tagset(ctrl);
        if (remove)
                nvme_unquiesce_io_queues(ctrl);
        nvme_tcp_destroy_io_queues(ctrl, remove);
}

static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
                int status)
{
        enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

        /* If we are resetting/deleting then do nothing */
        if (state != NVME_CTRL_CONNECTING) {
                WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
                return;
        }

        if (nvmf_should_reconnect(ctrl, status)) {
                dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
                        ctrl->opts->reconnect_delay);
                queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
                                ctrl->opts->reconnect_delay * HZ);
        } else {
                dev_info(ctrl->device, "Removing controller (%d)...\n",
                         status);
                nvme_delete_ctrl(ctrl);
        }
}

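/*
 * Establish (or re-establish) the association: configure the admin
 * queue, check the controller capabilities this transport depends on
 * (no ICDOFF, SGL support), clamp sqsize against MAXCMD, bring up the
 * I/O queues and move the controller to LIVE.
 */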
static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
{
        struct nvmf_ctrl_options *opts = ctrl->opts;
        int ret;

        ret = nvme_tcp_configure_admin_queue(ctrl, new);
        if (ret)
                return ret;

        if (ctrl->icdoff) {
                ret = -EOPNOTSUPP;
                dev_err(ctrl->device, "icdoff is not supported!\n");
                goto destroy_admin;
        }

        if (!nvme_ctrl_sgl_supported(ctrl)) {
                ret = -EOPNOTSUPP;
                dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
                goto destroy_admin;
        }

        if (opts->queue_size > ctrl->sqsize + 1)
                dev_warn(ctrl->device,
                        "queue_size %zu > ctrl sqsize %u, clamping down\n",
                        opts->queue_size, ctrl->sqsize + 1);

        if (ctrl->sqsize + 1 > ctrl->maxcmd) {
                dev_warn(ctrl->device,
                        "sqsize %u > ctrl maxcmd %u, clamping down\n",
                        ctrl->sqsize + 1, ctrl->maxcmd);
                ctrl->sqsize = ctrl->maxcmd - 1;
        }

        if (ctrl->queue_count > 1) {
                ret = nvme_tcp_configure_io_queues(ctrl, new);
                if (ret)
                        goto destroy_admin;
        }

        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
                /*
                 * A state change failure is ok if we started ctrl delete,
                 * unless we are in the middle of creating a new controller,
                 * to avoid races with the teardown flow.
                 */
                enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

                WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
                             state != NVME_CTRL_DELETING_NOIO);
                WARN_ON_ONCE(new);
                ret = -EINVAL;
                goto destroy_io;
        }

        nvme_start_ctrl(ctrl);
        return 0;

destroy_io:
        if (ctrl->queue_count > 1) {
                nvme_quiesce_io_queues(ctrl);
                nvme_sync_io_queues(ctrl);
                nvme_tcp_stop_io_queues(ctrl);
                nvme_cancel_tagset(ctrl);
                nvme_tcp_destroy_io_queues(ctrl, new);
        }
destroy_admin:
        nvme_stop_keep_alive(ctrl);
        nvme_tcp_teardown_admin_queue(ctrl, false);
        return ret;
}

static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
{
        struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
                        struct nvme_tcp_ctrl, connect_work);
        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
        int ret;

        ++ctrl->nr_reconnects;

        ret = nvme_tcp_setup_ctrl(ctrl, false);
        if (ret)
                goto requeue;

        dev_info(ctrl->device, "Successfully reconnected (attempt %d/%d)\n",
                 ctrl->nr_reconnects, ctrl->opts->max_reconnects);

        ctrl->nr_reconnects = 0;

        return;

requeue:
        dev_info(ctrl->device, "Failed reconnect attempt %d/%d\n",
                 ctrl->nr_reconnects, ctrl->opts->max_reconnects);
        nvme_tcp_reconnect_or_remove(ctrl, ret);
}

static void nvme_tcp_error_recovery_work(struct work_struct *work)
{
        struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
                        struct nvme_tcp_ctrl, err_work);
        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;

        nvme_stop_keep_alive(ctrl);
        flush_work(&ctrl->async_event_work);
        nvme_tcp_teardown_io_queues(ctrl, false);
        /* unquiesce to fail fast pending requests */
        nvme_unquiesce_io_queues(ctrl);
        nvme_tcp_teardown_admin_queue(ctrl, false);
        nvme_unquiesce_admin_queue(ctrl);
        nvme_auth_stop(ctrl);

        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
                enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

                WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
                             state != NVME_CTRL_DELETING_NOIO);
                return;
        }

        nvme_tcp_reconnect_or_remove(ctrl, 0);
}

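/*
 * Full teardown: stop the I/O queues, disable (or shut down) the
 * controller, then tear down the admin queue.
 */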
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
        nvme_tcp_teardown_io_queues(ctrl, shutdown);
        nvme_quiesce_admin_queue(ctrl);
        nvme_disable_ctrl(ctrl, shutdown);
        nvme_tcp_teardown_admin_queue(ctrl, shutdown);
}

static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
{
        nvme_tcp_teardown_ctrl(ctrl, true);
}

static void nvme_reset_ctrl_work(struct work_struct *work)
{
        struct nvme_ctrl *ctrl =
                container_of(work, struct nvme_ctrl, reset_work);
        int ret;

        nvme_stop_ctrl(ctrl);
        nvme_tcp_teardown_ctrl(ctrl, false);

        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
                enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

                WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
                             state != NVME_CTRL_DELETING_NOIO);
                return;
        }

        ret = nvme_tcp_setup_ctrl(ctrl, false);
        if (ret)
                goto out_fail;

        return;

out_fail:
        ++ctrl->nr_reconnects;
        nvme_tcp_reconnect_or_remove(ctrl, ret);
}

static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
{
        flush_work(&to_tcp_ctrl(ctrl)->err_work);
        cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
}

static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{
        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);

        if (list_empty(&ctrl->list))
                goto free_ctrl;

        mutex_lock(&nvme_tcp_ctrl_mutex);
        list_del(&ctrl->list);
        mutex_unlock(&nvme_tcp_ctrl_mutex);

        nvmf_free_options(nctrl->opts);
free_ctrl:
        kfree(ctrl->queues);
        kfree(ctrl);
}

static void nvme_tcp_set_sg_null(struct nvme_command *c)
{
        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

        sg->addr = 0;
        sg->length = 0;
        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
                        NVME_SGL_FMT_TRANSPORT_A;
}

static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
                struct nvme_command *c, u32 data_len)
{
        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
        sg->length = cpu_to_le32(data_len);
        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
}

static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
                u32 data_len)
{
        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

        sg->addr = 0;
        sg->length = cpu_to_le32(data_len);
        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
                        NVME_SGL_FMT_TRANSPORT_A;
}

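/*
 * Build the Async Event Request command PDU for the admin queue and
 * queue it to the send path.  The AER carries no data and uses the
 * reserved command id NVME_AQ_BLK_MQ_DEPTH.
 */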
static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
{
        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
        struct nvme_tcp_queue *queue = &ctrl->queues[0];
        struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
        struct nvme_command *cmd = &pdu->cmd;
        u8 hdgst = nvme_tcp_hdgst_len(queue);

        memset(pdu, 0, sizeof(*pdu));
        pdu->hdr.type = nvme_tcp_cmd;
        if (queue->hdr_digest)
                pdu->hdr.flags |= NVME_TCP_F_HDGST;
        pdu->hdr.hlen = sizeof(*pdu);
        pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);

        cmd->common.opcode = nvme_admin_async_event;
        cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
        cmd->common.flags |= NVME_CMD_SGL_METABUF;
        nvme_tcp_set_sg_null(cmd);

        ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
        ctrl->async_req.offset = 0;
        ctrl->async_req.curr_bio = NULL;
        ctrl->async_req.data_len = 0;

        nvme_tcp_queue_request(&ctrl->async_req, true, true);
}

static void nvme_tcp_complete_timed_out(struct request *rq)
{
        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;

        nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
        nvmf_complete_timed_out_request(rq);
}

static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
{
        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
        struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
        struct nvme_command *cmd = &pdu->cmd;
        int qid = nvme_tcp_queue_id(req->queue);

        dev_warn(ctrl->device,
                 "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
                 rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
                 nvme_fabrics_opcode_str(qid, cmd), qid);

        if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
                /*
                 * If we are resetting, connecting, or deleting, we should
                 * complete immediately because we may block the controller
                 * teardown or setup sequence:
                 * - ctrl disable/shutdown fabrics requests
                 * - connect requests
                 * - initialization admin requests
                 * - I/O requests that entered after unquiescing and
                 *   the controller stopped responding
                 *
                 * All other requests should be cancelled by the error
                 * recovery work, so it's fine that we fail it here.
                 */
                nvme_tcp_complete_timed_out(rq);
                return BLK_EH_DONE;
        }

        /*
         * LIVE state should trigger the normal error recovery which will
         * handle completing this request.
         */
        nvme_tcp_error_recovery(ctrl);
        return BLK_EH_RESET_TIMER;
}

static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
                        struct request *rq)
{
        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
        struct nvme_command *c = &pdu->cmd;

        c->common.flags |= NVME_CMD_SGL_METABUF;

        if (!blk_rq_nr_phys_segments(rq))
                nvme_tcp_set_sg_null(c);
        else if (rq_data_dir(rq) == WRITE &&
            req->data_len <= nvme_tcp_inline_data_size(req))
                nvme_tcp_set_sg_inline(queue, c, req->data_len);
        else
                nvme_tcp_set_sg_host_data(c, req->data_len);

        return 0;
}

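/*
 * Prepare the command PDU for a request: reset the send-side state,
 * set up the data iterator, decide whether the write payload goes
 * inline, and fill in the PDU header including the header/data digest
 * flags and the encoded lengths.
 */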
static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
                struct request *rq)
{
        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
        struct nvme_tcp_queue *queue = req->queue;
        u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
        blk_status_t ret;

        ret = nvme_setup_cmd(ns, rq);
        if (ret)
                return ret;

        req->state = NVME_TCP_SEND_CMD_PDU;
        req->status = cpu_to_le16(NVME_SC_SUCCESS);
        req->offset = 0;
        req->data_sent = 0;
        req->pdu_len = 0;
        req->pdu_sent = 0;
        req->h2cdata_left = 0;
        req->data_len = blk_rq_nr_phys_segments(rq) ?
                                blk_rq_payload_bytes(rq) : 0;
        req->curr_bio = rq->bio;
        if (req->curr_bio && req->data_len)
                nvme_tcp_init_iter(req, rq_data_dir(rq));

        if (rq_data_dir(rq) == WRITE &&
            req->data_len <= nvme_tcp_inline_data_size(req))
                req->pdu_len = req->data_len;

        pdu->hdr.type = nvme_tcp_cmd;
        pdu->hdr.flags = 0;
        if (queue->hdr_digest)
                pdu->hdr.flags |= NVME_TCP_F_HDGST;
        if (queue->data_digest && req->pdu_len) {
                pdu->hdr.flags |= NVME_TCP_F_DDGST;
                ddgst = nvme_tcp_ddgst_len(queue);
        }
        pdu->hdr.hlen = sizeof(*pdu);
        pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
        pdu->hdr.plen =
                cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);

        ret = nvme_tcp_map_data(queue, rq);
        if (unlikely(ret)) {
                nvme_cleanup_cmd(rq);
                dev_err(queue->ctrl->ctrl.device,
                        "Failed to map data (%d)\n", ret);
                return ret;
        }

        return 0;
}

static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
        struct nvme_tcp_queue *queue = hctx->driver_data;

        if (!llist_empty(&queue->req_list))
                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}

static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
                const struct blk_mq_queue_data *bd)
{
        struct nvme_ns *ns = hctx->queue->queuedata;
        struct nvme_tcp_queue *queue = hctx->driver_data;
        struct request *rq = bd->rq;
        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
        bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
        blk_status_t ret;

        if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
                return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);

        ret = nvme_tcp_setup_cmd_pdu(ns, rq);
        if (unlikely(ret))
                return ret;

        nvme_start_request(rq);

        nvme_tcp_queue_request(req, true, bd->last);

        return BLK_STS_OK;
}

static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
{
        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);

        nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues);
}

/*
 * blk-mq poll callback: busy-poll the socket when possible and reap
 * any received completions; returns the number of completions seen.
 */
static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
        struct nvme_tcp_queue *queue = hctx->driver_data;
        struct sock *sk = queue->sock->sk;

        if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
                return 0;

        set_bit(NVME_TCP_Q_POLLING, &queue->flags);
        if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
                sk_busy_loop(sk, true);
        nvme_tcp_try_recv(queue);
        clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
        return queue->nr_cqe;
}

static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
        struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0];
        struct sockaddr_storage src_addr;
        int ret, len;

        len = nvmf_get_address(ctrl, buf, size);

        mutex_lock(&queue->queue_lock);

        if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
                goto done;
        ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr);
        if (ret > 0) {
                if (len > 0)
                        len--; /* strip trailing newline */
                len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n",
                                 (len) ? "," : "", &src_addr);
        }
done:
        mutex_unlock(&queue->queue_lock);

        return len;
}

static const struct blk_mq_ops nvme_tcp_mq_ops = {
        .queue_rq = nvme_tcp_queue_rq,
        .commit_rqs = nvme_tcp_commit_rqs,
        .complete = nvme_complete_rq,
        .init_request = nvme_tcp_init_request,
        .exit_request = nvme_tcp_exit_request,
        .init_hctx = nvme_tcp_init_hctx,
        .timeout = nvme_tcp_timeout,
        .map_queues = nvme_tcp_map_queues,
        .poll = nvme_tcp_poll,
};

static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
        .queue_rq = nvme_tcp_queue_rq,
        .complete = nvme_complete_rq,
        .init_request = nvme_tcp_init_request,
        .exit_request = nvme_tcp_exit_request,
        .init_hctx = nvme_tcp_init_admin_hctx,
        .timeout = nvme_tcp_timeout,
};

static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
        .name = "tcp",
        .module = THIS_MODULE,
        .flags = NVME_F_FABRICS | NVME_F_BLOCKING,
        .reg_read32 = nvmf_reg_read32,
        .reg_read64 = nvmf_reg_read64,
        .reg_write32 = nvmf_reg_write32,
        .subsystem_reset = nvmf_subsystem_reset,
        .free_ctrl = nvme_tcp_free_ctrl,
        .submit_async_event = nvme_tcp_submit_async_event,
        .delete_ctrl = nvme_tcp_delete_ctrl,
        .get_address = nvme_tcp_get_address,
        .stop_ctrl = nvme_tcp_stop_ctrl,
};

static bool
nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
{
        struct nvme_tcp_ctrl *ctrl;
        bool found = false;

        mutex_lock(&nvme_tcp_ctrl_mutex);
        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
                found = nvmf_ip_options_match(&ctrl->ctrl, opts);
                if (found)
                        break;
        }
        mutex_unlock(&nvme_tcp_ctrl_mutex);

        return found;
}

/*
 * Allocate and initialize a controller instance from the connect
 * options: parse the target (and optional source) address, validate
 * the host interface, reject duplicate connections unless explicitly
 * allowed, and register with the NVMe core.
 */
static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
                struct nvmf_ctrl_options *opts)
{
        struct nvme_tcp_ctrl *ctrl;
        int ret;

        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
        if (!ctrl)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&ctrl->list);
        ctrl->ctrl.opts = opts;
        ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
                                opts->nr_poll_queues + 1;
        ctrl->ctrl.sqsize = opts->queue_size - 1;
        ctrl->ctrl.kato = opts->kato;

        INIT_DELAYED_WORK(&ctrl->connect_work,
                        nvme_tcp_reconnect_ctrl_work);
        INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
        INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);

        if (!(opts->mask & NVMF_OPT_TRSVCID)) {
                opts->trsvcid =
                        kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
                if (!opts->trsvcid) {
                        ret = -ENOMEM;
                        goto out_free_ctrl;
                }
                opts->mask |= NVMF_OPT_TRSVCID;
        }

        ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
                        opts->traddr, opts->trsvcid, &ctrl->addr);
        if (ret) {
                pr_err("malformed address passed: %s:%s\n",
                        opts->traddr, opts->trsvcid);
                goto out_free_ctrl;
        }

        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
                ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
                        opts->host_traddr, NULL, &ctrl->src_addr);
                if (ret) {
                        pr_err("malformed src address passed: %s\n",
                               opts->host_traddr);
                        goto out_free_ctrl;
                }
        }

        if (opts->mask & NVMF_OPT_HOST_IFACE) {
                if (!__dev_get_by_name(&init_net, opts->host_iface)) {
                        pr_err("invalid interface passed: %s\n",
                               opts->host_iface);
                        ret = -ENODEV;
                        goto out_free_ctrl;
                }
        }

        if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
                ret = -EALREADY;
                goto out_free_ctrl;
        }

        ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
                                GFP_KERNEL);
        if (!ctrl->queues) {
                ret = -ENOMEM;
                goto out_free_ctrl;
        }

        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
        if (ret)
                goto out_kfree_queues;

        return ctrl;
out_kfree_queues:
        kfree(ctrl->queues);
out_free_ctrl:
        kfree(ctrl);
        return ERR_PTR(ret);
}

static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
                struct nvmf_ctrl_options *opts)
{
        struct nvme_tcp_ctrl *ctrl;
        int ret;

        ctrl = nvme_tcp_alloc_ctrl(dev, opts);
        if (IS_ERR(ctrl))
                return ERR_CAST(ctrl);

        ret = nvme_add_ctrl(&ctrl->ctrl);
        if (ret)
                goto out_put_ctrl;

        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
                WARN_ON_ONCE(1);
                ret = -EINTR;
                goto out_uninit_ctrl;
        }

        ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
        if (ret)
                goto out_uninit_ctrl;

        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp, hostnqn: %s\n",
                nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr, opts->host->nqn);

        mutex_lock(&nvme_tcp_ctrl_mutex);
        list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
        mutex_unlock(&nvme_tcp_ctrl_mutex);

        return &ctrl->ctrl;

out_uninit_ctrl:
        nvme_uninit_ctrl(&ctrl->ctrl);
out_put_ctrl:
        nvme_put_ctrl(&ctrl->ctrl);
        if (ret > 0)
                ret = -EIO;
        return ERR_PTR(ret);
}

static struct nvmf_transport_ops nvme_tcp_transport = {
        .name = "tcp",
        .module = THIS_MODULE,
        .required_opts = NVMF_OPT_TRADDR,
        .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
                        NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
                        NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
                        NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
                        NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
                        NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY,
        .create_ctrl = nvme_tcp_create_ctrl,
};

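/*
 * Module init: sanity-check the on-wire PDU sizes at build time,
 * create the (optionally unbound) I/O workqueue and register the
 * "tcp" fabrics transport.
 */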
static int __init nvme_tcp_init_module(void)
{
        unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;

        BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);

        if (wq_unbound)
                wq_flags |= WQ_UNBOUND;

        nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
        if (!nvme_tcp_wq)
                return -ENOMEM;

        nvmf_register_transport(&nvme_tcp_transport);
        return 0;
}

static void __exit nvme_tcp_cleanup_module(void)
{
        struct nvme_tcp_ctrl *ctrl;

        nvmf_unregister_transport(&nvme_tcp_transport);

        mutex_lock(&nvme_tcp_ctrl_mutex);
        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
                nvme_delete_ctrl(&ctrl->ctrl);
        mutex_unlock(&nvme_tcp_ctrl_mutex);
        flush_workqueue(nvme_delete_wq);

        destroy_workqueue(nvme_tcp_wq);
}

module_init(nvme_tcp_init_module);
module_exit(nvme_tcp_cleanup_module);

MODULE_DESCRIPTION("NVMe host TCP transport driver");
MODULE_LICENSE("GPL v2");