/*-
 * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef ICL_ISER_H
#define ICL_ISER_H

/*
 * iSCSI Common Layer for RDMA.
*/

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/uio.h>
#include <sys/taskqueue.h>
#include <sys/bio.h>
#include <vm/uma.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <icl_conn_if.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_fmr_pool.h>
#include <rdma/rdma_cm.h>

/*
 * Logging macros, gated by the module-wide "iser_debug" sysctl level:
 * 0 = errors only, 1 = +warnings, 2 = +info, 3 = +debug.
 */
#define ISER_DBG(X, ...)						\
	do {								\
		if (unlikely(iser_debug > 2))				\
			printf("DEBUG: %s: " X "\n",			\
				__func__, ## __VA_ARGS__);		\
	} while (0)

#define ISER_INFO(X, ...)						\
	do {								\
		if (unlikely(iser_debug > 1))				\
			printf("INFO: %s: " X "\n",			\
				__func__, ## __VA_ARGS__);		\
	} while (0)

#define ISER_WARN(X, ...)						\
	do {								\
		if (unlikely(iser_debug > 0)) {				\
			printf("WARNING: %s: " X "\n",			\
				__func__, ## __VA_ARGS__);		\
		}							\
	} while (0)

/* Errors are always printed, regardless of iser_debug. */
#define ISER_ERR(X, ...) \
	printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__)

/* iSER header version and flag bits (W/R STags valid). */
#define ISER_VER	0x10
#define ISER_WSV	0x08
#define ISER_RSV	0x04

/* Sentinel wr_id values used to recognize special completions. */
#define ISER_FASTREG_LI_WRID	0xffffffffffffffffULL
#define ISER_BEACON_WRID	0xfffffffffffffffeULL

#define SHIFT_4K	12
#define SIZE_4K		(1ULL << SHIFT_4K)
#define MASK_4K		(~(SIZE_4K-1))

/* support up to 512KB in one RDMA */
#define ISCSI_ISER_SG_TABLESIZE		(0x80000 >> SHIFT_4K)
#define ISER_DEF_XMIT_CMDS_MAX		256

/* the max RX (recv) WR supported by the iSER QP is defined by *
 * max_recv_wr = commands_max + recv_beacon */
#define ISER_QP_MAX_RECV_DTOS	(ISER_DEF_XMIT_CMDS_MAX + 1)
#define ISER_MIN_POSTED_RX	(ISER_DEF_XMIT_CMDS_MAX >> 2)

/* QP settings */
/* Maximal bounds on received asynchronous PDUs */
#define ISER_MAX_RX_MISC_PDUS	4 /* NOOP_IN(2), ASYNC_EVENT(2) */
#define ISER_MAX_TX_MISC_PDUS	6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */

/* the max TX (send) WR supported by the iSER QP is defined by *
 * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect *
 * to have at max for SCSI command. The tx posting & completion handling code *
 * supports -EAGAIN scheme where tx is suspended till the QP has room for more *
 * send WR. D=8 comes from 64K/8K */

#define ISER_INFLIGHT_DATAOUTS	8

/* the send_beacon increases the max_send_wr by 1 */
#define ISER_QP_MAX_REQ_DTOS	(ISER_DEF_XMIT_CMDS_MAX *	\
				 (1 + ISER_INFLIGHT_DATAOUTS) + \
				 ISER_MAX_TX_MISC_PDUS +	\
				 ISER_MAX_RX_MISC_PDUS + 1)

/* Inverse of the ISER_QP_MAX_REQ_DTOS formula: commands that fit in send_wr. */
#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr			\
					 - ISER_MAX_TX_MISC_PDUS	\
					 - ISER_MAX_RX_MISC_PDUS - 1) / \
					 (1 + ISER_INFLIGHT_DATAOUTS))

#define ISER_WC_BATCH_COUNT	16
#define ISER_SIGNAL_CMD_COUNT	32

/* Maximal QP's recommended per CQ. In case we use more QP's per CQ we might *
 * encounter a CQ overrun state. */
#define ISCSI_ISER_MAX_CONN	8
#define ISER_MAX_RX_LEN		(ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_TX_LEN		(ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_CQ_LEN		(ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
				 ISCSI_ISER_MAX_CONN)

/* Capability flags carried in the iser_cm_hdr "flags" byte. */
#define ISER_ZBVA_NOT_SUPPORTED		0x80
#define ISER_SEND_W_INV_NOT_SUPPORTED	0x40

#define ISCSI_DEF_MAX_RECV_SEG_LEN	8192
#define ISCSI_OPCODE_MASK	0x3f

/* Downcast from the embedded ICL objects to the iSER containers. */
#define icl_to_iser_conn(ic) \
	container_of(ic, struct iser_conn, icl_conn)
#define icl_to_iser_pdu(ip) \
	container_of(ip, struct icl_iser_pdu, icl_pdu)

/**
 * struct iser_hdr - iSER header
 *
 * @flags:        flags support (zbva, remote_inv)
 * @rsvd:         reserved
 * @write_stag:   write rkey
 * @write_va:     write virtual address
 * @read_stag:    read rkey
 * @read_va:      read virtual address
 */
struct iser_hdr {
	u8 flags;
	u8 rsvd[3];
	__be32 write_stag;
	__be64 write_va;
	__be32 read_stag;
	__be64 read_va;
} __attribute__((packed));

/* Private data exchanged during RDMA-CM connection establishment. */
struct iser_cm_hdr {
	u8 flags;
	u8 rsvd[3];
} __packed;

/* Constant PDU lengths calculations */
#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + ISCSI_BHS_SIZE)

#define ISER_RECV_DATA_SEG_LEN	128
#define ISER_RX_PAYLOAD_SIZE	(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)

#define ISER_RX_LOGIN_SIZE	(ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)

enum iser_conn_state {
	ISER_CONN_INIT,		/* descriptor allocd, no conn          */
	ISER_CONN_PENDING,	/* in the process of being established */
	ISER_CONN_UP,		/* up and running                      */
	ISER_CONN_TERMINATING,	/* in the process of being terminated  */
	ISER_CONN_DOWN,		/* shut down                           */
	ISER_CONN_STATES_NUM
};

enum iser_task_status {
	ISER_TASK_STATUS_INIT = 0,
	ISER_TASK_STATUS_STARTED,
	ISER_TASK_STATUS_COMPLETED
};

enum iser_data_dir {
	ISER_DIR_IN = 0,	/* to initiator   */
	ISER_DIR_OUT,		/* from initiator */
	ISER_DIRS_NUM
};

/**
 * struct iser_mem_reg - iSER memory registration info
 *
 * @sge:          memory region sg element
 * @rkey:         memory region remote key
 * @mem_h:        pointer to registration context (FMR/Fastreg)
 */
struct iser_mem_reg {
	struct ib_sge sge;
	u32 rkey;
	void *mem_h;
};

/* Kind of payload carried by a TX descriptor. */
enum iser_desc_type {
	ISCSI_TX_CONTROL,
	ISCSI_TX_SCSI_COMMAND,
	ISCSI_TX_DATAOUT
};

/**
 * struct iser_data_buf - iSER data buffer
 *
 * @sgl:          inline sg array backing @sg (ISCSI_ISER_SG_TABLESIZE entries)
 * @sg:           pointer to the sg list
 * @size:         num entries of this sg
 * @data_len:     total buffer byte len
 * @dma_nents:    returned by dma_map_sg
 * @copy_buf:     allocated copy buf for SGs unaligned
 *                for rdma which are copied
 * @orig_sg:      pointer to the original sg list (in case
 *                we used a copy)
 * @sg_single:    SG-ified clone of a non SG SC or
 *                unaligned SG
 */
struct iser_data_buf {
	struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE];
	void *sg;
	int size;
	unsigned long data_len;
	unsigned int dma_nents;
	char *copy_buf;
	struct scatterlist *orig_sg;
	struct scatterlist sg_single;
};

/* fwd declarations */
struct iser_conn;
struct ib_conn;
struct iser_device;

/**
 * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
 *
 * @iser_header:   iser header
 * @iscsi_header:  iscsi header (bhs)
 * @type:          command/control/dataout
 * @dma_addr:      header buffer dma_address
 * @tx_sg:         sg[0] points to iser/iscsi headers
 *                 sg[1] optionally points to either of immediate data
 *                 unsolicited data-out or control
 * @num_sge:       number sges used on this TX task
 * @mapped:        indicates if the descriptor is dma mapped
 */
struct iser_tx_desc {
	struct iser_hdr iser_header;
	struct iscsi_bhs iscsi_header __attribute__((packed));
	enum iser_desc_type type;
	u64 dma_addr;
	struct ib_sge tx_sg[2];
	int num_sge;
	bool mapped;
};
#define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
				 sizeof(u64) + sizeof(struct ib_sge)))
/**
 * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
 *
 * @iser_header:   iser header
 * @iscsi_header:  iscsi header
 * @data:          received data segment
 * @dma_addr:      receive buffer dma address
 * @rx_sg:         ib_sge of receive buffer
 * @pad:           for sense data TODO: Modify to maximum sense length supported
 */
struct iser_rx_desc {
	struct iser_hdr iser_header;
	struct iscsi_bhs iscsi_header;
	char data[ISER_RECV_DATA_SEG_LEN];
	u64 dma_addr;
	struct ib_sge rx_sg;
	char pad[ISER_RX_PAD_SIZE];
} __attribute__((packed));

/**
 * struct icl_iser_pdu - iSER-backed ICL PDU
 *
 * @icl_pdu:       embedded generic ICL PDU (use icl_to_iser_pdu() to downcast)
 * @desc:          TX descriptor used to send this PDU
 * @iser_conn:     owning connection
 * @status:        task state (enum iser_task_status)
 * @csio:          associated CAM SCSI I/O CCB
 * @command_sent:  NOTE(review): presumably non-zero once the command was
 *                 posted; confirm in the send path
 * @dir:           per-direction flags indexed by enum iser_data_dir
 * @rdma_reg:      per-direction RDMA registration info
 * @data:          per-direction data buffers
 */
struct icl_iser_pdu {
	struct icl_pdu icl_pdu;
	struct iser_tx_desc desc;
	struct iser_conn *iser_conn;
	enum iser_task_status status;
	struct ccb_scsiio *csio;
	int command_sent;
	int dir[ISER_DIRS_NUM];
	struct iser_mem_reg rdma_reg[ISER_DIRS_NUM];
	struct iser_data_buf data[ISER_DIRS_NUM];
};

/**
 * struct iser_comp - iSER completion context
 *
 * @device:     pointer to device handle
 * @cq:         completion queue
 * @wcs:        work completion array
 * @tq:         taskqueue handle
 * @task:       task to run task_fn
 * @active_qps: Number of active QPs attached
 *              to completion context
 */
struct iser_comp {
	struct iser_device *device;
	struct ib_cq *cq;
	struct ib_wc wcs[ISER_WC_BATCH_COUNT];
	struct taskqueue *tq;
	struct task task;
	int active_qps;
};

/**
 * struct iser_device - iSER device handle
 *
 * @ib_device:     RDMA device
 * @pd:            Protection Domain for this device
 * @dev_attr:      Device attributes container
 * @mr:            Global DMA memory region
 * @event_handler: IB events handle routine
 * @ig_list:       entry in devices list
 * @refcount:      Reference counter, dominated by open iser connections
 * @comps_used:    Number of completion contexts used, Min between online
 *                 cpus and device max completion vectors
 * @comps:         Dynamically allocated array of completion handlers
 */
struct iser_device {
	struct ib_device *ib_device;
	struct ib_pd *pd;
	struct ib_device_attr dev_attr;
	struct ib_mr *mr;
	struct ib_event_handler event_handler;
	struct list_head ig_list;
	int refcount;
	int comps_used;
	struct iser_comp *comps;
};

/**
 * struct iser_reg_resources - Fast registration resources
 *
 * @mr:       memory region
 * @mr_valid: is mr valid indicator
 */
struct iser_reg_resources {
	struct ib_mr *mr;
	u8 mr_valid:1;
};

/**
 * struct fast_reg_descriptor - Fast registration descriptor
 *
 * @list: entry in connection fastreg pool
 * @rsc:  data buffer registration resources
 */
struct fast_reg_descriptor {
	struct list_head list;
	struct iser_reg_resources rsc;
};

/**
 * struct iser_beacon - beacon to signal all flush errors were drained
 *
 * @send:       send wr
 * @recv:       recv wr
 * @flush_lock: protects flush_cv
 * @flush_cv:   condition variable for beacon flush
 */
struct iser_beacon {
	union {
		struct ib_send_wr send;
		struct ib_recv_wr recv;
	};
	struct mtx flush_lock;
	struct cv flush_cv;
};

/**
 * struct ib_conn - Infiniband related objects
 *
 * @cma_id:              rdma_cm connection manager handle
 * @qp:                  Connection Queue-pair
 * @post_recv_buf_count: posted rx buffer accounting (NOTE(review): exact
 *                       semantics live in the connection code; confirm there)
 * @sig_count:           send counter (NOTE(review): presumably used with
 *                       ISER_SIGNAL_CMD_COUNT to decide when to signal a
 *                       completion; confirm in the send path)
 * @rx_wr:               receive work request array, ISER_MIN_POSTED_RX entries
 * @device:              reference to iser device
 * @comp:                iser completion context
 * @beacon:              beacon used to drain flush errors (see iser_beacon)
 * @lock:                mutex (NOTE(review): confirm which fields it protects)
 */
struct ib_conn {
	struct rdma_cm_id *cma_id;
	struct ib_qp *qp;
	int post_recv_buf_count;
	u8 sig_count;
	struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX];
	struct iser_device *device;
	struct iser_comp *comp;
	struct iser_beacon beacon;
	struct mtx lock;
	/* Registration strategy resources; only one member is in use. */
	union {
		struct {
			struct ib_fmr_pool *pool;
			struct iser_page_vec *page_vec;
		} fmr;
		struct {
			struct list_head pool;
			int pool_size;
		} fastreg;
	};
};

/**
 * struct iser_conn - iSER connection
 *
 * @icl_conn:          embedded generic ICL connection (use icl_to_iser_conn()
 *                     to downcast)
 * @ib_conn:           Infiniband related objects for this connection
 * @up_cv:             condition variable (NOTE(review): presumably signals
 *                     the transition to ISER_CONN_UP; confirm)
 * @conn_list:         entry in the global iser connections list
 * @state_mutex:       sx lock (NOTE(review): presumably protects @state)
 * @state:             connection state machine (enum iser_conn_state)
 * @qp_max_recv_dtos:  max number of recv WRs on this connection's QP
 * @min_posted_rx:     low-water mark of posted rx buffers
 * @max_cmds:          max outstanding commands
 * @login_buf:         login-phase buffer backing the request/response halves
 * @login_req_buf:     login request buffer (DMA address in @login_req_dma)
 * @login_resp_buf:    login response buffer (DMA address in @login_resp_dma)
 * @rx_desc_head:      index into @rx_descs (NOTE(review): confirm ring
 *                     semantics in the rx path)
 * @rx_descs:          array of RX descriptors
 * @num_rx_descs:      number of entries in @rx_descs
 * @handoff_done:      NOTE(review): presumably set once ICL handoff has
 *                     completed; confirm in the connection code
 */
struct iser_conn {
	struct icl_conn icl_conn;
	struct ib_conn ib_conn;
	struct cv up_cv;
	struct list_head conn_list;
	struct sx state_mutex;
	enum iser_conn_state state;
	int qp_max_recv_dtos;
	int min_posted_rx;
	u16 max_cmds;
	char *login_buf;
	char *login_req_buf, *login_resp_buf;
	u64 login_req_dma, login_resp_dma;
	unsigned int rx_desc_head;
	struct iser_rx_desc *rx_descs;
	u32 num_rx_descs;
	bool handoff_done;
};

/**
 * struct iser_global: iSER global context
 *
 * @device_list_mutex:  protects device_list
 * @device_list:        iser devices global list
 * @connlist_mutex:     protects connlist
 * @connlist:           iser connections global list
 * @close_conns_mutex:  serializes conns closure
 */
struct iser_global {
	struct sx device_list_mutex;
	struct list_head device_list;
	struct mtx connlist_mutex;
	struct list_head connlist;
	struct sx close_conns_mutex;
};

extern struct iser_global ig;
extern int iser_debug;

void
iser_create_send_desc(struct iser_conn *, struct iser_tx_desc *);

int
iser_post_recvl(struct iser_conn *);

int
iser_post_recvm(struct iser_conn *, int);

int
iser_alloc_login_buf(struct iser_conn *iser_conn);

void
iser_free_login_buf(struct iser_conn *iser_conn);

int
iser_post_send(struct ib_conn *, struct iser_tx_desc *, bool);

void
iser_snd_completion(struct iser_tx_desc *, struct ib_conn *);

void
iser_rcv_completion(struct iser_rx_desc *, unsigned long,
		    struct ib_conn *);

void
iser_pdu_free(struct icl_conn *, struct icl_pdu *);

struct icl_pdu *
iser_new_pdu(struct icl_conn *ic, int flags);

int
iser_alloc_rx_descriptors(struct iser_conn *, int);

void
iser_free_rx_descriptors(struct iser_conn *);

int
iser_initialize_headers(struct icl_iser_pdu *, struct iser_conn *);

int
iser_send_control(struct iser_conn *, struct icl_iser_pdu *);

int
iser_send_command(struct iser_conn *, struct icl_iser_pdu *);

int
iser_reg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);

void
iser_unreg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);

int
iser_create_fastreg_pool(struct ib_conn *, unsigned);

void
iser_free_fastreg_pool(struct ib_conn *);

int
iser_dma_map_task_data(struct icl_iser_pdu *,
		       struct iser_data_buf *, enum iser_data_dir,
		       enum dma_data_direction);

int
iser_conn_terminate(struct iser_conn *);

void
iser_free_ib_conn_res(struct iser_conn *, bool);

void
iser_dma_unmap_task_data(struct icl_iser_pdu *, struct iser_data_buf *,
			 enum dma_data_direction);

int
iser_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);

#endif /* !ICL_ISER_H */