xref: /freebsd/sys/dev/iser/icl_iser.h (revision 884d26c84cba3ffc3d4e626306098fcdfe6a0c2b)
1 /* $FreeBSD$ */
2 /*-
3  * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #ifndef ICL_ISER_H
28 #define ICL_ISER_H
29 
30 /*
31  * iSCSI Common Layer for RDMA.
32  */
33 
34 #include <sys/cdefs.h>
35 #include <sys/param.h>
36 #include <sys/capsicum.h>
37 #include <sys/condvar.h>
38 #include <sys/conf.h>
39 #include <sys/file.h>
40 #include <sys/kernel.h>
41 #include <sys/kthread.h>
42 #include <sys/lock.h>
43 #include <sys/mbuf.h>
44 #include <sys/mutex.h>
45 #include <sys/module.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
51 #include <sys/sx.h>
52 #include <sys/uio.h>
53 #include <sys/taskqueue.h>
54 #include <sys/bio.h>
55 #include <vm/uma.h>
56 #include <netinet/in.h>
57 #include <netinet/tcp.h>
58 #include <dev/iscsi/icl.h>
59 #include <dev/iscsi/iscsi_proto.h>
60 #include <icl_conn_if.h>
61 #include <cam/cam.h>
62 #include <cam/cam_ccb.h>
63 #include <rdma/ib_verbs.h>
64 #include <rdma/ib_fmr_pool.h>
65 #include <rdma/rdma_cm.h>
66 
67 
/*
 * Console logging helpers.  Each one emits
 *	"<LEVEL>: <calling function>: <message>\n"
 * through printf().  The format argument must be a string literal, since
 * it is pasted between the level prefix and the trailing newline.
 *
 * Verbosity is gated on the iser_debug variable:
 *	ISER_DBG   prints when iser_debug > 2
 *	ISER_INFO  prints when iser_debug > 1
 *	ISER_WARN  prints when iser_debug > 0
 *	ISER_ERR   always prints
 */
#define	ISER_DBG(fmt, ...)						\
	do {								\
		if (unlikely(iser_debug > 2)) {				\
			printf("DEBUG: %s: " fmt "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

#define	ISER_INFO(fmt, ...)						\
	do {								\
		if (unlikely(iser_debug > 1)) {				\
			printf("INFO: %s: " fmt "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

#define	ISER_WARN(fmt, ...)						\
	do {								\
		if (unlikely(iser_debug > 0)) {				\
			printf("WARNING: %s: " fmt "\n",		\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

/* Unconditional; expands to a plain printf() call expression. */
#define	ISER_ERR(fmt, ...)						\
	printf("ERROR: %s: " fmt "\n", __func__, ## __VA_ARGS__)
92 
/*
 * Single-bit/nibble flag values.
 * NOTE(review): presumably these go in the flags byte of struct iser_hdr
 * (version, write-STag-valid, read-STag-valid) -- confirm against users.
 */
#define ISER_VER	0x10
#define ISER_WSV	0x08
#define ISER_RSV	0x04

/*
 * Reserved 64-bit identifiers (all ones, and all ones minus one).
 * NOTE(review): presumably used as work-request ids for the fastreg
 * local-invalidate and beacon work requests -- confirm against users.
 */
#define ISER_FASTREG_LI_WRID	0xffffffffffffffffULL
#define ISER_BEACON_WRID	0xfffffffffffffffeULL
99 
/* 4K unit: shift count, size in bytes, and mask clearing the low 12 bits */
#define SHIFT_4K	12
#define SIZE_4K		(1ULL << SHIFT_4K)
#define MASK_4K		(~(SIZE_4K - 1))

/* support up to 512KB in one RDMA (0x80000 bytes counted in 4K units) */
#define ISCSI_ISER_SG_TABLESIZE		(0x80000 >> SHIFT_4K)
#define ISER_DEF_XMIT_CMDS_MAX		256

/*
 * The max RX (recv) WR supported by the iSER QP is defined by
 *	max_recv_wr = commands_max + recv_beacon
 * The minimum-posted threshold is a quarter of commands_max.
 */
#define ISER_QP_MAX_RECV_DTOS		(ISER_DEF_XMIT_CMDS_MAX + 1)
#define ISER_MIN_POSTED_RX		(ISER_DEF_XMIT_CMDS_MAX >> 2)
112 
/* QP settings */
/* Maximal bounds on received asynchronous PDUs */
#define ISER_MAX_RX_MISC_PDUS           4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */
#define ISER_MAX_TX_MISC_PDUS           6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */

/*
 * The max TX (send) WR supported by the iSER QP is defined by
 *	max_send_wr = T * (1 + D) + C
 * where T is the number of commands, D is how many inflight dataouts we
 * expect to have at max for a SCSI command, and C is the misc PDU budget.
 * The tx posting & completion handling code supports an -EAGAIN scheme
 * where tx is suspended till the QP has room for more send WR.
 * D=8 comes from 64K/8K.
 */
#define ISER_INFLIGHT_DATAOUTS		8

/* the send_beacon increases the max_send_wr by 1 */
#define ISER_QP_MAX_REQ_DTOS		(ISER_DEF_XMIT_CMDS_MAX *    \
					(1 + ISER_INFLIGHT_DATAOUTS) + \
					ISER_MAX_TX_MISC_PDUS        + \
					ISER_MAX_RX_MISC_PDUS + 1)

/*
 * Inverse of ISER_QP_MAX_REQ_DTOS: given a send WR budget, return how many
 * commands (T) fit once the misc PDUs and the beacon are subtracted.
 * The argument is parenthesized so that any expression (e.g. a conditional
 * expression) may be passed without changing the arithmetic.
 */
#define ISER_GET_MAX_XMIT_CMDS(send_wr) (((send_wr)			\
					 - ISER_MAX_TX_MISC_PDUS	\
					 - ISER_MAX_RX_MISC_PDUS - 1) /	\
					 (1 + ISER_INFLIGHT_DATAOUTS))
136 
/* ISER_WC_BATCH_COUNT sizes the work completion array in struct iser_comp */
#define ISER_WC_BATCH_COUNT	16
#define ISER_SIGNAL_CMD_COUNT	32

/*
 * Maximal QP's recommended per CQ.  In case we use more QP's per CQ we
 * might encounter a CQ overrun state.
 */
#define ISCSI_ISER_MAX_CONN	8

/* CQ length: per-connection RX and TX maxima, plus one entry per connection */
#define ISER_MAX_RX_LEN		(ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_TX_LEN		(ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_CQ_LEN		(ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
				 ISCSI_ISER_MAX_CONN)
147 
/*
 * Capability flag bits.
 * NOTE(review): presumably carried in the flags byte of struct iser_cm_hdr
 * -- confirm against users.
 */
#define	ISER_ZBVA_NOT_SUPPORTED		0x80
#define	ISER_SEND_W_INV_NOT_SUPPORTED	0x40

/* iSCSI values: default max receive segment length, 6-bit opcode mask */
#define	ISCSI_DEF_MAX_RECV_SEG_LEN	8192
#define	ISCSI_OPCODE_MASK		0x3f
153 
/*
 * Recover the enclosing iSER object from a pointer to its embedded ICL
 * member: struct iser_conn from its icl_conn field, and struct icl_iser_pdu
 * from its icl_pdu field.
 */
#define icl_to_iser_conn(ic)	container_of(ic, struct iser_conn, icl_conn)
#define icl_to_iser_pdu(ip)	container_of(ip, struct icl_iser_pdu, icl_pdu)
158 
159 /**
160  * struct iser_hdr - iSER header
161  *
162  * @flags:        flags support (zbva, remote_inv)
163  * @rsvd:         reserved
164  * @write_stag:   write rkey
165  * @write_va:     write virtual address
166  * @reaf_stag:    read rkey
167  * @read_va:      read virtual address
168  */
169 struct iser_hdr {
170 	u8      flags;
171 	u8      rsvd[3];
172 	__be32  write_stag;
173 	__be64  write_va;
174 	__be32  read_stag;
175 	__be64  read_va;
176 } __attribute__((packed));
177 
178 struct iser_cm_hdr {
179 	u8      flags;
180 	u8      rsvd[3];
181 } __packed;
182 
/* Constant PDU length calculations */

/* iSER header plus the iSCSI basic header segment */
#define ISER_HEADERS_LEN	(sizeof(struct iser_hdr) + ISCSI_BHS_SIZE)

/* Both headers plus a 128-byte data segment */
#define ISER_RECV_DATA_SEG_LEN	128
#define ISER_RX_PAYLOAD_SIZE	(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)

/* Both headers plus ISCSI_DEF_MAX_RECV_SEG_LEN bytes of data */
#define ISER_RX_LOGIN_SIZE	(ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
190 
/* Connection states; ISER_CONN_STATES_NUM is the number of states. */
enum iser_conn_state {
	ISER_CONN_INIT = 0,	/* descriptor allocated, no connection */
	ISER_CONN_PENDING,	/* in the process of being established */
	ISER_CONN_UP,		/* up and running */
	ISER_CONN_TERMINATING,	/* in the process of being terminated */
	ISER_CONN_DOWN,		/* shut down */
	ISER_CONN_STATES_NUM
};
199 
/* Task status values (type of the status field in struct icl_iser_pdu). */
enum iser_task_status {
	ISER_TASK_STATUS_INIT      = 0,
	ISER_TASK_STATUS_STARTED   = 1,
	ISER_TASK_STATUS_COMPLETED = 2
};
205 
/* Data transfer direction; ISER_DIRS_NUM sizes the per-direction arrays. */
enum iser_data_dir {
	ISER_DIR_IN  = 0,	/* to initiator */
	ISER_DIR_OUT = 1,	/* from initiator */
	ISER_DIRS_NUM
};
211 
212 /**
213  * struct iser_mem_reg - iSER memory registration info
214  *
215  * @sge:          memory region sg element
216  * @rkey:         memory region remote key
217  * @mem_h:        pointer to registration context (FMR/Fastreg)
218  */
219 struct iser_mem_reg {
220 	struct ib_sge	 sge;
221 	u32		 rkey;
222 	void		*mem_h;
223 };
224 
/* Kind of TX descriptor (type field of struct iser_tx_desc). */
enum iser_desc_type {
	ISCSI_TX_CONTROL      = 0,
	ISCSI_TX_SCSI_COMMAND = 1,
	ISCSI_TX_DATAOUT      = 2
};
230 
231 /**
232  * struct iser_data_buf - iSER data buffer
233  *
234  * @sg:           pointer to the sg list
235  * @size:         num entries of this sg
236  * @data_len:     total beffer byte len
237  * @dma_nents:    returned by dma_map_sg
238  * @copy_buf:     allocated copy buf for SGs unaligned
239  *                for rdma which are copied
240  * @orig_sg:      pointer to the original sg list (in case
241  *                we used a copy)
242  * @sg_single:    SG-ified clone of a non SG SC or
243  *                unaligned SG
244  */
245 struct iser_data_buf {
246 	struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE];
247 	void               *sg;
248 	unsigned int       size;
249 	unsigned long      data_len;
250 	unsigned int       dma_nents;
251 	char               *copy_buf;
252 	struct scatterlist *orig_sg;
253 	struct scatterlist sg_single;
254   };
255 
/*
 * Forward declarations, so the structures below can hold pointers to
 * these types ahead of their full definitions.
 */
struct ib_conn;
struct iser_conn;
struct iser_device;
260 
261 /**
262  * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
263  *
264  * @iser_header:   iser header
265  * @iscsi_header:  iscsi header (bhs)
266  * @type:          command/control/dataout
267  * @dma_addr:      header buffer dma_address
268  * @tx_sg:         sg[0] points to iser/iscsi headers
269  *                 sg[1] optionally points to either of immediate data
270  *                 unsolicited data-out or control
271  * @num_sge:       number sges used on this TX task
272  * @mapped:        indicates if the descriptor is dma mapped
273  */
274 struct iser_tx_desc {
275 	struct iser_hdr              iser_header;
276 	struct iscsi_bhs             iscsi_header __attribute__((packed));
277 	enum   iser_desc_type        type;
278 	u64		             dma_addr;
279 	struct ib_sge		     tx_sg[2];
280 	int                          num_sge;
281 	bool                         mapped;
282 };
283 
284 #define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
285 					sizeof(u64) + sizeof(struct ib_sge)))
286 /**
287  * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
288  *
289  * @iser_header:   iser header
290  * @iscsi_header:  iscsi header
291  * @data:          received data segment
292  * @dma_addr:      receive buffer dma address
293  * @rx_sg:         ib_sge of receive buffer
294  * @pad:           for sense data TODO: Modify to maximum sense length supported
295  */
296 struct iser_rx_desc {
297 	struct iser_hdr              iser_header;
298 	struct iscsi_bhs             iscsi_header;
299 	char		             data[ISER_RECV_DATA_SEG_LEN];
300 	u64		             dma_addr;
301 	struct ib_sge		     rx_sg;
302 	char		             pad[ISER_RX_PAD_SIZE];
303 } __attribute__((packed));
304 
305 struct icl_iser_pdu {
306 	struct icl_pdu               icl_pdu;
307 	struct iser_tx_desc          desc;
308 	struct iser_conn             *iser_conn;
309 	enum iser_task_status        status;
310 	struct ccb_scsiio 			 *csio;
311 	int                          command_sent;
312 	int                          dir[ISER_DIRS_NUM];
313 	struct iser_mem_reg          rdma_reg[ISER_DIRS_NUM];
314 	struct iser_data_buf         data[ISER_DIRS_NUM];
315 };
316 
317 /**
318  * struct iser_comp - iSER completion context
319  *
320  * @device:     pointer to device handle
321  * @cq:         completion queue
322  * @wcs:        work completion array
323  * @tq:    	taskqueue handle
324  * @task:    	task to run task_fn
325  * @active_qps: Number of active QPs attached
326  *              to completion context
327  */
328 struct iser_comp {
329 	struct iser_device      *device;
330 	struct ib_cq		*cq;
331 	struct ib_wc		 wcs[ISER_WC_BATCH_COUNT];
332 	struct taskqueue        *tq;
333 	struct task             task;
334 	int                      active_qps;
335 };
336 
337 /**
338  * struct iser_device - iSER device handle
339  *
340  * @ib_device:     RDMA device
341  * @pd:            Protection Domain for this device
342  * @dev_attr:      Device attributes container
343  * @mr:            Global DMA memory region
344  * @event_handler: IB events handle routine
345  * @ig_list:	   entry in devices list
346  * @refcount:      Reference counter, dominated by open iser connections
347  * @comps_used:    Number of completion contexts used, Min between online
348  *                 cpus and device max completion vectors
349  * @comps:         Dinamically allocated array of completion handlers
350  */
351 struct iser_device {
352 	struct ib_device             *ib_device;
353 	struct ib_pd	             *pd;
354 	struct ib_device_attr	     dev_attr;
355 	struct ib_mr	             *mr;
356 	struct ib_event_handler      event_handler;
357 	struct list_head             ig_list;
358 	int                          refcount;
359 	int			     comps_used;
360 	struct iser_comp	     *comps;
361 };
362 
363 /**
364  * struct iser_reg_resources - Fast registration recources
365  *
366  * @mr:         memory region
367  * @frpl:       fast reg page list
368  * @mr_valid:   is mr valid indicator
369  */
370 struct iser_reg_resources {
371 	struct ib_mr                     *mr;
372 	struct ib_fast_reg_page_list     *frpl;
373 	u8                                mr_valid:1;
374 };
375 
376 /**
377  * struct fast_reg_descriptor - Fast registration descriptor
378  *
379  * @list:           entry in connection fastreg pool
380  * @rsc:            data buffer registration resources
381  */
382 struct fast_reg_descriptor {
383 	struct list_head		  list;
384 	struct iser_reg_resources	  rsc;
385 };
386 
387 
388 /**
389  * struct iser_beacon - beacon to signal all flush errors were drained
390  *
391  * @send:           send wr
392  * @recv:           recv wr
393  * @flush_lock:     protects flush_cv
394  * @flush_cv:       condition variable for beacon flush
395  */
396 struct iser_beacon {
397 	union {
398 		struct ib_send_wr	send;
399 		struct ib_recv_wr	recv;
400 	};
401 	struct mtx		     flush_lock;
402 	struct cv		     flush_cv;
403 };
404 
405 /**
406  * struct ib_conn - Infiniband related objects
407  *
408  * @cma_id:              rdma_cm connection maneger handle
409  * @qp:                  Connection Queue-pair
410  * @device:              reference to iser device
411  * @comp:                iser completion context
412   */
413 struct ib_conn {
414 	struct rdma_cm_id           *cma_id;
415 	struct ib_qp	            *qp;
416 	int                          post_recv_buf_count;
417 	u8                           sig_count;
418 	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
419 	struct iser_device          *device;
420 	struct iser_comp	    *comp;
421 	struct iser_beacon	     beacon;
422 	struct mtx               lock;
423 	union {
424 		struct {
425 			struct ib_fmr_pool      *pool;
426 			struct iser_page_vec	*page_vec;
427 		} fmr;
428 		struct {
429 			struct list_head	 pool;
430 			int			 pool_size;
431 		} fastreg;
432 	};
433 };
434 
435 struct iser_conn {
436 	struct icl_conn             icl_conn;
437 	struct ib_conn               ib_conn;
438 	struct cv                    up_cv;
439 	struct list_head             conn_list;
440 	struct sx		     		 state_mutex;
441 	enum iser_conn_state	     state;
442 	int		     				 qp_max_recv_dtos;
443 	int		     				 min_posted_rx;
444 	u16                          max_cmds;
445 	char  			     *login_buf;
446 	char			     *login_req_buf, *login_resp_buf;
447 	u64			     login_req_dma, login_resp_dma;
448 	unsigned int 		     rx_desc_head;
449 	struct iser_rx_desc	     *rx_descs;
450 	u32                          num_rx_descs;
451 	bool                         handoff_done;
452 };
453 
454 /**
455  * struct iser_global: iSER global context
456  *
457  * @device_list_mutex:    protects device_list
458  * @device_list:          iser devices global list
459  * @connlist_mutex:       protects connlist
460  * @connlist:             iser connections global list
461  * @desc_cache:           kmem cache for tx dataout
462  * @close_conns_mutex:    serializes conns closure
463  */
464 struct iser_global {
465 	struct sx        device_list_mutex;
466 	struct list_head  device_list;
467 	struct mtx        connlist_mutex;
468 	struct list_head  connlist;
469 	struct sx         close_conns_mutex;
470 };
471 
/* Debug verbosity consulted by the ISER_DBG/ISER_INFO/ISER_WARN macros. */
extern int iser_debug;

/* The iSER global context; defined outside this header. */
extern struct iser_global ig;
474 
475 void
476 iser_create_send_desc(struct iser_conn *, struct iser_tx_desc *);
477 
478 int
479 iser_post_recvl(struct iser_conn *);
480 
481 int
482 iser_post_recvm(struct iser_conn *, int);
483 
484 int
485 iser_alloc_login_buf(struct iser_conn *iser_conn);
486 
487 void
488 iser_free_login_buf(struct iser_conn *iser_conn);
489 
490 int
491 iser_post_send(struct ib_conn *, struct iser_tx_desc *, bool);
492 
493 void
494 iser_snd_completion(struct iser_tx_desc *, struct ib_conn *);
495 
496 void
497 iser_rcv_completion(struct iser_rx_desc *, unsigned long,
498 		    struct ib_conn *);
499 
/* Prototypes only; the definitions live outside this header. */
void	iser_pdu_free(struct icl_conn *, struct icl_pdu *);
struct icl_pdu *iser_new_pdu(struct icl_conn *ic, int flags);
int	iser_alloc_rx_descriptors(struct iser_conn *, int);
void	iser_free_rx_descriptors(struct iser_conn *);
int	iser_initialize_headers(struct icl_iser_pdu *, struct iser_conn *);
int	iser_send_control(struct iser_conn *, struct icl_iser_pdu *);
int	iser_send_command(struct iser_conn *, struct icl_iser_pdu *);
520 
521 int
522 iser_reg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);
523 
524 void
525 iser_unreg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);
526 
527 int
528 iser_create_fastreg_pool(struct ib_conn *, unsigned);
529 
530 void
531 iser_free_fastreg_pool(struct ib_conn *);
532 
533 int
534 iser_dma_map_task_data(struct icl_iser_pdu *,
535 		       struct iser_data_buf *, enum iser_data_dir,
536 		       enum dma_data_direction);
537 
538 int
539 iser_conn_terminate(struct iser_conn *);
540 
541 void
542 iser_free_ib_conn_res(struct iser_conn *, bool);
543 
544 void
545 iser_dma_unmap_task_data(struct icl_iser_pdu *, struct iser_data_buf *,
546 			 enum dma_data_direction);
547 
548 int
549 iser_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);
550 
551 #endif /* !ICL_ISER_H */
552