xref: /freebsd/sys/dev/iser/icl_iser.h (revision 2008043f386721d58158e37e0d7e50df8095942d)
1 /*-
2  * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 #ifndef ICL_ISER_H
27 #define ICL_ISER_H
28 
29 /*
30  * iSCSI Common Layer for RDMA.
31  */
32 
33 #include <sys/cdefs.h>
34 #include <sys/param.h>
35 #include <sys/capsicum.h>
36 #include <sys/condvar.h>
37 #include <sys/conf.h>
38 #include <sys/file.h>
39 #include <sys/kernel.h>
40 #include <sys/kthread.h>
41 #include <sys/lock.h>
42 #include <sys/mbuf.h>
43 #include <sys/mutex.h>
44 #include <sys/module.h>
45 #include <sys/protosw.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/systm.h>
50 #include <sys/sx.h>
51 #include <sys/uio.h>
52 #include <sys/taskqueue.h>
53 #include <sys/bio.h>
54 #include <vm/uma.h>
55 #include <netinet/in.h>
56 #include <netinet/tcp.h>
57 #include <dev/iscsi/icl.h>
58 #include <dev/iscsi/iscsi_proto.h>
59 #include <icl_conn_if.h>
60 #include <cam/cam.h>
61 #include <cam/cam_ccb.h>
62 #include <rdma/ib_verbs.h>
63 #include <rdma/ib_fmr_pool.h>
64 #include <rdma/rdma_cm.h>
65 
66 
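/*
 * Verbosity of the logging macros below is selected by the iser_debug
 * variable declared near the end of this header: errors always print,
 * warnings print when iser_debug > 0, info when iser_debug > 1 and
 * debug when iser_debug > 2.
 */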
67 #define	ISER_DBG(X, ...)						\
68 	do {								\
69 		if (unlikely(iser_debug > 2))				\
70 			printf("DEBUG: %s: " X "\n",			\
71 				__func__, ## __VA_ARGS__);		\
72 	} while (0)
73 
74 #define	ISER_INFO(X, ...)						\
75 	do {								\
76 		if (unlikely(iser_debug > 1))				\
77 			printf("INFO: %s: " X "\n",			\
78 				__func__, ## __VA_ARGS__);		\
79 	} while (0)
80 
81 #define	ISER_WARN(X, ...)						\
82 	do {								\
83 		if (unlikely(iser_debug > 0)) {				\
84 			printf("WARNING: %s: " X "\n",			\
85 				__func__, ## __VA_ARGS__);		\
86 		}							\
87 	} while (0)
88 
89 #define	ISER_ERR(X, ...) 						\
90 	printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__)
91 
92 #define ISER_VER			0x10
93 #define ISER_WSV			0x08
94 #define ISER_RSV			0x04
95 
96 #define ISER_FASTREG_LI_WRID		0xffffffffffffffffULL
97 #define ISER_BEACON_WRID		0xfffffffffffffffeULL
98 
99 #define SHIFT_4K	12
100 #define SIZE_4K	(1ULL << SHIFT_4K)
101 #define MASK_4K	(~(SIZE_4K-1))
102 
103 /* support up to 512KB in one RDMA */
104 #define ISCSI_ISER_SG_TABLESIZE         (0x80000 >> SHIFT_4K)
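/* i.e. 512KB / 4KB = 128 scatter-gather entries */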
105 #define ISER_DEF_XMIT_CMDS_MAX 256
106 
107 /* the max RX (recv) WR supported by the iSER QP is defined by                 *
108  * max_recv_wr = commands_max + recv_beacon                                    */
109 #define ISER_QP_MAX_RECV_DTOS  (ISER_DEF_XMIT_CMDS_MAX + 1)
110 #define ISER_MIN_POSTED_RX		(ISER_DEF_XMIT_CMDS_MAX >> 2)
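/*
 * For illustration, with the default ISER_DEF_XMIT_CMDS_MAX of 256 this
 * works out to 257 recv WRs (256 commands + 1 recv beacon) and a minimum
 * posted RX level of 64.
 */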
111 
112 /* QP settings */
113 /* Maximal bounds on received asynchronous PDUs */
114 #define ISER_MAX_RX_MISC_PDUS           4 /* NOOP_IN(2), ASYNC_EVENT(2) */
115 #define ISER_MAX_TX_MISC_PDUS           6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */
116 
117 /* The max TX (send) WR supported by the iSER QP is defined by                 *
118  * max_send_wr = T * (1 + D) + C, where T is the max outstanding SCSI commands,*
119  * D the max inflight dataouts expected per command and C the misc. PDUs. The  *
120  * tx posting & completion handling code supports an -EAGAIN scheme where tx   *
121  * is suspended till the QP has room for more send WRs. D=8 comes from 64K/8K  */
122 
123 #define ISER_INFLIGHT_DATAOUTS		8
124 
125 /* the send_beacon increases max_send_wr by 1 */
126 #define ISER_QP_MAX_REQ_DTOS		(ISER_DEF_XMIT_CMDS_MAX *    \
127 					(1 + ISER_INFLIGHT_DATAOUTS) + \
128 					ISER_MAX_TX_MISC_PDUS        + \
129 					ISER_MAX_RX_MISC_PDUS + 1)
130 
131 #define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr			\
132 					 - ISER_MAX_TX_MISC_PDUS	\
133 					 - ISER_MAX_RX_MISC_PDUS - 1) /	\
134 					 (1 + ISER_INFLIGHT_DATAOUTS))
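/*
 * For illustration, with the defaults above (T = ISER_DEF_XMIT_CMDS_MAX = 256,
 * D = ISER_INFLIGHT_DATAOUTS = 8, 6 TX misc PDUs, 4 RX misc PDUs and one
 * beacon): ISER_QP_MAX_REQ_DTOS = 256 * 9 + 6 + 4 + 1 = 2315 and
 * ISER_GET_MAX_XMIT_CMDS(2315) = (2315 - 6 - 4 - 1) / 9 = 256.
 */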
135 
136 #define ISER_WC_BATCH_COUNT   16
137 #define ISER_SIGNAL_CMD_COUNT 32
138 
139 /* Maximal number of QPs recommended per CQ. Using more QPs per CQ may lead   *
140  * to a CQ overrun state.                                                      */
141 #define ISCSI_ISER_MAX_CONN	8
142 #define ISER_MAX_RX_LEN		(ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
143 #define ISER_MAX_TX_LEN		(ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
144 #define ISER_MAX_CQ_LEN		(ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
145 				 ISCSI_ISER_MAX_CONN)
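/*
 * With the defaults this amounts to ISER_MAX_RX_LEN = 257 * 8 = 2056,
 * ISER_MAX_TX_LEN = 2315 * 8 = 18520 and ISER_MAX_CQ_LEN = 20584 CQ entries.
 */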
146 
147 #define ISER_ZBVA_NOT_SUPPORTED                0x80
148 #define ISER_SEND_W_INV_NOT_SUPPORTED	0x40
149 
150 #define	ISCSI_DEF_MAX_RECV_SEG_LEN	8192
151 #define	ISCSI_OPCODE_MASK		0x3f
152 
153 #define icl_to_iser_conn(ic) \
154 	container_of(ic, struct iser_conn, icl_conn)
155 #define icl_to_iser_pdu(ip) \
156 	container_of(ip, struct icl_iser_pdu, icl_pdu)
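/*
 * Usage sketch (hypothetical caller): code that is handed the embedded
 * struct icl_conn or struct icl_pdu can recover the enclosing iSER objects,
 * e.g.:
 *
 *	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
 *	struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
 */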
157 
158 /**
159  * struct iser_hdr - iSER header
160  *
161  * @flags:        flags support (zbva, remote_inv)
162  * @rsvd:         reserved
163  * @write_stag:   write rkey
164  * @write_va:     write virtual address
165  * @read_stag:    read rkey
166  * @read_va:      read virtual address
167  */
168 struct iser_hdr {
169 	u8      flags;
170 	u8      rsvd[3];
171 	__be32  write_stag;
172 	__be64  write_va;
173 	__be32  read_stag;
174 	__be64  read_va;
175 } __attribute__((packed));
176 
177 struct iser_cm_hdr {
178 	u8      flags;
179 	u8      rsvd[3];
180 } __packed;
181 
182 /* Constant PDU lengths calculations */
183 #define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + ISCSI_BHS_SIZE)
184 
185 #define ISER_RECV_DATA_SEG_LEN	128
186 #define ISER_RX_PAYLOAD_SIZE	(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
187 
188 #define ISER_RX_LOGIN_SIZE	(ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
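/*
 * For reference, assuming a 28-byte packed struct iser_hdr and a 48-byte
 * iSCSI BHS: ISER_HEADERS_LEN = 76, ISER_RX_PAYLOAD_SIZE = 204 and
 * ISER_RX_LOGIN_SIZE = 8268 bytes.
 */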
189 
190 enum iser_conn_state {
191 	ISER_CONN_INIT,		   /* descriptor allocd, no conn          */
192 	ISER_CONN_PENDING,	   /* in the process of being established */
193 	ISER_CONN_UP,		   /* up and running                      */
194 	ISER_CONN_TERMINATING,	   /* in the process of being terminated  */
195 	ISER_CONN_DOWN,		   /* shut down                           */
196 	ISER_CONN_STATES_NUM
197 };
198 
199 enum iser_task_status {
200 	ISER_TASK_STATUS_INIT = 0,
201 	ISER_TASK_STATUS_STARTED,
202 	ISER_TASK_STATUS_COMPLETED
203 };
204 
205 enum iser_data_dir {
206 	ISER_DIR_IN = 0,	   /* to initiator */
207 	ISER_DIR_OUT,		   /* from initiator */
208 	ISER_DIRS_NUM
209 };
210 
211 /**
212  * struct iser_mem_reg - iSER memory registration info
213  *
214  * @sge:          memory region sg element
215  * @rkey:         memory region remote key
216  * @mem_h:        pointer to registration context (FMR/Fastreg)
217  */
218 struct iser_mem_reg {
219 	struct ib_sge	 sge;
220 	u32		 rkey;
221 	void		*mem_h;
222 };
223 
224 enum iser_desc_type {
225 	ISCSI_TX_CONTROL,
226 	ISCSI_TX_SCSI_COMMAND,
227 	ISCSI_TX_DATAOUT
228 };
229 
230 /**
231  * struct iser_data_buf - iSER data buffer
232  *
233  * @sg:           pointer to the sg list
234  * @size:         num entries of this sg
235  * @data_len:     total buffer byte len
236  * @dma_nents:    returned by dma_map_sg
237  * @copy_buf:     copy buffer allocated for SG lists that are
238  *                unaligned for rdma and must be copied
239  * @orig_sg:      pointer to the original sg list (in case
240  *                we used a copy)
241  * @sg_single:    SG-ified clone of a non-SG SCSI command buffer
242  *                or of an unaligned SG list
243  */
244 struct iser_data_buf {
245 	struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE];
246 	void               *sg;
247 	int                size;
248 	unsigned long      data_len;
249 	unsigned int       dma_nents;
250 	char               *copy_buf;
251 	struct scatterlist *orig_sg;
252 	struct scatterlist sg_single;
253 };
254 
255 /* fwd declarations */
256 struct iser_conn;
257 struct ib_conn;
258 struct iser_device;
259 
260 /**
261  * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
262  *
263  * @iser_header:   iser header
264  * @iscsi_header:  iscsi header (bhs)
265  * @type:          command/control/dataout
266  * @dma_addr:      header buffer dma_address
267  * @tx_sg:         sg[0] points to iser/iscsi headers
268  *                 sg[1] optionally points to either immediate data,
269  *                 unsolicited data-out or control PDU data
270  * @num_sge:       number of sges used on this TX task
271  * @mapped:        indicates if the descriptor is dma mapped
272  */
273 struct iser_tx_desc {
274 	struct iser_hdr              iser_header;
275 	struct iscsi_bhs             iscsi_header __attribute__((packed));
276 	enum   iser_desc_type        type;
277 	u64		             dma_addr;
278 	struct ib_sge		     tx_sg[2];
279 	int                          num_sge;
280 	bool                         mapped;
281 };
282 
283 #define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
284 					sizeof(u64) + sizeof(struct ib_sge)))
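/*
 * With the sizes above (a 204-byte RX payload, an 8-byte dma_addr and an
 * assumed 16-byte struct ib_sge) the pad is 28 bytes, which sizes the
 * packed struct iser_rx_desc below to 256 bytes.
 */
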
285 /**
286  * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
287  *
288  * @iser_header:   iser header
289  * @iscsi_header:  iscsi header
290  * @data:          received data segment
291  * @dma_addr:      receive buffer dma address
292  * @rx_sg:         ib_sge of receive buffer
293  * @pad:           for sense data TODO: Modify to maximum sense length supported
294  */
295 struct iser_rx_desc {
296 	struct iser_hdr              iser_header;
297 	struct iscsi_bhs             iscsi_header;
298 	char		             data[ISER_RECV_DATA_SEG_LEN];
299 	u64		             dma_addr;
300 	struct ib_sge		     rx_sg;
301 	char		             pad[ISER_RX_PAD_SIZE];
302 } __attribute__((packed));
303 
304 struct icl_iser_pdu {
305 	struct icl_pdu               icl_pdu;
306 	struct iser_tx_desc          desc;
307 	struct iser_conn             *iser_conn;
308 	enum iser_task_status        status;
309 	struct ccb_scsiio            *csio;
310 	int                          command_sent;
311 	int                          dir[ISER_DIRS_NUM];
312 	struct iser_mem_reg          rdma_reg[ISER_DIRS_NUM];
313 	struct iser_data_buf         data[ISER_DIRS_NUM];
314 };
315 
316 /**
317  * struct iser_comp - iSER completion context
318  *
319  * @device:     pointer to device handle
320  * @cq:         completion queue
321  * @wcs:        work completion array
322  * @tq:    	taskqueue handle
323  * @task:    	task to run task_fn
324  * @active_qps: Number of active QPs attached
325  *              to completion context
326  */
327 struct iser_comp {
328 	struct iser_device      *device;
329 	struct ib_cq		*cq;
330 	struct ib_wc		 wcs[ISER_WC_BATCH_COUNT];
331 	struct taskqueue        *tq;
332 	struct task             task;
333 	int                      active_qps;
334 };
335 
336 /**
337  * struct iser_device - iSER device handle
338  *
339  * @ib_device:     RDMA device
340  * @pd:            Protection Domain for this device
341  * @dev_attr:      Device attributes container
342  * @mr:            Global DMA memory region
343  * @event_handler: IB events handle routine
344  * @ig_list:	   entry in devices list
345  * @refcount:      Reference counter, driven by open iser connections
346  * @comps_used:    Number of completion contexts used; the minimum of
347  *                 online CPUs and the device's max completion vectors
348  * @comps:         Dynamically allocated array of completion handlers
349  */
350 struct iser_device {
351 	struct ib_device             *ib_device;
352 	struct ib_pd	             *pd;
353 	struct ib_device_attr	     dev_attr;
354 	struct ib_mr	             *mr;
355 	struct ib_event_handler      event_handler;
356 	struct list_head             ig_list;
357 	int                          refcount;
358 	int			     comps_used;
359 	struct iser_comp	     *comps;
360 };
361 
362 /**
363  * struct iser_reg_resources - Fast registration resources
364  *
365  * @mr:         memory region
366  * @mr_valid:   is mr valid indicator
367  */
368 struct iser_reg_resources {
369 	struct ib_mr                     *mr;
370 	u8                                mr_valid:1;
371 };
372 
373 /**
374  * struct fast_reg_descriptor - Fast registration descriptor
375  *
376  * @list:           entry in connection fastreg pool
377  * @rsc:            data buffer registration resources
378  */
379 struct fast_reg_descriptor {
380 	struct list_head		  list;
381 	struct iser_reg_resources	  rsc;
382 };
383 
384 
385 /**
386  * struct iser_beacon - beacon to signal all flush errors were drained
387  *
388  * @send:           send wr
389  * @recv:           recv wr
390  * @flush_lock:     protects flush_cv
391  * @flush_cv:       condition variable for beacon flush
392  */
393 struct iser_beacon {
394 	union {
395 		struct ib_send_wr	send;
396 		struct ib_recv_wr	recv;
397 	};
398 	struct mtx		     flush_lock;
399 	struct cv		     flush_cv;
400 };
401 
402 /**
403  * struct ib_conn - Infiniband related objects
404  *
405  * @cma_id:              rdma_cm connection manager handle
406  * @qp:                  Connection Queue-pair
407  * @device:              reference to iser device
408  * @comp:                iser completion context
409   */
410 struct ib_conn {
411 	struct rdma_cm_id           *cma_id;
412 	struct ib_qp	            *qp;
413 	int                          post_recv_buf_count;
414 	u8                           sig_count;
415 	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
416 	struct iser_device          *device;
417 	struct iser_comp	    *comp;
418 	struct iser_beacon	     beacon;
419 	struct mtx               lock;
420 	union {
421 		struct {
422 			struct ib_fmr_pool      *pool;
423 			struct iser_page_vec	*page_vec;
424 		} fmr;
425 		struct {
426 			struct list_head	 pool;
427 			int			 pool_size;
428 		} fastreg;
429 	};
430 };
431 
432 struct iser_conn {
433 	struct icl_conn             icl_conn;
434 	struct ib_conn               ib_conn;
435 	struct cv                    up_cv;
436 	struct list_head             conn_list;
437 	struct sx                    state_mutex;
438 	enum iser_conn_state	     state;
439 	int                          qp_max_recv_dtos;
440 	int                          min_posted_rx;
441 	u16                          max_cmds;
442 	char  			     *login_buf;
443 	char			     *login_req_buf, *login_resp_buf;
444 	u64			     login_req_dma, login_resp_dma;
445 	unsigned int 		     rx_desc_head;
446 	struct iser_rx_desc	     *rx_descs;
447 	u32                          num_rx_descs;
448 	bool                         handoff_done;
449 };
450 
451 /**
452  * struct iser_global: iSER global context
453  *
454  * @device_list_mutex:    protects device_list
455  * @device_list:          iser devices global list
456  * @connlist_mutex:       protects connlist
457  * @connlist:             iser connections global list
458  * @desc_cache:           kmem cache for tx dataout
459  * @close_conns_mutex:    serializes conns closure
460  */
461 struct iser_global {
462 	struct sx        device_list_mutex;
463 	struct list_head  device_list;
464 	struct mtx        connlist_mutex;
465 	struct list_head  connlist;
466 	struct sx         close_conns_mutex;
467 };
468 
469 extern struct iser_global ig;
470 extern int iser_debug;
471 
472 void
473 iser_create_send_desc(struct iser_conn *, struct iser_tx_desc *);
474 
475 int
476 iser_post_recvl(struct iser_conn *);
477 
478 int
479 iser_post_recvm(struct iser_conn *, int);
480 
481 int
482 iser_alloc_login_buf(struct iser_conn *iser_conn);
483 
484 void
485 iser_free_login_buf(struct iser_conn *iser_conn);
486 
487 int
488 iser_post_send(struct ib_conn *, struct iser_tx_desc *, bool);
489 
490 void
491 iser_snd_completion(struct iser_tx_desc *, struct ib_conn *);
492 
493 void
494 iser_rcv_completion(struct iser_rx_desc *, unsigned long,
495 		    struct ib_conn *);
496 
497 void
498 iser_pdu_free(struct icl_conn *, struct icl_pdu *);
499 
500 struct icl_pdu *
501 iser_new_pdu(struct icl_conn *ic, int flags);
502 
503 int
504 iser_alloc_rx_descriptors(struct iser_conn *, int);
505 
506 void
507 iser_free_rx_descriptors(struct iser_conn *);
508 
509 int
510 iser_initialize_headers(struct icl_iser_pdu *, struct iser_conn *);
511 
512 int
513 iser_send_control(struct iser_conn *, struct icl_iser_pdu *);
514 
515 int
516 iser_send_command(struct iser_conn *, struct icl_iser_pdu *);
517 
518 int
519 iser_reg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);
520 
521 void
522 iser_unreg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);
523 
524 int
525 iser_create_fastreg_pool(struct ib_conn *, unsigned);
526 
527 void
528 iser_free_fastreg_pool(struct ib_conn *);
529 
530 int
531 iser_dma_map_task_data(struct icl_iser_pdu *,
532 		       struct iser_data_buf *, enum iser_data_dir,
533 		       enum dma_data_direction);
534 
535 int
536 iser_conn_terminate(struct iser_conn *);
537 
538 void
539 iser_free_ib_conn_res(struct iser_conn *, bool);
540 
541 void
542 iser_dma_unmap_task_data(struct icl_iser_pdu *, struct iser_data_buf *,
543 			 enum dma_data_direction);
544 
545 int
546 iser_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);
547 
548 #endif /* !ICL_ISER_H */
549