1 /* 2 * This file contains definitions imported from the OFED rds header ib.h. 3 * Oracle elects to have and use the contents of ib.h under and 4 * governed by the OpenIB.org BSD license. 5 */ 6 /* 7 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 8 */ 9 10 #ifndef _RDSV3_IB_H 11 #define _RDSV3_IB_H 12 13 #include <sys/rds.h> 14 #include <sys/ib/clients/rdsv3/rdsv3.h> 15 #include <sys/ib/clients/rdsv3/rdma_transport.h> 16 #include <sys/ib/clients/rdsv3/rdsv3_af_thr.h> 17 18 #define RDSV3_FMR_SIZE 256 19 #define RDSV3_FMR_POOL_SIZE (12 * 1024) 20 21 #define RDSV3_IB_MAX_SGE 8 22 #define RDSV3_IB_RECV_SGE 2 23 24 #define RDSV3_IB_DEFAULT_RECV_WR 1024 25 #define RDSV3_IB_DEFAULT_SEND_WR 256 26 27 #define RDSV3_IB_DEFAULT_RETRY_COUNT 2 28 29 /* minor versions supported */ 30 #define RDSV3_IB_SUPPORTED_PROTOCOLS 0x00000003 31 32 extern struct list rdsv3_ib_devices; 33 34 /* 35 * IB posts RDSV3_FRAG_SIZE fragments of pages to the receive queues to 36 * try and minimize the amount of memory tied up both the device and 37 * socket receive queues. 38 */ 39 /* page offset of the final full frag that fits in the page */ 40 #define RDSV3_PAGE_LAST_OFF \ 41 (((PAGE_SIZE / RDSV3_FRAG_SIZE) - 1) * RDSV3_FRAG_SIZE) 42 struct rdsv3_page_frag { 43 struct list_node f_item; 44 caddr_t f_page; 45 unsigned long f_offset; 46 ibt_wr_ds_t f_sge; 47 ibt_mi_hdl_t f_mapped; 48 }; 49 50 struct rdsv3_ib_incoming { 51 list_node_t ii_obj; /* list obj of rdsv3_inc_pool list */ 52 struct list ii_frags; 53 struct rdsv3_incoming ii_inc; 54 struct rdsv3_inc_pool *ii_pool; 55 struct rdsv3_ib_device *ii_ibdev; 56 }; 57 58 struct rdsv3_ib_connect_private { 59 /* Add new fields at the end, and don't permute existing fields. */ 60 uint32_be_t dp_saddr; 61 uint32_be_t dp_daddr; 62 uint8_t dp_protocol_major; 63 uint8_t dp_protocol_minor; 64 uint16_be_t dp_protocol_minor_mask; /* bitmask */ 65 uint32_be_t dp_reserved1; 66 uint32_be_t dp_ack_seq; 67 uint32_be_t dp_credit; /* non-zero enables flow ctl */ 68 }; 69 70 struct rdsv3_ib_send_work { 71 struct rdsv3_message *s_rm; 72 struct rdsv3_rdma_op *s_op; 73 ibt_wrc_opcode_t s_opcode; 74 unsigned long s_queued; 75 }; 76 77 struct rdsv3_ib_recv_work { 78 struct rdsv3_ib_incoming *r_ibinc; 79 struct rdsv3_page_frag *r_frag; 80 ibt_wr_ds_t r_sge[2]; 81 }; 82 83 struct rdsv3_ib_work_ring { 84 uint32_t w_nr; 85 uint32_t w_alloc_ptr; 86 uint32_t w_alloc_ctr; 87 uint32_t w_free_ptr; 88 atomic_t w_free_ctr; 89 rdsv3_wait_queue_t w_empty_wait; 90 }; 91 92 /* 93 * Rings are posted with all the allocations they'll need to queue the 94 * incoming message to the receiving socket so this can't fail. 95 * All fragments start with a header, so we can make sure we're not receiving 96 * garbage, and we can tell a small 8 byte fragment from an ACK frame. 97 */ 98 struct rdsv3_ib_ack_state { 99 uint64_t ack_next; 100 uint64_t ack_recv; 101 unsigned int ack_required:1; 102 unsigned int ack_next_valid:1; 103 unsigned int ack_recv_valid:1; 104 }; 105 106 struct rdsv3_ib_device; 107 108 struct rdsv3_ib_connection { 109 110 struct list_node ib_node; 111 boolean_t i_on_dev_list; 112 struct rdsv3_ib_device *rds_ibdev; 113 struct rdsv3_connection *conn; 114 115 /* alphabet soup, IBTA style */ 116 struct rdma_cm_id *i_cm_id; 117 struct ib_pd *i_pd; 118 struct rdsv3_hdrs_mr *i_mr; 119 struct ib_cq *i_cq; 120 struct ib_cq *i_snd_cq; 121 122 /* tx */ 123 struct rdsv3_ib_work_ring i_send_ring; 124 struct rdsv3_message *i_rm; 125 struct rdsv3_header *i_send_hdrs; 126 uint64_t i_send_hdrs_dma; 127 struct rdsv3_ib_send_work *i_sends; 128 ibt_send_wr_t *i_send_wrs; 129 130 /* soft CQ */ 131 rdsv3_af_thr_t *i_soft_cq; 132 rdsv3_af_thr_t *i_snd_soft_cq; 133 rdsv3_af_thr_t *i_refill_rq; 134 135 /* rx */ 136 struct mutex i_recv_mutex; 137 struct rdsv3_ib_work_ring i_recv_ring; 138 struct rdsv3_ib_incoming *i_ibinc; 139 uint32_t i_recv_data_rem; 140 struct rdsv3_header *i_recv_hdrs; 141 uint64_t i_recv_hdrs_dma; 142 struct rdsv3_ib_recv_work *i_recvs; 143 ibt_recv_wr_t *i_recv_wrs; 144 struct rdsv3_page_frag i_frag; 145 uint64_t i_ack_recv; /* last ACK received */ 146 147 /* sending acks */ 148 unsigned long i_ack_flags; 149 #ifdef KERNEL_HAS_ATOMIC64 150 atomic64_t i_ack_next; /* next ACK to send */ 151 #else 152 kmutex_t i_ack_lock; /* protect i_ack_next */ 153 uint64_t i_ack_next; /* next ACK to send */ 154 #endif 155 struct rdsv3_header *i_ack; 156 ibt_send_wr_t i_ack_wr; 157 ibt_wr_ds_t i_ack_sge; 158 uint64_t i_ack_dma; 159 unsigned long i_ack_queued; 160 161 /* 162 * Flow control related information 163 * 164 * Our algorithm uses a pair variables that we need to access 165 * atomically - one for the send credits, and one posted 166 * recv credits we need to transfer to remote. 167 * Rather than protect them using a slow spinlock, we put both into 168 * a single atomic_t and update it using cmpxchg 169 */ 170 atomic_t i_credits; 171 172 /* Protocol version specific information */ 173 unsigned int i_flowctl:1; /* enable/disable flow ctl */ 174 175 /* Batched completions */ 176 unsigned int i_unsignaled_wrs; 177 long i_unsignaled_bytes; 178 }; 179 180 /* This assumes that atomic_t is at least 32 bits */ 181 #define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) 182 #define IB_GET_POST_CREDITS(v) ((v) >> 16) 183 #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) 184 #define IB_SET_POST_CREDITS(v) ((v) << 16) 185 186 struct rdsv3_ib_ipaddr { 187 struct list_node list; 188 uint32_be_t ipaddr; 189 }; 190 191 struct rdsv3_ib_device { 192 struct list_node list; 193 struct list ipaddr_list; 194 struct list conn_list; 195 ib_device_t *dev; 196 struct ib_pd *pd; 197 struct kmem_cache *ib_frag_slab; 198 kmutex_t spinlock; /* protect the above */ 199 krwlock_t rwlock; /* protect paddr_list */ 200 unsigned int fmr_max_remaps; 201 unsigned int max_fmrs; 202 unsigned int fmr_message_size; 203 int max_sge; 204 unsigned int max_wrs; 205 unsigned int max_initiator_depth; 206 unsigned int max_responder_resources; 207 struct rdsv3_fmr_pool *fmr_pool; 208 struct rdsv3_inc_pool *inc_pool; 209 ibt_fmr_pool_hdl_t fmr_pool_hdl; 210 ibt_hca_attr_t hca_attr; 211 rdsv3_af_thr_t *fmr_soft_cq; 212 rdsv3_af_thr_t *inc_soft_cq; 213 ibt_hca_hdl_t ibt_hca_hdl; 214 rdsv3_af_grp_t *aft_hcagp; 215 }; 216 217 /* bits for i_ack_flags */ 218 #define IB_ACK_IN_FLIGHT 0 219 #define IB_ACK_REQUESTED 1 220 221 #define RDSV3_IB_SEND_OP (1ULL << 63) 222 223 /* Magic WR_ID for ACKs */ 224 #define RDSV3_IB_ACK_WR_ID (~(uint64_t)0) 225 226 struct rdsv3_ib_statistics { 227 uint64_t s_ib_connect_raced; 228 uint64_t s_ib_listen_closed_stale; 229 uint64_t s_ib_evt_handler_call; 230 uint64_t s_ib_tasklet_call; 231 uint64_t s_ib_tx_cq_event; 232 uint64_t s_ib_tx_ring_full; 233 uint64_t s_ib_tx_throttle; 234 uint64_t s_ib_tx_sg_mapping_failure; 235 uint64_t s_ib_tx_stalled; 236 uint64_t s_ib_tx_credit_updates; 237 uint64_t s_ib_rx_cq_event; 238 uint64_t s_ib_rx_ring_empty; 239 uint64_t s_ib_rx_refill_from_cq; 240 uint64_t s_ib_rx_refill_from_thread; 241 uint64_t s_ib_rx_alloc_limit; 242 uint64_t s_ib_rx_credit_updates; 243 uint64_t s_ib_ack_sent; 244 uint64_t s_ib_ack_send_failure; 245 uint64_t s_ib_ack_send_delayed; 246 uint64_t s_ib_ack_send_piggybacked; 247 uint64_t s_ib_ack_received; 248 uint64_t s_ib_rdma_mr_alloc; 249 uint64_t s_ib_rdma_mr_free; 250 uint64_t s_ib_rdma_mr_used; 251 uint64_t s_ib_rdma_mr_pool_flush; 252 uint64_t s_ib_rdma_mr_pool_wait; 253 uint64_t s_ib_rdma_mr_pool_depleted; 254 }; 255 256 extern struct rdsv3_workqueue_struct_s *rds_ib_wq; 257 258 /* ib.c */ 259 extern struct rdsv3_transport rdsv3_ib_transport; 260 extern void rdsv3_ib_add_one(ib_device_t *device); 261 extern void rdsv3_ib_remove_one(ib_device_t *device); 262 extern struct ib_client rdsv3_ib_client; 263 264 extern unsigned int fmr_pool_size; 265 extern unsigned int fmr_message_size; 266 extern unsigned int rdsv3_ib_retry_count; 267 268 extern kmutex_t ib_nodev_conns_lock; 269 extern struct list ib_nodev_conns; 270 271 /* ib_cm.c */ 272 int rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp); 273 void rdsv3_ib_conn_free(void *arg); 274 int rdsv3_ib_conn_connect(struct rdsv3_connection *conn); 275 void rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn); 276 void rdsv3_conn_drop(struct rdsv3_connection *conn); 277 int rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 278 struct rdma_cm_event *event); 279 int rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); 280 void rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, 281 struct rdma_cm_event *event); 282 void rdsv3_ib_tasklet_fn(void *data); 283 void rdsv3_ib_snd_tasklet_fn(void *data); 284 void rdsv3_ib_refill_fn(void *data); 285 286 /* ib_rdma.c */ 287 int rdsv3_ib_update_ipaddr(struct rdsv3_ib_device *rds_ibdev, 288 uint32_be_t ipaddr); 289 void rdsv3_ib_add_conn(struct rdsv3_ib_device *rds_ibdev, 290 struct rdsv3_connection *conn); 291 void rdsv3_ib_remove_conn(struct rdsv3_ib_device *rds_ibdev, 292 struct rdsv3_connection *conn); 293 void __rdsv3_ib_destroy_conns(struct list *list, kmutex_t *list_lock); 294 static inline void rdsv3_ib_destroy_nodev_conns(void) 295 { 296 __rdsv3_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock); 297 } 298 static inline void rdsv3_ib_destroy_conns(struct rdsv3_ib_device *rds_ibdev) 299 { 300 __rdsv3_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock); 301 } 302 303 int rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *); 304 void rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *); 305 void rdsv3_ib_get_mr_info(struct rdsv3_ib_device *rds_ibdev, 306 struct rds_info_rdma_connection *iinfo); 307 void *rdsv3_ib_get_mr(struct rds_iovec *args, unsigned long nents, 308 struct rdsv3_sock *rs, uint32_t *key_ret); 309 void rdsv3_ib_sync_mr(void *trans_private, int dir); 310 void rdsv3_ib_free_mr(void *trans_private, int invalidate); 311 void rdsv3_ib_flush_mrs(void); 312 void rdsv3_ib_drain_mrlist_fn(void *data); 313 314 /* ib_recv.c */ 315 int rdsv3_ib_recv_init(void); 316 void rdsv3_ib_recv_exit(void); 317 int rdsv3_ib_recv(struct rdsv3_connection *conn); 318 int rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill); 319 void rdsv3_ib_inc_free(struct rdsv3_incoming *inc); 320 int rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop, 321 size_t size); 322 void rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc, 323 struct rdsv3_ib_ack_state *state); 324 void rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic); 325 void rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic); 326 void rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic); 327 void rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic); 328 void rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic); 329 uint64_t rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic); 330 void rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq, 331 int ack_required); 332 int rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *); 333 void rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *); 334 void rdsv3_ib_drain_inclist(void *); 335 336 /* ib_ring.c */ 337 void rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr); 338 void rdsv3_ib_ring_resize(struct rdsv3_ib_work_ring *ring, uint32_t nr); 339 uint32_t rdsv3_ib_ring_alloc(struct rdsv3_ib_work_ring *ring, uint32_t val, 340 uint32_t *pos); 341 void rdsv3_ib_ring_free(struct rdsv3_ib_work_ring *ring, uint32_t val); 342 void rdsv3_ib_ring_unalloc(struct rdsv3_ib_work_ring *ring, uint32_t val); 343 int rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring); 344 int rdsv3_ib_ring_low(struct rdsv3_ib_work_ring *ring); 345 uint32_t rdsv3_ib_ring_oldest(struct rdsv3_ib_work_ring *ring); 346 uint32_t rdsv3_ib_ring_completed(struct rdsv3_ib_work_ring *ring, 347 uint32_t wr_id, uint32_t oldest); 348 349 /* ib_send.c */ 350 void rdsv3_ib_xmit_complete(struct rdsv3_connection *conn); 351 int rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, 352 unsigned int hdr_off, unsigned int sg, unsigned int off); 353 void rdsv3_ib_send_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc); 354 void rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic); 355 void rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic); 356 int rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op); 357 void rdsv3_ib_send_add_credits(struct rdsv3_connection *conn, 358 unsigned int credits); 359 void rdsv3_ib_advertise_credits(struct rdsv3_connection *conn, 360 unsigned int posted); 361 int rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic, uint32_t wanted, 362 uint32_t *adv_credits, int need_posted); 363 364 /* ib_stats.c */ 365 extern struct rdsv3_ib_statistics *rdsv3_ib_stats; 366 #define rdsv3_ib_stats_inc(member) \ 367 rdsv3_stats_add_which(rdsv3_ib_stats, member, 1) 368 unsigned int rdsv3_ib_stats_info_copy(struct rdsv3_info_iterator *iter, 369 unsigned int avail); 370 371 /* ib_sysctl.c */ 372 int rdsv3_ib_sysctl_init(void); 373 void rdsv3_ib_sysctl_exit(void); 374 extern unsigned long rdsv3_ib_sysctl_max_send_wr; 375 extern unsigned long rdsv3_ib_sysctl_max_recv_wr; 376 extern unsigned long rdsv3_ib_sysctl_max_unsig_wrs; 377 extern unsigned long rdsv3_ib_sysctl_max_unsig_bytes; 378 extern unsigned long rdsv3_ib_sysctl_max_recv_allocation; 379 extern unsigned int rdsv3_ib_sysctl_flow_control; 380 381 #endif /* _RDSV3_IB_H */ 382