/*
 * This file contains definitions imported from the OFED rds header ib.h.
 * Oracle elects to have and use the contents of ib.h under and
 * governed by the OpenIB.org BSD license.
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#ifndef _RDSV3_IB_H
#define	_RDSV3_IB_H

#include <sys/rds.h>
#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/rdma_transport.h>
#include <sys/ib/clients/rdsv3/rdsv3_af_thr.h>

#define	RDSV3_FMR_SIZE		256
#define	RDSV3_FMR_POOL_SIZE	(12 * 1024)

#define	RDSV3_IB_MAX_SGE	8
#define	RDSV3_IB_RECV_SGE	2

#define	RDSV3_IB_DEFAULT_RECV_WR	1024
#define	RDSV3_IB_DEFAULT_SEND_WR	256

#define	RDSV3_IB_DEFAULT_RETRY_COUNT	2

/* bitmask of supported minor protocol versions */
#define	RDSV3_IB_SUPPORTED_PROTOCOLS	0x00000003

#define	RDSV3_IB_MAX_RECV_ALLOC	((512 * 1024 * 1024) / RDSV3_FRAG_SIZE)
#define	RDSV3_IB_WC_POLL_SIZE	16

extern struct list rdsv3_ib_devices;

/*
 * IB posts RDSV3_FRAG_SIZE fragments of pages to the receive queues to
 * try to minimize the amount of memory tied up in both the device and
 * socket receive queues.
 */
/* page offset of the final full frag that fits in the page */
#define	RDSV3_PAGE_LAST_OFF \
	(((PAGE_SIZE / RDSV3_FRAG_SIZE) - 1) * RDSV3_FRAG_SIZE)

struct rdsv3_page_frag {
	struct list_node	f_item;
	caddr_t			f_page;
	unsigned long		f_offset;
	ibt_wr_ds_t		f_sge;
	ibt_mi_hdl_t		f_mapped;
};

struct rdsv3_ib_incoming {
	list_node_t		ii_obj;	/* list obj of rdsv3_inc_pool list */
	struct list		ii_frags;
	struct rdsv3_incoming	ii_inc;
	struct rdsv3_inc_pool	*ii_pool;
	struct rdsv3_ib_device	*ii_ibdev;
};

struct rdsv3_ib_connect_private {
	/* Add new fields at the end, and don't permute existing fields. */
	uint32_be_t	dp_saddr;
	uint32_be_t	dp_daddr;
	uint8_t		dp_protocol_major;
	uint8_t		dp_protocol_minor;
	uint16_be_t	dp_protocol_minor_mask;	/* bitmask */
	uint32_be_t	dp_reserved1;
	uint32_be_t	dp_ack_seq;
	uint32_be_t	dp_credit;	/* non-zero enables flow ctl */
};

struct rdsv3_ib_send_work {
	struct rdsv3_message	*s_rm;
	struct rdsv3_rdma_op	*s_op;
	ibt_wrc_opcode_t	s_opcode;
	unsigned long		s_queued;
};

struct rdsv3_ib_recv_work {
	struct rdsv3_ib_incoming	*r_ibinc;
	struct rdsv3_page_frag		*r_frag;
	ibt_wr_ds_t			r_sge[2];
};

struct rdsv3_ib_work_ring {
	uint32_t		w_nr;
	uint32_t		w_alloc_ptr;
	uint32_t		w_alloc_ctr;
	uint32_t		w_free_ptr;
	atomic_t		w_free_ctr;
	rdsv3_wait_queue_t	w_empty_wait;
};

/*
 * Rings are posted with all the allocations they'll need to queue the
 * incoming message to the receiving socket, so this can't fail.
 * All fragments start with a header, so we can make sure we're not receiving
 * garbage, and we can tell a small 8-byte fragment from an ACK frame.
 */
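/*
 * Illustrative sketch, not part of the original header: one plausible way
 * the ack state below is filled in while draining a receive CQ.  The poll
 * loop itself is an assumption made for illustration; only
 * rdsv3_ib_recv_cqe_handler(), rdsv3_ib_set_ack() and
 * rdsv3_ib_attempt_ack() are declared later in this file.
 *
 *	struct rdsv3_ib_ack_state state = { 0 };
 *	ibt_wc_t wc[RDSV3_IB_WC_POLL_SIZE];
 *	uint_t i, npolled;
 *
 *	while (ibt_poll_cq(cq_hdl, wc, RDSV3_IB_WC_POLL_SIZE, &npolled) ==
 *	    IBT_SUCCESS) {
 *		for (i = 0; i < npolled; i++)
 *			rdsv3_ib_recv_cqe_handler(ic, &wc[i], &state);
 *	}
 *	if (state.ack_next_valid)
 *		rdsv3_ib_set_ack(ic, state.ack_next, state.ack_required);
 *	rdsv3_ib_attempt_ack(ic);
 */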
struct rdsv3_ib_ack_state {
	uint64_t	ack_next;
	uint64_t	ack_recv;
	unsigned int	ack_required:1;
	unsigned int	ack_next_valid:1;
	unsigned int	ack_recv_valid:1;
};

struct rdsv3_ib_device;

struct rdsv3_ib_connection {

	struct list_node	ib_node;
	boolean_t		i_on_dev_list;
	struct rdsv3_ib_device	*rds_ibdev;
	struct rdsv3_connection	*conn;

	/* alphabet soup, IBTA style */
	struct rdma_cm_id	*i_cm_id;
	struct ib_pd		*i_pd;
	struct rdsv3_hdrs_mr	*i_mr;
	struct ib_cq		*i_cq;
	struct ib_cq		*i_snd_cq;

	/* tx */
	struct rdsv3_ib_work_ring	i_send_ring;
	struct rdsv3_message		*i_rm;
	struct rdsv3_header		*i_send_hdrs;
	uint64_t			i_send_hdrs_dma;
	struct rdsv3_ib_send_work	*i_sends;
	ibt_send_wr_t			*i_send_wrs;

	/* soft CQ */
	rdsv3_af_thr_t		*i_soft_cq;
	rdsv3_af_thr_t		*i_snd_soft_cq;
	rdsv3_af_thr_t		*i_refill_rq;

	/* rx */
	struct mutex			i_recv_mutex;
	struct rdsv3_ib_work_ring	i_recv_ring;
	struct rdsv3_ib_incoming	*i_ibinc;
	uint32_t			i_recv_data_rem;
	struct rdsv3_header		*i_recv_hdrs;
	uint64_t			i_recv_hdrs_dma;
	struct rdsv3_ib_recv_work	*i_recvs;
	ibt_recv_wr_t			*i_recv_wrs;
	struct rdsv3_page_frag		i_frag;
	uint64_t			i_ack_recv;	/* last ACK received */

	/* sending acks */
	unsigned long		i_ack_flags;
#ifdef KERNEL_HAS_ATOMIC64
	atomic64_t		i_ack_next;	/* next ACK to send */
#else
	kmutex_t		i_ack_lock;	/* protect i_ack_next */
	uint64_t		i_ack_next;	/* next ACK to send */
#endif
	struct rdsv3_header	*i_ack;
	ibt_send_wr_t		i_ack_wr;
	ibt_wr_ds_t		i_ack_sge;
	uint64_t		i_ack_dma;
	unsigned long		i_ack_queued;

	/*
	 * Flow control related information
	 *
	 * Our algorithm uses a pair of variables that we need to access
	 * atomically - one for the send credits, and one for the posted
	 * recv credits we need to transfer to the remote.
	 * Rather than protect them using a slow spinlock, we put both into
	 * a single atomic_t and update it using cmpxchg.
	 */
	atomic_t		i_credits;

	/* Protocol version specific information */
	unsigned int		i_flowctl:1;	/* enable/disable flow ctl */

	/* Batched completions */
	unsigned int		i_unsignaled_wrs;
	long			i_unsignaled_bytes;

	unsigned long		i_max_recv_alloc;
};
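/*
 * Illustrative sketch, an assumption rather than the driver's actual code:
 * both credit counts live in the single i_credits word, packed with the
 * IB_*_CREDITS macros below (send credits in the low 16 bits, posted recv
 * credits in the high 16 bits), and are updated lock-free with a
 * compare-and-swap loop.  Types are simplified for illustration;
 * atomic_cas_32() is the illumos primitive.
 *
 *	uint32_t oldval, newval;
 *
 *	do {
 *		oldval = ic->i_credits;
 *		if (IB_GET_SEND_CREDITS(oldval) < wanted)
 *			return (0);	(not enough send credits)
 *		newval = oldval - IB_SET_SEND_CREDITS(wanted);
 *	} while (atomic_cas_32((uint32_t *)&ic->i_credits, oldval, newval)
 *	    != oldval);
 */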
/* This assumes that atomic_t is at least 32 bits */
#define	IB_GET_SEND_CREDITS(v)	((v) & 0xffff)
#define	IB_GET_POST_CREDITS(v)	((v) >> 16)
#define	IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
#define	IB_SET_POST_CREDITS(v)	((v) << 16)

struct rdsv3_ib_ipaddr {
	struct list_node	list;
	uint32_be_t		ipaddr;
};

struct rdsv3_ib_device {
	struct list_node	list;
	struct list		ipaddr_list;
	struct list		conn_list;
	ib_device_t		*dev;
	struct ib_pd		*pd;
	struct kmem_cache	*ib_frag_slab;
	kmutex_t		spinlock;	/* protect the above */
	krwlock_t		rwlock;		/* protect ipaddr_list */
	unsigned int		fmr_max_remaps;
	unsigned int		max_fmrs;
	unsigned int		fmr_message_size;
	int			max_sge;
	unsigned int		max_wrs;
	unsigned int		max_initiator_depth;
	unsigned int		max_responder_resources;
	struct rdsv3_fmr_pool	*fmr_pool;
	struct rdsv3_inc_pool	*inc_pool;
	ibt_fmr_pool_hdl_t	fmr_pool_hdl;
	ibt_hca_attr_t		hca_attr;
	rdsv3_af_thr_t		*fmr_soft_cq;
	rdsv3_af_thr_t		*inc_soft_cq;
	ibt_hca_hdl_t		ibt_hca_hdl;
	rdsv3_af_grp_t		*aft_hcagp;
};

/* bits for i_ack_flags */
#define	IB_ACK_IN_FLIGHT	0
#define	IB_ACK_REQUESTED	1

#define	RDSV3_IB_SEND_OP	(1ULL << 63)

/* Magic WR_ID for ACKs */
#define	RDSV3_IB_ACK_WR_ID	(~(uint64_t)0)
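/*
 * Illustrative sketch, an assumption rather than the driver's actual
 * logic: a completion handler can demultiplex work requests using the two
 * values above.  The all-ones wr_id marks the dedicated ACK work request;
 * the top bit marks send-side work:
 *
 *	if (wc->wc_id == RDSV3_IB_ACK_WR_ID)
 *		rdsv3_ib_ack_send_complete(ic);
 *	else if (wc->wc_id & RDSV3_IB_SEND_OP)
 *		(completion for a data send or RDMA op)
 */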
struct rdsv3_ib_statistics {
	uint64_t	s_ib_connect_raced;
	uint64_t	s_ib_listen_closed_stale;
	uint64_t	s_ib_evt_handler_call;
	uint64_t	s_ib_tasklet_call;
	uint64_t	s_ib_tx_cq_event;
	uint64_t	s_ib_tx_ring_full;
	uint64_t	s_ib_tx_throttle;
	uint64_t	s_ib_tx_sg_mapping_failure;
	uint64_t	s_ib_tx_stalled;
	uint64_t	s_ib_tx_credit_updates;
	uint64_t	s_ib_rx_cq_event;
	uint64_t	s_ib_rx_ring_empty;
	uint64_t	s_ib_rx_refill_from_cq;
	uint64_t	s_ib_rx_refill_from_thread;
	uint64_t	s_ib_rx_alloc_limit;
	uint64_t	s_ib_rx_credit_updates;
	uint64_t	s_ib_ack_sent;
	uint64_t	s_ib_ack_send_failure;
	uint64_t	s_ib_ack_send_delayed;
	uint64_t	s_ib_ack_send_piggybacked;
	uint64_t	s_ib_ack_received;
	uint64_t	s_ib_rdma_mr_alloc;
	uint64_t	s_ib_rdma_mr_free;
	uint64_t	s_ib_rdma_mr_used;
	uint64_t	s_ib_rdma_mr_pool_flush;
	uint64_t	s_ib_rdma_mr_pool_wait;
	uint64_t	s_ib_rdma_mr_pool_depleted;
};

extern struct rdsv3_workqueue_struct_s *rds_ib_wq;

/* ib.c */
extern struct rdsv3_transport rdsv3_ib_transport;
extern void rdsv3_ib_add_one(ib_device_t *device);
extern void rdsv3_ib_remove_one(ib_device_t *device);
extern struct ib_client rdsv3_ib_client;

extern unsigned int fmr_pool_size;
extern unsigned int fmr_message_size;
extern unsigned int rdsv3_ib_retry_count;

extern kmutex_t ib_nodev_conns_lock;
extern struct list ib_nodev_conns;

/* ib_cm.c */
int rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp);
void rdsv3_ib_conn_free(void *arg);
int rdsv3_ib_conn_connect(struct rdsv3_connection *conn);
void rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn);
void rdsv3_conn_drop(struct rdsv3_connection *conn);
int rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
    struct rdma_cm_event *event);
int rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
void rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
    struct rdma_cm_event *event);
void rdsv3_ib_tasklet_fn(void *data);
void rdsv3_ib_snd_tasklet_fn(void *data);
void rdsv3_ib_refill_fn(void *data);

/* ib_rdma.c */
int rdsv3_ib_update_ipaddr(struct rdsv3_ib_device *rds_ibdev,
    uint32_be_t ipaddr);
void rdsv3_ib_add_conn(struct rdsv3_ib_device *rds_ibdev,
    struct rdsv3_connection *conn);
void rdsv3_ib_remove_conn(struct rdsv3_ib_device *rds_ibdev,
    struct rdsv3_connection *conn);
void __rdsv3_ib_destroy_conns(struct list *list, kmutex_t *list_lock);

static inline void rdsv3_ib_destroy_nodev_conns(void)
{
	__rdsv3_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
}

static inline void rdsv3_ib_destroy_conns(struct rdsv3_ib_device *rds_ibdev)
{
	__rdsv3_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
}

int rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *);
void rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *);
void rdsv3_ib_get_mr_info(struct rdsv3_ib_device *rds_ibdev,
    struct rds_info_rdma_connection *iinfo);
void *rdsv3_ib_get_mr(struct rds_iovec *args, unsigned long nents,
    struct rdsv3_sock *rs, uint32_t *key_ret);
void rdsv3_ib_sync_mr(void *trans_private, int dir);
void rdsv3_ib_free_mr(void *trans_private, int invalidate);
void rdsv3_ib_flush_mrs(void);
void rdsv3_ib_drain_mrlist_fn(void *data);

/* ib_recv.c */
int rdsv3_ib_recv_init(void);
void rdsv3_ib_recv_exit(void);
int rdsv3_ib_recv(struct rdsv3_connection *conn);
int rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill);
void rdsv3_ib_inc_free(struct rdsv3_incoming *inc);
int rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
    size_t size);
void rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc,
    struct rdsv3_ib_ack_state *state);
void rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic);
void rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic);
void rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic);
void rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic);
void rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic);
uint64_t rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic);
void rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
    int ack_required);
int rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *);
void rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *);
void rdsv3_ib_drain_inclist(void *);

/* ib_ring.c */
void rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr);
void rdsv3_ib_ring_resize(struct rdsv3_ib_work_ring *ring, uint32_t nr);
uint32_t rdsv3_ib_ring_alloc(struct rdsv3_ib_work_ring *ring, uint32_t val,
    uint32_t *pos);
void rdsv3_ib_ring_free(struct rdsv3_ib_work_ring *ring, uint32_t val);
void rdsv3_ib_ring_unalloc(struct rdsv3_ib_work_ring *ring, uint32_t val);
int rdsv3_ib_ring_empty(struct rdsv3_ib_work_ring *ring);
int rdsv3_ib_ring_low(struct rdsv3_ib_work_ring *ring);
uint32_t rdsv3_ib_ring_oldest(struct rdsv3_ib_work_ring *ring);
uint32_t rdsv3_ib_ring_completed(struct rdsv3_ib_work_ring *ring,
    uint32_t wr_id, uint32_t oldest);
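/*
 * Illustrative sketch of the expected ring calling convention, inferred
 * from the Linux RDS counterpart and not verified against ib_ring.c:
 * rdsv3_ib_ring_alloc() reserves up to "val" entries, returns how many it
 * actually reserved, and writes the index of the first one to *pos.
 *
 *	uint32_t pos;
 *	uint32_t got;
 *
 *	got = rdsv3_ib_ring_alloc(&ic->i_send_ring, nwrs, &pos);
 *	if (got == 0)
 *		(ring full: back off and retry later)
 *	...
 *	rdsv3_ib_ring_unalloc(&ic->i_send_ring, got);	(posting failed)
 *	rdsv3_ib_ring_free(&ic->i_send_ring, got);	(on completion)
 */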
/* ib_send.c */
void rdsv3_ib_xmit_complete(struct rdsv3_connection *conn);
int rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm,
    unsigned int hdr_off, unsigned int sg, unsigned int off);
void rdsv3_ib_send_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc);
void rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic);
void rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic);
int rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op);
void rdsv3_ib_send_add_credits(struct rdsv3_connection *conn,
    unsigned int credits);
void rdsv3_ib_advertise_credits(struct rdsv3_connection *conn,
    unsigned int posted);
int rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic, uint32_t wanted,
    uint32_t *adv_credits, int need_posted);

/* ib_stats.c */
extern struct rdsv3_ib_statistics *rdsv3_ib_stats;
#define	rdsv3_ib_stats_inc(member) \
	rdsv3_stats_add_which(rdsv3_ib_stats, member, 1)
unsigned int rdsv3_ib_stats_info_copy(struct rdsv3_info_iterator *iter,
    unsigned int avail);

/* ib_sysctl.c */
int rdsv3_ib_sysctl_init(void);
void rdsv3_ib_sysctl_exit(void);
extern unsigned long rdsv3_ib_sysctl_max_send_wr;
extern unsigned long rdsv3_ib_sysctl_max_recv_wr;
extern unsigned long rdsv3_ib_sysctl_max_unsig_wrs;
extern unsigned long rdsv3_ib_sysctl_max_unsig_bytes;
extern unsigned long rdsv3_ib_sysctl_max_recv_allocation;
extern unsigned int rdsv3_ib_sysctl_flow_control;

#endif	/* _RDSV3_IB_H */