1 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) 2 /* 3 * Copyright(c) 2018 - 2020 Intel Corporation. 4 * 5 */ 6 7 #include "hfi.h" 8 #include "qp.h" 9 #include "rc.h" 10 #include "verbs.h" 11 #include "tid_rdma.h" 12 #include "exp_rcv.h" 13 #include "trace.h" 14 15 /** 16 * DOC: TID RDMA READ protocol 17 * 18 * This is an end-to-end protocol at the hfi1 level between two nodes that 19 * improves performance by avoiding data copy on the requester side. It 20 * converts a qualified RDMA READ request into a TID RDMA READ request on 21 * the requester side and thereafter handles the request and response 22 * differently. To be qualified, the RDMA READ request should meet the 23 * following: 24 * -- The total data length should be greater than 256K; 25 * -- The total data length should be a multiple of 4K page size; 26 * -- Each local scatter-gather entry should be 4K page aligned; 27 * -- Each local scatter-gather entry should be a multiple of 4K page size; 28 */ 29 30 #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) 31 #define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33) 32 #define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34) 33 #define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35) 34 #define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37) 35 #define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38) 36 37 /* Maximum number of packets within a flow generation. */ 38 #define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT) 39 40 #define GENERATION_MASK 0xFFFFF 41 42 static u32 mask_generation(u32 a) 43 { 44 return a & GENERATION_MASK; 45 } 46 47 /* Reserved generation value to set to unused flows for kernel contexts */ 48 #define KERN_GENERATION_RESERVED mask_generation(U32_MAX) 49 50 /* 51 * J_KEY for kernel contexts when TID RDMA is used. 52 * See generate_jkey() in hfi.h for more information. 53 */ 54 #define TID_RDMA_JKEY 32 55 #define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE 56 #define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1) 57 58 /* Maximum number of segments in flight per QP request. 
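 * MAX_FLOWS below is rounded up to a power of two so that the per-request
 * flow circular buffer can be indexed with a simple mask (the
 * "& (MAX_FLOWS - 1)" arithmetic used throughout this file).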
*/ 59 #define TID_RDMA_MAX_READ_SEGS_PER_REQ 6 60 #define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4 61 #define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \ 62 TID_RDMA_MAX_WRITE_SEGS_PER_REQ) 63 #define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1) 64 65 #define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) 66 67 #define TID_RDMA_DESTQP_FLOW_SHIFT 11 68 #define TID_RDMA_DESTQP_FLOW_MASK 0x1f 69 70 #define TID_OPFN_QP_CTXT_MASK 0xff 71 #define TID_OPFN_QP_CTXT_SHIFT 56 72 #define TID_OPFN_QP_KDETH_MASK 0xff 73 #define TID_OPFN_QP_KDETH_SHIFT 48 74 #define TID_OPFN_MAX_LEN_MASK 0x7ff 75 #define TID_OPFN_MAX_LEN_SHIFT 37 76 #define TID_OPFN_TIMEOUT_MASK 0x1f 77 #define TID_OPFN_TIMEOUT_SHIFT 32 78 #define TID_OPFN_RESERVED_MASK 0x3f 79 #define TID_OPFN_RESERVED_SHIFT 26 80 #define TID_OPFN_URG_MASK 0x1 81 #define TID_OPFN_URG_SHIFT 25 82 #define TID_OPFN_VER_MASK 0x7 83 #define TID_OPFN_VER_SHIFT 22 84 #define TID_OPFN_JKEY_MASK 0x3f 85 #define TID_OPFN_JKEY_SHIFT 16 86 #define TID_OPFN_MAX_READ_MASK 0x3f 87 #define TID_OPFN_MAX_READ_SHIFT 10 88 #define TID_OPFN_MAX_WRITE_MASK 0x3f 89 #define TID_OPFN_MAX_WRITE_SHIFT 4 90 91 /* 92 * OPFN TID layout 93 * 94 * 63 47 31 15 95 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC 96 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210 97 * N - the context Number 98 * K - the Kdeth_qp 99 * M - Max_len 100 * T - Timeout 101 * D - reserveD 102 * V - version 103 * U - Urg capable 104 * J - Jkey 105 * R - max_Read 106 * W - max_Write 107 * C - Capcode 108 */ 109 110 static void tid_rdma_trigger_resume(struct work_struct *work); 111 static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); 112 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, 113 gfp_t gfp); 114 static void hfi1_init_trdma_req(struct rvt_qp *qp, 115 struct tid_rdma_request *req); 116 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); 117 static void hfi1_tid_timeout(struct timer_list *t); 118 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp); 119 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); 120 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp); 121 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp); 122 static void hfi1_tid_retry_timeout(struct timer_list *t); 123 static int make_tid_rdma_ack(struct rvt_qp *qp, 124 struct ib_other_headers *ohdr, 125 struct hfi1_pkt_state *ps); 126 static void hfi1_do_tid_send(struct rvt_qp *qp); 127 static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx); 128 static void tid_rdma_rcv_err(struct hfi1_packet *packet, 129 struct ib_other_headers *ohdr, 130 struct rvt_qp *qp, u32 psn, int diff, bool fecn); 131 static void update_r_next_psn_fecn(struct hfi1_packet *packet, 132 struct hfi1_qp_priv *priv, 133 struct hfi1_ctxtdata *rcd, 134 struct tid_rdma_flow *flow, 135 bool fecn); 136 137 static void validate_r_tid_ack(struct hfi1_qp_priv *priv) 138 { 139 if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) 140 priv->r_tid_ack = priv->r_tid_tail; 141 } 142 143 static void tid_rdma_schedule_ack(struct rvt_qp *qp) 144 { 145 struct hfi1_qp_priv *priv = qp->priv; 146 147 priv->s_flags |= RVT_S_ACK_PENDING; 148 hfi1_schedule_tid_send(qp); 149 } 150 151 static void tid_rdma_trigger_ack(struct rvt_qp *qp) 152 { 153 validate_r_tid_ack(qp->priv); 154 tid_rdma_schedule_ack(qp); 155 } 156 157 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) 158 { 159 return 160 (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) << 161 TID_OPFN_QP_CTXT_SHIFT) | 
162 ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) << 163 TID_OPFN_QP_KDETH_SHIFT) | 164 (((u64)((p->max_len >> PAGE_SHIFT) - 1) & 165 TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) | 166 (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) << 167 TID_OPFN_TIMEOUT_SHIFT) | 168 (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) | 169 (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) | 170 (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) << 171 TID_OPFN_MAX_READ_SHIFT) | 172 (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) << 173 TID_OPFN_MAX_WRITE_SHIFT); 174 } 175 176 static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data) 177 { 178 p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) & 179 TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT; 180 p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK; 181 p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) & 182 TID_OPFN_MAX_WRITE_MASK; 183 p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) & 184 TID_OPFN_MAX_READ_MASK; 185 p->qp = 186 ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK) 187 << 16) | 188 ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK)); 189 p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK; 190 p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK; 191 } 192 193 void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p) 194 { 195 struct hfi1_qp_priv *priv = qp->priv; 196 197 p->qp = (RVT_KDETH_QP_PREFIX << 16) | priv->rcd->ctxt; 198 p->max_len = TID_RDMA_MAX_SEGMENT_SIZE; 199 p->jkey = priv->rcd->jkey; 200 p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ; 201 p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ; 202 p->timeout = qp->timeout; 203 p->urg = is_urg_masked(priv->rcd); 204 } 205 206 bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data) 207 { 208 struct hfi1_qp_priv *priv = qp->priv; 209 210 *data = tid_rdma_opfn_encode(&priv->tid_rdma.local); 211 return true; 212 } 213 214 bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data) 215 { 216 struct hfi1_qp_priv *priv = qp->priv; 217 struct tid_rdma_params *remote, *old; 218 bool ret = true; 219 220 old = rcu_dereference_protected(priv->tid_rdma.remote, 221 lockdep_is_held(&priv->opfn.lock)); 222 data &= ~0xfULL; 223 /* 224 * If data passed in is zero, return true so as not to continue the 225 * negotiation process 226 */ 227 if (!data || !HFI1_CAP_IS_KSET(TID_RDMA)) 228 goto null; 229 /* 230 * If kzalloc fails, return false. This will result in: 231 * * at the requester a new OPFN request being generated to retry 232 * the negotiation 233 * * at the responder, 0 being returned to the requester so as to 234 * disable TID RDMA at both the requester and the responder 235 */ 236 remote = kzalloc(sizeof(*remote), GFP_ATOMIC); 237 if (!remote) { 238 ret = false; 239 goto null; 240 } 241 242 tid_rdma_opfn_decode(remote, data); 243 priv->tid_timer_timeout_jiffies = 244 usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) / 245 1000UL) << 3) * 7); 246 trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local); 247 trace_hfi1_opfn_param(qp, 1, remote); 248 rcu_assign_pointer(priv->tid_rdma.remote, remote); 249 /* 250 * A TID RDMA READ request's segment size is not equal to 251 * remote->max_len only when the request's data length is smaller 252 * than remote->max_len. In that case, there will be only one segment. 253 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg 254 * during retry, it will lead to req->cur_seg = 0, which is exactly 255 * what is expected. 
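 *
 * As a worked example (assuming the negotiated remote->max_len is
 * 256 KiB and the MTU is 4096 bytes), pkts_ps below works out to
 * 64 packets per segment and timeout_shift to ilog2(63) + 1 = 6.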
256 */ 257 priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len); 258 priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1; 259 goto free; 260 null: 261 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); 262 priv->timeout_shift = 0; 263 free: 264 if (old) 265 kfree_rcu(old, rcu_head); 266 return ret; 267 } 268 269 bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data) 270 { 271 bool ret; 272 273 ret = tid_rdma_conn_reply(qp, *data); 274 *data = 0; 275 /* 276 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate 277 * TID RDMA could not be enabled. This will result in TID RDMA being 278 * disabled at the requester too. 279 */ 280 if (ret) 281 (void)tid_rdma_conn_req(qp, data); 282 return ret; 283 } 284 285 void tid_rdma_conn_error(struct rvt_qp *qp) 286 { 287 struct hfi1_qp_priv *priv = qp->priv; 288 struct tid_rdma_params *old; 289 290 old = rcu_dereference_protected(priv->tid_rdma.remote, 291 lockdep_is_held(&priv->opfn.lock)); 292 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); 293 if (old) 294 kfree_rcu(old, rcu_head); 295 } 296 297 /* This is called at context initialization time */ 298 int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) 299 { 300 if (reinit) 301 return 0; 302 303 BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY); 304 BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); 305 rcd->jkey = TID_RDMA_JKEY; 306 hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); 307 return hfi1_alloc_ctxt_rcv_groups(rcd); 308 } 309 310 /** 311 * qp_to_rcd - determine the receive context used by a qp 312 * @qp - the qp 313 * 314 * This routine returns the receive context associated 315 * with a a qp's qpn. 316 * 317 * Returns the context. 318 */ 319 static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi, 320 struct rvt_qp *qp) 321 { 322 struct hfi1_ibdev *verbs_dev = container_of(rdi, 323 struct hfi1_ibdev, 324 rdi); 325 struct hfi1_devdata *dd = container_of(verbs_dev, 326 struct hfi1_devdata, 327 verbs_dev); 328 unsigned int ctxt; 329 330 if (qp->ibqp.qp_num == 0) 331 ctxt = 0; 332 else 333 ctxt = hfi1_get_qp_map(dd, qp->ibqp.qp_num >> dd->qos_shift); 334 return dd->rcd[ctxt]; 335 } 336 337 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, 338 struct ib_qp_init_attr *init_attr) 339 { 340 struct hfi1_qp_priv *qpriv = qp->priv; 341 int i, ret; 342 343 qpriv->rcd = qp_to_rcd(rdi, qp); 344 345 spin_lock_init(&qpriv->opfn.lock); 346 INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); 347 INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume); 348 qpriv->flow_state.psn = 0; 349 qpriv->flow_state.index = RXE_NUM_TID_FLOWS; 350 qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; 351 qpriv->flow_state.generation = KERN_GENERATION_RESERVED; 352 qpriv->s_state = TID_OP(WRITE_RESP); 353 qpriv->s_tid_cur = HFI1_QP_WQE_INVALID; 354 qpriv->s_tid_head = HFI1_QP_WQE_INVALID; 355 qpriv->s_tid_tail = HFI1_QP_WQE_INVALID; 356 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 357 qpriv->r_tid_head = HFI1_QP_WQE_INVALID; 358 qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; 359 qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; 360 qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; 361 atomic_set(&qpriv->n_requests, 0); 362 atomic_set(&qpriv->n_tid_requests, 0); 363 timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); 364 timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0); 365 INIT_LIST_HEAD(&qpriv->tid_wait); 366 367 if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { 368 struct hfi1_devdata *dd = qpriv->rcd->dd; 369 370 qpriv->pages = 
kzalloc_node(TID_RDMA_MAX_PAGES * 371 sizeof(*qpriv->pages), 372 GFP_KERNEL, dd->node); 373 if (!qpriv->pages) 374 return -ENOMEM; 375 for (i = 0; i < qp->s_size; i++) { 376 struct hfi1_swqe_priv *priv; 377 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); 378 379 priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, 380 dd->node); 381 if (!priv) 382 return -ENOMEM; 383 384 hfi1_init_trdma_req(qp, &priv->tid_req); 385 priv->tid_req.e.swqe = wqe; 386 wqe->priv = priv; 387 } 388 for (i = 0; i < rvt_max_atomic(rdi); i++) { 389 struct hfi1_ack_priv *priv; 390 391 priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, 392 dd->node); 393 if (!priv) 394 return -ENOMEM; 395 396 hfi1_init_trdma_req(qp, &priv->tid_req); 397 priv->tid_req.e.ack = &qp->s_ack_queue[i]; 398 399 ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, 400 GFP_KERNEL); 401 if (ret) { 402 kfree(priv); 403 return ret; 404 } 405 qp->s_ack_queue[i].priv = priv; 406 } 407 } 408 409 return 0; 410 } 411 412 void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) 413 { 414 struct hfi1_qp_priv *qpriv = qp->priv; 415 struct rvt_swqe *wqe; 416 u32 i; 417 418 if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { 419 for (i = 0; i < qp->s_size; i++) { 420 wqe = rvt_get_swqe_ptr(qp, i); 421 kfree(wqe->priv); 422 wqe->priv = NULL; 423 } 424 for (i = 0; i < rvt_max_atomic(rdi); i++) { 425 struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv; 426 427 if (priv) 428 hfi1_kern_exp_rcv_free_flows(&priv->tid_req); 429 kfree(priv); 430 qp->s_ack_queue[i].priv = NULL; 431 } 432 cancel_work_sync(&qpriv->opfn.opfn_work); 433 kfree(qpriv->pages); 434 qpriv->pages = NULL; 435 } 436 } 437 438 /* Flow and tid waiter functions */ 439 /** 440 * DOC: lock ordering 441 * 442 * There are two locks involved with the queuing 443 * routines: the qp s_lock and the exp_lock. 444 * 445 * Since the tid space allocation is called from 446 * the send engine, the qp s_lock is already held. 447 * 448 * The allocation routines will get the exp_lock. 449 * 450 * The first_qp() call is provided to allow the head of 451 * the rcd wait queue to be fetched under the exp_lock and 452 * followed by a drop of the exp_lock. 453 * 454 * Any qp in the wait list will have the qp reference count held 455 * to hold the qp in memory. 456 */ 457 458 /* 459 * return head of rcd wait list 460 * 461 * Must hold the exp_lock. 462 * 463 * Get a reference to the QP to hold the QP in memory. 464 * 465 * The caller must release the reference when the local 466 * is no longer being used. 467 */ 468 static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd, 469 struct tid_queue *queue) 470 __must_hold(&rcd->exp_lock) 471 { 472 struct hfi1_qp_priv *priv; 473 474 lockdep_assert_held(&rcd->exp_lock); 475 priv = list_first_entry_or_null(&queue->queue_head, 476 struct hfi1_qp_priv, 477 tid_wait); 478 if (!priv) 479 return NULL; 480 rvt_get_qp(priv->owner); 481 return priv->owner; 482 } 483 484 /** 485 * kernel_tid_waiters - determine rcd wait 486 * @rcd: the receive context 487 * @qp: the head of the qp being processed 488 * 489 * This routine will return false IFF 490 * the list is NULL or the head of the 491 * list is the indicated qp. 492 * 493 * Must hold the qp s_lock and the exp_lock. 494 * 495 * Return: 496 * false if either of the conditions below are satisfied: 497 * 1. The list is empty or 498 * 2. The indicated qp is at the head of the list and the 499 * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags. 500 * true is returned otherwise. 
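 *
 * In other words, a true return means this QP must wait its turn:
 * the callers queue the QP and return -EAGAIN rather than attempting
 * an allocation now.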
501 */ 502 static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd, 503 struct tid_queue *queue, struct rvt_qp *qp) 504 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) 505 { 506 struct rvt_qp *fqp; 507 bool ret = true; 508 509 lockdep_assert_held(&qp->s_lock); 510 lockdep_assert_held(&rcd->exp_lock); 511 fqp = first_qp(rcd, queue); 512 if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE))) 513 ret = false; 514 rvt_put_qp(fqp); 515 return ret; 516 } 517 518 /** 519 * dequeue_tid_waiter - dequeue the qp from the list 520 * @qp - the qp to remove the wait list 521 * 522 * This routine removes the indicated qp from the 523 * wait list if it is there. 524 * 525 * This should be done after the hardware flow and 526 * tid array resources have been allocated. 527 * 528 * Must hold the qp s_lock and the rcd exp_lock. 529 * 530 * It assumes the s_lock to protect the s_flags 531 * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag. 532 */ 533 static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd, 534 struct tid_queue *queue, struct rvt_qp *qp) 535 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) 536 { 537 struct hfi1_qp_priv *priv = qp->priv; 538 539 lockdep_assert_held(&qp->s_lock); 540 lockdep_assert_held(&rcd->exp_lock); 541 if (list_empty(&priv->tid_wait)) 542 return; 543 list_del_init(&priv->tid_wait); 544 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 545 queue->dequeue++; 546 rvt_put_qp(qp); 547 } 548 549 /** 550 * queue_qp_for_tid_wait - suspend QP on tid space 551 * @rcd: the receive context 552 * @qp: the qp 553 * 554 * The qp is inserted at the tail of the rcd 555 * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set. 556 * 557 * Must hold the qp s_lock and the exp_lock. 558 */ 559 static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd, 560 struct tid_queue *queue, struct rvt_qp *qp) 561 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) 562 { 563 struct hfi1_qp_priv *priv = qp->priv; 564 565 lockdep_assert_held(&qp->s_lock); 566 lockdep_assert_held(&rcd->exp_lock); 567 if (list_empty(&priv->tid_wait)) { 568 qp->s_flags |= HFI1_S_WAIT_TID_SPACE; 569 list_add_tail(&priv->tid_wait, &queue->queue_head); 570 priv->tid_enqueue = ++queue->enqueue; 571 rcd->dd->verbs_dev.n_tidwait++; 572 trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE); 573 rvt_get_qp(qp); 574 } 575 } 576 577 /** 578 * __trigger_tid_waiter - trigger tid waiter 579 * @qp: the qp 580 * 581 * This is a private entrance to schedule the qp 582 * assuming the caller is holding the qp->s_lock. 583 */ 584 static void __trigger_tid_waiter(struct rvt_qp *qp) 585 __must_hold(&qp->s_lock) 586 { 587 lockdep_assert_held(&qp->s_lock); 588 if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE)) 589 return; 590 trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE); 591 hfi1_schedule_send(qp); 592 } 593 594 /** 595 * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp 596 * @qp - the qp 597 * 598 * trigger a schedule or a waiting qp in a deadlock 599 * safe manner. The qp reference is held prior 600 * to this call via first_qp(). 601 * 602 * If the qp trigger was already scheduled (!rval) 603 * the the reference is dropped, otherwise the resume 604 * or the destroy cancel will dispatch the reference. 
605 */ 606 static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp) 607 { 608 struct hfi1_qp_priv *priv; 609 struct hfi1_ibport *ibp; 610 struct hfi1_pportdata *ppd; 611 struct hfi1_devdata *dd; 612 bool rval; 613 614 if (!qp) 615 return; 616 617 priv = qp->priv; 618 ibp = to_iport(qp->ibqp.device, qp->port_num); 619 ppd = ppd_from_ibp(ibp); 620 dd = dd_from_ibdev(qp->ibqp.device); 621 622 rval = queue_work_on(priv->s_sde ? 623 priv->s_sde->cpu : 624 cpumask_first(cpumask_of_node(dd->node)), 625 ppd->hfi1_wq, 626 &priv->tid_rdma.trigger_work); 627 if (!rval) 628 rvt_put_qp(qp); 629 } 630 631 /** 632 * tid_rdma_trigger_resume - field a trigger work request 633 * @work - the work item 634 * 635 * Complete the off qp trigger processing by directly 636 * calling the progress routine. 637 */ 638 static void tid_rdma_trigger_resume(struct work_struct *work) 639 { 640 struct tid_rdma_qp_params *tr; 641 struct hfi1_qp_priv *priv; 642 struct rvt_qp *qp; 643 644 tr = container_of(work, struct tid_rdma_qp_params, trigger_work); 645 priv = container_of(tr, struct hfi1_qp_priv, tid_rdma); 646 qp = priv->owner; 647 spin_lock_irq(&qp->s_lock); 648 if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) { 649 spin_unlock_irq(&qp->s_lock); 650 hfi1_do_send(priv->owner, true); 651 } else { 652 spin_unlock_irq(&qp->s_lock); 653 } 654 rvt_put_qp(qp); 655 } 656 657 /** 658 * tid_rdma_flush_wait - unwind any tid space wait 659 * 660 * This is called when resetting a qp to 661 * allow a destroy or reset to get rid 662 * of any tid space linkage and reference counts. 663 */ 664 static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue) 665 __must_hold(&qp->s_lock) 666 { 667 struct hfi1_qp_priv *priv; 668 669 if (!qp) 670 return; 671 lockdep_assert_held(&qp->s_lock); 672 priv = qp->priv; 673 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 674 spin_lock(&priv->rcd->exp_lock); 675 if (!list_empty(&priv->tid_wait)) { 676 list_del_init(&priv->tid_wait); 677 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 678 queue->dequeue++; 679 rvt_put_qp(qp); 680 } 681 spin_unlock(&priv->rcd->exp_lock); 682 } 683 684 void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) 685 __must_hold(&qp->s_lock) 686 { 687 struct hfi1_qp_priv *priv = qp->priv; 688 689 _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue); 690 _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue); 691 } 692 693 /* Flow functions */ 694 /** 695 * kern_reserve_flow - allocate a hardware flow 696 * @rcd - the context to use for allocation 697 * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to 698 * signify "don't care". 699 * 700 * Use a bit mask based allocation to reserve a hardware 701 * flow for use in receiving KDETH data packets. If a preferred flow is 702 * specified the function will attempt to reserve that flow again, if 703 * available. 704 * 705 * The exp_lock must be held. 
706 * 707 * Return: 708 * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1 709 * On failure: -EAGAIN 710 */ 711 static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last) 712 __must_hold(&rcd->exp_lock) 713 { 714 int nr; 715 716 /* Attempt to reserve the preferred flow index */ 717 if (last >= 0 && last < RXE_NUM_TID_FLOWS && 718 !test_and_set_bit(last, &rcd->flow_mask)) 719 return last; 720 721 nr = ffz(rcd->flow_mask); 722 BUILD_BUG_ON(RXE_NUM_TID_FLOWS >= 723 (sizeof(rcd->flow_mask) * BITS_PER_BYTE)); 724 if (nr > (RXE_NUM_TID_FLOWS - 1)) 725 return -EAGAIN; 726 set_bit(nr, &rcd->flow_mask); 727 return nr; 728 } 729 730 static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation, 731 u32 flow_idx) 732 { 733 u64 reg; 734 735 reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) | 736 RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK | 737 RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK | 738 RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK | 739 RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK | 740 RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK; 741 742 if (generation != KERN_GENERATION_RESERVED) 743 reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK; 744 745 write_uctxt_csr(rcd->dd, rcd->ctxt, 746 RCV_TID_FLOW_TABLE + 8 * flow_idx, reg); 747 } 748 749 static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) 750 __must_hold(&rcd->exp_lock) 751 { 752 u32 generation = rcd->flows[flow_idx].generation; 753 754 kern_set_hw_flow(rcd, generation, flow_idx); 755 return generation; 756 } 757 758 static u32 kern_flow_generation_next(u32 gen) 759 { 760 u32 generation = mask_generation(gen + 1); 761 762 if (generation == KERN_GENERATION_RESERVED) 763 generation = mask_generation(generation + 1); 764 return generation; 765 } 766 767 static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) 768 __must_hold(&rcd->exp_lock) 769 { 770 rcd->flows[flow_idx].generation = 771 kern_flow_generation_next(rcd->flows[flow_idx].generation); 772 kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx); 773 } 774 775 int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) 776 { 777 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 778 struct tid_flow_state *fs = &qpriv->flow_state; 779 struct rvt_qp *fqp; 780 unsigned long flags; 781 int ret = 0; 782 783 /* The QP already has an allocated flow */ 784 if (fs->index != RXE_NUM_TID_FLOWS) 785 return ret; 786 787 spin_lock_irqsave(&rcd->exp_lock, flags); 788 if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp)) 789 goto queue; 790 791 ret = kern_reserve_flow(rcd, fs->last_index); 792 if (ret < 0) 793 goto queue; 794 fs->index = ret; 795 fs->last_index = fs->index; 796 797 /* Generation received in a RESYNC overrides default flow generation */ 798 if (fs->generation != KERN_GENERATION_RESERVED) 799 rcd->flows[fs->index].generation = fs->generation; 800 fs->generation = kern_setup_hw_flow(rcd, fs->index); 801 fs->psn = 0; 802 dequeue_tid_waiter(rcd, &rcd->flow_queue, qp); 803 /* get head before dropping lock */ 804 fqp = first_qp(rcd, &rcd->flow_queue); 805 spin_unlock_irqrestore(&rcd->exp_lock, flags); 806 807 tid_rdma_schedule_tid_wakeup(fqp); 808 return 0; 809 queue: 810 queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp); 811 spin_unlock_irqrestore(&rcd->exp_lock, flags); 812 return -EAGAIN; 813 } 814 815 void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) 816 { 817 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 818 struct tid_flow_state *fs = 
        &qpriv->flow_state;
    struct rvt_qp *fqp;
    unsigned long flags;

    if (fs->index >= RXE_NUM_TID_FLOWS)
        return;
    spin_lock_irqsave(&rcd->exp_lock, flags);
    kern_clear_hw_flow(rcd, fs->index);
    clear_bit(fs->index, &rcd->flow_mask);
    fs->index = RXE_NUM_TID_FLOWS;
    fs->psn = 0;
    fs->generation = KERN_GENERATION_RESERVED;

    /* get head before dropping lock */
    fqp = first_qp(rcd, &rcd->flow_queue);
    spin_unlock_irqrestore(&rcd->exp_lock, flags);

    if (fqp == qp) {
        __trigger_tid_waiter(fqp);
        rvt_put_qp(fqp);
    } else {
        tid_rdma_schedule_tid_wakeup(fqp);
    }
}

void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
{
    int i;

    for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
        rcd->flows[i].generation = mask_generation(prandom_u32());
        kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
    }
}

/* TID allocation functions */
static u8 trdma_pset_order(struct tid_rdma_pageset *s)
{
    u8 count = s->count;

    return ilog2(count) + 1;
}

/**
 * tid_rdma_find_phys_blocks_4k - get groups based on mr info
 * @npages - number of pages
 * @pages - pointer to an array of page structs
 * @list - page set array to return
 *
 * This routine returns the number of groups associated with
 * the current sge information. This implementation is based
 * on the expected receive find_phys_blocks() adjusted to
 * use the MR information vs. the pfn.
 *
 * Return:
 * the number of RcvArray entries
 */
static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
                                        struct page **pages,
                                        u32 npages,
                                        struct tid_rdma_pageset *list)
{
    u32 pagecount, pageidx, setcount = 0, i;
    void *vaddr, *this_vaddr;

    if (!npages)
        return 0;

    /*
     * Look for sets of physically contiguous pages in the user buffer.
     * This will allow us to optimize Expected RcvArray entry usage by
     * using the bigger supported sizes.
     */
    vaddr = page_address(pages[0]);
    trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
    for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
        this_vaddr = i < npages ? page_address(pages[i]) : NULL;
        trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
                                 this_vaddr);
        /*
         * If the vaddrs are not sequential, pages are not physically
         * contiguous.
         */
        if (this_vaddr != (vaddr + PAGE_SIZE)) {
            /*
             * At this point we have to loop over the set of
             * physically contiguous pages and break them down
             * into sizes supported by the HW.
             * There are two main constraints:
             * 1. The max buffer size is MAX_EXPECTED_BUFFER.
             *    If the total set size is bigger than that,
             *    program only a MAX_EXPECTED_BUFFER chunk.
             * 2. The buffer size has to be a power of two. If
             *    it is not, round down to the closest power of
             *    2 and program that size.
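             *
             * For example, assuming MAX_EXPECTED_BUFFER is at least
             * 12 KiB, a run of three contiguous 4 KiB pages (not a
             * power of two) is emitted as a two-page set followed by
             * a one-page set.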
913 */ 914 while (pagecount) { 915 int maxpages = pagecount; 916 u32 bufsize = pagecount * PAGE_SIZE; 917 918 if (bufsize > MAX_EXPECTED_BUFFER) 919 maxpages = 920 MAX_EXPECTED_BUFFER >> 921 PAGE_SHIFT; 922 else if (!is_power_of_2(bufsize)) 923 maxpages = 924 rounddown_pow_of_two(bufsize) >> 925 PAGE_SHIFT; 926 927 list[setcount].idx = pageidx; 928 list[setcount].count = maxpages; 929 trace_hfi1_tid_pageset(flow->req->qp, setcount, 930 list[setcount].idx, 931 list[setcount].count); 932 pagecount -= maxpages; 933 pageidx += maxpages; 934 setcount++; 935 } 936 pageidx = i; 937 pagecount = 1; 938 vaddr = this_vaddr; 939 } else { 940 vaddr += PAGE_SIZE; 941 pagecount++; 942 } 943 } 944 /* insure we always return an even number of sets */ 945 if (setcount & 1) 946 list[setcount++].count = 0; 947 return setcount; 948 } 949 950 /** 951 * tid_flush_pages - dump out pages into pagesets 952 * @list - list of pagesets 953 * @idx - pointer to current page index 954 * @pages - number of pages to dump 955 * @sets - current number of pagesset 956 * 957 * This routine flushes out accumuated pages. 958 * 959 * To insure an even number of sets the 960 * code may add a filler. 961 * 962 * This can happen with when pages is not 963 * a power of 2 or pages is a power of 2 964 * less than the maximum pages. 965 * 966 * Return: 967 * The new number of sets 968 */ 969 970 static u32 tid_flush_pages(struct tid_rdma_pageset *list, 971 u32 *idx, u32 pages, u32 sets) 972 { 973 while (pages) { 974 u32 maxpages = pages; 975 976 if (maxpages > MAX_EXPECTED_PAGES) 977 maxpages = MAX_EXPECTED_PAGES; 978 else if (!is_power_of_2(maxpages)) 979 maxpages = rounddown_pow_of_two(maxpages); 980 list[sets].idx = *idx; 981 list[sets++].count = maxpages; 982 *idx += maxpages; 983 pages -= maxpages; 984 } 985 /* might need a filler */ 986 if (sets & 1) 987 list[sets++].count = 0; 988 return sets; 989 } 990 991 /** 992 * tid_rdma_find_phys_blocks_8k - get groups base on mr info 993 * @pages - pointer to an array of page structs 994 * @npages - number of pages 995 * @list - page set array to return 996 * 997 * This routine parses an array of pages to compute pagesets 998 * in an 8k compatible way. 999 * 1000 * pages are tested two at a time, i, i + 1 for contiguous 1001 * pages and i - 1 and i contiguous pages. 1002 * 1003 * If any condition is false, any accumlated pages are flushed and 1004 * v0,v1 are emitted as separate PAGE_SIZE pagesets 1005 * 1006 * Otherwise, the current 8k is totaled for a future flush. 1007 * 1008 * Return: 1009 * The number of pagesets 1010 * list set with the returned number of pagesets 1011 * 1012 */ 1013 static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow, 1014 struct page **pages, 1015 u32 npages, 1016 struct tid_rdma_pageset *list) 1017 { 1018 u32 idx, sets = 0, i; 1019 u32 pagecnt = 0; 1020 void *v0, *v1, *vm1; 1021 1022 if (!npages) 1023 return 0; 1024 for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) { 1025 /* get a new v0 */ 1026 v0 = page_address(pages[i]); 1027 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0); 1028 v1 = i + 1 < npages ? 
1029 page_address(pages[i + 1]) : NULL; 1030 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1); 1031 /* compare i, i + 1 vaddr */ 1032 if (v1 != (v0 + PAGE_SIZE)) { 1033 /* flush out pages */ 1034 sets = tid_flush_pages(list, &idx, pagecnt, sets); 1035 /* output v0,v1 as two pagesets */ 1036 list[sets].idx = idx++; 1037 list[sets++].count = 1; 1038 if (v1) { 1039 list[sets].count = 1; 1040 list[sets++].idx = idx++; 1041 } else { 1042 list[sets++].count = 0; 1043 } 1044 vm1 = NULL; 1045 pagecnt = 0; 1046 continue; 1047 } 1048 /* i,i+1 consecutive, look at i-1,i */ 1049 if (vm1 && v0 != (vm1 + PAGE_SIZE)) { 1050 /* flush out pages */ 1051 sets = tid_flush_pages(list, &idx, pagecnt, sets); 1052 pagecnt = 0; 1053 } 1054 /* pages will always be a multiple of 8k */ 1055 pagecnt += 2; 1056 /* save i-1 */ 1057 vm1 = v1; 1058 /* move to next pair */ 1059 } 1060 /* dump residual pages at end */ 1061 sets = tid_flush_pages(list, &idx, npages - idx, sets); 1062 /* by design cannot be odd sets */ 1063 WARN_ON(sets & 1); 1064 return sets; 1065 } 1066 1067 /** 1068 * Find pages for one segment of a sge array represented by @ss. The function 1069 * does not check the sge, the sge must have been checked for alignment with a 1070 * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of 1071 * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge 1072 * copy maintained in @ss->sge, the original sge is not modified. 1073 * 1074 * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not 1075 * releasing the MR reference count at the same time. Otherwise, we'll "leak" 1076 * references to the MR. This difference requires that we keep track of progress 1077 * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request 1078 * structure. 1079 */ 1080 static u32 kern_find_pages(struct tid_rdma_flow *flow, 1081 struct page **pages, 1082 struct rvt_sge_state *ss, bool *last) 1083 { 1084 struct tid_rdma_request *req = flow->req; 1085 struct rvt_sge *sge = &ss->sge; 1086 u32 length = flow->req->seg_len; 1087 u32 len = PAGE_SIZE; 1088 u32 i = 0; 1089 1090 while (length && req->isge < ss->num_sge) { 1091 pages[i++] = virt_to_page(sge->vaddr); 1092 1093 sge->vaddr += len; 1094 sge->length -= len; 1095 sge->sge_length -= len; 1096 if (!sge->sge_length) { 1097 if (++req->isge < ss->num_sge) 1098 *sge = ss->sg_list[req->isge - 1]; 1099 } else if (sge->length == 0 && sge->mr->lkey) { 1100 if (++sge->n >= RVT_SEGSZ) { 1101 ++sge->m; 1102 sge->n = 0; 1103 } 1104 sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; 1105 sge->length = sge->mr->map[sge->m]->segs[sge->n].length; 1106 } 1107 length -= len; 1108 } 1109 1110 flow->length = flow->req->seg_len - length; 1111 *last = req->isge == ss->num_sge ? 
false : true; 1112 return i; 1113 } 1114 1115 static void dma_unmap_flow(struct tid_rdma_flow *flow) 1116 { 1117 struct hfi1_devdata *dd; 1118 int i; 1119 struct tid_rdma_pageset *pset; 1120 1121 dd = flow->req->rcd->dd; 1122 for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; 1123 i++, pset++) { 1124 if (pset->count && pset->addr) { 1125 dma_unmap_page(&dd->pcidev->dev, 1126 pset->addr, 1127 PAGE_SIZE * pset->count, 1128 DMA_FROM_DEVICE); 1129 pset->mapped = 0; 1130 } 1131 } 1132 } 1133 1134 static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages) 1135 { 1136 int i; 1137 struct hfi1_devdata *dd = flow->req->rcd->dd; 1138 struct tid_rdma_pageset *pset; 1139 1140 for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; 1141 i++, pset++) { 1142 if (pset->count) { 1143 pset->addr = dma_map_page(&dd->pcidev->dev, 1144 pages[pset->idx], 1145 0, 1146 PAGE_SIZE * pset->count, 1147 DMA_FROM_DEVICE); 1148 1149 if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) { 1150 dma_unmap_flow(flow); 1151 return -ENOMEM; 1152 } 1153 pset->mapped = 1; 1154 } 1155 } 1156 return 0; 1157 } 1158 1159 static inline bool dma_mapped(struct tid_rdma_flow *flow) 1160 { 1161 return !!flow->pagesets[0].mapped; 1162 } 1163 1164 /* 1165 * Get pages pointers and identify contiguous physical memory chunks for a 1166 * segment. All segments are of length flow->req->seg_len. 1167 */ 1168 static int kern_get_phys_blocks(struct tid_rdma_flow *flow, 1169 struct page **pages, 1170 struct rvt_sge_state *ss, bool *last) 1171 { 1172 u8 npages; 1173 1174 /* Reuse previously computed pagesets, if any */ 1175 if (flow->npagesets) { 1176 trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, 1177 flow); 1178 if (!dma_mapped(flow)) 1179 return dma_map_flow(flow, pages); 1180 return 0; 1181 } 1182 1183 npages = kern_find_pages(flow, pages, ss, last); 1184 1185 if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096)) 1186 flow->npagesets = 1187 tid_rdma_find_phys_blocks_4k(flow, pages, npages, 1188 flow->pagesets); 1189 else 1190 flow->npagesets = 1191 tid_rdma_find_phys_blocks_8k(flow, pages, npages, 1192 flow->pagesets); 1193 1194 return dma_map_flow(flow, pages); 1195 } 1196 1197 static inline void kern_add_tid_node(struct tid_rdma_flow *flow, 1198 struct hfi1_ctxtdata *rcd, char *s, 1199 struct tid_group *grp, u8 cnt) 1200 { 1201 struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++]; 1202 1203 WARN_ON_ONCE(flow->tnode_cnt >= 1204 (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT)); 1205 if (WARN_ON_ONCE(cnt & 1)) 1206 dd_dev_err(rcd->dd, 1207 "unexpected odd allocation cnt %u map 0x%x used %u", 1208 cnt, grp->map, grp->used); 1209 1210 node->grp = grp; 1211 node->map = grp->map; 1212 node->cnt = cnt; 1213 trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1, 1214 grp->base, grp->map, grp->used, cnt); 1215 } 1216 1217 /* 1218 * Try to allocate pageset_count TID's from TID groups for a context 1219 * 1220 * This function allocates TID's without moving groups between lists or 1221 * modifying grp->map. This is done as follows, being cogizant of the lists 1222 * between which the TID groups will move: 1223 * 1. First allocate complete groups of 8 TID's since this is more efficient, 1224 * these groups will move from group->full without affecting used 1225 * 2. If more TID's are needed allocate from used (will move from used->full or 1226 * stay in used) 1227 * 3. 
If we still don't have the required number of TID's go back and look again 1228 * at a complete group (will move from group->used) 1229 */ 1230 static int kern_alloc_tids(struct tid_rdma_flow *flow) 1231 { 1232 struct hfi1_ctxtdata *rcd = flow->req->rcd; 1233 struct hfi1_devdata *dd = rcd->dd; 1234 u32 ngroups, pageidx = 0; 1235 struct tid_group *group = NULL, *used; 1236 u8 use; 1237 1238 flow->tnode_cnt = 0; 1239 ngroups = flow->npagesets / dd->rcv_entries.group_size; 1240 if (!ngroups) 1241 goto used_list; 1242 1243 /* First look at complete groups */ 1244 list_for_each_entry(group, &rcd->tid_group_list.list, list) { 1245 kern_add_tid_node(flow, rcd, "complete groups", group, 1246 group->size); 1247 1248 pageidx += group->size; 1249 if (!--ngroups) 1250 break; 1251 } 1252 1253 if (pageidx >= flow->npagesets) 1254 goto ok; 1255 1256 used_list: 1257 /* Now look at partially used groups */ 1258 list_for_each_entry(used, &rcd->tid_used_list.list, list) { 1259 use = min_t(u32, flow->npagesets - pageidx, 1260 used->size - used->used); 1261 kern_add_tid_node(flow, rcd, "used groups", used, use); 1262 1263 pageidx += use; 1264 if (pageidx >= flow->npagesets) 1265 goto ok; 1266 } 1267 1268 /* 1269 * Look again at a complete group, continuing from where we left. 1270 * However, if we are at the head, we have reached the end of the 1271 * complete groups list from the first loop above 1272 */ 1273 if (group && &group->list == &rcd->tid_group_list.list) 1274 goto bail_eagain; 1275 group = list_prepare_entry(group, &rcd->tid_group_list.list, 1276 list); 1277 if (list_is_last(&group->list, &rcd->tid_group_list.list)) 1278 goto bail_eagain; 1279 group = list_next_entry(group, list); 1280 use = min_t(u32, flow->npagesets - pageidx, group->size); 1281 kern_add_tid_node(flow, rcd, "complete continue", group, use); 1282 pageidx += use; 1283 if (pageidx >= flow->npagesets) 1284 goto ok; 1285 bail_eagain: 1286 trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ", 1287 (u64)flow->npagesets); 1288 return -EAGAIN; 1289 ok: 1290 return 0; 1291 } 1292 1293 static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num, 1294 u32 *pset_idx) 1295 { 1296 struct hfi1_ctxtdata *rcd = flow->req->rcd; 1297 struct hfi1_devdata *dd = rcd->dd; 1298 struct kern_tid_node *node = &flow->tnode[grp_num]; 1299 struct tid_group *grp = node->grp; 1300 struct tid_rdma_pageset *pset; 1301 u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT; 1302 u32 rcventry, npages = 0, pair = 0, tidctrl; 1303 u8 i, cnt = 0; 1304 1305 for (i = 0; i < grp->size; i++) { 1306 rcventry = grp->base + i; 1307 1308 if (node->map & BIT(i) || cnt >= node->cnt) { 1309 rcv_array_wc_fill(dd, rcventry); 1310 continue; 1311 } 1312 pset = &flow->pagesets[(*pset_idx)++]; 1313 if (pset->count) { 1314 hfi1_put_tid(dd, rcventry, PT_EXPECTED, 1315 pset->addr, trdma_pset_order(pset)); 1316 } else { 1317 hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); 1318 } 1319 npages += pset->count; 1320 1321 rcventry -= rcd->expected_base; 1322 tidctrl = pair ? 0x3 : rcventry & 0x1 ? 
0x2 : 0x1; 1323 /* 1324 * A single TID entry will be used to use a rcvarr pair (with 1325 * tidctrl 0x3), if ALL these are true (a) the bit pos is even 1326 * (b) the group map shows current and the next bits as free 1327 * indicating two consecutive rcvarry entries are available (c) 1328 * we actually need 2 more entries 1329 */ 1330 pair = !(i & 0x1) && !((node->map >> i) & 0x3) && 1331 node->cnt >= cnt + 2; 1332 if (!pair) { 1333 if (!pset->count) 1334 tidctrl = 0x1; 1335 flow->tid_entry[flow->tidcnt++] = 1336 EXP_TID_SET(IDX, rcventry >> 1) | 1337 EXP_TID_SET(CTRL, tidctrl) | 1338 EXP_TID_SET(LEN, npages); 1339 trace_hfi1_tid_entry_alloc(/* entry */ 1340 flow->req->qp, flow->tidcnt - 1, 1341 flow->tid_entry[flow->tidcnt - 1]); 1342 1343 /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */ 1344 flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg); 1345 npages = 0; 1346 } 1347 1348 if (grp->used == grp->size - 1) 1349 tid_group_move(grp, &rcd->tid_used_list, 1350 &rcd->tid_full_list); 1351 else if (!grp->used) 1352 tid_group_move(grp, &rcd->tid_group_list, 1353 &rcd->tid_used_list); 1354 1355 grp->used++; 1356 grp->map |= BIT(i); 1357 cnt++; 1358 } 1359 } 1360 1361 static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num) 1362 { 1363 struct hfi1_ctxtdata *rcd = flow->req->rcd; 1364 struct hfi1_devdata *dd = rcd->dd; 1365 struct kern_tid_node *node = &flow->tnode[grp_num]; 1366 struct tid_group *grp = node->grp; 1367 u32 rcventry; 1368 u8 i, cnt = 0; 1369 1370 for (i = 0; i < grp->size; i++) { 1371 rcventry = grp->base + i; 1372 1373 if (node->map & BIT(i) || cnt >= node->cnt) { 1374 rcv_array_wc_fill(dd, rcventry); 1375 continue; 1376 } 1377 1378 hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); 1379 1380 grp->used--; 1381 grp->map &= ~BIT(i); 1382 cnt++; 1383 1384 if (grp->used == grp->size - 1) 1385 tid_group_move(grp, &rcd->tid_full_list, 1386 &rcd->tid_used_list); 1387 else if (!grp->used) 1388 tid_group_move(grp, &rcd->tid_used_list, 1389 &rcd->tid_group_list); 1390 } 1391 if (WARN_ON_ONCE(cnt & 1)) { 1392 struct hfi1_ctxtdata *rcd = flow->req->rcd; 1393 struct hfi1_devdata *dd = rcd->dd; 1394 1395 dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u", 1396 cnt, grp->map, grp->used); 1397 } 1398 } 1399 1400 static void kern_program_rcvarray(struct tid_rdma_flow *flow) 1401 { 1402 u32 pset_idx = 0; 1403 int i; 1404 1405 flow->npkts = 0; 1406 flow->tidcnt = 0; 1407 for (i = 0; i < flow->tnode_cnt; i++) 1408 kern_program_rcv_group(flow, i, &pset_idx); 1409 trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow); 1410 } 1411 1412 /** 1413 * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a 1414 * TID RDMA request 1415 * 1416 * @req: TID RDMA request for which the segment/flow is being set up 1417 * @ss: sge state, maintains state across successive segments of a sge 1418 * @last: set to true after the last sge segment has been processed 1419 * 1420 * This function 1421 * (1) finds a free flow entry in the flow circular buffer 1422 * (2) finds pages and continuous physical chunks constituing one segment 1423 * of an sge 1424 * (3) allocates TID group entries for those chunks 1425 * (4) programs rcvarray entries in the hardware corresponding to those 1426 * TID's 1427 * (5) computes a tidarray with formatted TID entries which can be sent 1428 * to the sender 1429 * (6) Reserves and programs HW flows. 1430 * (7) It also manages queing the QP when TID/flow resources are not 1431 * available. 
1432 * 1433 * @req points to struct tid_rdma_request of which the segments are a part. The 1434 * function uses qp, rcd and seg_len members of @req. In the absence of errors, 1435 * req->flow_idx is the index of the flow which has been prepared in this 1436 * invocation of function call. With flow = &req->flows[req->flow_idx], 1437 * flow->tid_entry contains the TID array which the sender can use for TID RDMA 1438 * sends and flow->npkts contains number of packets required to send the 1439 * segment. 1440 * 1441 * hfi1_check_sge_align should be called prior to calling this function and if 1442 * it signals error TID RDMA cannot be used for this sge and this function 1443 * should not be called. 1444 * 1445 * For the queuing, caller must hold the flow->req->qp s_lock from the send 1446 * engine and the function will procure the exp_lock. 1447 * 1448 * Return: 1449 * The function returns -EAGAIN if sufficient number of TID/flow resources to 1450 * map the segment could not be allocated. In this case the function should be 1451 * called again with previous arguments to retry the TID allocation. There are 1452 * no other error returns. The function returns 0 on success. 1453 */ 1454 int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, 1455 struct rvt_sge_state *ss, bool *last) 1456 __must_hold(&req->qp->s_lock) 1457 { 1458 struct tid_rdma_flow *flow = &req->flows[req->setup_head]; 1459 struct hfi1_ctxtdata *rcd = req->rcd; 1460 struct hfi1_qp_priv *qpriv = req->qp->priv; 1461 unsigned long flags; 1462 struct rvt_qp *fqp; 1463 u16 clear_tail = req->clear_tail; 1464 1465 lockdep_assert_held(&req->qp->s_lock); 1466 /* 1467 * We return error if either (a) we don't have space in the flow 1468 * circular buffer, or (b) we already have max entries in the buffer. 1469 * Max entries depend on the type of request we are processing and the 1470 * negotiated TID RDMA parameters. 1471 */ 1472 if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) || 1473 CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >= 1474 req->n_flows) 1475 return -EINVAL; 1476 1477 /* 1478 * Get pages, identify contiguous physical memory chunks for the segment 1479 * If we can not determine a DMA address mapping we will treat it just 1480 * like if we ran out of space above. 1481 */ 1482 if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) { 1483 hfi1_wait_kmem(flow->req->qp); 1484 return -ENOMEM; 1485 } 1486 1487 spin_lock_irqsave(&rcd->exp_lock, flags); 1488 if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp)) 1489 goto queue; 1490 1491 /* 1492 * At this point we know the number of pagesets and hence the number of 1493 * TID's to map the segment. Allocate the TID's from the TID groups. If 1494 * we cannot allocate the required number we exit and try again later 1495 */ 1496 if (kern_alloc_tids(flow)) 1497 goto queue; 1498 /* 1499 * Finally program the TID entries with the pagesets, compute the 1500 * tidarray and enable the HW flow 1501 */ 1502 kern_program_rcvarray(flow); 1503 1504 /* 1505 * Setup the flow state with relevant information. 1506 * This information is used for tracking the sequence of data packets 1507 * for the segment. 1508 * The flow is setup here as this is the most accurate time and place 1509 * to do so. Doing at a later time runs the risk of the flow data in 1510 * qpriv getting out of sync. 
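 *
 * The generation and starting PSN programmed below come from the
 * per-QP hardware flow reserved in hfi1_kern_setup_hw_flow(); each
 * segment consumes flow->npkts PSNs from that flow's space.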
1511 */ 1512 memset(&flow->flow_state, 0x0, sizeof(flow->flow_state)); 1513 flow->idx = qpriv->flow_state.index; 1514 flow->flow_state.generation = qpriv->flow_state.generation; 1515 flow->flow_state.spsn = qpriv->flow_state.psn; 1516 flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1; 1517 flow->flow_state.r_next_psn = 1518 full_flow_psn(flow, flow->flow_state.spsn); 1519 qpriv->flow_state.psn += flow->npkts; 1520 1521 dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp); 1522 /* get head before dropping lock */ 1523 fqp = first_qp(rcd, &rcd->rarr_queue); 1524 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1525 tid_rdma_schedule_tid_wakeup(fqp); 1526 1527 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); 1528 return 0; 1529 queue: 1530 queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp); 1531 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1532 return -EAGAIN; 1533 } 1534 1535 static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow) 1536 { 1537 flow->npagesets = 0; 1538 } 1539 1540 /* 1541 * This function is called after one segment has been successfully sent to 1542 * release the flow and TID HW/SW resources for that segment. The segments for a 1543 * TID RDMA request are setup and cleared in FIFO order which is managed using a 1544 * circular buffer. 1545 */ 1546 int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req) 1547 __must_hold(&req->qp->s_lock) 1548 { 1549 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 1550 struct hfi1_ctxtdata *rcd = req->rcd; 1551 unsigned long flags; 1552 int i; 1553 struct rvt_qp *fqp; 1554 1555 lockdep_assert_held(&req->qp->s_lock); 1556 /* Exit if we have nothing in the flow circular buffer */ 1557 if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) 1558 return -EINVAL; 1559 1560 spin_lock_irqsave(&rcd->exp_lock, flags); 1561 1562 for (i = 0; i < flow->tnode_cnt; i++) 1563 kern_unprogram_rcv_group(flow, i); 1564 /* To prevent double unprogramming */ 1565 flow->tnode_cnt = 0; 1566 /* get head before dropping lock */ 1567 fqp = first_qp(rcd, &rcd->rarr_queue); 1568 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1569 1570 dma_unmap_flow(flow); 1571 1572 hfi1_tid_rdma_reset_flow(flow); 1573 req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1); 1574 1575 if (fqp == req->qp) { 1576 __trigger_tid_waiter(fqp); 1577 rvt_put_qp(fqp); 1578 } else { 1579 tid_rdma_schedule_tid_wakeup(fqp); 1580 } 1581 1582 return 0; 1583 } 1584 1585 /* 1586 * This function is called to release all the tid entries for 1587 * a request. 1588 */ 1589 void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) 1590 __must_hold(&req->qp->s_lock) 1591 { 1592 /* Use memory barrier for proper ordering */ 1593 while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) { 1594 if (hfi1_kern_exp_rcv_clear(req)) 1595 break; 1596 } 1597 } 1598 1599 /** 1600 * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information 1601 * @req - the tid rdma request to be cleaned 1602 */ 1603 static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) 1604 { 1605 kfree(req->flows); 1606 req->flows = NULL; 1607 } 1608 1609 /** 1610 * __trdma_clean_swqe - clean up for large sized QPs 1611 * @qp: the queue patch 1612 * @wqe: the send wqe 1613 */ 1614 void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) 1615 { 1616 struct hfi1_swqe_priv *p = wqe->priv; 1617 1618 hfi1_kern_exp_rcv_free_flows(&p->tid_req); 1619 } 1620 1621 /* 1622 * This can be called at QP create time or in the data path. 
1623 */ 1624 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, 1625 gfp_t gfp) 1626 { 1627 struct tid_rdma_flow *flows; 1628 int i; 1629 1630 if (likely(req->flows)) 1631 return 0; 1632 flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp, 1633 req->rcd->numa_id); 1634 if (!flows) 1635 return -ENOMEM; 1636 /* mini init */ 1637 for (i = 0; i < MAX_FLOWS; i++) { 1638 flows[i].req = req; 1639 flows[i].npagesets = 0; 1640 flows[i].pagesets[0].mapped = 0; 1641 flows[i].resync_npkts = 0; 1642 } 1643 req->flows = flows; 1644 return 0; 1645 } 1646 1647 static void hfi1_init_trdma_req(struct rvt_qp *qp, 1648 struct tid_rdma_request *req) 1649 { 1650 struct hfi1_qp_priv *qpriv = qp->priv; 1651 1652 /* 1653 * Initialize various TID RDMA request variables. 1654 * These variables are "static", which is why they 1655 * can be pre-initialized here before the WRs has 1656 * even been submitted. 1657 * However, non-NULL values for these variables do not 1658 * imply that this WQE has been enabled for TID RDMA. 1659 * Drivers should check the WQE's opcode to determine 1660 * if a request is a TID RDMA one or not. 1661 */ 1662 req->qp = qp; 1663 req->rcd = qpriv->rcd; 1664 } 1665 1666 u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, 1667 void *context, int vl, int mode, u64 data) 1668 { 1669 struct hfi1_devdata *dd = context; 1670 1671 return dd->verbs_dev.n_tidwait; 1672 } 1673 1674 static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req, 1675 u32 psn, u16 *fidx) 1676 { 1677 u16 head, tail; 1678 struct tid_rdma_flow *flow; 1679 1680 head = req->setup_head; 1681 tail = req->clear_tail; 1682 for ( ; CIRC_CNT(head, tail, MAX_FLOWS); 1683 tail = CIRC_NEXT(tail, MAX_FLOWS)) { 1684 flow = &req->flows[tail]; 1685 if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 && 1686 cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) { 1687 if (fidx) 1688 *fidx = tail; 1689 return flow; 1690 } 1691 } 1692 return NULL; 1693 } 1694 1695 /* TID RDMA READ functions */ 1696 u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, 1697 struct ib_other_headers *ohdr, u32 *bth1, 1698 u32 *bth2, u32 *len) 1699 { 1700 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 1701 struct tid_rdma_flow *flow = &req->flows[req->flow_idx]; 1702 struct rvt_qp *qp = req->qp; 1703 struct hfi1_qp_priv *qpriv = qp->priv; 1704 struct hfi1_swqe_priv *wpriv = wqe->priv; 1705 struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req; 1706 struct tid_rdma_params *remote; 1707 u32 req_len = 0; 1708 void *req_addr = NULL; 1709 1710 /* This is the IB psn used to send the request */ 1711 *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); 1712 trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow); 1713 1714 /* TID Entries for TID RDMA READ payload */ 1715 req_addr = &flow->tid_entry[flow->tid_idx]; 1716 req_len = sizeof(*flow->tid_entry) * 1717 (flow->tidcnt - flow->tid_idx); 1718 1719 memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req)); 1720 wpriv->ss.sge.vaddr = req_addr; 1721 wpriv->ss.sge.sge_length = req_len; 1722 wpriv->ss.sge.length = wpriv->ss.sge.sge_length; 1723 /* 1724 * We can safely zero these out. Since the first SGE covers the 1725 * entire packet, nothing else should even look at the MR. 
1726 */ 1727 wpriv->ss.sge.mr = NULL; 1728 wpriv->ss.sge.m = 0; 1729 wpriv->ss.sge.n = 0; 1730 1731 wpriv->ss.sg_list = NULL; 1732 wpriv->ss.total_len = wpriv->ss.sge.sge_length; 1733 wpriv->ss.num_sge = 1; 1734 1735 /* Construct the TID RDMA READ REQ packet header */ 1736 rcu_read_lock(); 1737 remote = rcu_dereference(qpriv->tid_rdma.remote); 1738 1739 KDETH_RESET(rreq->kdeth0, KVER, 0x1); 1740 KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey); 1741 rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr + 1742 req->cur_seg * req->seg_len + flow->sent); 1743 rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey); 1744 rreq->reth.length = cpu_to_be32(*len); 1745 rreq->tid_flow_psn = 1746 cpu_to_be32((flow->flow_state.generation << 1747 HFI1_KDETH_BTH_SEQ_SHIFT) | 1748 ((flow->flow_state.spsn + flow->pkt) & 1749 HFI1_KDETH_BTH_SEQ_MASK)); 1750 rreq->tid_flow_qp = 1751 cpu_to_be32(qpriv->tid_rdma.local.qp | 1752 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 1753 TID_RDMA_DESTQP_FLOW_SHIFT) | 1754 qpriv->rcd->ctxt); 1755 rreq->verbs_qp = cpu_to_be32(qp->remote_qpn); 1756 *bth1 &= ~RVT_QPN_MASK; 1757 *bth1 |= remote->qp; 1758 *bth2 |= IB_BTH_REQ_ACK; 1759 rcu_read_unlock(); 1760 1761 /* We are done with this segment */ 1762 flow->sent += *len; 1763 req->cur_seg++; 1764 qp->s_state = TID_OP(READ_REQ); 1765 req->ack_pending++; 1766 req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1); 1767 qpriv->pending_tid_r_segs++; 1768 qp->s_num_rd_atomic++; 1769 1770 /* Set the TID RDMA READ request payload size */ 1771 *len = req_len; 1772 1773 return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32); 1774 } 1775 1776 /* 1777 * @len: contains the data length to read upon entry and the read request 1778 * payload length upon exit. 1779 */ 1780 u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 1781 struct ib_other_headers *ohdr, u32 *bth1, 1782 u32 *bth2, u32 *len) 1783 __must_hold(&qp->s_lock) 1784 { 1785 struct hfi1_qp_priv *qpriv = qp->priv; 1786 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 1787 struct tid_rdma_flow *flow = NULL; 1788 u32 hdwords = 0; 1789 bool last; 1790 bool retry = true; 1791 u32 npkts = rvt_div_round_up_mtu(qp, *len); 1792 1793 trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn, 1794 wqe->lpsn, req); 1795 /* 1796 * Check sync conditions. Make sure that there are no pending 1797 * segments before freeing the flow. 1798 */ 1799 sync_check: 1800 if (req->state == TID_REQUEST_SYNC) { 1801 if (qpriv->pending_tid_r_segs) 1802 goto done; 1803 1804 hfi1_kern_clear_hw_flow(req->rcd, qp); 1805 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 1806 req->state = TID_REQUEST_ACTIVE; 1807 } 1808 1809 /* 1810 * If the request for this segment is resent, the tid resources should 1811 * have been allocated before. In this case, req->flow_idx should 1812 * fall behind req->setup_head. 1813 */ 1814 if (req->flow_idx == req->setup_head) { 1815 retry = false; 1816 if (req->state == TID_REQUEST_RESEND) { 1817 /* 1818 * This is the first new segment for a request whose 1819 * earlier segments have been re-sent. We need to 1820 * set up the sge pointer correctly. 1821 */ 1822 restart_sge(&qp->s_sge, wqe, req->s_next_psn, 1823 qp->pmtu); 1824 req->isge = 0; 1825 req->state = TID_REQUEST_ACTIVE; 1826 } 1827 1828 /* 1829 * Check sync. The last PSN of each generation is reserved for 1830 * RESYNC. 
1831 */ 1832 if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) { 1833 req->state = TID_REQUEST_SYNC; 1834 goto sync_check; 1835 } 1836 1837 /* Allocate the flow if not yet */ 1838 if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp)) 1839 goto done; 1840 1841 /* 1842 * The following call will advance req->setup_head after 1843 * allocating the tid entries. 1844 */ 1845 if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) { 1846 req->state = TID_REQUEST_QUEUED; 1847 1848 /* 1849 * We don't have resources for this segment. The QP has 1850 * already been queued. 1851 */ 1852 goto done; 1853 } 1854 } 1855 1856 /* req->flow_idx should only be one slot behind req->setup_head */ 1857 flow = &req->flows[req->flow_idx]; 1858 flow->pkt = 0; 1859 flow->tid_idx = 0; 1860 flow->sent = 0; 1861 if (!retry) { 1862 /* Set the first and last IB PSN for the flow in use.*/ 1863 flow->flow_state.ib_spsn = req->s_next_psn; 1864 flow->flow_state.ib_lpsn = 1865 flow->flow_state.ib_spsn + flow->npkts - 1; 1866 } 1867 1868 /* Calculate the next segment start psn.*/ 1869 req->s_next_psn += flow->npkts; 1870 1871 /* Build the packet header */ 1872 hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len); 1873 done: 1874 return hdwords; 1875 } 1876 1877 /* 1878 * Validate and accept the TID RDMA READ request parameters. 1879 * Return 0 if the request is accepted successfully; 1880 * Return 1 otherwise. 1881 */ 1882 static int tid_rdma_rcv_read_request(struct rvt_qp *qp, 1883 struct rvt_ack_entry *e, 1884 struct hfi1_packet *packet, 1885 struct ib_other_headers *ohdr, 1886 u32 bth0, u32 psn, u64 vaddr, u32 len) 1887 { 1888 struct hfi1_qp_priv *qpriv = qp->priv; 1889 struct tid_rdma_request *req; 1890 struct tid_rdma_flow *flow; 1891 u32 flow_psn, i, tidlen = 0, pktlen, tlen; 1892 1893 req = ack_to_tid_req(e); 1894 1895 /* Validate the payload first */ 1896 flow = &req->flows[req->setup_head]; 1897 1898 /* payload length = packet length - (header length + ICRC length) */ 1899 pktlen = packet->tlen - (packet->hlen + 4); 1900 if (pktlen > sizeof(flow->tid_entry)) 1901 return 1; 1902 memcpy(flow->tid_entry, packet->ebuf, pktlen); 1903 flow->tidcnt = pktlen / sizeof(*flow->tid_entry); 1904 1905 /* 1906 * Walk the TID_ENTRY list to make sure we have enough space for a 1907 * complete segment. Also calculate the number of required packets. 1908 */ 1909 flow->npkts = rvt_div_round_up_mtu(qp, len); 1910 for (i = 0; i < flow->tidcnt; i++) { 1911 trace_hfi1_tid_entry_rcv_read_req(qp, i, 1912 flow->tid_entry[i]); 1913 tlen = EXP_TID_GET(flow->tid_entry[i], LEN); 1914 if (!tlen) 1915 return 1; 1916 1917 /* 1918 * For tid pair (tidctr == 3), the buffer size of the pair 1919 * should be the sum of the buffer size described by each 1920 * tid entry. However, only the first entry needs to be 1921 * specified in the request (see WFR HAS Section 8.5.7.1). 
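 * Either way, the per-entry lengths are summed below, and the request
 * is rejected right after this loop if the described buffer
 * (tidlen * PAGE_SIZE) cannot hold the whole segment.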
1922 */ 1923 tidlen += tlen; 1924 } 1925 if (tidlen * PAGE_SIZE < len) 1926 return 1; 1927 1928 /* Empty the flow array */ 1929 req->clear_tail = req->setup_head; 1930 flow->pkt = 0; 1931 flow->tid_idx = 0; 1932 flow->tid_offset = 0; 1933 flow->sent = 0; 1934 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp); 1935 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & 1936 TID_RDMA_DESTQP_FLOW_MASK; 1937 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn)); 1938 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 1939 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; 1940 flow->length = len; 1941 1942 flow->flow_state.lpsn = flow->flow_state.spsn + 1943 flow->npkts - 1; 1944 flow->flow_state.ib_spsn = psn; 1945 flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1; 1946 1947 trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow); 1948 /* Set the initial flow index to the current flow. */ 1949 req->flow_idx = req->setup_head; 1950 1951 /* advance circular buffer head */ 1952 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); 1953 1954 /* 1955 * Compute last PSN for request. 1956 */ 1957 e->opcode = (bth0 >> 24) & 0xff; 1958 e->psn = psn; 1959 e->lpsn = psn + flow->npkts - 1; 1960 e->sent = 0; 1961 1962 req->n_flows = qpriv->tid_rdma.local.max_read; 1963 req->state = TID_REQUEST_ACTIVE; 1964 req->cur_seg = 0; 1965 req->comp_seg = 0; 1966 req->ack_seg = 0; 1967 req->isge = 0; 1968 req->seg_len = qpriv->tid_rdma.local.max_len; 1969 req->total_len = len; 1970 req->total_segs = 1; 1971 req->r_flow_psn = e->psn; 1972 1973 trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn, 1974 req); 1975 return 0; 1976 } 1977 1978 static int tid_rdma_rcv_error(struct hfi1_packet *packet, 1979 struct ib_other_headers *ohdr, 1980 struct rvt_qp *qp, u32 psn, int diff) 1981 { 1982 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 1983 struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; 1984 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 1985 struct hfi1_qp_priv *qpriv = qp->priv; 1986 struct rvt_ack_entry *e; 1987 struct tid_rdma_request *req; 1988 unsigned long flags; 1989 u8 prev; 1990 bool old_req; 1991 1992 trace_hfi1_rsp_tid_rcv_error(qp, psn); 1993 trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff); 1994 if (diff > 0) { 1995 /* sequence error */ 1996 if (!qp->r_nak_state) { 1997 ibp->rvp.n_rc_seqnak++; 1998 qp->r_nak_state = IB_NAK_PSN_ERROR; 1999 qp->r_ack_psn = qp->r_psn; 2000 rc_defered_ack(rcd, qp); 2001 } 2002 goto done; 2003 } 2004 2005 ibp->rvp.n_rc_dupreq++; 2006 2007 spin_lock_irqsave(&qp->s_lock, flags); 2008 e = find_prev_entry(qp, psn, &prev, NULL, &old_req); 2009 if (!e || (e->opcode != TID_OP(READ_REQ) && 2010 e->opcode != TID_OP(WRITE_REQ))) 2011 goto unlock; 2012 2013 req = ack_to_tid_req(e); 2014 req->r_flow_psn = psn; 2015 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req); 2016 if (e->opcode == TID_OP(READ_REQ)) { 2017 struct ib_reth *reth; 2018 u32 len; 2019 u32 rkey; 2020 u64 vaddr; 2021 int ok; 2022 u32 bth0; 2023 2024 reth = &ohdr->u.tid_rdma.r_req.reth; 2025 /* 2026 * The requester always restarts from the start of the original 2027 * request. 
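 * Therefore a duplicate READ REQ whose PSN or length does not match
 * the original request is simply ignored here.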
2028 */ 2029 len = be32_to_cpu(reth->length); 2030 if (psn != e->psn || len != req->total_len) 2031 goto unlock; 2032 2033 release_rdma_sge_mr(e); 2034 2035 rkey = be32_to_cpu(reth->rkey); 2036 vaddr = get_ib_reth_vaddr(reth); 2037 2038 qp->r_len = len; 2039 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, 2040 IB_ACCESS_REMOTE_READ); 2041 if (unlikely(!ok)) 2042 goto unlock; 2043 2044 /* 2045 * If all the response packets for the current request have 2046 * been sent out and this request is complete (old_request 2047 * == false) and the TID flow may be unusable (the 2048 * req->clear_tail is advanced). However, when an earlier 2049 * request is received, this request will not be complete any 2050 * more (qp->s_tail_ack_queue is moved back, see below). 2051 * Consequently, we need to update the TID flow info everytime 2052 * a duplicate request is received. 2053 */ 2054 bth0 = be32_to_cpu(ohdr->bth[0]); 2055 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, 2056 vaddr, len)) 2057 goto unlock; 2058 2059 /* 2060 * True if the request is already scheduled (between 2061 * qp->s_tail_ack_queue and qp->r_head_ack_queue); 2062 */ 2063 if (old_req) 2064 goto unlock; 2065 } else { 2066 struct flow_state *fstate; 2067 bool schedule = false; 2068 u8 i; 2069 2070 if (req->state == TID_REQUEST_RESEND) { 2071 req->state = TID_REQUEST_RESEND_ACTIVE; 2072 } else if (req->state == TID_REQUEST_INIT_RESEND) { 2073 req->state = TID_REQUEST_INIT; 2074 schedule = true; 2075 } 2076 2077 /* 2078 * True if the request is already scheduled (between 2079 * qp->s_tail_ack_queue and qp->r_head_ack_queue). 2080 * Also, don't change requests, which are at the SYNC 2081 * point and haven't generated any responses yet. 2082 * There is nothing to retransmit for them yet. 2083 */ 2084 if (old_req || req->state == TID_REQUEST_INIT || 2085 (req->state == TID_REQUEST_SYNC && !req->cur_seg)) { 2086 for (i = prev + 1; ; i++) { 2087 if (i > rvt_size_atomic(&dev->rdi)) 2088 i = 0; 2089 if (i == qp->r_head_ack_queue) 2090 break; 2091 e = &qp->s_ack_queue[i]; 2092 req = ack_to_tid_req(e); 2093 if (e->opcode == TID_OP(WRITE_REQ) && 2094 req->state == TID_REQUEST_INIT) 2095 req->state = TID_REQUEST_INIT_RESEND; 2096 } 2097 /* 2098 * If the state of the request has been changed, 2099 * the first leg needs to get scheduled in order to 2100 * pick up the change. Otherwise, normal response 2101 * processing should take care of it. 2102 */ 2103 if (!schedule) 2104 goto unlock; 2105 } 2106 2107 /* 2108 * If there is no more allocated segment, just schedule the qp 2109 * without changing any state. 2110 */ 2111 if (req->clear_tail == req->setup_head) 2112 goto schedule; 2113 /* 2114 * If this request has sent responses for segments, which have 2115 * not received data yet (flow_idx != clear_tail), the flow_idx 2116 * pointer needs to be adjusted so the same responses can be 2117 * re-sent. 2118 */ 2119 if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) { 2120 fstate = &req->flows[req->clear_tail].flow_state; 2121 qpriv->pending_tid_w_segs -= 2122 CIRC_CNT(req->flow_idx, req->clear_tail, 2123 MAX_FLOWS); 2124 req->flow_idx = 2125 CIRC_ADD(req->clear_tail, 2126 delta_psn(psn, fstate->resp_ib_psn), 2127 MAX_FLOWS); 2128 qpriv->pending_tid_w_segs += 2129 delta_psn(psn, fstate->resp_ib_psn); 2130 /* 2131 * When flow_idx == setup_head, we've gotten a duplicate 2132 * request for a segment, which has not been allocated 2133 * yet. In that case, don't adjust this request. 
2134 * However, we still want to go through the loop below 2135 * to adjust all subsequent requests. 2136 */ 2137 if (CIRC_CNT(req->setup_head, req->flow_idx, 2138 MAX_FLOWS)) { 2139 req->cur_seg = delta_psn(psn, e->psn); 2140 req->state = TID_REQUEST_RESEND_ACTIVE; 2141 } 2142 } 2143 2144 for (i = prev + 1; ; i++) { 2145 /* 2146 * Look at everything up to and including 2147 * s_tail_ack_queue 2148 */ 2149 if (i > rvt_size_atomic(&dev->rdi)) 2150 i = 0; 2151 if (i == qp->r_head_ack_queue) 2152 break; 2153 e = &qp->s_ack_queue[i]; 2154 req = ack_to_tid_req(e); 2155 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, 2156 e->lpsn, req); 2157 if (e->opcode != TID_OP(WRITE_REQ) || 2158 req->cur_seg == req->comp_seg || 2159 req->state == TID_REQUEST_INIT || 2160 req->state == TID_REQUEST_INIT_RESEND) { 2161 if (req->state == TID_REQUEST_INIT) 2162 req->state = TID_REQUEST_INIT_RESEND; 2163 continue; 2164 } 2165 qpriv->pending_tid_w_segs -= 2166 CIRC_CNT(req->flow_idx, 2167 req->clear_tail, 2168 MAX_FLOWS); 2169 req->flow_idx = req->clear_tail; 2170 req->state = TID_REQUEST_RESEND; 2171 req->cur_seg = req->comp_seg; 2172 } 2173 qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; 2174 } 2175 /* Re-process old requests.*/ 2176 if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) 2177 qp->s_acked_ack_queue = prev; 2178 qp->s_tail_ack_queue = prev; 2179 /* 2180 * Since the qp->s_tail_ack_queue is modified, the 2181 * qp->s_ack_state must be changed to re-initialize 2182 * qp->s_ack_rdma_sge; Otherwise, we will end up in 2183 * wrong memory region. 2184 */ 2185 qp->s_ack_state = OP(ACKNOWLEDGE); 2186 schedule: 2187 /* 2188 * It's possible to receive a retry psn that is earlier than an RNRNAK 2189 * psn. In this case, the rnrnak state should be cleared. 2190 */ 2191 if (qpriv->rnr_nak_state) { 2192 qp->s_nak_state = 0; 2193 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 2194 qp->r_psn = e->lpsn + 1; 2195 hfi1_tid_write_alloc_resources(qp, true); 2196 } 2197 2198 qp->r_state = e->opcode; 2199 qp->r_nak_state = 0; 2200 qp->s_flags |= RVT_S_RESP_PENDING; 2201 hfi1_schedule_send(qp); 2202 unlock: 2203 spin_unlock_irqrestore(&qp->s_lock, flags); 2204 done: 2205 return 1; 2206 } 2207 2208 void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) 2209 { 2210 /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/ 2211 2212 /* 2213 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ 2214 * (see hfi1_rc_rcv()) 2215 * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue) 2216 * - Setup struct tid_rdma_req with request info 2217 * - Initialize struct tid_rdma_flow info; 2218 * - Copy TID entries; 2219 * 3. Set the qp->s_ack_state. 2220 * 4. Set RVT_S_RESP_PENDING in s_flags. 2221 * 5. 
Kick the send engine (hfi1_schedule_send()) 2222 */ 2223 struct hfi1_ctxtdata *rcd = packet->rcd; 2224 struct rvt_qp *qp = packet->qp; 2225 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 2226 struct ib_other_headers *ohdr = packet->ohdr; 2227 struct rvt_ack_entry *e; 2228 unsigned long flags; 2229 struct ib_reth *reth; 2230 struct hfi1_qp_priv *qpriv = qp->priv; 2231 u32 bth0, psn, len, rkey; 2232 bool fecn; 2233 u8 next; 2234 u64 vaddr; 2235 int diff; 2236 u8 nack_state = IB_NAK_INVALID_REQUEST; 2237 2238 bth0 = be32_to_cpu(ohdr->bth[0]); 2239 if (hfi1_ruc_check_hdr(ibp, packet)) 2240 return; 2241 2242 fecn = process_ecn(qp, packet); 2243 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2244 trace_hfi1_rsp_rcv_tid_read_req(qp, psn); 2245 2246 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 2247 rvt_comm_est(qp); 2248 2249 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) 2250 goto nack_inv; 2251 2252 reth = &ohdr->u.tid_rdma.r_req.reth; 2253 vaddr = be64_to_cpu(reth->vaddr); 2254 len = be32_to_cpu(reth->length); 2255 /* The length needs to be in multiples of PAGE_SIZE */ 2256 if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len) 2257 goto nack_inv; 2258 2259 diff = delta_psn(psn, qp->r_psn); 2260 if (unlikely(diff)) { 2261 tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); 2262 return; 2263 } 2264 2265 /* We've verified the request, insert it into the ack queue. */ 2266 next = qp->r_head_ack_queue + 1; 2267 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 2268 next = 0; 2269 spin_lock_irqsave(&qp->s_lock, flags); 2270 if (unlikely(next == qp->s_tail_ack_queue)) { 2271 if (!qp->s_ack_queue[next].sent) { 2272 nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; 2273 goto nack_inv_unlock; 2274 } 2275 update_ack_queue(qp, next); 2276 } 2277 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 2278 release_rdma_sge_mr(e); 2279 2280 rkey = be32_to_cpu(reth->rkey); 2281 qp->r_len = len; 2282 2283 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, 2284 rkey, IB_ACCESS_REMOTE_READ))) 2285 goto nack_acc; 2286 2287 /* Accept the request parameters */ 2288 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr, 2289 len)) 2290 goto nack_inv_unlock; 2291 2292 qp->r_state = e->opcode; 2293 qp->r_nak_state = 0; 2294 /* 2295 * We need to increment the MSN here instead of when we 2296 * finish sending the result since a duplicate request would 2297 * increment it more than once. 2298 */ 2299 qp->r_msn++; 2300 qp->r_psn += e->lpsn - e->psn + 1; 2301 2302 qp->r_head_ack_queue = next; 2303 2304 /* 2305 * For all requests other than TID WRITE which are added to the ack 2306 * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to 2307 * do this because of interlocks between these and TID WRITE 2308 * requests. The same change has also been made in hfi1_rc_rcv(). 2309 */ 2310 qpriv->r_tid_alloc = qp->r_head_ack_queue; 2311 2312 /* Schedule the send tasklet. 
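 * Setting RVT_S_RESP_PENDING lets the send engine service the new ack
 * queue entry and generate the TID RDMA READ RESP packets.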
*/ 2313 qp->s_flags |= RVT_S_RESP_PENDING; 2314 if (fecn) 2315 qp->s_flags |= RVT_S_ECN; 2316 hfi1_schedule_send(qp); 2317 2318 spin_unlock_irqrestore(&qp->s_lock, flags); 2319 return; 2320 2321 nack_inv_unlock: 2322 spin_unlock_irqrestore(&qp->s_lock, flags); 2323 nack_inv: 2324 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 2325 qp->r_nak_state = nack_state; 2326 qp->r_ack_psn = qp->r_psn; 2327 /* Queue NAK for later */ 2328 rc_defered_ack(rcd, qp); 2329 return; 2330 nack_acc: 2331 spin_unlock_irqrestore(&qp->s_lock, flags); 2332 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 2333 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 2334 qp->r_ack_psn = qp->r_psn; 2335 } 2336 2337 u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, 2338 struct ib_other_headers *ohdr, u32 *bth0, 2339 u32 *bth1, u32 *bth2, u32 *len, bool *last) 2340 { 2341 struct hfi1_ack_priv *epriv = e->priv; 2342 struct tid_rdma_request *req = &epriv->tid_req; 2343 struct hfi1_qp_priv *qpriv = qp->priv; 2344 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 2345 u32 tidentry = flow->tid_entry[flow->tid_idx]; 2346 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; 2347 struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp; 2348 u32 next_offset, om = KDETH_OM_LARGE; 2349 bool last_pkt; 2350 u32 hdwords = 0; 2351 struct tid_rdma_params *remote; 2352 2353 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); 2354 flow->sent += *len; 2355 next_offset = flow->tid_offset + *len; 2356 last_pkt = (flow->sent >= flow->length); 2357 2358 trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry); 2359 trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow); 2360 2361 rcu_read_lock(); 2362 remote = rcu_dereference(qpriv->tid_rdma.remote); 2363 if (!remote) { 2364 rcu_read_unlock(); 2365 goto done; 2366 } 2367 KDETH_RESET(resp->kdeth0, KVER, 0x1); 2368 KDETH_SET(resp->kdeth0, SH, !last_pkt); 2369 KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg)); 2370 KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); 2371 KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); 2372 KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE); 2373 KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om); 2374 KDETH_RESET(resp->kdeth1, JKEY, remote->jkey); 2375 resp->verbs_qp = cpu_to_be32(qp->remote_qpn); 2376 rcu_read_unlock(); 2377 2378 resp->aeth = rvt_compute_aeth(qp); 2379 resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn + 2380 flow->pkt)); 2381 2382 *bth0 = TID_OP(READ_RESP) << 24; 2383 *bth1 = flow->tid_qpn; 2384 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & 2385 HFI1_KDETH_BTH_SEQ_MASK) | 2386 (flow->flow_state.generation << 2387 HFI1_KDETH_BTH_SEQ_SHIFT)); 2388 *last = last_pkt; 2389 if (last_pkt) 2390 /* Advance to next flow */ 2391 req->clear_tail = (req->clear_tail + 1) & 2392 (MAX_FLOWS - 1); 2393 2394 if (next_offset >= tidlen) { 2395 flow->tid_offset = 0; 2396 flow->tid_idx++; 2397 } else { 2398 flow->tid_offset = next_offset; 2399 } 2400 2401 hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32); 2402 2403 done: 2404 return hdwords; 2405 } 2406 2407 static inline struct tid_rdma_request * 2408 find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode) 2409 __must_hold(&qp->s_lock) 2410 { 2411 struct rvt_swqe *wqe; 2412 struct tid_rdma_request *req = NULL; 2413 u32 i, end; 2414 2415 end = qp->s_cur + 1; 2416 if (end == qp->s_size) 2417 end = 0; 2418 for (i = qp->s_acked; i != end;) { 2419 wqe = rvt_get_swqe_ptr(qp, i); 2420 if (cmp_psn(psn, 
wqe->psn) >= 0 && 2421 cmp_psn(psn, wqe->lpsn) <= 0) { 2422 if (wqe->wr.opcode == opcode) 2423 req = wqe_to_tid_req(wqe); 2424 break; 2425 } 2426 if (++i == qp->s_size) 2427 i = 0; 2428 } 2429 2430 return req; 2431 } 2432 2433 void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) 2434 { 2435 /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */ 2436 2437 /* 2438 * 1. Find matching SWQE 2439 * 2. Check that the entire segment has been read. 2440 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags. 2441 * 4. Free the TID flow resources. 2442 * 5. Kick the send engine (hfi1_schedule_send()) 2443 */ 2444 struct ib_other_headers *ohdr = packet->ohdr; 2445 struct rvt_qp *qp = packet->qp; 2446 struct hfi1_qp_priv *priv = qp->priv; 2447 struct hfi1_ctxtdata *rcd = packet->rcd; 2448 struct tid_rdma_request *req; 2449 struct tid_rdma_flow *flow; 2450 u32 opcode, aeth; 2451 bool fecn; 2452 unsigned long flags; 2453 u32 kpsn, ipsn; 2454 2455 trace_hfi1_sender_rcv_tid_read_resp(qp); 2456 fecn = process_ecn(qp, packet); 2457 kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2458 aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); 2459 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 2460 2461 spin_lock_irqsave(&qp->s_lock, flags); 2462 ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); 2463 req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ); 2464 if (unlikely(!req)) 2465 goto ack_op_err; 2466 2467 flow = &req->flows[req->clear_tail]; 2468 /* When header suppression is disabled */ 2469 if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) { 2470 update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); 2471 2472 if (cmp_psn(kpsn, flow->flow_state.r_next_psn)) 2473 goto ack_done; 2474 flow->flow_state.r_next_psn = mask_psn(kpsn + 1); 2475 /* 2476 * Copy the payload to destination buffer if this packet is 2477 * delivered as an eager packet due to RSM rule and FECN. 2478 * The RSM rule selects FECN bit in BTH and SH bit in 2479 * KDETH header and therefore will not match the last 2480 * packet of each segment that has SH bit cleared. 
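 * Such packets are delivered to the eager buffer with their payload,
 * so the data has to be copied into the destination SGE by software
 * below.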
2481 */ 2482 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { 2483 struct rvt_sge_state ss; 2484 u32 len; 2485 u32 tlen = packet->tlen; 2486 u16 hdrsize = packet->hlen; 2487 u8 pad = packet->pad; 2488 u8 extra_bytes = pad + packet->extra_byte + 2489 (SIZE_OF_CRC << 2); 2490 u32 pmtu = qp->pmtu; 2491 2492 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 2493 goto ack_op_err; 2494 len = restart_sge(&ss, req->e.swqe, ipsn, pmtu); 2495 if (unlikely(len < pmtu)) 2496 goto ack_op_err; 2497 rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, 2498 false); 2499 /* Raise the sw sequence check flag for next packet */ 2500 priv->s_flags |= HFI1_R_TID_SW_PSN; 2501 } 2502 2503 goto ack_done; 2504 } 2505 flow->flow_state.r_next_psn = mask_psn(kpsn + 1); 2506 req->ack_pending--; 2507 priv->pending_tid_r_segs--; 2508 qp->s_num_rd_atomic--; 2509 if ((qp->s_flags & RVT_S_WAIT_FENCE) && 2510 !qp->s_num_rd_atomic) { 2511 qp->s_flags &= ~(RVT_S_WAIT_FENCE | 2512 RVT_S_WAIT_ACK); 2513 hfi1_schedule_send(qp); 2514 } 2515 if (qp->s_flags & RVT_S_WAIT_RDMAR) { 2516 qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK); 2517 hfi1_schedule_send(qp); 2518 } 2519 2520 trace_hfi1_ack(qp, ipsn); 2521 trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode, 2522 req->e.swqe->psn, req->e.swqe->lpsn, 2523 req); 2524 trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow); 2525 2526 /* Release the tid resources */ 2527 hfi1_kern_exp_rcv_clear(req); 2528 2529 if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd)) 2530 goto ack_done; 2531 2532 /* If not done yet, build next read request */ 2533 if (++req->comp_seg >= req->total_segs) { 2534 priv->tid_r_comp++; 2535 req->state = TID_REQUEST_COMPLETE; 2536 } 2537 2538 /* 2539 * Clear the hw flow under two conditions: 2540 * 1. This request is a sync point and it is complete; 2541 * 2. Current request is completed and there are no more requests. 2542 */ 2543 if ((req->state == TID_REQUEST_SYNC && 2544 req->comp_seg == req->cur_seg) || 2545 priv->tid_r_comp == priv->tid_r_reqs) { 2546 hfi1_kern_clear_hw_flow(priv->rcd, qp); 2547 priv->s_flags &= ~HFI1_R_TID_SW_PSN; 2548 if (req->state == TID_REQUEST_SYNC) 2549 req->state = TID_REQUEST_ACTIVE; 2550 } 2551 2552 hfi1_schedule_send(qp); 2553 goto ack_done; 2554 2555 ack_op_err: 2556 /* 2557 * The test indicates that the send engine has finished its cleanup 2558 * after sending the request and it's now safe to put the QP into error 2559 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail 2560 * == qp->s_head), it would be unsafe to complete the wqe pointed by 2561 * qp->s_acked here. Putting the qp into error state will safely flush 2562 * all remaining requests. 
2563  */
2564	if (qp->s_last == qp->s_acked)
2565		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
2566
2567 ack_done:
2568	spin_unlock_irqrestore(&qp->s_lock, flags);
2569 }
2570
2571 void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
2572	__must_hold(&qp->s_lock)
2573 {
2574	u32 n = qp->s_acked;
2575	struct rvt_swqe *wqe;
2576	struct tid_rdma_request *req;
2577	struct hfi1_qp_priv *priv = qp->priv;
2578
2579	lockdep_assert_held(&qp->s_lock);
2580	/* Free any TID entries */
2581	while (n != qp->s_tail) {
2582		wqe = rvt_get_swqe_ptr(qp, n);
2583		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2584			req = wqe_to_tid_req(wqe);
2585			hfi1_kern_exp_rcv_clear_all(req);
2586		}
2587
2588		if (++n == qp->s_size)
2589			n = 0;
2590	}
2591	/* Free flow */
2592	hfi1_kern_clear_hw_flow(priv->rcd, qp);
2593 }
2594
2595 static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type)
2596 {
2597	struct rvt_qp *qp = packet->qp;
2598
2599	if (rcv_type >= RHF_RCV_TYPE_IB)
2600		goto done;
2601
2602	spin_lock(&qp->s_lock);
2603
2604	/*
2605	 * We have run out of space in the eager buffer.
2606	 * Eagerly received KDETH packets which require space in the
2607	 * Eager buffer (packets that have a payload) are TID RDMA WRITE
2608	 * response packets. In this case, we have to re-transmit the
2609	 * TID RDMA WRITE request.
2610	 */
2611	if (rcv_type == RHF_RCV_TYPE_EAGER) {
2612		hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
2613		hfi1_schedule_send(qp);
2614	}
2615
2616	/* Since no payload is delivered, just drop the packet */
2617	spin_unlock(&qp->s_lock);
2618 done:
2619	return true;
2620 }
2621
2622 static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
2623				       struct rvt_qp *qp, struct rvt_swqe *wqe)
2624 {
2625	struct tid_rdma_request *req;
2626	struct tid_rdma_flow *flow;
2627
2628	/* Start from the right segment */
2629	qp->r_flags |= RVT_R_RDMAR_SEQ;
2630	req = wqe_to_tid_req(wqe);
2631	flow = &req->flows[req->clear_tail];
2632	hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
2633	if (list_empty(&qp->rspwait)) {
2634		qp->r_flags |= RVT_R_RSP_SEND;
2635		rvt_get_qp(qp);
2636		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2637	}
2638 }
2639
2640 /*
2641  * Handle the KDETH eflags for TID RDMA READ response.
2642  *
2643  * Return false if the last packet for a segment has been received and the
2644  * caller should process the response normally; otherwise, return true.
2645  *
2646  * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
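 *
 * On a flow sequence error the handler falls back to software PSN
 * tracking (HFI1_R_TID_SW_PSN) and, when needed, restarts the affected
 * TID RDMA READ request from the current segment.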
2647 */ 2648 static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, 2649 struct hfi1_packet *packet, u8 rcv_type, 2650 u8 rte, u32 psn, u32 ibpsn) 2651 __must_hold(&packet->qp->r_lock) __must_hold(RCU) 2652 { 2653 struct hfi1_pportdata *ppd = rcd->ppd; 2654 struct hfi1_devdata *dd = ppd->dd; 2655 struct hfi1_ibport *ibp; 2656 struct rvt_swqe *wqe; 2657 struct tid_rdma_request *req; 2658 struct tid_rdma_flow *flow; 2659 u32 ack_psn; 2660 struct rvt_qp *qp = packet->qp; 2661 struct hfi1_qp_priv *priv = qp->priv; 2662 bool ret = true; 2663 int diff = 0; 2664 u32 fpsn; 2665 2666 lockdep_assert_held(&qp->r_lock); 2667 trace_hfi1_rsp_read_kdeth_eflags(qp, ibpsn); 2668 trace_hfi1_sender_read_kdeth_eflags(qp); 2669 trace_hfi1_tid_read_sender_kdeth_eflags(qp, 0); 2670 spin_lock(&qp->s_lock); 2671 /* If the psn is out of valid range, drop the packet */ 2672 if (cmp_psn(ibpsn, qp->s_last_psn) < 0 || 2673 cmp_psn(ibpsn, qp->s_psn) > 0) 2674 goto s_unlock; 2675 2676 /* 2677 * Note that NAKs implicitly ACK outstanding SEND and RDMA write 2678 * requests and implicitly NAK RDMA read and atomic requests issued 2679 * before the NAK'ed request. 2680 */ 2681 ack_psn = ibpsn - 1; 2682 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2683 ibp = to_iport(qp->ibqp.device, qp->port_num); 2684 2685 /* Complete WQEs that the PSN finishes. */ 2686 while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) { 2687 /* 2688 * If this request is a RDMA read or atomic, and the NACK is 2689 * for a later operation, this NACK NAKs the RDMA read or 2690 * atomic. 2691 */ 2692 if (wqe->wr.opcode == IB_WR_RDMA_READ || 2693 wqe->wr.opcode == IB_WR_TID_RDMA_READ || 2694 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2695 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { 2696 /* Retry this request. */ 2697 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { 2698 qp->r_flags |= RVT_R_RDMAR_SEQ; 2699 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2700 restart_tid_rdma_read_req(rcd, qp, 2701 wqe); 2702 } else { 2703 hfi1_restart_rc(qp, qp->s_last_psn + 1, 2704 0); 2705 if (list_empty(&qp->rspwait)) { 2706 qp->r_flags |= RVT_R_RSP_SEND; 2707 rvt_get_qp(qp); 2708 list_add_tail(/* wait */ 2709 &qp->rspwait, 2710 &rcd->qp_wait_list); 2711 } 2712 } 2713 } 2714 /* 2715 * No need to process the NAK since we are 2716 * restarting an earlier request. 2717 */ 2718 break; 2719 } 2720 2721 wqe = do_rc_completion(qp, wqe, ibp); 2722 if (qp->s_acked == qp->s_tail) 2723 goto s_unlock; 2724 } 2725 2726 if (qp->s_acked == qp->s_tail) 2727 goto s_unlock; 2728 2729 /* Handle the eflags for the request */ 2730 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 2731 goto s_unlock; 2732 2733 req = wqe_to_tid_req(wqe); 2734 trace_hfi1_tid_req_read_kdeth_eflags(qp, 0, wqe->wr.opcode, wqe->psn, 2735 wqe->lpsn, req); 2736 switch (rcv_type) { 2737 case RHF_RCV_TYPE_EXPECTED: 2738 switch (rte) { 2739 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: 2740 /* 2741 * On the first occurrence of a Flow Sequence error, 2742 * the flag TID_FLOW_SW_PSN is set. 2743 * 2744 * After that, the flow is *not* reprogrammed and the 2745 * protocol falls back to SW PSN checking. This is done 2746 * to prevent continuous Flow Sequence errors for any 2747 * packets that could be still in the fabric. 
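 * While in this mode, each incoming packet of the segment is checked
 * against flow_state.r_next_psn in software (see below).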
2748 */ 2749 flow = &req->flows[req->clear_tail]; 2750 trace_hfi1_tid_flow_read_kdeth_eflags(qp, 2751 req->clear_tail, 2752 flow); 2753 if (priv->s_flags & HFI1_R_TID_SW_PSN) { 2754 diff = cmp_psn(psn, 2755 flow->flow_state.r_next_psn); 2756 if (diff > 0) { 2757 /* Drop the packet.*/ 2758 goto s_unlock; 2759 } else if (diff < 0) { 2760 /* 2761 * If a response packet for a restarted 2762 * request has come back, reset the 2763 * restart flag. 2764 */ 2765 if (qp->r_flags & RVT_R_RDMAR_SEQ) 2766 qp->r_flags &= 2767 ~RVT_R_RDMAR_SEQ; 2768 2769 /* Drop the packet.*/ 2770 goto s_unlock; 2771 } 2772 2773 /* 2774 * If SW PSN verification is successful and 2775 * this is the last packet in the segment, tell 2776 * the caller to process it as a normal packet. 2777 */ 2778 fpsn = full_flow_psn(flow, 2779 flow->flow_state.lpsn); 2780 if (cmp_psn(fpsn, psn) == 0) { 2781 ret = false; 2782 if (qp->r_flags & RVT_R_RDMAR_SEQ) 2783 qp->r_flags &= 2784 ~RVT_R_RDMAR_SEQ; 2785 } 2786 flow->flow_state.r_next_psn = 2787 mask_psn(psn + 1); 2788 } else { 2789 u32 last_psn; 2790 2791 last_psn = read_r_next_psn(dd, rcd->ctxt, 2792 flow->idx); 2793 flow->flow_state.r_next_psn = last_psn; 2794 priv->s_flags |= HFI1_R_TID_SW_PSN; 2795 /* 2796 * If no request has been restarted yet, 2797 * restart the current one. 2798 */ 2799 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) 2800 restart_tid_rdma_read_req(rcd, qp, 2801 wqe); 2802 } 2803 2804 break; 2805 2806 case RHF_RTE_EXPECTED_FLOW_GEN_ERR: 2807 /* 2808 * Since the TID flow is able to ride through 2809 * generation mismatch, drop this stale packet. 2810 */ 2811 break; 2812 2813 default: 2814 break; 2815 } 2816 break; 2817 2818 case RHF_RCV_TYPE_ERROR: 2819 switch (rte) { 2820 case RHF_RTE_ERROR_OP_CODE_ERR: 2821 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: 2822 case RHF_RTE_ERROR_KHDR_HCRC_ERR: 2823 case RHF_RTE_ERROR_KHDR_KVER_ERR: 2824 case RHF_RTE_ERROR_CONTEXT_ERR: 2825 case RHF_RTE_ERROR_KHDR_TID_ERR: 2826 default: 2827 break; 2828 } 2829 default: 2830 break; 2831 } 2832 s_unlock: 2833 spin_unlock(&qp->s_lock); 2834 return ret; 2835 } 2836 2837 bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, 2838 struct hfi1_pportdata *ppd, 2839 struct hfi1_packet *packet) 2840 { 2841 struct hfi1_ibport *ibp = &ppd->ibport_data; 2842 struct hfi1_devdata *dd = ppd->dd; 2843 struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; 2844 u8 rcv_type = rhf_rcv_type(packet->rhf); 2845 u8 rte = rhf_rcv_type_err(packet->rhf); 2846 struct ib_header *hdr = packet->hdr; 2847 struct ib_other_headers *ohdr = NULL; 2848 int lnh = be16_to_cpu(hdr->lrh[0]) & 3; 2849 u16 lid = be16_to_cpu(hdr->lrh[1]); 2850 u8 opcode; 2851 u32 qp_num, psn, ibpsn; 2852 struct rvt_qp *qp; 2853 struct hfi1_qp_priv *qpriv; 2854 unsigned long flags; 2855 bool ret = true; 2856 struct rvt_ack_entry *e; 2857 struct tid_rdma_request *req; 2858 struct tid_rdma_flow *flow; 2859 int diff = 0; 2860 2861 trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", 2862 packet->rhf); 2863 if (packet->rhf & RHF_ICRC_ERR) 2864 return ret; 2865 2866 packet->ohdr = &hdr->u.oth; 2867 ohdr = packet->ohdr; 2868 trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); 2869 2870 /* Get the destination QP number. 
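 * For TID RDMA packets the BTH QPN carries the KDETH QP/flow/context
 * addressing, so the verbs QP number has to be taken from the verbs_qp
 * field of the TID RDMA header instead.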
*/ 2871 qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) & 2872 RVT_QPN_MASK; 2873 if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) 2874 goto drop; 2875 2876 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2877 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 2878 2879 rcu_read_lock(); 2880 qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); 2881 if (!qp) 2882 goto rcu_unlock; 2883 2884 packet->qp = qp; 2885 2886 /* Check for valid receive state. */ 2887 spin_lock_irqsave(&qp->r_lock, flags); 2888 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { 2889 ibp->rvp.n_pkt_drops++; 2890 goto r_unlock; 2891 } 2892 2893 if (packet->rhf & RHF_TID_ERR) { 2894 /* For TIDERR and RC QPs preemptively schedule a NAK */ 2895 u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ 2896 2897 /* Sanity check packet */ 2898 if (tlen < 24) 2899 goto r_unlock; 2900 2901 /* 2902 * Check for GRH. We should never get packets with GRH in this 2903 * path. 2904 */ 2905 if (lnh == HFI1_LRH_GRH) 2906 goto r_unlock; 2907 2908 if (tid_rdma_tid_err(packet, rcv_type)) 2909 goto r_unlock; 2910 } 2911 2912 /* handle TID RDMA READ */ 2913 if (opcode == TID_OP(READ_RESP)) { 2914 ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn); 2915 ibpsn = mask_psn(ibpsn); 2916 ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn, 2917 ibpsn); 2918 goto r_unlock; 2919 } 2920 2921 /* 2922 * qp->s_tail_ack_queue points to the rvt_ack_entry currently being 2923 * processed. These a completed sequentially so we can be sure that 2924 * the pointer will not change until the entire request has completed. 2925 */ 2926 spin_lock(&qp->s_lock); 2927 qpriv = qp->priv; 2928 if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID || 2929 qpriv->r_tid_tail == qpriv->r_tid_head) 2930 goto unlock; 2931 e = &qp->s_ack_queue[qpriv->r_tid_tail]; 2932 if (e->opcode != TID_OP(WRITE_REQ)) 2933 goto unlock; 2934 req = ack_to_tid_req(e); 2935 if (req->comp_seg == req->cur_seg) 2936 goto unlock; 2937 flow = &req->flows[req->clear_tail]; 2938 trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn); 2939 trace_hfi1_rsp_handle_kdeth_eflags(qp, psn); 2940 trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp); 2941 trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn, 2942 e->lpsn, req); 2943 trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow); 2944 2945 switch (rcv_type) { 2946 case RHF_RCV_TYPE_EXPECTED: 2947 switch (rte) { 2948 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: 2949 if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) { 2950 qpriv->s_flags |= HFI1_R_TID_SW_PSN; 2951 flow->flow_state.r_next_psn = 2952 read_r_next_psn(dd, rcd->ctxt, 2953 flow->idx); 2954 qpriv->r_next_psn_kdeth = 2955 flow->flow_state.r_next_psn; 2956 goto nak_psn; 2957 } else { 2958 /* 2959 * If the received PSN does not match the next 2960 * expected PSN, NAK the packet. 2961 * However, only do that if we know that the a 2962 * NAK has already been sent. Otherwise, this 2963 * mismatch could be due to packets that were 2964 * already in flight. 2965 */ 2966 diff = cmp_psn(psn, 2967 flow->flow_state.r_next_psn); 2968 if (diff > 0) 2969 goto nak_psn; 2970 else if (diff < 0) 2971 break; 2972 2973 qpriv->s_nak_state = 0; 2974 /* 2975 * If SW PSN verification is successful and this 2976 * is the last packet in the segment, tell the 2977 * caller to process it as a normal packet. 
2978 */ 2979 if (psn == full_flow_psn(flow, 2980 flow->flow_state.lpsn)) 2981 ret = false; 2982 flow->flow_state.r_next_psn = 2983 mask_psn(psn + 1); 2984 qpriv->r_next_psn_kdeth = 2985 flow->flow_state.r_next_psn; 2986 } 2987 break; 2988 2989 case RHF_RTE_EXPECTED_FLOW_GEN_ERR: 2990 goto nak_psn; 2991 2992 default: 2993 break; 2994 } 2995 break; 2996 2997 case RHF_RCV_TYPE_ERROR: 2998 switch (rte) { 2999 case RHF_RTE_ERROR_OP_CODE_ERR: 3000 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: 3001 case RHF_RTE_ERROR_KHDR_HCRC_ERR: 3002 case RHF_RTE_ERROR_KHDR_KVER_ERR: 3003 case RHF_RTE_ERROR_CONTEXT_ERR: 3004 case RHF_RTE_ERROR_KHDR_TID_ERR: 3005 default: 3006 break; 3007 } 3008 default: 3009 break; 3010 } 3011 3012 unlock: 3013 spin_unlock(&qp->s_lock); 3014 r_unlock: 3015 spin_unlock_irqrestore(&qp->r_lock, flags); 3016 rcu_unlock: 3017 rcu_read_unlock(); 3018 drop: 3019 return ret; 3020 nak_psn: 3021 ibp->rvp.n_rc_seqnak++; 3022 if (!qpriv->s_nak_state) { 3023 qpriv->s_nak_state = IB_NAK_PSN_ERROR; 3024 /* We are NAK'ing the next expected PSN */ 3025 qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn); 3026 tid_rdma_trigger_ack(qp); 3027 } 3028 goto unlock; 3029 } 3030 3031 /* 3032 * "Rewind" the TID request information. 3033 * This means that we reset the state back to ACTIVE, 3034 * find the proper flow, set the flow index to that flow, 3035 * and reset the flow information. 3036 */ 3037 void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 3038 u32 *bth2) 3039 { 3040 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 3041 struct tid_rdma_flow *flow; 3042 struct hfi1_qp_priv *qpriv = qp->priv; 3043 int diff, delta_pkts; 3044 u32 tididx = 0, i; 3045 u16 fidx; 3046 3047 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 3048 *bth2 = mask_psn(qp->s_psn); 3049 flow = find_flow_ib(req, *bth2, &fidx); 3050 if (!flow) { 3051 trace_hfi1_msg_tid_restart_req(/* msg */ 3052 qp, "!!!!!! Could not find flow to restart: bth2 ", 3053 (u64)*bth2); 3054 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, 3055 wqe->psn, wqe->lpsn, 3056 req); 3057 return; 3058 } 3059 } else { 3060 fidx = req->acked_tail; 3061 flow = &req->flows[fidx]; 3062 *bth2 = mask_psn(req->r_ack_psn); 3063 } 3064 3065 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 3066 delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn); 3067 else 3068 delta_pkts = delta_psn(*bth2, 3069 full_flow_psn(flow, 3070 flow->flow_state.spsn)); 3071 3072 trace_hfi1_tid_flow_restart_req(qp, fidx, flow); 3073 diff = delta_pkts + flow->resync_npkts; 3074 3075 flow->sent = 0; 3076 flow->pkt = 0; 3077 flow->tid_idx = 0; 3078 flow->tid_offset = 0; 3079 if (diff) { 3080 for (tididx = 0; tididx < flow->tidcnt; tididx++) { 3081 u32 tidentry = flow->tid_entry[tididx], tidlen, 3082 tidnpkts, npkts; 3083 3084 flow->tid_offset = 0; 3085 tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE; 3086 tidnpkts = rvt_div_round_up_mtu(qp, tidlen); 3087 npkts = min_t(u32, diff, tidnpkts); 3088 flow->pkt += npkts; 3089 flow->sent += (npkts == tidnpkts ? tidlen : 3090 npkts * qp->pmtu); 3091 flow->tid_offset += npkts * qp->pmtu; 3092 diff -= npkts; 3093 if (!diff) 3094 break; 3095 } 3096 } 3097 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 3098 rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) + 3099 flow->sent, 0); 3100 /* 3101 * Packet PSN is based on flow_state.spsn + flow->pkt. However, 3102 * during a RESYNC, the generation is incremented and the 3103 * sequence is reset to 0. 
Since we've adjusted the npkts in the 3104 * flow and the SGE has been sufficiently advanced, we have to 3105 * adjust flow->pkt in order to calculate the correct PSN. 3106 */ 3107 flow->pkt -= flow->resync_npkts; 3108 } 3109 3110 if (flow->tid_offset == 3111 EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) { 3112 tididx++; 3113 flow->tid_offset = 0; 3114 } 3115 flow->tid_idx = tididx; 3116 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 3117 /* Move flow_idx to correct index */ 3118 req->flow_idx = fidx; 3119 else 3120 req->clear_tail = fidx; 3121 3122 trace_hfi1_tid_flow_restart_req(qp, fidx, flow); 3123 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn, 3124 wqe->lpsn, req); 3125 req->state = TID_REQUEST_ACTIVE; 3126 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 3127 /* Reset all the flows that we are going to resend */ 3128 fidx = CIRC_NEXT(fidx, MAX_FLOWS); 3129 i = qpriv->s_tid_tail; 3130 do { 3131 for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS); 3132 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { 3133 req->flows[fidx].sent = 0; 3134 req->flows[fidx].pkt = 0; 3135 req->flows[fidx].tid_idx = 0; 3136 req->flows[fidx].tid_offset = 0; 3137 req->flows[fidx].resync_npkts = 0; 3138 } 3139 if (i == qpriv->s_tid_cur) 3140 break; 3141 do { 3142 i = (++i == qp->s_size ? 0 : i); 3143 wqe = rvt_get_swqe_ptr(qp, i); 3144 } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE); 3145 req = wqe_to_tid_req(wqe); 3146 req->cur_seg = req->ack_seg; 3147 fidx = req->acked_tail; 3148 /* Pull req->clear_tail back */ 3149 req->clear_tail = fidx; 3150 } while (1); 3151 } 3152 } 3153 3154 void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) 3155 { 3156 int i, ret; 3157 struct hfi1_qp_priv *qpriv = qp->priv; 3158 struct tid_flow_state *fs; 3159 3160 if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA)) 3161 return; 3162 3163 /* 3164 * First, clear the flow to help prevent any delayed packets from 3165 * being delivered. 3166 */ 3167 fs = &qpriv->flow_state; 3168 if (fs->index != RXE_NUM_TID_FLOWS) 3169 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 3170 3171 for (i = qp->s_acked; i != qp->s_head;) { 3172 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); 3173 3174 if (++i == qp->s_size) 3175 i = 0; 3176 /* Free only locally allocated TID entries */ 3177 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 3178 continue; 3179 do { 3180 struct hfi1_swqe_priv *priv = wqe->priv; 3181 3182 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); 3183 } while (!ret); 3184 } 3185 for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) { 3186 struct rvt_ack_entry *e = &qp->s_ack_queue[i]; 3187 3188 if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device))) 3189 i = 0; 3190 /* Free only locally allocated TID entries */ 3191 if (e->opcode != TID_OP(WRITE_REQ)) 3192 continue; 3193 do { 3194 struct hfi1_ack_priv *priv = e->priv; 3195 3196 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); 3197 } while (!ret); 3198 } 3199 } 3200 3201 bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) 3202 { 3203 struct rvt_swqe *prev; 3204 struct hfi1_qp_priv *priv = qp->priv; 3205 u32 s_prev; 3206 struct tid_rdma_request *req; 3207 3208 s_prev = (qp->s_cur == 0 ? 
qp->s_size : qp->s_cur) - 1; 3209 prev = rvt_get_swqe_ptr(qp, s_prev); 3210 3211 switch (wqe->wr.opcode) { 3212 case IB_WR_SEND: 3213 case IB_WR_SEND_WITH_IMM: 3214 case IB_WR_SEND_WITH_INV: 3215 case IB_WR_ATOMIC_CMP_AND_SWP: 3216 case IB_WR_ATOMIC_FETCH_AND_ADD: 3217 case IB_WR_RDMA_WRITE: 3218 switch (prev->wr.opcode) { 3219 case IB_WR_TID_RDMA_WRITE: 3220 req = wqe_to_tid_req(prev); 3221 if (req->ack_seg != req->total_segs) 3222 goto interlock; 3223 default: 3224 break; 3225 } 3226 break; 3227 case IB_WR_RDMA_READ: 3228 if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) 3229 break; 3230 /* fall through */ 3231 case IB_WR_TID_RDMA_READ: 3232 switch (prev->wr.opcode) { 3233 case IB_WR_RDMA_READ: 3234 if (qp->s_acked != qp->s_cur) 3235 goto interlock; 3236 break; 3237 case IB_WR_TID_RDMA_WRITE: 3238 req = wqe_to_tid_req(prev); 3239 if (req->ack_seg != req->total_segs) 3240 goto interlock; 3241 default: 3242 break; 3243 } 3244 default: 3245 break; 3246 } 3247 return false; 3248 3249 interlock: 3250 priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK; 3251 return true; 3252 } 3253 3254 /* Does @sge meet the alignment requirements for tid rdma? */ 3255 static inline bool hfi1_check_sge_align(struct rvt_qp *qp, 3256 struct rvt_sge *sge, int num_sge) 3257 { 3258 int i; 3259 3260 for (i = 0; i < num_sge; i++, sge++) { 3261 trace_hfi1_sge_check_align(qp, i, sge); 3262 if ((u64)sge->vaddr & ~PAGE_MASK || 3263 sge->sge_length & ~PAGE_MASK) 3264 return false; 3265 } 3266 return true; 3267 } 3268 3269 void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) 3270 { 3271 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 3272 struct hfi1_swqe_priv *priv = wqe->priv; 3273 struct tid_rdma_params *remote; 3274 enum ib_wr_opcode new_opcode; 3275 bool do_tid_rdma = false; 3276 struct hfi1_pportdata *ppd = qpriv->rcd->ppd; 3277 3278 if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) == 3279 ppd->lid) 3280 return; 3281 if (qpriv->hdr_type != HFI1_PKT_TYPE_9B) 3282 return; 3283 3284 rcu_read_lock(); 3285 remote = rcu_dereference(qpriv->tid_rdma.remote); 3286 /* 3287 * If TID RDMA is disabled by the negotiation, don't 3288 * use it. 3289 */ 3290 if (!remote) 3291 goto exit; 3292 3293 if (wqe->wr.opcode == IB_WR_RDMA_READ) { 3294 if (hfi1_check_sge_align(qp, &wqe->sg_list[0], 3295 wqe->wr.num_sge)) { 3296 new_opcode = IB_WR_TID_RDMA_READ; 3297 do_tid_rdma = true; 3298 } 3299 } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { 3300 /* 3301 * TID RDMA is enabled for this RDMA WRITE request iff: 3302 * 1. The remote address is page-aligned, 3303 * 2. The length is larger than the minimum segment size, 3304 * 3. The length is page-multiple. 
3305 */ 3306 if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) && 3307 !(wqe->length & ~PAGE_MASK)) { 3308 new_opcode = IB_WR_TID_RDMA_WRITE; 3309 do_tid_rdma = true; 3310 } 3311 } 3312 3313 if (do_tid_rdma) { 3314 if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC)) 3315 goto exit; 3316 wqe->wr.opcode = new_opcode; 3317 priv->tid_req.seg_len = 3318 min_t(u32, remote->max_len, wqe->length); 3319 priv->tid_req.total_segs = 3320 DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len); 3321 /* Compute the last PSN of the request */ 3322 wqe->lpsn = wqe->psn; 3323 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 3324 priv->tid_req.n_flows = remote->max_read; 3325 qpriv->tid_r_reqs++; 3326 wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1; 3327 } else { 3328 wqe->lpsn += priv->tid_req.total_segs - 1; 3329 atomic_inc(&qpriv->n_requests); 3330 } 3331 3332 priv->tid_req.cur_seg = 0; 3333 priv->tid_req.comp_seg = 0; 3334 priv->tid_req.ack_seg = 0; 3335 priv->tid_req.state = TID_REQUEST_INACTIVE; 3336 /* 3337 * Reset acked_tail. 3338 * TID RDMA READ does not have ACKs so it does not 3339 * update the pointer. We have to reset it so TID RDMA 3340 * WRITE does not get confused. 3341 */ 3342 priv->tid_req.acked_tail = priv->tid_req.setup_head; 3343 trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode, 3344 wqe->psn, wqe->lpsn, 3345 &priv->tid_req); 3346 } 3347 exit: 3348 rcu_read_unlock(); 3349 } 3350 3351 /* TID RDMA WRITE functions */ 3352 3353 u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 3354 struct ib_other_headers *ohdr, 3355 u32 *bth1, u32 *bth2, u32 *len) 3356 { 3357 struct hfi1_qp_priv *qpriv = qp->priv; 3358 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 3359 struct tid_rdma_params *remote; 3360 3361 rcu_read_lock(); 3362 remote = rcu_dereference(qpriv->tid_rdma.remote); 3363 /* 3364 * Set the number of flow to be used based on negotiated 3365 * parameters. 3366 */ 3367 req->n_flows = remote->max_write; 3368 req->state = TID_REQUEST_ACTIVE; 3369 3370 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1); 3371 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey); 3372 ohdr->u.tid_rdma.w_req.reth.vaddr = 3373 cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len)); 3374 ohdr->u.tid_rdma.w_req.reth.rkey = 3375 cpu_to_be32(wqe->rdma_wr.rkey); 3376 ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len); 3377 ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn); 3378 *bth1 &= ~RVT_QPN_MASK; 3379 *bth1 |= remote->qp; 3380 qp->s_state = TID_OP(WRITE_REQ); 3381 qp->s_flags |= HFI1_S_WAIT_TID_RESP; 3382 *bth2 |= IB_BTH_REQ_ACK; 3383 *len = 0; 3384 3385 rcu_read_unlock(); 3386 return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); 3387 } 3388 3389 static u32 hfi1_compute_tid_rdma_flow_wt(struct rvt_qp *qp) 3390 { 3391 /* 3392 * Heuristic for computing the RNR timeout when waiting on the flow 3393 * queue. Rather than a computationaly expensive exact estimate of when 3394 * a flow will be available, we assume that if a QP is at position N in 3395 * the flow queue it has to wait approximately (N + 1) * (number of 3396 * segments between two sync points). The rationale for this is that 3397 * flows are released and recycled at each sync point. 3398 */ 3399 return (MAX_TID_FLOW_PSN * qp->pmtu) >> TID_RDMA_SEGMENT_SHIFT; 3400 } 3401 3402 static u32 position_in_queue(struct hfi1_qp_priv *qpriv, 3403 struct tid_queue *queue) 3404 { 3405 return qpriv->tid_enqueue - queue->dequeue; 3406 } 3407 3408 /* 3409 * @qp: points to rvt_qp context. 
3410 * @to_seg: desired RNR timeout in segments. 3411 * Return: index of the next highest timeout in the ib_hfi1_rnr_table[] 3412 */ 3413 static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg) 3414 { 3415 struct hfi1_qp_priv *qpriv = qp->priv; 3416 u64 timeout; 3417 u32 bytes_per_us; 3418 u8 i; 3419 3420 bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8; 3421 timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us; 3422 /* 3423 * Find the next highest value in the RNR table to the required 3424 * timeout. This gives the responder some padding. 3425 */ 3426 for (i = 1; i <= IB_AETH_CREDIT_MASK; i++) 3427 if (rvt_rnr_tbl_to_usec(i) >= timeout) 3428 return i; 3429 return 0; 3430 } 3431 3432 /** 3433 * Central place for resource allocation at TID write responder, 3434 * is called from write_req and write_data interrupt handlers as 3435 * well as the send thread when a queued QP is scheduled for 3436 * resource allocation. 3437 * 3438 * Iterates over (a) segments of a request and then (b) queued requests 3439 * themselves to allocate resources for up to local->max_write 3440 * segments across multiple requests. Stop allocating when we 3441 * hit a sync point, resume allocating after data packets at 3442 * sync point have been received. 3443 * 3444 * Resource allocation and sending of responses is decoupled. The 3445 * request/segment which are being allocated and sent are as follows. 3446 * Resources are allocated for: 3447 * [request: qpriv->r_tid_alloc, segment: req->alloc_seg] 3448 * The send thread sends: 3449 * [request: qp->s_tail_ack_queue, segment:req->cur_seg] 3450 */ 3451 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) 3452 { 3453 struct tid_rdma_request *req; 3454 struct hfi1_qp_priv *qpriv = qp->priv; 3455 struct hfi1_ctxtdata *rcd = qpriv->rcd; 3456 struct tid_rdma_params *local = &qpriv->tid_rdma.local; 3457 struct rvt_ack_entry *e; 3458 u32 npkts, to_seg; 3459 bool last; 3460 int ret = 0; 3461 3462 lockdep_assert_held(&qp->s_lock); 3463 3464 while (1) { 3465 trace_hfi1_rsp_tid_write_alloc_res(qp, 0); 3466 trace_hfi1_tid_write_rsp_alloc_res(qp); 3467 /* 3468 * Don't allocate more segments if a RNR NAK has already been 3469 * scheduled to avoid messing up qp->r_psn: the RNR NAK will 3470 * be sent only when all allocated segments have been sent. 3471 * However, if more segments are allocated before that, TID RDMA 3472 * WRITE RESP packets will be sent out for these new segments 3473 * before the RNR NAK packet. When the requester receives the 3474 * RNR NAK packet, it will restart with qp->s_last_psn + 1, 3475 * which does not match qp->r_psn and will be dropped. 3476 * Consequently, the requester will exhaust its retries and 3477 * put the qp into error state. 
3478 */ 3479 if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND) 3480 break; 3481 3482 /* No requests left to process */ 3483 if (qpriv->r_tid_alloc == qpriv->r_tid_head) { 3484 /* If all data has been received, clear the flow */ 3485 if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS && 3486 !qpriv->alloc_w_segs) { 3487 hfi1_kern_clear_hw_flow(rcd, qp); 3488 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 3489 } 3490 break; 3491 } 3492 3493 e = &qp->s_ack_queue[qpriv->r_tid_alloc]; 3494 if (e->opcode != TID_OP(WRITE_REQ)) 3495 goto next_req; 3496 req = ack_to_tid_req(e); 3497 trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn, 3498 e->lpsn, req); 3499 /* Finished allocating for all segments of this request */ 3500 if (req->alloc_seg >= req->total_segs) 3501 goto next_req; 3502 3503 /* Can allocate only a maximum of local->max_write for a QP */ 3504 if (qpriv->alloc_w_segs >= local->max_write) 3505 break; 3506 3507 /* Don't allocate at a sync point with data packets pending */ 3508 if (qpriv->sync_pt && qpriv->alloc_w_segs) 3509 break; 3510 3511 /* All data received at the sync point, continue */ 3512 if (qpriv->sync_pt && !qpriv->alloc_w_segs) { 3513 hfi1_kern_clear_hw_flow(rcd, qp); 3514 qpriv->sync_pt = false; 3515 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 3516 } 3517 3518 /* Allocate flow if we don't have one */ 3519 if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) { 3520 ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp); 3521 if (ret) { 3522 to_seg = hfi1_compute_tid_rdma_flow_wt(qp) * 3523 position_in_queue(qpriv, 3524 &rcd->flow_queue); 3525 break; 3526 } 3527 } 3528 3529 npkts = rvt_div_round_up_mtu(qp, req->seg_len); 3530 3531 /* 3532 * We are at a sync point if we run out of KDETH PSN space. 3533 * Last PSN of every generation is reserved for RESYNC. 3534 */ 3535 if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) { 3536 qpriv->sync_pt = true; 3537 break; 3538 } 3539 3540 /* 3541 * If overtaking req->acked_tail, send an RNR NAK. 
Because the 3542 * QP is not queued in this case, and the issue can only be 3543 * caused by a delay in scheduling the second leg which we 3544 * cannot estimate, we use a rather arbitrary RNR timeout of 3545 * (MAX_FLOWS / 2) segments 3546 */ 3547 if (!CIRC_SPACE(req->setup_head, req->acked_tail, 3548 MAX_FLOWS)) { 3549 ret = -EAGAIN; 3550 to_seg = MAX_FLOWS >> 1; 3551 tid_rdma_trigger_ack(qp); 3552 break; 3553 } 3554 3555 /* Try to allocate rcv array / TID entries */ 3556 ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last); 3557 if (ret == -EAGAIN) 3558 to_seg = position_in_queue(qpriv, &rcd->rarr_queue); 3559 if (ret) 3560 break; 3561 3562 qpriv->alloc_w_segs++; 3563 req->alloc_seg++; 3564 continue; 3565 next_req: 3566 /* Begin processing the next request */ 3567 if (++qpriv->r_tid_alloc > 3568 rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3569 qpriv->r_tid_alloc = 0; 3570 } 3571 3572 /* 3573 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation 3574 * has failed (b) we are called from the rcv handler interrupt context 3575 * (c) an RNR NAK has not already been scheduled 3576 */ 3577 if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state) 3578 goto send_rnr_nak; 3579 3580 return; 3581 3582 send_rnr_nak: 3583 lockdep_assert_held(&qp->r_lock); 3584 3585 /* Set r_nak_state to prevent unrelated events from generating NAK's */ 3586 qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK; 3587 3588 /* Pull back r_psn to the segment being RNR NAK'd */ 3589 qp->r_psn = e->psn + req->alloc_seg; 3590 qp->r_ack_psn = qp->r_psn; 3591 /* 3592 * Pull back r_head_ack_queue to the ack entry following the request 3593 * being RNR NAK'd. This allows resources to be allocated to the request 3594 * if the queued QP is scheduled. 3595 */ 3596 qp->r_head_ack_queue = qpriv->r_tid_alloc + 1; 3597 if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3598 qp->r_head_ack_queue = 0; 3599 qpriv->r_tid_head = qp->r_head_ack_queue; 3600 /* 3601 * These send side fields are used in make_rc_ack(). They are set in 3602 * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock 3603 * for consistency 3604 */ 3605 qp->s_nak_state = qp->r_nak_state; 3606 qp->s_ack_psn = qp->r_ack_psn; 3607 /* 3608 * Clear the ACK PENDING flag to prevent unwanted ACK because we 3609 * have modified qp->s_ack_psn here. 3610 */ 3611 qp->s_flags &= ~(RVT_S_ACK_PENDING); 3612 3613 trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn); 3614 /* 3615 * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK 3616 * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be 3617 * used for this because qp->s_lock is dropped before calling 3618 * hfi1_send_rc_ack() leading to inconsistency between the receive 3619 * interrupt handlers and the send thread in make_rc_ack() 3620 */ 3621 qpriv->rnr_nak_state = TID_RNR_NAK_SEND; 3622 3623 /* 3624 * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive 3625 * interrupt handlers but will be sent from the send engine behind any 3626 * previous responses that may have been scheduled 3627 */ 3628 rc_defered_ack(rcd, qp); 3629 } 3630 3631 void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) 3632 { 3633 /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/ 3634 3635 /* 3636 * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST 3637 * (see hfi1_rc_rcv()) 3638 * - Don't allow 0-length requests. 3639 * 2. 
Put TID RDMA WRITE REQ into the response queue (s_ack_queue) 3640 * - Setup struct tid_rdma_req with request info 3641 * - Prepare struct tid_rdma_flow array? 3642 * 3. Set the qp->s_ack_state as per the state diagram in the design doc. 3643 * 4. Set RVT_S_RESP_PENDING in s_flags. 3644 * 5. Kick the send engine (hfi1_schedule_send()) 3645 */ 3646 struct hfi1_ctxtdata *rcd = packet->rcd; 3647 struct rvt_qp *qp = packet->qp; 3648 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 3649 struct ib_other_headers *ohdr = packet->ohdr; 3650 struct rvt_ack_entry *e; 3651 unsigned long flags; 3652 struct ib_reth *reth; 3653 struct hfi1_qp_priv *qpriv = qp->priv; 3654 struct tid_rdma_request *req; 3655 u32 bth0, psn, len, rkey, num_segs; 3656 bool fecn; 3657 u8 next; 3658 u64 vaddr; 3659 int diff; 3660 3661 bth0 = be32_to_cpu(ohdr->bth[0]); 3662 if (hfi1_ruc_check_hdr(ibp, packet)) 3663 return; 3664 3665 fecn = process_ecn(qp, packet); 3666 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 3667 trace_hfi1_rsp_rcv_tid_write_req(qp, psn); 3668 3669 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 3670 rvt_comm_est(qp); 3671 3672 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) 3673 goto nack_inv; 3674 3675 reth = &ohdr->u.tid_rdma.w_req.reth; 3676 vaddr = be64_to_cpu(reth->vaddr); 3677 len = be32_to_cpu(reth->length); 3678 3679 num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len); 3680 diff = delta_psn(psn, qp->r_psn); 3681 if (unlikely(diff)) { 3682 tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); 3683 return; 3684 } 3685 3686 /* 3687 * The resent request which was previously RNR NAK'd is inserted at the 3688 * location of the original request, which is one entry behind 3689 * r_head_ack_queue 3690 */ 3691 if (qpriv->rnr_nak_state) 3692 qp->r_head_ack_queue = qp->r_head_ack_queue ? 3693 qp->r_head_ack_queue - 1 : 3694 rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); 3695 3696 /* We've verified the request, insert it into the ack queue.
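 * The ack queue holds rvt_size_atomic() + 1 entries; if the next slot would collide with s_acked_ack_queue and that entry has not been sent yet, the new request is NAK'd as invalid rather than overwriting the old entry.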
*/ 3697 next = qp->r_head_ack_queue + 1; 3698 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3699 next = 0; 3700 spin_lock_irqsave(&qp->s_lock, flags); 3701 if (unlikely(next == qp->s_acked_ack_queue)) { 3702 if (!qp->s_ack_queue[next].sent) 3703 goto nack_inv_unlock; 3704 update_ack_queue(qp, next); 3705 } 3706 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 3707 req = ack_to_tid_req(e); 3708 3709 /* Bring previously RNR NAK'd request back to life */ 3710 if (qpriv->rnr_nak_state) { 3711 qp->r_nak_state = 0; 3712 qp->s_nak_state = 0; 3713 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 3714 qp->r_psn = e->lpsn + 1; 3715 req->state = TID_REQUEST_INIT; 3716 goto update_head; 3717 } 3718 3719 release_rdma_sge_mr(e); 3720 3721 /* The length needs to be in multiples of PAGE_SIZE */ 3722 if (!len || len & ~PAGE_MASK) 3723 goto nack_inv_unlock; 3724 3725 rkey = be32_to_cpu(reth->rkey); 3726 qp->r_len = len; 3727 3728 if (e->opcode == TID_OP(WRITE_REQ) && 3729 (req->setup_head != req->clear_tail || 3730 req->clear_tail != req->acked_tail)) 3731 goto nack_inv_unlock; 3732 3733 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, 3734 rkey, IB_ACCESS_REMOTE_WRITE))) 3735 goto nack_acc; 3736 3737 qp->r_psn += num_segs - 1; 3738 3739 e->opcode = (bth0 >> 24) & 0xff; 3740 e->psn = psn; 3741 e->lpsn = qp->r_psn; 3742 e->sent = 0; 3743 3744 req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write); 3745 req->state = TID_REQUEST_INIT; 3746 req->cur_seg = 0; 3747 req->comp_seg = 0; 3748 req->ack_seg = 0; 3749 req->alloc_seg = 0; 3750 req->isge = 0; 3751 req->seg_len = qpriv->tid_rdma.local.max_len; 3752 req->total_len = len; 3753 req->total_segs = num_segs; 3754 req->r_flow_psn = e->psn; 3755 req->ss.sge = e->rdma_sge; 3756 req->ss.num_sge = 1; 3757 3758 req->flow_idx = req->setup_head; 3759 req->clear_tail = req->setup_head; 3760 req->acked_tail = req->setup_head; 3761 3762 qp->r_state = e->opcode; 3763 qp->r_nak_state = 0; 3764 /* 3765 * We need to increment the MSN here instead of when we 3766 * finish sending the result since a duplicate request would 3767 * increment it more than once. 3768 */ 3769 qp->r_msn++; 3770 qp->r_psn++; 3771 3772 trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn, 3773 req); 3774 3775 if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { 3776 qpriv->r_tid_tail = qp->r_head_ack_queue; 3777 } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { 3778 struct tid_rdma_request *ptr; 3779 3780 e = &qp->s_ack_queue[qpriv->r_tid_tail]; 3781 ptr = ack_to_tid_req(e); 3782 3783 if (e->opcode != TID_OP(WRITE_REQ) || 3784 ptr->comp_seg == ptr->total_segs) { 3785 if (qpriv->r_tid_tail == qpriv->r_tid_ack) 3786 qpriv->r_tid_ack = qp->r_head_ack_queue; 3787 qpriv->r_tid_tail = qp->r_head_ack_queue; 3788 } 3789 } 3790 update_head: 3791 qp->r_head_ack_queue = next; 3792 qpriv->r_tid_head = qp->r_head_ack_queue; 3793 3794 hfi1_tid_write_alloc_resources(qp, true); 3795 trace_hfi1_tid_write_rsp_rcv_req(qp); 3796 3797 /* Schedule the send tasklet. 
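 * RVT_S_RESP_PENDING tells the send engine that a TID RDMA WRITE RESP still has to be built for this request; RVT_S_ECN is also set when the request arrived with FECN so that a congestion notification is returned.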
*/ 3798 qp->s_flags |= RVT_S_RESP_PENDING; 3799 if (fecn) 3800 qp->s_flags |= RVT_S_ECN; 3801 hfi1_schedule_send(qp); 3802 3803 spin_unlock_irqrestore(&qp->s_lock, flags); 3804 return; 3805 3806 nack_inv_unlock: 3807 spin_unlock_irqrestore(&qp->s_lock, flags); 3808 nack_inv: 3809 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 3810 qp->r_nak_state = IB_NAK_INVALID_REQUEST; 3811 qp->r_ack_psn = qp->r_psn; 3812 /* Queue NAK for later */ 3813 rc_defered_ack(rcd, qp); 3814 return; 3815 nack_acc: 3816 spin_unlock_irqrestore(&qp->s_lock, flags); 3817 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 3818 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 3819 qp->r_ack_psn = qp->r_psn; 3820 } 3821 3822 u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, 3823 struct ib_other_headers *ohdr, u32 *bth1, 3824 u32 bth2, u32 *len, 3825 struct rvt_sge_state **ss) 3826 { 3827 struct hfi1_ack_priv *epriv = e->priv; 3828 struct tid_rdma_request *req = &epriv->tid_req; 3829 struct hfi1_qp_priv *qpriv = qp->priv; 3830 struct tid_rdma_flow *flow = NULL; 3831 u32 resp_len = 0, hdwords = 0; 3832 void *resp_addr = NULL; 3833 struct tid_rdma_params *remote; 3834 3835 trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn, 3836 req); 3837 trace_hfi1_tid_write_rsp_build_resp(qp); 3838 trace_hfi1_rsp_build_tid_write_resp(qp, bth2); 3839 flow = &req->flows[req->flow_idx]; 3840 switch (req->state) { 3841 default: 3842 /* 3843 * Try to allocate resources here in case QP was queued and was 3844 * later scheduled when resources became available 3845 */ 3846 hfi1_tid_write_alloc_resources(qp, false); 3847 3848 /* We've already sent everything which is ready */ 3849 if (req->cur_seg >= req->alloc_seg) 3850 goto done; 3851 3852 /* 3853 * Resources can be assigned but responses cannot be sent in 3854 * rnr_nak state, till the resent request is received 3855 */ 3856 if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT) 3857 goto done; 3858 3859 req->state = TID_REQUEST_ACTIVE; 3860 trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); 3861 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); 3862 hfi1_add_tid_reap_timer(qp); 3863 break; 3864 3865 case TID_REQUEST_RESEND_ACTIVE: 3866 case TID_REQUEST_RESEND: 3867 trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); 3868 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); 3869 if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) 3870 req->state = TID_REQUEST_ACTIVE; 3871 3872 hfi1_mod_tid_reap_timer(qp); 3873 break; 3874 } 3875 flow->flow_state.resp_ib_psn = bth2; 3876 resp_addr = (void *)flow->tid_entry; 3877 resp_len = sizeof(*flow->tid_entry) * flow->tidcnt; 3878 req->cur_seg++; 3879 3880 memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp)); 3881 epriv->ss.sge.vaddr = resp_addr; 3882 epriv->ss.sge.sge_length = resp_len; 3883 epriv->ss.sge.length = epriv->ss.sge.sge_length; 3884 /* 3885 * We can safely zero these out. Since the first SGE covers the 3886 * entire packet, nothing else should even look at the MR. 
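 * The response payload is the flow's kernel TID entry array (resp_addr/resp_len above), not user memory, so no MR reference is taken.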
3887 */ 3888 epriv->ss.sge.mr = NULL; 3889 epriv->ss.sge.m = 0; 3890 epriv->ss.sge.n = 0; 3891 3892 epriv->ss.sg_list = NULL; 3893 epriv->ss.total_len = epriv->ss.sge.sge_length; 3894 epriv->ss.num_sge = 1; 3895 3896 *ss = &epriv->ss; 3897 *len = epriv->ss.total_len; 3898 3899 /* Construct the TID RDMA WRITE RESP packet header */ 3900 rcu_read_lock(); 3901 remote = rcu_dereference(qpriv->tid_rdma.remote); 3902 3903 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1); 3904 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey); 3905 ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp); 3906 ohdr->u.tid_rdma.w_rsp.tid_flow_psn = 3907 cpu_to_be32((flow->flow_state.generation << 3908 HFI1_KDETH_BTH_SEQ_SHIFT) | 3909 (flow->flow_state.spsn & 3910 HFI1_KDETH_BTH_SEQ_MASK)); 3911 ohdr->u.tid_rdma.w_rsp.tid_flow_qp = 3912 cpu_to_be32(qpriv->tid_rdma.local.qp | 3913 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 3914 TID_RDMA_DESTQP_FLOW_SHIFT) | 3915 qpriv->rcd->ctxt); 3916 ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn); 3917 *bth1 = remote->qp; 3918 rcu_read_unlock(); 3919 hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32); 3920 qpriv->pending_tid_w_segs++; 3921 done: 3922 return hdwords; 3923 } 3924 3925 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp) 3926 { 3927 struct hfi1_qp_priv *qpriv = qp->priv; 3928 3929 lockdep_assert_held(&qp->s_lock); 3930 if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) { 3931 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; 3932 qpriv->s_tid_timer.expires = jiffies + 3933 qpriv->tid_timer_timeout_jiffies; 3934 add_timer(&qpriv->s_tid_timer); 3935 } 3936 } 3937 3938 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp) 3939 { 3940 struct hfi1_qp_priv *qpriv = qp->priv; 3941 3942 lockdep_assert_held(&qp->s_lock); 3943 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; 3944 mod_timer(&qpriv->s_tid_timer, jiffies + 3945 qpriv->tid_timer_timeout_jiffies); 3946 } 3947 3948 static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp) 3949 { 3950 struct hfi1_qp_priv *qpriv = qp->priv; 3951 int rval = 0; 3952 3953 lockdep_assert_held(&qp->s_lock); 3954 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { 3955 rval = del_timer(&qpriv->s_tid_timer); 3956 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; 3957 } 3958 return rval; 3959 } 3960 3961 void hfi1_del_tid_reap_timer(struct rvt_qp *qp) 3962 { 3963 struct hfi1_qp_priv *qpriv = qp->priv; 3964 3965 del_timer_sync(&qpriv->s_tid_timer); 3966 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; 3967 } 3968 3969 static void hfi1_tid_timeout(struct timer_list *t) 3970 { 3971 struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer); 3972 struct rvt_qp *qp = qpriv->owner; 3973 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); 3974 unsigned long flags; 3975 u32 i; 3976 3977 spin_lock_irqsave(&qp->r_lock, flags); 3978 spin_lock(&qp->s_lock); 3979 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { 3980 dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", 3981 qp->ibqp.qp_num, __func__, __LINE__); 3982 trace_hfi1_msg_tid_timeout(/* msg */ 3983 qp, "resource timeout = ", 3984 (u64)qpriv->tid_timer_timeout_jiffies); 3985 hfi1_stop_tid_reap_timer(qp); 3986 /* 3987 * Go though the entire ack queue and clear any outstanding 3988 * HW flow and RcvArray resources. 
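 * The QP is then moved to the error state (IB_WC_RESP_TIMEOUT_ERR) and a fatal QP event is delivered to the ULP's event handler, if one is registered.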
3989 */ 3990 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 3991 for (i = 0; i < rvt_max_atomic(rdi); i++) { 3992 struct tid_rdma_request *req = 3993 ack_to_tid_req(&qp->s_ack_queue[i]); 3994 3995 hfi1_kern_exp_rcv_clear_all(req); 3996 } 3997 spin_unlock(&qp->s_lock); 3998 if (qp->ibqp.event_handler) { 3999 struct ib_event ev; 4000 4001 ev.device = qp->ibqp.device; 4002 ev.element.qp = &qp->ibqp; 4003 ev.event = IB_EVENT_QP_FATAL; 4004 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); 4005 } 4006 rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR); 4007 goto unlock_r_lock; 4008 } 4009 spin_unlock(&qp->s_lock); 4010 unlock_r_lock: 4011 spin_unlock_irqrestore(&qp->r_lock, flags); 4012 } 4013 4014 void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) 4015 { 4016 /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side */ 4017 4018 /* 4019 * 1. Find matching SWQE 4020 * 2. Check that TIDENTRY array has enough space for a complete 4021 * segment. If not, put QP in error state. 4022 * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow 4023 * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags. 4024 * 5. Set qp->s_state 4025 * 6. Kick the send engine (hfi1_schedule_send()) 4026 */ 4027 struct ib_other_headers *ohdr = packet->ohdr; 4028 struct rvt_qp *qp = packet->qp; 4029 struct hfi1_qp_priv *qpriv = qp->priv; 4030 struct hfi1_ctxtdata *rcd = packet->rcd; 4031 struct rvt_swqe *wqe; 4032 struct tid_rdma_request *req; 4033 struct tid_rdma_flow *flow; 4034 enum ib_wc_status status; 4035 u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen; 4036 bool fecn; 4037 unsigned long flags; 4038 4039 fecn = process_ecn(qp, packet); 4040 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4041 aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth); 4042 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 4043 4044 spin_lock_irqsave(&qp->s_lock, flags); 4045 4046 /* Ignore invalid responses */ 4047 if (cmp_psn(psn, qp->s_next_psn) >= 0) 4048 goto ack_done; 4049 4050 /* Ignore duplicate responses. */ 4051 if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0)) 4052 goto ack_done; 4053 4054 if (unlikely(qp->s_acked == qp->s_tail)) 4055 goto ack_done; 4056 4057 /* 4058 * If we are waiting for a particular packet sequence number 4059 * due to a request being resent, check for it. Otherwise, 4060 * ensure that we haven't missed anything. 4061 */ 4062 if (qp->r_flags & RVT_R_RDMAR_SEQ) { 4063 if (cmp_psn(psn, qp->s_last_psn + 1) != 0) 4064 goto ack_done; 4065 qp->r_flags &= ~RVT_R_RDMAR_SEQ; 4066 } 4067 4068 wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); 4069 if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)) 4070 goto ack_op_err; 4071 4072 req = wqe_to_tid_req(wqe); 4073 /* 4074 * If we've lost ACKs and our acked_tail pointer is too far 4075 * behind, don't overwrite segments. Just drop the packet and 4076 * let the reliability protocol take care of it. 4077 */ 4078 if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS)) 4079 goto ack_done; 4080 4081 /* 4082 * The call to do_rc_ack() should be last in the chain of 4083 * packet checks because it will end up updating the QP state. 4084 * Therefore, anything that would prevent the packet from 4085 * being accepted as a successful response should be prior 4086 * to it. 
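 * do_rc_ack() processes the IB-level acknowledgement (including any NAK); only when it succeeds is the packet used below to set up the flow for the next segment.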
4087 */ 4088 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) 4089 goto ack_done; 4090 4091 trace_hfi1_ack(qp, psn); 4092 4093 flow = &req->flows[req->setup_head]; 4094 flow->pkt = 0; 4095 flow->tid_idx = 0; 4096 flow->tid_offset = 0; 4097 flow->sent = 0; 4098 flow->resync_npkts = 0; 4099 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp); 4100 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & 4101 TID_RDMA_DESTQP_FLOW_MASK; 4102 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn)); 4103 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 4104 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; 4105 flow->flow_state.resp_ib_psn = psn; 4106 flow->length = min_t(u32, req->seg_len, 4107 (wqe->length - (req->comp_seg * req->seg_len))); 4108 4109 flow->npkts = rvt_div_round_up_mtu(qp, flow->length); 4110 flow->flow_state.lpsn = flow->flow_state.spsn + 4111 flow->npkts - 1; 4112 /* payload length = packet length - (header length + ICRC length) */ 4113 pktlen = packet->tlen - (packet->hlen + 4); 4114 if (pktlen > sizeof(flow->tid_entry)) { 4115 status = IB_WC_LOC_LEN_ERR; 4116 goto ack_err; 4117 } 4118 memcpy(flow->tid_entry, packet->ebuf, pktlen); 4119 flow->tidcnt = pktlen / sizeof(*flow->tid_entry); 4120 trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow); 4121 4122 req->comp_seg++; 4123 trace_hfi1_tid_write_sender_rcv_resp(qp, 0); 4124 /* 4125 * Walk the TID_ENTRY list to make sure we have enough space for a 4126 * complete segment. 4127 */ 4128 for (i = 0; i < flow->tidcnt; i++) { 4129 trace_hfi1_tid_entry_rcv_write_resp(/* entry */ 4130 qp, i, flow->tid_entry[i]); 4131 if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { 4132 status = IB_WC_LOC_LEN_ERR; 4133 goto ack_err; 4134 } 4135 tidlen += EXP_TID_GET(flow->tid_entry[i], LEN); 4136 } 4137 if (tidlen * PAGE_SIZE < flow->length) { 4138 status = IB_WC_LOC_LEN_ERR; 4139 goto ack_err; 4140 } 4141 4142 trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn, 4143 wqe->lpsn, req); 4144 /* 4145 * If this is the first response for this request, set the initial 4146 * flow index to the current flow. 4147 */ 4148 if (!cmp_psn(psn, wqe->psn)) { 4149 req->r_last_acked = mask_psn(wqe->psn - 1); 4150 /* Set acked flow index to head index */ 4151 req->acked_tail = req->setup_head; 4152 } 4153 4154 /* advance circular buffer head */ 4155 req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS); 4156 req->state = TID_REQUEST_ACTIVE; 4157 4158 /* 4159 * If all responses for this TID RDMA WRITE request have been received 4160 * advance the pointer to the next one. 4161 * Since TID RDMA requests could be mixed in with regular IB requests, 4162 * they might not appear sequentially in the queue. Therefore, the 4163 * next request needs to be "found". 
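 * The search below walks the send queue from s_tid_cur forward until it finds the next IB_WR_TID_RDMA_WRITE WQE or reaches s_tid_head.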
4164 */ 4165 if (qpriv->s_tid_cur != qpriv->s_tid_head && 4166 req->comp_seg == req->total_segs) { 4167 for (i = qpriv->s_tid_cur + 1; ; i++) { 4168 if (i == qp->s_size) 4169 i = 0; 4170 wqe = rvt_get_swqe_ptr(qp, i); 4171 if (i == qpriv->s_tid_head) 4172 break; 4173 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 4174 break; 4175 } 4176 qpriv->s_tid_cur = i; 4177 } 4178 qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; 4179 hfi1_schedule_tid_send(qp); 4180 goto ack_done; 4181 4182 ack_op_err: 4183 status = IB_WC_LOC_QP_OP_ERR; 4184 ack_err: 4185 rvt_error_qp(qp, status); 4186 ack_done: 4187 if (fecn) 4188 qp->s_flags |= RVT_S_ECN; 4189 spin_unlock_irqrestore(&qp->s_lock, flags); 4190 } 4191 4192 bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, 4193 struct ib_other_headers *ohdr, 4194 u32 *bth1, u32 *bth2, u32 *len) 4195 { 4196 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 4197 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 4198 struct tid_rdma_params *remote; 4199 struct rvt_qp *qp = req->qp; 4200 struct hfi1_qp_priv *qpriv = qp->priv; 4201 u32 tidentry = flow->tid_entry[flow->tid_idx]; 4202 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; 4203 struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data; 4204 u32 next_offset, om = KDETH_OM_LARGE; 4205 bool last_pkt; 4206 4207 if (!tidlen) { 4208 hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR); 4209 rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR); 4210 } 4211 4212 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); 4213 flow->sent += *len; 4214 next_offset = flow->tid_offset + *len; 4215 last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && 4216 next_offset >= tidlen) || (flow->sent >= flow->length); 4217 trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry); 4218 trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow); 4219 4220 rcu_read_lock(); 4221 remote = rcu_dereference(qpriv->tid_rdma.remote); 4222 KDETH_RESET(wd->kdeth0, KVER, 0x1); 4223 KDETH_SET(wd->kdeth0, SH, !last_pkt); 4224 KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg)); 4225 KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); 4226 KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); 4227 KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE); 4228 KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om); 4229 KDETH_RESET(wd->kdeth1, JKEY, remote->jkey); 4230 wd->verbs_qp = cpu_to_be32(qp->remote_qpn); 4231 rcu_read_unlock(); 4232 4233 *bth1 = flow->tid_qpn; 4234 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & 4235 HFI1_KDETH_BTH_SEQ_MASK) | 4236 (flow->flow_state.generation << 4237 HFI1_KDETH_BTH_SEQ_SHIFT)); 4238 if (last_pkt) { 4239 /* PSNs are zero-based, so +1 to count number of packets */ 4240 if (flow->flow_state.lpsn + 1 + 4241 rvt_div_round_up_mtu(qp, req->seg_len) > 4242 MAX_TID_FLOW_PSN) 4243 req->state = TID_REQUEST_SYNC; 4244 *bth2 |= IB_BTH_REQ_ACK; 4245 } 4246 4247 if (next_offset >= tidlen) { 4248 flow->tid_offset = 0; 4249 flow->tid_idx++; 4250 } else { 4251 flow->tid_offset = next_offset; 4252 } 4253 return last_pkt; 4254 } 4255 4256 void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) 4257 { 4258 struct rvt_qp *qp = packet->qp; 4259 struct hfi1_qp_priv *priv = qp->priv; 4260 struct hfi1_ctxtdata *rcd = priv->rcd; 4261 struct ib_other_headers *ohdr = packet->ohdr; 4262 struct rvt_ack_entry *e; 4263 struct tid_rdma_request *req; 4264 struct tid_rdma_flow *flow; 4265 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 4266 unsigned long flags; 4267 u32 psn, next; 4268 u8 opcode; 4269 bool 
fecn; 4270 4271 fecn = process_ecn(qp, packet); 4272 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4273 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 4274 4275 /* 4276 * All error handling should be done by now. If we are here, the packet 4277 * is either good or been accepted by the error handler. 4278 */ 4279 spin_lock_irqsave(&qp->s_lock, flags); 4280 e = &qp->s_ack_queue[priv->r_tid_tail]; 4281 req = ack_to_tid_req(e); 4282 flow = &req->flows[req->clear_tail]; 4283 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) { 4284 update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); 4285 4286 if (cmp_psn(psn, flow->flow_state.r_next_psn)) 4287 goto send_nak; 4288 4289 flow->flow_state.r_next_psn = mask_psn(psn + 1); 4290 /* 4291 * Copy the payload to destination buffer if this packet is 4292 * delivered as an eager packet due to RSM rule and FECN. 4293 * The RSM rule selects FECN bit in BTH and SH bit in 4294 * KDETH header and therefore will not match the last 4295 * packet of each segment that has SH bit cleared. 4296 */ 4297 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { 4298 struct rvt_sge_state ss; 4299 u32 len; 4300 u32 tlen = packet->tlen; 4301 u16 hdrsize = packet->hlen; 4302 u8 pad = packet->pad; 4303 u8 extra_bytes = pad + packet->extra_byte + 4304 (SIZE_OF_CRC << 2); 4305 u32 pmtu = qp->pmtu; 4306 4307 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 4308 goto send_nak; 4309 len = req->comp_seg * req->seg_len; 4310 len += delta_psn(psn, 4311 full_flow_psn(flow, flow->flow_state.spsn)) * 4312 pmtu; 4313 if (unlikely(req->total_len - len < pmtu)) 4314 goto send_nak; 4315 4316 /* 4317 * The e->rdma_sge field is set when TID RDMA WRITE REQ 4318 * is first received and is never modified thereafter. 4319 */ 4320 ss.sge = e->rdma_sge; 4321 ss.sg_list = NULL; 4322 ss.num_sge = 1; 4323 ss.total_len = req->total_len; 4324 rvt_skip_sge(&ss, len, false); 4325 rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, 4326 false); 4327 /* Raise the sw sequence check flag for next packet */ 4328 priv->r_next_psn_kdeth = mask_psn(psn + 1); 4329 priv->s_flags |= HFI1_R_TID_SW_PSN; 4330 } 4331 goto exit; 4332 } 4333 flow->flow_state.r_next_psn = mask_psn(psn + 1); 4334 hfi1_kern_exp_rcv_clear(req); 4335 priv->alloc_w_segs--; 4336 rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK; 4337 req->comp_seg++; 4338 priv->s_nak_state = 0; 4339 4340 /* 4341 * Release the flow if one of the following conditions has been met: 4342 * - The request has reached a sync point AND all outstanding 4343 * segments have been completed, or 4344 * - The entire request is complete and there are no more requests 4345 * (of any kind) in the queue. 4346 */ 4347 trace_hfi1_rsp_rcv_tid_write_data(qp, psn); 4348 trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn, 4349 req); 4350 trace_hfi1_tid_write_rsp_rcv_data(qp); 4351 validate_r_tid_ack(priv); 4352 4353 if (opcode == TID_OP(WRITE_DATA_LAST)) { 4354 release_rdma_sge_mr(e); 4355 for (next = priv->r_tid_tail + 1; ; next++) { 4356 if (next > rvt_size_atomic(&dev->rdi)) 4357 next = 0; 4358 if (next == priv->r_tid_head) 4359 break; 4360 e = &qp->s_ack_queue[next]; 4361 if (e->opcode == TID_OP(WRITE_REQ)) 4362 break; 4363 } 4364 priv->r_tid_tail = next; 4365 if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi)) 4366 qp->s_acked_ack_queue = 0; 4367 } 4368 4369 hfi1_tid_write_alloc_resources(qp, true); 4370 4371 /* 4372 * If we need to generate more responses, schedule the 4373 * send engine. 
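 * More responses are needed when this request still has unsent segments or when other responses are queued behind it in the ack queue.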
4374 */ 4375 if (req->cur_seg < req->total_segs || 4376 qp->s_tail_ack_queue != qp->r_head_ack_queue) { 4377 qp->s_flags |= RVT_S_RESP_PENDING; 4378 hfi1_schedule_send(qp); 4379 } 4380 4381 priv->pending_tid_w_segs--; 4382 if (priv->s_flags & HFI1_R_TID_RSC_TIMER) { 4383 if (priv->pending_tid_w_segs) 4384 hfi1_mod_tid_reap_timer(req->qp); 4385 else 4386 hfi1_stop_tid_reap_timer(req->qp); 4387 } 4388 4389 done: 4390 tid_rdma_schedule_ack(qp); 4391 exit: 4392 priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; 4393 if (fecn) 4394 qp->s_flags |= RVT_S_ECN; 4395 spin_unlock_irqrestore(&qp->s_lock, flags); 4396 return; 4397 4398 send_nak: 4399 if (!priv->s_nak_state) { 4400 priv->s_nak_state = IB_NAK_PSN_ERROR; 4401 priv->s_nak_psn = flow->flow_state.r_next_psn; 4402 tid_rdma_trigger_ack(qp); 4403 } 4404 goto done; 4405 } 4406 4407 static bool hfi1_tid_rdma_is_resync_psn(u32 psn) 4408 { 4409 return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) == 4410 HFI1_KDETH_BTH_SEQ_MASK); 4411 } 4412 4413 u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, 4414 struct ib_other_headers *ohdr, u16 iflow, 4415 u32 *bth1, u32 *bth2) 4416 { 4417 struct hfi1_qp_priv *qpriv = qp->priv; 4418 struct tid_flow_state *fs = &qpriv->flow_state; 4419 struct tid_rdma_request *req = ack_to_tid_req(e); 4420 struct tid_rdma_flow *flow = &req->flows[iflow]; 4421 struct tid_rdma_params *remote; 4422 4423 rcu_read_lock(); 4424 remote = rcu_dereference(qpriv->tid_rdma.remote); 4425 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); 4426 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); 4427 *bth1 = remote->qp; 4428 rcu_read_unlock(); 4429 4430 if (qpriv->resync) { 4431 *bth2 = mask_psn((fs->generation << 4432 HFI1_KDETH_BTH_SEQ_SHIFT) - 1); 4433 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); 4434 } else if (qpriv->s_nak_state) { 4435 *bth2 = mask_psn(qpriv->s_nak_psn); 4436 ohdr->u.tid_rdma.ack.aeth = 4437 cpu_to_be32((qp->r_msn & IB_MSN_MASK) | 4438 (qpriv->s_nak_state << 4439 IB_AETH_CREDIT_SHIFT)); 4440 } else { 4441 *bth2 = full_flow_psn(flow, flow->flow_state.lpsn); 4442 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); 4443 } 4444 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); 4445 ohdr->u.tid_rdma.ack.tid_flow_qp = 4446 cpu_to_be32(qpriv->tid_rdma.local.qp | 4447 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 4448 TID_RDMA_DESTQP_FLOW_SHIFT) | 4449 qpriv->rcd->ctxt); 4450 4451 ohdr->u.tid_rdma.ack.tid_flow_psn = 0; 4452 ohdr->u.tid_rdma.ack.verbs_psn = 4453 cpu_to_be32(flow->flow_state.resp_ib_psn); 4454 4455 if (qpriv->resync) { 4456 /* 4457 * If the PSN before the current expect KDETH PSN is the 4458 * RESYNC PSN, then we never received a good TID RDMA WRITE 4459 * DATA packet after a previous RESYNC. 4460 * In this case, the next expected KDETH PSN stays the same. 4461 */ 4462 if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) { 4463 ohdr->u.tid_rdma.ack.tid_flow_psn = 4464 cpu_to_be32(qpriv->r_next_psn_kdeth_save); 4465 } else { 4466 /* 4467 * Because the KDETH PSNs jump during a RESYNC, it's 4468 * not possible to infer (or compute) the previous value 4469 * of r_next_psn_kdeth in the case of back-to-back 4470 * RESYNC packets. Therefore, we save it. 
4471 */ 4472 qpriv->r_next_psn_kdeth_save = 4473 qpriv->r_next_psn_kdeth - 1; 4474 ohdr->u.tid_rdma.ack.tid_flow_psn = 4475 cpu_to_be32(qpriv->r_next_psn_kdeth_save); 4476 qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1); 4477 } 4478 qpriv->resync = false; 4479 } 4480 4481 return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32); 4482 } 4483 4484 void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) 4485 { 4486 struct ib_other_headers *ohdr = packet->ohdr; 4487 struct rvt_qp *qp = packet->qp; 4488 struct hfi1_qp_priv *qpriv = qp->priv; 4489 struct rvt_swqe *wqe; 4490 struct tid_rdma_request *req; 4491 struct tid_rdma_flow *flow; 4492 u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn; 4493 unsigned long flags; 4494 u16 fidx; 4495 4496 trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0); 4497 process_ecn(qp, packet); 4498 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4499 aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); 4500 req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn)); 4501 resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); 4502 4503 spin_lock_irqsave(&qp->s_lock, flags); 4504 trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn); 4505 4506 /* If we are waiting for an ACK to RESYNC, drop any other packets */ 4507 if ((qp->s_flags & HFI1_S_WAIT_HALT) && 4508 cmp_psn(psn, qpriv->s_resync_psn)) 4509 goto ack_op_err; 4510 4511 ack_psn = req_psn; 4512 if (hfi1_tid_rdma_is_resync_psn(psn)) 4513 ack_kpsn = resync_psn; 4514 else 4515 ack_kpsn = psn; 4516 if (aeth >> 29) { 4517 ack_psn--; 4518 ack_kpsn--; 4519 } 4520 4521 if (unlikely(qp->s_acked == qp->s_tail)) 4522 goto ack_op_err; 4523 4524 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4525 4526 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 4527 goto ack_op_err; 4528 4529 req = wqe_to_tid_req(wqe); 4530 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4531 wqe->lpsn, req); 4532 flow = &req->flows[req->acked_tail]; 4533 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); 4534 4535 /* Drop stale ACK/NAK */ 4536 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0 || 4537 cmp_psn(req_psn, flow->flow_state.resp_ib_psn) < 0) 4538 goto ack_op_err; 4539 4540 while (cmp_psn(ack_kpsn, 4541 full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 && 4542 req->ack_seg < req->cur_seg) { 4543 req->ack_seg++; 4544 /* advance acked segment pointer */ 4545 req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); 4546 req->r_last_acked = flow->flow_state.resp_ib_psn; 4547 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4548 wqe->lpsn, req); 4549 if (req->ack_seg == req->total_segs) { 4550 req->state = TID_REQUEST_COMPLETE; 4551 wqe = do_rc_completion(qp, wqe, 4552 to_iport(qp->ibqp.device, 4553 qp->port_num)); 4554 trace_hfi1_sender_rcv_tid_ack(qp); 4555 atomic_dec(&qpriv->n_tid_requests); 4556 if (qp->s_acked == qp->s_tail) 4557 break; 4558 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 4559 break; 4560 req = wqe_to_tid_req(wqe); 4561 } 4562 flow = &req->flows[req->acked_tail]; 4563 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); 4564 } 4565 4566 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4567 wqe->lpsn, req); 4568 switch (aeth >> 29) { 4569 case 0: /* ACK */ 4570 if (qpriv->s_flags & RVT_S_WAIT_ACK) 4571 qpriv->s_flags &= ~RVT_S_WAIT_ACK; 4572 if (!hfi1_tid_rdma_is_resync_psn(psn)) { 4573 /* Check if there is any pending TID ACK */ 4574 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 4575 req->ack_seg < req->cur_seg) 4576 hfi1_mod_tid_retry_timer(qp); 4577 
else 4578 hfi1_stop_tid_retry_timer(qp); 4579 hfi1_schedule_send(qp); 4580 } else { 4581 u32 spsn, fpsn, last_acked, generation; 4582 struct tid_rdma_request *rptr; 4583 4584 /* ACK(RESYNC) */ 4585 hfi1_stop_tid_retry_timer(qp); 4586 /* Allow new requests (see hfi1_make_tid_rdma_pkt) */ 4587 qp->s_flags &= ~HFI1_S_WAIT_HALT; 4588 /* 4589 * Clear RVT_S_SEND_ONE flag in case that the TID RDMA 4590 * ACK is received after the TID retry timer is fired 4591 * again. In this case, do not send any more TID 4592 * RESYNC request or wait for any more TID ACK packet. 4593 */ 4594 qpriv->s_flags &= ~RVT_S_SEND_ONE; 4595 hfi1_schedule_send(qp); 4596 4597 if ((qp->s_acked == qpriv->s_tid_tail && 4598 req->ack_seg == req->total_segs) || 4599 qp->s_acked == qp->s_tail) { 4600 qpriv->s_state = TID_OP(WRITE_DATA_LAST); 4601 goto done; 4602 } 4603 4604 if (req->ack_seg == req->comp_seg) { 4605 qpriv->s_state = TID_OP(WRITE_DATA); 4606 goto done; 4607 } 4608 4609 /* 4610 * The PSN to start with is the next PSN after the 4611 * RESYNC PSN. 4612 */ 4613 psn = mask_psn(psn + 1); 4614 generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 4615 spsn = 0; 4616 4617 /* 4618 * Update to the correct WQE when we get an ACK(RESYNC) 4619 * in the middle of a request. 4620 */ 4621 if (delta_psn(ack_psn, wqe->lpsn)) 4622 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4623 req = wqe_to_tid_req(wqe); 4624 flow = &req->flows[req->acked_tail]; 4625 /* 4626 * RESYNC re-numbers the PSN ranges of all remaining 4627 * segments. Also, PSN's start from 0 in the middle of a 4628 * segment and the first segment size is less than the 4629 * default number of packets. flow->resync_npkts is used 4630 * to track the number of packets from the start of the 4631 * real segment to the point of 0 PSN after the RESYNC 4632 * in order to later correctly rewind the SGE. 4633 */ 4634 fpsn = full_flow_psn(flow, flow->flow_state.spsn); 4635 req->r_ack_psn = psn; 4636 /* 4637 * If resync_psn points to the last flow PSN for a 4638 * segment and the new segment (likely from a new 4639 * request) starts with a new generation number, we 4640 * need to adjust resync_psn accordingly. 4641 */ 4642 if (flow->flow_state.generation != 4643 (resync_psn >> HFI1_KDETH_BTH_SEQ_SHIFT)) 4644 resync_psn = mask_psn(fpsn - 1); 4645 flow->resync_npkts += 4646 delta_psn(mask_psn(resync_psn + 1), fpsn); 4647 /* 4648 * Renumber all packet sequence number ranges 4649 * based on the new generation. 
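 * The loop below walks every outstanding request from qp->s_acked up to s_tid_cur and rewrites the generation, spsn and lpsn of each flow that has not been acked yet.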
4650 */ 4651 last_acked = qp->s_acked; 4652 rptr = req; 4653 while (1) { 4654 /* start from last acked segment */ 4655 for (fidx = rptr->acked_tail; 4656 CIRC_CNT(rptr->setup_head, fidx, 4657 MAX_FLOWS); 4658 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { 4659 u32 lpsn; 4660 u32 gen; 4661 4662 flow = &rptr->flows[fidx]; 4663 gen = flow->flow_state.generation; 4664 if (WARN_ON(gen == generation && 4665 flow->flow_state.spsn != 4666 spsn)) 4667 continue; 4668 lpsn = flow->flow_state.lpsn; 4669 lpsn = full_flow_psn(flow, lpsn); 4670 flow->npkts = 4671 delta_psn(lpsn, 4672 mask_psn(resync_psn) 4673 ); 4674 flow->flow_state.generation = 4675 generation; 4676 flow->flow_state.spsn = spsn; 4677 flow->flow_state.lpsn = 4678 flow->flow_state.spsn + 4679 flow->npkts - 1; 4680 flow->pkt = 0; 4681 spsn += flow->npkts; 4682 resync_psn += flow->npkts; 4683 trace_hfi1_tid_flow_rcv_tid_ack(qp, 4684 fidx, 4685 flow); 4686 } 4687 if (++last_acked == qpriv->s_tid_cur + 1) 4688 break; 4689 if (last_acked == qp->s_size) 4690 last_acked = 0; 4691 wqe = rvt_get_swqe_ptr(qp, last_acked); 4692 rptr = wqe_to_tid_req(wqe); 4693 } 4694 req->cur_seg = req->ack_seg; 4695 qpriv->s_tid_tail = qp->s_acked; 4696 qpriv->s_state = TID_OP(WRITE_REQ); 4697 hfi1_schedule_tid_send(qp); 4698 } 4699 done: 4700 qpriv->s_retry = qp->s_retry_cnt; 4701 break; 4702 4703 case 3: /* NAK */ 4704 hfi1_stop_tid_retry_timer(qp); 4705 switch ((aeth >> IB_AETH_CREDIT_SHIFT) & 4706 IB_AETH_CREDIT_MASK) { 4707 case 0: /* PSN sequence error */ 4708 if (!req->flows) 4709 break; 4710 flow = &req->flows[req->acked_tail]; 4711 flpsn = full_flow_psn(flow, flow->flow_state.lpsn); 4712 if (cmp_psn(psn, flpsn) > 0) 4713 break; 4714 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, 4715 flow); 4716 req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4717 req->cur_seg = req->ack_seg; 4718 qpriv->s_tid_tail = qp->s_acked; 4719 qpriv->s_state = TID_OP(WRITE_REQ); 4720 qpriv->s_retry = qp->s_retry_cnt; 4721 hfi1_schedule_tid_send(qp); 4722 break; 4723 4724 default: 4725 break; 4726 } 4727 break; 4728 4729 default: 4730 break; 4731 } 4732 4733 ack_op_err: 4734 spin_unlock_irqrestore(&qp->s_lock, flags); 4735 } 4736 4737 void hfi1_add_tid_retry_timer(struct rvt_qp *qp) 4738 { 4739 struct hfi1_qp_priv *priv = qp->priv; 4740 struct ib_qp *ibqp = &qp->ibqp; 4741 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); 4742 4743 lockdep_assert_held(&qp->s_lock); 4744 if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) { 4745 priv->s_flags |= HFI1_S_TID_RETRY_TIMER; 4746 priv->s_tid_retry_timer.expires = jiffies + 4747 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies; 4748 add_timer(&priv->s_tid_retry_timer); 4749 } 4750 } 4751 4752 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp) 4753 { 4754 struct hfi1_qp_priv *priv = qp->priv; 4755 struct ib_qp *ibqp = &qp->ibqp; 4756 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); 4757 4758 lockdep_assert_held(&qp->s_lock); 4759 priv->s_flags |= HFI1_S_TID_RETRY_TIMER; 4760 mod_timer(&priv->s_tid_retry_timer, jiffies + 4761 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies); 4762 } 4763 4764 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp) 4765 { 4766 struct hfi1_qp_priv *priv = qp->priv; 4767 int rval = 0; 4768 4769 lockdep_assert_held(&qp->s_lock); 4770 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { 4771 rval = del_timer(&priv->s_tid_retry_timer); 4772 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; 4773 } 4774 return rval; 4775 } 4776 4777 void hfi1_del_tid_retry_timer(struct rvt_qp *qp) 4778 { 4779 struct hfi1_qp_priv *priv 
= qp->priv; 4780 4781 del_timer_sync(&priv->s_tid_retry_timer); 4782 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; 4783 } 4784 4785 static void hfi1_tid_retry_timeout(struct timer_list *t) 4786 { 4787 struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer); 4788 struct rvt_qp *qp = priv->owner; 4789 struct rvt_swqe *wqe; 4790 unsigned long flags; 4791 struct tid_rdma_request *req; 4792 4793 spin_lock_irqsave(&qp->r_lock, flags); 4794 spin_lock(&qp->s_lock); 4795 trace_hfi1_tid_write_sender_retry_timeout(qp, 0); 4796 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { 4797 hfi1_stop_tid_retry_timer(qp); 4798 if (!priv->s_retry) { 4799 trace_hfi1_msg_tid_retry_timeout(/* msg */ 4800 qp, 4801 "Exhausted retries. Tid retry timeout = ", 4802 (u64)priv->tid_retry_timeout_jiffies); 4803 4804 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4805 hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); 4806 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 4807 } else { 4808 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4809 req = wqe_to_tid_req(wqe); 4810 trace_hfi1_tid_req_tid_retry_timeout(/* req */ 4811 qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); 4812 4813 priv->s_flags &= ~RVT_S_WAIT_ACK; 4814 /* Only send one packet (the RESYNC) */ 4815 priv->s_flags |= RVT_S_SEND_ONE; 4816 /* 4817 * No additional request shall be made by this QP until 4818 * the RESYNC has been complete. 4819 */ 4820 qp->s_flags |= HFI1_S_WAIT_HALT; 4821 priv->s_state = TID_OP(RESYNC); 4822 priv->s_retry--; 4823 hfi1_schedule_tid_send(qp); 4824 } 4825 } 4826 spin_unlock(&qp->s_lock); 4827 spin_unlock_irqrestore(&qp->r_lock, flags); 4828 } 4829 4830 u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, 4831 struct ib_other_headers *ohdr, u32 *bth1, 4832 u32 *bth2, u16 fidx) 4833 { 4834 struct hfi1_qp_priv *qpriv = qp->priv; 4835 struct tid_rdma_params *remote; 4836 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 4837 struct tid_rdma_flow *flow = &req->flows[fidx]; 4838 u32 generation; 4839 4840 rcu_read_lock(); 4841 remote = rcu_dereference(qpriv->tid_rdma.remote); 4842 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); 4843 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); 4844 *bth1 = remote->qp; 4845 rcu_read_unlock(); 4846 4847 generation = kern_flow_generation_next(flow->flow_state.generation); 4848 *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1); 4849 qpriv->s_resync_psn = *bth2; 4850 *bth2 |= IB_BTH_REQ_ACK; 4851 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); 4852 4853 return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32); 4854 } 4855 4856 void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) 4857 { 4858 struct ib_other_headers *ohdr = packet->ohdr; 4859 struct rvt_qp *qp = packet->qp; 4860 struct hfi1_qp_priv *qpriv = qp->priv; 4861 struct hfi1_ctxtdata *rcd = qpriv->rcd; 4862 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 4863 struct rvt_ack_entry *e; 4864 struct tid_rdma_request *req; 4865 struct tid_rdma_flow *flow; 4866 struct tid_flow_state *fs = &qpriv->flow_state; 4867 u32 psn, generation, idx, gen_next; 4868 bool fecn; 4869 unsigned long flags; 4870 4871 fecn = process_ecn(qp, packet); 4872 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4873 4874 generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT; 4875 spin_lock_irqsave(&qp->s_lock, flags); 4876 4877 gen_next = (fs->generation == KERN_GENERATION_RESERVED) ? 
4878 generation : kern_flow_generation_next(fs->generation); 4879 /* 4880 * RESYNC packet contains the "next" generation and can only be 4881 * from the current or previous generations 4882 */ 4883 if (generation != mask_generation(gen_next - 1) && 4884 generation != gen_next) 4885 goto bail; 4886 /* Already processing a resync */ 4887 if (qpriv->resync) 4888 goto bail; 4889 4890 spin_lock(&rcd->exp_lock); 4891 if (fs->index >= RXE_NUM_TID_FLOWS) { 4892 /* 4893 * If we don't have a flow, save the generation so it can be 4894 * applied when a new flow is allocated 4895 */ 4896 fs->generation = generation; 4897 } else { 4898 /* Reprogram the QP flow with new generation */ 4899 rcd->flows[fs->index].generation = generation; 4900 fs->generation = kern_setup_hw_flow(rcd, fs->index); 4901 } 4902 fs->psn = 0; 4903 /* 4904 * Disable SW PSN checking since a RESYNC is equivalent to a 4905 * sync point and the flow has/will be reprogrammed 4906 */ 4907 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 4908 trace_hfi1_tid_write_rsp_rcv_resync(qp); 4909 4910 /* 4911 * Reset all TID flow information with the new generation. 4912 * This is done for all requests and segments after the 4913 * last received segment 4914 */ 4915 for (idx = qpriv->r_tid_tail; ; idx++) { 4916 u16 flow_idx; 4917 4918 if (idx > rvt_size_atomic(&dev->rdi)) 4919 idx = 0; 4920 e = &qp->s_ack_queue[idx]; 4921 if (e->opcode == TID_OP(WRITE_REQ)) { 4922 req = ack_to_tid_req(e); 4923 trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn, 4924 e->lpsn, req); 4925 4926 /* start from last unacked segment */ 4927 for (flow_idx = req->clear_tail; 4928 CIRC_CNT(req->setup_head, flow_idx, 4929 MAX_FLOWS); 4930 flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) { 4931 u32 lpsn; 4932 u32 next; 4933 4934 flow = &req->flows[flow_idx]; 4935 lpsn = full_flow_psn(flow, 4936 flow->flow_state.lpsn); 4937 next = flow->flow_state.r_next_psn; 4938 flow->npkts = delta_psn(lpsn, next - 1); 4939 flow->flow_state.generation = fs->generation; 4940 flow->flow_state.spsn = fs->psn; 4941 flow->flow_state.lpsn = 4942 flow->flow_state.spsn + flow->npkts - 1; 4943 flow->flow_state.r_next_psn = 4944 full_flow_psn(flow, 4945 flow->flow_state.spsn); 4946 fs->psn += flow->npkts; 4947 trace_hfi1_tid_flow_rcv_resync(qp, flow_idx, 4948 flow); 4949 } 4950 } 4951 if (idx == qp->s_tail_ack_queue) 4952 break; 4953 } 4954 4955 spin_unlock(&rcd->exp_lock); 4956 qpriv->resync = true; 4957 /* RESYNC request always gets a TID RDMA ACK. */ 4958 qpriv->s_nak_state = 0; 4959 tid_rdma_trigger_ack(qp); 4960 bail: 4961 if (fecn) 4962 qp->s_flags |= RVT_S_ECN; 4963 spin_unlock_irqrestore(&qp->s_lock, flags); 4964 } 4965 4966 /* 4967 * Call this function when the last TID RDMA WRITE DATA packet for a request 4968 * is built. 
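 * It advances s_tid_tail to the next TID RDMA WRITE WQE (never past s_tid_cur) and moves the second leg to the TID_OP(WRITE_RESP) state.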
4969 */ 4970 static void update_tid_tail(struct rvt_qp *qp) 4971 __must_hold(&qp->s_lock) 4972 { 4973 struct hfi1_qp_priv *priv = qp->priv; 4974 u32 i; 4975 struct rvt_swqe *wqe; 4976 4977 lockdep_assert_held(&qp->s_lock); 4978 /* Can't move beyond s_tid_cur */ 4979 if (priv->s_tid_tail == priv->s_tid_cur) 4980 return; 4981 for (i = priv->s_tid_tail + 1; ; i++) { 4982 if (i == qp->s_size) 4983 i = 0; 4984 4985 if (i == priv->s_tid_cur) 4986 break; 4987 wqe = rvt_get_swqe_ptr(qp, i); 4988 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 4989 break; 4990 } 4991 priv->s_tid_tail = i; 4992 priv->s_state = TID_OP(WRITE_RESP); 4993 } 4994 4995 int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) 4996 __must_hold(&qp->s_lock) 4997 { 4998 struct hfi1_qp_priv *priv = qp->priv; 4999 struct rvt_swqe *wqe; 5000 u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0; 5001 struct ib_other_headers *ohdr; 5002 struct rvt_sge_state *ss = &qp->s_sge; 5003 struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 5004 struct tid_rdma_request *req = ack_to_tid_req(e); 5005 bool last = false; 5006 u8 opcode = TID_OP(WRITE_DATA); 5007 5008 lockdep_assert_held(&qp->s_lock); 5009 trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); 5010 /* 5011 * Prioritize the sending of the requests and responses over the 5012 * sending of the TID RDMA data packets. 5013 */ 5014 if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) && 5015 atomic_read(&priv->n_requests) && 5016 !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK | 5017 HFI1_S_ANY_WAIT_IO))) || 5018 (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg && 5019 !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) { 5020 struct iowait_work *iowork; 5021 5022 iowork = iowait_get_ib_work(&priv->s_iowait); 5023 ps->s_txreq = get_waiting_verbs_txreq(iowork); 5024 if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) { 5025 priv->s_flags |= HFI1_S_TID_BUSY_SET; 5026 return 1; 5027 } 5028 } 5029 5030 ps->s_txreq = get_txreq(ps->dev, qp); 5031 if (!ps->s_txreq) 5032 goto bail_no_tx; 5033 5034 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; 5035 5036 if ((priv->s_flags & RVT_S_ACK_PENDING) && 5037 make_tid_rdma_ack(qp, ohdr, ps)) 5038 return 1; 5039 5040 /* 5041 * Bail out if we can't send data. 5042 * Be reminded that this check must been done after the call to 5043 * make_tid_rdma_ack() because the responding QP could be in 5044 * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA. 5045 */ 5046 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) 5047 goto bail; 5048 5049 if (priv->s_flags & RVT_S_WAIT_ACK) 5050 goto bail; 5051 5052 /* Check whether there is anything to do. */ 5053 if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) 5054 goto bail; 5055 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); 5056 req = wqe_to_tid_req(wqe); 5057 trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn, 5058 wqe->lpsn, req); 5059 switch (priv->s_state) { 5060 case TID_OP(WRITE_REQ): 5061 case TID_OP(WRITE_RESP): 5062 priv->tid_ss.sge = wqe->sg_list[0]; 5063 priv->tid_ss.sg_list = wqe->sg_list + 1; 5064 priv->tid_ss.num_sge = wqe->wr.num_sge; 5065 priv->tid_ss.total_len = wqe->length; 5066 5067 if (priv->s_state == TID_OP(WRITE_REQ)) 5068 hfi1_tid_rdma_restart_req(qp, wqe, &bth2); 5069 priv->s_state = TID_OP(WRITE_DATA); 5070 /* fall through */ 5071 5072 case TID_OP(WRITE_DATA): 5073 /* 5074 * 1. Check whether TID RDMA WRITE RESP available. 5075 * 2. 
If no: 5076 * 2.1 If we have more segments and no TID RDMA WRITE RESP, 5077 * set HFI1_S_WAIT_TID_RESP 5078 * 2.2 Return indicating no progress made. 5079 * 3. If yes: 5080 * 3.1 Build TID RDMA WRITE DATA packet. 5081 * 3.2 If last packet in segment: 5082 * 3.2.1 Change KDETH header bits 5083 * 3.2.2 Advance RESP pointers. 5084 * 3.3 Return indicating progress made. 5085 */ 5086 trace_hfi1_sender_make_tid_pkt(qp); 5087 trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); 5088 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); 5089 req = wqe_to_tid_req(wqe); 5090 len = wqe->length; 5091 5092 if (!req->comp_seg || req->cur_seg == req->comp_seg) 5093 goto bail; 5094 5095 trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, 5096 wqe->psn, wqe->lpsn, req); 5097 last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, 5098 &len); 5099 5100 if (last) { 5101 /* move pointer to next flow */ 5102 req->clear_tail = CIRC_NEXT(req->clear_tail, 5103 MAX_FLOWS); 5104 if (++req->cur_seg < req->total_segs) { 5105 if (!CIRC_CNT(req->setup_head, req->clear_tail, 5106 MAX_FLOWS)) 5107 qp->s_flags |= HFI1_S_WAIT_TID_RESP; 5108 } else { 5109 priv->s_state = TID_OP(WRITE_DATA_LAST); 5110 opcode = TID_OP(WRITE_DATA_LAST); 5111 5112 /* Advance the s_tid_tail now */ 5113 update_tid_tail(qp); 5114 } 5115 } 5116 hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32); 5117 ss = &priv->tid_ss; 5118 break; 5119 5120 case TID_OP(RESYNC): 5121 trace_hfi1_sender_make_tid_pkt(qp); 5122 /* Use generation from the most recently received response */ 5123 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); 5124 req = wqe_to_tid_req(wqe); 5125 /* If no responses for this WQE look at the previous one */ 5126 if (!req->comp_seg) { 5127 wqe = rvt_get_swqe_ptr(qp, 5128 (!priv->s_tid_cur ? qp->s_size : 5129 priv->s_tid_cur) - 1); 5130 req = wqe_to_tid_req(wqe); 5131 } 5132 hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1, 5133 &bth2, 5134 CIRC_PREV(req->setup_head, 5135 MAX_FLOWS)); 5136 ss = NULL; 5137 len = 0; 5138 opcode = TID_OP(RESYNC); 5139 break; 5140 5141 default: 5142 goto bail; 5143 } 5144 if (priv->s_flags & RVT_S_SEND_ONE) { 5145 priv->s_flags &= ~RVT_S_SEND_ONE; 5146 priv->s_flags |= RVT_S_WAIT_ACK; 5147 bth2 |= IB_BTH_REQ_ACK; 5148 } 5149 qp->s_len -= len; 5150 ps->s_txreq->hdr_dwords = hwords; 5151 ps->s_txreq->sde = priv->s_sde; 5152 ps->s_txreq->ss = ss; 5153 ps->s_txreq->s_cur_size = len; 5154 hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2, 5155 middle, ps); 5156 return 1; 5157 bail: 5158 hfi1_put_txreq(ps->s_txreq); 5159 bail_no_tx: 5160 ps->s_txreq = NULL; 5161 priv->s_flags &= ~RVT_S_BUSY; 5162 /* 5163 * If we didn't get a txreq, the QP will be woken up later to try 5164 * again; set the flags to tell the wake-up path which work item to wake 5165 * up. 5166 * (A better algorithm should be found to do this and generalize the 5167 * sleep/wakeup flags.) 5168 */ 5169 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); 5170 return 0; 5171 } 5172 5173 static int make_tid_rdma_ack(struct rvt_qp *qp, 5174 struct ib_other_headers *ohdr, 5175 struct hfi1_pkt_state *ps) 5176 { 5177 struct rvt_ack_entry *e; 5178 struct hfi1_qp_priv *qpriv = qp->priv; 5179 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 5180 u32 hwords, next; 5181 u32 len = 0; 5182 u32 bth1 = 0, bth2 = 0; 5183 int middle = 0; 5184 u16 flow; 5185 struct tid_rdma_request *req, *nreq; 5186 5187 trace_hfi1_tid_write_rsp_make_tid_ack(qp); 5188 /* Don't send an ACK if we aren't supposed to.
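 * Nothing is sent unless the QP state still allows receive-side processing (RVT_PROCESS_RECV_OK).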
*/ 5189 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) 5190 goto bail; 5191 5192 /* header size in 32-bit words LRH+BTH = (8+12)/4. */ 5193 hwords = 5; 5194 5195 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5196 req = ack_to_tid_req(e); 5197 /* 5198 * In the RESYNC case, we are exactly one segment past the 5199 * previously sent ack or at the previously sent NAK. So to send 5200 * the resync ack, we go back one segment (which might be part of 5201 * the previous request) and let the do-while loop execute again. 5202 * The advantage of executing the do-while loop is that any data 5203 * received after the previous ack is automatically acked in the 5204 * RESYNC ack. It turns out that for the do-while loop we only need 5205 * to pull back qpriv->r_tid_ack, not the segment 5206 * indices/counters. The scheme works even if the previous request 5207 * was not a TID WRITE request. 5208 */ 5209 if (qpriv->resync) { 5210 if (!req->ack_seg || req->ack_seg == req->total_segs) 5211 qpriv->r_tid_ack = !qpriv->r_tid_ack ? 5212 rvt_size_atomic(&dev->rdi) : 5213 qpriv->r_tid_ack - 1; 5214 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5215 req = ack_to_tid_req(e); 5216 } 5217 5218 trace_hfi1_rsp_make_tid_ack(qp, e->psn); 5219 trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, 5220 req); 5221 /* 5222 * If we've sent all the ACKs that we can, we are done 5223 * until we get more segments... 5224 */ 5225 if (!qpriv->s_nak_state && !qpriv->resync && 5226 req->ack_seg == req->comp_seg) 5227 goto bail; 5228 5229 do { 5230 /* 5231 * To deal with coalesced ACKs, the acked_tail pointer 5232 * into the flow array is used. The distance between it 5233 * and the clear_tail is the number of flows that are 5234 * being ACK'ed. 5235 */ 5236 req->ack_seg += 5237 /* Get up-to-date value */ 5238 CIRC_CNT(req->clear_tail, req->acked_tail, 5239 MAX_FLOWS); 5240 /* Advance acked index */ 5241 req->acked_tail = req->clear_tail; 5242 5243 /* 5244 * req->clear_tail points to the segment currently being 5245 * received. So, when sending an ACK, the previous 5246 * segment is being ACK'ed. 5247 */ 5248 flow = CIRC_PREV(req->acked_tail, MAX_FLOWS); 5249 if (req->ack_seg != req->total_segs) 5250 break; 5251 req->state = TID_REQUEST_COMPLETE; 5252 5253 next = qpriv->r_tid_ack + 1; 5254 if (next > rvt_size_atomic(&dev->rdi)) 5255 next = 0; 5256 qpriv->r_tid_ack = next; 5257 if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ)) 5258 break; 5259 nreq = ack_to_tid_req(&qp->s_ack_queue[next]); 5260 if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg) 5261 break; 5262 5263 /* Move to the next ack entry now */ 5264 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5265 req = ack_to_tid_req(e); 5266 } while (1); 5267 5268 /* 5269 * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and 5270 * req could be pointing at the previous ack queue entry 5271 */ 5272 if (qpriv->s_nak_state || 5273 (qpriv->resync && 5274 !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) && 5275 (cmp_psn(qpriv->r_next_psn_kdeth - 1, 5276 full_flow_psn(&req->flows[flow], 5277 req->flows[flow].flow_state.lpsn)) > 0))) { 5278 /* 5279 * A NAK will implicitly acknowledge all previous TID RDMA 5280 * requests. 
Therefore, we NAK with the req->acked_tail 5281 * segment for the request at qpriv->r_tid_ack (same at 5282 * this point as the req->clear_tail segment for the 5283 * qpriv->r_tid_tail request) 5284 */ 5285 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5286 req = ack_to_tid_req(e); 5287 flow = req->acked_tail; 5288 } else if (req->ack_seg == req->total_segs && 5289 qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) 5290 qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; 5291 5292 trace_hfi1_tid_write_rsp_make_tid_ack(qp); 5293 trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, 5294 req); 5295 hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, 5296 &bth2); 5297 len = 0; 5298 qpriv->s_flags &= ~RVT_S_ACK_PENDING; 5299 ps->s_txreq->hdr_dwords = hwords; 5300 ps->s_txreq->sde = qpriv->s_sde; 5301 ps->s_txreq->s_cur_size = len; 5302 ps->s_txreq->ss = NULL; 5303 hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, 5304 ps); 5305 ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; 5306 return 1; 5307 bail: 5308 /* 5309 * Ensure s_rdma_ack_cnt changes are committed prior to resetting 5310 * RVT_S_RESP_PENDING 5311 */ 5312 smp_wmb(); 5313 qpriv->s_flags &= ~RVT_S_ACK_PENDING; 5314 return 0; 5315 } 5316 5317 static int hfi1_send_tid_ok(struct rvt_qp *qp) 5318 { 5319 struct hfi1_qp_priv *priv = qp->priv; 5320 5321 return !(priv->s_flags & RVT_S_BUSY || 5322 qp->s_flags & HFI1_S_ANY_WAIT_IO) && 5323 (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) || 5324 (priv->s_flags & RVT_S_RESP_PENDING) || 5325 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND)); 5326 } 5327 5328 void _hfi1_do_tid_send(struct work_struct *work) 5329 { 5330 struct iowait_work *w = container_of(work, struct iowait_work, iowork); 5331 struct rvt_qp *qp = iowait_to_qp(w->iow); 5332 5333 hfi1_do_tid_send(qp); 5334 } 5335 5336 static void hfi1_do_tid_send(struct rvt_qp *qp) 5337 { 5338 struct hfi1_pkt_state ps; 5339 struct hfi1_qp_priv *priv = qp->priv; 5340 5341 ps.dev = to_idev(qp->ibqp.device); 5342 ps.ibp = to_iport(qp->ibqp.device, qp->port_num); 5343 ps.ppd = ppd_from_ibp(ps.ibp); 5344 ps.wait = iowait_get_tid_work(&priv->s_iowait); 5345 ps.in_thread = false; 5346 ps.timeout_int = qp->timeout_jiffies / 8; 5347 5348 trace_hfi1_rc_do_tid_send(qp, false); 5349 spin_lock_irqsave(&qp->s_lock, ps.flags); 5350 5351 /* Return if we are already busy processing a work request. */ 5352 if (!hfi1_send_tid_ok(qp)) { 5353 if (qp->s_flags & HFI1_S_ANY_WAIT_IO) 5354 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); 5355 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5356 return; 5357 } 5358 5359 priv->s_flags |= RVT_S_BUSY; 5360 5361 ps.timeout = jiffies + ps.timeout_int; 5362 ps.cpu = priv->s_sde ? priv->s_sde->cpu : 5363 cpumask_first(cpumask_of_node(ps.ppd->dd->node)); 5364 ps.pkts_sent = false; 5365 5366 /* insure a pre-built packet is handled */ 5367 ps.s_txreq = get_waiting_verbs_txreq(ps.wait); 5368 do { 5369 /* Check for a constructed packet to be sent. */ 5370 if (ps.s_txreq) { 5371 if (priv->s_flags & HFI1_S_TID_BUSY_SET) { 5372 qp->s_flags |= RVT_S_BUSY; 5373 ps.wait = iowait_get_ib_work(&priv->s_iowait); 5374 } 5375 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5376 5377 /* 5378 * If the packet cannot be sent now, return and 5379 * the send tasklet will be woken up later. 
5380 */ 5381 if (hfi1_verbs_send(qp, &ps)) 5382 return; 5383 5384 /* allow other tasks to run */ 5385 if (hfi1_schedule_send_yield(qp, &ps, true)) 5386 return; 5387 5388 spin_lock_irqsave(&qp->s_lock, ps.flags); 5389 if (priv->s_flags & HFI1_S_TID_BUSY_SET) { 5390 qp->s_flags &= ~RVT_S_BUSY; 5391 priv->s_flags &= ~HFI1_S_TID_BUSY_SET; 5392 ps.wait = iowait_get_tid_work(&priv->s_iowait); 5393 if (iowait_flag_set(&priv->s_iowait, 5394 IOWAIT_PENDING_IB)) 5395 hfi1_schedule_send(qp); 5396 } 5397 } 5398 } while (hfi1_make_tid_rdma_pkt(qp, &ps)); 5399 iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); 5400 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5401 } 5402 5403 static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) 5404 { 5405 struct hfi1_qp_priv *priv = qp->priv; 5406 struct hfi1_ibport *ibp = 5407 to_iport(qp->ibqp.device, qp->port_num); 5408 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 5409 struct hfi1_devdata *dd = ppd->dd; 5410 5411 if ((dd->flags & HFI1_SHUTDOWN)) 5412 return true; 5413 5414 return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq, 5415 priv->s_sde ? 5416 priv->s_sde->cpu : 5417 cpumask_first(cpumask_of_node(dd->node))); 5418 } 5419 5420 /** 5421 * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine 5422 * @qp: the QP 5423 * 5424 * This schedules qp progress on the TID RDMA state machine. Caller 5425 * should hold the s_lock. 5426 * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because 5427 * the two state machines can step on each other with respect to the 5428 * RVT_S_BUSY flag. 5429 * Therefore, a modified test is used. 5430 * @return true if the second leg is scheduled; 5431 * false if the second leg is not scheduled. 5432 */ 5433 bool hfi1_schedule_tid_send(struct rvt_qp *qp) 5434 { 5435 lockdep_assert_held(&qp->s_lock); 5436 if (hfi1_send_tid_ok(qp)) { 5437 /* 5438 * The following call returns true if the qp is not on the 5439 * queue and false if the qp is already on the queue before 5440 * this call. Either way, the qp will be on the queue when the 5441 * call returns. 5442 */ 5443 _hfi1_schedule_tid_send(qp); 5444 return true; 5445 } 5446 if (qp->s_flags & HFI1_S_ANY_WAIT_IO) 5447 iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, 5448 IOWAIT_PENDING_TID); 5449 return false; 5450 } 5451 5452 bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) 5453 { 5454 struct rvt_ack_entry *prev; 5455 struct tid_rdma_request *req; 5456 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 5457 struct hfi1_qp_priv *priv = qp->priv; 5458 u32 s_prev; 5459 5460 s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) : 5461 (qp->s_tail_ack_queue - 1); 5462 prev = &qp->s_ack_queue[s_prev]; 5463 5464 if ((e->opcode == TID_OP(READ_REQ) || 5465 e->opcode == OP(RDMA_READ_REQUEST)) && 5466 prev->opcode == TID_OP(WRITE_REQ)) { 5467 req = ack_to_tid_req(prev); 5468 if (req->ack_seg != req->total_segs) { 5469 priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK; 5470 return true; 5471 } 5472 } 5473 return false; 5474 } 5475 5476 static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx) 5477 { 5478 u64 reg; 5479 5480 /* 5481 * The only sane way to get the amount of 5482 * progress is to read the HW flow state. 
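 * The RCV_TID_FLOW_TABLE entry for this flow is read and mask_psn() extracts the next expected KDETH PSN from it.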
5483 */ 5484 reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx)); 5485 return mask_psn(reg); 5486 } 5487 5488 static void tid_rdma_rcv_err(struct hfi1_packet *packet, 5489 struct ib_other_headers *ohdr, 5490 struct rvt_qp *qp, u32 psn, int diff, bool fecn) 5491 { 5492 unsigned long flags; 5493 5494 tid_rdma_rcv_error(packet, ohdr, qp, psn, diff); 5495 if (fecn) { 5496 spin_lock_irqsave(&qp->s_lock, flags); 5497 qp->s_flags |= RVT_S_ECN; 5498 spin_unlock_irqrestore(&qp->s_lock, flags); 5499 } 5500 } 5501 5502 static void update_r_next_psn_fecn(struct hfi1_packet *packet, 5503 struct hfi1_qp_priv *priv, 5504 struct hfi1_ctxtdata *rcd, 5505 struct tid_rdma_flow *flow, 5506 bool fecn) 5507 { 5508 /* 5509 * If a start/middle packet is delivered here due to 5510 * RSM rule and FECN, we need to update the r_next_psn. 5511 */ 5512 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER && 5513 !(priv->s_flags & HFI1_R_TID_SW_PSN)) { 5514 struct hfi1_devdata *dd = rcd->dd; 5515 5516 flow->flow_state.r_next_psn = 5517 read_r_next_psn(dd, rcd->ctxt, flow->idx); 5518 } 5519 } 5520