1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 /* 3 * Copyright(c) 2015 - 2020 Intel Corporation. 4 * Copyright(c) 2021 Cornelis Networks. 5 */ 6 7 #include <linux/pci.h> 8 #include <linux/netdevice.h> 9 #include <linux/vmalloc.h> 10 #include <linux/delay.h> 11 #include <linux/xarray.h> 12 #include <linux/module.h> 13 #include <linux/printk.h> 14 #include <linux/hrtimer.h> 15 #include <linux/bitmap.h> 16 #include <linux/numa.h> 17 #include <rdma/rdma_vt.h> 18 19 #include "hfi.h" 20 #include "device.h" 21 #include "common.h" 22 #include "trace.h" 23 #include "mad.h" 24 #include "sdma.h" 25 #include "debugfs.h" 26 #include "verbs.h" 27 #include "aspm.h" 28 #include "affinity.h" 29 #include "exp_rcv.h" 30 #include "netdev.h" 31 32 #undef pr_fmt 33 #define pr_fmt(fmt) DRIVER_NAME ": " fmt 34 35 /* 36 * min buffers we want to have per context, after driver 37 */ 38 #define HFI1_MIN_USER_CTXT_BUFCNT 7 39 40 #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ 41 #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ 42 43 #define NUM_IB_PORTS 1 44 45 /* 46 * Number of user receive contexts we are configured to use (to allow for more 47 * pio buffers per ctxt, etc.) Zero means use one user context per CPU. 
 */
int num_user_contexts = -1;
module_param_named(num_user_contexts, num_user_contexts, int, 0444);
MODULE_PARM_DESC(
	num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)");

uint krcvqs[RXE_NUM_DATA_VL];
int krcvqsset;
module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");

/* computed based on above array */
unsigned long n_krcvqs;

static unsigned hfi1_rcvarr_split = 25;
module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");

static uint eager_buffer_size = (8 << 20); /* 8MB */
module_param(eager_buffer_size, uint, S_IRUGO);
MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB");

static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");

static uint hfi1_hdrq_entsize = 32;
module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444);
MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");

unsigned int user_credit_return_threshold = 33; /* default is 33% */
module_param(user_credit_return_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");

/* Table of all devdata structures, indexed by unit number (IRQ-safe xarray). */
DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);

/*
 * hfi1_create_kctxt - create one kernel receive context plus its ACK send
 * context.
 * @dd: the device data
 * @ppd: the port this context belongs to
 *
 * Returns 0 on success, a negative errno on failure.  On failure the
 * partially created rcd (if any) remains in dd->rcd[] and is cleaned up
 * by the caller's unwind path.
 */
static int hfi1_create_kctxt(struct hfi1_devdata *dd,
			     struct hfi1_pportdata *ppd)
{
	struct hfi1_ctxtdata *rcd;
	int ret;

	/* Control context has to be always 0 */
	BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);

	ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd);
	if (ret < 0) {
		dd_dev_err(dd, "Kernel receive context allocation failed\n");
		return ret;
	}

	/*
	 * Set up the kernel context flags here and now because they use
	 * default values for all receive side memories. User contexts will
	 * be handled as they are created.
	 */
	rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
		HFI1_CAP_KGET(NODROP_RHQ_FULL) |
		HFI1_CAP_KGET(NODROP_EGR_FULL) |
		HFI1_CAP_KGET(DMA_RTAIL);

	/* Control context must use DMA_RTAIL */
	if (rcd->ctxt == HFI1_CTRL_CTXT)
		rcd->flags |= HFI1_CAP_DMA_RTAIL;
	/* Select the fast-path interrupt handler to match the RTAIL mode. */
	rcd->fast_handler = get_dma_rtail_setting(rcd) ?
				handle_receive_interrupt_dma_rtail :
				handle_receive_interrupt_nodma_rtail;

	hfi1_set_seq_cnt(rcd, 1);

	rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
	if (!rcd->sc) {
		dd_dev_err(dd, "Kernel send context allocation failed\n");
		/* rcd left in dd->rcd[]; freed by the caller's bail loop */
		return -ENOMEM;
	}
	hfi1_init_ctxt(rcd->sc);

	return 0;
}

/*
 * Create the receive context array and one or more kernel contexts
 */
int hfi1_create_kctxts(struct hfi1_devdata *dd)
{
	u16 i;
	int ret;

	dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd),
			       GFP_KERNEL, dd->node);
	if (!dd->rcd)
		return -ENOMEM;

	/* Contexts [0, first_dyn_alloc_ctxt) are the static kernel contexts. */
	for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
		ret = hfi1_create_kctxt(dd, dd->pport);
		if (ret)
			goto bail;
	}

	return 0;
bail:
	/* Drop the init reference on every context created so far. */
	for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i)
		hfi1_free_ctxt(dd->rcd[i]);

	/* All the contexts should be freed, free the array */
	kfree(dd->rcd);
	dd->rcd = NULL;
	return ret;
}

/*
 * Helper routines for the receive context reference count (rcd and uctxt).
 */
static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd)
{
	kref_init(&rcd->kref);
}

/**
 * hfi1_rcd_free - When reference is zero clean up.
 * @kref: pointer to an initialized rcd data structure
 *
 * Removes the rcd from the device's context array (under uctxt_lock so
 * no lookup can race with the teardown) and then frees all per-context
 * resources.
 */
static void hfi1_rcd_free(struct kref *kref)
{
	unsigned long flags;
	struct hfi1_ctxtdata *rcd =
		container_of(kref, struct hfi1_ctxtdata, kref);

	/* Unpublish first so hfi1_rcd_get_by_index() can no longer find us. */
	spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
	rcd->dd->rcd[rcd->ctxt] = NULL;
	spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags);

	hfi1_free_ctxtdata(rcd->dd, rcd);

	kfree(rcd);
}

/**
 * hfi1_rcd_put - decrement reference for rcd
 * @rcd: pointer to an initialized rcd data structure
 *
 * Use this to put a reference after the init.  NULL is tolerated.
 *
 * Return: result of kref_put() (non-zero if this was the final put),
 * or 0 when @rcd is NULL.
 */
int hfi1_rcd_put(struct hfi1_ctxtdata *rcd)
{
	if (rcd)
		return kref_put(&rcd->kref, hfi1_rcd_free);

	return 0;
}

/**
 * hfi1_rcd_get - increment reference for rcd
 * @rcd: pointer to an initialized rcd data structure
 *
 * Use this to get a reference after the init.
 *
 * Return : reflect kref_get_unless_zero(), which returns non-zero on
 * increment, otherwise 0.
 */
int hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
{
	return kref_get_unless_zero(&rcd->kref);
}

/**
 * allocate_rcd_index - allocate an rcd index from the rcd array
 * @dd: pointer to a valid devdata structure
 * @rcd: rcd data structure to assign
 * @index: pointer to index that is allocated
 *
 * Find an empty index in the rcd array, and assign the given rcd to it.
 * If the array is full, we are EBUSY.  The initial kref is taken here
 * (hfi1_rcd_init()) while still under uctxt_lock.
 */
static int allocate_rcd_index(struct hfi1_devdata *dd,
			      struct hfi1_ctxtdata *rcd, u16 *index)
{
	unsigned long flags;
	u16 ctxt;

	spin_lock_irqsave(&dd->uctxt_lock, flags);
	for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
		if (!dd->rcd[ctxt])
			break;

	if (ctxt < dd->num_rcv_contexts) {
		rcd->ctxt = ctxt;
		dd->rcd[ctxt] = rcd;
		hfi1_rcd_init(rcd);
	}
	spin_unlock_irqrestore(&dd->uctxt_lock, flags);

	if (ctxt >= dd->num_rcv_contexts)
		return -EBUSY;

	*index = ctxt;

	return 0;
}

/**
 * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the
 * array
 * @dd: pointer to a valid devdata structure
 * @ctxt: the index of a possible rcd
 *
 * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given
 * ctxt index is valid.
 *
 * The caller is responsible for making the _put().
 *
 */
struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
						 u16 ctxt)
{
	if (ctxt < dd->num_rcv_contexts)
		return hfi1_rcd_get_by_index(dd, ctxt);

	return NULL;
}

/**
 * hfi1_rcd_get_by_index - get by index
 * @dd: pointer to a valid devdata structure
 * @ctxt: the index of a possible rcd
 *
 * We need to protect access to the rcd array.  If access is needed to
 * one or more index, get the protecting spinlock and then increment the
 * kref.
 *
 * The caller is responsible for making the _put().
 *
 */
struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt)
{
	unsigned long flags;
	struct hfi1_ctxtdata *rcd = NULL;

	spin_lock_irqsave(&dd->uctxt_lock, flags);
	if (dd->rcd[ctxt]) {
		rcd = dd->rcd[ctxt];
		/* Only hand it out if the context is still alive. */
		if (!hfi1_rcd_get(rcd))
			rcd = NULL;
	}
	spin_unlock_irqrestore(&dd->uctxt_lock, flags);

	return rcd;
}

/*
 * Common code for user and kernel context create and setup.
 * NOTE: the initial kref is done here (hf1_rcd_init()).
 */
int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
			 struct hfi1_ctxtdata **context)
{
	struct hfi1_devdata *dd = ppd->dd;
	struct hfi1_ctxtdata *rcd;
	unsigned kctxt_ngroups = 0;
	u32 base;

	/*
	 * If there are more "extra" RcvArray groups than dynamic contexts,
	 * the surplus goes to the kernel contexts.
	 */
	if (dd->rcv_entries.nctxt_extra >
	    dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)
		kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
				 (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt));
	rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa);
	if (rcd) {
		u32 rcvtids, max_entries;
		u16 ctxt;
		int ret;

		ret = allocate_rcd_index(dd, rcd, &ctxt);
		if (ret) {
			/* Index never assigned: no kref to drop, plain kfree. */
			*context = NULL;
			kfree(rcd);
			return ret;
		}

		INIT_LIST_HEAD(&rcd->qp_wait_list);
		hfi1_exp_tid_group_init(rcd);
		rcd->ppd = ppd;
		rcd->dd = dd;
		rcd->numa_id = numa;
		rcd->rcv_array_groups = dd->rcv_entries.ngroups;
		rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
		rcd->slow_handler = handle_receive_interrupt;
		rcd->do_interrupt = rcd->slow_handler;
		rcd->msix_intr = CCE_NUM_MSIX_VECTORS;

		mutex_init(&rcd->exp_mutex);
		spin_lock_init(&rcd->exp_lock);
		INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
		INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);

		hfi1_cdbg(PROC, "setting up context %u", rcd->ctxt);

		/*
		 * Calculate the context's RcvArray entry starting point.
		 * We do this here because we have to take into account all
		 * the RcvArray entries that previous context would have
		 * taken and we have to account for any extra groups assigned
		 * to the static (kernel) or dynamic (user) contexts.
		 */
		if (ctxt < dd->first_dyn_alloc_ctxt) {
			if (ctxt < kctxt_ngroups) {
				/* This kernel context gets one extra group. */
				base = ctxt * (dd->rcv_entries.ngroups + 1);
				rcd->rcv_array_groups++;
			} else {
				base = kctxt_ngroups +
					(ctxt * dd->rcv_entries.ngroups);
			}
		} else {
			u16 ct = ctxt - dd->first_dyn_alloc_ctxt;

			/* Skip past everything the kernel contexts consumed. */
			base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
				kctxt_ngroups);
			if (ct < dd->rcv_entries.nctxt_extra) {
				base += ct * (dd->rcv_entries.ngroups + 1);
				rcd->rcv_array_groups++;
			} else {
				base += dd->rcv_entries.nctxt_extra +
					(ct * dd->rcv_entries.ngroups);
			}
		}
		rcd->eager_base = base * dd->rcv_entries.group_size;

		rcd->rcvhdrq_cnt = rcvhdrcnt;
		rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
		/* RHF sits one u64 from the end of each header queue entry. */
		rcd->rhf_offset =
			rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
		/*
		 * Simple Eager buffer allocation: we have already pre-allocated
		 * the number of RcvArray entry groups. Each ctxtdata structure
		 * holds the number of groups for that context.
		 *
		 * To follow CSR requirements and maintain cacheline alignment,
		 * make sure all sizes and bases are multiples of group_size.
		 *
		 * The expected entry count is what is left after assigning
		 * eager.
		 */
		max_entries = rcd->rcv_array_groups *
			dd->rcv_entries.group_size;
		rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
		rcd->egrbufs.count = round_down(rcvtids,
						dd->rcv_entries.group_size);
		if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
			dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
				   rcd->ctxt);
			rcd->egrbufs.count = MAX_EAGER_ENTRIES;
		}
		hfi1_cdbg(PROC,
			  "ctxt%u: max Eager buffer RcvArray entries: %u",
			  rcd->ctxt, rcd->egrbufs.count);

		/*
		 * Allocate array that will hold the eager buffer accounting
		 * data.
		 * This will allocate the maximum possible buffer count based
		 * on the value of the RcvArray split parameter.
		 * The resulting value will be rounded down to the closest
		 * multiple of dd->rcv_entries.group_size.
		 */
		rcd->egrbufs.buffers =
			kcalloc_node(rcd->egrbufs.count,
				     sizeof(*rcd->egrbufs.buffers),
				     GFP_KERNEL, numa);
		if (!rcd->egrbufs.buffers)
			goto bail;
		rcd->egrbufs.rcvtids =
			kcalloc_node(rcd->egrbufs.count,
				     sizeof(*rcd->egrbufs.rcvtids),
				     GFP_KERNEL, numa);
		if (!rcd->egrbufs.rcvtids)
			goto bail;
		rcd->egrbufs.size = eager_buffer_size;
		/*
		 * The size of the buffers programmed into the RcvArray
		 * entries needs to be big enough to handle the highest
		 * MTU supported.
		 */
		if (rcd->egrbufs.size < hfi1_max_mtu) {
			rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
			hfi1_cdbg(PROC,
				  "ctxt%u: eager bufs size too small. Adjusting to %u",
				  rcd->ctxt, rcd->egrbufs.size);
		}
		rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;

		/* Applicable only for statically created kernel contexts */
		if (ctxt < dd->first_dyn_alloc_ctxt) {
			rcd->opstats = kzalloc_node(sizeof(*rcd->opstats),
						    GFP_KERNEL, numa);
			if (!rcd->opstats)
				goto bail;

			/* Initialize TID flow generations for the context */
			hfi1_kern_init_ctxt_generations(rcd);
		}

		*context = rcd;
		return 0;
	}

bail:
	/* Index was assigned: drop the init kref to tear everything down. */
	*context = NULL;
	hfi1_free_ctxt(rcd);
	return -ENOMEM;
}

/**
 * hfi1_free_ctxt - free context
 * @rcd: pointer to an initialized rcd data structure
 *
 * This wrapper is the free function that matches hfi1_create_ctxtdata().
 * When a context is done being used (kernel or user), this function is called
 * for the "final" put to match the kref init from hfi1_create_ctxtdata().
 * Other users of the context do a get/put sequence to make sure that the
 * structure isn't removed while in use.
 */
void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd)
{
	hfi1_rcd_put(rcd);
}

/*
 * Select the largest ccti value over all SLs to determine the intra-
 * packet gap for the link.
 *
 * called with cca_timer_lock held (to protect access to cca_timer
 * array), and rcu_read_lock() (to protect access to cc_state).
 */
void set_link_ipg(struct hfi1_pportdata *ppd)
{
	struct hfi1_devdata *dd = ppd->dd;
	struct cc_state *cc_state;
	int i;
	u16 cce, ccti_limit, max_ccti = 0;
	u16 shift, mult;
	u64 src;
	u32 current_egress_rate; /* Mbits /sec */
	u64 max_pkt_time;
	/*
	 * max_pkt_time is the maximum packet egress time in units
	 * of the fabric clock period 1/(805 MHz).
	 */

	cc_state = get_cc_state(ppd);

	if (!cc_state)
		/*
		 * This should _never_ happen - rcu_read_lock() is held,
		 * and set_link_ipg() should not be called if cc_state
		 * is NULL.
		 */
		return;

	/* Pick the worst (largest) ccti across all service levels. */
	for (i = 0; i < OPA_MAX_SLS; i++) {
		u16 ccti = ppd->cca_timer[i].ccti;

		if (ccti > max_ccti)
			max_ccti = ccti;
	}

	ccti_limit = cc_state->cct.ccti_limit;
	if (max_ccti > ccti_limit)
		max_ccti = ccti_limit;

	/* CCT entry: top 2 bits are the shift, low 14 bits the multiplier. */
	cce = cc_state->cct.entries[max_ccti].entry;
	shift = (cce & 0xc000) >> 14;
	mult = (cce & 0x3fff);

	current_egress_rate = active_egress_rate(ppd);

	max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);

	src = (max_pkt_time >> shift) * mult;

	src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
	src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;

	write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
}

/*
 * cca_timer_fn - per-SL congestion control timer callback.
 *
 * Decrements the SL's ccti toward its minimum and re-arms itself until
 * the minimum is reached.
 */
static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
{
	struct cca_timer *cca_timer;
	struct hfi1_pportdata *ppd;
	int sl;
	u16 ccti_timer, ccti_min;
	struct cc_state *cc_state;
	unsigned long flags;
	enum hrtimer_restart ret = HRTIMER_NORESTART;

	cca_timer = container_of(t, struct cca_timer, hrtimer);
	ppd = cca_timer->ppd;
	sl = cca_timer->sl;

	rcu_read_lock();

	cc_state = get_cc_state(ppd);

	if (!cc_state) {
		rcu_read_unlock();
		return HRTIMER_NORESTART;
	}

	/*
	 * 1) decrement ccti for SL
	 * 2) calculate IPG for link (set_link_ipg())
	 * 3) restart timer, unless ccti is at min value
	 */

	ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;

	spin_lock_irqsave(&ppd->cca_timer_lock, flags);

	if (cca_timer->ccti > ccti_min) {
		cca_timer->ccti--;
		set_link_ipg(ppd);
	}

	/* Still above the floor after decrementing: keep the timer running. */
	if (cca_timer->ccti > ccti_min) {
		unsigned long nsec = 1024 * ccti_timer;
		/* ccti_timer is in units of 1.024 usec */
		hrtimer_forward_now(t, ns_to_ktime(nsec));
		ret = HRTIMER_RESTART;
	}

	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
	rcu_read_unlock();
	return ret;
}

/*
 * Common code for initializing the physical port structure.
 */
void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
			 struct hfi1_devdata *dd, u8 hw_pidx, u32 port)
{
	int i;
	uint default_pkey_idx;
	struct cc_state *cc_state;

	ppd->dd = dd;
	ppd->hw_pidx = hw_pidx;
	ppd->port = port; /* IB port number, not index */
	ppd->prev_link_width = LINK_WIDTH_DEFAULT;
	/*
	 * There are C_VL_COUNT number of PortVLXmitWait counters.
	 * Adding 1 to C_VL_COUNT to include the PortXmitWait counter.
	 */
	for (i = 0; i < C_VL_COUNT + 1; i++) {
		ppd->port_vl_xmit_wait_last[i] = 0;
		ppd->vl_xmit_flit_cnt[i] = 0;
	}

	default_pkey_idx = 1;

	ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
	ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
	/* pkey[0] is the management pkey (full-member 0x8001). */
	ppd->pkeys[0] = 0x8001;

	INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
	INIT_WORK(&ppd->link_up_work, handle_link_up);
	INIT_WORK(&ppd->link_down_work, handle_link_down);
	INIT_WORK(&ppd->freeze_work, handle_freeze);
	INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
	INIT_WORK(&ppd->sma_message_work, handle_sma_message);
	INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
	INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link);
	INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
	INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);

	mutex_init(&ppd->hls_lock);
	spin_lock_init(&ppd->qsfp_info.qsfp_lock);

	ppd->qsfp_info.ppd = ppd;
	ppd->sm_trap_qp = 0x0;
	ppd->sa_qp = 0x1;

	ppd->hfi1_wq = NULL;

	spin_lock_init(&ppd->cca_timer_lock);

	/* One congestion control timer per service level. */
	for (i = 0; i < OPA_MAX_SLS; i++) {
		ppd->cca_timer[i].ppd = ppd;
		ppd->cca_timer[i].sl = i;
		ppd->cca_timer[i].ccti = 0;
		hrtimer_setup(&ppd->cca_timer[i].hrtimer, cca_timer_fn, CLOCK_MONOTONIC,
			      HRTIMER_MODE_REL);
	}

	ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;

	spin_lock_init(&ppd->cc_state_lock);
	spin_lock_init(&ppd->cc_log_lock);
	cc_state = kzalloc_obj(*cc_state);
	RCU_INIT_POINTER(ppd->cc_state, cc_state);
	if (!cc_state)
		goto bail;
	return;

bail:
	/* Non-fatal: the port works, just without congestion control. */
	dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port);
}

/*
 * Do initialization for device that is only needed on
 * first detect, not on resets.
 */
static int loadtime_init(struct hfi1_devdata *dd)
{
	/* currently nothing to do; kept as a hook for first-detect setup */
	return 0;
}

/**
 * init_after_reset - re-initialize after a reset
 * @dd: the hfi1_ib device
 *
 * sanity check at least some of the values after reset, and
 * ensure no receive or transmit (explicitly, in case reset
 * failed
 */
static int init_after_reset(struct hfi1_devdata *dd)
{
	int i;
	struct hfi1_ctxtdata *rcd;
	/*
	 * Ensure chip does no sends or receives, tail updates, or
	 * pioavail updates while we re-initialize.  This is mostly
	 * for the driver data structures, not chip registers.
	 */
	for (i = 0; i < dd->num_rcv_contexts; i++) {
		rcd = hfi1_rcd_get_by_index(dd, i);
		/* NOTE(review): rcd may be NULL here; presumably
		 * hfi1_rcvctrl() tolerates that - confirm. */
		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
			     HFI1_RCVCTRL_INTRAVAIL_DIS |
			     HFI1_RCVCTRL_TAILUPD_DIS, rcd);
		hfi1_rcd_put(rcd);
	}
	pio_send_control(dd, PSC_GLOBAL_DISABLE);
	for (i = 0; i < dd->num_send_contexts; i++)
		sc_disable(dd->send_contexts[i].sc);

	return 0;
}

/*
 * enable_chip - turn on PIO send and kernel context receive.
 */
static void enable_chip(struct hfi1_devdata *dd)
{
	struct hfi1_ctxtdata *rcd;
	u32 rcvmask;
	u16 i;

	/* enable PIO send */
	pio_send_control(dd, PSC_GLOBAL_ENABLE);

	/*
	 * Enable kernel ctxts' receive and receive interrupt.
	 * Other ctxts done as user opens and initializes them.
	 */
	for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
		rcd = hfi1_rcd_get_by_index(dd, i);
		if (!rcd)
			continue;
		/* Translate the context's capability flags to rcvctrl bits. */
		rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
		rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
			HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
		if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
			rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
		if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL))
			rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
		if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
			rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
		if (HFI1_CAP_IS_KSET(TID_RDMA))
			rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
		hfi1_rcvctrl(dd, rcvmask, rcd);
		sc_enable(rcd->sc);
		hfi1_rcd_put(rcd);
	}
}

/**
 * create_workqueues - create per port workqueues
 * @dd: the hfi1_ib device
 *
 * Return: 0 on success, -ENOMEM after destroying any workqueues that
 * were created.
 */
static int create_workqueues(struct hfi1_devdata *dd)
{
	int pidx;
	struct hfi1_pportdata *ppd;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (!ppd->hfi1_wq) {
			ppd->hfi1_wq =
				alloc_workqueue(
				    "hfi%d_%d",
				    WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM |
				    WQ_PERCPU,
				    HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
				    dd->unit, pidx);
			if (!ppd->hfi1_wq)
				goto wq_error;
		}
		if (!ppd->link_wq) {
			/*
			 * Make the link workqueue single-threaded to enforce
			 * serialization.
			 */
			ppd->link_wq =
				alloc_workqueue(
				    "hfi_link_%d_%d",
				    WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND,
				    1, /* max_active */
				    dd->unit, pidx);
			if (!ppd->link_wq)
				goto wq_error;
		}
	}
	return 0;
wq_error:
	pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
	/* Unwind: tear down every workqueue created so far, on all ports. */
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (ppd->hfi1_wq) {
			destroy_workqueue(ppd->hfi1_wq);
			ppd->hfi1_wq = NULL;
		}
		if (ppd->link_wq) {
			destroy_workqueue(ppd->link_wq);
			ppd->link_wq = NULL;
		}
	}
	return -ENOMEM;
}

/**
 * destroy_workqueues - destroy per port workqueues
 * @dd: the hfi1_ib device
 */
static void destroy_workqueues(struct hfi1_devdata *dd)
{
	int pidx;
	struct hfi1_pportdata *ppd;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;

		if (ppd->hfi1_wq) {
			destroy_workqueue(ppd->hfi1_wq);
			ppd->hfi1_wq = NULL;
		}
		if (ppd->link_wq) {
			destroy_workqueue(ppd->link_wq);
			ppd->link_wq = NULL;
		}
	}
}

/**
 * enable_general_intr() - Enable the IRQs that will be handled by the
 * general interrupt handler.
 * @dd: valid devdata
 *
 */
static void enable_general_intr(struct hfi1_devdata *dd)
{
	set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true);
	set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true);
	set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true);
	set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true);
	set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true);
	set_intr_bits(dd, IS_DC_START, IS_DC_END, true);
	set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true);
}

/**
 * hfi1_init - do the actual initialization sequence on the chip
 * @dd: the hfi1_ib device
 * @reinit: re-initializing, so don't allocate new memory
 *
 * Do the actual initialization sequence on the chip.
 * This is done
 * both from the init routine called from the PCI infrastructure, and
 * when we reset the chip, or detect that it was reset internally,
 * or it's administratively re-enabled.
 *
 * Memory allocation here and in called routines is only done in
 * the first case (reinit == 0).  We have to be careful, because even
 * without memory allocation, we need to re-write all the chip registers
 * TIDs, etc. after the reset or enable has completed.
 */
int hfi1_init(struct hfi1_devdata *dd, int reinit)
{
	int ret = 0, pidx, lastfail = 0;
	unsigned long len;
	u16 i;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_pportdata *ppd;

	/* Set up send low level handlers */
	dd->process_pio_send = hfi1_verbs_send_pio;
	dd->process_dma_send = hfi1_verbs_send_dma;
	dd->pio_inline_send = pio_copy;

	/* A-step silicon needs the packet-drop workaround enabled. */
	if (is_ax(dd)) {
		atomic_set(&dd->drop_packet, DROP_PACKET_ON);
		dd->do_drop = true;
	} else {
		atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
		dd->do_drop = false;
	}

	/* make sure the link is not "up" */
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		ppd->linkup = 0;
	}

	if (reinit)
		ret = init_after_reset(dd);
	else
		ret = loadtime_init(dd);
	if (ret)
		goto done;

	/* dd->rcd can be NULL if early initialization failed */
	for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) {
		/*
		 * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
		 * re-init, the simplest way to handle this is to free
		 * existing, and re-allocate.
		 * Need to re-create rest of ctxt 0 ctxtdata as well.
		 */
		rcd = hfi1_rcd_get_by_index(dd, i);
		if (!rcd)
			continue;

		lastfail = hfi1_create_rcvhdrq(dd, rcd);
		if (!lastfail)
			lastfail = hfi1_setup_eagerbufs(rcd);
		if (!lastfail)
			lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
		if (lastfail) {
			/* remember the failure but keep initializing the rest */
			dd_dev_err(dd,
				   "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
			ret = lastfail;
		}
		/* enable IRQ */
		hfi1_rcd_put(rcd);
	}

	/* Allocate enough memory for user event notification. */
	len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS *
			 sizeof(*dd->events));
	dd->events = vmalloc_user(len);
	if (!dd->events)
		dd_dev_err(dd, "Failed to allocate user events page\n");
	/*
	 * Allocate a page for device and port status.
	 * Page will be shared amongst all user processes.
	 */
	dd->status = vmalloc_user(PAGE_SIZE);
	if (!dd->status)
		dd_dev_err(dd, "Failed to allocate dev status page\n");
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (dd->status)
			/* Currently, we only have one port */
			ppd->statusp = &dd->status->port;

		set_mtu(ppd);
	}

	/* enable chip even if we have an error, so we can debug cause */
	enable_chip(dd);

done:
	/*
	 * Set status even if port serdes is not initialized
	 * so that diags will work.
	 */
	if (dd->status)
		dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
			HFI1_STATUS_INITTED;
	if (!ret) {
		/* enable all interrupts from the chip */
		enable_general_intr(dd);
		init_qsfp_int(dd);

		/* chip is OK for user apps; mark it as initialized */
		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
			ppd = dd->pport + pidx;

			/*
			 * start the serdes - must be after interrupts are
			 * enabled so we are notified when the link goes up
			 */
			lastfail = bringup_serdes(ppd);
			if (lastfail)
				dd_dev_info(dd,
					    "Failed to bring up port %u\n",
					    ppd->port);

			/*
			 * Set status even if port serdes is not initialized
			 * so that diags will work.
			 */
			if (ppd->statusp)
				*ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
					HFI1_STATUS_INITTED;
			if (!ppd->link_speed_enabled)
				continue;
		}
	}

	/* if ret is non-zero, we probably should do some cleanup here... */
	return ret;
}

/*
 * hfi1_lookup - map a unit number to its devdata, or NULL if none.
 */
struct hfi1_devdata *hfi1_lookup(int unit)
{
	return xa_load(&hfi1_dev_table, unit);
}

/*
 * Stop the timers during unit shutdown, or after an error late
 * in initialization.
 */
static void stop_timers(struct hfi1_devdata *dd)
{
	struct hfi1_pportdata *ppd;
	int pidx;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		/* only if the LED override timer was ever set up */
		if (ppd->led_override_timer.function) {
			timer_delete_sync(&ppd->led_override_timer);
			atomic_set(&ppd->led_override_timer_active, 0);
		}
	}
}

/**
 * shutdown_device - shut down a device
 * @dd: the hfi1_ib device
 *
 * This is called to make the device quiet when we are about to
 * unload the driver, and also when the device is administratively
 * disabled.  It does not free any data structures.
 * Everything it does has to be setup again by hfi1_init(dd, 1)
 */
static void shutdown_device(struct hfi1_devdata *dd)
{
	struct hfi1_pportdata *ppd;
	struct hfi1_ctxtdata *rcd;
	unsigned pidx;
	int i;

	/* only shut down once */
	if (dd->flags & HFI1_SHUTDOWN)
		return;
	dd->flags |= HFI1_SHUTDOWN;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;

		ppd->linkup = 0;
		if (ppd->statusp)
			*ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
					   HFI1_STATUS_IB_READY);
	}
	dd->flags &= ~HFI1_INITTED;

	/* mask and clean up interrupts */
	set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
	msix_clean_up_interrupts(dd);

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		for (i = 0; i < dd->num_rcv_contexts; i++) {
			rcd = hfi1_rcd_get_by_index(dd, i);
			hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
				     HFI1_RCVCTRL_CTXT_DIS |
				     HFI1_RCVCTRL_INTRAVAIL_DIS |
				     HFI1_RCVCTRL_PKEY_DIS |
				     HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd);
			hfi1_rcd_put(rcd);
		}
		/*
		 * Gracefully stop all sends allowing any in progress to
		 * trickle out first.
		 */
		for (i = 0; i < dd->num_send_contexts; i++)
			sc_flush(dd->send_contexts[i].sc);
	}

	/*
	 * Enough for anything that's going to trickle out to have actually
	 * done so.
	 */
	udelay(20);

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;

		/* disable all contexts */
		for (i = 0; i < dd->num_send_contexts; i++)
			sc_disable(dd->send_contexts[i].sc);
		/* disable the send device */
		pio_send_control(dd, PSC_GLOBAL_DISABLE);

		shutdown_led_override(ppd);

		/*
		 * Clear SerdesEnable.
		 * We can't count on interrupts since we are stopping.
		 */
		hfi1_quiet_serdes(ppd);
		if (ppd->hfi1_wq)
			flush_workqueue(ppd->hfi1_wq);
		if (ppd->link_wq)
			flush_workqueue(ppd->link_wq);
	}
	sdma_exit(dd);
}

/**
 * hfi1_free_ctxtdata - free a context's allocated data
 * @dd: the hfi1_ib device
 * @rcd: the ctxtdata structure
 *
 * free up any allocated data for a context
 * It should never change any chip state, or global driver state.
 * NULL @rcd is tolerated.
 */
void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
	u32 e;

	if (!rcd)
		return;

	if (rcd->rcvhdrq) {
		dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd),
				  rcd->rcvhdrq, rcd->rcvhdrq_dma);
		rcd->rcvhdrq = NULL;
		if (hfi1_rcvhdrtail_kvaddr(rcd)) {
			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
					  (void *)hfi1_rcvhdrtail_kvaddr(rcd),
					  rcd->rcvhdrqtailaddr_dma);
			rcd->rcvhdrtail_kvaddr = NULL;
		}
	}

	/* all the RcvArray entries should have been cleared by now */
	kfree(rcd->egrbufs.rcvtids);
	rcd->egrbufs.rcvtids = NULL;

	for (e = 0; e < rcd->egrbufs.alloced; e++) {
		if (rcd->egrbufs.buffers[e].addr)
			dma_free_coherent(&dd->pcidev->dev,
					  rcd->egrbufs.buffers[e].len,
					  rcd->egrbufs.buffers[e].addr,
					  rcd->egrbufs.buffers[e].dma);
	}
	kfree(rcd->egrbufs.buffers);
	rcd->egrbufs.alloced = 0;
	rcd->egrbufs.buffers = NULL;

	sc_free(rcd->sc);
	rcd->sc = NULL;

	vfree(rcd->subctxt_uregbase);
	vfree(rcd->subctxt_rcvegrbuf);
	vfree(rcd->subctxt_rcvhdr_base);
	kfree(rcd->opstats);

	rcd->subctxt_uregbase = NULL;
	rcd->subctxt_rcvegrbuf = NULL;
	rcd->subctxt_rcvhdr_base = NULL;
	rcd->opstats = NULL;
}

/*
 * Release our hold on the shared asic data.  If we are the last one,
 * return the structure to be finalized outside the lock.  Must be
 * holding hfi1_dev_table lock.
1134 */ 1135 static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd) 1136 { 1137 struct hfi1_asic_data *ad; 1138 int other; 1139 1140 if (!dd->asic_data) 1141 return NULL; 1142 dd->asic_data->dds[dd->hfi1_id] = NULL; 1143 other = dd->hfi1_id ? 0 : 1; 1144 ad = dd->asic_data; 1145 dd->asic_data = NULL; 1146 /* return NULL if the other dd still has a link */ 1147 return ad->dds[other] ? NULL : ad; 1148 } 1149 1150 static void finalize_asic_data(struct hfi1_devdata *dd, 1151 struct hfi1_asic_data *ad) 1152 { 1153 clean_up_i2c(dd, ad); 1154 kfree(ad); 1155 } 1156 1157 /** 1158 * hfi1_free_devdata - cleans up and frees per-unit data structure 1159 * @dd: pointer to a valid devdata structure 1160 * 1161 * It cleans up and frees all data structures set up by 1162 * by hfi1_alloc_devdata(). 1163 */ 1164 void hfi1_free_devdata(struct hfi1_devdata *dd) 1165 { 1166 struct hfi1_asic_data *ad; 1167 unsigned long flags; 1168 1169 xa_lock_irqsave(&hfi1_dev_table, flags); 1170 __xa_erase(&hfi1_dev_table, dd->unit); 1171 ad = release_asic_data(dd); 1172 xa_unlock_irqrestore(&hfi1_dev_table, flags); 1173 1174 finalize_asic_data(dd, ad); 1175 free_platform_config(dd); 1176 rcu_barrier(); /* wait for rcu callbacks to complete */ 1177 free_percpu(dd->int_counter); 1178 free_percpu(dd->rcv_limit); 1179 free_percpu(dd->send_schedule); 1180 free_percpu(dd->tx_opstats); 1181 dd->int_counter = NULL; 1182 dd->rcv_limit = NULL; 1183 dd->send_schedule = NULL; 1184 dd->tx_opstats = NULL; 1185 kfree(dd->comp_vect); 1186 dd->comp_vect = NULL; 1187 if (dd->rcvhdrtail_dummy_kvaddr) 1188 dma_free_coherent(&dd->pcidev->dev, sizeof(u64), 1189 (void *)dd->rcvhdrtail_dummy_kvaddr, 1190 dd->rcvhdrtail_dummy_dma); 1191 dd->rcvhdrtail_dummy_kvaddr = NULL; 1192 sdma_clean(dd, dd->num_sdma); 1193 rvt_dealloc_device(&dd->verbs_dev.rdi); 1194 } 1195 1196 /** 1197 * hfi1_alloc_devdata - Allocate our primary per-unit data structure. 
1198 * @pdev: Valid PCI device 1199 * @extra: How many bytes to alloc past the default 1200 * 1201 * Must be done via verbs allocator, because the verbs cleanup process 1202 * both does cleanup and free of the data structure. 1203 * "extra" is for chip-specific data. 1204 */ 1205 static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, 1206 size_t extra) 1207 { 1208 struct hfi1_devdata *dd; 1209 int ret, nports; 1210 1211 /* extra is * number of ports */ 1212 nports = extra / sizeof(struct hfi1_pportdata); 1213 1214 dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra, 1215 nports); 1216 if (!dd) 1217 return ERR_PTR(-ENOMEM); 1218 dd->num_pports = nports; 1219 dd->pport = (struct hfi1_pportdata *)(dd + 1); 1220 dd->pcidev = pdev; 1221 pci_set_drvdata(pdev, dd); 1222 1223 ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b, 1224 GFP_KERNEL); 1225 if (ret < 0) { 1226 dev_err(&pdev->dev, 1227 "Could not allocate unit ID: error %d\n", -ret); 1228 goto bail; 1229 } 1230 rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); 1231 /* 1232 * If the BIOS does not have the NUMA node information set, select 1233 * NUMA 0 so we get consistent performance. 1234 */ 1235 dd->node = pcibus_to_node(pdev->bus); 1236 if (dd->node == NUMA_NO_NODE) { 1237 dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n"); 1238 dd->node = 0; 1239 } 1240 1241 /* 1242 * Initialize all locks for the device. This needs to be as early as 1243 * possible so locks are usable. 
1244 */ 1245 spin_lock_init(&dd->sc_lock); 1246 spin_lock_init(&dd->sendctrl_lock); 1247 spin_lock_init(&dd->rcvctrl_lock); 1248 spin_lock_init(&dd->uctxt_lock); 1249 spin_lock_init(&dd->hfi1_diag_trans_lock); 1250 spin_lock_init(&dd->sc_init_lock); 1251 spin_lock_init(&dd->dc8051_memlock); 1252 seqlock_init(&dd->sc2vl_lock); 1253 spin_lock_init(&dd->sde_map_lock); 1254 spin_lock_init(&dd->pio_map_lock); 1255 mutex_init(&dd->dc8051_lock); 1256 init_waitqueue_head(&dd->event_queue); 1257 spin_lock_init(&dd->irq_src_lock); 1258 1259 dd->int_counter = alloc_percpu(u64); 1260 if (!dd->int_counter) { 1261 ret = -ENOMEM; 1262 goto bail; 1263 } 1264 1265 dd->rcv_limit = alloc_percpu(u64); 1266 if (!dd->rcv_limit) { 1267 ret = -ENOMEM; 1268 goto bail; 1269 } 1270 1271 dd->send_schedule = alloc_percpu(u64); 1272 if (!dd->send_schedule) { 1273 ret = -ENOMEM; 1274 goto bail; 1275 } 1276 1277 dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx); 1278 if (!dd->tx_opstats) { 1279 ret = -ENOMEM; 1280 goto bail; 1281 } 1282 1283 dd->comp_vect = kzalloc_obj(*dd->comp_vect); 1284 if (!dd->comp_vect) { 1285 ret = -ENOMEM; 1286 goto bail; 1287 } 1288 1289 /* allocate dummy tail memory for all receive contexts */ 1290 dd->rcvhdrtail_dummy_kvaddr = 1291 dma_alloc_coherent(&dd->pcidev->dev, sizeof(u64), 1292 &dd->rcvhdrtail_dummy_dma, GFP_KERNEL); 1293 if (!dd->rcvhdrtail_dummy_kvaddr) { 1294 ret = -ENOMEM; 1295 goto bail; 1296 } 1297 1298 atomic_set(&dd->ipoib_rsm_usr_num, 0); 1299 return dd; 1300 1301 bail: 1302 hfi1_free_devdata(dd); 1303 return ERR_PTR(ret); 1304 } 1305 1306 /* 1307 * Called from freeze mode handlers, and from PCI error 1308 * reporting code. Should be paranoid about state of 1309 * system and data structures. 
 */
void hfi1_disable_after_error(struct hfi1_devdata *dd)
{
	if (dd->flags & HFI1_INITTED) {
		u32 pidx;

		dd->flags &= ~HFI1_INITTED;
		if (dd->pport)
			for (pidx = 0; pidx < dd->num_pports; ++pidx) {
				struct hfi1_pportdata *ppd;

				ppd = dd->pport + pidx;
				/* only touch the link if the chip is reachable */
				if (dd->flags & HFI1_PRESENT)
					set_link_state(ppd, HLS_DN_DISABLE);

				if (ppd->statusp)
					*ppd->statusp &= ~HFI1_STATUS_IB_READY;
			}
	}

	/*
	 * Mark as having had an error for driver, and also
	 * for /sys and status word mapped to user programs.
	 * This marks unit as not usable, until reset.
	 */
	if (dd->status)
		dd->status->dev |= HFI1_STATUS_HWERROR;
}

static void remove_one(struct pci_dev *);
static int init_one(struct pci_dev *, const struct pci_device_id *);
static void shutdown_one(struct pci_dev *);

#define DRIVER_LOAD_MSG		"Cornelis " DRIVER_NAME " loaded: "
#define PFX			DRIVER_NAME ": "

/* PCI device IDs this driver claims */
const struct pci_device_id hfi1_pci_tbl[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
	{ 0, }
};

MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);

static struct pci_driver hfi1_pci_driver = {
	.name = DRIVER_NAME,
	.probe = init_one,
	.remove = remove_one,
	.shutdown = shutdown_one,
	.id_table = hfi1_pci_tbl,
	.err_handler = &hfi1_pci_err_handler,
};

/* Sum the per-VL krcvqs module parameter array into n_krcvqs. */
static void __init compute_krcvqs(void)
{
	int i;

	for (i = 0; i < krcvqsset; i++)
		n_krcvqs += krcvqs[i];
}

/*
 * Do all the generic driver unit- and chip-independent memory
 * allocation and initialization.
 */
static int __init hfi1_mod_init(void)
{
	int ret;

	ret = dev_init();
	if (ret)
		goto bail;

	ret = node_affinity_init();
	if (ret)
		goto bail;

	/* validate max MTU before any devices start */
	if (!valid_opa_max_mtu(hfi1_max_mtu)) {
		pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
		       hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
		hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
	}
	/* valid CUs run from 1-128 in powers of 2 */
	if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
		hfi1_cu = 1;
	/* valid credit return threshold is 0-100, variable is unsigned */
	if (user_credit_return_threshold > 100)
		user_credit_return_threshold = 100;

	compute_krcvqs();
	/*
	 * sanitize receive interrupt count, time must wait until after
	 * the hardware type is known
	 */
	if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
		rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
	/* reject invalid combinations */
	if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
		pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
		rcv_intr_count = 1;
	}
	if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
		/*
		 * Avoid indefinite packet delivery by requiring a timeout
		 * if count is > 1.
		 */
		pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
		rcv_intr_timeout = 1;
	}
	if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
		/*
		 * The dynamic algorithm expects a non-zero timeout
		 * and a count > 1.
		 */
		pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
		rcv_intr_dynamic = 0;
	}

	/* sanitize link CRC options */
	link_crc_mask &= SUPPORTED_CRCS;

	ret = opfn_init();
	if (ret < 0) {
		pr_err("Failed to allocate opfn_wq");
		goto bail_dev;
	}

	/*
	 * These must be called before the driver is registered with
	 * the PCI subsystem.
	 */
	hfi1_dbg_init();
	ret = pci_register_driver(&hfi1_pci_driver);
	if (ret < 0) {
		pr_err("Unable to register driver: error %d\n", -ret);
		goto bail_dev;
	}
	goto bail; /* all OK */

bail_dev:
	hfi1_dbg_exit();
	dev_cleanup();
bail:
	return ret;
}

module_init(hfi1_mod_init);

/*
 * Do the non-unit driver cleanup, memory free, etc. at unload.
 */
static void __exit hfi1_mod_cleanup(void)
{
	pci_unregister_driver(&hfi1_pci_driver);
	opfn_exit();
	node_affinity_destroy_all();
	hfi1_dbg_exit();

	/* all units must have been removed by the PCI unregister above */
	WARN_ON(!xa_empty(&hfi1_dev_table));
	dispose_firmware();	/* asymmetric with obtain_firmware() */
	dev_cleanup();
}

module_exit(hfi1_mod_cleanup);

/* this can only be called after a successful initialization */
static void cleanup_device_data(struct hfi1_devdata *dd)
{
	int ctxt;
	int pidx;

	/* users can't do anything more with chip */
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		struct hfi1_pportdata *ppd = &dd->pport[pidx];
		struct cc_state *cc_state;
		int i;

		if (ppd->statusp)
			*ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;

		for (i = 0; i < OPA_MAX_SLS; i++)
			hrtimer_cancel(&ppd->cca_timer[i].hrtimer);

		/* detach the congestion-control state under its lock */
		spin_lock(&ppd->cc_state_lock);
		cc_state = get_cc_state_protected(ppd);
		RCU_INIT_POINTER(ppd->cc_state, NULL);
		spin_unlock(&ppd->cc_state_lock);

		/* readers may still hold an RCU reference; defer the free */
		if (cc_state)
			kfree_rcu(cc_state, rcu);
	}

	free_credit_return(dd);

	/*
	 * Free any resources still in use (usually just kernel contexts)
	 * at unload; we do for ctxtcnt, because that's what we allocate.
	 */
	for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) {
		struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];

		if (rcd) {
			hfi1_free_ctxt_rcv_groups(rcd);
			hfi1_free_ctxt(rcd);
		}
	}

	kfree(dd->rcd);
	dd->rcd = NULL;

	free_pio_map(dd);
	/* must follow rcv context free - need to remove rcv's hooks */
	for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
		sc_free(dd->send_contexts[ctxt].sc);
	dd->num_send_contexts = 0;
	kfree(dd->send_contexts);
	dd->send_contexts = NULL;
	kfree(dd->hw_to_sw);
	dd->hw_to_sw = NULL;
	kfree(dd->boardname);
	vfree(dd->events);
	vfree(dd->status);
}

/*
 * Clean up on unit shutdown, or error during unit load after
 * successful initialization.
 */
static void postinit_cleanup(struct hfi1_devdata *dd)
{
	hfi1_start_cleanup(dd);
	hfi1_comp_vectors_clean_up(dd);
	hfi1_dev_affinity_clean_up(dd);

	hfi1_pcie_ddcleanup(dd);
	hfi1_pcie_cleanup(dd->pcidev);

	cleanup_device_data(dd);

	/* frees dd itself; dd must not be touched after this */
	hfi1_free_devdata(dd);
}

/* PCI probe entry point: allocate, validate, and bring up one device. */
static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	int ret = 0, j, pidx, initfail;
	struct hfi1_devdata *dd;
	struct hfi1_pportdata *ppd;

	/* First, lock the non-writable module parameters */
	HFI1_CAP_LOCK();

	/* Validate dev ids */
	if (!(ent->device == PCI_DEVICE_ID_INTEL0 ||
	      ent->device == PCI_DEVICE_ID_INTEL1)) {
		dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n",
			ent->device);
		ret = -ENODEV;
		goto bail;
	}

	/* Allocate the dd so we can get to work */
	dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
				sizeof(struct hfi1_pportdata));
	if (IS_ERR(dd)) {
		ret = PTR_ERR(dd);
		goto bail;
	}

	/* Validate some global module parameters */
	ret = hfi1_validate_rcvhdrcnt(dd, rcvhdrcnt);
	if (ret)
		goto bail;

	/* use the encoding function as a sanitization check */
	if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
		dd_dev_err(dd, "Invalid HdrQ Entry size %u\n",
			   hfi1_hdrq_entsize);
		ret = -EINVAL;
		goto bail;
	}

	/* The receive eager buffer size must be set before the receive
	 * contexts are created.
	 *
	 * Set the eager buffer size.  Validate that it falls in a range
	 * allowed by the hardware - all powers of 2 between the min and
	 * max.  The maximum valid MTU is within the eager buffer range
	 * so we do not need to cap the max_mtu by an eager buffer size
	 * setting.
	 */
	if (eager_buffer_size) {
		if (!is_power_of_2(eager_buffer_size))
			eager_buffer_size =
				roundup_pow_of_two(eager_buffer_size);
		eager_buffer_size =
			clamp_val(eager_buffer_size,
				  MIN_EAGER_BUFFER * 8,
				  MAX_EAGER_BUFFER_TOTAL);
		dd_dev_info(dd, "Eager buffer size %u\n",
			    eager_buffer_size);
	} else {
		dd_dev_err(dd, "Invalid Eager buffer size of 0\n");
		ret = -EINVAL;
		goto bail;
	}

	/* restrict value of hfi1_rcvarr_split */
	hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);

	ret = hfi1_pcie_init(dd);
	if (ret)
		goto bail;

	/*
	 * Do device-specific initialization, function table setup, dd
	 * allocation, etc.
	 */
	ret = hfi1_init_dd(dd);
	if (ret)
		goto clean_bail; /* error already printed */

	ret = create_workqueues(dd);
	if (ret)
		goto clean_bail;

	/* do the generic initialization */
	initfail = hfi1_init(dd, 0);

	/* register even on init failure so diags can probe the unit */
	ret = hfi1_register_ib_device(dd);

	/*
	 * Now ready for use.  this should be cleared whenever we
	 * detect a reset, or initiate one.  If earlier failure,
	 * we still create devices, so diags, etc. can be used
	 * to determine cause of problem.
	 */
	if (!initfail && !ret) {
		dd->flags |= HFI1_INITTED;
		/* create debugfs files after init and ib register */
		hfi1_dbg_ibdev_init(&dd->verbs_dev);
	}

	j = hfi1_device_create(dd);
	if (j)
		dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);

	/* unwind everything if either the generic init or IB registration failed */
	if (initfail || ret) {
		msix_clean_up_interrupts(dd);
		stop_timers(dd);
		flush_workqueue(ib_wq);
		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
			hfi1_quiet_serdes(dd->pport + pidx);
			ppd = dd->pport + pidx;
			if (ppd->hfi1_wq) {
				destroy_workqueue(ppd->hfi1_wq);
				ppd->hfi1_wq = NULL;
			}
			if (ppd->link_wq) {
				destroy_workqueue(ppd->link_wq);
				ppd->link_wq = NULL;
			}
		}
		/* only undo the steps that actually succeeded above */
		if (!j)
			hfi1_device_remove(dd);
		if (!ret)
			hfi1_unregister_ib_device(dd);
		postinit_cleanup(dd);
		if (initfail)
			ret = initfail;
		goto bail; /* everything already cleaned */
	}

	sdma_start(dd);

	return 0;

clean_bail:
	hfi1_pcie_cleanup(pdev);
bail:
	return ret;
}

static void wait_for_clients(struct hfi1_devdata *dd)
{
	/*
	 * Remove the device init value and complete the device if there is
	 * no clients or wait for active clients to finish.
	 */
	if (refcount_dec_and_test(&dd->user_refcount))
		complete(&dd->user_comp);

	wait_for_completion(&dd->user_comp);
}

static void remove_one(struct pci_dev *pdev)
{
	struct hfi1_devdata *dd = pci_get_drvdata(pdev);

	/* close debugfs files before ib unregister */
	hfi1_dbg_ibdev_exit(&dd->verbs_dev);

	/* remove the /dev hfi1 interface */
	hfi1_device_remove(dd);

	/* wait for existing user space clients to finish */
	wait_for_clients(dd);

	/* unregister from IB core */
	hfi1_unregister_ib_device(dd);

	/* free netdev data */
	hfi1_free_rx(dd);

	/*
	 * Disable the IB link, disable interrupts on the device,
	 * clear dma engines, etc.
	 */
	shutdown_device(dd);
	destroy_workqueues(dd);

	stop_timers(dd);

	/* wait until all of our (qsfp) queue_work() calls complete */
	flush_workqueue(ib_wq);

	postinit_cleanup(dd);
}

/* PCI shutdown hook: quiesce the hardware; no memory is freed here. */
static void shutdown_one(struct pci_dev *pdev)
{
	struct hfi1_devdata *dd = pci_get_drvdata(pdev);

	shutdown_device(dd);
}

/**
 * hfi1_create_rcvhdrq - create a receive header queue
 * @dd: the hfi1_ib device
 * @rcd: the context data
 *
 * This must be contiguous memory (from an i/o perspective), and must be
 * DMA'able (which means for some systems, it will go through an IOMMU,
 * or be forced into a low address range).
 */
int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
	unsigned amt;

	/* allocate only once; re-calls just reprogram the chip registers */
	if (!rcd->rcvhdrq) {
		amt = rcvhdrq_size(rcd);

		rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt,
						  &rcd->rcvhdrq_dma,
						  GFP_KERNEL);

		if (!rcd->rcvhdrq) {
			dd_dev_err(dd,
				   "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
				   amt, rcd->ctxt);
			goto bail;
		}

		/* a DMA'd tail page is only needed when the RTAIL cap is set */
		if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
		    HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
			rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
								    PAGE_SIZE,
								    &rcd->rcvhdrqtailaddr_dma,
								    GFP_KERNEL);
			if (!rcd->rcvhdrtail_kvaddr)
				goto bail_free;
		}
	}

	set_hdrq_regs(rcd->dd, rcd->ctxt, rcd->rcvhdrqentsize,
		      rcd->rcvhdrq_cnt);

	return 0;

bail_free:
	dd_dev_err(dd,
		   "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
		   rcd->ctxt);
	dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
			  rcd->rcvhdrq_dma);
	rcd->rcvhdrq = NULL;
bail:
	return -ENOMEM;
}

/**
 * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user
 * contexts.
 * @rcd: the context we are setting up.
 *
 * Allocate the eager TID buffers and program them into the chip.
 * They are no longer completely contiguous, we do multiple allocation
 * calls.  Otherwise we get the OOM code involved, by asking for too
 * much per call, with disastrous results on some kernels.
 */
int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
{
	struct hfi1_devdata *dd = rcd->dd;
	u32 max_entries, egrtop, alloced_bytes = 0;
	u16 order, idx = 0;
	int ret = 0;
	u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);

	/*
	 * The minimum size of the eager buffers is a groups of MTU-sized
	 * buffers.
	 * The global eager_buffer_size parameter is checked against the
	 * theoretical lower limit of the value. Here, we check against the
	 * MTU.
	 */
	if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
		rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
	/*
	 * If using one-pkt-per-egr-buffer, lower the eager buffer
	 * size to the max MTU (page-aligned).
	 */
	if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
		rcd->egrbufs.rcvtid_size = round_mtu;

	/*
	 * Eager buffers sizes of 1MB or less require smaller TID sizes
	 * to satisfy the "multiple of 8 RcvArray entries" requirement.
	 */
	if (rcd->egrbufs.size <= (1 << 20))
		rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
			rounddown_pow_of_two(rcd->egrbufs.size / 8));

	/* allocate buffers, halving the per-buffer size on DMA failure */
	while (alloced_bytes < rcd->egrbufs.size &&
	       rcd->egrbufs.alloced < rcd->egrbufs.count) {
		rcd->egrbufs.buffers[idx].addr =
			dma_alloc_coherent(&dd->pcidev->dev,
					   rcd->egrbufs.rcvtid_size,
					   &rcd->egrbufs.buffers[idx].dma,
					   GFP_KERNEL);
		if (rcd->egrbufs.buffers[idx].addr) {
			rcd->egrbufs.buffers[idx].len =
				rcd->egrbufs.rcvtid_size;
			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
				rcd->egrbufs.buffers[idx].addr;
			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma =
				rcd->egrbufs.buffers[idx].dma;
			rcd->egrbufs.alloced++;
			alloced_bytes += rcd->egrbufs.rcvtid_size;
			idx++;
		} else {
			u32 new_size, i, j;
			u64 offset = 0;

			/*
			 * Fail the eager buffer allocation if:
			 *   - we are already using the lowest acceptable size
			 *   - we are using one-pkt-per-egr-buffer (this implies
			 *     that we are accepting only one size)
			 */
			if (rcd->egrbufs.rcvtid_size == round_mtu ||
			    !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
				dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
					   rcd->ctxt);
				ret = -ENOMEM;
				goto bail_rcvegrbuf_phys;
			}

			new_size = rcd->egrbufs.rcvtid_size / 2;

			/*
			 * If the first attempt to allocate memory failed, don't
			 * fail everything but continue with the next lower
			 * size.
			 */
			if (idx == 0) {
				rcd->egrbufs.rcvtid_size = new_size;
				continue;
			}

			/*
			 * Re-partition already allocated buffers to a smaller
			 * size.
			 */
			rcd->egrbufs.alloced = 0;
			for (i = 0, j = 0, offset = 0; j < idx; i++) {
				if (i >= rcd->egrbufs.count)
					break;
				rcd->egrbufs.rcvtids[i].dma =
					rcd->egrbufs.buffers[j].dma + offset;
				rcd->egrbufs.rcvtids[i].addr =
					rcd->egrbufs.buffers[j].addr + offset;
				rcd->egrbufs.alloced++;
				/* advance to the next buffer once this one is consumed */
				if ((rcd->egrbufs.buffers[j].dma + offset +
				     new_size) ==
				    (rcd->egrbufs.buffers[j].dma +
				     rcd->egrbufs.buffers[j].len)) {
					j++;
					offset = 0;
				} else {
					offset += new_size;
				}
			}
			rcd->egrbufs.rcvtid_size = new_size;
		}
	}
	rcd->egrbufs.numbufs = idx;
	rcd->egrbufs.size = alloced_bytes;

	hfi1_cdbg(PROC,
		  "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB",
		  rcd->ctxt, rcd->egrbufs.alloced,
		  rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);

	/*
	 * Set the contexts rcv array head update threshold to the closest
	 * power of 2 (so we can use a mask instead of modulo) below half
	 * the allocated entries.
	 */
	rcd->egrbufs.threshold =
		rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
	/*
	 * Compute the expected RcvArray entry base. This is done after
	 * allocating the eager buffers in order to maximize the
	 * expected RcvArray entries for the context.
	 */
	max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
	egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
	rcd->expected_count = max_entries - egrtop;
	if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
		rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;

	rcd->expected_base = rcd->eager_base + egrtop;
	hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u",
		  rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
		  rcd->eager_base, rcd->expected_base);

	if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
		hfi1_cdbg(PROC,
			  "ctxt%u: current Eager buffer size is invalid %u",
			  rcd->ctxt, rcd->egrbufs.rcvtid_size);
		ret = -EINVAL;
		goto bail_rcvegrbuf_phys;
	}

	/* program every allocated eager buffer into the chip's RcvArray */
	for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
		hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
			     rcd->egrbufs.rcvtids[idx].dma, order);
		cond_resched();
	}

	return 0;

bail_rcvegrbuf_phys:
	for (idx = 0; idx < rcd->egrbufs.alloced &&
	     rcd->egrbufs.buffers[idx].addr;
	     idx++) {
		dma_free_coherent(&dd->pcidev->dev,
				  rcd->egrbufs.buffers[idx].len,
				  rcd->egrbufs.buffers[idx].addr,
				  rcd->egrbufs.buffers[idx].dma);
		rcd->egrbufs.buffers[idx].addr = NULL;
		rcd->egrbufs.buffers[idx].dma = 0;
		rcd->egrbufs.buffers[idx].len = 0;
	}

	return ret;
}