/*
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#include <linux/delay.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/hrtimer.h>
#include <rdma/rdma_vt.h>

#include "hfi.h"
#include "device.h"
#include "common.h"
#include "trace.h"
#include "mad.h"
#include "sdma.h"
#include "debugfs.h"
#include "verbs.h"
#include "aspm.h"

#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt

/*
 * min buffers we want to have per context, after driver
 */
#define HFI1_MIN_USER_CTXT_BUFCNT 7

#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */

/*
 * Number of user receive contexts we are configured to use (to allow for more
 * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
 */
int num_user_contexts = -1;
module_param_named(num_user_contexts, num_user_contexts, uint, S_IRUGO);
MODULE_PARM_DESC(
	num_user_contexts, "Set max number of user contexts to use");

uint krcvqs[RXE_NUM_DATA_VL];
int krcvqsset;
module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");

/* computed based on above array */
unsigned n_krcvqs;

static unsigned hfi1_rcvarr_split = 25;
module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");

static uint eager_buffer_size = (2 << 20); /* 2MB */
module_param(eager_buffer_size, uint, S_IRUGO);
MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");

static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");

static uint hfi1_hdrq_entsize = 32;
module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B");

unsigned int user_credit_return_threshold = 33; /* default is 33% */
module_param(user_credit_return_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");

static inline u64 encode_rcv_header_entry_size(u16);

static struct idr hfi1_unit_table;
u32 hfi1_cpulist_count;
unsigned long *hfi1_cpulist;

/*
 * Common code for creating the receive context array.
 */
int hfi1_create_ctxts(struct hfi1_devdata *dd)
{
	unsigned i;
	int ret;

	/* Control context has to be always 0 */
	BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);

	dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd),
			       GFP_KERNEL, dd->node);
	if (!dd->rcd)
		goto nomem;

	/* create one or more kernel contexts */
	for (i = 0; i < dd->first_user_ctxt; ++i) {
		struct hfi1_pportdata *ppd;
		struct hfi1_ctxtdata *rcd;

		ppd = dd->pport + (i % dd->num_pports);
		rcd = hfi1_create_ctxtdata(ppd, i, dd->node);
		if (!rcd) {
			dd_dev_err(dd,
				   "Unable to allocate kernel receive context, failing\n");
			goto nomem;
		}
		/*
		 * Set up the kernel context flags here and now because they
		 * use default values for all receive side memories.  User
		 * contexts will be handled as they are created.
		 */
		rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
			HFI1_CAP_KGET(NODROP_RHQ_FULL) |
			HFI1_CAP_KGET(NODROP_EGR_FULL) |
			HFI1_CAP_KGET(DMA_RTAIL);

		/* Control context must use DMA_RTAIL */
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			rcd->flags |= HFI1_CAP_DMA_RTAIL;
		rcd->seq_cnt = 1;

		rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
		if (!rcd->sc) {
			dd_dev_err(dd,
				   "Unable to allocate kernel send context, failing\n");
			dd->rcd[rcd->ctxt] = NULL;
			hfi1_free_ctxtdata(dd, rcd);
			goto nomem;
		}

		ret = hfi1_init_ctxt(rcd->sc);
		if (ret < 0) {
			dd_dev_err(dd,
				   "Failed to setup kernel receive context, failing\n");
			sc_free(rcd->sc);
			dd->rcd[rcd->ctxt] = NULL;
			hfi1_free_ctxtdata(dd, rcd);
			ret = -EFAULT;
			goto bail;
		}
	}

	/*
	 * Initialize aspm, to be done after gen3 transition and setting up
	 * contexts and before enabling interrupts
	 */
	aspm_init(dd);

	return 0;
nomem:
	ret = -ENOMEM;
bail:
	kfree(dd->rcd);
	dd->rcd = NULL;
	return ret;
}

/*
 * Common code for user and kernel context setup.
 */
struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
					   int numa)
{
	struct hfi1_devdata *dd = ppd->dd;
	struct hfi1_ctxtdata *rcd;
	unsigned kctxt_ngroups = 0;
	u32 base;

	if (dd->rcv_entries.nctxt_extra >
	    dd->num_rcv_contexts - dd->first_user_ctxt)
		kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
				 (dd->num_rcv_contexts - dd->first_user_ctxt));
	rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
	if (rcd) {
		u32 rcvtids, max_entries;

		hfi1_cdbg(PROC, "setting up context %u\n", ctxt);

		INIT_LIST_HEAD(&rcd->qp_wait_list);
		rcd->ppd = ppd;
		rcd->dd = dd;
		rcd->cnt = 1;
		rcd->ctxt = ctxt;
		dd->rcd[ctxt] = rcd;
		rcd->numa_id = numa;
		rcd->rcv_array_groups = dd->rcv_entries.ngroups;

		mutex_init(&rcd->exp_lock);

		/*
		 * Calculate the context's RcvArray entry starting point.
		 * We do this here because we have to take into account all
		 * the RcvArray entries that previous contexts would have
		 * taken and we have to account for any extra groups
		 * assigned to the kernel or user contexts.
		 */
		if (ctxt < dd->first_user_ctxt) {
			if (ctxt < kctxt_ngroups) {
				base = ctxt * (dd->rcv_entries.ngroups + 1);
				rcd->rcv_array_groups++;
			} else
				base = kctxt_ngroups +
					(ctxt * dd->rcv_entries.ngroups);
		} else {
			u16 ct = ctxt - dd->first_user_ctxt;

			base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
				kctxt_ngroups);
			if (ct < dd->rcv_entries.nctxt_extra) {
				base += ct * (dd->rcv_entries.ngroups + 1);
				rcd->rcv_array_groups++;
			} else
				base += dd->rcv_entries.nctxt_extra +
					(ct * dd->rcv_entries.ngroups);
		}
		rcd->eager_base = base * dd->rcv_entries.group_size;

		/* Validate and initialize Rcv Hdr Q variables */
		if (rcvhdrcnt % HDRQ_INCREMENT) {
			dd_dev_err(dd,
				   "ctxt%u: header queue count %d must be divisible by %lu\n",
				   rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
			goto bail;
		}
		rcd->rcvhdrq_cnt = rcvhdrcnt;
		rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
		/*
		 * Simple Eager buffer allocation: we have already pre-allocated
		 * the number of RcvArray entry groups.  Each ctxtdata structure
		 * holds the number of groups for that context.
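		 *
		 * For example, with the default rcvarr_split of 25, roughly a
		 * quarter of the context's RcvArray entries (rounded down to a
		 * multiple of group_size below) become eager buffer entries,
		 * and the remainder is left for the expected (TID) entries.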
		 *
		 * To follow CSR requirements and maintain cacheline alignment,
		 * make sure all sizes and bases are multiples of group_size.
		 *
		 * The expected entry count is what is left after assigning
		 * eager.
		 */
		max_entries = rcd->rcv_array_groups *
			dd->rcv_entries.group_size;
		rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
		rcd->egrbufs.count = round_down(rcvtids,
						dd->rcv_entries.group_size);
		if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
			dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
				   rcd->ctxt);
			rcd->egrbufs.count = MAX_EAGER_ENTRIES;
		}
		hfi1_cdbg(PROC,
			  "ctxt%u: max Eager buffer RcvArray entries: %u\n",
			  rcd->ctxt, rcd->egrbufs.count);

		/*
		 * Allocate array that will hold the eager buffer accounting
		 * data.
		 * This will allocate the maximum possible buffer count based
		 * on the value of the RcvArray split parameter.
		 * The resulting value will be rounded down to the closest
		 * multiple of dd->rcv_entries.group_size.
		 */
		rcd->egrbufs.buffers = kcalloc(rcd->egrbufs.count,
					       sizeof(*rcd->egrbufs.buffers),
					       GFP_KERNEL);
		if (!rcd->egrbufs.buffers)
			goto bail;
		rcd->egrbufs.rcvtids = kcalloc(rcd->egrbufs.count,
					       sizeof(*rcd->egrbufs.rcvtids),
					       GFP_KERNEL);
		if (!rcd->egrbufs.rcvtids)
			goto bail;
		rcd->egrbufs.size = eager_buffer_size;
		/*
		 * The size of the buffers programmed into the RcvArray
		 * entries needs to be big enough to handle the highest
		 * MTU supported.
		 */
		if (rcd->egrbufs.size < hfi1_max_mtu) {
			rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
			hfi1_cdbg(PROC,
				  "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
				  rcd->ctxt, rcd->egrbufs.size);
		}
		rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;

		if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
			rcd->opstats = kzalloc(sizeof(*rcd->opstats),
					       GFP_KERNEL);
			if (!rcd->opstats)
				goto bail;
		}
	}
	return rcd;
bail:
	kfree(rcd->egrbufs.rcvtids);
	kfree(rcd->egrbufs.buffers);
	kfree(rcd);
	return NULL;
}

/*
 * Convert a receive header entry size to the encoding used in the CSR.
 *
 * Return zero if the given size is invalid.
 */
static inline u64 encode_rcv_header_entry_size(u16 size)
{
	/* there are only 3 valid receive header entry sizes */
	if (size == 2)
		return 1;
	else if (size == 16)
		return 2;
	else if (size == 32)
		return 4;
	return 0; /* invalid */
}

/*
 * Select the largest ccti value over all SLs to determine the intra-
 * packet gap for the link.
 *
 * called with cca_timer_lock held (to protect access to cca_timer
 * array), and rcu_read_lock() (to protect access to cc_state).
 */
void set_link_ipg(struct hfi1_pportdata *ppd)
{
	struct hfi1_devdata *dd = ppd->dd;
	struct cc_state *cc_state;
	int i;
	u16 cce, ccti_limit, max_ccti = 0;
	u16 shift, mult;
	u64 src;
	u32 current_egress_rate; /* Mbits/sec */
	u32 max_pkt_time;
	/*
	 * max_pkt_time is the maximum packet egress time in units
	 * of the fabric clock period 1/(805 MHz).
	 */

	cc_state = get_cc_state(ppd);

	if (!cc_state)
		/*
		 * This should _never_ happen - rcu_read_lock() is held,
		 * and set_link_ipg() should not be called if cc_state
		 * is NULL.
		 */
		return;

	for (i = 0; i < OPA_MAX_SLS; i++) {
		u16 ccti = ppd->cca_timer[i].ccti;

		if (ccti > max_ccti)
			max_ccti = ccti;
	}

	ccti_limit = cc_state->cct.ccti_limit;
	if (max_ccti > ccti_limit)
		max_ccti = ccti_limit;

	cce = cc_state->cct.entries[max_ccti].entry;
	shift = (cce & 0xc000) >> 14;
	mult = (cce & 0x3fff);

	current_egress_rate = active_egress_rate(ppd);

	max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);

	src = (max_pkt_time >> shift) * mult;

	src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
	src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;

	write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
}

static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
{
	struct cca_timer *cca_timer;
	struct hfi1_pportdata *ppd;
	int sl;
	u16 ccti_timer, ccti_min;
	struct cc_state *cc_state;
	unsigned long flags;
	enum hrtimer_restart ret = HRTIMER_NORESTART;

	cca_timer = container_of(t, struct cca_timer, hrtimer);
	ppd = cca_timer->ppd;
	sl = cca_timer->sl;

	rcu_read_lock();

	cc_state = get_cc_state(ppd);

	if (!cc_state) {
		rcu_read_unlock();
		return HRTIMER_NORESTART;
	}

	/*
	 * 1) decrement ccti for SL
	 * 2) calculate IPG for link (set_link_ipg())
	 * 3) restart timer, unless ccti is at min value
	 */

	ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;

	spin_lock_irqsave(&ppd->cca_timer_lock, flags);

	if (cca_timer->ccti > ccti_min) {
		cca_timer->ccti--;
		set_link_ipg(ppd);
	}

	if (cca_timer->ccti > ccti_min) {
		unsigned long nsec = 1024 * ccti_timer;
		/* ccti_timer is in units of 1.024 usec */
		hrtimer_forward_now(t, ns_to_ktime(nsec));
		ret = HRTIMER_RESTART;
	}

	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
	rcu_read_unlock();
	return ret;
}

/*
 * Common code for initializing the physical port structure.
 */
void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
			 struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
{
	int i, size;
	uint default_pkey_idx;

	ppd->dd = dd;
	ppd->hw_pidx = hw_pidx;
	ppd->port = port; /* IB port number, not index */

	default_pkey_idx = 1;

	ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
	if (loopback) {
		hfi1_early_err(&pdev->dev,
			       "Faking data partition 0x8001 in idx %u\n",
			       !default_pkey_idx);
		ppd->pkeys[!default_pkey_idx] = 0x8001;
	}

	INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
	INIT_WORK(&ppd->link_up_work, handle_link_up);
	INIT_WORK(&ppd->link_down_work, handle_link_down);
	INIT_WORK(&ppd->freeze_work, handle_freeze);
	INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
	INIT_WORK(&ppd->sma_message_work, handle_sma_message);
	INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
	INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
	INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);

	mutex_init(&ppd->hls_lock);
	spin_lock_init(&ppd->sdma_alllock);
	spin_lock_init(&ppd->qsfp_info.qsfp_lock);

	ppd->qsfp_info.ppd = ppd;
	ppd->sm_trap_qp = 0x0;
	ppd->sa_qp = 0x1;

	ppd->hfi1_wq = NULL;

	spin_lock_init(&ppd->cca_timer_lock);

	for (i = 0; i < OPA_MAX_SLS; i++) {
		hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
			     HRTIMER_MODE_REL);
		ppd->cca_timer[i].ppd = ppd;
		ppd->cca_timer[i].sl = i;
		ppd->cca_timer[i].ccti = 0;
		ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
	}

	ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;

	spin_lock_init(&ppd->cc_state_lock);
	spin_lock_init(&ppd->cc_log_lock);
	size = sizeof(struct cc_state);
	RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL));
	if (!rcu_dereference(ppd->cc_state))
		goto bail;
	return;

bail:

	hfi1_early_err(&pdev->dev,
		       "Congestion Control Agent disabled for port %d\n", port);
}

/*
 * Do initialization for device that is only needed on
 * first detect, not on resets.
 */
static int loadtime_init(struct hfi1_devdata *dd)
{
	return 0;
}

/**
 * init_after_reset - re-initialize after a reset
 * @dd: the hfi1_ib device
 *
 * sanity check at least some of the values after reset, and
 * ensure no receive or transmit (explicitly, in case reset
 * failed)
 */
static int init_after_reset(struct hfi1_devdata *dd)
{
	int i;

	/*
	 * Ensure chip does no sends or receives, tail updates, or
	 * pioavail updates while we re-initialize.  This is mostly
	 * for the driver data structures, not chip registers.
	 */
	for (i = 0; i < dd->num_rcv_contexts; i++)
		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
				  HFI1_RCVCTRL_INTRAVAIL_DIS |
				  HFI1_RCVCTRL_TAILUPD_DIS, i);
	pio_send_control(dd, PSC_GLOBAL_DISABLE);
	for (i = 0; i < dd->num_send_contexts; i++)
		sc_disable(dd->send_contexts[i].sc);

	return 0;
}

static void enable_chip(struct hfi1_devdata *dd)
{
	u32 rcvmask;
	u32 i;

	/* enable PIO send */
	pio_send_control(dd, PSC_GLOBAL_ENABLE);

	/*
	 * Enable kernel ctxts' receive and receive interrupt.
	 * Other ctxts done as user opens and initializes them.
	 */
	for (i = 0; i < dd->first_user_ctxt; ++i) {
		rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
		rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
			HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
		if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR))
			rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
		if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL))
			rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
		if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
			rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
		hfi1_rcvctrl(dd, rcvmask, i);
		sc_enable(dd->rcd[i]->sc);
	}
}

/**
 * create_workqueues - create per port workqueues
 * @dd: the hfi1_ib device
 */
static int create_workqueues(struct hfi1_devdata *dd)
{
	int pidx;
	struct hfi1_pportdata *ppd;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (!ppd->hfi1_wq) {
			ppd->hfi1_wq =
				alloc_workqueue(
				    "hfi%d_%d",
				    WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
				    dd->num_sdma,
				    dd->unit, pidx);
			if (!ppd->hfi1_wq)
				goto wq_error;
		}
	}
	return 0;
wq_error:
	pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (ppd->hfi1_wq) {
			destroy_workqueue(ppd->hfi1_wq);
			ppd->hfi1_wq = NULL;
		}
	}
	return -ENOMEM;
}

/**
 * hfi1_init - do the actual initialization sequence on the chip
 * @dd: the hfi1_ib device
 * @reinit: re-initializing, so don't allocate new memory
 *
 * Do the actual initialization sequence on the chip.  This is done
 * both from the init routine called from the PCI infrastructure, and
 * when we reset the chip, or detect that it was reset internally,
 * or it's administratively re-enabled.
 *
 * Memory allocation here and in called routines is only done in
 * the first case (reinit == 0).  We have to be careful, because even
 * without memory allocation, we need to re-write all the chip registers,
 * TIDs, etc. after the reset or enable has completed.
 */
int hfi1_init(struct hfi1_devdata *dd, int reinit)
{
	int ret = 0, pidx, lastfail = 0;
	unsigned i, len;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_pportdata *ppd;

	/* Set up recv low level handlers */
	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
		kdeth_process_expected;
	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
		kdeth_process_eager;
	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
		process_receive_error;
	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
		process_receive_bypass;
	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
		process_receive_invalid;
	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
		process_receive_invalid;
	dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
		process_receive_invalid;
	dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;

	/* Set up send low level handlers */
	dd->process_pio_send = hfi1_verbs_send_pio;
	dd->process_dma_send = hfi1_verbs_send_dma;
	dd->pio_inline_send = pio_copy;

	if (is_ax(dd)) {
		atomic_set(&dd->drop_packet, DROP_PACKET_ON);
		dd->do_drop = 1;
	} else {
		atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
		dd->do_drop = 0;
	}

	/* make sure the link is not "up" */
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		ppd->linkup = 0;
	}

	if (reinit)
		ret = init_after_reset(dd);
	else
		ret = loadtime_init(dd);
	if (ret)
		goto done;

	/* allocate dummy tail memory for all receive contexts */
	dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent(
		&dd->pcidev->dev, sizeof(u64),
		&dd->rcvhdrtail_dummy_physaddr,
		GFP_KERNEL);

	if (!dd->rcvhdrtail_dummy_kvaddr) {
		dd_dev_err(dd, "cannot allocate dummy tail memory\n");
		ret = -ENOMEM;
		goto done;
	}

	/* dd->rcd can be NULL if early initialization failed */
	for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
		/*
		 * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
		 * re-init, the simplest way to handle this is to free
		 * existing, and re-allocate.
		 * Need to re-create rest of ctxt 0 ctxtdata as well.
		 */
		rcd = dd->rcd[i];
		if (!rcd)
			continue;

		rcd->do_interrupt = &handle_receive_interrupt;

		lastfail = hfi1_create_rcvhdrq(dd, rcd);
		if (!lastfail)
			lastfail = hfi1_setup_eagerbufs(rcd);
		if (lastfail) {
			dd_dev_err(dd,
				   "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
			ret = lastfail;
		}
	}

	/* Allocate enough memory for user event notification. */
	len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
			 sizeof(*dd->events));
	dd->events = vmalloc_user(len);
	if (!dd->events)
		dd_dev_err(dd, "Failed to allocate user events page\n");
	/*
	 * Allocate a page for device and port status.
	 * Page will be shared amongst all user processes.
	 */
	dd->status = vmalloc_user(PAGE_SIZE);
	if (!dd->status)
		dd_dev_err(dd, "Failed to allocate dev status page\n");
	else
		dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
					     sizeof(dd->status->freezemsg));
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (dd->status)
			/* Currently, we only have one port */
			ppd->statusp = &dd->status->port;

		set_mtu(ppd);
	}

	/* enable chip even if we have an error, so we can debug cause */
	enable_chip(dd);

done:
	/*
	 * Set status even if port serdes is not initialized
	 * so that diags will work.
	 */
	if (dd->status)
		dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
			HFI1_STATUS_INITTED;
	if (!ret) {
		/* enable all interrupts from the chip */
		set_intr_state(dd, 1);

		/* chip is OK for user apps; mark it as initialized */
		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
			ppd = dd->pport + pidx;

			/*
			 * start the serdes - must be after interrupts are
			 * enabled so we are notified when the link goes up
			 */
			lastfail = bringup_serdes(ppd);
			if (lastfail)
				dd_dev_info(dd,
					    "Failed to bring up port %u\n",
					    ppd->port);

			/*
			 * Set status even if port serdes is not initialized
			 * so that diags will work.
			 */
			if (ppd->statusp)
				*ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
					HFI1_STATUS_INITTED;
			if (!ppd->link_speed_enabled)
				continue;
		}
	}

	/* if ret is non-zero, we probably should do some cleanup here... */
	return ret;
}

static inline struct hfi1_devdata *__hfi1_lookup(int unit)
{
	return idr_find(&hfi1_unit_table, unit);
}

struct hfi1_devdata *hfi1_lookup(int unit)
{
	struct hfi1_devdata *dd;
	unsigned long flags;

	spin_lock_irqsave(&hfi1_devs_lock, flags);
	dd = __hfi1_lookup(unit);
	spin_unlock_irqrestore(&hfi1_devs_lock, flags);

	return dd;
}

/*
 * Stop the timers during unit shutdown, or after an error late
 * in initialization.
 */
static void stop_timers(struct hfi1_devdata *dd)
{
	struct hfi1_pportdata *ppd;
	int pidx;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (ppd->led_override_timer.data) {
			del_timer_sync(&ppd->led_override_timer);
			atomic_set(&ppd->led_override_timer_active, 0);
		}
	}
}

/**
 * shutdown_device - shut down a device
 * @dd: the hfi1_ib device
 *
 * This is called to make the device quiet when we are about to
 * unload the driver, and also when the device is administratively
 * disabled.  It does not free any data structures.
 * Everything it does has to be set up again by hfi1_init(dd, 1).
 */
static void shutdown_device(struct hfi1_devdata *dd)
{
	struct hfi1_pportdata *ppd;
	unsigned pidx;
	int i;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;

		ppd->linkup = 0;
		if (ppd->statusp)
			*ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
					   HFI1_STATUS_IB_READY);
	}
	dd->flags &= ~HFI1_INITTED;

	/* mask interrupts, but not errors */
	set_intr_state(dd, 0);

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		for (i = 0; i < dd->num_rcv_contexts; i++)
			hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
					  HFI1_RCVCTRL_CTXT_DIS |
					  HFI1_RCVCTRL_INTRAVAIL_DIS |
					  HFI1_RCVCTRL_PKEY_DIS |
					  HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
		/*
		 * Gracefully stop all sends allowing any in progress to
		 * trickle out first.
		 */
		for (i = 0; i < dd->num_send_contexts; i++)
			sc_flush(dd->send_contexts[i].sc);
	}

	/*
	 * Enough for anything that's going to trickle out to have actually
	 * done so.
	 */
	udelay(20);

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;

		/* disable all contexts */
		for (i = 0; i < dd->num_send_contexts; i++)
			sc_disable(dd->send_contexts[i].sc);
		/* disable the send device */
		pio_send_control(dd, PSC_GLOBAL_DISABLE);

		shutdown_led_override(ppd);

		/*
		 * Clear SerdesEnable.
		 * We can't count on interrupts since we are stopping.
		 */
		hfi1_quiet_serdes(ppd);

		if (ppd->hfi1_wq) {
			destroy_workqueue(ppd->hfi1_wq);
			ppd->hfi1_wq = NULL;
		}
	}
	sdma_exit(dd);
}

/**
 * hfi1_free_ctxtdata - free a context's allocated data
 * @dd: the hfi1_ib device
 * @rcd: the ctxtdata structure
 *
 * free up any allocated data for a context
 * This should not touch anything that would affect a simultaneous
 * re-allocation of context data, because it is called after hfi1_mutex
 * is released (and can be called from reinit as well).
 * It should never change any chip state, or global driver state.
 */
void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
	unsigned e;

	if (!rcd)
		return;

	if (rcd->rcvhdrq) {
		dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
				  rcd->rcvhdrq, rcd->rcvhdrq_phys);
		rcd->rcvhdrq = NULL;
		if (rcd->rcvhdrtail_kvaddr) {
			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
					  (void *)rcd->rcvhdrtail_kvaddr,
					  rcd->rcvhdrqtailaddr_phys);
			rcd->rcvhdrtail_kvaddr = NULL;
		}
	}

	/* all the RcvArray entries should have been cleared by now */
	kfree(rcd->egrbufs.rcvtids);

	for (e = 0; e < rcd->egrbufs.alloced; e++) {
		if (rcd->egrbufs.buffers[e].phys)
			dma_free_coherent(&dd->pcidev->dev,
					  rcd->egrbufs.buffers[e].len,
					  rcd->egrbufs.buffers[e].addr,
					  rcd->egrbufs.buffers[e].phys);
	}
	kfree(rcd->egrbufs.buffers);

	sc_free(rcd->sc);
	vfree(rcd->user_event_mask);
	vfree(rcd->subctxt_uregbase);
	vfree(rcd->subctxt_rcvegrbuf);
	vfree(rcd->subctxt_rcvhdr_base);
	kfree(rcd->opstats);
	kfree(rcd);
}

/*
 * Release our hold on the shared asic data.  If we are the last one,
 * free the structure.  Must be holding hfi1_devs_lock.
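 *
 * The asic_data has one dds[] slot per hfi1_id; if the peer slot is
 * already empty, we are the last holder and the structure is freed.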
 */
static void release_asic_data(struct hfi1_devdata *dd)
{
	int other;

	if (!dd->asic_data)
		return;
	dd->asic_data->dds[dd->hfi1_id] = NULL;
	other = dd->hfi1_id ? 0 : 1;
	if (!dd->asic_data->dds[other]) {
		/* we are the last holder, free it */
		kfree(dd->asic_data);
	}
	dd->asic_data = NULL;
}

static void __hfi1_free_devdata(struct kobject *kobj)
{
	struct hfi1_devdata *dd =
		container_of(kobj, struct hfi1_devdata, kobj);
	unsigned long flags;

	spin_lock_irqsave(&hfi1_devs_lock, flags);
	idr_remove(&hfi1_unit_table, dd->unit);
	list_del(&dd->list);
	release_asic_data(dd);
	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
	free_platform_config(dd);
	rcu_barrier(); /* wait for rcu callbacks to complete */
	free_percpu(dd->int_counter);
	free_percpu(dd->rcv_limit);
	hfi1_dev_affinity_free(dd);
	free_percpu(dd->send_schedule);
	rvt_dealloc_device(&dd->verbs_dev.rdi);
}

static struct kobj_type hfi1_devdata_type = {
	.release = __hfi1_free_devdata,
};

void hfi1_free_devdata(struct hfi1_devdata *dd)
{
	kobject_put(&dd->kobj);
}

/*
 * Allocate our primary per-unit data structure.  Must be done via verbs
 * allocator, because the verbs cleanup process both does cleanup and
 * free of the data structure.
 * "extra" is for chip-specific data.
 *
 * Use the idr mechanism to get a unit number for this unit.
 */
struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
{
	unsigned long flags;
	struct hfi1_devdata *dd;
	int ret, nports;

	/* extra is sizeof(struct hfi1_pportdata) * number of ports */
	nports = extra / sizeof(struct hfi1_pportdata);

	dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
						     nports);
	if (!dd)
		return ERR_PTR(-ENOMEM);
	dd->num_pports = nports;
	dd->pport = (struct hfi1_pportdata *)(dd + 1);

	INIT_LIST_HEAD(&dd->list);
	idr_preload(GFP_KERNEL);
	spin_lock_irqsave(&hfi1_devs_lock, flags);

	ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
	if (ret >= 0) {
		dd->unit = ret;
		list_add(&dd->list, &hfi1_dev_list);
	}

	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
	idr_preload_end();

	if (ret < 0) {
		hfi1_early_err(&pdev->dev,
			       "Could not allocate unit ID: error %d\n", -ret);
		goto bail;
	}
	/*
	 * Initialize all locks for the device. This needs to be as early as
	 * possible so locks are usable.
	 */
	spin_lock_init(&dd->sc_lock);
	spin_lock_init(&dd->sendctrl_lock);
	spin_lock_init(&dd->rcvctrl_lock);
	spin_lock_init(&dd->uctxt_lock);
	spin_lock_init(&dd->hfi1_diag_trans_lock);
	spin_lock_init(&dd->sc_init_lock);
	spin_lock_init(&dd->dc8051_lock);
	spin_lock_init(&dd->dc8051_memlock);
	seqlock_init(&dd->sc2vl_lock);
	spin_lock_init(&dd->sde_map_lock);
	spin_lock_init(&dd->pio_map_lock);
	init_waitqueue_head(&dd->event_queue);

	dd->int_counter = alloc_percpu(u64);
	if (!dd->int_counter) {
		ret = -ENOMEM;
		hfi1_early_err(&pdev->dev,
			       "Could not allocate per-cpu int_counter\n");
		goto bail;
	}

	dd->rcv_limit = alloc_percpu(u64);
	if (!dd->rcv_limit) {
		ret = -ENOMEM;
		hfi1_early_err(&pdev->dev,
			       "Could not allocate per-cpu rcv_limit\n");
		goto bail;
	}

	dd->send_schedule = alloc_percpu(u64);
	if (!dd->send_schedule) {
		ret = -ENOMEM;
		hfi1_early_err(&pdev->dev,
			       "Could not allocate per-cpu send_schedule\n");
		goto bail;
	}

	if (!hfi1_cpulist_count) {
		u32 count = num_online_cpus();

		hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
				       GFP_KERNEL);
		if (hfi1_cpulist)
			hfi1_cpulist_count = count;
		else
			hfi1_early_err(
				&pdev->dev,
				"Could not alloc cpulist info, cpu affinity might be wrong\n");
	}
	kobject_init(&dd->kobj, &hfi1_devdata_type);
	return dd;

bail:
	if (!list_empty(&dd->list))
		list_del_init(&dd->list);
	rvt_dealloc_device(&dd->verbs_dev.rdi);
	return ERR_PTR(ret);
}

/*
 * Called from freeze mode handlers, and from PCI error
 * reporting code.  Should be paranoid about state of
 * system and data structures.
 */
void hfi1_disable_after_error(struct hfi1_devdata *dd)
{
	if (dd->flags & HFI1_INITTED) {
		u32 pidx;

		dd->flags &= ~HFI1_INITTED;
		if (dd->pport)
			for (pidx = 0; pidx < dd->num_pports; ++pidx) {
				struct hfi1_pportdata *ppd;

				ppd = dd->pport + pidx;
				if (dd->flags & HFI1_PRESENT)
					set_link_state(ppd, HLS_DN_DISABLE);

				if (ppd->statusp)
					*ppd->statusp &= ~HFI1_STATUS_IB_READY;
			}
	}

	/*
	 * Mark as having had an error for driver, and also
	 * for /sys and status word mapped to user programs.
	 * This marks unit as not usable, until reset.
	 */
	if (dd->status)
		dd->status->dev |= HFI1_STATUS_HWERROR;
}

static void remove_one(struct pci_dev *);
static int init_one(struct pci_dev *, const struct pci_device_id *);

#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
#define PFX DRIVER_NAME ": "

static const struct pci_device_id hfi1_pci_tbl[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
	{ 0, }
};

MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);

static struct pci_driver hfi1_pci_driver = {
	.name = DRIVER_NAME,
	.probe = init_one,
	.remove = remove_one,
	.id_table = hfi1_pci_tbl,
	.err_handler = &hfi1_pci_err_handler,
};

static void __init compute_krcvqs(void)
{
	int i;

	for (i = 0; i < krcvqsset; i++)
		n_krcvqs += krcvqs[i];
}

/*
 * Do all the generic driver unit- and chip-independent memory
 * allocation and initialization.
 */
static int __init hfi1_mod_init(void)
{
	int ret;

	ret = dev_init();
	if (ret)
		goto bail;

	/* validate max MTU before any devices start */
	if (!valid_opa_max_mtu(hfi1_max_mtu)) {
		pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
		       hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
		hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
	}
	/* valid CUs run from 1-128 in powers of 2 */
	if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
		hfi1_cu = 1;
	/* valid credit return threshold is 0-100, variable is unsigned */
	if (user_credit_return_threshold > 100)
		user_credit_return_threshold = 100;

	compute_krcvqs();
	/*
	 * sanitize receive interrupt count, time must wait until after
	 * the hardware type is known
	 */
	if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
		rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
	/* reject invalid combinations */
	if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
		pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
		rcv_intr_count = 1;
	}
	if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
		/*
		 * Avoid indefinite packet delivery by requiring a timeout
		 * if count is > 1.
		 */
		pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
		rcv_intr_timeout = 1;
	}
	if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
		/*
		 * The dynamic algorithm expects a non-zero timeout
		 * and a count > 1.
		 */
		pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
		rcv_intr_dynamic = 0;
	}

	/* sanitize link CRC options */
	link_crc_mask &= SUPPORTED_CRCS;

	/*
	 * These must be called before the driver is registered with
	 * the PCI subsystem.
	 */
	idr_init(&hfi1_unit_table);

	hfi1_dbg_init();
	ret = hfi1_wss_init();
	if (ret < 0)
		goto bail_wss;
	ret = pci_register_driver(&hfi1_pci_driver);
	if (ret < 0) {
		pr_err("Unable to register driver: error %d\n", -ret);
		goto bail_dev;
	}
	goto bail; /* all OK */

bail_dev:
	hfi1_wss_exit();
bail_wss:
	hfi1_dbg_exit();
	idr_destroy(&hfi1_unit_table);
	dev_cleanup();
bail:
	return ret;
}

module_init(hfi1_mod_init);

/*
 * Do the non-unit driver cleanup, memory free, etc. at unload.
 */
static void __exit hfi1_mod_cleanup(void)
{
	pci_unregister_driver(&hfi1_pci_driver);
	hfi1_wss_exit();
	hfi1_dbg_exit();
	hfi1_cpulist_count = 0;
	kfree(hfi1_cpulist);

	idr_destroy(&hfi1_unit_table);
	dispose_firmware(); /* asymmetric with obtain_firmware() */
	dev_cleanup();
}

module_exit(hfi1_mod_cleanup);

/* this can only be called after a successful initialization */
static void cleanup_device_data(struct hfi1_devdata *dd)
{
	int ctxt;
	int pidx;
	struct hfi1_ctxtdata **tmp;
	unsigned long flags;

	/* users can't do anything more with chip */
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		struct hfi1_pportdata *ppd = &dd->pport[pidx];
		struct cc_state *cc_state;
		int i;

		if (ppd->statusp)
			*ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;

		for (i = 0; i < OPA_MAX_SLS; i++)
			hrtimer_cancel(&ppd->cca_timer[i].hrtimer);

		spin_lock(&ppd->cc_state_lock);
		cc_state = get_cc_state(ppd);
		RCU_INIT_POINTER(ppd->cc_state, NULL);
		spin_unlock(&ppd->cc_state_lock);

		if (cc_state)
			call_rcu(&cc_state->rcu, cc_state_reclaim);
	}

	free_credit_return(dd);

	/*
	 * Free any resources still in use (usually just kernel contexts)
	 * at unload; we do for ctxtcnt, because that's what we allocate.
	 * We acquire lock to be really paranoid that rcd isn't being
	 * accessed from some interrupt-related code (that should not happen,
	 * but best to be sure).
	 */
	spin_lock_irqsave(&dd->uctxt_lock, flags);
	tmp = dd->rcd;
	dd->rcd = NULL;
	spin_unlock_irqrestore(&dd->uctxt_lock, flags);

	if (dd->rcvhdrtail_dummy_kvaddr) {
		dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
				  (void *)dd->rcvhdrtail_dummy_kvaddr,
				  dd->rcvhdrtail_dummy_physaddr);
		dd->rcvhdrtail_dummy_kvaddr = NULL;
	}

	for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) {
		struct hfi1_ctxtdata *rcd = tmp[ctxt];

		tmp[ctxt] = NULL; /* debugging paranoia */
		if (rcd) {
			hfi1_clear_tids(rcd);
			hfi1_free_ctxtdata(dd, rcd);
		}
	}
	kfree(tmp);
	free_pio_map(dd);
	/* must follow rcv context free - need to remove rcv's hooks */
	for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
		sc_free(dd->send_contexts[ctxt].sc);
	dd->num_send_contexts = 0;
	kfree(dd->send_contexts);
	dd->send_contexts = NULL;
	kfree(dd->hw_to_sw);
	dd->hw_to_sw = NULL;
	kfree(dd->boardname);
	vfree(dd->events);
	vfree(dd->status);
}

/*
 * Clean up on unit shutdown, or error during unit load after
 * successful initialization.
 */
static void postinit_cleanup(struct hfi1_devdata *dd)
{
	hfi1_start_cleanup(dd);

	hfi1_pcie_ddcleanup(dd);
	hfi1_pcie_cleanup(dd->pcidev);

	cleanup_device_data(dd);

	hfi1_free_devdata(dd);
}

static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	int ret = 0, j, pidx, initfail;
	struct hfi1_devdata *dd = NULL;
	struct hfi1_pportdata *ppd;

	/* First, lock the non-writable module parameters */
	HFI1_CAP_LOCK();

	/* Validate some global module parameters */
	if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
		hfi1_early_err(&pdev->dev, "Header queue count too small\n");
		ret = -EINVAL;
		goto bail;
	}
	if (rcvhdrcnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
		hfi1_early_err(&pdev->dev,
			       "Receive header queue count cannot be greater than %u\n",
			       HFI1_MAX_HDRQ_EGRBUF_CNT);
		ret = -EINVAL;
		goto bail;
	}
	/* use the encoding function as a sanitization check */
	if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
		hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
			       hfi1_hdrq_entsize);
		ret = -EINVAL;
		goto bail;
	}

	/* The receive eager buffer size must be set before the receive
	 * contexts are created.
	 *
	 * Set the eager buffer size.  Validate that it falls in a range
	 * allowed by the hardware - all powers of 2 between the min and
	 * max.  The maximum valid MTU is within the eager buffer range
	 * so we do not need to cap the max_mtu by an eager buffer size
	 * setting.
	 */
	if (eager_buffer_size) {
		if (!is_power_of_2(eager_buffer_size))
			eager_buffer_size =
				roundup_pow_of_two(eager_buffer_size);
		eager_buffer_size =
			clamp_val(eager_buffer_size,
				  MIN_EAGER_BUFFER * 8,
				  MAX_EAGER_BUFFER_TOTAL);
		hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
				eager_buffer_size);
	} else {
		hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
		ret = -EINVAL;
		goto bail;
	}

	/* restrict value of hfi1_rcvarr_split */
	hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);

	ret = hfi1_pcie_init(pdev, ent);
	if (ret)
		goto bail;

	/*
	 * Do device-specific initialization, function table setup, dd
	 * allocation, etc.
	 */
	switch (ent->device) {
	case PCI_DEVICE_ID_INTEL0:
	case PCI_DEVICE_ID_INTEL1:
		dd = hfi1_init_dd(pdev, ent);
		break;
	default:
		hfi1_early_err(&pdev->dev,
			       "Failing on unknown Intel deviceid 0x%x\n",
			       ent->device);
		ret = -ENODEV;
	}

	if (IS_ERR(dd))
		ret = PTR_ERR(dd);
	if (ret)
		goto clean_bail; /* error already printed */

	ret = create_workqueues(dd);
	if (ret)
		goto clean_bail;

	/* do the generic initialization */
	initfail = hfi1_init(dd, 0);

	ret = hfi1_register_ib_device(dd);

	/*
	 * Now ready for use.  This should be cleared whenever we
	 * detect a reset, or initiate one.  If earlier failure,
	 * we still create devices, so diags, etc. can be used
	 * to determine cause of problem.
	 */
	if (!initfail && !ret) {
		dd->flags |= HFI1_INITTED;
		/* create debugfs files after init and ib register */
		hfi1_dbg_ibdev_init(&dd->verbs_dev);
	}

	j = hfi1_device_create(dd);
	if (j)
		dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);

	if (initfail || ret) {
		stop_timers(dd);
		flush_workqueue(ib_wq);
		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
			hfi1_quiet_serdes(dd->pport + pidx);
			ppd = dd->pport + pidx;
			if (ppd->hfi1_wq) {
				destroy_workqueue(ppd->hfi1_wq);
				ppd->hfi1_wq = NULL;
			}
		}
		if (!j)
			hfi1_device_remove(dd);
		if (!ret)
			hfi1_unregister_ib_device(dd);
		postinit_cleanup(dd);
		if (initfail)
			ret = initfail;
		goto bail; /* everything already cleaned */
	}

	sdma_start(dd);

	return 0;

clean_bail:
	hfi1_pcie_cleanup(pdev);
bail:
	return ret;
}

static void remove_one(struct pci_dev *pdev)
{
	struct hfi1_devdata *dd = pci_get_drvdata(pdev);

	/* close debugfs files before ib unregister */
	hfi1_dbg_ibdev_exit(&dd->verbs_dev);
	/* unregister from IB core */
	hfi1_unregister_ib_device(dd);

	/*
	 * Disable the IB link, disable interrupts on the device,
	 * clear dma engines, etc.
	 */
	shutdown_device(dd);

	stop_timers(dd);

	/* wait until all of our (qsfp) queue_work() calls complete */
	flush_workqueue(ib_wq);

	hfi1_device_remove(dd);

	postinit_cleanup(dd);
}

/**
 * hfi1_create_rcvhdrq - create a receive header queue
 * @dd: the hfi1_ib device
 * @rcd: the context data
 *
 * This must be contiguous memory (from an i/o perspective), and must be
 * DMA'able (which means for some systems, it will go through an IOMMU,
 * or be forced into a low address range).
 */
int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
	unsigned amt;
	u64 reg;

	if (!rcd->rcvhdrq) {
		dma_addr_t phys_hdrqtail;
		gfp_t gfp_flags;

		/*
		 * rcvhdrqentsize is in DWs, so we have to convert to bytes
		 * (* sizeof(u32)).
		 */
		amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
				 sizeof(u32));

		gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
			GFP_USER : GFP_KERNEL;
		rcd->rcvhdrq = dma_zalloc_coherent(
			&dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
			gfp_flags | __GFP_COMP);

		if (!rcd->rcvhdrq) {
			dd_dev_err(dd,
				   "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
				   amt, rcd->ctxt);
			goto bail;
		}

		if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
			rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
				&dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
				gfp_flags);
			if (!rcd->rcvhdrtail_kvaddr)
				goto bail_free;
			rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
		}

		rcd->rcvhdrq_size = amt;
	}
	/*
	 * These values are per-context:
	 *	RcvHdrCnt
	 *	RcvHdrEntSize
	 *	RcvHdrSize
	 */
	reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
			& RCV_HDR_CNT_CNT_MASK)
		<< RCV_HDR_CNT_CNT_SHIFT;
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
	reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
			& RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
		<< RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
	reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
		<< RCV_HDR_SIZE_HDR_SIZE_SHIFT;
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);

	/*
	 * Program dummy tail address for every receive context
	 * before enabling any receive context
	 */
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR,
			dd->rcvhdrtail_dummy_physaddr);

	return 0;

bail_free:
	dd_dev_err(dd,
		   "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
		   rcd->ctxt);
	vfree(rcd->user_event_mask);
	rcd->user_event_mask = NULL;
	dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
			  rcd->rcvhdrq_phys);
	rcd->rcvhdrq = NULL;
bail:
	return -ENOMEM;
}

/**
 * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user contexts.
 * @rcd: the context we are setting up.
 *
 * Allocate the eager TID buffers and program them into the chip.
 * They are no longer completely contiguous; we do multiple allocation
 * calls.  Otherwise we get the OOM code involved, by asking for too
 * much per call, with disastrous results on some kernels.
 */
int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
{
	struct hfi1_devdata *dd = rcd->dd;
	u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
	gfp_t gfp_flags;
	u16 order;
	int ret = 0;
	u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);

	/*
	 * GFP_USER, but without GFP_FS, so buffer cache can be
	 * coalesced (we hope); otherwise, even at order 4,
	 * heavy filesystem activity makes these fail, and we can
	 * use compound pages.
	 */
	gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;

	/*
	 * The minimum size of the eager buffers is a group of MTU-sized
	 * buffers.
	 * The global eager_buffer_size parameter is checked against the
	 * theoretical lower limit of the value.  Here, we check against the
	 * MTU.
	 */
	if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
		rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
	/*
	 * If using one-pkt-per-egr-buffer, lower the eager buffer
	 * size to the max MTU (page-aligned).
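	 * (round_mtu, computed above, is hfi1_max_mtu rounded up to a
	 * power of two.)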
	 */
	if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
		rcd->egrbufs.rcvtid_size = round_mtu;

	/*
	 * Eager buffer sizes of 1MB or less require smaller TID sizes
	 * to satisfy the "multiple of 8 RcvArray entries" requirement.
	 */
	if (rcd->egrbufs.size <= (1 << 20))
		rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
			rounddown_pow_of_two(rcd->egrbufs.size / 8));

	while (alloced_bytes < rcd->egrbufs.size &&
	       rcd->egrbufs.alloced < rcd->egrbufs.count) {
		rcd->egrbufs.buffers[idx].addr =
			dma_zalloc_coherent(&dd->pcidev->dev,
					    rcd->egrbufs.rcvtid_size,
					    &rcd->egrbufs.buffers[idx].phys,
					    gfp_flags);
		if (rcd->egrbufs.buffers[idx].addr) {
			rcd->egrbufs.buffers[idx].len =
				rcd->egrbufs.rcvtid_size;
			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
				rcd->egrbufs.buffers[idx].addr;
			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
				rcd->egrbufs.buffers[idx].phys;
			rcd->egrbufs.alloced++;
			alloced_bytes += rcd->egrbufs.rcvtid_size;
			idx++;
		} else {
			u32 new_size, i, j;
			u64 offset = 0;

			/*
			 * Fail the eager buffer allocation if:
			 *   - we are already using the lowest acceptable size
			 *   - we are using one-pkt-per-egr-buffer (this implies
			 *     that we are accepting only one size)
			 */
			if (rcd->egrbufs.rcvtid_size == round_mtu ||
			    !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
				dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
					   rcd->ctxt);
				goto bail_rcvegrbuf_phys;
			}

			new_size = rcd->egrbufs.rcvtid_size / 2;

			/*
			 * If the first attempt to allocate memory failed, don't
			 * fail everything but continue with the next lower
			 * size.
			 */
			if (idx == 0) {
				rcd->egrbufs.rcvtid_size = new_size;
				continue;
			}

			/*
			 * Re-partition already allocated buffers to a smaller
			 * size.
			 */
			rcd->egrbufs.alloced = 0;
			for (i = 0, j = 0, offset = 0; j < idx; i++) {
				if (i >= rcd->egrbufs.count)
					break;
				rcd->egrbufs.rcvtids[i].phys =
					rcd->egrbufs.buffers[j].phys + offset;
				rcd->egrbufs.rcvtids[i].addr =
					rcd->egrbufs.buffers[j].addr + offset;
				rcd->egrbufs.alloced++;
				if ((rcd->egrbufs.buffers[j].phys + offset +
				     new_size) ==
				    (rcd->egrbufs.buffers[j].phys +
				     rcd->egrbufs.buffers[j].len)) {
					j++;
					offset = 0;
				} else {
					offset += new_size;
				}
			}
			rcd->egrbufs.rcvtid_size = new_size;
		}
	}
	rcd->egrbufs.numbufs = idx;
	rcd->egrbufs.size = alloced_bytes;

	hfi1_cdbg(PROC,
		  "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
		  rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size,
		  rcd->egrbufs.size);

	/*
	 * Set the context's rcv array head update threshold to the closest
	 * power of 2 (so we can use a mask instead of modulo) below half
	 * the allocated entries.
	 */
	rcd->egrbufs.threshold =
		rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
	/*
	 * Compute the expected RcvArray entry base.  This is done after
	 * allocating the eager buffers in order to maximize the
	 * expected RcvArray entries for the context.
	 */
	max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
	egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
	rcd->expected_count = max_entries - egrtop;
	if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
		rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;

	rcd->expected_base = rcd->eager_base + egrtop;
	hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
		  rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
		  rcd->eager_base, rcd->expected_base);

	if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
		hfi1_cdbg(PROC,
			  "ctxt%u: current Eager buffer size is invalid %u\n",
			  rcd->ctxt, rcd->egrbufs.rcvtid_size);
		ret = -EINVAL;
		goto bail;
	}

	for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
		hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
			     rcd->egrbufs.rcvtids[idx].phys, order);
		cond_resched();
	}
	goto bail;

bail_rcvegrbuf_phys:
	for (idx = 0; idx < rcd->egrbufs.alloced &&
	     rcd->egrbufs.buffers[idx].addr;
	     idx++) {
		dma_free_coherent(&dd->pcidev->dev,
				  rcd->egrbufs.buffers[idx].len,
				  rcd->egrbufs.buffers[idx].addr,
				  rcd->egrbufs.buffers[idx].phys);
		rcd->egrbufs.buffers[idx].addr = NULL;
		rcd->egrbufs.buffers[idx].phys = 0;
		rcd->egrbufs.buffers[idx].len = 0;
	}
bail:
	return ret;
}