1 /* 2 * Copyright(c) 2015 - 2018 Intel Corporation. 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of version 2 of the GNU General Public License as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, but 14 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * General Public License for more details. 17 * 18 * BSD LICENSE 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * - Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * - Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in 28 * the documentation and/or other materials provided with the 29 * distribution. 30 * - Neither the name of Intel Corporation nor the names of its 31 * contributors may be used to endorse or promote products derived 32 * from this software without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 * 46 */ 47 48 #include <linux/pci.h> 49 #include <linux/netdevice.h> 50 #include <linux/vmalloc.h> 51 #include <linux/delay.h> 52 #include <linux/idr.h> 53 #include <linux/module.h> 54 #include <linux/printk.h> 55 #include <linux/hrtimer.h> 56 #include <linux/bitmap.h> 57 #include <linux/numa.h> 58 #include <rdma/rdma_vt.h> 59 60 #include "hfi.h" 61 #include "device.h" 62 #include "common.h" 63 #include "trace.h" 64 #include "mad.h" 65 #include "sdma.h" 66 #include "debugfs.h" 67 #include "verbs.h" 68 #include "aspm.h" 69 #include "affinity.h" 70 #include "vnic.h" 71 #include "exp_rcv.h" 72 73 #undef pr_fmt 74 #define pr_fmt(fmt) DRIVER_NAME ": " fmt 75 76 #define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 77 /* 78 * min buffers we want to have per context, after driver 79 */ 80 #define HFI1_MIN_USER_CTXT_BUFCNT 7 81 82 #define HFI1_MIN_HDRQ_EGRBUF_CNT 2 83 #define HFI1_MAX_HDRQ_EGRBUF_CNT 16352 84 #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ 85 #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ 86 87 #define NUM_IB_PORTS 1 88 89 /* 90 * Number of user receive contexts we are configured to use (to allow for more 91 * pio buffers per ctxt, etc.) Zero means use one user context per CPU. 
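 * The default of -1 (set just below) sizes this to the number of real
 * (non-HT) CPU cores.  Purely as an illustration, a hypothetical load line
 * such as
 *
 *	modprobe hfi1 num_user_contexts=8
 *
 * would cap the driver at eight user contexts regardless of core count.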
92 */ 93 int num_user_contexts = -1; 94 module_param_named(num_user_contexts, num_user_contexts, int, 0444); 95 MODULE_PARM_DESC( 96 num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)"); 97 98 uint krcvqs[RXE_NUM_DATA_VL]; 99 int krcvqsset; 100 module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO); 101 MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL"); 102 103 /* computed based on above array */ 104 unsigned long n_krcvqs; 105 106 static unsigned hfi1_rcvarr_split = 25; 107 module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO); 108 MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers"); 109 110 static uint eager_buffer_size = (8 << 20); /* 8MB */ 111 module_param(eager_buffer_size, uint, S_IRUGO); 112 MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB"); 113 114 static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */ 115 module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO); 116 MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)"); 117 118 static uint hfi1_hdrq_entsize = 32; 119 module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444); 120 MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)"); 121 122 unsigned int user_credit_return_threshold = 33; /* default is 33% */ 123 module_param(user_credit_return_threshold, uint, S_IRUGO); 124 MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)"); 125 126 static inline u64 encode_rcv_header_entry_size(u16 size); 127 128 static struct idr hfi1_unit_table; 129 130 static int hfi1_create_kctxt(struct hfi1_devdata *dd, 131 struct hfi1_pportdata *ppd) 132 { 133 struct hfi1_ctxtdata *rcd; 134 int ret; 135 136 /* Control context has to be always 0 */ 137 BUILD_BUG_ON(HFI1_CTRL_CTXT != 0); 138 139 ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd); 140 if (ret < 0) { 141 dd_dev_err(dd, "Kernel receive context allocation failed\n"); 142 return ret; 143 } 144 145 /* 146 * Set up the kernel context flags here and now because they use 147 * default values for all receive side memories. User contexts will 148 * be handled as they are created. 
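 * (HFI1_CAP_KGET() pulls the kernel-context half of the driver's capability
 * mask, so the flags assigned below simply mirror whichever receive-side
 * capabilities are currently enabled.)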
149 */ 150 rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | 151 HFI1_CAP_KGET(NODROP_RHQ_FULL) | 152 HFI1_CAP_KGET(NODROP_EGR_FULL) | 153 HFI1_CAP_KGET(DMA_RTAIL); 154 155 /* Control context must use DMA_RTAIL */ 156 if (rcd->ctxt == HFI1_CTRL_CTXT) 157 rcd->flags |= HFI1_CAP_DMA_RTAIL; 158 rcd->seq_cnt = 1; 159 160 rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); 161 if (!rcd->sc) { 162 dd_dev_err(dd, "Kernel send context allocation failed\n"); 163 return -ENOMEM; 164 } 165 hfi1_init_ctxt(rcd->sc); 166 167 return 0; 168 } 169 170 /* 171 * Create the receive context array and one or more kernel contexts 172 */ 173 int hfi1_create_kctxts(struct hfi1_devdata *dd) 174 { 175 u16 i; 176 int ret; 177 178 dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd), 179 GFP_KERNEL, dd->node); 180 if (!dd->rcd) 181 return -ENOMEM; 182 183 for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { 184 ret = hfi1_create_kctxt(dd, dd->pport); 185 if (ret) 186 goto bail; 187 } 188 189 return 0; 190 bail: 191 for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) 192 hfi1_free_ctxt(dd->rcd[i]); 193 194 /* All the contexts should be freed, free the array */ 195 kfree(dd->rcd); 196 dd->rcd = NULL; 197 return ret; 198 } 199 200 /* 201 * Helper routines for the receive context reference count (rcd and uctxt). 202 */ 203 static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd) 204 { 205 kref_init(&rcd->kref); 206 } 207 208 /** 209 * hfi1_rcd_free - When reference is zero clean up. 210 * @kref: pointer to an initialized rcd data structure 211 * 212 */ 213 static void hfi1_rcd_free(struct kref *kref) 214 { 215 unsigned long flags; 216 struct hfi1_ctxtdata *rcd = 217 container_of(kref, struct hfi1_ctxtdata, kref); 218 219 hfi1_free_ctxtdata(rcd->dd, rcd); 220 221 spin_lock_irqsave(&rcd->dd->uctxt_lock, flags); 222 rcd->dd->rcd[rcd->ctxt] = NULL; 223 spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags); 224 225 kfree(rcd); 226 } 227 228 /** 229 * hfi1_rcd_put - decrement reference for rcd 230 * @rcd: pointer to an initialized rcd data structure 231 * 232 * Use this to put a reference after the init. 233 */ 234 int hfi1_rcd_put(struct hfi1_ctxtdata *rcd) 235 { 236 if (rcd) 237 return kref_put(&rcd->kref, hfi1_rcd_free); 238 239 return 0; 240 } 241 242 /** 243 * hfi1_rcd_get - increment reference for rcd 244 * @rcd: pointer to an initialized rcd data structure 245 * 246 * Use this to get a reference after the init. 247 */ 248 void hfi1_rcd_get(struct hfi1_ctxtdata *rcd) 249 { 250 kref_get(&rcd->kref); 251 } 252 253 /** 254 * allocate_rcd_index - allocate an rcd index from the rcd array 255 * @dd: pointer to a valid devdata structure 256 * @rcd: rcd data structure to assign 257 * @index: pointer to index that is allocated 258 * 259 * Find an empty index in the rcd array, and assign the given rcd to it. 260 * If the array is full, we are EBUSY. 
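 * A minimal usage sketch (mirrors the caller in hfi1_create_ctxtdata()
 * below; a failing call returns -EBUSY, a successful one leaves
 * rcd->ctxt == *index, dd->rcd[*index] == rcd and the kref initialized):
 *
 *	u16 ctxt;
 *	int ret;
 *
 *	ret = allocate_rcd_index(dd, rcd, &ctxt);
 *	if (ret)
 *		kfree(rcd);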
261 * 262 */ 263 static int allocate_rcd_index(struct hfi1_devdata *dd, 264 struct hfi1_ctxtdata *rcd, u16 *index) 265 { 266 unsigned long flags; 267 u16 ctxt; 268 269 spin_lock_irqsave(&dd->uctxt_lock, flags); 270 for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++) 271 if (!dd->rcd[ctxt]) 272 break; 273 274 if (ctxt < dd->num_rcv_contexts) { 275 rcd->ctxt = ctxt; 276 dd->rcd[ctxt] = rcd; 277 hfi1_rcd_init(rcd); 278 } 279 spin_unlock_irqrestore(&dd->uctxt_lock, flags); 280 281 if (ctxt >= dd->num_rcv_contexts) 282 return -EBUSY; 283 284 *index = ctxt; 285 286 return 0; 287 } 288 289 /** 290 * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the 291 * array 292 * @dd: pointer to a valid devdata structure 293 * @ctxt: the index of an possilbe rcd 294 * 295 * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given 296 * ctxt index is valid. 297 * 298 * The caller is responsible for making the _put(). 299 * 300 */ 301 struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd, 302 u16 ctxt) 303 { 304 if (ctxt < dd->num_rcv_contexts) 305 return hfi1_rcd_get_by_index(dd, ctxt); 306 307 return NULL; 308 } 309 310 /** 311 * hfi1_rcd_get_by_index 312 * @dd: pointer to a valid devdata structure 313 * @ctxt: the index of an possilbe rcd 314 * 315 * We need to protect access to the rcd array. If access is needed to 316 * one or more index, get the protecting spinlock and then increment the 317 * kref. 318 * 319 * The caller is responsible for making the _put(). 320 * 321 */ 322 struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt) 323 { 324 unsigned long flags; 325 struct hfi1_ctxtdata *rcd = NULL; 326 327 spin_lock_irqsave(&dd->uctxt_lock, flags); 328 if (dd->rcd[ctxt]) { 329 rcd = dd->rcd[ctxt]; 330 hfi1_rcd_get(rcd); 331 } 332 spin_unlock_irqrestore(&dd->uctxt_lock, flags); 333 334 return rcd; 335 } 336 337 /* 338 * Common code for user and kernel context create and setup. 339 * NOTE: the initial kref is done here (hf1_rcd_init()). 340 */ 341 int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, 342 struct hfi1_ctxtdata **context) 343 { 344 struct hfi1_devdata *dd = ppd->dd; 345 struct hfi1_ctxtdata *rcd; 346 unsigned kctxt_ngroups = 0; 347 u32 base; 348 349 if (dd->rcv_entries.nctxt_extra > 350 dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt) 351 kctxt_ngroups = (dd->rcv_entries.nctxt_extra - 352 (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)); 353 rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa); 354 if (rcd) { 355 u32 rcvtids, max_entries; 356 u16 ctxt; 357 int ret; 358 359 ret = allocate_rcd_index(dd, rcd, &ctxt); 360 if (ret) { 361 *context = NULL; 362 kfree(rcd); 363 return ret; 364 } 365 366 INIT_LIST_HEAD(&rcd->qp_wait_list); 367 hfi1_exp_tid_group_init(rcd); 368 rcd->ppd = ppd; 369 rcd->dd = dd; 370 rcd->numa_id = numa; 371 rcd->rcv_array_groups = dd->rcv_entries.ngroups; 372 rcd->rhf_rcv_function_map = normal_rhf_rcv_functions; 373 374 mutex_init(&rcd->exp_mutex); 375 376 hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); 377 378 /* 379 * Calculate the context's RcvArray entry starting point. 380 * We do this here because we have to take into account all 381 * the RcvArray entries that previous context would have 382 * taken and we have to account for any extra groups assigned 383 * to the static (kernel) or dynamic (vnic/user) contexts. 
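 * Worked example with made-up numbers: if rcv_entries.ngroups = 8 and
 * two extra groups go to the kernel side (kctxt_ngroups = 2), kernel
 * contexts 0 and 1 each get 9 groups at bases 0 and 9, while kernel
 * context 2 gets 8 groups at base 2 + 2 * 8 = 18.  The group-based
 * base is turned into an RcvArray entry index further down by
 * multiplying with rcv_entries.group_size.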
384 */ 385 if (ctxt < dd->first_dyn_alloc_ctxt) { 386 if (ctxt < kctxt_ngroups) { 387 base = ctxt * (dd->rcv_entries.ngroups + 1); 388 rcd->rcv_array_groups++; 389 } else { 390 base = kctxt_ngroups + 391 (ctxt * dd->rcv_entries.ngroups); 392 } 393 } else { 394 u16 ct = ctxt - dd->first_dyn_alloc_ctxt; 395 396 base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) + 397 kctxt_ngroups); 398 if (ct < dd->rcv_entries.nctxt_extra) { 399 base += ct * (dd->rcv_entries.ngroups + 1); 400 rcd->rcv_array_groups++; 401 } else { 402 base += dd->rcv_entries.nctxt_extra + 403 (ct * dd->rcv_entries.ngroups); 404 } 405 } 406 rcd->eager_base = base * dd->rcv_entries.group_size; 407 408 rcd->rcvhdrq_cnt = rcvhdrcnt; 409 rcd->rcvhdrqentsize = hfi1_hdrq_entsize; 410 rcd->rhf_offset = 411 rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32); 412 /* 413 * Simple Eager buffer allocation: we have already pre-allocated 414 * the number of RcvArray entry groups. Each ctxtdata structure 415 * holds the number of groups for that context. 416 * 417 * To follow CSR requirements and maintain cacheline alignment, 418 * make sure all sizes and bases are multiples of group_size. 419 * 420 * The expected entry count is what is left after assigning 421 * eager. 422 */ 423 max_entries = rcd->rcv_array_groups * 424 dd->rcv_entries.group_size; 425 rcvtids = ((max_entries * hfi1_rcvarr_split) / 100); 426 rcd->egrbufs.count = round_down(rcvtids, 427 dd->rcv_entries.group_size); 428 if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) { 429 dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n", 430 rcd->ctxt); 431 rcd->egrbufs.count = MAX_EAGER_ENTRIES; 432 } 433 hfi1_cdbg(PROC, 434 "ctxt%u: max Eager buffer RcvArray entries: %u\n", 435 rcd->ctxt, rcd->egrbufs.count); 436 437 /* 438 * Allocate array that will hold the eager buffer accounting 439 * data. 440 * This will allocate the maximum possible buffer count based 441 * on the value of the RcvArray split parameter. 442 * The resulting value will be rounded down to the closest 443 * multiple of dd->rcv_entries.group_size. 444 */ 445 rcd->egrbufs.buffers = 446 kcalloc_node(rcd->egrbufs.count, 447 sizeof(*rcd->egrbufs.buffers), 448 GFP_KERNEL, numa); 449 if (!rcd->egrbufs.buffers) 450 goto bail; 451 rcd->egrbufs.rcvtids = 452 kcalloc_node(rcd->egrbufs.count, 453 sizeof(*rcd->egrbufs.rcvtids), 454 GFP_KERNEL, numa); 455 if (!rcd->egrbufs.rcvtids) 456 goto bail; 457 rcd->egrbufs.size = eager_buffer_size; 458 /* 459 * The size of the buffers programmed into the RcvArray 460 * entries needs to be big enough to handle the highest 461 * MTU supported. 462 */ 463 if (rcd->egrbufs.size < hfi1_max_mtu) { 464 rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu); 465 hfi1_cdbg(PROC, 466 "ctxt%u: eager bufs size too small. Adjusting to %zu\n", 467 rcd->ctxt, rcd->egrbufs.size); 468 } 469 rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE; 470 471 /* Applicable only for statically created kernel contexts */ 472 if (ctxt < dd->first_dyn_alloc_ctxt) { 473 rcd->opstats = kzalloc_node(sizeof(*rcd->opstats), 474 GFP_KERNEL, numa); 475 if (!rcd->opstats) 476 goto bail; 477 } 478 479 *context = rcd; 480 return 0; 481 } 482 483 bail: 484 *context = NULL; 485 hfi1_free_ctxt(rcd); 486 return -ENOMEM; 487 } 488 489 /** 490 * hfi1_free_ctxt 491 * @rcd: pointer to an initialized rcd data structure 492 * 493 * This wrapper is the free function that matches hfi1_create_ctxtdata(). 
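 * (Its body is simply hfi1_rcd_put(), dropping the initial kref that
 * allocate_rcd_index() took via hfi1_rcd_init().)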
494 * When a context is done being used (kernel or user), this function is called 495 * for the "final" put to match the kref init from hf1i_create_ctxtdata(). 496 * Other users of the context do a get/put sequence to make sure that the 497 * structure isn't removed while in use. 498 */ 499 void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd) 500 { 501 hfi1_rcd_put(rcd); 502 } 503 504 /* 505 * Convert a receive header entry size that to the encoding used in the CSR. 506 * 507 * Return a zero if the given size is invalid. 508 */ 509 static inline u64 encode_rcv_header_entry_size(u16 size) 510 { 511 /* there are only 3 valid receive header entry sizes */ 512 if (size == 2) 513 return 1; 514 if (size == 16) 515 return 2; 516 else if (size == 32) 517 return 4; 518 return 0; /* invalid */ 519 } 520 521 /* 522 * Select the largest ccti value over all SLs to determine the intra- 523 * packet gap for the link. 524 * 525 * called with cca_timer_lock held (to protect access to cca_timer 526 * array), and rcu_read_lock() (to protect access to cc_state). 527 */ 528 void set_link_ipg(struct hfi1_pportdata *ppd) 529 { 530 struct hfi1_devdata *dd = ppd->dd; 531 struct cc_state *cc_state; 532 int i; 533 u16 cce, ccti_limit, max_ccti = 0; 534 u16 shift, mult; 535 u64 src; 536 u32 current_egress_rate; /* Mbits /sec */ 537 u32 max_pkt_time; 538 /* 539 * max_pkt_time is the maximum packet egress time in units 540 * of the fabric clock period 1/(805 MHz). 541 */ 542 543 cc_state = get_cc_state(ppd); 544 545 if (!cc_state) 546 /* 547 * This should _never_ happen - rcu_read_lock() is held, 548 * and set_link_ipg() should not be called if cc_state 549 * is NULL. 550 */ 551 return; 552 553 for (i = 0; i < OPA_MAX_SLS; i++) { 554 u16 ccti = ppd->cca_timer[i].ccti; 555 556 if (ccti > max_ccti) 557 max_ccti = ccti; 558 } 559 560 ccti_limit = cc_state->cct.ccti_limit; 561 if (max_ccti > ccti_limit) 562 max_ccti = ccti_limit; 563 564 cce = cc_state->cct.entries[max_ccti].entry; 565 shift = (cce & 0xc000) >> 14; 566 mult = (cce & 0x3fff); 567 568 current_egress_rate = active_egress_rate(ppd); 569 570 max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate); 571 572 src = (max_pkt_time >> shift) * mult; 573 574 src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK; 575 src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT; 576 577 write_csr(dd, SEND_STATIC_RATE_CONTROL, src); 578 } 579 580 static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) 581 { 582 struct cca_timer *cca_timer; 583 struct hfi1_pportdata *ppd; 584 int sl; 585 u16 ccti_timer, ccti_min; 586 struct cc_state *cc_state; 587 unsigned long flags; 588 enum hrtimer_restart ret = HRTIMER_NORESTART; 589 590 cca_timer = container_of(t, struct cca_timer, hrtimer); 591 ppd = cca_timer->ppd; 592 sl = cca_timer->sl; 593 594 rcu_read_lock(); 595 596 cc_state = get_cc_state(ppd); 597 598 if (!cc_state) { 599 rcu_read_unlock(); 600 return HRTIMER_NORESTART; 601 } 602 603 /* 604 * 1) decrement ccti for SL 605 * 2) calculate IPG for link (set_link_ipg()) 606 * 3) restart timer, unless ccti is at min value 607 */ 608 609 ccti_min = cc_state->cong_setting.entries[sl].ccti_min; 610 ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; 611 612 spin_lock_irqsave(&ppd->cca_timer_lock, flags); 613 614 if (cca_timer->ccti > ccti_min) { 615 cca_timer->ccti--; 616 set_link_ipg(ppd); 617 } 618 619 if (cca_timer->ccti > ccti_min) { 620 unsigned long nsec = 1024 * ccti_timer; 621 /* ccti_timer is in units of 1.024 usec */ 622 hrtimer_forward_now(t, ns_to_ktime(nsec)); 623 
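		/*
		 * Worked example (illustrative value): ccti_timer = 977
		 * re-arms the timer 977 * 1024 ns, i.e. roughly 1 ms,
		 * from now.
		 */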
ret = HRTIMER_RESTART; 624 } 625 626 spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); 627 rcu_read_unlock(); 628 return ret; 629 } 630 631 /* 632 * Common code for initializing the physical port structure. 633 */ 634 void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, 635 struct hfi1_devdata *dd, u8 hw_pidx, u8 port) 636 { 637 int i; 638 uint default_pkey_idx; 639 struct cc_state *cc_state; 640 641 ppd->dd = dd; 642 ppd->hw_pidx = hw_pidx; 643 ppd->port = port; /* IB port number, not index */ 644 ppd->prev_link_width = LINK_WIDTH_DEFAULT; 645 /* 646 * There are C_VL_COUNT number of PortVLXmitWait counters. 647 * Adding 1 to C_VL_COUNT to include the PortXmitWait counter. 648 */ 649 for (i = 0; i < C_VL_COUNT + 1; i++) { 650 ppd->port_vl_xmit_wait_last[i] = 0; 651 ppd->vl_xmit_flit_cnt[i] = 0; 652 } 653 654 default_pkey_idx = 1; 655 656 ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY; 657 ppd->part_enforce |= HFI1_PART_ENFORCE_IN; 658 659 if (loopback) { 660 dd_dev_err(dd, "Faking data partition 0x8001 in idx %u\n", 661 !default_pkey_idx); 662 ppd->pkeys[!default_pkey_idx] = 0x8001; 663 } 664 665 INIT_WORK(&ppd->link_vc_work, handle_verify_cap); 666 INIT_WORK(&ppd->link_up_work, handle_link_up); 667 INIT_WORK(&ppd->link_down_work, handle_link_down); 668 INIT_WORK(&ppd->freeze_work, handle_freeze); 669 INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); 670 INIT_WORK(&ppd->sma_message_work, handle_sma_message); 671 INIT_WORK(&ppd->link_bounce_work, handle_link_bounce); 672 INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link); 673 INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work); 674 INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event); 675 676 mutex_init(&ppd->hls_lock); 677 spin_lock_init(&ppd->qsfp_info.qsfp_lock); 678 679 ppd->qsfp_info.ppd = ppd; 680 ppd->sm_trap_qp = 0x0; 681 ppd->sa_qp = 0x1; 682 683 ppd->hfi1_wq = NULL; 684 685 spin_lock_init(&ppd->cca_timer_lock); 686 687 for (i = 0; i < OPA_MAX_SLS; i++) { 688 hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, 689 HRTIMER_MODE_REL); 690 ppd->cca_timer[i].ppd = ppd; 691 ppd->cca_timer[i].sl = i; 692 ppd->cca_timer[i].ccti = 0; 693 ppd->cca_timer[i].hrtimer.function = cca_timer_fn; 694 } 695 696 ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; 697 698 spin_lock_init(&ppd->cc_state_lock); 699 spin_lock_init(&ppd->cc_log_lock); 700 cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL); 701 RCU_INIT_POINTER(ppd->cc_state, cc_state); 702 if (!cc_state) 703 goto bail; 704 return; 705 706 bail: 707 dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port); 708 } 709 710 /* 711 * Do initialization for device that is only needed on 712 * first detect, not on resets. 713 */ 714 static int loadtime_init(struct hfi1_devdata *dd) 715 { 716 return 0; 717 } 718 719 /** 720 * init_after_reset - re-initialize after a reset 721 * @dd: the hfi1_ib device 722 * 723 * sanity check at least some of the values after reset, and 724 * ensure no receive or transmit (explicitly, in case reset 725 * failed 726 */ 727 static int init_after_reset(struct hfi1_devdata *dd) 728 { 729 int i; 730 struct hfi1_ctxtdata *rcd; 731 /* 732 * Ensure chip does no sends or receives, tail updates, or 733 * pioavail updates while we re-initialize. This is mostly 734 * for the driver data structures, not chip registers. 
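 * Receive is shut off context by context below; sends are stopped
 * globally first and then each allocated send context is disabled.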
735 */ 736 for (i = 0; i < dd->num_rcv_contexts; i++) { 737 rcd = hfi1_rcd_get_by_index(dd, i); 738 hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | 739 HFI1_RCVCTRL_INTRAVAIL_DIS | 740 HFI1_RCVCTRL_TAILUPD_DIS, rcd); 741 hfi1_rcd_put(rcd); 742 } 743 pio_send_control(dd, PSC_GLOBAL_DISABLE); 744 for (i = 0; i < dd->num_send_contexts; i++) 745 sc_disable(dd->send_contexts[i].sc); 746 747 return 0; 748 } 749 750 static void enable_chip(struct hfi1_devdata *dd) 751 { 752 struct hfi1_ctxtdata *rcd; 753 u32 rcvmask; 754 u16 i; 755 756 /* enable PIO send */ 757 pio_send_control(dd, PSC_GLOBAL_ENABLE); 758 759 /* 760 * Enable kernel ctxts' receive and receive interrupt. 761 * Other ctxts done as user opens and initializes them. 762 */ 763 for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { 764 rcd = hfi1_rcd_get_by_index(dd, i); 765 if (!rcd) 766 continue; 767 rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; 768 rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? 769 HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; 770 if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) 771 rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; 772 if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL)) 773 rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; 774 if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) 775 rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; 776 hfi1_rcvctrl(dd, rcvmask, rcd); 777 sc_enable(rcd->sc); 778 hfi1_rcd_put(rcd); 779 } 780 } 781 782 /** 783 * create_workqueues - create per port workqueues 784 * @dd: the hfi1_ib device 785 */ 786 static int create_workqueues(struct hfi1_devdata *dd) 787 { 788 int pidx; 789 struct hfi1_pportdata *ppd; 790 791 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 792 ppd = dd->pport + pidx; 793 if (!ppd->hfi1_wq) { 794 ppd->hfi1_wq = 795 alloc_workqueue( 796 "hfi%d_%d", 797 WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE, 798 HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES, 799 dd->unit, pidx); 800 if (!ppd->hfi1_wq) 801 goto wq_error; 802 } 803 if (!ppd->link_wq) { 804 /* 805 * Make the link workqueue single-threaded to enforce 806 * serialization. 807 */ 808 ppd->link_wq = 809 alloc_workqueue( 810 "hfi_link_%d_%d", 811 WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND, 812 1, /* max_active */ 813 dd->unit, pidx); 814 if (!ppd->link_wq) 815 goto wq_error; 816 } 817 } 818 return 0; 819 wq_error: 820 pr_err("alloc_workqueue failed for port %d\n", pidx + 1); 821 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 822 ppd = dd->pport + pidx; 823 if (ppd->hfi1_wq) { 824 destroy_workqueue(ppd->hfi1_wq); 825 ppd->hfi1_wq = NULL; 826 } 827 if (ppd->link_wq) { 828 destroy_workqueue(ppd->link_wq); 829 ppd->link_wq = NULL; 830 } 831 } 832 return -ENOMEM; 833 } 834 835 /** 836 * enable_general_intr() - Enable the IRQs that will be handled by the 837 * general interrupt handler. 838 * @dd: valid devdata 839 * 840 */ 841 static void enable_general_intr(struct hfi1_devdata *dd) 842 { 843 set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true); 844 set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true); 845 set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true); 846 set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true); 847 set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true); 848 set_intr_bits(dd, IS_DC_START, IS_DC_END, true); 849 set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true); 850 } 851 852 /** 853 * hfi1_init - do the actual initialization sequence on the chip 854 * @dd: the hfi1_ib device 855 * @reinit: re-initializing, so don't allocate new memory 856 * 857 * Do the actual initialization sequence on the chip. 
This is done 858 * both from the init routine called from the PCI infrastructure, and 859 * when we reset the chip, or detect that it was reset internally, 860 * or it's administratively re-enabled. 861 * 862 * Memory allocation here and in called routines is only done in 863 * the first case (reinit == 0). We have to be careful, because even 864 * without memory allocation, we need to re-write all the chip registers 865 * TIDs, etc. after the reset or enable has completed. 866 */ 867 int hfi1_init(struct hfi1_devdata *dd, int reinit) 868 { 869 int ret = 0, pidx, lastfail = 0; 870 unsigned long len; 871 u16 i; 872 struct hfi1_ctxtdata *rcd; 873 struct hfi1_pportdata *ppd; 874 875 /* Set up send low level handlers */ 876 dd->process_pio_send = hfi1_verbs_send_pio; 877 dd->process_dma_send = hfi1_verbs_send_dma; 878 dd->pio_inline_send = pio_copy; 879 dd->process_vnic_dma_send = hfi1_vnic_send_dma; 880 881 if (is_ax(dd)) { 882 atomic_set(&dd->drop_packet, DROP_PACKET_ON); 883 dd->do_drop = 1; 884 } else { 885 atomic_set(&dd->drop_packet, DROP_PACKET_OFF); 886 dd->do_drop = 0; 887 } 888 889 /* make sure the link is not "up" */ 890 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 891 ppd = dd->pport + pidx; 892 ppd->linkup = 0; 893 } 894 895 if (reinit) 896 ret = init_after_reset(dd); 897 else 898 ret = loadtime_init(dd); 899 if (ret) 900 goto done; 901 902 /* allocate dummy tail memory for all receive contexts */ 903 dd->rcvhdrtail_dummy_kvaddr = dma_alloc_coherent(&dd->pcidev->dev, 904 sizeof(u64), 905 &dd->rcvhdrtail_dummy_dma, 906 GFP_KERNEL); 907 908 if (!dd->rcvhdrtail_dummy_kvaddr) { 909 dd_dev_err(dd, "cannot allocate dummy tail memory\n"); 910 ret = -ENOMEM; 911 goto done; 912 } 913 914 /* dd->rcd can be NULL if early initialization failed */ 915 for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) { 916 /* 917 * Set up the (kernel) rcvhdr queue and egr TIDs. If doing 918 * re-init, the simplest way to handle this is to free 919 * existing, and re-allocate. 920 * Need to re-create rest of ctxt 0 ctxtdata as well. 921 */ 922 rcd = hfi1_rcd_get_by_index(dd, i); 923 if (!rcd) 924 continue; 925 926 rcd->do_interrupt = &handle_receive_interrupt; 927 928 lastfail = hfi1_create_rcvhdrq(dd, rcd); 929 if (!lastfail) 930 lastfail = hfi1_setup_eagerbufs(rcd); 931 if (lastfail) { 932 dd_dev_err(dd, 933 "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); 934 ret = lastfail; 935 } 936 /* enable IRQ */ 937 hfi1_rcd_put(rcd); 938 } 939 940 /* Allocate enough memory for user event notification. */ 941 len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS * 942 sizeof(*dd->events)); 943 dd->events = vmalloc_user(len); 944 if (!dd->events) 945 dd_dev_err(dd, "Failed to allocate user events page\n"); 946 /* 947 * Allocate a page for device and port status. 948 * Page will be shared amongst all user processes. 949 */ 950 dd->status = vmalloc_user(PAGE_SIZE); 951 if (!dd->status) 952 dd_dev_err(dd, "Failed to allocate dev status page\n"); 953 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 954 ppd = dd->pport + pidx; 955 if (dd->status) 956 /* Currently, we only have one port */ 957 ppd->statusp = &dd->status->port; 958 959 set_mtu(ppd); 960 } 961 962 /* enable chip even if we have an error, so we can debug cause */ 963 enable_chip(dd); 964 965 done: 966 /* 967 * Set status even if port serdes is not initialized 968 * so that diags will work. 
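 * (dd->status is the vmalloc_user() page allocated above and shared
 * with user processes, so diagnostic tools can still observe the
 * CHIP_PRESENT/INITTED bits after a partial bringup.)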
969 */ 970 if (dd->status) 971 dd->status->dev |= HFI1_STATUS_CHIP_PRESENT | 972 HFI1_STATUS_INITTED; 973 if (!ret) { 974 /* enable all interrupts from the chip */ 975 enable_general_intr(dd); 976 init_qsfp_int(dd); 977 978 /* chip is OK for user apps; mark it as initialized */ 979 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 980 ppd = dd->pport + pidx; 981 982 /* 983 * start the serdes - must be after interrupts are 984 * enabled so we are notified when the link goes up 985 */ 986 lastfail = bringup_serdes(ppd); 987 if (lastfail) 988 dd_dev_info(dd, 989 "Failed to bring up port %u\n", 990 ppd->port); 991 992 /* 993 * Set status even if port serdes is not initialized 994 * so that diags will work. 995 */ 996 if (ppd->statusp) 997 *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT | 998 HFI1_STATUS_INITTED; 999 if (!ppd->link_speed_enabled) 1000 continue; 1001 } 1002 } 1003 1004 /* if ret is non-zero, we probably should do some cleanup here... */ 1005 return ret; 1006 } 1007 1008 static inline struct hfi1_devdata *__hfi1_lookup(int unit) 1009 { 1010 return idr_find(&hfi1_unit_table, unit); 1011 } 1012 1013 struct hfi1_devdata *hfi1_lookup(int unit) 1014 { 1015 struct hfi1_devdata *dd; 1016 unsigned long flags; 1017 1018 spin_lock_irqsave(&hfi1_devs_lock, flags); 1019 dd = __hfi1_lookup(unit); 1020 spin_unlock_irqrestore(&hfi1_devs_lock, flags); 1021 1022 return dd; 1023 } 1024 1025 /* 1026 * Stop the timers during unit shutdown, or after an error late 1027 * in initialization. 1028 */ 1029 static void stop_timers(struct hfi1_devdata *dd) 1030 { 1031 struct hfi1_pportdata *ppd; 1032 int pidx; 1033 1034 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1035 ppd = dd->pport + pidx; 1036 if (ppd->led_override_timer.function) { 1037 del_timer_sync(&ppd->led_override_timer); 1038 atomic_set(&ppd->led_override_timer_active, 0); 1039 } 1040 } 1041 } 1042 1043 /** 1044 * shutdown_device - shut down a device 1045 * @dd: the hfi1_ib device 1046 * 1047 * This is called to make the device quiet when we are about to 1048 * unload the driver, and also when the device is administratively 1049 * disabled. It does not free any data structures. 1050 * Everything it does has to be setup again by hfi1_init(dd, 1) 1051 */ 1052 static void shutdown_device(struct hfi1_devdata *dd) 1053 { 1054 struct hfi1_pportdata *ppd; 1055 struct hfi1_ctxtdata *rcd; 1056 unsigned pidx; 1057 int i; 1058 1059 if (dd->flags & HFI1_SHUTDOWN) 1060 return; 1061 dd->flags |= HFI1_SHUTDOWN; 1062 1063 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1064 ppd = dd->pport + pidx; 1065 1066 ppd->linkup = 0; 1067 if (ppd->statusp) 1068 *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | 1069 HFI1_STATUS_IB_READY); 1070 } 1071 dd->flags &= ~HFI1_INITTED; 1072 1073 /* mask and clean up interrupts */ 1074 set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); 1075 msix_clean_up_interrupts(dd); 1076 1077 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1078 ppd = dd->pport + pidx; 1079 for (i = 0; i < dd->num_rcv_contexts; i++) { 1080 rcd = hfi1_rcd_get_by_index(dd, i); 1081 hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS | 1082 HFI1_RCVCTRL_CTXT_DIS | 1083 HFI1_RCVCTRL_INTRAVAIL_DIS | 1084 HFI1_RCVCTRL_PKEY_DIS | 1085 HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd); 1086 hfi1_rcd_put(rcd); 1087 } 1088 /* 1089 * Gracefully stop all sends allowing any in progress to 1090 * trickle out first. 
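 * The send contexts are only flushed here; they are disabled further
 * below, after a short delay that lets in-flight traffic drain.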
1091 */ 1092 for (i = 0; i < dd->num_send_contexts; i++) 1093 sc_flush(dd->send_contexts[i].sc); 1094 } 1095 1096 /* 1097 * Enough for anything that's going to trickle out to have actually 1098 * done so. 1099 */ 1100 udelay(20); 1101 1102 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1103 ppd = dd->pport + pidx; 1104 1105 /* disable all contexts */ 1106 for (i = 0; i < dd->num_send_contexts; i++) 1107 sc_disable(dd->send_contexts[i].sc); 1108 /* disable the send device */ 1109 pio_send_control(dd, PSC_GLOBAL_DISABLE); 1110 1111 shutdown_led_override(ppd); 1112 1113 /* 1114 * Clear SerdesEnable. 1115 * We can't count on interrupts since we are stopping. 1116 */ 1117 hfi1_quiet_serdes(ppd); 1118 1119 if (ppd->hfi1_wq) { 1120 destroy_workqueue(ppd->hfi1_wq); 1121 ppd->hfi1_wq = NULL; 1122 } 1123 if (ppd->link_wq) { 1124 destroy_workqueue(ppd->link_wq); 1125 ppd->link_wq = NULL; 1126 } 1127 } 1128 sdma_exit(dd); 1129 } 1130 1131 /** 1132 * hfi1_free_ctxtdata - free a context's allocated data 1133 * @dd: the hfi1_ib device 1134 * @rcd: the ctxtdata structure 1135 * 1136 * free up any allocated data for a context 1137 * It should never change any chip state, or global driver state. 1138 */ 1139 void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) 1140 { 1141 u32 e; 1142 1143 if (!rcd) 1144 return; 1145 1146 if (rcd->rcvhdrq) { 1147 dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd), 1148 rcd->rcvhdrq, rcd->rcvhdrq_dma); 1149 rcd->rcvhdrq = NULL; 1150 if (rcd->rcvhdrtail_kvaddr) { 1151 dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, 1152 (void *)rcd->rcvhdrtail_kvaddr, 1153 rcd->rcvhdrqtailaddr_dma); 1154 rcd->rcvhdrtail_kvaddr = NULL; 1155 } 1156 } 1157 1158 /* all the RcvArray entries should have been cleared by now */ 1159 kfree(rcd->egrbufs.rcvtids); 1160 rcd->egrbufs.rcvtids = NULL; 1161 1162 for (e = 0; e < rcd->egrbufs.alloced; e++) { 1163 if (rcd->egrbufs.buffers[e].dma) 1164 dma_free_coherent(&dd->pcidev->dev, 1165 rcd->egrbufs.buffers[e].len, 1166 rcd->egrbufs.buffers[e].addr, 1167 rcd->egrbufs.buffers[e].dma); 1168 } 1169 kfree(rcd->egrbufs.buffers); 1170 rcd->egrbufs.alloced = 0; 1171 rcd->egrbufs.buffers = NULL; 1172 1173 sc_free(rcd->sc); 1174 rcd->sc = NULL; 1175 1176 vfree(rcd->subctxt_uregbase); 1177 vfree(rcd->subctxt_rcvegrbuf); 1178 vfree(rcd->subctxt_rcvhdr_base); 1179 kfree(rcd->opstats); 1180 1181 rcd->subctxt_uregbase = NULL; 1182 rcd->subctxt_rcvegrbuf = NULL; 1183 rcd->subctxt_rcvhdr_base = NULL; 1184 rcd->opstats = NULL; 1185 } 1186 1187 /* 1188 * Release our hold on the shared asic data. If we are the last one, 1189 * return the structure to be finalized outside the lock. Must be 1190 * holding hfi1_devs_lock. 1191 */ 1192 static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd) 1193 { 1194 struct hfi1_asic_data *ad; 1195 int other; 1196 1197 if (!dd->asic_data) 1198 return NULL; 1199 dd->asic_data->dds[dd->hfi1_id] = NULL; 1200 other = dd->hfi1_id ? 0 : 1; 1201 ad = dd->asic_data; 1202 dd->asic_data = NULL; 1203 /* return NULL if the other dd still has a link */ 1204 return ad->dds[other] ? NULL : ad; 1205 } 1206 1207 static void finalize_asic_data(struct hfi1_devdata *dd, 1208 struct hfi1_asic_data *ad) 1209 { 1210 clean_up_i2c(dd, ad); 1211 kfree(ad); 1212 } 1213 1214 /** 1215 * hfi1_clean_devdata - cleans up per-unit data structure 1216 * @dd: pointer to a valid devdata structure 1217 * 1218 * It cleans up all data structures set up by 1219 * by hfi1_alloc_devdata(). 
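 * It is reached both from the kobject release path
 * (hfi1_free_devdata() -> __hfi1_free_devdata()) and directly from the
 * error path of hfi1_alloc_devdata() itself.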
1220 */ 1221 static void hfi1_clean_devdata(struct hfi1_devdata *dd) 1222 { 1223 struct hfi1_asic_data *ad; 1224 unsigned long flags; 1225 1226 spin_lock_irqsave(&hfi1_devs_lock, flags); 1227 if (!list_empty(&dd->list)) { 1228 idr_remove(&hfi1_unit_table, dd->unit); 1229 list_del_init(&dd->list); 1230 } 1231 ad = release_asic_data(dd); 1232 spin_unlock_irqrestore(&hfi1_devs_lock, flags); 1233 1234 finalize_asic_data(dd, ad); 1235 free_platform_config(dd); 1236 rcu_barrier(); /* wait for rcu callbacks to complete */ 1237 free_percpu(dd->int_counter); 1238 free_percpu(dd->rcv_limit); 1239 free_percpu(dd->send_schedule); 1240 free_percpu(dd->tx_opstats); 1241 dd->int_counter = NULL; 1242 dd->rcv_limit = NULL; 1243 dd->send_schedule = NULL; 1244 dd->tx_opstats = NULL; 1245 kfree(dd->comp_vect); 1246 dd->comp_vect = NULL; 1247 sdma_clean(dd, dd->num_sdma); 1248 rvt_dealloc_device(&dd->verbs_dev.rdi); 1249 } 1250 1251 static void __hfi1_free_devdata(struct kobject *kobj) 1252 { 1253 struct hfi1_devdata *dd = 1254 container_of(kobj, struct hfi1_devdata, kobj); 1255 1256 hfi1_clean_devdata(dd); 1257 } 1258 1259 static struct kobj_type hfi1_devdata_type = { 1260 .release = __hfi1_free_devdata, 1261 }; 1262 1263 void hfi1_free_devdata(struct hfi1_devdata *dd) 1264 { 1265 kobject_put(&dd->kobj); 1266 } 1267 1268 /** 1269 * hfi1_alloc_devdata - Allocate our primary per-unit data structure. 1270 * @pdev: Valid PCI device 1271 * @extra: How many bytes to alloc past the default 1272 * 1273 * Must be done via verbs allocator, because the verbs cleanup process 1274 * both does cleanup and free of the data structure. 1275 * "extra" is for chip-specific data. 1276 * 1277 * Use the idr mechanism to get a unit number for this unit. 1278 */ 1279 static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, 1280 size_t extra) 1281 { 1282 unsigned long flags; 1283 struct hfi1_devdata *dd; 1284 int ret, nports; 1285 1286 /* extra is * number of ports */ 1287 nports = extra / sizeof(struct hfi1_pportdata); 1288 1289 dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra, 1290 nports); 1291 if (!dd) 1292 return ERR_PTR(-ENOMEM); 1293 dd->num_pports = nports; 1294 dd->pport = (struct hfi1_pportdata *)(dd + 1); 1295 dd->pcidev = pdev; 1296 pci_set_drvdata(pdev, dd); 1297 1298 INIT_LIST_HEAD(&dd->list); 1299 idr_preload(GFP_KERNEL); 1300 spin_lock_irqsave(&hfi1_devs_lock, flags); 1301 1302 ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT); 1303 if (ret >= 0) { 1304 dd->unit = ret; 1305 list_add(&dd->list, &hfi1_dev_list); 1306 } 1307 dd->node = NUMA_NO_NODE; 1308 1309 spin_unlock_irqrestore(&hfi1_devs_lock, flags); 1310 idr_preload_end(); 1311 1312 if (ret < 0) { 1313 dev_err(&pdev->dev, 1314 "Could not allocate unit ID: error %d\n", -ret); 1315 goto bail; 1316 } 1317 rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); 1318 1319 /* 1320 * Initialize all locks for the device. This needs to be as early as 1321 * possible so locks are usable. 
1322 */ 1323 spin_lock_init(&dd->sc_lock); 1324 spin_lock_init(&dd->sendctrl_lock); 1325 spin_lock_init(&dd->rcvctrl_lock); 1326 spin_lock_init(&dd->uctxt_lock); 1327 spin_lock_init(&dd->hfi1_diag_trans_lock); 1328 spin_lock_init(&dd->sc_init_lock); 1329 spin_lock_init(&dd->dc8051_memlock); 1330 seqlock_init(&dd->sc2vl_lock); 1331 spin_lock_init(&dd->sde_map_lock); 1332 spin_lock_init(&dd->pio_map_lock); 1333 mutex_init(&dd->dc8051_lock); 1334 init_waitqueue_head(&dd->event_queue); 1335 spin_lock_init(&dd->irq_src_lock); 1336 1337 dd->int_counter = alloc_percpu(u64); 1338 if (!dd->int_counter) { 1339 ret = -ENOMEM; 1340 goto bail; 1341 } 1342 1343 dd->rcv_limit = alloc_percpu(u64); 1344 if (!dd->rcv_limit) { 1345 ret = -ENOMEM; 1346 goto bail; 1347 } 1348 1349 dd->send_schedule = alloc_percpu(u64); 1350 if (!dd->send_schedule) { 1351 ret = -ENOMEM; 1352 goto bail; 1353 } 1354 1355 dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx); 1356 if (!dd->tx_opstats) { 1357 ret = -ENOMEM; 1358 goto bail; 1359 } 1360 1361 dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL); 1362 if (!dd->comp_vect) { 1363 ret = -ENOMEM; 1364 goto bail; 1365 } 1366 1367 kobject_init(&dd->kobj, &hfi1_devdata_type); 1368 return dd; 1369 1370 bail: 1371 hfi1_clean_devdata(dd); 1372 return ERR_PTR(ret); 1373 } 1374 1375 /* 1376 * Called from freeze mode handlers, and from PCI error 1377 * reporting code. Should be paranoid about state of 1378 * system and data structures. 1379 */ 1380 void hfi1_disable_after_error(struct hfi1_devdata *dd) 1381 { 1382 if (dd->flags & HFI1_INITTED) { 1383 u32 pidx; 1384 1385 dd->flags &= ~HFI1_INITTED; 1386 if (dd->pport) 1387 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1388 struct hfi1_pportdata *ppd; 1389 1390 ppd = dd->pport + pidx; 1391 if (dd->flags & HFI1_PRESENT) 1392 set_link_state(ppd, HLS_DN_DISABLE); 1393 1394 if (ppd->statusp) 1395 *ppd->statusp &= ~HFI1_STATUS_IB_READY; 1396 } 1397 } 1398 1399 /* 1400 * Mark as having had an error for driver, and also 1401 * for /sys and status word mapped to user programs. 1402 * This marks unit as not usable, until reset. 1403 */ 1404 if (dd->status) 1405 dd->status->dev |= HFI1_STATUS_HWERROR; 1406 } 1407 1408 static void remove_one(struct pci_dev *); 1409 static int init_one(struct pci_dev *, const struct pci_device_id *); 1410 static void shutdown_one(struct pci_dev *); 1411 1412 #define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: " 1413 #define PFX DRIVER_NAME ": " 1414 1415 const struct pci_device_id hfi1_pci_tbl[] = { 1416 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) }, 1417 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) }, 1418 { 0, } 1419 }; 1420 1421 MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl); 1422 1423 static struct pci_driver hfi1_pci_driver = { 1424 .name = DRIVER_NAME, 1425 .probe = init_one, 1426 .remove = remove_one, 1427 .shutdown = shutdown_one, 1428 .id_table = hfi1_pci_tbl, 1429 .err_handler = &hfi1_pci_err_handler, 1430 }; 1431 1432 static void __init compute_krcvqs(void) 1433 { 1434 int i; 1435 1436 for (i = 0; i < krcvqsset; i++) 1437 n_krcvqs += krcvqs[i]; 1438 } 1439 1440 /* 1441 * Do all the generic driver unit- and chip-independent memory 1442 * allocation and initialization. 
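 * This runs once at module load: it initializes the driver's global
 * state (dev_init(), node_affinity_init()), sanitizes the module
 * parameters, and finally registers the PCI driver.  Per-device setup
 * happens later in init_one(), the PCI probe routine.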
1443 */ 1444 static int __init hfi1_mod_init(void) 1445 { 1446 int ret; 1447 1448 ret = dev_init(); 1449 if (ret) 1450 goto bail; 1451 1452 ret = node_affinity_init(); 1453 if (ret) 1454 goto bail; 1455 1456 /* validate max MTU before any devices start */ 1457 if (!valid_opa_max_mtu(hfi1_max_mtu)) { 1458 pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n", 1459 hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU); 1460 hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; 1461 } 1462 /* valid CUs run from 1-128 in powers of 2 */ 1463 if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu)) 1464 hfi1_cu = 1; 1465 /* valid credit return threshold is 0-100, variable is unsigned */ 1466 if (user_credit_return_threshold > 100) 1467 user_credit_return_threshold = 100; 1468 1469 compute_krcvqs(); 1470 /* 1471 * sanitize receive interrupt count, time must wait until after 1472 * the hardware type is known 1473 */ 1474 if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK) 1475 rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK; 1476 /* reject invalid combinations */ 1477 if (rcv_intr_count == 0 && rcv_intr_timeout == 0) { 1478 pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n"); 1479 rcv_intr_count = 1; 1480 } 1481 if (rcv_intr_count > 1 && rcv_intr_timeout == 0) { 1482 /* 1483 * Avoid indefinite packet delivery by requiring a timeout 1484 * if count is > 1. 1485 */ 1486 pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n"); 1487 rcv_intr_timeout = 1; 1488 } 1489 if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) { 1490 /* 1491 * The dynamic algorithm expects a non-zero timeout 1492 * and a count > 1. 1493 */ 1494 pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n"); 1495 rcv_intr_dynamic = 0; 1496 } 1497 1498 /* sanitize link CRC options */ 1499 link_crc_mask &= SUPPORTED_CRCS; 1500 1501 /* 1502 * These must be called before the driver is registered with 1503 * the PCI subsystem. 1504 */ 1505 idr_init(&hfi1_unit_table); 1506 1507 hfi1_dbg_init(); 1508 ret = pci_register_driver(&hfi1_pci_driver); 1509 if (ret < 0) { 1510 pr_err("Unable to register driver: error %d\n", -ret); 1511 goto bail_dev; 1512 } 1513 goto bail; /* all OK */ 1514 1515 bail_dev: 1516 hfi1_dbg_exit(); 1517 idr_destroy(&hfi1_unit_table); 1518 dev_cleanup(); 1519 bail: 1520 return ret; 1521 } 1522 1523 module_init(hfi1_mod_init); 1524 1525 /* 1526 * Do the non-unit driver cleanup, memory free, etc. at unload. 
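 * This mirrors hfi1_mod_init() in reverse: the PCI driver is
 * unregistered first, so no new probes can start while the remaining
 * global state is torn down.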
1527 */ 1528 static void __exit hfi1_mod_cleanup(void) 1529 { 1530 pci_unregister_driver(&hfi1_pci_driver); 1531 node_affinity_destroy_all(); 1532 hfi1_dbg_exit(); 1533 1534 idr_destroy(&hfi1_unit_table); 1535 dispose_firmware(); /* asymmetric with obtain_firmware() */ 1536 dev_cleanup(); 1537 } 1538 1539 module_exit(hfi1_mod_cleanup); 1540 1541 /* this can only be called after a successful initialization */ 1542 static void cleanup_device_data(struct hfi1_devdata *dd) 1543 { 1544 int ctxt; 1545 int pidx; 1546 1547 /* users can't do anything more with chip */ 1548 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1549 struct hfi1_pportdata *ppd = &dd->pport[pidx]; 1550 struct cc_state *cc_state; 1551 int i; 1552 1553 if (ppd->statusp) 1554 *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT; 1555 1556 for (i = 0; i < OPA_MAX_SLS; i++) 1557 hrtimer_cancel(&ppd->cca_timer[i].hrtimer); 1558 1559 spin_lock(&ppd->cc_state_lock); 1560 cc_state = get_cc_state_protected(ppd); 1561 RCU_INIT_POINTER(ppd->cc_state, NULL); 1562 spin_unlock(&ppd->cc_state_lock); 1563 1564 if (cc_state) 1565 kfree_rcu(cc_state, rcu); 1566 } 1567 1568 free_credit_return(dd); 1569 1570 if (dd->rcvhdrtail_dummy_kvaddr) { 1571 dma_free_coherent(&dd->pcidev->dev, sizeof(u64), 1572 (void *)dd->rcvhdrtail_dummy_kvaddr, 1573 dd->rcvhdrtail_dummy_dma); 1574 dd->rcvhdrtail_dummy_kvaddr = NULL; 1575 } 1576 1577 /* 1578 * Free any resources still in use (usually just kernel contexts) 1579 * at unload; we do for ctxtcnt, because that's what we allocate. 1580 */ 1581 for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) { 1582 struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; 1583 1584 if (rcd) { 1585 hfi1_clear_tids(rcd); 1586 hfi1_free_ctxt(rcd); 1587 } 1588 } 1589 1590 kfree(dd->rcd); 1591 dd->rcd = NULL; 1592 1593 free_pio_map(dd); 1594 /* must follow rcv context free - need to remove rcv's hooks */ 1595 for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++) 1596 sc_free(dd->send_contexts[ctxt].sc); 1597 dd->num_send_contexts = 0; 1598 kfree(dd->send_contexts); 1599 dd->send_contexts = NULL; 1600 kfree(dd->hw_to_sw); 1601 dd->hw_to_sw = NULL; 1602 kfree(dd->boardname); 1603 vfree(dd->events); 1604 vfree(dd->status); 1605 } 1606 1607 /* 1608 * Clean up on unit shutdown, or error during unit load after 1609 * successful initialization. 
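 * Called from remove_one() and from the failure path of init_one().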
1610 */ 1611 static void postinit_cleanup(struct hfi1_devdata *dd) 1612 { 1613 hfi1_start_cleanup(dd); 1614 hfi1_comp_vectors_clean_up(dd); 1615 hfi1_dev_affinity_clean_up(dd); 1616 1617 hfi1_pcie_ddcleanup(dd); 1618 hfi1_pcie_cleanup(dd->pcidev); 1619 1620 cleanup_device_data(dd); 1621 1622 hfi1_free_devdata(dd); 1623 } 1624 1625 static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt) 1626 { 1627 if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { 1628 dd_dev_err(dd, "Receive header queue count too small\n"); 1629 return -EINVAL; 1630 } 1631 1632 if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { 1633 dd_dev_err(dd, 1634 "Receive header queue count cannot be greater than %u\n", 1635 HFI1_MAX_HDRQ_EGRBUF_CNT); 1636 return -EINVAL; 1637 } 1638 1639 if (thecnt % HDRQ_INCREMENT) { 1640 dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n", 1641 thecnt, HDRQ_INCREMENT); 1642 return -EINVAL; 1643 } 1644 1645 return 0; 1646 } 1647 1648 static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) 1649 { 1650 int ret = 0, j, pidx, initfail; 1651 struct hfi1_devdata *dd; 1652 struct hfi1_pportdata *ppd; 1653 1654 /* First, lock the non-writable module parameters */ 1655 HFI1_CAP_LOCK(); 1656 1657 /* Validate dev ids */ 1658 if (!(ent->device == PCI_DEVICE_ID_INTEL0 || 1659 ent->device == PCI_DEVICE_ID_INTEL1)) { 1660 dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n", 1661 ent->device); 1662 ret = -ENODEV; 1663 goto bail; 1664 } 1665 1666 /* Allocate the dd so we can get to work */ 1667 dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * 1668 sizeof(struct hfi1_pportdata)); 1669 if (IS_ERR(dd)) { 1670 ret = PTR_ERR(dd); 1671 goto bail; 1672 } 1673 1674 /* Validate some global module parameters */ 1675 ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt); 1676 if (ret) 1677 goto bail; 1678 1679 /* use the encoding function as a sanitization check */ 1680 if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { 1681 dd_dev_err(dd, "Invalid HdrQ Entry size %u\n", 1682 hfi1_hdrq_entsize); 1683 ret = -EINVAL; 1684 goto bail; 1685 } 1686 1687 /* The receive eager buffer size must be set before the receive 1688 * contexts are created. 1689 * 1690 * Set the eager buffer size. Validate that it falls in a range 1691 * allowed by the hardware - all powers of 2 between the min and 1692 * max. The maximum valid MTU is within the eager buffer range 1693 * so we do not need to cap the max_mtu by an eager buffer size 1694 * setting. 1695 */ 1696 if (eager_buffer_size) { 1697 if (!is_power_of_2(eager_buffer_size)) 1698 eager_buffer_size = 1699 roundup_pow_of_two(eager_buffer_size); 1700 eager_buffer_size = 1701 clamp_val(eager_buffer_size, 1702 MIN_EAGER_BUFFER * 8, 1703 MAX_EAGER_BUFFER_TOTAL); 1704 dd_dev_info(dd, "Eager buffer size %u\n", 1705 eager_buffer_size); 1706 } else { 1707 dd_dev_err(dd, "Invalid Eager buffer size of 0\n"); 1708 ret = -EINVAL; 1709 goto bail; 1710 } 1711 1712 /* restrict value of hfi1_rcvarr_split */ 1713 hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); 1714 1715 ret = hfi1_pcie_init(dd); 1716 if (ret) 1717 goto bail; 1718 1719 /* 1720 * Do device-specific initialization, function table setup, dd 1721 * allocation, etc. 
1722 */ 1723 ret = hfi1_init_dd(dd); 1724 if (ret) 1725 goto clean_bail; /* error already printed */ 1726 1727 ret = create_workqueues(dd); 1728 if (ret) 1729 goto clean_bail; 1730 1731 /* do the generic initialization */ 1732 initfail = hfi1_init(dd, 0); 1733 1734 /* setup vnic */ 1735 hfi1_vnic_setup(dd); 1736 1737 ret = hfi1_register_ib_device(dd); 1738 1739 /* 1740 * Now ready for use. this should be cleared whenever we 1741 * detect a reset, or initiate one. If earlier failure, 1742 * we still create devices, so diags, etc. can be used 1743 * to determine cause of problem. 1744 */ 1745 if (!initfail && !ret) { 1746 dd->flags |= HFI1_INITTED; 1747 /* create debufs files after init and ib register */ 1748 hfi1_dbg_ibdev_init(&dd->verbs_dev); 1749 } 1750 1751 j = hfi1_device_create(dd); 1752 if (j) 1753 dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j); 1754 1755 if (initfail || ret) { 1756 msix_clean_up_interrupts(dd); 1757 stop_timers(dd); 1758 flush_workqueue(ib_wq); 1759 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1760 hfi1_quiet_serdes(dd->pport + pidx); 1761 ppd = dd->pport + pidx; 1762 if (ppd->hfi1_wq) { 1763 destroy_workqueue(ppd->hfi1_wq); 1764 ppd->hfi1_wq = NULL; 1765 } 1766 if (ppd->link_wq) { 1767 destroy_workqueue(ppd->link_wq); 1768 ppd->link_wq = NULL; 1769 } 1770 } 1771 if (!j) 1772 hfi1_device_remove(dd); 1773 if (!ret) 1774 hfi1_unregister_ib_device(dd); 1775 hfi1_vnic_cleanup(dd); 1776 postinit_cleanup(dd); 1777 if (initfail) 1778 ret = initfail; 1779 goto bail; /* everything already cleaned */ 1780 } 1781 1782 sdma_start(dd); 1783 1784 return 0; 1785 1786 clean_bail: 1787 hfi1_pcie_cleanup(pdev); 1788 bail: 1789 return ret; 1790 } 1791 1792 static void wait_for_clients(struct hfi1_devdata *dd) 1793 { 1794 /* 1795 * Remove the device init value and complete the device if there is 1796 * no clients or wait for active clients to finish. 1797 */ 1798 if (atomic_dec_and_test(&dd->user_refcount)) 1799 complete(&dd->user_comp); 1800 1801 wait_for_completion(&dd->user_comp); 1802 } 1803 1804 static void remove_one(struct pci_dev *pdev) 1805 { 1806 struct hfi1_devdata *dd = pci_get_drvdata(pdev); 1807 1808 /* close debugfs files before ib unregister */ 1809 hfi1_dbg_ibdev_exit(&dd->verbs_dev); 1810 1811 /* remove the /dev hfi1 interface */ 1812 hfi1_device_remove(dd); 1813 1814 /* wait for existing user space clients to finish */ 1815 wait_for_clients(dd); 1816 1817 /* unregister from IB core */ 1818 hfi1_unregister_ib_device(dd); 1819 1820 /* cleanup vnic */ 1821 hfi1_vnic_cleanup(dd); 1822 1823 /* 1824 * Disable the IB link, disable interrupts on the device, 1825 * clear dma engines, etc. 1826 */ 1827 shutdown_device(dd); 1828 1829 stop_timers(dd); 1830 1831 /* wait until all of our (qsfp) queue_work() calls complete */ 1832 flush_workqueue(ib_wq); 1833 1834 postinit_cleanup(dd); 1835 } 1836 1837 static void shutdown_one(struct pci_dev *pdev) 1838 { 1839 struct hfi1_devdata *dd = pci_get_drvdata(pdev); 1840 1841 shutdown_device(dd); 1842 } 1843 1844 /** 1845 * hfi1_create_rcvhdrq - create a receive header queue 1846 * @dd: the hfi1_ib device 1847 * @rcd: the context data 1848 * 1849 * This must be contiguous memory (from an i/o perspective), and must be 1850 * DMA'able (which means for some systems, it will go through an IOMMU, 1851 * or be forced into a low address range). 
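 * Rough sizing example, ignoring any alignment padding applied by
 * rcvhdrq_size(): with the default module parameters, rcvhdrcnt=2048
 * and hdrq_entsize=32 (128-byte entries), the ring is about
 * 2048 * 128 bytes = 256 KB of coherent DMA memory per context.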
1852 */ 1853 int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) 1854 { 1855 unsigned amt; 1856 u64 reg; 1857 1858 if (!rcd->rcvhdrq) { 1859 gfp_t gfp_flags; 1860 1861 amt = rcvhdrq_size(rcd); 1862 1863 if (rcd->ctxt < dd->first_dyn_alloc_ctxt || rcd->is_vnic) 1864 gfp_flags = GFP_KERNEL; 1865 else 1866 gfp_flags = GFP_USER; 1867 rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt, 1868 &rcd->rcvhdrq_dma, 1869 gfp_flags | __GFP_COMP); 1870 1871 if (!rcd->rcvhdrq) { 1872 dd_dev_err(dd, 1873 "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", 1874 amt, rcd->ctxt); 1875 goto bail; 1876 } 1877 1878 if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) || 1879 HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) { 1880 rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev, 1881 PAGE_SIZE, 1882 &rcd->rcvhdrqtailaddr_dma, 1883 gfp_flags); 1884 if (!rcd->rcvhdrtail_kvaddr) 1885 goto bail_free; 1886 } 1887 } 1888 /* 1889 * These values are per-context: 1890 * RcvHdrCnt 1891 * RcvHdrEntSize 1892 * RcvHdrSize 1893 */ 1894 reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT) 1895 & RCV_HDR_CNT_CNT_MASK) 1896 << RCV_HDR_CNT_CNT_SHIFT; 1897 write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg); 1898 reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize) 1899 & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK) 1900 << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT; 1901 write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg); 1902 reg = ((u64)DEFAULT_RCVHDRSIZE & RCV_HDR_SIZE_HDR_SIZE_MASK) 1903 << RCV_HDR_SIZE_HDR_SIZE_SHIFT; 1904 write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg); 1905 1906 /* 1907 * Program dummy tail address for every receive context 1908 * before enabling any receive context 1909 */ 1910 write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR, 1911 dd->rcvhdrtail_dummy_dma); 1912 1913 return 0; 1914 1915 bail_free: 1916 dd_dev_err(dd, 1917 "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", 1918 rcd->ctxt); 1919 dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, 1920 rcd->rcvhdrq_dma); 1921 rcd->rcvhdrq = NULL; 1922 bail: 1923 return -ENOMEM; 1924 } 1925 1926 /** 1927 * allocate eager buffers, both kernel and user contexts. 1928 * @rcd: the context we are setting up. 1929 * 1930 * Allocate the eager TID buffers and program them into hip. 1931 * They are no longer completely contiguous, we do multiple allocation 1932 * calls. Otherwise we get the OOM code involved, by asking for too 1933 * much per call, with disastrous results on some kernels. 1934 */ 1935 int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) 1936 { 1937 struct hfi1_devdata *dd = rcd->dd; 1938 u32 max_entries, egrtop, alloced_bytes = 0; 1939 gfp_t gfp_flags; 1940 u16 order, idx = 0; 1941 int ret = 0; 1942 u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu); 1943 1944 /* 1945 * GFP_USER, but without GFP_FS, so buffer cache can be 1946 * coalesced (we hope); otherwise, even at order 4, 1947 * heavy filesystem activity makes these fail, and we can 1948 * use compound pages. 1949 */ 1950 gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; 1951 1952 /* 1953 * The minimum size of the eager buffers is a groups of MTU-sized 1954 * buffers. 1955 * The global eager_buffer_size parameter is checked against the 1956 * theoretical lower limit of the value. Here, we check against the 1957 * MTU. 
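 * For example (illustrative numbers): with a 10 KB maximum MTU,
 * round_mtu is 16 KB, so with 8-entry RcvArray groups the per-context
 * eager pool is never sized below 16 KB * 8 = 128 KB.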
1958 */ 1959 if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size)) 1960 rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size; 1961 /* 1962 * If using one-pkt-per-egr-buffer, lower the eager buffer 1963 * size to the max MTU (page-aligned). 1964 */ 1965 if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) 1966 rcd->egrbufs.rcvtid_size = round_mtu; 1967 1968 /* 1969 * Eager buffers sizes of 1MB or less require smaller TID sizes 1970 * to satisfy the "multiple of 8 RcvArray entries" requirement. 1971 */ 1972 if (rcd->egrbufs.size <= (1 << 20)) 1973 rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu, 1974 rounddown_pow_of_two(rcd->egrbufs.size / 8)); 1975 1976 while (alloced_bytes < rcd->egrbufs.size && 1977 rcd->egrbufs.alloced < rcd->egrbufs.count) { 1978 rcd->egrbufs.buffers[idx].addr = 1979 dma_alloc_coherent(&dd->pcidev->dev, 1980 rcd->egrbufs.rcvtid_size, 1981 &rcd->egrbufs.buffers[idx].dma, 1982 gfp_flags); 1983 if (rcd->egrbufs.buffers[idx].addr) { 1984 rcd->egrbufs.buffers[idx].len = 1985 rcd->egrbufs.rcvtid_size; 1986 rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr = 1987 rcd->egrbufs.buffers[idx].addr; 1988 rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma = 1989 rcd->egrbufs.buffers[idx].dma; 1990 rcd->egrbufs.alloced++; 1991 alloced_bytes += rcd->egrbufs.rcvtid_size; 1992 idx++; 1993 } else { 1994 u32 new_size, i, j; 1995 u64 offset = 0; 1996 1997 /* 1998 * Fail the eager buffer allocation if: 1999 * - we are already using the lowest acceptable size 2000 * - we are using one-pkt-per-egr-buffer (this implies 2001 * that we are accepting only one size) 2002 */ 2003 if (rcd->egrbufs.rcvtid_size == round_mtu || 2004 !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) { 2005 dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n", 2006 rcd->ctxt); 2007 ret = -ENOMEM; 2008 goto bail_rcvegrbuf_phys; 2009 } 2010 2011 new_size = rcd->egrbufs.rcvtid_size / 2; 2012 2013 /* 2014 * If the first attempt to allocate memory failed, don't 2015 * fail everything but continue with the next lower 2016 * size. 2017 */ 2018 if (idx == 0) { 2019 rcd->egrbufs.rcvtid_size = new_size; 2020 continue; 2021 } 2022 2023 /* 2024 * Re-partition already allocated buffers to a smaller 2025 * size. 2026 */ 2027 rcd->egrbufs.alloced = 0; 2028 for (i = 0, j = 0, offset = 0; j < idx; i++) { 2029 if (i >= rcd->egrbufs.count) 2030 break; 2031 rcd->egrbufs.rcvtids[i].dma = 2032 rcd->egrbufs.buffers[j].dma + offset; 2033 rcd->egrbufs.rcvtids[i].addr = 2034 rcd->egrbufs.buffers[j].addr + offset; 2035 rcd->egrbufs.alloced++; 2036 if ((rcd->egrbufs.buffers[j].dma + offset + 2037 new_size) == 2038 (rcd->egrbufs.buffers[j].dma + 2039 rcd->egrbufs.buffers[j].len)) { 2040 j++; 2041 offset = 0; 2042 } else { 2043 offset += new_size; 2044 } 2045 } 2046 rcd->egrbufs.rcvtid_size = new_size; 2047 } 2048 } 2049 rcd->egrbufs.numbufs = idx; 2050 rcd->egrbufs.size = alloced_bytes; 2051 2052 hfi1_cdbg(PROC, 2053 "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n", 2054 rcd->ctxt, rcd->egrbufs.alloced, 2055 rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024); 2056 2057 /* 2058 * Set the contexts rcv array head update threshold to the closest 2059 * power of 2 (so we can use a mask instead of modulo) below half 2060 * the allocated entries. 2061 */ 2062 rcd->egrbufs.threshold = 2063 rounddown_pow_of_two(rcd->egrbufs.alloced / 2); 2064 /* 2065 * Compute the expected RcvArray entry base. 
This is done after 2066 * allocating the eager buffers in order to maximize the 2067 * expected RcvArray entries for the context. 2068 */ 2069 max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size; 2070 egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size); 2071 rcd->expected_count = max_entries - egrtop; 2072 if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2) 2073 rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2; 2074 2075 rcd->expected_base = rcd->eager_base + egrtop; 2076 hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n", 2077 rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count, 2078 rcd->eager_base, rcd->expected_base); 2079 2080 if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) { 2081 hfi1_cdbg(PROC, 2082 "ctxt%u: current Eager buffer size is invalid %u\n", 2083 rcd->ctxt, rcd->egrbufs.rcvtid_size); 2084 ret = -EINVAL; 2085 goto bail_rcvegrbuf_phys; 2086 } 2087 2088 for (idx = 0; idx < rcd->egrbufs.alloced; idx++) { 2089 hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER, 2090 rcd->egrbufs.rcvtids[idx].dma, order); 2091 cond_resched(); 2092 } 2093 2094 return 0; 2095 2096 bail_rcvegrbuf_phys: 2097 for (idx = 0; idx < rcd->egrbufs.alloced && 2098 rcd->egrbufs.buffers[idx].addr; 2099 idx++) { 2100 dma_free_coherent(&dd->pcidev->dev, 2101 rcd->egrbufs.buffers[idx].len, 2102 rcd->egrbufs.buffers[idx].addr, 2103 rcd->egrbufs.buffers[idx].dma); 2104 rcd->egrbufs.buffers[idx].addr = NULL; 2105 rcd->egrbufs.buffers[idx].dma = 0; 2106 rcd->egrbufs.buffers[idx].len = 0; 2107 } 2108 2109 return ret; 2110 } 2111
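/*
 * Worked example for the expected-RcvArray math in hfi1_setup_eagerbufs()
 * above (made-up numbers): with rcv_array_groups = 24 and an 8-entry
 * group_size, max_entries = 24 * 8 = 192; if 64 eager buffers were
 * allocated, egrtop = roundup(64, 8) = 64, so expected_count = 192 - 64 =
 * 128 and expected_base = eager_base + 64.
 */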