/*
 * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define _GNU_SOURCE
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include <sys/param.h>
#include <sys/cpuset.h>

#include "mlx5.h"
#include "mlx5-abi.h"

#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX		0x15b3
#endif

#ifndef CPU_OR
#define CPU_OR(x, y, z) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif

#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v,	\
	  .device = d }

static struct {
	unsigned vendor;
	unsigned device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),	/* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),	/* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 4125),	/* ConnectX-6 DX */
	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
};

uint32_t mlx5_debug_mask = 0;
int mlx5_freeze_on_error_cqe;

static struct ibv_context_ops mlx5_ctx_ops = {
	.query_device  = mlx5_query_device,
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr        = mlx5_reg_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast
};

static int read_number_from_line(const char *line, int *value)
{
	const char *ptr;

	ptr = strchr(line, ':');
	if (!ptr)
		return 1;

	++ptr;

	*value = atoi(ptr);
	return 0;
}

/**
 * The function looks for the first free user-index in all the
 * user-index tables.  If all are used, it returns -1; otherwise it
 * returns a valid user-index.
 * When the reference count of a table is zero, the table is not in
 * use and has not been allocated yet; in that case mlx5_store_uidx()
 * allocates the table and increments its reference count.
 */
static int32_t get_free_uidx(struct mlx5_context *ctx)
{
	int32_t tind;
	int32_t i;

	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
			break;
	}

	if (tind == MLX5_UIDX_TABLE_SIZE)
		return -1;

	if (!ctx->uidx_table[tind].refcnt)
		return tind << MLX5_UIDX_TABLE_SHIFT;

	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
		if (!ctx->uidx_table[tind].table[i])
			break;
	}

	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
}

int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
{
	int32_t tind;
	int32_t ret = -1;
	int32_t uidx;

	pthread_mutex_lock(&ctx->uidx_table_mutex);
	uidx = get_free_uidx(ctx);
	if (uidx < 0)
		goto out;

	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	if (!ctx->uidx_table[tind].refcnt) {
		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
						     sizeof(struct mlx5_resource *));
		if (!ctx->uidx_table[tind].table)
			goto out;
	}

	++ctx->uidx_table[tind].refcnt;
	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
	ret = uidx;

out:
	pthread_mutex_unlock(&ctx->uidx_table_mutex);
	return ret;
}

void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
{
	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	pthread_mutex_lock(&ctx->uidx_table_mutex);

	if (!--ctx->uidx_table[tind].refcnt)
		free(ctx->uidx_table[tind].table);
	else
		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;

	pthread_mutex_unlock(&ctx->uidx_table_mutex);
}

static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	*num_cores = 0;

	while (fgets(line, 128, fd)) {
		int value;

		/* if this is information on a new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D))
			rc = 1;
	}

	fclose(fd);
	return rc;
}

/*
man cpuset

  This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
  are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
  words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
  within a word are also in big-endian order.

  The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
  the size of the bitmask.

  Examples of the Mask Format:

     00000001                        # just bit 0 set
     40000000,00000000,00000000      # just bit 94 set
     000000ff,00000000               # bits 32-39 set
     00000000,000E3862               # 1,5,6,11-13,17-19 set

  A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:

     00000001,00000001,00010117

  The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
  bit 4, and the "7" is for bits 2, 1, and 0.
*/
static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
{
	char *p, buf[1024];
	char *env_value;
	uint32_t word;
	int i, k;

	env_value = getenv("MLX5_LOCAL_CPUS");
	if (env_value) {
		strncpy(buf, env_value, sizeof(buf) - 1);
		buf[sizeof(buf) - 1] = '\0';
	} else {
		char fname[MAXPATHLEN];

		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
			 ibv_get_device_name(ibdev));

		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
			fprintf(stderr, PFX "Warning: cannot get local cpu set: failed to open %s\n", fname);
			return;
		}
	}

	p = strrchr(buf, ',');
	if (!p)
		p = buf;

	i = 0;
	do {
		if (*p == ',') {
			*p = 0;
			p++;
		}

		word = strtoul(p, NULL, 16);

		for (k = 0; word; ++k, word >>= 1)
			if (word & 1)
				CPU_SET(k + i, cpu_set);

		if (p == buf)
			break;

		p = strrchr(buf, ',');
		if (!p)
			p = buf;

		i += 32;
	} while (i < CPU_SETSIZE);
}

static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
{
	cpuset_t my_cpus, dev_local_cpus, result_set;
	int stall_enable;
	int ret;
	int num_cores;

	if (!mlx5_is_sandy_bridge(&num_cores))
		return 0;

	/* by default enable stall on sandy bridge arch */
	stall_enable = 1;

	/*
	 * Check whether the app is bound to a cpu set that lies inside
	 * the device's local cpu set.
	 * Disable stalling if it is.
	 */

	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
	CPU_ZERO(&my_cpus);
	CPU_ZERO(&dev_local_cpus);
	CPU_ZERO(&result_set);
	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
				 sizeof(my_cpus), &my_cpus);
	if (ret == -1) {
		if (errno == EINVAL)
			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
		else
			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
		goto out;
	}

	/* get device local cpu set */
	mlx5_local_cpu_set(ibdev, &dev_local_cpus);

	/* check if my cpu set is in dev cpu */
#if __FreeBSD_version < 1400046
	CPU_OR(&result_set, &my_cpus);
	CPU_OR(&result_set, &dev_local_cpus);
#else
	CPU_OR(&result_set, &my_cpus, &dev_local_cpus);
#endif
	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;

out:
	return stall_enable;
}

static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
{
	char *env_value;

	env_value = getenv("MLX5_STALL_CQ_POLL");
	if (env_value)
		/* check if cq stall is enforced by user */
		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
	else
		/* autodetect if we need to do cq polling */
		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);

	env_value = getenv("MLX5_STALL_NUM_LOOP");
	if (env_value)
		mlx5_stall_num_loop = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
	if (env_value)
		mlx5_stall_cq_poll_min = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
	if (env_value)
		mlx5_stall_cq_poll_max = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
	if (env_value)
		mlx5_stall_cq_inc_step = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
	if (env_value)
		mlx5_stall_cq_dec_step = atoi(env_value);

	ctx->stall_adaptive_enable = 0;
	ctx->stall_cycles = 0;

	if (mlx5_stall_num_loop < 0) {
		ctx->stall_adaptive_enable = 1;
		ctx->stall_cycles = mlx5_stall_cq_poll_min;
	}
}

static int get_total_uuars(int page_size)
{
	int size = MLX5_DEF_TOT_UUARS;
	int uuars_in_page;
	char *env;

	env = getenv("MLX5_TOTAL_UUARS");
	if (env)
		size = atoi(env);

	if (size < 1)
		return -EINVAL;

	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
	size = max(uuars_in_page, size);
	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	if (size > MLX5_MAX_BFREGS)
		return -ENOMEM;

	return size;
}

static void open_debug_file(struct mlx5_context *ctx)
{
	char *env;

	env = getenv("MLX5_DEBUG_FILE");
	if (!env) {
		ctx->dbg_fp = stderr;
		return;
	}

	ctx->dbg_fp = fopen(env, "a+");
	if (!ctx->dbg_fp) {
		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
		ctx->dbg_fp = stderr;
		return;
	}
}

static void close_debug_file(struct mlx5_context *ctx)
{
	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
		fclose(ctx->dbg_fp);
}

static void set_debug_mask(void)
{
	char *env;

	env = getenv("MLX5_DEBUG_MASK");
	if (env)
		mlx5_debug_mask = strtol(env, NULL, 0);
}

static void set_freeze_on_error(void)
{
	char *env;

	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
	if (env)
		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
}

static int get_always_bf(void)
{
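	/* Prefer writing post-send doorbells through the BlueFlame register
	 * unless MLX5_POST_SEND_PREFER_BF is explicitly set to "0". */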
	char *env;

	env = getenv("MLX5_POST_SEND_PREFER_BF");
	if (!env)
		return 1;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_shut_up_bf(void)
{
	char *env;

	env = getenv("MLX5_SHUT_UP_BF");
	if (!env)
		return 0;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_num_low_lat_uuars(int tot_uuars)
{
	char *env;
	int num = 4;

	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
	if (env)
		num = atoi(env);

	if (num < 0)
		return -EINVAL;

	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
	return num;
}

/* The library allocates an array of uuar contexts.  The one at index
 * zero does not exercise the odd/even policy, so it can avoid a lock,
 * but it may not use blue flame.  The upper low_lat_uuars entries can
 * use blue flame without a lock, since each is assigned to exactly one
 * QP.  The rest can use blue flame, but because they are shared they
 * need a lock.
 */
static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
{
	if (uuarn == 0 || mlx5_single_threaded)
		return 0;

	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
		return 0;

	return 1;
}

static int single_threaded_app(void)
{
	char *env;

	env = getenv("MLX5_SINGLE_THREADED");
	if (env)
		return strcmp(env, "1") ? 0 : 1;

	return 0;
}

static int mlx5_cmd_get_context(struct mlx5_context *context,
				struct mlx5_alloc_ucontext *req,
				size_t req_len,
				struct mlx5_alloc_ucontext_resp *resp,
				size_t resp_len)
{
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 req_len, &resp->ibv_resp, resp_len))
		return 0;

	/* ibv_cmd_get_context() fails on older kernels when it is passed
	 * a request length that the kernel doesn't know.
	 * To avoid breaking compatibility between new libmlx5 and older
	 * kernels, when ibv_cmd_get_context() fails with the full
	 * request length, we try once again with the legacy length.
	 * We repeat this process while reducing the requested size based
	 * on the feature input size.  To avoid this in the future, we
	 * will remove the check in the kernel that requires fields unknown
	 * to the kernel to be cleared.  This will require that any new
	 * feature that involves extending struct mlx5_alloc_ucontext
	 * will be accompanied by an indication in the form of one or
	 * more fields in struct mlx5_alloc_ucontext_resp.  If the
	 * response value can be interpreted as "feature not supported"
	 * when the returned value is zero, this will suffice to
	 * indicate to the library that the request was ignored by the
	 * kernel, either because it is unaware or because it decided
	 * to do so.  If zero is a valid response, we will add a new
	 * field that indicates whether the request was handled.
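	 *
	 * Concretely, the calls below retry with progressively smaller
	 * request sizes: the full size first, then a size that stops
	 * before the lib_caps field, and finally one that stops before
	 * cqe_version.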
	 */
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
				 &resp->ibv_resp, resp_len))
		return 0;

	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				   offsetof(struct mlx5_alloc_ucontext,
					    cqe_version),
				   &resp->ibv_resp, resp_len);
}

static int mlx5_map_internal_clock(struct mlx5_device *mdev,
				   struct ibv_context *ibv_ctx)
{
	struct mlx5_context *context = to_mctx(ibv_ctx);
	void *hca_clock_page;
	off_t offset = 0;

	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
	hca_clock_page = mmap(NULL, mdev->page_size,
			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
			      mdev->page_size * offset);

	if (hca_clock_page == MAP_FAILED) {
		fprintf(stderr, PFX
			"Warning: Timestamp available,\n"
			"but failed to mmap() hca core clock page.\n");
		return -1;
	}

	context->hca_core_clock = hca_clock_page +
		(context->core_clock.offset & (mdev->page_size - 1));
	return 0;
}

int mlx5dv_query_device(struct ibv_context *ctx_in,
			struct mlx5dv_context *attrs_out)
{
	struct mlx5_context *mctx = to_mctx(ctx_in);
	uint64_t comp_mask_out = 0;

	attrs_out->version = 0;
	attrs_out->flags = 0;

	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
	}

	attrs_out->comp_mask = comp_mask_out;

	return 0;
}

static int mlx5dv_get_qp(struct ibv_qp *qp_in,
			 struct mlx5dv_qp *qp_out)
{
	struct mlx5_qp *mqp = to_mqp(qp_in);

	qp_out->comp_mask = 0;
	qp_out->dbrec = mqp->db;

	if (mqp->sq_buf_size)
		/* IBV_QPT_RAW_PACKET */
		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
	else
		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
	qp_out->sq.stride = 1 << mqp->sq.wqe_shift;

	qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
	qp_out->rq.stride = 1 << mqp->rq.wqe_shift;

	qp_out->bf.reg = mqp->bf->reg;

	if (mqp->bf->uuarn > 0)
		qp_out->bf.size = mqp->bf->buf_size;
	else
		qp_out->bf.size = 0;

	return 0;
}

static int mlx5dv_get_cq(struct ibv_cq *cq_in,
			 struct mlx5dv_cq *cq_out)
{
	struct mlx5_cq *mcq = to_mcq(cq_in);
	struct mlx5_context *mctx = to_mctx(cq_in->context);

	cq_out->comp_mask = 0;
	cq_out->cqn = mcq->cqn;
	cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1;
	cq_out->cqe_size = mcq->cqe_sz;
	cq_out->buf = mcq->active_buf->buf;
	cq_out->dbrec = mcq->dbrec;
	cq_out->uar = mctx->uar;

	mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED;

	return 0;
}

static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
			  struct mlx5dv_rwq *rwq_out)
{
	struct mlx5_rwq *mrwq = to_mrwq(wq_in);

	rwq_out->comp_mask = 0;
	rwq_out->buf = mrwq->pbuff;
	rwq_out->dbrec = mrwq->recv_db;
	rwq_out->wqe_cnt = mrwq->rq.wqe_cnt;
	rwq_out->stride = 1 << mrwq->rq.wqe_shift;

	return 0;
}

static int mlx5dv_get_srq(struct ibv_srq *srq_in,
			  struct mlx5dv_srq *srq_out)
{
	struct mlx5_srq *msrq;

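	/* srq_in is the ibv_srq embedded in the provider's verbs_srq;
	 * recover the enclosing mlx5_srq container from it. */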
	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);

	srq_out->comp_mask = 0;
	srq_out->buf = msrq->buf.buf;
	srq_out->dbrec = msrq->db;
	srq_out->stride = 1 << msrq->wqe_shift;
	srq_out->head = msrq->head;
	srq_out->tail = msrq->tail;

	return 0;
}

int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
	int ret = 0;

	if (obj_type & MLX5DV_OBJ_QP)
		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);

	return ret;
}

static void adjust_uar_info(struct mlx5_device *mdev,
			    struct mlx5_context *context,
			    struct mlx5_alloc_ucontext_resp resp)
{
	if (!resp.log_uar_size && !resp.num_uars_per_page) {
		/* old kernel */
		context->uar_size = mdev->page_size;
		context->num_uars_per_page = 1;
		return;
	}

	context->uar_size = 1 << resp.log_uar_size;
	context->num_uars_per_page = resp.num_uars_per_page;
}

static int mlx5_init_context(struct verbs_device *vdev,
			     struct ibv_context *ctx, int cmd_fd)
{
	struct mlx5_context *context;
	struct mlx5_alloc_ucontext req;
	struct mlx5_alloc_ucontext_resp resp;
	int i;
	int page_size;
	int tot_uuars;
	int low_lat_uuars;
	int gross_uuars;
	int j;
	off_t offset;
	struct mlx5_device *mdev;
	struct verbs_context *v_ctx;
	struct ibv_port_attr port_attr;
	struct ibv_device_attr_ex device_attr;
	int k;
	int bfi;
	int num_sys_page_map;

	mdev = to_mdev(&vdev->device);
	v_ctx = verbs_get_ctx(ctx);
	page_size = mdev->page_size;
	mlx5_single_threaded = single_threaded_app();

	context = to_mctx(ctx);
	context->ibv_ctx.cmd_fd = cmd_fd;

	open_debug_file(context);
	set_debug_mask();
	set_freeze_on_error();
	if (gethostname(context->hostname, sizeof(context->hostname)))
		strcpy(context->hostname, "host_unknown");

	tot_uuars = get_total_uuars(page_size);
	if (tot_uuars < 0) {
		errno = -tot_uuars;
		goto err_free;
	}

	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
	if (low_lat_uuars < 0) {
		errno = -low_lat_uuars;
		goto err_free;
	}

	if (low_lat_uuars > tot_uuars - 1) {
		errno = ENOMEM;
		goto err_free;
	}

	memset(&req, 0, sizeof(req));
	memset(&resp, 0, sizeof(resp));

	req.total_num_uuars = tot_uuars;
	req.num_low_latency_uuars = low_lat_uuars;
	req.cqe_version = MLX5_CQE_VERSION_V1;
	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;

	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
				 sizeof(resp)))
		goto err_free;

	context->max_num_qps = resp.qp_tab_size;
	context->bf_reg_size = resp.bf_reg_size;
	context->tot_uuars = resp.tot_uuars;
	context->low_lat_uuars = low_lat_uuars;
	context->cache_line_size = resp.cache_line_size;
	context->max_sq_desc_sz = resp.max_sq_desc_sz;
	context->max_rq_desc_sz = resp.max_rq_desc_sz;
	context->max_send_wqebb = resp.max_send_wqebb;
	context->num_ports = resp.num_ports;
	context->max_recv_wr = resp.max_recv_wr;
	context->max_srq_recv_wr = resp.max_srq_recv_wr;

	context->cqe_version = resp.cqe_version;
	if (context->cqe_version) {
		if (context->cqe_version == MLX5_CQE_VERSION_V1)
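			/* CQE version 1 carries the user index in the CQE,
			 * so switch to the poll routine that parses it. */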
			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
		else
			goto err_free;
	}

	adjust_uar_info(mdev, context, resp);

	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
	if (!context->bfs) {
		errno = ENOMEM;
		goto err_free;
	}

	context->cmds_supp_uhw = resp.cmds_supp_uhw;
	context->vendor_cap_flags = 0;

	pthread_mutex_init(&context->qp_table_mutex, NULL);
	pthread_mutex_init(&context->srq_table_mutex, NULL);
	pthread_mutex_init(&context->uidx_table_mutex, NULL);
	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->qp_table[i].refcnt = 0;

	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->uidx_table[i].refcnt = 0;

	context->db_list = NULL;

	pthread_mutex_init(&context->db_list_mutex, NULL);

	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; ++i) {
		offset = 0;
		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
		set_index(i, &offset);
		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
				       cmd_fd, page_size * offset);
		if (context->uar[i] == MAP_FAILED) {
			context->uar[i] = NULL;
			goto err_free_bf;
		}
	}

	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
							MLX5_BF_OFFSET + k * context->bf_reg_size;
				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
				mlx5_spinlock_init(&context->bfs[bfi].lock);
				context->bfs[bfi].offset = 0;
				if (bfi)
					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
				context->bfs[bfi].uuarn = bfi;
			}
		}
	}

	context->hca_core_clock = NULL;
	if (resp.response_length + sizeof(resp.ibv_resp) >=
	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
	    sizeof(resp.hca_core_clock_offset) &&
	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
		context->core_clock.offset = resp.hca_core_clock_offset;
		mlx5_map_internal_clock(mdev, ctx);
	}

	mlx5_spinlock_init(&context->lock32);

	context->prefer_bf = get_always_bf();
	context->shut_up_bf = get_shut_up_bf();
	mlx5_read_env(&vdev->device, context);

	mlx5_spinlock_init(&context->hugetlb_lock);
	TAILQ_INIT(&context->hugetlb_list);

	context->ibv_ctx.ops = mlx5_ctx_ops;

	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);

	memset(&device_attr, 0, sizeof(device_attr));
	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
				  sizeof(struct ibv_device_attr_ex))) {
		context->cached_device_cap_flags =
			device_attr.orig_attr.device_cap_flags;
		context->atomic_cap = device_attr.orig_attr.atomic_cap;
		context->cached_tso_caps = device_attr.tso_caps;
	}

	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
		memset(&port_attr, 0, sizeof(port_attr));
		if (!mlx5_query_port(ctx, j + 1, &port_attr))
			context->cached_link_layer[j] = port_attr.link_layer;
	}

	return 0;

err_free_bf:
	free(context->bfs);

err_free:
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	close_debug_file(context);
	return errno;
}

static void mlx5_cleanup_context(struct verbs_device *device,
				 struct ibv_context *ibctx)
{
	struct mlx5_context *context = to_mctx(ibctx);
	int page_size = to_mdev(ibctx->device)->page_size;
	int i;

	free(context->bfs);
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	if (context->hca_core_clock)
		munmap(context->hca_core_clock - context->core_clock.offset,
		       page_size);
	close_debug_file(context);
}

static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};

static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
					     int abi_version)
{
	char value[8];
	struct mlx5_device *dev;
	unsigned vendor, device;
	int i;

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &vendor);

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &device);

	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
		if (vendor == hca_table[i].vendor &&
		    device == hca_table[i].device)
			goto found;

	return NULL;

found:
	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
			"(min supported %d, max supported %d)\n",
			abi_version, uverbs_sys_path,
			MLX5_UVERBS_MIN_ABI_VERSION,
			MLX5_UVERBS_MAX_ABI_VERSION);
		return NULL;
	}

	dev = calloc(1, sizeof *dev);
	if (!dev) {
		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
			uverbs_sys_path);
		return NULL;
	}

	dev->page_size = sysconf(_SC_PAGESIZE);
	dev->driver_abi_ver = abi_version;

	dev->verbs_dev.ops = &mlx5_dev_ops;
	dev->verbs_dev.sz = sizeof(*dev);
	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
		sizeof(struct ibv_context);

	return &dev->verbs_dev;
}

static __attribute__((constructor)) void mlx5_register_driver(void)
{
	verbs_register_driver("mlx5", mlx5_driver_init);
}
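
/*
 * Illustrative usage sketch (not compiled into the library): a consumer of
 * the direct-verbs interface implemented above typically fills a
 * struct mlx5dv_obj and calls mlx5dv_init_obj() to obtain the raw queue
 * layouts.  The "qp" and "cq" variables below are assumed to have been
 * created through the regular verbs API beforehand.
 *
 *	struct mlx5dv_qp dv_qp;
 *	struct mlx5dv_cq dv_cq;
 *	struct mlx5dv_obj obj = {
 *		.qp = { .in = qp, .out = &dv_qp },
 *		.cq = { .in = cq, .out = &dv_cq },
 *	};
 *
 *	if (!mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP | MLX5DV_OBJ_CQ)) {
 *		// dv_qp.sq.buf/wqe_cnt/stride and dv_cq.buf/cqe_cnt/cqe_size
 *		// now describe the hardware queues directly.
 *	}
 */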