/*
 * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define _GNU_SOURCE
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include <sys/param.h>
#include <sys/cpuset.h>

#include "mlx5.h"
#include "mlx5-abi.h"

#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX	0x15b3
#endif

#ifndef CPU_OR
#define CPU_OR(x, y, z) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif

#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v,	\
	  .device = d }

static struct {
	unsigned vendor;
	unsigned device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),	/* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),	/* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 4125),	/* ConnectX-6 DX */
	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
	HCA(MELLANOX, 4127),	/* ConnectX-6 LX */
	HCA(MELLANOX, 4129),	/* ConnectX-7 */
	HCA(MELLANOX, 4131),	/* ConnectX-8 */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
	HCA(MELLANOX, 41686),	/* BlueField-2 integrated ConnectX-6 Dx network controller */
	HCA(MELLANOX, 41692),	/* BlueField-3 integrated ConnectX-7 network controller */
	HCA(MELLANOX, 41695),	/* BlueField-4 integrated ConnectX-8 network controller */
};

uint32_t mlx5_debug_mask = 0;
int mlx5_freeze_on_error_cqe;

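/*
 * Legacy verbs dispatch table.  mlx5_init_context() copies this into
 * context->ibv_ctx.ops; when the kernel negotiates CQE version 1, the
 * poll_cq entry is switched to mlx5_poll_cq_v1 before the copy.
 */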
static struct ibv_context_ops mlx5_ctx_ops = {
	.query_device  = mlx5_query_device,
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr        = mlx5_reg_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast
};

static int read_number_from_line(const char *line, int *value)
{
	const char *ptr;

	ptr = strchr(line, ':');
	if (!ptr)
		return 1;

	++ptr;

	*value = atoi(ptr);
	return 0;
}

/**
 * The function looks for the first free user-index in all the
 * user-index tables.  If all are used, it returns -1, otherwise
 * it returns a valid user-index.
 * When the reference count of a table is zero, the table is not in
 * use and was not allocated yet; in that case mlx5_store_uidx
 * allocates the table and increments its reference count.
 */
static int32_t get_free_uidx(struct mlx5_context *ctx)
{
	int32_t tind;
	int32_t i;

	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
			break;
	}

	if (tind == MLX5_UIDX_TABLE_SIZE)
		return -1;

	if (!ctx->uidx_table[tind].refcnt)
		return tind << MLX5_UIDX_TABLE_SHIFT;

	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
		if (!ctx->uidx_table[tind].table[i])
			break;
	}

	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
}

int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
{
	int32_t tind;
	int32_t ret = -1;
	int32_t uidx;

	pthread_mutex_lock(&ctx->uidx_table_mutex);
	uidx = get_free_uidx(ctx);
	if (uidx < 0)
		goto out;

	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	if (!ctx->uidx_table[tind].refcnt) {
		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
						     sizeof(struct mlx5_resource *));
		if (!ctx->uidx_table[tind].table)
			goto out;
	}

	++ctx->uidx_table[tind].refcnt;
	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
	ret = uidx;

out:
	pthread_mutex_unlock(&ctx->uidx_table_mutex);
	return ret;
}

void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
{
	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	pthread_mutex_lock(&ctx->uidx_table_mutex);

	if (!--ctx->uidx_table[tind].refcnt)
		free(ctx->uidx_table[tind].table);
	else
		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;

	pthread_mutex_unlock(&ctx->uidx_table_mutex);
}

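/*
 * Detection helper for the CQ-poll stall workaround below: parse
 * /proc/cpuinfo, count "processor" entries into *num_cores, and return
 * nonzero if any CPU reports family 6, model 0x2A or 0x2D (Sandy
 * Bridge).
 */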
static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	*num_cores = 0;

	while (fgets(line, 128, fd)) {
		int value;

		/* if this is information on new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D))
			rc = 1;
	}

	fclose(fd);
	return rc;
}

/*
man cpuset

  This format displays each 32-bit word in hexadecimal (using ASCII
  characters "0" - "9" and "a" - "f"); words are filled with leading
  zeros, if required.  For masks longer than one word, a comma separator
  is used between words.  Words are displayed in big-endian order, which
  has the most significant bit first.  The hex digits within a word are
  also in big-endian order.

  The number of 32-bit words displayed is the minimum number needed to
  display all bits of the bitmask, based on the size of the bitmask.

  Examples of the Mask Format:

     00000001                        # just bit 0 set
     40000000,00000000,00000000      # just bit 94 set
     000000ff,00000000               # bits 32-39 set
     00000000,000E3862               # 1,5,6,11-13,17-19 set

  A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:

     00000001,00000001,00010117

  The first "1" is for bit 64, the second for bit 32, the third for
  bit 16, the fourth for bit 8, the fifth for bit 4, and the "7" is for
  bits 2, 1, and 0.
*/
static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
{
	char *p, buf[1024];
	char *env_value;
	uint32_t word;
	int i, k;

	env_value = getenv("MLX5_LOCAL_CPUS");
	if (env_value)
		strncpy(buf, env_value, sizeof(buf));
	else {
		char fname[MAXPATHLEN];

		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
			 ibv_get_device_name(ibdev));

		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
			fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
			return;
		}
	}

	p = strrchr(buf, ',');
	if (!p)
		p = buf;

	i = 0;
	do {
		if (*p == ',') {
			*p = 0;
			p++;
		}

		word = strtoul(p, NULL, 16);

		for (k = 0; word; ++k, word >>= 1)
			if (word & 1)
				CPU_SET(k + i, cpu_set);

		if (p == buf)
			break;

		p = strrchr(buf, ',');
		if (!p)
			p = buf;

		i += 32;
	} while (i < CPU_SETSIZE);
}

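/*
 * Decide the default CQ-poll stall policy: on Sandy Bridge systems,
 * stalling is enabled unless the process affinity is contained in the
 * device's local CPU set.  The result can still be overridden with the
 * MLX5_STALL_CQ_POLL environment variable (see mlx5_read_env()).
 */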
static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
{
	cpuset_t my_cpus, dev_local_cpus, result_set;
	int stall_enable;
	int ret;
	int num_cores;

	if (!mlx5_is_sandy_bridge(&num_cores))
		return 0;

	/* by default enable stall on sandy bridge arch */
	stall_enable = 1;

	/*
	 * check if app is bound to cpu set that is inside
	 * of device local cpu set. Disable stalling if true
	 */

	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
	CPU_ZERO(&my_cpus);
	CPU_ZERO(&dev_local_cpus);
	CPU_ZERO(&result_set);
	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
				 sizeof(my_cpus), &my_cpus);
	if (ret == -1) {
		if (errno == EINVAL)
			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
		else
			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
		goto out;
	}

	/* get device local cpu set */
	mlx5_local_cpu_set(ibdev, &dev_local_cpus);

	/* check if my cpu set is in dev cpu */
#if __FreeBSD_version < 1400046
	CPU_OR(&result_set, &my_cpus);
	CPU_OR(&result_set, &dev_local_cpus);
#else
	CPU_OR(&result_set, &my_cpus, &dev_local_cpus);
#endif
	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;

out:
	return stall_enable;
}

static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
{
	char *env_value;

	env_value = getenv("MLX5_STALL_CQ_POLL");
	if (env_value)
		/* check if cq stall is enforced by user */
		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
	else
		/* autodetect if we need to do cq polling */
		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);

	env_value = getenv("MLX5_STALL_NUM_LOOP");
	if (env_value)
		mlx5_stall_num_loop = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
	if (env_value)
		mlx5_stall_cq_poll_min = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
	if (env_value)
		mlx5_stall_cq_poll_max = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
	if (env_value)
		mlx5_stall_cq_inc_step = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
	if (env_value)
		mlx5_stall_cq_dec_step = atoi(env_value);

	ctx->stall_adaptive_enable = 0;
	ctx->stall_cycles = 0;

	if (mlx5_stall_num_loop < 0) {
		ctx->stall_adaptive_enable = 1;
		ctx->stall_cycles = mlx5_stall_cq_poll_min;
	}
}

static int get_total_uuars(int page_size)
{
	int size = MLX5_DEF_TOT_UUARS;
	int uuars_in_page;
	char *env;

	env = getenv("MLX5_TOTAL_UUARS");
	if (env)
		size = atoi(env);

	if (size < 1)
		return -EINVAL;

	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
	size = max(uuars_in_page, size);
	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	if (size > MLX5_MAX_BFREGS)
		return -ENOMEM;

	return size;
}

static void open_debug_file(struct mlx5_context *ctx)
{
	char *env;

	env = getenv("MLX5_DEBUG_FILE");
	if (!env) {
		ctx->dbg_fp = stderr;
		return;
	}

	ctx->dbg_fp = fopen(env, "aw+");
	if (!ctx->dbg_fp) {
		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
		ctx->dbg_fp = stderr;
		return;
	}
}

static void close_debug_file(struct mlx5_context *ctx)
{
	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
		fclose(ctx->dbg_fp);
}

static void set_debug_mask(void)
{
	char *env;

	env = getenv("MLX5_DEBUG_MASK");
	if (env)
		mlx5_debug_mask = strtol(env, NULL, 0);
}

static void set_freeze_on_error(void)
{
	char *env;

	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
	if (env)
		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
}

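/*
 * Blue-flame tuning knobs read from the environment:
 * MLX5_POST_SEND_PREFER_BF (default on) and MLX5_SHUT_UP_BF (default
 * off).  For both, any value other than "0" counts as set.
 */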
static int get_always_bf(void)
{
	char *env;

	env = getenv("MLX5_POST_SEND_PREFER_BF");
	if (!env)
		return 1;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_shut_up_bf(void)
{
	char *env;

	env = getenv("MLX5_SHUT_UP_BF");
	if (!env)
		return 0;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_num_low_lat_uuars(int tot_uuars)
{
	char *env;
	int num = 4;

	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
	if (env)
		num = atoi(env);

	if (num < 0)
		return -EINVAL;

	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
	return num;
}

/* The library allocates an array of uuar contexts.  The one at index
 * zero does not exercise the odd/even policy, so it can avoid a lock,
 * but it may not use blue flame.  The upper ones (low_lat_uuars) can
 * use blue flame without a lock since each is assigned to a single QP.
 * The rest can use blue flame, but since they are shared they need a
 * lock.
 */
static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
{
	if (uuarn == 0 || mlx5_single_threaded)
		return 0;

	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
		return 0;

	return 1;
}

static int single_threaded_app(void)
{
	char *env;

	env = getenv("MLX5_SINGLE_THREADED");
	if (env)
		return strcmp(env, "1") ? 0 : 1;

	return 0;
}

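/*
 * Ucontext negotiation with the kernel.  The full request is tried
 * first; if the kernel rejects it (older kernels fail on request
 * lengths they do not recognize), the request is retried truncated at
 * lib_caps and then at cqe_version, as described in the comment inside
 * the function.
 */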
static int mlx5_cmd_get_context(struct mlx5_context *context,
				struct mlx5_alloc_ucontext *req,
				size_t req_len,
				struct mlx5_alloc_ucontext_resp *resp,
				size_t resp_len)
{
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 req_len, &resp->ibv_resp, resp_len))
		return 0;

	/* ibv_cmd_get_context fails on older kernels when passed a
	 * request length that the kernel doesn't know.
	 * To avoid breaking compatibility between new libmlx5 and older
	 * kernels, when ibv_cmd_get_context fails with the full request
	 * length we try again with the legacy length, repeating the
	 * process while reducing the requested size based on the
	 * feature input size.  To avoid this in the future, we will
	 * remove the check in the kernel that requires fields unknown
	 * to the kernel to be cleared.  This will require that any new
	 * feature extending struct mlx5_alloc_ucontext be accompanied
	 * by an indication in the form of one or more fields in struct
	 * mlx5_alloc_ucontext_resp.  If a returned value of zero can be
	 * interpreted as "feature not supported", that is enough to
	 * tell the library that the request was ignored by the kernel,
	 * either because it is unaware of it or because it decided not
	 * to handle it.  If zero is a valid response, we will add a new
	 * field that indicates whether the request was handled.
	 */
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
				 &resp->ibv_resp, resp_len))
		return 0;

	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				   offsetof(struct mlx5_alloc_ucontext,
					    cqe_version),
				   &resp->ibv_resp, resp_len);
}

static int mlx5_map_internal_clock(struct mlx5_device *mdev,
				   struct ibv_context *ibv_ctx)
{
	struct mlx5_context *context = to_mctx(ibv_ctx);
	void *hca_clock_page;
	off_t offset = 0;

	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
	hca_clock_page = mmap(NULL, mdev->page_size,
			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
			      mdev->page_size * offset);

	if (hca_clock_page == MAP_FAILED) {
		fprintf(stderr, PFX
			"Warning: Timestamp available,\n"
			"but failed to mmap() hca core clock page.\n");
		return -1;
	}

	context->hca_core_clock = hca_clock_page +
		(context->core_clock.offset & (mdev->page_size - 1));
	return 0;
}

int mlx5dv_query_device(struct ibv_context *ctx_in,
			struct mlx5dv_context *attrs_out)
{
	struct mlx5_context *mctx = to_mctx(ctx_in);
	uint64_t comp_mask_out = 0;

	attrs_out->version = 0;
	attrs_out->flags = 0;

	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
	}

	attrs_out->comp_mask = comp_mask_out;

	return 0;
}

static int mlx5dv_get_qp(struct ibv_qp *qp_in,
			 struct mlx5dv_qp *qp_out)
{
	struct mlx5_qp *mqp = to_mqp(qp_in);

	qp_out->comp_mask = 0;
	qp_out->dbrec = mqp->db;

	if (mqp->sq_buf_size)
		/* IBV_QPT_RAW_PACKET */
		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
	else
		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
	qp_out->sq.stride = 1 << mqp->sq.wqe_shift;

	qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
	qp_out->rq.stride = 1 << mqp->rq.wqe_shift;

	qp_out->bf.reg = mqp->bf->reg;

	if (mqp->bf->uuarn > 0)
		qp_out->bf.size = mqp->bf->buf_size;
	else
		qp_out->bf.size = 0;

	return 0;
}

static int mlx5dv_get_cq(struct ibv_cq *cq_in,
			 struct mlx5dv_cq *cq_out)
{
	struct mlx5_cq *mcq = to_mcq(cq_in);
	struct mlx5_context *mctx = to_mctx(cq_in->context);

	cq_out->comp_mask = 0;
	cq_out->cqn = mcq->cqn;
	cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1;
	cq_out->cqe_size = mcq->cqe_sz;
	cq_out->buf = mcq->active_buf->buf;
	cq_out->dbrec = mcq->dbrec;
	cq_out->uar = mctx->uar;

	mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED;

	return 0;
}

static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
			  struct mlx5dv_rwq *rwq_out)
{
	struct mlx5_rwq *mrwq = to_mrwq(wq_in);

	rwq_out->comp_mask = 0;
	rwq_out->buf = mrwq->pbuff;
	rwq_out->dbrec = mrwq->recv_db;
	rwq_out->wqe_cnt = mrwq->rq.wqe_cnt;
	rwq_out->stride = 1 << mrwq->rq.wqe_shift;

	return 0;
}

static int mlx5dv_get_srq(struct ibv_srq *srq_in,
			  struct mlx5dv_srq *srq_out)
{
	struct mlx5_srq *msrq;

	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);

	srq_out->comp_mask = 0;
	srq_out->buf = msrq->buf.buf;
	srq_out->dbrec = msrq->db;
	srq_out->stride = 1 << msrq->wqe_shift;
	srq_out->head = msrq->head;
	srq_out->tail = msrq->tail;

	return 0;
}

int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
	int ret = 0;

	if (obj_type & MLX5DV_OBJ_QP)
		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);

	return ret;
}

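/*
 * A minimal usage sketch for the direct-verbs getters above; "qp" and
 * "cq" are assumed to have been created through the regular verbs API
 * beforehand, and a nonzero return indicates failure:
 *
 *	struct mlx5dv_qp dv_qp;
 *	struct mlx5dv_cq dv_cq;
 *	struct mlx5dv_obj obj;
 *
 *	obj.qp.in = qp;
 *	obj.qp.out = &dv_qp;
 *	obj.cq.in = cq;
 *	obj.cq.out = &dv_cq;
 *	ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP | MLX5DV_OBJ_CQ);
 *
 * On success the application can use the raw buffers, doorbell records
 * and blue-flame register exposed through dv_qp and dv_cq.
 */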
static void adjust_uar_info(struct mlx5_device *mdev,
			    struct mlx5_context *context,
			    struct mlx5_alloc_ucontext_resp resp)
{
	if (!resp.log_uar_size && !resp.num_uars_per_page) {
		/* old kernel */
		context->uar_size = mdev->page_size;
		context->num_uars_per_page = 1;
		return;
	}

	context->uar_size = 1 << resp.log_uar_size;
	context->num_uars_per_page = resp.num_uars_per_page;
}

static int mlx5_init_context(struct verbs_device *vdev,
			     struct ibv_context *ctx, int cmd_fd)
{
	struct mlx5_context *context;
	struct mlx5_alloc_ucontext req;
	struct mlx5_alloc_ucontext_resp resp;
	int i;
	int page_size;
	int tot_uuars;
	int low_lat_uuars;
	int gross_uuars;
	int j;
	off_t offset;
	struct mlx5_device *mdev;
	struct verbs_context *v_ctx;
	struct ibv_port_attr port_attr;
	struct ibv_device_attr_ex device_attr;
	int k;
	int bfi;
	int num_sys_page_map;

	mdev = to_mdev(&vdev->device);
	v_ctx = verbs_get_ctx(ctx);
	page_size = mdev->page_size;
	mlx5_single_threaded = single_threaded_app();

	context = to_mctx(ctx);
	context->ibv_ctx.cmd_fd = cmd_fd;

	open_debug_file(context);
	set_debug_mask();
	set_freeze_on_error();
	if (gethostname(context->hostname, sizeof(context->hostname)))
		strcpy(context->hostname, "host_unknown");

	tot_uuars = get_total_uuars(page_size);
	if (tot_uuars < 0) {
		errno = -tot_uuars;
		goto err_free;
	}

	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
	if (low_lat_uuars < 0) {
		errno = -low_lat_uuars;
		goto err_free;
	}

	if (low_lat_uuars > tot_uuars - 1) {
		errno = ENOMEM;
		goto err_free;
	}

	memset(&req, 0, sizeof(req));
	memset(&resp, 0, sizeof(resp));

	req.total_num_uuars = tot_uuars;
	req.num_low_latency_uuars = low_lat_uuars;
	req.cqe_version = MLX5_CQE_VERSION_V1;
	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;

	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
				 sizeof(resp)))
		goto err_free;

	context->max_num_qps = resp.qp_tab_size;
	context->bf_reg_size = resp.bf_reg_size;
	context->tot_uuars = resp.tot_uuars;
	context->low_lat_uuars = low_lat_uuars;
	context->cache_line_size = resp.cache_line_size;
	context->max_sq_desc_sz = resp.max_sq_desc_sz;
	context->max_rq_desc_sz = resp.max_rq_desc_sz;
	context->max_send_wqebb = resp.max_send_wqebb;
	context->num_ports = resp.num_ports;
	context->max_recv_wr = resp.max_recv_wr;
	context->max_srq_recv_wr = resp.max_srq_recv_wr;

	context->cqe_version = resp.cqe_version;
	if (context->cqe_version) {
		if (context->cqe_version == MLX5_CQE_VERSION_V1)
			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
		else
			goto err_free;
	}

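	/*
	 * UAR / blue-flame layout set up below: each mmap()ed system
	 * page holds num_uars_per_page hardware UARs of
	 * MLX5_ADAPTER_PAGE_SIZE bytes, and each UAR carries
	 * NUM_BFREGS_PER_UAR blue-flame registers of bf_reg_size bytes
	 * starting at MLX5_BF_OFFSET.  bfs[] is indexed by the flat
	 * bfreg number bfi computed from (page, uar, bfreg).
	 */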
	adjust_uar_info(mdev, context, resp);

	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
	if (!context->bfs) {
		errno = ENOMEM;
		goto err_free;
	}

	context->cmds_supp_uhw = resp.cmds_supp_uhw;
	context->vendor_cap_flags = 0;

	if (pthread_mutex_init(&context->qp_table_mutex, NULL))
		goto err_free_bf;
	if (pthread_mutex_init(&context->srq_table_mutex, NULL))
		goto err_qp_table_mutex;
	if (pthread_mutex_init(&context->uidx_table_mutex, NULL))
		goto err_srq_table_mutex;
	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->qp_table[i].refcnt = 0;

	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->uidx_table[i].refcnt = 0;

	context->db_list = NULL;

	if (pthread_mutex_init(&context->db_list_mutex, NULL))
		goto err_uidx_table_mutex;

	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; ++i) {
		offset = 0;
		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
		set_index(i, &offset);
		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
				       cmd_fd, page_size * offset);
		if (context->uar[i] == MAP_FAILED) {
			context->uar[i] = NULL;
			goto err_db_list_mutex;
		}
	}

	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
							MLX5_BF_OFFSET + k * context->bf_reg_size;
				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
				if (mlx5_spinlock_init(&context->bfs[bfi].lock))
					goto err_bfs_spl;
				context->bfs[bfi].offset = 0;
				if (bfi)
					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
				context->bfs[bfi].uuarn = bfi;
			}
		}
	}
	context->hca_core_clock = NULL;
	if (resp.response_length + sizeof(resp.ibv_resp) >=
	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
	    sizeof(resp.hca_core_clock_offset) &&
	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
		context->core_clock.offset = resp.hca_core_clock_offset;
		mlx5_map_internal_clock(mdev, ctx);
	}

	if (mlx5_spinlock_init(&context->lock32))
		goto err_bfs_spl;

	context->prefer_bf = get_always_bf();
	context->shut_up_bf = get_shut_up_bf();
	mlx5_read_env(&vdev->device, context);

	if (mlx5_spinlock_init(&context->hugetlb_lock))
		goto err_32_spl;
	TAILQ_INIT(&context->hugetlb_list);

	context->ibv_ctx.ops = mlx5_ctx_ops;

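	/*
	 * Extended verbs (XRC, extended QP/CQ/SRQ creation, work
	 * queues, flow steering and RSS indirection tables) are
	 * registered through verbs_set_ctx_op() below rather than the
	 * legacy ops table.
	 */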
	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);

	memset(&device_attr, 0, sizeof(device_attr));
	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
				  sizeof(struct ibv_device_attr_ex))) {
		context->cached_device_cap_flags =
			device_attr.orig_attr.device_cap_flags;
		context->atomic_cap = device_attr.orig_attr.atomic_cap;
		context->cached_tso_caps = device_attr.tso_caps;
	}

	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
		memset(&port_attr, 0, sizeof(port_attr));
		if (!mlx5_query_port(ctx, j + 1, &port_attr))
			context->cached_link_layer[j] = port_attr.link_layer;
	}

	return 0;

err_32_spl:
	mlx5_spinlock_destroy(&context->lock32);

err_bfs_spl:
	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				mlx5_spinlock_destroy(&context->bfs[bfi].lock);
			}
		}
	}

err_db_list_mutex:
	pthread_mutex_destroy(&context->db_list_mutex);

err_uidx_table_mutex:
	pthread_mutex_destroy(&context->uidx_table_mutex);

err_srq_table_mutex:
	pthread_mutex_destroy(&context->srq_table_mutex);

err_qp_table_mutex:
	pthread_mutex_destroy(&context->qp_table_mutex);

err_free_bf:
	free(context->bfs);

err_free:
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	close_debug_file(context);
	return errno;
}

static void mlx5_cleanup_context(struct verbs_device *device,
				 struct ibv_context *ibctx)
{
	struct mlx5_context *context = to_mctx(ibctx);
	int page_size = to_mdev(ibctx->device)->page_size;
	int i;
	int j;
	int k;
	int bfi;
	int num_sys_page_map;

	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				mlx5_spinlock_destroy(&context->bfs[bfi].lock);
			}
		}
	}
	mlx5_spinlock_destroy(&context->hugetlb_lock);
	mlx5_spinlock_destroy(&context->lock32);
	pthread_mutex_destroy(&context->db_list_mutex);
	pthread_mutex_destroy(&context->uidx_table_mutex);
	pthread_mutex_destroy(&context->srq_table_mutex);
	pthread_mutex_destroy(&context->qp_table_mutex);

	free(context->bfs);
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	if (context->hca_core_clock)
		munmap(context->hca_core_clock - context->core_clock.offset,
		       page_size);
	close_debug_file(context);
}

static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};

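/*
 * Probe entry point: match the uverbs device's PCI vendor/device id
 * against hca_table and verify that the kernel ABI version falls within
 * [MLX5_UVERBS_MIN_ABI_VERSION, MLX5_UVERBS_MAX_ABI_VERSION] before
 * allocating the verbs_device.
 */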
"device/vendor", 1043 value, sizeof value) < 0) 1044 return NULL; 1045 sscanf(value, "%i", &vendor); 1046 1047 if (ibv_read_sysfs_file(uverbs_sys_path, "device/device", 1048 value, sizeof value) < 0) 1049 return NULL; 1050 sscanf(value, "%i", &device); 1051 1052 for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i) 1053 if (vendor == hca_table[i].vendor && 1054 device == hca_table[i].device) 1055 goto found; 1056 1057 return NULL; 1058 1059 found: 1060 if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION || 1061 abi_version > MLX5_UVERBS_MAX_ABI_VERSION) { 1062 fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported " 1063 "(min supported %d, max supported %d)\n", 1064 abi_version, uverbs_sys_path, 1065 MLX5_UVERBS_MIN_ABI_VERSION, 1066 MLX5_UVERBS_MAX_ABI_VERSION); 1067 return NULL; 1068 } 1069 1070 dev = calloc(1, sizeof *dev); 1071 if (!dev) { 1072 fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n", 1073 uverbs_sys_path); 1074 return NULL; 1075 } 1076 1077 dev->page_size = sysconf(_SC_PAGESIZE); 1078 dev->driver_abi_ver = abi_version; 1079 1080 dev->verbs_dev.ops = &mlx5_dev_ops; 1081 dev->verbs_dev.sz = sizeof(*dev); 1082 dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) - 1083 sizeof(struct ibv_context); 1084 1085 return &dev->verbs_dev; 1086 } 1087 1088 static __attribute__((constructor)) void mlx5_register_driver(void) 1089 { 1090 verbs_register_driver("mlx5", mlx5_driver_init); 1091 } 1092