/*
 * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define _GNU_SOURCE
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include <sys/param.h>
#include <sys/cpuset.h>

#include "mlx5.h"
#include "mlx5-abi.h"

#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX		0x15b3
#endif

#ifndef CPU_OR
#define CPU_OR(x, y, z) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif

#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v,	\
	  .device = d }

static struct {
	unsigned vendor;
	unsigned device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),	/* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),	/* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 4125),	/* ConnectX-6 DX */
	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
	HCA(MELLANOX, 4127),	/* ConnectX-6 LX */
	HCA(MELLANOX, 4129),	/* ConnectX-7 */
	HCA(MELLANOX, 4131),	/* ConnectX-8 */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
	HCA(MELLANOX, 41686),	/* BlueField-2 integrated ConnectX-6 Dx network controller */
	HCA(MELLANOX, 41692),	/* BlueField-3 integrated ConnectX-7 network controller */
	HCA(MELLANOX, 41695),	/* BlueField-4 integrated ConnectX-8 network controller */
};

uint32_t mlx5_debug_mask = 0;
int mlx5_freeze_on_error_cqe;

static struct ibv_context_ops mlx5_ctx_ops = {
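	/* Default verbs dispatch table.  mlx5_init_context() below replaces
	 * .poll_cq with mlx5_poll_cq_v1 when the kernel reports CQE version 1.
	 */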
	.query_device  = mlx5_query_device,
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr        = mlx5_reg_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast
};

static int read_number_from_line(const char *line, int *value)
{
	const char *ptr;

	ptr = strchr(line, ':');
	if (!ptr)
		return 1;

	++ptr;

	*value = atoi(ptr);
	return 0;
}

/**
 * The function looks for the first free user-index in all the
 * user-index tables.  If all are used, it returns -1; otherwise it
 * returns a valid user-index.
 * In case the reference count of a table is zero, the table is not
 * in use and was not allocated yet; mlx5_store_uidx therefore
 * allocates the table and increments its reference count.
 */
static int32_t get_free_uidx(struct mlx5_context *ctx)
{
	int32_t tind;
	int32_t i;

	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
			break;
	}

	if (tind == MLX5_UIDX_TABLE_SIZE)
		return -1;

	if (!ctx->uidx_table[tind].refcnt)
		return tind << MLX5_UIDX_TABLE_SHIFT;

	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
		if (!ctx->uidx_table[tind].table[i])
			break;
	}

	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
}

int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
{
	int32_t tind;
	int32_t ret = -1;
	int32_t uidx;

	pthread_mutex_lock(&ctx->uidx_table_mutex);
	uidx = get_free_uidx(ctx);
	if (uidx < 0)
		goto out;

	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	if (!ctx->uidx_table[tind].refcnt) {
		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
						     sizeof(struct mlx5_resource *));
		if (!ctx->uidx_table[tind].table)
			goto out;
	}

	++ctx->uidx_table[tind].refcnt;
	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
	ret = uidx;

out:
	pthread_mutex_unlock(&ctx->uidx_table_mutex);
	return ret;
}

void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
{
	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	pthread_mutex_lock(&ctx->uidx_table_mutex);

	if (!--ctx->uidx_table[tind].refcnt)
		free(ctx->uidx_table[tind].table);
	else
		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;

	pthread_mutex_unlock(&ctx->uidx_table_mutex);
}

static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	*num_cores = 0;

	while (fgets(line, 128, fd)) {
		int value;

		/* if this is information on new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || (cur_cpu_model == 0x2D)))
			rc = 1;
	}

	fclose(fd);
	return rc;
}

/*
man cpuset

  This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
  are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
  words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
  within a word are also in big-endian order.

  The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
  the size of the bitmask.

  Examples of the Mask Format:

     00000001                        # just bit 0 set
     40000000,00000000,00000000      # just bit 94 set
     000000ff,00000000               # bits 32-39 set
     00000000,000E3862               # 1,5,6,11-13,17-19 set

  A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:

     00000001,00000001,00010117

  The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
  bit 4, and the "7" is for bits 2, 1, and 0.
*/
static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
{
	char *p, buf[1024];
	char *env_value;
	uint32_t word;
	int i, k;

	env_value = getenv("MLX5_LOCAL_CPUS");
	if (env_value)
		strncpy(buf, env_value, sizeof(buf));
	else {
		char fname[MAXPATHLEN];

		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
			 ibv_get_device_name(ibdev));

		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
			fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
			return;
		}
	}

	p = strrchr(buf, ',');
	if (!p)
		p = buf;

	i = 0;
	do {
		if (*p == ',') {
			*p = 0;
			p++;
		}

		word = strtoul(p, NULL, 16);

		for (k = 0; word; ++k, word >>= 1)
			if (word & 1)
				CPU_SET(k + i, cpu_set);

		if (p == buf)
			break;

		p = strrchr(buf, ',');
		if (!p)
			p = buf;

		i += 32;
	} while (i < CPU_SETSIZE);
}

static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
{
	cpuset_t my_cpus, dev_local_cpus, result_set;
	int stall_enable;
	int ret;
	int num_cores;

	if (!mlx5_is_sandy_bridge(&num_cores))
		return 0;

	/* by default enable stall on sandy bridge arch */
	stall_enable = 1;

	/*
	 * check if app is bound to cpu set that is inside
	 * of device local cpu set.
	 * Disable stalling if true
	 */

	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
	CPU_ZERO(&my_cpus);
	CPU_ZERO(&dev_local_cpus);
	CPU_ZERO(&result_set);
	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
				 sizeof(my_cpus), &my_cpus);
	if (ret == -1) {
		if (errno == EINVAL)
			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
		else
			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
		goto out;
	}

	/* get device local cpu set */
	mlx5_local_cpu_set(ibdev, &dev_local_cpus);

	/* check if my cpu set is in dev cpu */
#if __FreeBSD_version < 1400046
	CPU_OR(&result_set, &my_cpus);
	CPU_OR(&result_set, &dev_local_cpus);
#else
	CPU_OR(&result_set, &my_cpus, &dev_local_cpus);
#endif
	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;

out:
	return stall_enable;
}

static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
{
	char *env_value;

	env_value = getenv("MLX5_STALL_CQ_POLL");
	if (env_value)
		/* check if cq stall is enforced by user */
		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
	else
		/* autodetect if we need to do cq polling */
		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);

	env_value = getenv("MLX5_STALL_NUM_LOOP");
	if (env_value)
		mlx5_stall_num_loop = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
	if (env_value)
		mlx5_stall_cq_poll_min = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
	if (env_value)
		mlx5_stall_cq_poll_max = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
	if (env_value)
		mlx5_stall_cq_inc_step = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
	if (env_value)
		mlx5_stall_cq_dec_step = atoi(env_value);

	ctx->stall_adaptive_enable = 0;
	ctx->stall_cycles = 0;

	if (mlx5_stall_num_loop < 0) {
		ctx->stall_adaptive_enable = 1;
		ctx->stall_cycles = mlx5_stall_cq_poll_min;
	}
}

static int get_total_uuars(int page_size)
{
	int size = MLX5_DEF_TOT_UUARS;
	int uuars_in_page;
	char *env;

	env = getenv("MLX5_TOTAL_UUARS");
	if (env)
		size = atoi(env);

	if (size < 1)
		return -EINVAL;

	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
	size = max(uuars_in_page, size);
	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	if (size > MLX5_MAX_BFREGS)
		return -ENOMEM;

	return size;
}

static void open_debug_file(struct mlx5_context *ctx)
{
	char *env;

	env = getenv("MLX5_DEBUG_FILE");
	if (!env) {
		ctx->dbg_fp = stderr;
		return;
	}

	ctx->dbg_fp = fopen(env, "aw+");
	if (!ctx->dbg_fp) {
		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
		ctx->dbg_fp = stderr;
		return;
	}
}

static void close_debug_file(struct mlx5_context *ctx)
{
	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
		fclose(ctx->dbg_fp);
}

static void set_debug_mask(void)
{
	char *env;

	env = getenv("MLX5_DEBUG_MASK");
	if (env)
		mlx5_debug_mask = strtol(env, NULL, 0);
}

static void set_freeze_on_error(void)
{
	char *env;

	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
	if (env)
		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
}

static int get_always_bf(void)
{
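	/* MLX5_POST_SEND_PREFER_BF defaults to enabled; only the literal
	 * value "0" turns the BlueFlame preference off.
	 */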
	char *env;

	env = getenv("MLX5_POST_SEND_PREFER_BF");
	if (!env)
		return 1;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_shut_up_bf(void)
{
	char *env;

	env = getenv("MLX5_SHUT_UP_BF");
	if (!env)
		return 0;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_num_low_lat_uuars(int tot_uuars)
{
	char *env;
	int num = 4;

	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
	if (env)
		num = atoi(env);

	if (num < 0)
		return -EINVAL;

	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
	return num;
}

/* The library allocates an array of uuar contexts.  The one at index zero
 * does not exercise the odd/even policy, so it can avoid a lock, but it may
 * not use blue flame.  The upper ones, the low_lat_uuars, can use blue flame
 * without a lock since each is assigned to a single QP.  The rest can use
 * blue flame, but since they are shared they need a lock.
 */
static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
{
	if (uuarn == 0 || mlx5_single_threaded)
		return 0;

	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
		return 0;

	return 1;
}

static int single_threaded_app(void)
{
	char *env;

	env = getenv("MLX5_SINGLE_THREADED");
	if (env)
		return strcmp(env, "1") ? 0 : 1;

	return 0;
}

static int mlx5_cmd_get_context(struct mlx5_context *context,
				struct mlx5_alloc_ucontext *req,
				size_t req_len,
				struct mlx5_alloc_ucontext_resp *resp,
				size_t resp_len)
{
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 req_len, &resp->ibv_resp, resp_len))
		return 0;

	/* The ibv_cmd_get_context call fails in older kernels when passing
	 * a request length that the kernel doesn't know.
	 * To avoid breaking compatibility between new libmlx5 and older
	 * kernels, when ibv_cmd_get_context fails with the full
	 * request length, we try once again with the legacy length.
	 * We repeat this process while reducing the requested size based
	 * on the feature input size.  To avoid this in the future, we
	 * will remove the check in the kernel that requires fields unknown
	 * to the kernel to be cleared.  This will require that any new
	 * feature that involves extending struct mlx5_alloc_ucontext
	 * be accompanied by an indication in the form of one or
	 * more fields in struct mlx5_alloc_ucontext_resp.  If the
	 * response value can be interpreted as "feature not supported"
	 * when the returned value is zero, this will suffice to
	 * indicate to the library that the request was ignored by the
	 * kernel, either because it is unaware of it or because it chose
	 * to ignore it.  If zero is a valid response, we will add a new
	 * field that indicates whether the request was handled.
	 */
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
				 &resp->ibv_resp, resp_len))
		return 0;

	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				   offsetof(struct mlx5_alloc_ucontext,
					    cqe_version),
				   &resp->ibv_resp, resp_len);
}

static int mlx5_map_internal_clock(struct mlx5_device *mdev,
				   struct ibv_context *ibv_ctx)
{
	struct mlx5_context *context = to_mctx(ibv_ctx);
	void *hca_clock_page;
	off_t offset = 0;

	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
	hca_clock_page = mmap(NULL, mdev->page_size,
			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
			      mdev->page_size * offset);

	if (hca_clock_page == MAP_FAILED) {
		fprintf(stderr, PFX
			"Warning: Timestamp available,\n"
			"but failed to mmap() hca core clock page.\n");
		return -1;
	}

	context->hca_core_clock = hca_clock_page +
		(context->core_clock.offset & (mdev->page_size - 1));
	return 0;
}

int mlx5dv_query_device(struct ibv_context *ctx_in,
			struct mlx5dv_context *attrs_out)
{
	struct mlx5_context *mctx = to_mctx(ctx_in);
	uint64_t comp_mask_out = 0;

	attrs_out->version = 0;
	attrs_out->flags = 0;

	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
	}

	attrs_out->comp_mask = comp_mask_out;

	return 0;
}

static int mlx5dv_get_qp(struct ibv_qp *qp_in,
			 struct mlx5dv_qp *qp_out)
{
	struct mlx5_qp *mqp = to_mqp(qp_in);

	qp_out->comp_mask = 0;
	qp_out->dbrec = mqp->db;

	if (mqp->sq_buf_size)
		/* IBV_QPT_RAW_PACKET */
		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
	else
		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
	qp_out->sq.stride = 1 << mqp->sq.wqe_shift;

	qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
	qp_out->rq.stride = 1 << mqp->rq.wqe_shift;

	qp_out->bf.reg = mqp->bf->reg;

	if (mqp->bf->uuarn > 0)
		qp_out->bf.size = mqp->bf->buf_size;
	else
		qp_out->bf.size = 0;

	return 0;
}

static int mlx5dv_get_cq(struct ibv_cq *cq_in,
			 struct mlx5dv_cq *cq_out)
{
	struct mlx5_cq *mcq = to_mcq(cq_in);
	struct mlx5_context *mctx = to_mctx(cq_in->context);

	cq_out->comp_mask = 0;
	cq_out->cqn = mcq->cqn;
	cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1;
	cq_out->cqe_size = mcq->cqe_sz;
	cq_out->buf = mcq->active_buf->buf;
	cq_out->dbrec = mcq->dbrec;
	cq_out->uar = mctx->uar;

	mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED;

	return 0;
}

static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
			  struct mlx5dv_rwq *rwq_out)
{
	struct mlx5_rwq *mrwq = to_mrwq(wq_in);

	rwq_out->comp_mask = 0;
	rwq_out->buf = mrwq->pbuff;
	rwq_out->dbrec = mrwq->recv_db;
	rwq_out->wqe_cnt = mrwq->rq.wqe_cnt;
	rwq_out->stride = 1 << mrwq->rq.wqe_shift;

	return 0;
}

static int mlx5dv_get_srq(struct ibv_srq *srq_in,
			  struct mlx5dv_srq *srq_out)
{
	struct mlx5_srq *msrq;

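	/* The ibv_srq handed in is embedded in the vsrq member of struct
	 * mlx5_srq, so recover the containing structure from it.
	 */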
	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);

	srq_out->comp_mask = 0;
	srq_out->buf = msrq->buf.buf;
	srq_out->dbrec = msrq->db;
	srq_out->stride = 1 << msrq->wqe_shift;
	srq_out->head = msrq->head;
	srq_out->tail = msrq->tail;

	return 0;
}

int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
	int ret = 0;

	if (obj_type & MLX5DV_OBJ_QP)
		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);

	return ret;
}

static void adjust_uar_info(struct mlx5_device *mdev,
			    struct mlx5_context *context,
			    struct mlx5_alloc_ucontext_resp resp)
{
	if (!resp.log_uar_size && !resp.num_uars_per_page) {
		/* old kernel */
		context->uar_size = mdev->page_size;
		context->num_uars_per_page = 1;
		return;
	}

	context->uar_size = 1 << resp.log_uar_size;
	context->num_uars_per_page = resp.num_uars_per_page;
}

static int mlx5_init_context(struct verbs_device *vdev,
			     struct ibv_context *ctx, int cmd_fd)
{
	struct mlx5_context *context;
	struct mlx5_alloc_ucontext req;
	struct mlx5_alloc_ucontext_resp resp;
	int i;
	int page_size;
	int tot_uuars;
	int low_lat_uuars;
	int gross_uuars;
	int j;
	off_t offset;
	struct mlx5_device *mdev;
	struct verbs_context *v_ctx;
	struct ibv_port_attr port_attr;
	struct ibv_device_attr_ex device_attr;
	int k;
	int bfi;
	int num_sys_page_map;

	mdev = to_mdev(&vdev->device);
	v_ctx = verbs_get_ctx(ctx);
	page_size = mdev->page_size;
	mlx5_single_threaded = single_threaded_app();

	context = to_mctx(ctx);
	context->ibv_ctx.cmd_fd = cmd_fd;

	open_debug_file(context);
	set_debug_mask();
	set_freeze_on_error();
	if (gethostname(context->hostname, sizeof(context->hostname)))
		strcpy(context->hostname, "host_unknown");

	tot_uuars = get_total_uuars(page_size);
	if (tot_uuars < 0) {
		errno = -tot_uuars;
		goto err_free;
	}

	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
	if (low_lat_uuars < 0) {
		errno = -low_lat_uuars;
		goto err_free;
	}

	if (low_lat_uuars > tot_uuars - 1) {
		errno = ENOMEM;
		goto err_free;
	}

	memset(&req, 0, sizeof(req));
	memset(&resp, 0, sizeof(resp));

	req.total_num_uuars = tot_uuars;
	req.num_low_latency_uuars = low_lat_uuars;
	req.cqe_version = MLX5_CQE_VERSION_V1;
	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;

	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
				 sizeof(resp)))
		goto err_free;

	context->max_num_qps = resp.qp_tab_size;
	context->bf_reg_size = resp.bf_reg_size;
	context->tot_uuars = resp.tot_uuars;
	context->low_lat_uuars = low_lat_uuars;
	context->cache_line_size = resp.cache_line_size;
	context->max_sq_desc_sz = resp.max_sq_desc_sz;
	context->max_rq_desc_sz = resp.max_rq_desc_sz;
	context->max_send_wqebb = resp.max_send_wqebb;
	context->num_ports = resp.num_ports;
	context->max_recv_wr = resp.max_recv_wr;
	context->max_srq_recv_wr = resp.max_srq_recv_wr;

	context->cqe_version = resp.cqe_version;
	if (context->cqe_version) {
		if (context->cqe_version == MLX5_CQE_VERSION_V1)
			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
		else
			goto err_free;
	}

	adjust_uar_info(mdev, context, resp);

	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
	if (!context->bfs) {
		errno = ENOMEM;
		goto err_free;
	}

	context->cmds_supp_uhw = resp.cmds_supp_uhw;
	context->vendor_cap_flags = 0;

	pthread_mutex_init(&context->qp_table_mutex, NULL);
	pthread_mutex_init(&context->srq_table_mutex, NULL);
	pthread_mutex_init(&context->uidx_table_mutex, NULL);
	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->qp_table[i].refcnt = 0;

	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->uidx_table[i].refcnt = 0;

	context->db_list = NULL;

	pthread_mutex_init(&context->db_list_mutex, NULL);

	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; ++i) {
		offset = 0;
		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
		set_index(i, &offset);
		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
				       cmd_fd, page_size * offset);
		if (context->uar[i] == MAP_FAILED) {
			context->uar[i] = NULL;
			goto err_free_bf;
		}
	}

	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
							MLX5_BF_OFFSET + k * context->bf_reg_size;
				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
				mlx5_spinlock_init(&context->bfs[bfi].lock);
				context->bfs[bfi].offset = 0;
				if (bfi)
					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
				context->bfs[bfi].uuarn = bfi;
			}
		}
	}
	context->hca_core_clock = NULL;
	if (resp.response_length + sizeof(resp.ibv_resp) >=
	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
	    sizeof(resp.hca_core_clock_offset) &&
	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
		context->core_clock.offset = resp.hca_core_clock_offset;
		mlx5_map_internal_clock(mdev, ctx);
	}

	mlx5_spinlock_init(&context->lock32);

	context->prefer_bf = get_always_bf();
	context->shut_up_bf = get_shut_up_bf();
	mlx5_read_env(&vdev->device, context);

	mlx5_spinlock_init(&context->hugetlb_lock);
	TAILQ_INIT(&context->hugetlb_list);

	context->ibv_ctx.ops = mlx5_ctx_ops;

	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);

	memset(&device_attr, 0, sizeof(device_attr));
	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
				  sizeof(struct ibv_device_attr_ex))) {
		context->cached_device_cap_flags =
			device_attr.orig_attr.device_cap_flags;
		context->atomic_cap = device_attr.orig_attr.atomic_cap;
		context->cached_tso_caps = device_attr.tso_caps;
	}

	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
		memset(&port_attr, 0, sizeof(port_attr));
		if (!mlx5_query_port(ctx, j + 1, &port_attr))
			context->cached_link_layer[j] = port_attr.link_layer;
	}

	return 0;

err_free_bf:
	free(context->bfs);

err_free:
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	close_debug_file(context);
	return errno;
}

static void mlx5_cleanup_context(struct verbs_device *device,
				 struct ibv_context *ibctx)
{
	struct mlx5_context *context = to_mctx(ibctx);
	int page_size = to_mdev(ibctx->device)->page_size;
	int i;

	free(context->bfs);
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	if (context->hca_core_clock)
		munmap(context->hca_core_clock - context->core_clock.offset,
		       page_size);
	close_debug_file(context);
}

static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};

static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
					     int abi_version)
{
	char value[8];
	struct mlx5_device *dev;
	unsigned vendor, device;
	int i;

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &vendor);

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &device);

	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
		if (vendor == hca_table[i].vendor &&
		    device == hca_table[i].device)
			goto found;

	return NULL;

found:
	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
			"(min supported %d, max supported %d)\n",
			abi_version, uverbs_sys_path,
			MLX5_UVERBS_MIN_ABI_VERSION,
			MLX5_UVERBS_MAX_ABI_VERSION);
		return NULL;
	}

	dev = calloc(1, sizeof *dev);
	if (!dev) {
		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
			uverbs_sys_path);
		return NULL;
	}

	dev->page_size = sysconf(_SC_PAGESIZE);
	dev->driver_abi_ver = abi_version;

	dev->verbs_dev.ops = &mlx5_dev_ops;
	dev->verbs_dev.sz = sizeof(*dev);
	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
		sizeof(struct ibv_context);

	return &dev->verbs_dev;
}

static __attribute__((constructor)) void mlx5_register_driver(void)
{
	verbs_register_driver("mlx5", mlx5_driver_init);
}