/*
 * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define _GNU_SOURCE
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include <sys/param.h>
#include <sys/cpuset.h>

#include "mlx5.h"
#include "mlx5-abi.h"

#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX		0x15b3
#endif

/* Fallback no-op macros, matching the two-argument call sites below. */
#ifndef CPU_OR
#define CPU_OR(x, y) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif

#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v,	\
	  .device = d }

static struct {
	unsigned	vendor;
	unsigned	device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),	/* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),	/* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 4125),	/* ConnectX-6 DX */
	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
};

uint32_t mlx5_debug_mask = 0;
int mlx5_freeze_on_error_cqe;

static struct ibv_context_ops mlx5_ctx_ops = {
	.query_device  = mlx5_query_device,
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr        = mlx5_reg_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast
};

static int read_number_from_line(const char *line, int *value)
{
	const char *ptr;

	ptr = strchr(line, ':');
	if (!ptr)
		return 1;

	++ptr;

	*value = atoi(ptr);
	return 0;
}

/**
 * The function looks for the first free user-index in all the
 * user-index tables.  If all are in use, it returns -1; otherwise
 * it returns a valid user-index.
 * When the reference count of a table is zero, the table is not in
 * use and has not been allocated yet; in that case mlx5_store_uidx
 * allocates the table and increments its reference count.
 */
static int32_t get_free_uidx(struct mlx5_context *ctx)
{
	int32_t tind;
	int32_t i;

	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
			break;
	}

	if (tind == MLX5_UIDX_TABLE_SIZE)
		return -1;

	if (!ctx->uidx_table[tind].refcnt)
		return tind << MLX5_UIDX_TABLE_SHIFT;

	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
		if (!ctx->uidx_table[tind].table[i])
			break;
	}

	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
}

int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
{
	int32_t tind;
	int32_t ret = -1;
	int32_t uidx;

	pthread_mutex_lock(&ctx->uidx_table_mutex);
	uidx = get_free_uidx(ctx);
	if (uidx < 0)
		goto out;

	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	if (!ctx->uidx_table[tind].refcnt) {
		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
						     sizeof(struct mlx5_resource *));
		if (!ctx->uidx_table[tind].table)
			goto out;
	}

	++ctx->uidx_table[tind].refcnt;
	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
	ret = uidx;

out:
	pthread_mutex_unlock(&ctx->uidx_table_mutex);
	return ret;
}

void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
{
	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	pthread_mutex_lock(&ctx->uidx_table_mutex);

	if (!--ctx->uidx_table[tind].refcnt)
		free(ctx->uidx_table[tind].table);
	else
		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;

	pthread_mutex_unlock(&ctx->uidx_table_mutex);
}
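
/*
 * Illustrative lookup counterpart (a sketch, not part of this file): a
 * caller that stored a resource with mlx5_store_uidx() can recover it by
 * splitting the user-index back into a table index and an entry offset,
 * using the same MLX5_UIDX_TABLE_SHIFT/MLX5_UIDX_TABLE_MASK as above:
 *
 *	static struct mlx5_resource *example_find_uidx(struct mlx5_context *ctx,
 *						       uint32_t uidx)
 *	{
 *		int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
 *
 *		if (ctx->uidx_table[tind].refcnt)
 *			return ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK];
 *
 *		return NULL;
 *	}
 */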

static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	*num_cores = 0;

	while (fgets(line, 128, fd)) {
		int value;

		/* if this is information on new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D))
			rc = 1;
	}

	fclose(fd);
	return rc;
}

/*
man cpuset

  This format displays each 32-bit word in hexadecimal (using ASCII characters
  "0" - "9" and "a" - "f"); words are filled with leading zeros, if required.
  For masks longer than one word, a comma separator is used between words.
  Words are displayed in big-endian order, which has the most significant bit
  first.  The hex digits within a word are also in big-endian order.

  The number of 32-bit words displayed is the minimum number needed to display
  all bits of the bitmask, based on the size of the bitmask.

  Examples of the Mask Format:

     00000001                        # just bit 0 set
     40000000,00000000,00000000      # just bit 94 set
     000000ff,00000000               # bits 32-39 set
     00000000,000E3862               # 1,5,6,11-13,17-19 set

  A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:

     00000001,00000001,00010117

  The first "1" is for bit 64, the second for bit 32, the third for bit 16,
  the fourth for bit 8, the fifth for bit 4, and the "7" is for bits 2, 1,
  and 0.
*/
static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
{
	char *p, buf[1024];
	char *env_value;
	uint32_t word;
	int i, k;

	env_value = getenv("MLX5_LOCAL_CPUS");
	if (env_value) {
		strncpy(buf, env_value, sizeof(buf) - 1);
		buf[sizeof(buf) - 1] = '\0';
	} else {
		char fname[MAXPATHLEN];

		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
			 ibv_get_device_name(ibdev));

		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
			fprintf(stderr, PFX "Warning: cannot get local cpu set: failed to open %s\n", fname);
			return;
		}
	}

	p = strrchr(buf, ',');
	if (!p)
		p = buf;

	i = 0;
	do {
		if (*p == ',') {
			*p = 0;
			p++;
		}

		word = strtoul(p, NULL, 16);

		for (k = 0; word; ++k, word >>= 1)
			if (word & 1)
				CPU_SET(k + i, cpu_set);

		if (p == buf)
			break;

		p = strrchr(buf, ',');
		if (!p)
			p = buf;

		i += 32;
	} while (i < CPU_SETSIZE);
}
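
/*
 * Worked example for the parser above (illustrative): given a local_cpus
 * value of "000000ff,00000000", the loop first handles the word after the
 * last comma ("00000000", bits 0-31, nothing set), then restarts from the
 * beginning of the buffer ("000000ff") with i == 32 and calls CPU_SET() for
 * bits 32-39 - matching the "bits 32-39 set" example quoted from the cpuset
 * man page above.
 */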

static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
{
	cpuset_t my_cpus, dev_local_cpus, result_set;
	int stall_enable;
	int ret;
	int num_cores;

	if (!mlx5_is_sandy_bridge(&num_cores))
		return 0;

	/* by default enable stall on sandy bridge arch */
	stall_enable = 1;

	/*
	 * check if app is bound to cpu set that is inside
	 * of device local cpu set. Disable stalling if true
	 */

	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
	CPU_ZERO(&my_cpus);
	CPU_ZERO(&dev_local_cpus);
	CPU_ZERO(&result_set);
	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
				 sizeof(my_cpus), &my_cpus);
	if (ret == -1) {
		if (errno == EINVAL)
			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
		else
			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
		goto out;
	}

	/* get device local cpu set */
	mlx5_local_cpu_set(ibdev, &dev_local_cpus);

	/* check if my cpu set is in dev cpu */
	CPU_OR(&result_set, &my_cpus);
	CPU_OR(&result_set, &dev_local_cpus);
	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;

out:
	return stall_enable;
}

static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
{
	char *env_value;

	env_value = getenv("MLX5_STALL_CQ_POLL");
	if (env_value)
		/* check if cq stall is enforced by user */
		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
	else
		/* autodetect if we need to do cq polling */
		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);

	env_value = getenv("MLX5_STALL_NUM_LOOP");
	if (env_value)
		mlx5_stall_num_loop = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
	if (env_value)
		mlx5_stall_cq_poll_min = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
	if (env_value)
		mlx5_stall_cq_poll_max = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
	if (env_value)
		mlx5_stall_cq_inc_step = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
	if (env_value)
		mlx5_stall_cq_dec_step = atoi(env_value);

	ctx->stall_adaptive_enable = 0;
	ctx->stall_cycles = 0;

	if (mlx5_stall_num_loop < 0) {
		ctx->stall_adaptive_enable = 1;
		ctx->stall_cycles = mlx5_stall_cq_poll_min;
	}
}

static int get_total_uuars(int page_size)
{
	int size = MLX5_DEF_TOT_UUARS;
	int uuars_in_page;
	char *env;

	env = getenv("MLX5_TOTAL_UUARS");
	if (env)
		size = atoi(env);

	if (size < 1)
		return -EINVAL;

	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
	size = max(uuars_in_page, size);
	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	if (size > MLX5_MAX_BFREGS)
		return -ENOMEM;

	return size;
}
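
/*
 * Sizing example for get_total_uuars() (illustrative, assuming a 4KB system
 * page and a 4KB MLX5_ADAPTER_PAGE_SIZE): page_size / MLX5_ADAPTER_PAGE_SIZE
 * is 1, so uuars_in_page is one UAR's worth of non-fast-path bfregs.  A
 * smaller MLX5_TOTAL_UUARS request is rounded up to that minimum, the result
 * is aligned up to a whole number of UARs, and anything above MLX5_MAX_BFREGS
 * is rejected with -ENOMEM.
 */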

static void open_debug_file(struct mlx5_context *ctx)
{
	char *env;

	env = getenv("MLX5_DEBUG_FILE");
	if (!env) {
		ctx->dbg_fp = stderr;
		return;
	}

	ctx->dbg_fp = fopen(env, "aw+");
	if (!ctx->dbg_fp) {
		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
		ctx->dbg_fp = stderr;
		return;
	}
}

static void close_debug_file(struct mlx5_context *ctx)
{
	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
		fclose(ctx->dbg_fp);
}

static void set_debug_mask(void)
{
	char *env;

	env = getenv("MLX5_DEBUG_MASK");
	if (env)
		mlx5_debug_mask = strtol(env, NULL, 0);
}

static void set_freeze_on_error(void)
{
	char *env;

	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
	if (env)
		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
}

static int get_always_bf(void)
{
	char *env;

	env = getenv("MLX5_POST_SEND_PREFER_BF");
	if (!env)
		return 1;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_shut_up_bf(void)
{
	char *env;

	env = getenv("MLX5_SHUT_UP_BF");
	if (!env)
		return 0;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_num_low_lat_uuars(int tot_uuars)
{
	char *env;
	int num = 4;

	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
	if (env)
		num = atoi(env);

	if (num < 0)
		return -EINVAL;

	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
	return num;
}

/* The library allocates an array of uuar contexts.  The one at index zero
 * does not exercise the odd/even policy, so it can avoid a lock, but it may
 * not use BlueFlame.  The upper ones, the low_lat_uuars, can use BlueFlame
 * with no lock since each is assigned to a single QP.  The rest can use
 * BlueFlame, but since they are shared they need a lock.
 */
static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
{
	if (uuarn == 0 || mlx5_single_threaded)
		return 0;

	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
		return 0;

	return 1;
}
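
/*
 * Example of the policy above: uuarn 0 is always lockless but never uses
 * BlueFlame; uuarns at or above (tot_uuars - low_lat_uuars) * 2 are the
 * dedicated low-latency bfregs, each owned by a single QP, so they skip the
 * lock as well; everything in between is shared and therefore locked, unless
 * MLX5_SINGLE_THREADED declared the process single threaded.
 */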

static int single_threaded_app(void)
{
	char *env;

	env = getenv("MLX5_SINGLE_THREADED");
	if (env)
		return strcmp(env, "1") ? 0 : 1;

	return 0;
}

static int mlx5_cmd_get_context(struct mlx5_context *context,
				struct mlx5_alloc_ucontext *req,
				size_t req_len,
				struct mlx5_alloc_ucontext_resp *resp,
				size_t resp_len)
{
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 req_len, &resp->ibv_resp, resp_len))
		return 0;

	/* The ibv_cmd_get_context fails in older kernels when passing
	 * a request length that the kernel doesn't know.
	 * To avoid breaking compatibility of new libmlx5 and older
	 * kernels, when ibv_cmd_get_context fails with the full
	 * request length, we try once again with the legacy length.
	 * We repeat this process while reducing requested size based
	 * on the feature input size.  To avoid this in the future, we
	 * will remove the check in kernel that requires fields unknown
	 * to the kernel to be cleared.  This will require that any new
	 * feature that involves extending struct mlx5_alloc_ucontext
	 * will be accompanied by an indication in the form of one or
	 * more fields in struct mlx5_alloc_ucontext_resp.  If the
	 * response value can be interpreted as feature not supported
	 * when the returned value is zero, this will suffice to
	 * indicate to the library that the request was ignored by the
	 * kernel, either because it is unaware or because it decided
	 * to do so.  If zero is a valid response, we will add a new
	 * field that indicates whether the request was handled.
	 */
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
				 &resp->ibv_resp, resp_len))
		return 0;

	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				   offsetof(struct mlx5_alloc_ucontext,
					    cqe_version),
				   &resp->ibv_resp, resp_len);
}

static int mlx5_map_internal_clock(struct mlx5_device *mdev,
				   struct ibv_context *ibv_ctx)
{
	struct mlx5_context *context = to_mctx(ibv_ctx);
	void *hca_clock_page;
	off_t offset = 0;

	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
	hca_clock_page = mmap(NULL, mdev->page_size,
			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
			      mdev->page_size * offset);

	if (hca_clock_page == MAP_FAILED) {
		fprintf(stderr, PFX
			"Warning: Timestamp available,\n"
			"but failed to mmap() hca core clock page.\n");
		return -1;
	}

	context->hca_core_clock = hca_clock_page +
		(context->core_clock.offset & (mdev->page_size - 1));
	return 0;
}

int mlx5dv_query_device(struct ibv_context *ctx_in,
			struct mlx5dv_context *attrs_out)
{
	struct mlx5_context *mctx = to_mctx(ctx_in);
	uint64_t comp_mask_out = 0;

	attrs_out->version = 0;
	attrs_out->flags = 0;

	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
	}

	attrs_out->comp_mask = comp_mask_out;

	return 0;
}

static int mlx5dv_get_qp(struct ibv_qp *qp_in,
			 struct mlx5dv_qp *qp_out)
{
	struct mlx5_qp *mqp = to_mqp(qp_in);

	qp_out->comp_mask = 0;
	qp_out->dbrec = mqp->db;

	if (mqp->sq_buf_size)
		/* IBV_QPT_RAW_PACKET */
		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
	else
		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
	qp_out->sq.stride = 1 << mqp->sq.wqe_shift;

	qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
	qp_out->rq.stride = 1 << mqp->rq.wqe_shift;

	qp_out->bf.reg = mqp->bf->reg;

	if (mqp->bf->uuarn > 0)
		qp_out->bf.size = mqp->bf->buf_size;
	else
		qp_out->bf.size = 0;

	return 0;
}

static int mlx5dv_get_cq(struct ibv_cq *cq_in,
			 struct mlx5dv_cq *cq_out)
{
	struct mlx5_cq *mcq = to_mcq(cq_in);
	struct mlx5_context *mctx = to_mctx(cq_in->context);

	cq_out->comp_mask = 0;
	cq_out->cqn = mcq->cqn;
	cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1;
	cq_out->cqe_size = mcq->cqe_sz;
	cq_out->buf = mcq->active_buf->buf;
	cq_out->dbrec = mcq->dbrec;
	cq_out->uar = mctx->uar;

	mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED;

	return 0;
}

static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
			  struct mlx5dv_rwq *rwq_out)
{
	struct mlx5_rwq *mrwq = to_mrwq(wq_in);

	rwq_out->comp_mask = 0;
	rwq_out->buf = mrwq->pbuff;
	rwq_out->dbrec = mrwq->recv_db;
	rwq_out->wqe_cnt = mrwq->rq.wqe_cnt;
	rwq_out->stride = 1 << mrwq->rq.wqe_shift;

	return 0;
}

static int mlx5dv_get_srq(struct ibv_srq *srq_in,
			  struct mlx5dv_srq *srq_out)
{
	struct mlx5_srq *msrq;

	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);

	srq_out->comp_mask = 0;
	srq_out->buf = msrq->buf.buf;
	srq_out->dbrec = msrq->db;
	srq_out->stride = 1 << msrq->wqe_shift;
	srq_out->head = msrq->head;
	srq_out->tail = msrq->tail;

	return 0;
}

int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
	int ret = 0;

	if (obj_type & MLX5DV_OBJ_QP)
		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);

	return ret;
}
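
/*
 * Illustrative caller-side use of the direct-verbs API above (a sketch, not
 * part of this library; "qp" is assumed to be an ibv_qp the application
 * created on an mlx5 context):
 *
 *	struct mlx5dv_qp dv_qp;
 *	struct mlx5dv_obj dv_obj = { .qp = { .in = qp, .out = &dv_qp } };
 *
 *	if (!mlx5dv_init_obj(&dv_obj, MLX5DV_OBJ_QP)) {
 *		// dv_qp.sq.buf, dv_qp.sq.wqe_cnt and dv_qp.sq.stride now
 *		// describe the send queue ring; dv_qp.dbrec is the doorbell
 *		// record and dv_qp.bf.reg the BlueFlame/doorbell register.
 *	}
 */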

static void adjust_uar_info(struct mlx5_device *mdev,
			    struct mlx5_context *context,
			    struct mlx5_alloc_ucontext_resp resp)
{
	if (!resp.log_uar_size && !resp.num_uars_per_page) {
		/* old kernel */
		context->uar_size = mdev->page_size;
		context->num_uars_per_page = 1;
		return;
	}

	context->uar_size = 1 << resp.log_uar_size;
	context->num_uars_per_page = resp.num_uars_per_page;
}

static int mlx5_init_context(struct verbs_device *vdev,
			     struct ibv_context *ctx, int cmd_fd)
{
	struct mlx5_context *context;
	struct mlx5_alloc_ucontext req;
	struct mlx5_alloc_ucontext_resp resp;
	int i;
	int page_size;
	int tot_uuars;
	int low_lat_uuars;
	int gross_uuars;
	int j;
	off_t offset;
	struct mlx5_device *mdev;
	struct verbs_context *v_ctx;
	struct ibv_port_attr port_attr;
	struct ibv_device_attr_ex device_attr;
	int k;
	int bfi;
	int num_sys_page_map;

	mdev = to_mdev(&vdev->device);
	v_ctx = verbs_get_ctx(ctx);
	page_size = mdev->page_size;
	mlx5_single_threaded = single_threaded_app();

	context = to_mctx(ctx);
	context->ibv_ctx.cmd_fd = cmd_fd;

	open_debug_file(context);
	set_debug_mask();
	set_freeze_on_error();
	if (gethostname(context->hostname, sizeof(context->hostname)))
		strcpy(context->hostname, "host_unknown");

	tot_uuars = get_total_uuars(page_size);
	if (tot_uuars < 0) {
		errno = -tot_uuars;
		goto err_free;
	}

	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
	if (low_lat_uuars < 0) {
		errno = -low_lat_uuars;
		goto err_free;
	}

	if (low_lat_uuars > tot_uuars - 1) {
		errno = ENOMEM;
		goto err_free;
	}

	memset(&req, 0, sizeof(req));
	memset(&resp, 0, sizeof(resp));

	req.total_num_uuars = tot_uuars;
	req.num_low_latency_uuars = low_lat_uuars;
	req.cqe_version = MLX5_CQE_VERSION_V1;
	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;

	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
				 sizeof(resp)))
		goto err_free;

	context->max_num_qps = resp.qp_tab_size;
	context->bf_reg_size = resp.bf_reg_size;
	context->tot_uuars = resp.tot_uuars;
	context->low_lat_uuars = low_lat_uuars;
	context->cache_line_size = resp.cache_line_size;
	context->max_sq_desc_sz = resp.max_sq_desc_sz;
	context->max_rq_desc_sz = resp.max_rq_desc_sz;
	context->max_send_wqebb = resp.max_send_wqebb;
	context->num_ports = resp.num_ports;
	context->max_recv_wr = resp.max_recv_wr;
	context->max_srq_recv_wr = resp.max_srq_recv_wr;

	context->cqe_version = resp.cqe_version;
	if (context->cqe_version) {
		if (context->cqe_version == MLX5_CQE_VERSION_V1)
			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
		else
			goto err_free;
	}

	adjust_uar_info(mdev, context, resp);

	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
	if (!context->bfs) {
		errno = ENOMEM;
		goto err_free;
	}

	context->cmds_supp_uhw = resp.cmds_supp_uhw;
	context->vendor_cap_flags = 0;

	pthread_mutex_init(&context->qp_table_mutex, NULL);
	pthread_mutex_init(&context->srq_table_mutex, NULL);
	pthread_mutex_init(&context->uidx_table_mutex, NULL);
	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->qp_table[i].refcnt = 0;

	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->uidx_table[i].refcnt = 0;

	context->db_list = NULL;

	pthread_mutex_init(&context->db_list_mutex, NULL);

	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; ++i) {
		offset = 0;
		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
		set_index(i, &offset);
		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
				       cmd_fd, page_size * offset);
		if (context->uar[i] == MAP_FAILED) {
			context->uar[i] = NULL;
			goto err_free_bf;
		}
	}
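
	/*
	 * Carve the mmapped system pages into individual bfreg contexts:
	 * each system page holds num_uars_per_page hardware UARs of
	 * MLX5_ADAPTER_PAGE_SIZE bytes, and each UAR holds NUM_BFREGS_PER_UAR
	 * bfregs of bf_reg_size bytes starting at MLX5_BF_OFFSET.  The flat
	 * index bfi computed below is the uuarn consumed by need_uuar_lock().
	 */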
	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
							MLX5_BF_OFFSET + k * context->bf_reg_size;
				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
				mlx5_spinlock_init(&context->bfs[bfi].lock);
				context->bfs[bfi].offset = 0;
				if (bfi)
					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
				context->bfs[bfi].uuarn = bfi;
			}
		}
	}
	context->hca_core_clock = NULL;
	if (resp.response_length + sizeof(resp.ibv_resp) >=
	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
	    sizeof(resp.hca_core_clock_offset) &&
	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
		context->core_clock.offset = resp.hca_core_clock_offset;
		mlx5_map_internal_clock(mdev, ctx);
	}

	mlx5_spinlock_init(&context->lock32);

	context->prefer_bf = get_always_bf();
	context->shut_up_bf = get_shut_up_bf();
	mlx5_read_env(&vdev->device, context);

	mlx5_spinlock_init(&context->hugetlb_lock);
	TAILQ_INIT(&context->hugetlb_list);

	context->ibv_ctx.ops = mlx5_ctx_ops;

	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);

	memset(&device_attr, 0, sizeof(device_attr));
	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
				  sizeof(struct ibv_device_attr_ex))) {
		context->cached_device_cap_flags =
			device_attr.orig_attr.device_cap_flags;
		context->atomic_cap = device_attr.orig_attr.atomic_cap;
		context->cached_tso_caps = device_attr.tso_caps;
	}

	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
		memset(&port_attr, 0, sizeof(port_attr));
		if (!mlx5_query_port(ctx, j + 1, &port_attr))
			context->cached_link_layer[j] = port_attr.link_layer;
	}

	return 0;

err_free_bf:
	free(context->bfs);

err_free:
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	close_debug_file(context);
	return errno;
}

static void mlx5_cleanup_context(struct verbs_device *device,
				 struct ibv_context *ibctx)
{
	struct mlx5_context *context = to_mctx(ibctx);
	int page_size = to_mdev(ibctx->device)->page_size;
	int i;

	free(context->bfs);
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	if (context->hca_core_clock)
		munmap(context->hca_core_clock - context->core_clock.offset,
		       page_size);
	close_debug_file(context);
}

static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};

static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
					     int abi_version)
{
	char value[8];
	struct mlx5_device *dev;
	unsigned vendor, device;
	int i;

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &vendor);

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &device);

	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
		if (vendor == hca_table[i].vendor &&
		    device == hca_table[i].device)
			goto found;

	return NULL;

found:
	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
			"(min supported %d, max supported %d)\n",
			abi_version, uverbs_sys_path,
			MLX5_UVERBS_MIN_ABI_VERSION,
			MLX5_UVERBS_MAX_ABI_VERSION);
		return NULL;
	}

	dev = calloc(1, sizeof *dev);
	if (!dev) {
		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
			uverbs_sys_path);
		return NULL;
	}

	dev->page_size = sysconf(_SC_PAGESIZE);
	dev->driver_abi_ver = abi_version;

	dev->verbs_dev.ops = &mlx5_dev_ops;
	dev->verbs_dev.sz = sizeof(*dev);
	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
		sizeof(struct ibv_context);

	return &dev->verbs_dev;
}

static __attribute__((constructor)) void mlx5_register_driver(void)
{
	verbs_register_driver("mlx5", mlx5_driver_init);
}