/*
 * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define _GNU_SOURCE
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include <sys/param.h>
#include <sys/cpuset.h>

#include "mlx5.h"
#include "mlx5-abi.h"

#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX		0x15b3
#endif

/*
 * Fallbacks for systems whose <sys/cpuset.h> lacks these macros.  The
 * call sites below use the two-argument FreeBSD form CPU_OR(dst, src),
 * so the fallback takes two arguments as well.
 */
#ifndef CPU_OR
#define CPU_OR(x, y) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif

#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v,	\
	  .device = d }

static struct {
	unsigned		vendor;
	unsigned		device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),	/* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),	/* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
};

uint32_t mlx5_debug_mask = 0;
int mlx5_freeze_on_error_cqe;

static struct ibv_context_ops mlx5_ctx_ops = {
	.query_device  = mlx5_query_device,
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr        = mlx5_reg_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast
};
static int read_number_from_line(const char *line, int *value)
{
	const char *ptr;

	ptr = strchr(line, ':');
	if (!ptr)
		return 1;

	++ptr;

	*value = atoi(ptr);
	return 0;
}

/**
 * The function looks for the first free user-index in all the
 * user-index tables.  If all are in use, it returns -1; otherwise
 * it returns a valid user-index.
 * If the reference count of a table is zero, the table is not in
 * use and has not been allocated yet; in that case mlx5_store_uidx
 * allocates the table and increments its reference count.
 */
static int32_t get_free_uidx(struct mlx5_context *ctx)
{
	int32_t tind;
	int32_t i;

	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
			break;
	}

	if (tind == MLX5_UIDX_TABLE_SIZE)
		return -1;

	if (!ctx->uidx_table[tind].refcnt)
		return tind << MLX5_UIDX_TABLE_SHIFT;

	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
		if (!ctx->uidx_table[tind].table[i])
			break;
	}

	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
}

int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
{
	int32_t tind;
	int32_t ret = -1;
	int32_t uidx;

	pthread_mutex_lock(&ctx->uidx_table_mutex);
	uidx = get_free_uidx(ctx);
	if (uidx < 0)
		goto out;

	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	if (!ctx->uidx_table[tind].refcnt) {
		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
						     sizeof(struct mlx5_resource *));
		if (!ctx->uidx_table[tind].table)
			goto out;
	}

	++ctx->uidx_table[tind].refcnt;
	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
	ret = uidx;

out:
	pthread_mutex_unlock(&ctx->uidx_table_mutex);
	return ret;
}

void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
{
	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	pthread_mutex_lock(&ctx->uidx_table_mutex);

	if (!--ctx->uidx_table[tind].refcnt)
		free(ctx->uidx_table[tind].table);
	else
		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;

	pthread_mutex_unlock(&ctx->uidx_table_mutex);
}
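
/*
 * Illustrative example (not part of the driver logic): assuming, e.g.,
 * MLX5_UIDX_TABLE_SHIFT == 12, a user-index returned above splits as
 *
 *	tind  = uidx >> MLX5_UIDX_TABLE_SHIFT;	// which table
 *	entry = uidx & MLX5_UIDX_TABLE_MASK;	// slot inside it
 *
 * so uidx 0x3005 names slot 5 of table 3.  mlx5_store_uidx() allocates
 * a table on first use and bumps its refcnt once per stored resource;
 * mlx5_clear_uidx() frees the whole table when the count drops to zero.
 */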
static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	*num_cores = 0;

	while (fgets(line, 128, fd)) {
		int value;

		/* if this is information on a new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D))
			rc = 1;
	}

	fclose(fd);
	return rc;
}

/*
man cpuset

  This format displays each 32-bit word in hexadecimal (using ASCII
  characters "0" - "9" and "a" - "f"); words are filled with leading
  zeros, if required.  For masks longer than one word, a comma separator
  is used between words.  Words are displayed in big-endian order, which
  has the most significant bit first.  The hex digits within a word are
  also in big-endian order.

  The number of 32-bit words displayed is the minimum number needed to
  display all bits of the bitmask, based on the size of the bitmask.

  Examples of the Mask Format:

     00000001                        # just bit 0 set
     40000000,00000000,00000000      # just bit 94 set
     000000ff,00000000               # bits 32-39 set
     00000000,000E3862               # 1,5,6,11-13,17-19 set

  A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:

     00000001,00000001,00010117

  The first "1" is for bit 64, the second for bit 32, the third for bit
  16, the fourth for bit 8, the fifth for bit 4, and the "7" is for bits
  2, 1, and 0.
*/
static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
{
	char *p, buf[1024];
	char *env_value;
	uint32_t word;
	int i, k;

	env_value = getenv("MLX5_LOCAL_CPUS");
	if (env_value) {
		strncpy(buf, env_value, sizeof(buf) - 1);
		buf[sizeof(buf) - 1] = '\0';	/* ensure NUL termination */
	} else {
		char fname[MAXPATHLEN];

		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
			 ibv_get_device_name(ibdev));

		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
			fprintf(stderr, PFX "Warning: cannot get local cpu set: failed to open %s\n", fname);
			return;
		}
	}

	p = strrchr(buf, ',');
	if (!p)
		p = buf;

	i = 0;
	do {
		if (*p == ',') {
			*p = 0;
			p++;
		}

		word = strtoul(p, NULL, 16);

		for (k = 0; word; ++k, word >>= 1)
			if (word & 1)
				CPU_SET(k + i, cpu_set);

		if (p == buf)
			break;

		p = strrchr(buf, ',');
		if (!p)
			p = buf;

		i += 32;
	} while (i < CPU_SETSIZE);
}
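
/*
 * Worked example for the parser above (illustrative): given the mask
 * string "000000ff,00000000", strrchr() locates the last comma, so the
 * rightmost word "00000000" is parsed first with bit base i == 0 and
 * sets nothing.  The loop then rescans for the previous comma, parses
 * "000000ff" with i == 32, and sets CPU bits 32-39, matching the
 * "bits 32-39 set" example from the man page excerpt above.
 */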
static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
{
	cpuset_t my_cpus, dev_local_cpus, result_set;
	int stall_enable;
	int ret;
	int num_cores;

	if (!mlx5_is_sandy_bridge(&num_cores))
		return 0;

	/* by default enable stall on sandy bridge arch */
	stall_enable = 1;

	/*
	 * check if the app is bound to a cpu set that is inside
	 * the device's local cpu set.  Disable stalling if true.
	 */

	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
	CPU_ZERO(&my_cpus);
	CPU_ZERO(&dev_local_cpus);
	CPU_ZERO(&result_set);
	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
				 sizeof(my_cpus), &my_cpus);
	if (ret == -1) {
		if (errno == EINVAL)
			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
		else
			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
		goto out;
	}

	/* get device local cpu set */
	mlx5_local_cpu_set(ibdev, &dev_local_cpus);

	/* result_set = my_cpus | dev_local_cpus; stall only if the app
	 * may run on cpus outside the device-local set */
	CPU_OR(&result_set, &my_cpus);
	CPU_OR(&result_set, &dev_local_cpus);
	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;

out:
	return stall_enable;
}

static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
{
	char *env_value;

	env_value = getenv("MLX5_STALL_CQ_POLL");
	if (env_value)
		/* check if cq stall is enforced by the user */
		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
	else
		/* autodetect if we need to do cq polling */
		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);

	env_value = getenv("MLX5_STALL_NUM_LOOP");
	if (env_value)
		mlx5_stall_num_loop = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
	if (env_value)
		mlx5_stall_cq_poll_min = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
	if (env_value)
		mlx5_stall_cq_poll_max = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
	if (env_value)
		mlx5_stall_cq_inc_step = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
	if (env_value)
		mlx5_stall_cq_dec_step = atoi(env_value);

	ctx->stall_adaptive_enable = 0;
	ctx->stall_cycles = 0;

	if (mlx5_stall_num_loop < 0) {
		ctx->stall_adaptive_enable = 1;
		ctx->stall_cycles = mlx5_stall_cq_poll_min;
	}
}

static int get_total_uuars(int page_size)
{
	int size = MLX5_DEF_TOT_UUARS;
	int uuars_in_page;
	char *env;

	env = getenv("MLX5_TOTAL_UUARS");
	if (env)
		size = atoi(env);

	if (size < 1)
		return -EINVAL;

	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
	size = max(uuars_in_page, size);
	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	if (size > MLX5_MAX_BFREGS)
		return -ENOMEM;

	return size;
}

static void open_debug_file(struct mlx5_context *ctx)
{
	char *env;

	env = getenv("MLX5_DEBUG_FILE");
	if (!env) {
		ctx->dbg_fp = stderr;
		return;
	}

	ctx->dbg_fp = fopen(env, "a+");
	if (!ctx->dbg_fp) {
		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
		ctx->dbg_fp = stderr;
		return;
	}
}

static void close_debug_file(struct mlx5_context *ctx)
{
	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
		fclose(ctx->dbg_fp);
}

static void set_debug_mask(void)
{
	char *env;

	env = getenv("MLX5_DEBUG_MASK");
	if (env)
		mlx5_debug_mask = strtol(env, NULL, 0);
}

static void set_freeze_on_error(void)
{
	char *env;

	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
	if (env)
		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
}

static int get_always_bf(void)
{
	char *env;

	env = getenv("MLX5_POST_SEND_PREFER_BF");
	if (!env)
		return 1;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_shut_up_bf(void)
{
	char *env;

	env = getenv("MLX5_SHUT_UP_BF");
	if (!env)
		return 0;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_num_low_lat_uuars(int tot_uuars)
{
	char *env;
	int num = 4;

	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
	if (env)
		num = atoi(env);

	if (num < 0)
		return -EINVAL;

	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
	return num;
}
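
/*
 * Quick reference for the environment knobs read by the helpers above
 * (documentation only; defaults are the in-tree values):
 *
 *	MLX5_STALL_CQ_POLL	"0" disables the CQ-stall workaround, any
 *				other value forces it; unset means
 *				autodetect via mlx5_enable_sandy_bridge_fix().
 *	MLX5_STALL_NUM_LOOP	a negative value enables adaptive stalling.
 *	MLX5_STALL_CQ_POLL_MIN / _MAX, MLX5_STALL_CQ_INC_STEP / _DEC_STEP
 *				bounds and step sizes for the adaptive
 *				stall cycle count.
 *	MLX5_LOCAL_CPUS		overrides the sysfs device-local cpu mask.
 *	MLX5_TOTAL_UUARS / MLX5_NUM_LOW_LAT_UUARS
 *				size of the bfreg pool and how many of
 *				them are low-latency (default 4).
 *	MLX5_DEBUG_FILE / MLX5_DEBUG_MASK / MLX5_FREEZE_ON_ERROR_CQE
 *				debug output and error-CQE freezing.
 *	MLX5_POST_SEND_PREFER_BF / MLX5_SHUT_UP_BF
 *				prefer (the default) or suppress
 *				blue-flame writes on post-send.
 */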
/* The library allocates an array of uuar contexts.  The one at index
 * zero does not exercise the odd/even policy, so it can avoid a lock,
 * but it may not use blue flame.  The upper ones, the low_lat_uuars,
 * can use blue flame without a lock since each is assigned to a single
 * QP.  The rest can use blue flame, but since they are shared they
 * need a lock.
 */
static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
{
	if (uuarn == 0 || mlx5_single_threaded)
		return 0;

	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
		return 0;

	return 1;
}

static int single_threaded_app(void)
{
	char *env;

	env = getenv("MLX5_SINGLE_THREADED");
	if (env)
		return strcmp(env, "1") ? 0 : 1;

	return 0;
}
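
/*
 * Illustrative example (hypothetical sizes): with tot_uuars == 16 and
 * low_lat_uuars == 4, bfreg indices run 0..31 (twice tot_uuars in
 * total).  Index 0 is the lock-free bfreg that never uses blue flame,
 * indices >= (16 - 4) * 2 == 24 are low-latency bfregs dedicated to a
 * single QP (no lock), and indices 1..23 are shared, so
 * need_uuar_lock() makes them take the per-bfreg spinlock.
 */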
static int mlx5_cmd_get_context(struct mlx5_context *context,
				struct mlx5_alloc_ucontext *req,
				size_t req_len,
				struct mlx5_alloc_ucontext_resp *resp,
				size_t resp_len)
{
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 req_len, &resp->ibv_resp, resp_len))
		return 0;

	/* ibv_cmd_get_context fails on older kernels when passing
	 * a request length that the kernel doesn't know.
	 * To avoid breaking compatibility between new libmlx5 and
	 * older kernels, when ibv_cmd_get_context fails with the full
	 * request length, we try once again with a legacy length.
	 * We repeat this process while reducing the requested size
	 * based on the feature input size.  To avoid this in the
	 * future, we will remove the check in the kernel that requires
	 * fields unknown to the kernel to be cleared.  This will
	 * require that any new feature that involves extending struct
	 * mlx5_alloc_ucontext be accompanied by an indication in the
	 * form of one or more fields in struct
	 * mlx5_alloc_ucontext_resp.  If the response value can be
	 * interpreted as "feature not supported" when the returned
	 * value is zero, this will suffice to indicate to the library
	 * that the request was ignored by the kernel, either because
	 * it is unaware of it or because it decided to do so.  If zero
	 * is a valid response, we will add a new field that indicates
	 * whether the request was handled.
	 */
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
				 &resp->ibv_resp, resp_len))
		return 0;

	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				   offsetof(struct mlx5_alloc_ucontext,
					    cqe_version),
				   &resp->ibv_resp, resp_len);
}

static int mlx5_map_internal_clock(struct mlx5_device *mdev,
				   struct ibv_context *ibv_ctx)
{
	struct mlx5_context *context = to_mctx(ibv_ctx);
	void *hca_clock_page;
	off_t offset = 0;

	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
	hca_clock_page = mmap(NULL, mdev->page_size,
			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
			      mdev->page_size * offset);

	if (hca_clock_page == MAP_FAILED) {
		fprintf(stderr, PFX
			"Warning: Timestamp available,\n"
			"but failed to mmap() hca core clock page.\n");
		return -1;
	}

	context->hca_core_clock = hca_clock_page +
		(context->core_clock.offset & (mdev->page_size - 1));
	return 0;
}

int mlx5dv_query_device(struct ibv_context *ctx_in,
			struct mlx5dv_context *attrs_out)
{
	struct mlx5_context *mctx = to_mctx(ctx_in);
	uint64_t comp_mask_out = 0;

	attrs_out->version = 0;
	attrs_out->flags = 0;

	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
	}

	attrs_out->comp_mask = comp_mask_out;

	return 0;
}
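
/*
 * Example usage (a sketch from the caller's side; 'ctx' is assumed to
 * be an open ibv_context on an mlx5 device, and use() is a placeholder):
 *
 *	struct mlx5dv_context dv = {};
 *
 *	dv.comp_mask = MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
 *	if (!mlx5dv_query_device(ctx, &dv) &&
 *	    (dv.comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION))
 *		use(dv.cqe_comp_caps);	// caps were actually filled in
 *
 * The mask is echoed back in comp_mask only for fields the library
 * filled, so callers can probe for optional capabilities.
 */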
static int mlx5dv_get_qp(struct ibv_qp *qp_in,
			 struct mlx5dv_qp *qp_out)
{
	struct mlx5_qp *mqp = to_mqp(qp_in);

	qp_out->comp_mask = 0;
	qp_out->dbrec = mqp->db;

	if (mqp->sq_buf_size)
		/* IBV_QPT_RAW_PACKET */
		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
	else
		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
	qp_out->sq.stride = 1 << mqp->sq.wqe_shift;

	qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
	qp_out->rq.stride = 1 << mqp->rq.wqe_shift;

	qp_out->bf.reg = mqp->bf->reg;

	if (mqp->bf->uuarn > 0)
		qp_out->bf.size = mqp->bf->buf_size;
	else
		qp_out->bf.size = 0;

	return 0;
}

static int mlx5dv_get_cq(struct ibv_cq *cq_in,
			 struct mlx5dv_cq *cq_out)
{
	struct mlx5_cq *mcq = to_mcq(cq_in);
	struct mlx5_context *mctx = to_mctx(cq_in->context);

	cq_out->comp_mask = 0;
	cq_out->cqn = mcq->cqn;
	cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1;
	cq_out->cqe_size = mcq->cqe_sz;
	cq_out->buf = mcq->active_buf->buf;
	cq_out->dbrec = mcq->dbrec;
	cq_out->uar = mctx->uar;

	mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED;

	return 0;
}

static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
			  struct mlx5dv_rwq *rwq_out)
{
	struct mlx5_rwq *mrwq = to_mrwq(wq_in);

	rwq_out->comp_mask = 0;
	rwq_out->buf = mrwq->pbuff;
	rwq_out->dbrec = mrwq->recv_db;
	rwq_out->wqe_cnt = mrwq->rq.wqe_cnt;
	rwq_out->stride = 1 << mrwq->rq.wqe_shift;

	return 0;
}

static int mlx5dv_get_srq(struct ibv_srq *srq_in,
			  struct mlx5dv_srq *srq_out)
{
	struct mlx5_srq *msrq;

	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);

	srq_out->comp_mask = 0;
	srq_out->buf = msrq->buf.buf;
	srq_out->dbrec = msrq->db;
	srq_out->stride = 1 << msrq->wqe_shift;
	srq_out->head = msrq->head;
	srq_out->tail = msrq->tail;

	return 0;
}

int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
	int ret = 0;

	if (obj_type & MLX5DV_OBJ_QP)
		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);

	return ret;
}
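
/*
 * Example usage (sketch; 'qp' and 'cq' are assumed to be verbs objects
 * created on an mlx5 device, run_direct_path() a hypothetical consumer):
 *
 *	struct mlx5dv_qp dv_qp;
 *	struct mlx5dv_cq dv_cq;
 *	struct mlx5dv_obj obj;
 *
 *	obj.qp.in = qp;
 *	obj.qp.out = &dv_qp;
 *	obj.cq.in = cq;
 *	obj.cq.out = &dv_cq;
 *	if (!mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP | MLX5DV_OBJ_CQ))
 *		run_direct_path(&dv_qp, &dv_cq);
 *
 * After a successful call the caller has direct access to the raw
 * WQ/CQ buffers and doorbells; note that mlx5dv_get_cq() above marks
 * the CQ with MLX5_CQ_FLAGS_DV_OWNED.
 */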
static void adjust_uar_info(struct mlx5_device *mdev,
			    struct mlx5_context *context,
			    struct mlx5_alloc_ucontext_resp resp)
{
	if (!resp.log_uar_size && !resp.num_uars_per_page) {
		/* old kernel */
		context->uar_size = mdev->page_size;
		context->num_uars_per_page = 1;
		return;
	}

	context->uar_size = 1 << resp.log_uar_size;
	context->num_uars_per_page = resp.num_uars_per_page;
}

static int mlx5_init_context(struct verbs_device *vdev,
			     struct ibv_context *ctx, int cmd_fd)
{
	struct mlx5_context *context;
	struct mlx5_alloc_ucontext req;
	struct mlx5_alloc_ucontext_resp resp;
	int i;
	int page_size;
	int tot_uuars;
	int low_lat_uuars;
	int gross_uuars;
	int j;
	off_t offset;
	struct mlx5_device *mdev;
	struct verbs_context *v_ctx;
	struct ibv_port_attr port_attr;
	struct ibv_device_attr_ex device_attr;
	int k;
	int bfi;
	int num_sys_page_map;

	mdev = to_mdev(&vdev->device);
	v_ctx = verbs_get_ctx(ctx);
	page_size = mdev->page_size;
	mlx5_single_threaded = single_threaded_app();

	context = to_mctx(ctx);
	context->ibv_ctx.cmd_fd = cmd_fd;

	open_debug_file(context);
	set_debug_mask();
	set_freeze_on_error();
	if (gethostname(context->hostname, sizeof(context->hostname)))
		strcpy(context->hostname, "host_unknown");

	tot_uuars = get_total_uuars(page_size);
	if (tot_uuars < 0) {
		errno = -tot_uuars;
		goto err_free;
	}

	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
	if (low_lat_uuars < 0) {
		errno = -low_lat_uuars;
		goto err_free;
	}

	if (low_lat_uuars > tot_uuars - 1) {
		errno = ENOMEM;
		goto err_free;
	}

	memset(&req, 0, sizeof(req));
	memset(&resp, 0, sizeof(resp));

	req.total_num_uuars = tot_uuars;
	req.num_low_latency_uuars = low_lat_uuars;
	req.cqe_version = MLX5_CQE_VERSION_V1;
	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;

	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
				 sizeof(resp)))
		goto err_free;

	context->max_num_qps = resp.qp_tab_size;
	context->bf_reg_size = resp.bf_reg_size;
	context->tot_uuars = resp.tot_uuars;
	context->low_lat_uuars = low_lat_uuars;
	context->cache_line_size = resp.cache_line_size;
	context->max_sq_desc_sz = resp.max_sq_desc_sz;
	context->max_rq_desc_sz = resp.max_rq_desc_sz;
	context->max_send_wqebb = resp.max_send_wqebb;
	context->num_ports = resp.num_ports;
	context->max_recv_wr = resp.max_recv_wr;
	context->max_srq_recv_wr = resp.max_srq_recv_wr;

	context->cqe_version = resp.cqe_version;
	if (context->cqe_version) {
		if (context->cqe_version == MLX5_CQE_VERSION_V1)
			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
		else
			goto err_free;
	}

	adjust_uar_info(mdev, context, resp);

	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
	if (!context->bfs) {
		errno = ENOMEM;
		goto err_free;
	}

	context->cmds_supp_uhw = resp.cmds_supp_uhw;
	context->vendor_cap_flags = 0;

	pthread_mutex_init(&context->qp_table_mutex, NULL);
	pthread_mutex_init(&context->srq_table_mutex, NULL);
	pthread_mutex_init(&context->uidx_table_mutex, NULL);
	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->qp_table[i].refcnt = 0;

	for (i = 0; i < MLX5_UIDX_TABLE_SIZE; ++i)
		context->uidx_table[i].refcnt = 0;

	context->db_list = NULL;

	pthread_mutex_init(&context->db_list_mutex, NULL);

	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; ++i) {
		offset = 0;
		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
		set_index(i, &offset);
		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
				       cmd_fd, page_size * offset);
		if (context->uar[i] == MAP_FAILED) {
			context->uar[i] = NULL;
			goto err_free_bf;
		}
	}

	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
							MLX5_BF_OFFSET + k * context->bf_reg_size;
				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
				mlx5_spinlock_init(&context->bfs[bfi].lock);
				context->bfs[bfi].offset = 0;
				if (bfi)
					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
				context->bfs[bfi].uuarn = bfi;
			}
		}
	}
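
	/*
	 * Illustrative layout example (hypothetical numbers): with a 4K
	 * system page, MLX5_ADAPTER_PAGE_SIZE == 4096 and
	 * num_uars_per_page == 1, system page i holds a single UAR whose
	 * NUM_BFREGS_PER_UAR blue-flame registers start at MLX5_BF_OFFSET
	 * and sit bf_reg_size apart:
	 *
	 *	bfi = (i * 1 + 0) * NUM_BFREGS_PER_UAR + k;
	 *	reg = uar[i] + MLX5_BF_OFFSET + k * bf_reg_size;
	 *
	 * With larger system pages the kernel can pack several 4K UARs
	 * into one mapping, which is what the 'j' term accounts for.
	 */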
	context->hca_core_clock = NULL;
	if (resp.response_length + sizeof(resp.ibv_resp) >=
	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
	    sizeof(resp.hca_core_clock_offset) &&
	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
		context->core_clock.offset = resp.hca_core_clock_offset;
		mlx5_map_internal_clock(mdev, ctx);
	}

	mlx5_spinlock_init(&context->lock32);

	context->prefer_bf = get_always_bf();
	context->shut_up_bf = get_shut_up_bf();
	mlx5_read_env(&vdev->device, context);

	mlx5_spinlock_init(&context->hugetlb_lock);
	TAILQ_INIT(&context->hugetlb_list);

	context->ibv_ctx.ops = mlx5_ctx_ops;

	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);

	memset(&device_attr, 0, sizeof(device_attr));
	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
				  sizeof(struct ibv_device_attr_ex))) {
		context->cached_device_cap_flags =
			device_attr.orig_attr.device_cap_flags;
		context->atomic_cap = device_attr.orig_attr.atomic_cap;
		context->cached_tso_caps = device_attr.tso_caps;
	}

	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
		memset(&port_attr, 0, sizeof(port_attr));
		if (!mlx5_query_port(ctx, j + 1, &port_attr))
			context->cached_link_layer[j] = port_attr.link_layer;
	}

	return 0;

err_free_bf:
	free(context->bfs);

err_free:
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	close_debug_file(context);
	return errno;
}

static void mlx5_cleanup_context(struct verbs_device *device,
				 struct ibv_context *ibctx)
{
	struct mlx5_context *context = to_mctx(ibctx);
	int page_size = to_mdev(ibctx->device)->page_size;
	int i;

	free(context->bfs);
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	if (context->hca_core_clock)
		munmap(context->hca_core_clock - context->core_clock.offset,
		       page_size);
	close_debug_file(context);
}

static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};

static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
					     int abi_version)
{
	char value[8];
	struct mlx5_device *dev;
	unsigned vendor, device;
	int i;

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &vendor);

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &device);

	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
		if (vendor == hca_table[i].vendor &&
		    device == hca_table[i].device)
			goto found;

	return NULL;

found:
	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
			"(min supported %d, max supported %d)\n",
			abi_version, uverbs_sys_path,
			MLX5_UVERBS_MIN_ABI_VERSION,
			MLX5_UVERBS_MAX_ABI_VERSION);
		return NULL;
	}

	dev = calloc(1, sizeof *dev);
	if (!dev) {
		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
			uverbs_sys_path);
		return NULL;
	}

	dev->page_size = sysconf(_SC_PAGESIZE);
	dev->driver_abi_ver = abi_version;

	dev->verbs_dev.ops = &mlx5_dev_ops;
	dev->verbs_dev.sz = sizeof(*dev);
	/* the verbs layer allocates sizeof(struct ibv_context) itself,
	 * so report only the provider-private remainder */
	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
		sizeof(struct ibv_context);

	return &dev->verbs_dev;
}

static __attribute__((constructor)) void mlx5_register_driver(void)
{
	verbs_register_driver("mlx5", mlx5_driver_init);
}