/*
 * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define _GNU_SOURCE
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include <sys/param.h>
#include <sys/cpuset.h>

#include "mlx5.h"
#include "mlx5-abi.h"

#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX 0x15b3
#endif

#ifndef CPU_OR
#define CPU_OR(x, y, z) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif

#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v, \
	  .device = d }

static struct {
	unsigned vendor;
	unsigned device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),	/* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),	/* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 4125),	/* ConnectX-6 DX */
	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
	HCA(MELLANOX, 4127),	/* ConnectX-6 LX */
	HCA(MELLANOX, 4129),	/* ConnectX-7 */
	HCA(MELLANOX, 4131),	/* ConnectX-8 */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
	HCA(MELLANOX, 41686),	/* BlueField-2 integrated ConnectX-6 Dx network controller */
	HCA(MELLANOX, 41692),	/* BlueField-3 integrated ConnectX-7 network controller */
	HCA(MELLANOX, 41695),	/* BlueField-4 integrated ConnectX-8 network controller */
};

uint32_t mlx5_debug_mask = 0;
int mlx5_freeze_on_error_cqe;

static struct ibv_context_ops mlx5_ctx_ops = {
	.query_device = mlx5_query_device,
	.query_port = mlx5_query_port,
	.alloc_pd = mlx5_alloc_pd,
	.dealloc_pd = mlx5_free_pd,
	.reg_mr = mlx5_reg_mr,
	.rereg_mr = mlx5_rereg_mr,
	.dereg_mr = mlx5_dereg_mr,
	.alloc_mw = mlx5_alloc_mw,
	.dealloc_mw = mlx5_dealloc_mw,
	.bind_mw = mlx5_bind_mw,
	.create_cq = mlx5_create_cq,
	.poll_cq = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event = mlx5_cq_event,
	.resize_cq = mlx5_resize_cq,
	.destroy_cq = mlx5_destroy_cq,
	.create_srq = mlx5_create_srq,
	.modify_srq = mlx5_modify_srq,
	.query_srq = mlx5_query_srq,
	.destroy_srq = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp = mlx5_create_qp,
	.query_qp = mlx5_query_qp,
	.modify_qp = mlx5_modify_qp,
	.destroy_qp = mlx5_destroy_qp,
	.post_send = mlx5_post_send,
	.post_recv = mlx5_post_recv,
	.create_ah = mlx5_create_ah,
	.destroy_ah = mlx5_destroy_ah,
	.attach_mcast = mlx5_attach_mcast,
	.detach_mcast = mlx5_detach_mcast
};

static int read_number_from_line(const char *line, int *value)
{
	const char *ptr;

	ptr = strchr(line, ':');
	if (!ptr)
		return 1;

	++ptr;

	*value = atoi(ptr);
	return 0;
}
/**
 * Look for the first free user-index in all the user-index tables.
 * If all are in use, return -1; otherwise return a valid user-index.
 * When the reference count of a table is zero, the table is not in
 * use and has not been allocated yet; in that case mlx5_store_uidx
 * allocates the table and increments its reference count.
 */
static int32_t get_free_uidx(struct mlx5_context *ctx)
{
	int32_t tind;
	int32_t i;

	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
			break;
	}

	if (tind == MLX5_UIDX_TABLE_SIZE)
		return -1;

	if (!ctx->uidx_table[tind].refcnt)
		return tind << MLX5_UIDX_TABLE_SHIFT;

	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
		if (!ctx->uidx_table[tind].table[i])
			break;
	}

	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
}

int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
{
	int32_t tind;
	int32_t ret = -1;
	int32_t uidx;

	pthread_mutex_lock(&ctx->uidx_table_mutex);
	uidx = get_free_uidx(ctx);
	if (uidx < 0)
		goto out;

	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	if (!ctx->uidx_table[tind].refcnt) {
		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
						     sizeof(struct mlx5_resource *));
		if (!ctx->uidx_table[tind].table)
			goto out;
	}

	++ctx->uidx_table[tind].refcnt;
	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
	ret = uidx;

out:
	pthread_mutex_unlock(&ctx->uidx_table_mutex);
	return ret;
}

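/*
 * Release the reference taken by mlx5_store_uidx: when the last entry of a
 * user-index table goes away, the table itself is freed; otherwise only the
 * slot belonging to this user-index is cleared.
 */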
void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
{
	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	pthread_mutex_lock(&ctx->uidx_table_mutex);

	if (!--ctx->uidx_table[tind].refcnt)
		free(ctx->uidx_table[tind].table);
	else
		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;

	pthread_mutex_unlock(&ctx->uidx_table_mutex);
}

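/*
 * Scan /proc/cpuinfo, counting logical processors and checking the CPU
 * family/model. Returns nonzero if a Sandy Bridge CPU (family 6, model 0x2A
 * or 0x2D) is found, zero otherwise (including when the file cannot be
 * opened).
 */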
static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	*num_cores = 0;

	while (fgets(line, 128, fd)) {
		int value;

		/* if this is information on a new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D))
			rc = 1;
	}

	fclose(fd);
	return rc;
}

/*
man cpuset

  This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
  are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
  words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
  within a word are also in big-endian order.

  The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
  the size of the bitmask.

  Examples of the Mask Format:

     00000001                        # just bit 0 set
     40000000,00000000,00000000      # just bit 94 set
     000000ff,00000000               # bits 32-39 set
     00000000,000E3862               # 1,5,6,11-13,17-19 set

  A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:

     00000001,00000001,00010117

  The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
  bit 4, and the "7" is for bits 2, 1, and 0.
*/
static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
{
	char *p, buf[1024];
	char *env_value;
	uint32_t word;
	int i, k;

	env_value = getenv("MLX5_LOCAL_CPUS");
	if (env_value)
		strncpy(buf, env_value, sizeof(buf));
	else {
		char fname[MAXPATHLEN];

		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
			 ibv_get_device_name(ibdev));

		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
			fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
			return;
		}
	}

	p = strrchr(buf, ',');
	if (!p)
		p = buf;

	i = 0;
	do {
		if (*p == ',') {
			*p = 0;
			p++;
		}

		word = strtoul(p, NULL, 16);

		for (k = 0; word; ++k, word >>= 1)
			if (word & 1)
				CPU_SET(k + i, cpu_set);

		if (p == buf)
			break;

		p = strrchr(buf, ',');
		if (!p)
			p = buf;

		i += 32;
	} while (i < CPU_SETSIZE);
}

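/*
 * Decide whether the CQ-poll stall workaround should be enabled. On Sandy
 * Bridge CPUs stalling is on by default and is turned off only when the
 * process CPU affinity lies entirely within the device's local CPU set.
 */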
static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
{
	cpuset_t my_cpus, dev_local_cpus, result_set;
	int stall_enable;
	int ret;
	int num_cores;

	if (!mlx5_is_sandy_bridge(&num_cores))
		return 0;

	/* by default enable stall on sandy bridge arch */
	stall_enable = 1;

	/*
	 * check if app is bound to cpu set that is inside
	 * of device local cpu set. Disable stalling if true
	 */

	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
	CPU_ZERO(&my_cpus);
	CPU_ZERO(&dev_local_cpus);
	CPU_ZERO(&result_set);
	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
				 sizeof(my_cpus), &my_cpus);
	if (ret == -1) {
		if (errno == EINVAL)
			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
		else
			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
		goto out;
	}

	/* get device local cpu set */
	mlx5_local_cpu_set(ibdev, &dev_local_cpus);

	/* check if my cpu set is in dev cpu */
#if __FreeBSD_version < 1400046
	CPU_OR(&result_set, &my_cpus);
	CPU_OR(&result_set, &dev_local_cpus);
#else
	CPU_OR(&result_set, &my_cpus, &dev_local_cpus);
#endif
	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;

out:
	return stall_enable;
}

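/*
 * Read the MLX5_STALL_* environment variables. MLX5_STALL_CQ_POLL overrides
 * the Sandy Bridge autodetection; a negative MLX5_STALL_NUM_LOOP switches
 * the context to adaptive stalling, starting at mlx5_stall_cq_poll_min
 * cycles.
 */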
static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
{
	char *env_value;

	env_value = getenv("MLX5_STALL_CQ_POLL");
	if (env_value)
		/* check if cq stall is enforced by user */
		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
	else
		/* autodetect if we need to do cq polling */
		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);

	env_value = getenv("MLX5_STALL_NUM_LOOP");
	if (env_value)
		mlx5_stall_num_loop = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
	if (env_value)
		mlx5_stall_cq_poll_min = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
	if (env_value)
		mlx5_stall_cq_poll_max = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
	if (env_value)
		mlx5_stall_cq_inc_step = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
	if (env_value)
		mlx5_stall_cq_dec_step = atoi(env_value);

	ctx->stall_adaptive_enable = 0;
	ctx->stall_cycles = 0;

	if (mlx5_stall_num_loop < 0) {
		ctx->stall_adaptive_enable = 1;
		ctx->stall_cycles = mlx5_stall_cq_poll_min;
	}
}

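/*
 * Work out how many uuars to request from the kernel: MLX5_DEF_TOT_UUARS by
 * default, optionally overridden by MLX5_TOTAL_UUARS. The value is raised to
 * at least one system page worth of bfregs, aligned to the number of
 * non-fast-path bfregs per UAR, and rejected if it exceeds MLX5_MAX_BFREGS.
 */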
static int get_total_uuars(int page_size)
{
	int size = MLX5_DEF_TOT_UUARS;
	int uuars_in_page;
	char *env;

	env = getenv("MLX5_TOTAL_UUARS");
	if (env)
		size = atoi(env);

	if (size < 1)
		return -EINVAL;

	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
	size = max(uuars_in_page, size);
	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	if (size > MLX5_MAX_BFREGS)
		return -ENOMEM;

	return size;
}

static void open_debug_file(struct mlx5_context *ctx)
{
	char *env;

	env = getenv("MLX5_DEBUG_FILE");
	if (!env) {
		ctx->dbg_fp = stderr;
		return;
	}

	ctx->dbg_fp = fopen(env, "aw+");
	if (!ctx->dbg_fp) {
		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
		ctx->dbg_fp = stderr;
		return;
	}
}

static void close_debug_file(struct mlx5_context *ctx)
{
	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
		fclose(ctx->dbg_fp);
}

static void set_debug_mask(void)
{
	char *env;

	env = getenv("MLX5_DEBUG_MASK");
	if (env)
		mlx5_debug_mask = strtol(env, NULL, 0);
}

static void set_freeze_on_error(void)
{
	char *env;

	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
	if (env)
		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
}

static int get_always_bf(void)
{
	char *env;

	env = getenv("MLX5_POST_SEND_PREFER_BF");
	if (!env)
		return 1;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_shut_up_bf(void)
{
	char *env;

	env = getenv("MLX5_SHUT_UP_BF");
	if (!env)
		return 0;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_num_low_lat_uuars(int tot_uuars)
{
	char *env;
	int num = 4;

	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
	if (env)
		num = atoi(env);

	if (num < 0)
		return -EINVAL;

	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
	return num;
}

/* The library allocates an array of uuar contexts. The one at index zero is
 * not subject to the odd/even policy, so it needs no lock, but it may not use
 * blue flame. The upper ones (the low_lat_uuars) can use blue flame without a
 * lock since each is assigned to a single QP. The rest can use blue flame,
 * but since they are shared they need a lock.
 */
static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
{
	if (uuarn == 0 || mlx5_single_threaded)
		return 0;

	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
		return 0;

	return 1;
}

static int single_threaded_app(void)
{
	char *env;

	env = getenv("MLX5_SINGLE_THREADED");
	if (env)
		return strcmp(env, "1") ? 0 : 1;

	return 0;
}

static int mlx5_cmd_get_context(struct mlx5_context *context,
				struct mlx5_alloc_ucontext *req,
				size_t req_len,
				struct mlx5_alloc_ucontext_resp *resp,
				size_t resp_len)
{
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 req_len, &resp->ibv_resp, resp_len))
		return 0;

	/* The ibv_cmd_get_context fails in older kernels when passing
	 * a request length that the kernel doesn't know.
	 * To avoid breaking compatibility of new libmlx5 and older
	 * kernels, when ibv_cmd_get_context fails with the full
	 * request length, we try once again with the legacy length.
	 * We repeat this process while reducing requested size based
	 * on the feature input size. To avoid this in the future, we
	 * will remove the check in kernel that requires fields unknown
	 * to the kernel to be cleared. This will require that any new
	 * feature that involves extending struct mlx5_alloc_ucontext
	 * will be accompanied by an indication in the form of one or
	 * more fields in struct mlx5_alloc_ucontext_resp. If the
	 * response value can be interpreted as feature not supported
	 * when the returned value is zero, this will suffice to
	 * indicate to the library that the request was ignored by the
	 * kernel, either because it is unaware or because it decided
	 * to do so. If zero is a valid response, we will add a new
	 * field that indicates whether the request was handled.
	 */
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
				 &resp->ibv_resp, resp_len))
		return 0;

	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				   offsetof(struct mlx5_alloc_ucontext,
					    cqe_version),
				   &resp->ibv_resp, resp_len);
}

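/*
 * Map the HCA core clock page read-only into the process so timestamps can
 * be read from user space. On failure only a warning is printed; the context
 * remains usable without the clock page.
 */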
static int mlx5_map_internal_clock(struct mlx5_device *mdev,
				   struct ibv_context *ibv_ctx)
{
	struct mlx5_context *context = to_mctx(ibv_ctx);
	void *hca_clock_page;
	off_t offset = 0;

	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
	hca_clock_page = mmap(NULL, mdev->page_size,
			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
			      mdev->page_size * offset);

	if (hca_clock_page == MAP_FAILED) {
		fprintf(stderr, PFX
			"Warning: Timestamp available,\n"
			"but failed to mmap() hca core clock page.\n");
		return -1;
	}

	context->hca_core_clock = hca_clock_page +
		(context->core_clock.offset & (mdev->page_size - 1));
	return 0;
}

int mlx5dv_query_device(struct ibv_context *ctx_in,
			struct mlx5dv_context *attrs_out)
{
	struct mlx5_context *mctx = to_mctx(ctx_in);
	uint64_t comp_mask_out = 0;

	attrs_out->version = 0;
	attrs_out->flags = 0;

	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
	}

	attrs_out->comp_mask = comp_mask_out;

	return 0;
}

static int mlx5dv_get_qp(struct ibv_qp *qp_in,
			 struct mlx5dv_qp *qp_out)
{
	struct mlx5_qp *mqp = to_mqp(qp_in);

	qp_out->comp_mask = 0;
	qp_out->dbrec = mqp->db;

	if (mqp->sq_buf_size)
		/* IBV_QPT_RAW_PACKET */
		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
	else
		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
	qp_out->sq.stride = 1 << mqp->sq.wqe_shift;

	qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
	qp_out->rq.stride = 1 << mqp->rq.wqe_shift;

	qp_out->bf.reg = mqp->bf->reg;

	if (mqp->bf->uuarn > 0)
		qp_out->bf.size = mqp->bf->buf_size;
	else
		qp_out->bf.size = 0;

	return 0;
}

static int mlx5dv_get_cq(struct ibv_cq *cq_in,
			 struct mlx5dv_cq *cq_out)
{
	struct mlx5_cq *mcq = to_mcq(cq_in);
	struct mlx5_context *mctx = to_mctx(cq_in->context);

	cq_out->comp_mask = 0;
	cq_out->cqn = mcq->cqn;
	cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1;
	cq_out->cqe_size = mcq->cqe_sz;
	cq_out->buf = mcq->active_buf->buf;
	cq_out->dbrec = mcq->dbrec;
	cq_out->uar = mctx->uar;

	mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED;

	return 0;
}

static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
			  struct mlx5dv_rwq *rwq_out)
{
	struct mlx5_rwq *mrwq = to_mrwq(wq_in);

	rwq_out->comp_mask = 0;
	rwq_out->buf = mrwq->pbuff;
	rwq_out->dbrec = mrwq->recv_db;
	rwq_out->wqe_cnt = mrwq->rq.wqe_cnt;
	rwq_out->stride = 1 << mrwq->rq.wqe_shift;

	return 0;
}

static int mlx5dv_get_srq(struct ibv_srq *srq_in,
			  struct mlx5dv_srq *srq_out)
{
	struct mlx5_srq *msrq;

	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);

	srq_out->comp_mask = 0;
	srq_out->buf = msrq->buf.buf;
	srq_out->dbrec = msrq->db;
	srq_out->stride = 1 << msrq->wqe_shift;
	srq_out->head = msrq->head;
	srq_out->tail = msrq->tail;

	return 0;
}

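/*
 * Fill in the direct-verbs views requested by obj_type. An illustrative
 * caller-side usage sketch (not part of this file):
 *
 *	struct mlx5dv_qp dv_qp;
 *	struct mlx5dv_cq dv_cq;
 *	struct mlx5dv_obj obj;
 *
 *	obj.qp.in = qp;   obj.qp.out = &dv_qp;
 *	obj.cq.in = cq;   obj.cq.out = &dv_cq;
 *	if (!mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP | MLX5DV_OBJ_CQ))
 *		use dv_qp.sq.buf, dv_cq.buf, the doorbell records, etc.
 *
 * A failure while filling one object skips the remaining ones.
 */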
int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
	int ret = 0;

	if (obj_type & MLX5DV_OBJ_QP)
		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);

	return ret;
}

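/*
 * Derive the UAR geometry from the kernel response. Old kernels report
 * neither log_uar_size nor num_uars_per_page; in that case a single
 * system-page-sized UAR per mapping is assumed.
 */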
static void adjust_uar_info(struct mlx5_device *mdev,
			    struct mlx5_context *context,
			    struct mlx5_alloc_ucontext_resp resp)
{
	if (!resp.log_uar_size && !resp.num_uars_per_page) {
		/* old kernel */
		context->uar_size = mdev->page_size;
		context->num_uars_per_page = 1;
		return;
	}

	context->uar_size = 1 << resp.log_uar_size;
	context->num_uars_per_page = resp.num_uars_per_page;
}

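/*
 * Context initialization: negotiate the ucontext with the kernel, map the
 * UAR pages and carve them into blue flame registers, set up the resource
 * tables and their locks, wire up the verbs ops, and cache device and port
 * attributes. On any failure the function unwinds what was set up and
 * returns errno.
 */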
static int mlx5_init_context(struct verbs_device *vdev,
			     struct ibv_context *ctx, int cmd_fd)
{
	struct mlx5_context *context;
	struct mlx5_alloc_ucontext req;
	struct mlx5_alloc_ucontext_resp resp;
	int i;
	int page_size;
	int tot_uuars;
	int low_lat_uuars;
	int gross_uuars;
	int j;
	off_t offset;
	struct mlx5_device *mdev;
	struct verbs_context *v_ctx;
	struct ibv_port_attr port_attr;
	struct ibv_device_attr_ex device_attr;
	int k;
	int bfi;
	int num_sys_page_map;

	mdev = to_mdev(&vdev->device);
	v_ctx = verbs_get_ctx(ctx);
	page_size = mdev->page_size;
	mlx5_single_threaded = single_threaded_app();

	context = to_mctx(ctx);
	context->ibv_ctx.cmd_fd = cmd_fd;

	open_debug_file(context);
	set_debug_mask();
	set_freeze_on_error();
	if (gethostname(context->hostname, sizeof(context->hostname)))
		strcpy(context->hostname, "host_unknown");

	tot_uuars = get_total_uuars(page_size);
	if (tot_uuars < 0) {
		errno = -tot_uuars;
		goto err_free;
	}

	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
	if (low_lat_uuars < 0) {
		errno = -low_lat_uuars;
		goto err_free;
	}

	if (low_lat_uuars > tot_uuars - 1) {
		errno = ENOMEM;
		goto err_free;
	}

	memset(&req, 0, sizeof(req));
	memset(&resp, 0, sizeof(resp));

	req.total_num_uuars = tot_uuars;
	req.num_low_latency_uuars = low_lat_uuars;
	req.cqe_version = MLX5_CQE_VERSION_V1;
	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;

	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
				 sizeof(resp)))
		goto err_free;

	context->max_num_qps = resp.qp_tab_size;
	context->bf_reg_size = resp.bf_reg_size;
	context->tot_uuars = resp.tot_uuars;
	context->low_lat_uuars = low_lat_uuars;
	context->cache_line_size = resp.cache_line_size;
	context->max_sq_desc_sz = resp.max_sq_desc_sz;
	context->max_rq_desc_sz = resp.max_rq_desc_sz;
	context->max_send_wqebb = resp.max_send_wqebb;
	context->num_ports = resp.num_ports;
	context->max_recv_wr = resp.max_recv_wr;
	context->max_srq_recv_wr = resp.max_srq_recv_wr;

	context->cqe_version = resp.cqe_version;
	if (context->cqe_version) {
		if (context->cqe_version == MLX5_CQE_VERSION_V1)
			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
		else
			goto err_free;
	}

	adjust_uar_info(mdev, context, resp);

	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
	if (!context->bfs) {
		errno = ENOMEM;
		goto err_free;
	}

	context->cmds_supp_uhw = resp.cmds_supp_uhw;
	context->vendor_cap_flags = 0;

	if (pthread_mutex_init(&context->qp_table_mutex, NULL))
		goto err_free_bf;
	if (pthread_mutex_init(&context->srq_table_mutex, NULL))
		goto err_qp_table_mutex;
	if (pthread_mutex_init(&context->uidx_table_mutex, NULL))
		goto err_srq_table_mutex;
	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->qp_table[i].refcnt = 0;

	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->uidx_table[i].refcnt = 0;

	context->db_list = NULL;

	if (pthread_mutex_init(&context->db_list_mutex, NULL))
		goto err_uidx_table_mutex;

	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; ++i) {
		offset = 0;
		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
		set_index(i, &offset);
		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
				       cmd_fd, page_size * offset);
		if (context->uar[i] == MAP_FAILED) {
			context->uar[i] = NULL;
			goto err_db_list_mutex;
		}
	}

	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
							MLX5_BF_OFFSET + k * context->bf_reg_size;
				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
				if (mlx5_spinlock_init(&context->bfs[bfi].lock))
					goto err_bfs_spl;
				context->bfs[bfi].offset = 0;
				if (bfi)
					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
				context->bfs[bfi].uuarn = bfi;
			}
		}
	}
	context->hca_core_clock = NULL;
	if (resp.response_length + sizeof(resp.ibv_resp) >=
	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
	    sizeof(resp.hca_core_clock_offset) &&
	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
		context->core_clock.offset = resp.hca_core_clock_offset;
		mlx5_map_internal_clock(mdev, ctx);
	}

	if (mlx5_spinlock_init(&context->lock32))
		goto err_bfs_spl;

	context->prefer_bf = get_always_bf();
	context->shut_up_bf = get_shut_up_bf();
	mlx5_read_env(&vdev->device, context);

	if (mlx5_spinlock_init(&context->hugetlb_lock))
		goto err_32_spl;
	TAILQ_INIT(&context->hugetlb_list);

	context->ibv_ctx.ops = mlx5_ctx_ops;

	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);

	memset(&device_attr, 0, sizeof(device_attr));
	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
				  sizeof(struct ibv_device_attr_ex))) {
		context->cached_device_cap_flags =
			device_attr.orig_attr.device_cap_flags;
		context->atomic_cap = device_attr.orig_attr.atomic_cap;
		context->cached_tso_caps = device_attr.tso_caps;
	}

	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
		memset(&port_attr, 0, sizeof(port_attr));
		if (!mlx5_query_port(ctx, j + 1, &port_attr))
			context->cached_link_layer[j] = port_attr.link_layer;
	}

	return 0;

err_32_spl:
	mlx5_spinlock_destroy(&context->lock32);

err_bfs_spl:
	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				mlx5_spinlock_destroy(&context->bfs[bfi].lock);
			}
		}
	}

err_db_list_mutex:
	pthread_mutex_destroy(&context->db_list_mutex);

err_uidx_table_mutex:
	pthread_mutex_destroy(&context->uidx_table_mutex);

err_srq_table_mutex:
	pthread_mutex_destroy(&context->srq_table_mutex);

err_qp_table_mutex:
	pthread_mutex_destroy(&context->qp_table_mutex);

err_free_bf:
	free(context->bfs);

err_free:
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	close_debug_file(context);
	return errno;
}

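/*
 * Undo mlx5_init_context: destroy the per-bfreg spinlocks and the table
 * mutexes, free the bfreg array, unmap the UAR pages and, if mapped, the
 * HCA core clock page, and close the debug file.
 */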
static void mlx5_cleanup_context(struct verbs_device *device,
				 struct ibv_context *ibctx)
{
	struct mlx5_context *context = to_mctx(ibctx);
	int page_size = to_mdev(ibctx->device)->page_size;
	int i;
	int j;
	int k;
	int bfi;
	int num_sys_page_map;

	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				mlx5_spinlock_destroy(&context->bfs[bfi].lock);
			}
		}
	}
	mlx5_spinlock_destroy(&context->hugetlb_lock);
	mlx5_spinlock_destroy(&context->lock32);
	pthread_mutex_destroy(&context->db_list_mutex);
	pthread_mutex_destroy(&context->uidx_table_mutex);
	pthread_mutex_destroy(&context->srq_table_mutex);
	pthread_mutex_destroy(&context->qp_table_mutex);

	free(context->bfs);
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	if (context->hca_core_clock)
		munmap(context->hca_core_clock - context->core_clock.offset,
		       page_size);
	close_debug_file(context);
}

static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};

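/*
 * Match a uverbs device against the PCI IDs in hca_table; on a match, check
 * the kernel ABI version and allocate the verbs_device that libibverbs will
 * use to create contexts through mlx5_dev_ops.
 */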
static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
					     int abi_version)
{
	char value[8];
	struct mlx5_device *dev;
	unsigned vendor, device;
	int i;

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &vendor);

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &device);

	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
		if (vendor == hca_table[i].vendor &&
		    device == hca_table[i].device)
			goto found;

	return NULL;

found:
	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
			"(min supported %d, max supported %d)\n",
			abi_version, uverbs_sys_path,
			MLX5_UVERBS_MIN_ABI_VERSION,
			MLX5_UVERBS_MAX_ABI_VERSION);
		return NULL;
	}

	dev = calloc(1, sizeof *dev);
	if (!dev) {
		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
			uverbs_sys_path);
		return NULL;
	}

	dev->page_size = sysconf(_SC_PAGESIZE);
	dev->driver_abi_ver = abi_version;

	dev->verbs_dev.ops = &mlx5_dev_ops;
	dev->verbs_dev.sz = sizeof(*dev);
	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
		sizeof(struct ibv_context);

	return &dev->verbs_dev;
}

static __attribute__((constructor)) void mlx5_register_driver(void)
{
	verbs_register_driver("mlx5", mlx5_driver_init);
}