xref: /freebsd/contrib/ofed/libmlx5/mlx5.c (revision b07549f3c15637af6064d2f52b572f239f5ba870)
1 /*
2  * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 #define _GNU_SOURCE
33 #include <config.h>
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <sys/mman.h>
40 #include <pthread.h>
41 #include <string.h>
42 #include <sched.h>
43 #include <sys/param.h>
44 #include <sys/cpuset.h>
45 
46 #include "mlx5.h"
47 #include "mlx5-abi.h"
48 
/* PCI vendor ID used for every entry of hca_table below. */
#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX			0x15b3
#endif

/*
 * Fallbacks for build environments whose headers do not provide the cpuset
 * helpers: CPU_OR expands to a no-op and CPU_EQUAL always evaluates to true.
 */
#ifndef CPU_OR
#define CPU_OR(x, y, z) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif


/* Build one hca_table entry from a vendor suffix (v) and a PCI device ID (d). */
#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v,			\
	  .device = d }
65 
/*
 * PCI vendor/device pairs handled by this provider. mlx5_driver_init()
 * compares the IDs it reads for a device against every entry and rejects
 * devices that are not listed here.
 */
static struct {
	unsigned		vendor;
	unsigned		device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),    /* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),    /* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 4125),	/* ConnectX-6 DX */
	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
	HCA(MELLANOX, 4127),	/* ConnectX-6 LX */
	HCA(MELLANOX, 4129),	/* ConnectX-7 */
	HCA(MELLANOX, 4131),	/* ConnectX-8 */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
	HCA(MELLANOX, 41686),	/* BlueField-2 integrated ConnectX-6 Dx network controller */
	HCA(MELLANOX, 41692),	/* BlueField-3 integrated ConnectX-7 network controller */
	HCA(MELLANOX, 41695),	/* BlueField-4 integrated ConnectX-8 network controller */
};
93 
/* Debug mask; overwritten from MLX5_DEBUG_MASK by set_debug_mask(). */
uint32_t mlx5_debug_mask = 0;
/* Overwritten from MLX5_FREEZE_ON_ERROR_CQE by set_freeze_on_error(). */
int mlx5_freeze_on_error_cqe;
96 
/*
 * Verb entry points copied into every context by mlx5_init_context().
 * Note that mlx5_init_context() rewrites .poll_cq in this shared table when
 * the kernel reports CQE version 1.
 */
static struct ibv_context_ops mlx5_ctx_ops = {
	.query_device  = mlx5_query_device,
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr	       = mlx5_reg_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast
};
130 
/*
 * Parse the integer that follows the first ':' in a text line.
 *
 * On success the atoi() conversion of the text after the colon is stored in
 * *value and 0 is returned. If the line contains no ':' the function returns
 * 1 and leaves *value untouched.
 */
static int read_number_from_line(const char *line, int *value)
{
	const char *colon = strchr(line, ':');

	if (colon == NULL)
		return 1;

	/* atoi() skips the blanks that follow the colon */
	*value = atoi(colon + 1);
	return 0;
}
144 /**
145  * The function looks for the first free user-index in all the
146  * user-index tables. If all are used, returns -1, otherwise
147  * a valid user-index.
148  * In case the reference count of the table is zero, it means the
149  * table is not in use and wasn't allocated yet, therefore the
150  * mlx5_store_uidx allocates the table, and increment the reference
151  * count on the table.
152  */
153 static int32_t get_free_uidx(struct mlx5_context *ctx)
154 {
155 	int32_t tind;
156 	int32_t i;
157 
158 	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
159 		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
160 			break;
161 	}
162 
163 	if (tind == MLX5_UIDX_TABLE_SIZE)
164 		return -1;
165 
166 	if (!ctx->uidx_table[tind].refcnt)
167 		return tind << MLX5_UIDX_TABLE_SHIFT;
168 
169 	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
170 		if (!ctx->uidx_table[tind].table[i])
171 			break;
172 	}
173 
174 	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
175 }
176 
177 int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
178 {
179 	int32_t tind;
180 	int32_t ret = -1;
181 	int32_t uidx;
182 
183 	pthread_mutex_lock(&ctx->uidx_table_mutex);
184 	uidx = get_free_uidx(ctx);
185 	if (uidx < 0)
186 		goto out;
187 
188 	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
189 
190 	if (!ctx->uidx_table[tind].refcnt) {
191 		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
192 						     sizeof(struct mlx5_resource *));
193 		if (!ctx->uidx_table[tind].table)
194 			goto out;
195 	}
196 
197 	++ctx->uidx_table[tind].refcnt;
198 	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
199 	ret = uidx;
200 
201 out:
202 	pthread_mutex_unlock(&ctx->uidx_table_mutex);
203 	return ret;
204 }
205 
206 void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
207 {
208 	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
209 
210 	pthread_mutex_lock(&ctx->uidx_table_mutex);
211 
212 	if (!--ctx->uidx_table[tind].refcnt)
213 		free(ctx->uidx_table[tind].table);
214 	else
215 		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;
216 
217 	pthread_mutex_unlock(&ctx->uidx_table_mutex);
218 }
219 
/*
 * Scan /proc/cpuinfo for a Sandy Bridge CPU (family 6, model 0x2A or 0x2D).
 *
 * Returns 1 if such a processor entry is seen and 0 otherwise, including
 * when /proc/cpuinfo cannot be opened (in which case *num_cores is not
 * written). On a successful open, *num_cores is set to the number of
 * "processor" lines encountered.
 */
static int mlx5_is_sandy_bridge(int *num_cores)
{
	char buf[128];
	FILE *cpuinfo;
	int found = 0;
	int family = -1;
	int model = -1;
	int parsed;

	cpuinfo = fopen("/proc/cpuinfo", "r");
	if (cpuinfo == NULL)
		return 0;

	*num_cores = 0;

	while (fgets(buf, 128, cpuinfo) != NULL) {
		if (strncmp(buf, "processor", 9) == 0) {
			/* a new processor entry starts: forget the previous one */
			*num_cores += 1;
			family = -1;
			model = -1;
		} else if (strncmp(buf, "cpu family", 10) == 0) {
			if (family < 0 && !read_number_from_line(buf, &parsed))
				family = parsed;
		} else if (strncmp(buf, "model", 5) == 0) {
			/* only the first "model..." line of an entry is used */
			if (model < 0 && !read_number_from_line(buf, &parsed))
				model = parsed;
		}

		/* Sandy Bridge check, re-evaluated after every line */
		if (family == 6 && (model == 0x2A || model == 0x2D))
			found = 1;
	}

	fclose(cpuinfo);
	return found;
}
260 
261 /*
262 man cpuset
263 
264   This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
265   are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
266   words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
267   within a word are also in big-endian order.
268 
269   The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
270   the size of the bitmask.
271 
272   Examples of the Mask Format:
273 
274      00000001                        # just bit 0 set
275      40000000,00000000,00000000      # just bit 94 set
276      000000ff,00000000               # bits 32-39 set
277      00000000,000E3862               # 1,5,6,11-13,17-19 set
278 
279   A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:
280 
281      00000001,00000001,00010117
282 
283   The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
284   bit 4, and the "7" is for bits 2, 1, and 0.
285 */
286 static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
287 {
288 	char *p, buf[1024];
289 	char *env_value;
290 	uint32_t word;
291 	int i, k;
292 
293 	env_value = getenv("MLX5_LOCAL_CPUS");
294 	if (env_value)
295 		strncpy(buf, env_value, sizeof(buf));
296 	else {
297 		char fname[MAXPATHLEN];
298 
299 		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
300 			 ibv_get_device_name(ibdev));
301 
302 		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
303 			fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
304 			return;
305 		}
306 	}
307 
308 	p = strrchr(buf, ',');
309 	if (!p)
310 		p = buf;
311 
312 	i = 0;
313 	do {
314 		if (*p == ',') {
315 			*p = 0;
316 			p ++;
317 		}
318 
319 		word = strtoul(p, NULL, 16);
320 
321 		for (k = 0; word; ++k, word >>= 1)
322 			if (word & 1)
323 				CPU_SET(k+i, cpu_set);
324 
325 		if (p == buf)
326 			break;
327 
328 		p = strrchr(buf, ',');
329 		if (!p)
330 			p = buf;
331 
332 		i += 32;
333 	} while (i < CPU_SETSIZE);
334 }
335 
336 static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
337 {
338 	cpuset_t my_cpus, dev_local_cpus, result_set;
339 	int stall_enable;
340 	int ret;
341 	int num_cores;
342 
343 	if (!mlx5_is_sandy_bridge(&num_cores))
344 		return 0;
345 
346 	/* by default enable stall on sandy bridge arch */
347 	stall_enable = 1;
348 
349 	/*
350 	 * check if app is bound to cpu set that is inside
351 	 * of device local cpu set. Disable stalling if true
352 	 */
353 
354 	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
355 	CPU_ZERO(&my_cpus);
356 	CPU_ZERO(&dev_local_cpus);
357 	CPU_ZERO(&result_set);
358 	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
359 	    sizeof(my_cpus), &my_cpus);
360 	if (ret == -1) {
361 		if (errno == EINVAL)
362 			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
363 		else
364 			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
365 		goto out;
366 	}
367 
368 	/* get device local cpu set */
369 	mlx5_local_cpu_set(ibdev, &dev_local_cpus);
370 
371 	/* check if my cpu set is in dev cpu */
372 #if __FreeBSD_version < 1400046
373 	CPU_OR(&result_set, &my_cpus);
374 	CPU_OR(&result_set, &dev_local_cpus);
375 #else
376 	CPU_OR(&result_set, &my_cpus, &dev_local_cpus);
377 #endif
378 	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;
379 
380 out:
381 	return stall_enable;
382 }
383 
384 static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
385 {
386 	char *env_value;
387 
388 	env_value = getenv("MLX5_STALL_CQ_POLL");
389 	if (env_value)
390 		/* check if cq stall is enforced by user */
391 		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
392 	else
393 		/* autodetect if we need to do cq polling */
394 		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);
395 
396 	env_value = getenv("MLX5_STALL_NUM_LOOP");
397 	if (env_value)
398 		mlx5_stall_num_loop = atoi(env_value);
399 
400 	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
401 	if (env_value)
402 		mlx5_stall_cq_poll_min = atoi(env_value);
403 
404 	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
405 	if (env_value)
406 		mlx5_stall_cq_poll_max = atoi(env_value);
407 
408 	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
409 	if (env_value)
410 		mlx5_stall_cq_inc_step = atoi(env_value);
411 
412 	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
413 	if (env_value)
414 		mlx5_stall_cq_dec_step = atoi(env_value);
415 
416 	ctx->stall_adaptive_enable = 0;
417 	ctx->stall_cycles = 0;
418 
419 	if (mlx5_stall_num_loop < 0) {
420 		ctx->stall_adaptive_enable = 1;
421 		ctx->stall_cycles = mlx5_stall_cq_poll_min;
422 	}
423 
424 }
425 
426 static int get_total_uuars(int page_size)
427 {
428 	int size = MLX5_DEF_TOT_UUARS;
429 	int uuars_in_page;
430 	char *env;
431 
432 	env = getenv("MLX5_TOTAL_UUARS");
433 	if (env)
434 		size = atoi(env);
435 
436 	if (size < 1)
437 		return -EINVAL;
438 
439 	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
440 	size = max(uuars_in_page, size);
441 	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
442 	if (size > MLX5_MAX_BFREGS)
443 		return -ENOMEM;
444 
445 	return size;
446 }
447 
448 static void open_debug_file(struct mlx5_context *ctx)
449 {
450 	char *env;
451 
452 	env = getenv("MLX5_DEBUG_FILE");
453 	if (!env) {
454 		ctx->dbg_fp = stderr;
455 		return;
456 	}
457 
458 	ctx->dbg_fp = fopen(env, "aw+");
459 	if (!ctx->dbg_fp) {
460 		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
461 		ctx->dbg_fp = stderr;
462 		return;
463 	}
464 }
465 
466 static void close_debug_file(struct mlx5_context *ctx)
467 {
468 	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
469 		fclose(ctx->dbg_fp);
470 }
471 
472 static void set_debug_mask(void)
473 {
474 	char *env;
475 
476 	env = getenv("MLX5_DEBUG_MASK");
477 	if (env)
478 		mlx5_debug_mask = strtol(env, NULL, 0);
479 }
480 
481 static void set_freeze_on_error(void)
482 {
483 	char *env;
484 
485 	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
486 	if (env)
487 		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
488 }
489 
/*
 * Read MLX5_POST_SEND_PREFER_BF.
 * Returns 1 when the variable is unset or set to anything other than "0",
 * and 0 when it is exactly "0".
 */
static int get_always_bf(void)
{
	const char *pref = getenv("MLX5_POST_SEND_PREFER_BF");

	if (pref == NULL)
		return 1;

	return strcmp(pref, "0") != 0;
}
500 
/*
 * Read MLX5_SHUT_UP_BF.
 * Returns 0 when the variable is unset or exactly "0", and 1 for any other
 * value.
 */
static int get_shut_up_bf(void)
{
	const char *setting = getenv("MLX5_SHUT_UP_BF");

	if (setting == NULL)
		return 0;

	return strcmp(setting, "0") != 0;
}
511 
512 static int get_num_low_lat_uuars(int tot_uuars)
513 {
514 	char *env;
515 	int num = 4;
516 
517 	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
518 	if (env)
519 		num = atoi(env);
520 
521 	if (num < 0)
522 		return -EINVAL;
523 
524 	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
525 	return num;
526 }
527 
528 /* The library allocates an array of uuar contexts. The one in index zero does
529  * not to execersize odd/even policy so it can avoid a lock but it may not use
530  * blue flame. The upper ones, low_lat_uuars can use blue flame with no lock
531  * since they are assigned to one QP only. The rest can use blue flame but since
532  * they are shared they need a lock
533  */
534 static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
535 {
536 	if (uuarn == 0 || mlx5_single_threaded)
537 		return 0;
538 
539 	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
540 		return 0;
541 
542 	return 1;
543 }
544 
/*
 * Read MLX5_SINGLE_THREADED.
 * Returns 1 only when the variable is set to exactly "1", otherwise 0.
 */
static int single_threaded_app(void)
{
	const char *setting = getenv("MLX5_SINGLE_THREADED");

	if (setting == NULL)
		return 0;

	return strcmp(setting, "1") == 0;
}
556 
557 static int mlx5_cmd_get_context(struct mlx5_context *context,
558 				struct mlx5_alloc_ucontext *req,
559 				size_t req_len,
560 				struct mlx5_alloc_ucontext_resp *resp,
561 				size_t resp_len)
562 {
563 	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
564 				 req_len, &resp->ibv_resp, resp_len))
565 		return 0;
566 
567 	/* The ibv_cmd_get_context fails in older kernels when passing
568 	 * a request length that the kernel doesn't know.
569 	 * To avoid breaking compatibility of new libmlx5 and older
570 	 * kernels, when ibv_cmd_get_context fails with the full
571 	 * request length, we try once again with the legacy length.
572 	 * We repeat this process while reducing requested size based
573 	 * on the feature input size. To avoid this in the future, we
574 	 * will remove the check in kernel that requires fields unknown
575 	 * to the kernel to be cleared. This will require that any new
576 	 * feature that involves extending struct mlx5_alloc_ucontext
577 	 * will be accompanied by an indication in the form of one or
578 	 * more fields in struct mlx5_alloc_ucontext_resp. If the
579 	 * response value can be interpreted as feature not supported
580 	 * when the returned value is zero, this will suffice to
581 	 * indicate to the library that the request was ignored by the
582 	 * kernel, either because it is unaware or because it decided
583 	 * to do so. If zero is a valid response, we will add a new
584 	 * field that indicates whether the request was handled.
585 	 */
586 	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
587 				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
588 				 &resp->ibv_resp, resp_len))
589 		return 0;
590 
591 	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
592 				   offsetof(struct mlx5_alloc_ucontext,
593 					    cqe_version),
594 				   &resp->ibv_resp, resp_len);
595 }
596 
597 static int mlx5_map_internal_clock(struct mlx5_device *mdev,
598 				   struct ibv_context *ibv_ctx)
599 {
600 	struct mlx5_context *context = to_mctx(ibv_ctx);
601 	void *hca_clock_page;
602 	off_t offset = 0;
603 
604 	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
605 	hca_clock_page = mmap(NULL, mdev->page_size,
606 			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
607 			      mdev->page_size * offset);
608 
609 	if (hca_clock_page == MAP_FAILED) {
610 		fprintf(stderr, PFX
611 			"Warning: Timestamp available,\n"
612 			"but failed to mmap() hca core clock page.\n");
613 		return -1;
614 	}
615 
616 	context->hca_core_clock = hca_clock_page +
617 		(context->core_clock.offset & (mdev->page_size - 1));
618 	return 0;
619 }
620 
621 int mlx5dv_query_device(struct ibv_context *ctx_in,
622 			 struct mlx5dv_context *attrs_out)
623 {
624 	struct mlx5_context *mctx = to_mctx(ctx_in);
625 	uint64_t comp_mask_out = 0;
626 
627 	attrs_out->version   = 0;
628 	attrs_out->flags     = 0;
629 
630 	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
631 		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;
632 
633 	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
634 		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;
635 
636 	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
637 		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
638 		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
639 	}
640 
641 	attrs_out->comp_mask = comp_mask_out;
642 
643 	return 0;
644 }
645 
646 static int mlx5dv_get_qp(struct ibv_qp *qp_in,
647 			 struct mlx5dv_qp *qp_out)
648 {
649 	struct mlx5_qp *mqp = to_mqp(qp_in);
650 
651 	qp_out->comp_mask = 0;
652 	qp_out->dbrec     = mqp->db;
653 
654 	if (mqp->sq_buf_size)
655 		/* IBV_QPT_RAW_PACKET */
656 		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
657 	else
658 		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
659 	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
660 	qp_out->sq.stride  = 1 << mqp->sq.wqe_shift;
661 
662 	qp_out->rq.buf     = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
663 	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
664 	qp_out->rq.stride  = 1 << mqp->rq.wqe_shift;
665 
666 	qp_out->bf.reg    = mqp->bf->reg;
667 
668 	if (mqp->bf->uuarn > 0)
669 		qp_out->bf.size = mqp->bf->buf_size;
670 	else
671 		qp_out->bf.size = 0;
672 
673 	return 0;
674 }
675 
676 static int mlx5dv_get_cq(struct ibv_cq *cq_in,
677 			 struct mlx5dv_cq *cq_out)
678 {
679 	struct mlx5_cq *mcq = to_mcq(cq_in);
680 	struct mlx5_context *mctx = to_mctx(cq_in->context);
681 
682 	cq_out->comp_mask = 0;
683 	cq_out->cqn       = mcq->cqn;
684 	cq_out->cqe_cnt   = mcq->ibv_cq.cqe + 1;
685 	cq_out->cqe_size  = mcq->cqe_sz;
686 	cq_out->buf       = mcq->active_buf->buf;
687 	cq_out->dbrec     = mcq->dbrec;
688 	cq_out->uar	  = mctx->uar;
689 
690 	mcq->flags	 |= MLX5_CQ_FLAGS_DV_OWNED;
691 
692 	return 0;
693 }
694 
695 static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
696 			  struct mlx5dv_rwq *rwq_out)
697 {
698 	struct mlx5_rwq *mrwq = to_mrwq(wq_in);
699 
700 	rwq_out->comp_mask = 0;
701 	rwq_out->buf       = mrwq->pbuff;
702 	rwq_out->dbrec     = mrwq->recv_db;
703 	rwq_out->wqe_cnt   = mrwq->rq.wqe_cnt;
704 	rwq_out->stride    = 1 << mrwq->rq.wqe_shift;
705 
706 	return 0;
707 }
708 
709 static int mlx5dv_get_srq(struct ibv_srq *srq_in,
710 			  struct mlx5dv_srq *srq_out)
711 {
712 	struct mlx5_srq *msrq;
713 
714 	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);
715 
716 	srq_out->comp_mask = 0;
717 	srq_out->buf       = msrq->buf.buf;
718 	srq_out->dbrec     = msrq->db;
719 	srq_out->stride    = 1 << msrq->wqe_shift;
720 	srq_out->head      = msrq->head;
721 	srq_out->tail      = msrq->tail;
722 
723 	return 0;
724 }
725 
726 int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
727 {
728 	int ret = 0;
729 
730 	if (obj_type & MLX5DV_OBJ_QP)
731 		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
732 	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
733 		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
734 	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
735 		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
736 	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
737 		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);
738 
739 	return ret;
740 }
741 
742 static void adjust_uar_info(struct mlx5_device *mdev,
743 			    struct mlx5_context *context,
744 			    struct mlx5_alloc_ucontext_resp resp)
745 {
746 	if (!resp.log_uar_size && !resp.num_uars_per_page) {
747 		/* old kernel */
748 		context->uar_size = mdev->page_size;
749 		context->num_uars_per_page = 1;
750 		return;
751 	}
752 
753 	context->uar_size = 1 << resp.log_uar_size;
754 	context->num_uars_per_page = resp.num_uars_per_page;
755 }
756 
757 static int mlx5_init_context(struct verbs_device *vdev,
758 			     struct ibv_context *ctx, int cmd_fd)
759 {
760 	struct mlx5_context	       *context;
761 	struct mlx5_alloc_ucontext	req;
762 	struct mlx5_alloc_ucontext_resp resp;
763 	int				i;
764 	int				page_size;
765 	int				tot_uuars;
766 	int				low_lat_uuars;
767 	int				gross_uuars;
768 	int				j;
769 	off_t				offset;
770 	struct mlx5_device	       *mdev;
771 	struct verbs_context	       *v_ctx;
772 	struct ibv_port_attr		port_attr;
773 	struct ibv_device_attr_ex	device_attr;
774 	int				k;
775 	int				bfi;
776 	int				num_sys_page_map;
777 
778 	mdev = to_mdev(&vdev->device);
779 	v_ctx = verbs_get_ctx(ctx);
780 	page_size = mdev->page_size;
781 	mlx5_single_threaded = single_threaded_app();
782 
783 	context = to_mctx(ctx);
784 	context->ibv_ctx.cmd_fd = cmd_fd;
785 
786 	open_debug_file(context);
787 	set_debug_mask();
788 	set_freeze_on_error();
789 	if (gethostname(context->hostname, sizeof(context->hostname)))
790 		strcpy(context->hostname, "host_unknown");
791 
792 	tot_uuars = get_total_uuars(page_size);
793 	if (tot_uuars < 0) {
794 		errno = -tot_uuars;
795 		goto err_free;
796 	}
797 
798 	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
799 	if (low_lat_uuars < 0) {
800 		errno = -low_lat_uuars;
801 		goto err_free;
802 	}
803 
804 	if (low_lat_uuars > tot_uuars - 1) {
805 		errno = ENOMEM;
806 		goto err_free;
807 	}
808 
809 	memset(&req, 0, sizeof(req));
810 	memset(&resp, 0, sizeof(resp));
811 
812 	req.total_num_uuars = tot_uuars;
813 	req.num_low_latency_uuars = low_lat_uuars;
814 	req.cqe_version = MLX5_CQE_VERSION_V1;
815 	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;
816 
817 	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
818 				 sizeof(resp)))
819 		goto err_free;
820 
821 	context->max_num_qps		= resp.qp_tab_size;
822 	context->bf_reg_size		= resp.bf_reg_size;
823 	context->tot_uuars		= resp.tot_uuars;
824 	context->low_lat_uuars		= low_lat_uuars;
825 	context->cache_line_size	= resp.cache_line_size;
826 	context->max_sq_desc_sz = resp.max_sq_desc_sz;
827 	context->max_rq_desc_sz = resp.max_rq_desc_sz;
828 	context->max_send_wqebb	= resp.max_send_wqebb;
829 	context->num_ports	= resp.num_ports;
830 	context->max_recv_wr	= resp.max_recv_wr;
831 	context->max_srq_recv_wr = resp.max_srq_recv_wr;
832 
833 	context->cqe_version = resp.cqe_version;
834 	if (context->cqe_version) {
835 		if (context->cqe_version == MLX5_CQE_VERSION_V1)
836 			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
837 		else
838 			goto err_free;
839 	}
840 
841 	adjust_uar_info(mdev, context, resp);
842 
843 	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
844 	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
845 	if (!context->bfs) {
846 		errno = ENOMEM;
847 		goto err_free;
848 	}
849 
850 	context->cmds_supp_uhw = resp.cmds_supp_uhw;
851 	context->vendor_cap_flags = 0;
852 
853 	pthread_mutex_init(&context->qp_table_mutex, NULL);
854 	pthread_mutex_init(&context->srq_table_mutex, NULL);
855 	pthread_mutex_init(&context->uidx_table_mutex, NULL);
856 	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
857 		context->qp_table[i].refcnt = 0;
858 
859 	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
860 		context->uidx_table[i].refcnt = 0;
861 
862 	context->db_list = NULL;
863 
864 	pthread_mutex_init(&context->db_list_mutex, NULL);
865 
866 	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
867 	for (i = 0; i < num_sys_page_map; ++i) {
868 		offset = 0;
869 		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
870 		set_index(i, &offset);
871 		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
872 				       cmd_fd, page_size * offset);
873 		if (context->uar[i] == MAP_FAILED) {
874 			context->uar[i] = NULL;
875 			goto err_free_bf;
876 		}
877 	}
878 
879 	for (i = 0; i < num_sys_page_map; i++) {
880 		for (j = 0; j < context->num_uars_per_page; j++) {
881 			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
882 				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
883 				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
884 							MLX5_BF_OFFSET + k * context->bf_reg_size;
885 				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
886 				mlx5_spinlock_init(&context->bfs[bfi].lock);
887 				context->bfs[bfi].offset = 0;
888 				if (bfi)
889 					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
890 				context->bfs[bfi].uuarn = bfi;
891 			}
892 		}
893 	}
894 	context->hca_core_clock = NULL;
895 	if (resp.response_length + sizeof(resp.ibv_resp) >=
896 	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
897 	    sizeof(resp.hca_core_clock_offset) &&
898 	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
899 		context->core_clock.offset = resp.hca_core_clock_offset;
900 		mlx5_map_internal_clock(mdev, ctx);
901 	}
902 
903 	mlx5_spinlock_init(&context->lock32);
904 
905 	context->prefer_bf = get_always_bf();
906 	context->shut_up_bf = get_shut_up_bf();
907 	mlx5_read_env(&vdev->device, context);
908 
909 	mlx5_spinlock_init(&context->hugetlb_lock);
910 	TAILQ_INIT(&context->hugetlb_list);
911 
912 	context->ibv_ctx.ops = mlx5_ctx_ops;
913 
914 	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
915 	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
916 	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
917 	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
918 	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
919 	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
920 	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
921 	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
922 	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
923 	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
924 	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
925 	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
926 	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
927 	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
928 	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);
929 
930 	memset(&device_attr, 0, sizeof(device_attr));
931 	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
932 				  sizeof(struct ibv_device_attr_ex))) {
933 		context->cached_device_cap_flags =
934 			device_attr.orig_attr.device_cap_flags;
935 		context->atomic_cap = device_attr.orig_attr.atomic_cap;
936 		context->cached_tso_caps = device_attr.tso_caps;
937 	}
938 
939 	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
940 		memset(&port_attr, 0, sizeof(port_attr));
941 		if (!mlx5_query_port(ctx, j + 1, &port_attr))
942 			context->cached_link_layer[j] = port_attr.link_layer;
943 	}
944 
945 	return 0;
946 
947 err_free_bf:
948 	free(context->bfs);
949 
950 err_free:
951 	for (i = 0; i < MLX5_MAX_UARS; ++i) {
952 		if (context->uar[i])
953 			munmap(context->uar[i], page_size);
954 	}
955 	close_debug_file(context);
956 	return errno;
957 }
958 
959 static void mlx5_cleanup_context(struct verbs_device *device,
960 				 struct ibv_context *ibctx)
961 {
962 	struct mlx5_context *context = to_mctx(ibctx);
963 	int page_size = to_mdev(ibctx->device)->page_size;
964 	int i;
965 
966 	free(context->bfs);
967 	for (i = 0; i < MLX5_MAX_UARS; ++i) {
968 		if (context->uar[i])
969 			munmap(context->uar[i], page_size);
970 	}
971 	if (context->hca_core_clock)
972 		munmap(context->hca_core_clock - context->core_clock.offset,
973 		       page_size);
974 	close_debug_file(context);
975 }
976 
/* Context setup/teardown hooks installed on every device by mlx5_driver_init(). */
static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};
981 
982 static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
983 					     int abi_version)
984 {
985 	char			value[8];
986 	struct mlx5_device     *dev;
987 	unsigned		vendor, device;
988 	int			i;
989 
990 	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
991 				value, sizeof value) < 0)
992 		return NULL;
993 	sscanf(value, "%i", &vendor);
994 
995 	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
996 				value, sizeof value) < 0)
997 		return NULL;
998 	sscanf(value, "%i", &device);
999 
1000 	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
1001 		if (vendor == hca_table[i].vendor &&
1002 		    device == hca_table[i].device)
1003 			goto found;
1004 
1005 	return NULL;
1006 
1007 found:
1008 	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
1009 	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
1010 		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
1011 			"(min supported %d, max supported %d)\n",
1012 			abi_version, uverbs_sys_path,
1013 			MLX5_UVERBS_MIN_ABI_VERSION,
1014 			MLX5_UVERBS_MAX_ABI_VERSION);
1015 		return NULL;
1016 	}
1017 
1018 	dev = calloc(1, sizeof *dev);
1019 	if (!dev) {
1020 		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
1021 			uverbs_sys_path);
1022 		return NULL;
1023 	}
1024 
1025 	dev->page_size   = sysconf(_SC_PAGESIZE);
1026 	dev->driver_abi_ver = abi_version;
1027 
1028 	dev->verbs_dev.ops = &mlx5_dev_ops;
1029 	dev->verbs_dev.sz = sizeof(*dev);
1030 	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
1031 		sizeof(struct ibv_context);
1032 
1033 	return &dev->verbs_dev;
1034 }
1035 
1036 static __attribute__((constructor)) void mlx5_register_driver(void)
1037 {
1038 	verbs_register_driver("mlx5", mlx5_driver_init);
1039 }
1040