xref: /freebsd/contrib/ofed/libmlx5/mlx5.c (revision f126890ac5386406dadf7c4cfa9566cbb56537c5)
1 /*
2  * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 #define _GNU_SOURCE
33 #include <config.h>
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <sys/mman.h>
40 #include <pthread.h>
41 #include <string.h>
42 #include <sched.h>
43 #include <sys/param.h>
44 #include <sys/cpuset.h>
45 
46 #include "mlx5.h"
47 #include "mlx5-abi.h"
48 
49 #ifndef PCI_VENDOR_ID_MELLANOX
50 #define PCI_VENDOR_ID_MELLANOX			0x15b3
51 #endif
52 
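/*
 * Fallback definitions for systems whose <sys/cpuset.h> lacks these macros.
 * Note that with a no-op CPU_OR and an always-true CPU_EQUAL,
 * mlx5_enable_sandy_bridge_fix() below treats the process as bound to the
 * device-local CPUs and leaves CQ-poll stalling disabled.
 */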
53 #ifndef CPU_OR
54 #define CPU_OR(x, y, z) do {} while (0)
55 #endif
56 
57 #ifndef CPU_EQUAL
58 #define CPU_EQUAL(x, y) 1
59 #endif
60 
61 
62 #define HCA(v, d) \
63 	{ .vendor = PCI_VENDOR_ID_##v,			\
64 	  .device = d }
65 
66 static struct {
67 	unsigned		vendor;
68 	unsigned		device;
69 } hca_table[] = {
70 	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
71 	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
72 	HCA(MELLANOX, 4115),	/* ConnectX-4 */
73 	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
74 	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
75 	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
76 	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
77 	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
78 	HCA(MELLANOX, 4121),    /* ConnectX-5 Ex */
79 	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
80 	HCA(MELLANOX, 4123),    /* ConnectX-6 */
81 	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
82 	HCA(MELLANOX, 4125),	/* ConnectX-6 DX */
83 	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
84 	HCA(MELLANOX, 4127),	/* ConnectX-6 LX */
85 	HCA(MELLANOX, 4129),	/* ConnectX-7 */
86 	HCA(MELLANOX, 4131),	/* ConnectX-8 */
87 	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
88 	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
89 	HCA(MELLANOX, 41686),	/* BlueField-2 integrated ConnectX-6 Dx network controller */
90 	HCA(MELLANOX, 41692),	/* BlueField-3 integrated ConnectX-7 network controller */
91 	HCA(MELLANOX, 41695),	/* BlueField-4 integrated ConnectX-8 network controller */
92 };
93 
94 uint32_t mlx5_debug_mask = 0;
95 int mlx5_freeze_on_error_cqe;
96 
97 static struct ibv_context_ops mlx5_ctx_ops = {
98 	.query_device  = mlx5_query_device,
99 	.query_port    = mlx5_query_port,
100 	.alloc_pd      = mlx5_alloc_pd,
101 	.dealloc_pd    = mlx5_free_pd,
102 	.reg_mr	       = mlx5_reg_mr,
103 	.rereg_mr      = mlx5_rereg_mr,
104 	.dereg_mr      = mlx5_dereg_mr,
105 	.alloc_mw      = mlx5_alloc_mw,
106 	.dealloc_mw    = mlx5_dealloc_mw,
107 	.bind_mw       = mlx5_bind_mw,
108 	.create_cq     = mlx5_create_cq,
109 	.poll_cq       = mlx5_poll_cq,
110 	.req_notify_cq = mlx5_arm_cq,
111 	.cq_event      = mlx5_cq_event,
112 	.resize_cq     = mlx5_resize_cq,
113 	.destroy_cq    = mlx5_destroy_cq,
114 	.create_srq    = mlx5_create_srq,
115 	.modify_srq    = mlx5_modify_srq,
116 	.query_srq     = mlx5_query_srq,
117 	.destroy_srq   = mlx5_destroy_srq,
118 	.post_srq_recv = mlx5_post_srq_recv,
119 	.create_qp     = mlx5_create_qp,
120 	.query_qp      = mlx5_query_qp,
121 	.modify_qp     = mlx5_modify_qp,
122 	.destroy_qp    = mlx5_destroy_qp,
123 	.post_send     = mlx5_post_send,
124 	.post_recv     = mlx5_post_recv,
125 	.create_ah     = mlx5_create_ah,
126 	.destroy_ah    = mlx5_destroy_ah,
127 	.attach_mcast  = mlx5_attach_mcast,
128 	.detach_mcast  = mlx5_detach_mcast
129 };
130 
131 static int read_number_from_line(const char *line, int *value)
132 {
133 	const char *ptr;
134 
135 	ptr = strchr(line, ':');
136 	if (!ptr)
137 		return 1;
138 
139 	++ptr;
140 
141 	*value = atoi(ptr);
142 	return 0;
143 }
144 /**
145  * Look for the first free user-index across all the user-index
146  * tables.  If all entries are in use, return -1; otherwise return
147  * a valid user-index.
148  * If the reference count of a table is zero, the table is not in
149  * use and has not been allocated yet; in that case mlx5_store_uidx
150  * allocates the table and increments the table's
151  * reference count.
152  */
153 static int32_t get_free_uidx(struct mlx5_context *ctx)
154 {
155 	int32_t tind;
156 	int32_t i;
157 
158 	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
159 		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
160 			break;
161 	}
162 
163 	if (tind == MLX5_UIDX_TABLE_SIZE)
164 		return -1;
165 
166 	if (!ctx->uidx_table[tind].refcnt)
167 		return tind << MLX5_UIDX_TABLE_SHIFT;
168 
169 	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
170 		if (!ctx->uidx_table[tind].table[i])
171 			break;
172 	}
173 
174 	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
175 }
176 
177 int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
178 {
179 	int32_t tind;
180 	int32_t ret = -1;
181 	int32_t uidx;
182 
183 	pthread_mutex_lock(&ctx->uidx_table_mutex);
184 	uidx = get_free_uidx(ctx);
185 	if (uidx < 0)
186 		goto out;
187 
188 	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
189 
190 	if (!ctx->uidx_table[tind].refcnt) {
191 		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
192 						     sizeof(struct mlx5_resource *));
193 		if (!ctx->uidx_table[tind].table)
194 			goto out;
195 	}
196 
197 	++ctx->uidx_table[tind].refcnt;
198 	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
199 	ret = uidx;
200 
201 out:
202 	pthread_mutex_unlock(&ctx->uidx_table_mutex);
203 	return ret;
204 }
205 
206 void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
207 {
208 	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
209 
210 	pthread_mutex_lock(&ctx->uidx_table_mutex);
211 
212 	if (!--ctx->uidx_table[tind].refcnt)
213 		free(ctx->uidx_table[tind].table);
214 	else
215 		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;
216 
217 	pthread_mutex_unlock(&ctx->uidx_table_mutex);
218 }
219 
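/*
 * Parse the Linux-style /proc/cpuinfo (on FreeBSD typically available only
 * when linprocfs(5) is mounted) and return nonzero if a Sandy Bridge CPU
 * (family 6, model 0x2A or 0x2D) is present; *num_cores is set to the number
 * of "processor" entries seen.  Returns 0, leaving *num_cores untouched, if
 * the file cannot be opened.
 */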
220 static int mlx5_is_sandy_bridge(int *num_cores)
221 {
222 	char line[128];
223 	FILE *fd;
224 	int rc = 0;
225 	int cur_cpu_family = -1;
226 	int cur_cpu_model = -1;
227 
228 	fd = fopen("/proc/cpuinfo", "r");
229 	if (!fd)
230 		return 0;
231 
232 	*num_cores = 0;
233 
234 	while (fgets(line, 128, fd)) {
235 		int value;
236 
237 		/* if this is information on new processor */
238 		if (!strncmp(line, "processor", 9)) {
239 			++*num_cores;
240 
241 			cur_cpu_family = -1;
242 			cur_cpu_model  = -1;
243 		} else if (!strncmp(line, "cpu family", 10)) {
244 			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
245 				cur_cpu_family = value;
246 		} else if (!strncmp(line, "model", 5)) {
247 			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
248 				cur_cpu_model = value;
249 		}
250 
251 		/* if this is a Sandy Bridge CPU */
252 		if ((cur_cpu_family == 6) &&
253 		    (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D))
254 			rc = 1;
255 	}
256 
257 	fclose(fd);
258 	return rc;
259 }
260 
261 /*
262 man cpuset
263 
264   This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
265   are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
266   words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
267   within a word are also in big-endian order.
268 
269   The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
270   the size of the bitmask.
271 
272   Examples of the Mask Format:
273 
274      00000001                        # just bit 0 set
275      40000000,00000000,00000000      # just bit 94 set
276      000000ff,00000000               # bits 32-39 set
277      00000000,000E3862               # 1,5,6,11-13,17-19 set
278 
279   A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:
280 
281      00000001,00000001,00010117
282 
283   The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
284   bit 4, and the "7" is for bits 2, 1, and 0.
285 */
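/*
 * Build the device-local CPU set from either the MLX5_LOCAL_CPUS environment
 * variable or the sysfs "device/local_cpus" file, both in the comma-separated
 * hex-word format described above.  The mask is parsed right to left: the
 * last word holds bits 0-31, the word before it bits 32-63, and so on.
 */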
286 static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
287 {
288 	char *p, buf[1024];
289 	char *env_value;
290 	uint32_t word;
291 	int i, k;
292 
293 	env_value = getenv("MLX5_LOCAL_CPUS");
294 	if (env_value)
295 		strncpy(buf, env_value, sizeof(buf));
296 	else {
297 		char fname[MAXPATHLEN];
298 
299 		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
300 			 ibv_get_device_name(ibdev));
301 
302 		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
303 			fprintf(stderr, PFX "Warning: cannot get local cpu set: failed to open %s\n", fname);
304 			return;
305 		}
306 	}
307 
308 	p = strrchr(buf, ',');
309 	if (!p)
310 		p = buf;
311 
312 	i = 0;
313 	do {
314 		if (*p == ',') {
315 			*p = 0;
316 			p ++;
317 		}
318 
319 		word = strtoul(p, NULL, 16);
320 
321 		for (k = 0; word; ++k, word >>= 1)
322 			if (word & 1)
323 				CPU_SET(k+i, cpu_set);
324 
325 		if (p == buf)
326 			break;
327 
328 		p = strrchr(buf, ',');
329 		if (!p)
330 			p = buf;
331 
332 		i += 32;
333 	} while (i < CPU_SETSIZE);
334 }
335 
336 static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
337 {
338 	cpuset_t my_cpus, dev_local_cpus, result_set;
339 	int stall_enable;
340 	int ret;
341 	int num_cores;
342 
343 	if (!mlx5_is_sandy_bridge(&num_cores))
344 		return 0;
345 
346 	/* by default enable stall on sandy bridge arch */
347 	stall_enable = 1;
348 
349 	/*
350 	 * Check whether the application is bound to a CPU set that lies
351 	 * inside the device-local CPU set; if so, disable stalling.
352 	 */
353 
354 	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
355 	CPU_ZERO(&my_cpus);
356 	CPU_ZERO(&dev_local_cpus);
357 	CPU_ZERO(&result_set);
358 	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
359 	    sizeof(my_cpus), &my_cpus);
360 	if (ret == -1) {
361 		if (errno == EINVAL)
362 			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
363 		else
364 			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
365 		goto out;
366 	}
367 
368 	/* get device local cpu set */
369 	mlx5_local_cpu_set(ibdev, &dev_local_cpus);
370 
371 	/* check whether my cpu set is contained in the device-local cpu set */
372 #if __FreeBSD_version < 1400046
373 	CPU_OR(&result_set, &my_cpus);
374 	CPU_OR(&result_set, &dev_local_cpus);
375 #else
376 	CPU_OR(&result_set, &my_cpus, &dev_local_cpus);
377 #endif
378 	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;
379 
380 out:
381 	return stall_enable;
382 }
383 
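/*
 * Pick up CQ-poll stall tuning from the environment.  MLX5_STALL_CQ_POLL
 * overrides the Sandy Bridge autodetection; a negative MLX5_STALL_NUM_LOOP
 * enables the adaptive stalling mode, seeded with mlx5_stall_cq_poll_min
 * cycles.
 */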
384 static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
385 {
386 	char *env_value;
387 
388 	env_value = getenv("MLX5_STALL_CQ_POLL");
389 	if (env_value)
390 		/* check if cq stall is enforced by user */
391 		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
392 	else
393 		/* autodetect if we need to do cq polling */
394 		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);
395 
396 	env_value = getenv("MLX5_STALL_NUM_LOOP");
397 	if (env_value)
398 		mlx5_stall_num_loop = atoi(env_value);
399 
400 	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
401 	if (env_value)
402 		mlx5_stall_cq_poll_min = atoi(env_value);
403 
404 	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
405 	if (env_value)
406 		mlx5_stall_cq_poll_max = atoi(env_value);
407 
408 	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
409 	if (env_value)
410 		mlx5_stall_cq_inc_step = atoi(env_value);
411 
412 	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
413 	if (env_value)
414 		mlx5_stall_cq_dec_step = atoi(env_value);
415 
416 	ctx->stall_adaptive_enable = 0;
417 	ctx->stall_cycles = 0;
418 
419 	if (mlx5_stall_num_loop < 0) {
420 		ctx->stall_adaptive_enable = 1;
421 		ctx->stall_cycles = mlx5_stall_cq_poll_min;
422 	}
423 
424 }
425 
426 static int get_total_uuars(int page_size)
427 {
428 	int size = MLX5_DEF_TOT_UUARS;
429 	int uuars_in_page;
430 	char *env;
431 
432 	env = getenv("MLX5_TOTAL_UUARS");
433 	if (env)
434 		size = atoi(env);
435 
436 	if (size < 1)
437 		return -EINVAL;
438 
439 	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
440 	size = max(uuars_in_page, size);
441 	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
442 	if (size > MLX5_MAX_BFREGS)
443 		return -ENOMEM;
444 
445 	return size;
446 }
447 
448 static void open_debug_file(struct mlx5_context *ctx)
449 {
450 	char *env;
451 
452 	env = getenv("MLX5_DEBUG_FILE");
453 	if (!env) {
454 		ctx->dbg_fp = stderr;
455 		return;
456 	}
457 
458 	ctx->dbg_fp = fopen(env, "a+");
459 	if (!ctx->dbg_fp) {
460 		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
461 		ctx->dbg_fp = stderr;
462 		return;
463 	}
464 }
465 
466 static void close_debug_file(struct mlx5_context *ctx)
467 {
468 	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
469 		fclose(ctx->dbg_fp);
470 }
471 
472 static void set_debug_mask(void)
473 {
474 	char *env;
475 
476 	env = getenv("MLX5_DEBUG_MASK");
477 	if (env)
478 		mlx5_debug_mask = strtol(env, NULL, 0);
479 }
480 
481 static void set_freeze_on_error(void)
482 {
483 	char *env;
484 
485 	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
486 	if (env)
487 		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
488 }
489 
490 static int get_always_bf(void)
491 {
492 	char *env;
493 
494 	env = getenv("MLX5_POST_SEND_PREFER_BF");
495 	if (!env)
496 		return 1;
497 
498 	return strcmp(env, "0") ? 1 : 0;
499 }
500 
501 static int get_shut_up_bf(void)
502 {
503 	char *env;
504 
505 	env = getenv("MLX5_SHUT_UP_BF");
506 	if (!env)
507 		return 0;
508 
509 	return strcmp(env, "0") ? 1 : 0;
510 }
511 
512 static int get_num_low_lat_uuars(int tot_uuars)
513 {
514 	char *env;
515 	int num = 4;
516 
517 	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
518 	if (env)
519 		num = atoi(env);
520 
521 	if (num < 0)
522 		return -EINVAL;
523 
524 	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
525 	return num;
526 }
527 
528 /* The library allocates an array of uuar contexts.  The one at index zero is
529  * not subject to the odd/even policy, so it can avoid a lock, but it may not
530  * use blue flame.  The upper ones, the low_lat_uuars, can use blue flame with
531  * no lock since each is assigned to a single QP.  The rest can use blue flame,
532  * but since they are shared they need a lock.
533  */
534 static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
535 {
536 	if (uuarn == 0 || mlx5_single_threaded)
537 		return 0;
538 
539 	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
540 		return 0;
541 
542 	return 1;
543 }
544 
545 static int single_threaded_app(void)
546 {
547 
548 	char *env;
549 
550 	env = getenv("MLX5_SINGLE_THREADED");
551 	if (env)
552 		return strcmp(env, "1") ? 0 : 1;
553 
554 	return 0;
555 }
556 
557 static int mlx5_cmd_get_context(struct mlx5_context *context,
558 				struct mlx5_alloc_ucontext *req,
559 				size_t req_len,
560 				struct mlx5_alloc_ucontext_resp *resp,
561 				size_t resp_len)
562 {
563 	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
564 				 req_len, &resp->ibv_resp, resp_len))
565 		return 0;
566 
567 	/* ibv_cmd_get_context fails against older kernels when passed
568 	 * a request length that the kernel does not know about.
569 	 * To avoid breaking compatibility between new libmlx5 and older
570 	 * kernels, when ibv_cmd_get_context fails with the full
571 	 * request length, we try once again with the legacy length.
572 	 * We repeat this process while reducing requested size based
573 	 * on the feature input size. To avoid this in the future, we
574 	 * will remove the check in kernel that requires fields unknown
575 	 * to the kernel to be cleared. This will require that any new
576 	 * feature that involves extending struct mlx5_alloc_ucontext
577 	 * will be accompanied by an indication in the form of one or
578 	 * more fields in struct mlx5_alloc_ucontext_resp. If the
579 	 * response value can be interpreted as feature not supported
580 	 * when the returned value is zero, this will suffice to
581 	 * indicate to the library that the request was ignored by the
582 	 * kernel, either because it is unaware or because it decided
583 	 * to do so. If zero is a valid response, we will add a new
584 	 * field that indicates whether the request was handled.
585 	 */
586 	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
587 				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
588 				 &resp->ibv_resp, resp_len))
589 		return 0;
590 
591 	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
592 				   offsetof(struct mlx5_alloc_ucontext,
593 					    cqe_version),
594 				   &resp->ibv_resp, resp_len);
595 }
596 
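/*
 * Map the device's free-running core-clock page read-only so raw hardware
 * timestamps can be read directly from user space (used by the
 * query_rt_values extended verb registered below).
 */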
597 static int mlx5_map_internal_clock(struct mlx5_device *mdev,
598 				   struct ibv_context *ibv_ctx)
599 {
600 	struct mlx5_context *context = to_mctx(ibv_ctx);
601 	void *hca_clock_page;
602 	off_t offset = 0;
603 
604 	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
605 	hca_clock_page = mmap(NULL, mdev->page_size,
606 			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
607 			      mdev->page_size * offset);
608 
609 	if (hca_clock_page == MAP_FAILED) {
610 		fprintf(stderr, PFX
611 			"Warning: Timestamp available,\n"
612 			"but failed to mmap() hca core clock page.\n");
613 		return -1;
614 	}
615 
616 	context->hca_core_clock = hca_clock_page +
617 		(context->core_clock.offset & (mdev->page_size - 1));
618 	return 0;
619 }
620 
621 int mlx5dv_query_device(struct ibv_context *ctx_in,
622 			 struct mlx5dv_context *attrs_out)
623 {
624 	struct mlx5_context *mctx = to_mctx(ctx_in);
625 	uint64_t comp_mask_out = 0;
626 
627 	attrs_out->version   = 0;
628 	attrs_out->flags     = 0;
629 
630 	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
631 		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;
632 
633 	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
634 		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;
635 
636 	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
637 		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
638 		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
639 	}
640 
641 	attrs_out->comp_mask = comp_mask_out;
642 
643 	return 0;
644 }
645 
646 static int mlx5dv_get_qp(struct ibv_qp *qp_in,
647 			 struct mlx5dv_qp *qp_out)
648 {
649 	struct mlx5_qp *mqp = to_mqp(qp_in);
650 
651 	qp_out->comp_mask = 0;
652 	qp_out->dbrec     = mqp->db;
653 
654 	if (mqp->sq_buf_size)
655 		/* IBV_QPT_RAW_PACKET */
656 		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
657 	else
658 		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
659 	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
660 	qp_out->sq.stride  = 1 << mqp->sq.wqe_shift;
661 
662 	qp_out->rq.buf     = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
663 	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
664 	qp_out->rq.stride  = 1 << mqp->rq.wqe_shift;
665 
666 	qp_out->bf.reg    = mqp->bf->reg;
667 
668 	if (mqp->bf->uuarn > 0)
669 		qp_out->bf.size = mqp->bf->buf_size;
670 	else
671 		qp_out->bf.size = 0;
672 
673 	return 0;
674 }
675 
676 static int mlx5dv_get_cq(struct ibv_cq *cq_in,
677 			 struct mlx5dv_cq *cq_out)
678 {
679 	struct mlx5_cq *mcq = to_mcq(cq_in);
680 	struct mlx5_context *mctx = to_mctx(cq_in->context);
681 
682 	cq_out->comp_mask = 0;
683 	cq_out->cqn       = mcq->cqn;
684 	cq_out->cqe_cnt   = mcq->ibv_cq.cqe + 1;
685 	cq_out->cqe_size  = mcq->cqe_sz;
686 	cq_out->buf       = mcq->active_buf->buf;
687 	cq_out->dbrec     = mcq->dbrec;
688 	cq_out->uar	  = mctx->uar;
689 
690 	mcq->flags	 |= MLX5_CQ_FLAGS_DV_OWNED;
691 
692 	return 0;
693 }
694 
695 static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
696 			  struct mlx5dv_rwq *rwq_out)
697 {
698 	struct mlx5_rwq *mrwq = to_mrwq(wq_in);
699 
700 	rwq_out->comp_mask = 0;
701 	rwq_out->buf       = mrwq->pbuff;
702 	rwq_out->dbrec     = mrwq->recv_db;
703 	rwq_out->wqe_cnt   = mrwq->rq.wqe_cnt;
704 	rwq_out->stride    = 1 << mrwq->rq.wqe_shift;
705 
706 	return 0;
707 }
708 
709 static int mlx5dv_get_srq(struct ibv_srq *srq_in,
710 			  struct mlx5dv_srq *srq_out)
711 {
712 	struct mlx5_srq *msrq;
713 
714 	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);
715 
716 	srq_out->comp_mask = 0;
717 	srq_out->buf       = msrq->buf.buf;
718 	srq_out->dbrec     = msrq->db;
719 	srq_out->stride    = 1 << msrq->wqe_shift;
720 	srq_out->head      = msrq->head;
721 	srq_out->tail      = msrq->tail;
722 
723 	return 0;
724 }
725 
726 int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
727 {
728 	int ret = 0;
729 
730 	if (obj_type & MLX5DV_OBJ_QP)
731 		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
732 	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
733 		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
734 	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
735 		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
736 	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
737 		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);
738 
739 	return ret;
740 }
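/*
 * A minimal usage sketch from the application side, assuming "qp" and "cq"
 * are previously created ibv_qp and ibv_cq handles on an mlx5 device:
 *
 *	struct mlx5dv_qp dv_qp;
 *	struct mlx5dv_cq dv_cq;
 *	struct mlx5dv_obj obj;
 *
 *	obj.qp.in  = qp;
 *	obj.qp.out = &dv_qp;
 *	obj.cq.in  = cq;
 *	obj.cq.out = &dv_cq;
 *	if (mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP | MLX5DV_OBJ_CQ))
 *		return;
 *
 * On success dv_qp.sq, dv_qp.rq, dv_qp.bf, dv_qp.dbrec and dv_cq.buf,
 * dv_cq.dbrec describe the raw work queues, doorbells and CQ buffer.
 */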
741 
742 static void adjust_uar_info(struct mlx5_device *mdev,
743 			    struct mlx5_context *context,
744 			    struct mlx5_alloc_ucontext_resp resp)
745 {
746 	if (!resp.log_uar_size && !resp.num_uars_per_page) {
747 		/* old kernel */
748 		context->uar_size = mdev->page_size;
749 		context->num_uars_per_page = 1;
750 		return;
751 	}
752 
753 	context->uar_size = 1 << resp.log_uar_size;
754 	context->num_uars_per_page = resp.num_uars_per_page;
755 }
756 
757 static int mlx5_init_context(struct verbs_device *vdev,
758 			     struct ibv_context *ctx, int cmd_fd)
759 {
760 	struct mlx5_context	       *context;
761 	struct mlx5_alloc_ucontext	req;
762 	struct mlx5_alloc_ucontext_resp resp;
763 	int				i;
764 	int				page_size;
765 	int				tot_uuars;
766 	int				low_lat_uuars;
767 	int				gross_uuars;
768 	int				j;
769 	off_t				offset;
770 	struct mlx5_device	       *mdev;
771 	struct verbs_context	       *v_ctx;
772 	struct ibv_port_attr		port_attr;
773 	struct ibv_device_attr_ex	device_attr;
774 	int				k;
775 	int				bfi;
776 	int				num_sys_page_map;
777 
778 	mdev = to_mdev(&vdev->device);
779 	v_ctx = verbs_get_ctx(ctx);
780 	page_size = mdev->page_size;
781 	mlx5_single_threaded = single_threaded_app();
782 
783 	context = to_mctx(ctx);
784 	context->ibv_ctx.cmd_fd = cmd_fd;
785 
786 	open_debug_file(context);
787 	set_debug_mask();
788 	set_freeze_on_error();
789 	if (gethostname(context->hostname, sizeof(context->hostname)))
790 		strcpy(context->hostname, "host_unknown");
791 
792 	tot_uuars = get_total_uuars(page_size);
793 	if (tot_uuars < 0) {
794 		errno = -tot_uuars;
795 		goto err_free;
796 	}
797 
798 	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
799 	if (low_lat_uuars < 0) {
800 		errno = -low_lat_uuars;
801 		goto err_free;
802 	}
803 
804 	if (low_lat_uuars > tot_uuars - 1) {
805 		errno = ENOMEM;
806 		goto err_free;
807 	}
808 
809 	memset(&req, 0, sizeof(req));
810 	memset(&resp, 0, sizeof(resp));
811 
812 	req.total_num_uuars = tot_uuars;
813 	req.num_low_latency_uuars = low_lat_uuars;
814 	req.cqe_version = MLX5_CQE_VERSION_V1;
815 	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;
816 
817 	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
818 				 sizeof(resp)))
819 		goto err_free;
820 
821 	context->max_num_qps		= resp.qp_tab_size;
822 	context->bf_reg_size		= resp.bf_reg_size;
823 	context->tot_uuars		= resp.tot_uuars;
824 	context->low_lat_uuars		= low_lat_uuars;
825 	context->cache_line_size	= resp.cache_line_size;
826 	context->max_sq_desc_sz = resp.max_sq_desc_sz;
827 	context->max_rq_desc_sz = resp.max_rq_desc_sz;
828 	context->max_send_wqebb	= resp.max_send_wqebb;
829 	context->num_ports	= resp.num_ports;
830 	context->max_recv_wr	= resp.max_recv_wr;
831 	context->max_srq_recv_wr = resp.max_srq_recv_wr;
832 
833 	context->cqe_version = resp.cqe_version;
834 	if (context->cqe_version) {
835 		if (context->cqe_version == MLX5_CQE_VERSION_V1)
836 			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
837 		else
838 			goto err_free;
839 	}
840 
841 	adjust_uar_info(mdev, context, resp);
842 
843 	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
844 	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
845 	if (!context->bfs) {
846 		errno = ENOMEM;
847 		goto err_free;
848 	}
849 
850 	context->cmds_supp_uhw = resp.cmds_supp_uhw;
851 	context->vendor_cap_flags = 0;
852 
853 	if (pthread_mutex_init(&context->qp_table_mutex, NULL))
854 		goto err_free_bf;
855 	if (pthread_mutex_init(&context->srq_table_mutex, NULL))
856 		goto err_qp_table_mutex;
857 	if (pthread_mutex_init(&context->uidx_table_mutex, NULL))
858 		goto err_srq_table_mutex;
859 	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
860 		context->qp_table[i].refcnt = 0;
861 
862 	for (i = 0; i < MLX5_UIDX_TABLE_SIZE; ++i)
863 		context->uidx_table[i].refcnt = 0;
864 
865 	context->db_list = NULL;
866 
867 	if (pthread_mutex_init(&context->db_list_mutex, NULL))
868 		goto err_uidx_table_mutex;
869 
870 	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
871 	for (i = 0; i < num_sys_page_map; ++i) {
872 		offset = 0;
873 		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
874 		set_index(i, &offset);
875 		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
876 				       cmd_fd, page_size * offset);
877 		if (context->uar[i] == MAP_FAILED) {
878 			context->uar[i] = NULL;
879 			goto err_db_list_mutex;
880 		}
881 	}
882 
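	/*
	 * Carve each mapped system page into its 4KB device UARs and, within
	 * each UAR, into blue-flame registers: bfs[bfi].reg points just past
	 * MLX5_BF_OFFSET in UAR j of page i, offset by the register index k
	 * times bf_reg_size.
	 */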
883 	for (i = 0; i < num_sys_page_map; i++) {
884 		for (j = 0; j < context->num_uars_per_page; j++) {
885 			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
886 				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
887 				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
888 							MLX5_BF_OFFSET + k * context->bf_reg_size;
889 				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
890 				if (mlx5_spinlock_init(&context->bfs[bfi].lock))
891 					goto err_bfs_spl;
892 				context->bfs[bfi].offset = 0;
893 				if (bfi)
894 					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
895 				context->bfs[bfi].uuarn = bfi;
896 			}
897 		}
898 	}
899 	context->hca_core_clock = NULL;
900 	if (resp.response_length + sizeof(resp.ibv_resp) >=
901 	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
902 	    sizeof(resp.hca_core_clock_offset) &&
903 	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
904 		context->core_clock.offset = resp.hca_core_clock_offset;
905 		mlx5_map_internal_clock(mdev, ctx);
906 	}
907 
908 	if (mlx5_spinlock_init(&context->lock32))
909 		goto err_bfs_spl;
910 
911 	context->prefer_bf = get_always_bf();
912 	context->shut_up_bf = get_shut_up_bf();
913 	mlx5_read_env(&vdev->device, context);
914 
915 	if (mlx5_spinlock_init(&context->hugetlb_lock))
916 		goto err_32_spl;
917 	TAILQ_INIT(&context->hugetlb_list);
918 
919 	context->ibv_ctx.ops = mlx5_ctx_ops;
920 
921 	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
922 	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
923 	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
924 	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
925 	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
926 	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
927 	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
928 	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
929 	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
930 	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
931 	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
932 	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
933 	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
934 	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
935 	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);
936 
937 	memset(&device_attr, 0, sizeof(device_attr));
938 	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
939 				  sizeof(struct ibv_device_attr_ex))) {
940 		context->cached_device_cap_flags =
941 			device_attr.orig_attr.device_cap_flags;
942 		context->atomic_cap = device_attr.orig_attr.atomic_cap;
943 		context->cached_tso_caps = device_attr.tso_caps;
944 	}
945 
946 	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
947 		memset(&port_attr, 0, sizeof(port_attr));
948 		if (!mlx5_query_port(ctx, j + 1, &port_attr))
949 			context->cached_link_layer[j] = port_attr.link_layer;
950 	}
951 
952 	return 0;
953 
954 err_32_spl:
955 	mlx5_spinlock_destroy(&context->lock32);
956 
957 err_bfs_spl:
958 	for (i = 0; i < num_sys_page_map; i++) {
959 		for (j = 0; j < context->num_uars_per_page; j++) {
960 			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
961 				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
962 				mlx5_spinlock_destroy(&context->bfs[bfi].lock);
963 			}
964 		}
965 	}
966 
967 err_db_list_mutex:
968 	pthread_mutex_destroy(&context->db_list_mutex);
969 
970 err_uidx_table_mutex:
971 	pthread_mutex_destroy(&context->uidx_table_mutex);
972 
973 err_srq_table_mutex:
974 	pthread_mutex_destroy(&context->srq_table_mutex);
975 
976 err_qp_table_mutex:
977 	pthread_mutex_destroy(&context->qp_table_mutex);
978 
979 err_free_bf:
980 	free(context->bfs);
981 
982 err_free:
983 	for (i = 0; i < MLX5_MAX_UARS; ++i) {
984 		if (context->uar[i])
985 			munmap(context->uar[i], page_size);
986 	}
987 	close_debug_file(context);
988 	return errno;
989 }
990 
991 static void mlx5_cleanup_context(struct verbs_device *device,
992 				 struct ibv_context *ibctx)
993 {
994 	struct mlx5_context *context = to_mctx(ibctx);
995 	int page_size = to_mdev(ibctx->device)->page_size;
996 	int i;
997 	int				j;
998 	int				k;
999 	int				bfi;
1000 	int				num_sys_page_map;
1001 
1002 	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
1003 	for (i = 0; i < num_sys_page_map; i++) {
1004 		for (j = 0; j < context->num_uars_per_page; j++) {
1005 			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
1006 				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
1007 				mlx5_spinlock_destroy(&context->bfs[bfi].lock);
1008 			}
1009 		}
1010 	}
1011 	mlx5_spinlock_destroy(&context->hugetlb_lock);
1012 	mlx5_spinlock_destroy(&context->lock32);
1013 	pthread_mutex_destroy(&context->db_list_mutex);
1014 	pthread_mutex_destroy(&context->uidx_table_mutex);
1015 	pthread_mutex_destroy(&context->srq_table_mutex);
1016 	pthread_mutex_destroy(&context->qp_table_mutex);
1017 
1018 	free(context->bfs);
1019 	for (i = 0; i < MLX5_MAX_UARS; ++i) {
1020 		if (context->uar[i])
1021 			munmap(context->uar[i], page_size);
1022 	}
1023 	if (context->hca_core_clock)
1024 		munmap(context->hca_core_clock - context->core_clock.offset,
1025 		       page_size);
1026 	close_debug_file(context);
1027 }
1028 
1029 static struct verbs_device_ops mlx5_dev_ops = {
1030 	.init_context = mlx5_init_context,
1031 	.uninit_context = mlx5_cleanup_context,
1032 };
1033 
1034 static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
1035 					     int abi_version)
1036 {
1037 	char			value[8];
1038 	struct mlx5_device     *dev;
1039 	unsigned		vendor, device;
1040 	int			i;
1041 
1042 	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
1043 				value, sizeof value) < 0)
1044 		return NULL;
1045 	sscanf(value, "%i", &vendor);
1046 
1047 	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
1048 				value, sizeof value) < 0)
1049 		return NULL;
1050 	sscanf(value, "%i", &device);
1051 
1052 	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
1053 		if (vendor == hca_table[i].vendor &&
1054 		    device == hca_table[i].device)
1055 			goto found;
1056 
1057 	return NULL;
1058 
1059 found:
1060 	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
1061 	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
1062 		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
1063 			"(min supported %d, max supported %d)\n",
1064 			abi_version, uverbs_sys_path,
1065 			MLX5_UVERBS_MIN_ABI_VERSION,
1066 			MLX5_UVERBS_MAX_ABI_VERSION);
1067 		return NULL;
1068 	}
1069 
1070 	dev = calloc(1, sizeof *dev);
1071 	if (!dev) {
1072 		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
1073 			uverbs_sys_path);
1074 		return NULL;
1075 	}
1076 
1077 	dev->page_size   = sysconf(_SC_PAGESIZE);
1078 	dev->driver_abi_ver = abi_version;
1079 
1080 	dev->verbs_dev.ops = &mlx5_dev_ops;
1081 	dev->verbs_dev.sz = sizeof(*dev);
1082 	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
1083 		sizeof(struct ibv_context);
1084 
1085 	return &dev->verbs_dev;
1086 }
1087 
1088 static __attribute__((constructor)) void mlx5_register_driver(void)
1089 {
1090 	verbs_register_driver("mlx5", mlx5_driver_init);
1091 }
1092