xref: /freebsd/contrib/ofed/libmlx5/mlx5.c (revision 92b14858b44dc4b3b57154a10e9de1b39d791e41)
1 /*
2  * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 #define _GNU_SOURCE
33 #include <config.h>
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <sys/mman.h>
40 #include <pthread.h>
41 #include <string.h>
42 #include <sched.h>
43 #include <sys/param.h>
44 #include <sys/cpuset.h>
45 
46 #include "mlx5.h"
47 #include "mlx5-abi.h"
48 
/* PCI vendor ID used by the hca_table entries; only defined if no header did. */
#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX			0x15b3
#endif

/*
 * Fallback stubs for builds where the cpuset header does not provide
 * CPU_OR()/CPU_EQUAL().  The CPU_OR() stub is variadic so that it accepts
 * the two-argument calls made in this file as well as a three-argument
 * form; it performs no work.  The CPU_EQUAL() stub always evaluates to 1.
 */
#ifndef CPU_OR
#define CPU_OR(...) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif
60 
61 
62 #define HCA(v, d) \
63 	{ .vendor = PCI_VENDOR_ID_##v,			\
64 	  .device = d }
65 
66 static struct {
67 	unsigned		vendor;
68 	unsigned		device;
69 } hca_table[] = {
70 	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
71 	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
72 	HCA(MELLANOX, 4115),	/* ConnectX-4 */
73 	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
74 	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
75 	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
76 	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
77 	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
78 	HCA(MELLANOX, 4121),    /* ConnectX-5 Ex */
79 	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
80 	HCA(MELLANOX, 4123),    /* ConnectX-6 */
81 	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
82 	HCA(MELLANOX, 4125),	/* ConnectX-6 DX */
83 	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
84 	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
85 	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
86 };
87 
/* Overwritten from MLX5_FREEZE_ON_ERROR_CQE by set_freeze_on_error(). */
int mlx5_freeze_on_error_cqe;
/* Debug mask; overwritten from MLX5_DEBUG_MASK by set_debug_mask(). */
uint32_t mlx5_debug_mask = 0;
90 
91 static struct ibv_context_ops mlx5_ctx_ops = {
92 	.query_device  = mlx5_query_device,
93 	.query_port    = mlx5_query_port,
94 	.alloc_pd      = mlx5_alloc_pd,
95 	.dealloc_pd    = mlx5_free_pd,
96 	.reg_mr	       = mlx5_reg_mr,
97 	.rereg_mr      = mlx5_rereg_mr,
98 	.dereg_mr      = mlx5_dereg_mr,
99 	.alloc_mw      = mlx5_alloc_mw,
100 	.dealloc_mw    = mlx5_dealloc_mw,
101 	.bind_mw       = mlx5_bind_mw,
102 	.create_cq     = mlx5_create_cq,
103 	.poll_cq       = mlx5_poll_cq,
104 	.req_notify_cq = mlx5_arm_cq,
105 	.cq_event      = mlx5_cq_event,
106 	.resize_cq     = mlx5_resize_cq,
107 	.destroy_cq    = mlx5_destroy_cq,
108 	.create_srq    = mlx5_create_srq,
109 	.modify_srq    = mlx5_modify_srq,
110 	.query_srq     = mlx5_query_srq,
111 	.destroy_srq   = mlx5_destroy_srq,
112 	.post_srq_recv = mlx5_post_srq_recv,
113 	.create_qp     = mlx5_create_qp,
114 	.query_qp      = mlx5_query_qp,
115 	.modify_qp     = mlx5_modify_qp,
116 	.destroy_qp    = mlx5_destroy_qp,
117 	.post_send     = mlx5_post_send,
118 	.post_recv     = mlx5_post_recv,
119 	.create_ah     = mlx5_create_ah,
120 	.destroy_ah    = mlx5_destroy_ah,
121 	.attach_mcast  = mlx5_attach_mcast,
122 	.detach_mcast  = mlx5_detach_mcast
123 };
124 
/*
 * Parse the integer that follows the first ':' in @line.
 *
 * Returns 0 and stores the atoi() result in *value, or returns 1 and
 * leaves *value untouched when @line contains no ':'.
 */
static int read_number_from_line(const char *line, int *value)
{
	const char *colon = strchr(line, ':');

	if (colon == NULL)
		return 1;

	/* atoi() itself skips the blanks that follow the separator */
	*value = atoi(colon + 1);
	return 0;
}
138 /**
139  * The function looks for the first free user-index in all the
140  * user-index tables. If all are used, returns -1, otherwise
141  * a valid user-index.
142  * In case the reference count of the table is zero, it means the
143  * table is not in use and wasn't allocated yet, therefore the
144  * mlx5_store_uidx allocates the table, and increment the reference
145  * count on the table.
146  */
147 static int32_t get_free_uidx(struct mlx5_context *ctx)
148 {
149 	int32_t tind;
150 	int32_t i;
151 
152 	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
153 		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
154 			break;
155 	}
156 
157 	if (tind == MLX5_UIDX_TABLE_SIZE)
158 		return -1;
159 
160 	if (!ctx->uidx_table[tind].refcnt)
161 		return tind << MLX5_UIDX_TABLE_SHIFT;
162 
163 	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
164 		if (!ctx->uidx_table[tind].table[i])
165 			break;
166 	}
167 
168 	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
169 }
170 
171 int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
172 {
173 	int32_t tind;
174 	int32_t ret = -1;
175 	int32_t uidx;
176 
177 	pthread_mutex_lock(&ctx->uidx_table_mutex);
178 	uidx = get_free_uidx(ctx);
179 	if (uidx < 0)
180 		goto out;
181 
182 	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
183 
184 	if (!ctx->uidx_table[tind].refcnt) {
185 		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
186 						     sizeof(struct mlx5_resource *));
187 		if (!ctx->uidx_table[tind].table)
188 			goto out;
189 	}
190 
191 	++ctx->uidx_table[tind].refcnt;
192 	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
193 	ret = uidx;
194 
195 out:
196 	pthread_mutex_unlock(&ctx->uidx_table_mutex);
197 	return ret;
198 }
199 
200 void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
201 {
202 	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
203 
204 	pthread_mutex_lock(&ctx->uidx_table_mutex);
205 
206 	if (!--ctx->uidx_table[tind].refcnt)
207 		free(ctx->uidx_table[tind].table);
208 	else
209 		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;
210 
211 	pthread_mutex_unlock(&ctx->uidx_table_mutex);
212 }
213 
/*
 * Scan /proc/cpuinfo for a CPU with family 6 and model 0x2A or 0x2D.
 *
 * @num_cores: receives the number of "processor" entries seen; it is
 *             set to 0 when /proc/cpuinfo cannot be opened, so callers
 *             never read an indeterminate value.
 *
 * Returns 1 if such a CPU was found, 0 otherwise (including when the
 * file cannot be opened).
 */
static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	/* initialise the output before any early return */
	*num_cores = 0;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	while (fgets(line, sizeof(line), fd)) {
		int value;

		/* if this is information on new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model  = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			/* only the first "model..." line of a processor is taken */
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || (cur_cpu_model == 0x2D) ))
			rc = 1;
	}

	fclose(fd);
	return rc;
}
254 
255 /*
256 man cpuset
257 
258   This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
259   are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
260   words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
261   within a word are also in big-endian order.
262 
263   The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
264   the size of the bitmask.
265 
266   Examples of the Mask Format:
267 
268      00000001                        # just bit 0 set
269      40000000,00000000,00000000      # just bit 94 set
270      000000ff,00000000               # bits 32-39 set
271      00000000,000E3862               # 1,5,6,11-13,17-19 set
272 
273   A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:
274 
275      00000001,00000001,00010117
276 
277   The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
278   bit 4, and the "7" is for bits 2, 1, and 0.
279 */
280 static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
281 {
282 	char *p, buf[1024];
283 	char *env_value;
284 	uint32_t word;
285 	int i, k;
286 
287 	env_value = getenv("MLX5_LOCAL_CPUS");
288 	if (env_value)
289 		strncpy(buf, env_value, sizeof(buf));
290 	else {
291 		char fname[MAXPATHLEN];
292 
293 		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
294 			 ibv_get_device_name(ibdev));
295 
296 		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
297 			fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
298 			return;
299 		}
300 	}
301 
302 	p = strrchr(buf, ',');
303 	if (!p)
304 		p = buf;
305 
306 	i = 0;
307 	do {
308 		if (*p == ',') {
309 			*p = 0;
310 			p ++;
311 		}
312 
313 		word = strtoul(p, NULL, 16);
314 
315 		for (k = 0; word; ++k, word >>= 1)
316 			if (word & 1)
317 				CPU_SET(k+i, cpu_set);
318 
319 		if (p == buf)
320 			break;
321 
322 		p = strrchr(buf, ',');
323 		if (!p)
324 			p = buf;
325 
326 		i += 32;
327 	} while (i < CPU_SETSIZE);
328 }
329 
330 static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
331 {
332 	cpuset_t my_cpus, dev_local_cpus, result_set;
333 	int stall_enable;
334 	int ret;
335 	int num_cores;
336 
337 	if (!mlx5_is_sandy_bridge(&num_cores))
338 		return 0;
339 
340 	/* by default enable stall on sandy bridge arch */
341 	stall_enable = 1;
342 
343 	/*
344 	 * check if app is bound to cpu set that is inside
345 	 * of device local cpu set. Disable stalling if true
346 	 */
347 
348 	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
349 	CPU_ZERO(&my_cpus);
350 	CPU_ZERO(&dev_local_cpus);
351 	CPU_ZERO(&result_set);
352 	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
353 	    sizeof(my_cpus), &my_cpus);
354 	if (ret == -1) {
355 		if (errno == EINVAL)
356 			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
357 		else
358 			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
359 		goto out;
360 	}
361 
362 	/* get device local cpu set */
363 	mlx5_local_cpu_set(ibdev, &dev_local_cpus);
364 
365 	/* check if my cpu set is in dev cpu */
366 	CPU_OR(&result_set, &my_cpus);
367 	CPU_OR(&result_set, &dev_local_cpus);
368 	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;
369 
370 out:
371 	return stall_enable;
372 }
373 
374 static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
375 {
376 	char *env_value;
377 
378 	env_value = getenv("MLX5_STALL_CQ_POLL");
379 	if (env_value)
380 		/* check if cq stall is enforced by user */
381 		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
382 	else
383 		/* autodetect if we need to do cq polling */
384 		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);
385 
386 	env_value = getenv("MLX5_STALL_NUM_LOOP");
387 	if (env_value)
388 		mlx5_stall_num_loop = atoi(env_value);
389 
390 	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
391 	if (env_value)
392 		mlx5_stall_cq_poll_min = atoi(env_value);
393 
394 	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
395 	if (env_value)
396 		mlx5_stall_cq_poll_max = atoi(env_value);
397 
398 	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
399 	if (env_value)
400 		mlx5_stall_cq_inc_step = atoi(env_value);
401 
402 	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
403 	if (env_value)
404 		mlx5_stall_cq_dec_step = atoi(env_value);
405 
406 	ctx->stall_adaptive_enable = 0;
407 	ctx->stall_cycles = 0;
408 
409 	if (mlx5_stall_num_loop < 0) {
410 		ctx->stall_adaptive_enable = 1;
411 		ctx->stall_cycles = mlx5_stall_cq_poll_min;
412 	}
413 
414 }
415 
416 static int get_total_uuars(int page_size)
417 {
418 	int size = MLX5_DEF_TOT_UUARS;
419 	int uuars_in_page;
420 	char *env;
421 
422 	env = getenv("MLX5_TOTAL_UUARS");
423 	if (env)
424 		size = atoi(env);
425 
426 	if (size < 1)
427 		return -EINVAL;
428 
429 	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
430 	size = max(uuars_in_page, size);
431 	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
432 	if (size > MLX5_MAX_BFREGS)
433 		return -ENOMEM;
434 
435 	return size;
436 }
437 
438 static void open_debug_file(struct mlx5_context *ctx)
439 {
440 	char *env;
441 
442 	env = getenv("MLX5_DEBUG_FILE");
443 	if (!env) {
444 		ctx->dbg_fp = stderr;
445 		return;
446 	}
447 
448 	ctx->dbg_fp = fopen(env, "aw+");
449 	if (!ctx->dbg_fp) {
450 		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
451 		ctx->dbg_fp = stderr;
452 		return;
453 	}
454 }
455 
456 static void close_debug_file(struct mlx5_context *ctx)
457 {
458 	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
459 		fclose(ctx->dbg_fp);
460 }
461 
462 static void set_debug_mask(void)
463 {
464 	char *env;
465 
466 	env = getenv("MLX5_DEBUG_MASK");
467 	if (env)
468 		mlx5_debug_mask = strtol(env, NULL, 0);
469 }
470 
471 static void set_freeze_on_error(void)
472 {
473 	char *env;
474 
475 	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
476 	if (env)
477 		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
478 }
479 
/*
 * Read MLX5_POST_SEND_PREFER_BF.
 * Returns 1 when the variable is unset or is anything but "0", else 0.
 */
static int get_always_bf(void)
{
	const char *setting = getenv("MLX5_POST_SEND_PREFER_BF");

	if (setting == NULL)
		return 1;

	return strcmp(setting, "0") != 0;
}
490 
/*
 * Read MLX5_SHUT_UP_BF.
 * Returns 0 when the variable is unset or is exactly "0", else 1.
 */
static int get_shut_up_bf(void)
{
	const char *setting = getenv("MLX5_SHUT_UP_BF");

	if (setting == NULL)
		return 0;

	return strcmp(setting, "0") != 0;
}
501 
502 static int get_num_low_lat_uuars(int tot_uuars)
503 {
504 	char *env;
505 	int num = 4;
506 
507 	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
508 	if (env)
509 		num = atoi(env);
510 
511 	if (num < 0)
512 		return -EINVAL;
513 
514 	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
515 	return num;
516 }
517 
518 /* The library allocates an array of uuar contexts. The one in index zero does
519  * not to execersize odd/even policy so it can avoid a lock but it may not use
520  * blue flame. The upper ones, low_lat_uuars can use blue flame with no lock
521  * since they are assigned to one QP only. The rest can use blue flame but since
522  * they are shared they need a lock
523  */
524 static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
525 {
526 	if (uuarn == 0 || mlx5_single_threaded)
527 		return 0;
528 
529 	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
530 		return 0;
531 
532 	return 1;
533 }
534 
/*
 * Read MLX5_SINGLE_THREADED.
 * Returns 1 only when the variable is set to exactly "1", else 0.
 */
static int single_threaded_app(void)
{
	const char *setting = getenv("MLX5_SINGLE_THREADED");

	if (setting == NULL)
		return 0;

	return strcmp(setting, "1") == 0;
}
546 
547 static int mlx5_cmd_get_context(struct mlx5_context *context,
548 				struct mlx5_alloc_ucontext *req,
549 				size_t req_len,
550 				struct mlx5_alloc_ucontext_resp *resp,
551 				size_t resp_len)
552 {
553 	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
554 				 req_len, &resp->ibv_resp, resp_len))
555 		return 0;
556 
557 	/* The ibv_cmd_get_context fails in older kernels when passing
558 	 * a request length that the kernel doesn't know.
559 	 * To avoid breaking compatibility of new libmlx5 and older
560 	 * kernels, when ibv_cmd_get_context fails with the full
561 	 * request length, we try once again with the legacy length.
562 	 * We repeat this process while reducing requested size based
563 	 * on the feature input size. To avoid this in the future, we
564 	 * will remove the check in kernel that requires fields unknown
565 	 * to the kernel to be cleared. This will require that any new
566 	 * feature that involves extending struct mlx5_alloc_ucontext
567 	 * will be accompanied by an indication in the form of one or
568 	 * more fields in struct mlx5_alloc_ucontext_resp. If the
569 	 * response value can be interpreted as feature not supported
570 	 * when the returned value is zero, this will suffice to
571 	 * indicate to the library that the request was ignored by the
572 	 * kernel, either because it is unaware or because it decided
573 	 * to do so. If zero is a valid response, we will add a new
574 	 * field that indicates whether the request was handled.
575 	 */
576 	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
577 				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
578 				 &resp->ibv_resp, resp_len))
579 		return 0;
580 
581 	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
582 				   offsetof(struct mlx5_alloc_ucontext,
583 					    cqe_version),
584 				   &resp->ibv_resp, resp_len);
585 }
586 
587 static int mlx5_map_internal_clock(struct mlx5_device *mdev,
588 				   struct ibv_context *ibv_ctx)
589 {
590 	struct mlx5_context *context = to_mctx(ibv_ctx);
591 	void *hca_clock_page;
592 	off_t offset = 0;
593 
594 	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
595 	hca_clock_page = mmap(NULL, mdev->page_size,
596 			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
597 			      mdev->page_size * offset);
598 
599 	if (hca_clock_page == MAP_FAILED) {
600 		fprintf(stderr, PFX
601 			"Warning: Timestamp available,\n"
602 			"but failed to mmap() hca core clock page.\n");
603 		return -1;
604 	}
605 
606 	context->hca_core_clock = hca_clock_page +
607 		(context->core_clock.offset & (mdev->page_size - 1));
608 	return 0;
609 }
610 
611 int mlx5dv_query_device(struct ibv_context *ctx_in,
612 			 struct mlx5dv_context *attrs_out)
613 {
614 	struct mlx5_context *mctx = to_mctx(ctx_in);
615 	uint64_t comp_mask_out = 0;
616 
617 	attrs_out->version   = 0;
618 	attrs_out->flags     = 0;
619 
620 	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
621 		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;
622 
623 	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
624 		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;
625 
626 	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
627 		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
628 		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
629 	}
630 
631 	attrs_out->comp_mask = comp_mask_out;
632 
633 	return 0;
634 }
635 
636 static int mlx5dv_get_qp(struct ibv_qp *qp_in,
637 			 struct mlx5dv_qp *qp_out)
638 {
639 	struct mlx5_qp *mqp = to_mqp(qp_in);
640 
641 	qp_out->comp_mask = 0;
642 	qp_out->dbrec     = mqp->db;
643 
644 	if (mqp->sq_buf_size)
645 		/* IBV_QPT_RAW_PACKET */
646 		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
647 	else
648 		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
649 	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
650 	qp_out->sq.stride  = 1 << mqp->sq.wqe_shift;
651 
652 	qp_out->rq.buf     = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
653 	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
654 	qp_out->rq.stride  = 1 << mqp->rq.wqe_shift;
655 
656 	qp_out->bf.reg    = mqp->bf->reg;
657 
658 	if (mqp->bf->uuarn > 0)
659 		qp_out->bf.size = mqp->bf->buf_size;
660 	else
661 		qp_out->bf.size = 0;
662 
663 	return 0;
664 }
665 
666 static int mlx5dv_get_cq(struct ibv_cq *cq_in,
667 			 struct mlx5dv_cq *cq_out)
668 {
669 	struct mlx5_cq *mcq = to_mcq(cq_in);
670 	struct mlx5_context *mctx = to_mctx(cq_in->context);
671 
672 	cq_out->comp_mask = 0;
673 	cq_out->cqn       = mcq->cqn;
674 	cq_out->cqe_cnt   = mcq->ibv_cq.cqe + 1;
675 	cq_out->cqe_size  = mcq->cqe_sz;
676 	cq_out->buf       = mcq->active_buf->buf;
677 	cq_out->dbrec     = mcq->dbrec;
678 	cq_out->uar	  = mctx->uar;
679 
680 	mcq->flags	 |= MLX5_CQ_FLAGS_DV_OWNED;
681 
682 	return 0;
683 }
684 
685 static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
686 			  struct mlx5dv_rwq *rwq_out)
687 {
688 	struct mlx5_rwq *mrwq = to_mrwq(wq_in);
689 
690 	rwq_out->comp_mask = 0;
691 	rwq_out->buf       = mrwq->pbuff;
692 	rwq_out->dbrec     = mrwq->recv_db;
693 	rwq_out->wqe_cnt   = mrwq->rq.wqe_cnt;
694 	rwq_out->stride    = 1 << mrwq->rq.wqe_shift;
695 
696 	return 0;
697 }
698 
699 static int mlx5dv_get_srq(struct ibv_srq *srq_in,
700 			  struct mlx5dv_srq *srq_out)
701 {
702 	struct mlx5_srq *msrq;
703 
704 	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);
705 
706 	srq_out->comp_mask = 0;
707 	srq_out->buf       = msrq->buf.buf;
708 	srq_out->dbrec     = msrq->db;
709 	srq_out->stride    = 1 << msrq->wqe_shift;
710 	srq_out->head      = msrq->head;
711 	srq_out->tail      = msrq->tail;
712 
713 	return 0;
714 }
715 
716 int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
717 {
718 	int ret = 0;
719 
720 	if (obj_type & MLX5DV_OBJ_QP)
721 		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
722 	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
723 		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
724 	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
725 		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
726 	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
727 		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);
728 
729 	return ret;
730 }
731 
732 static void adjust_uar_info(struct mlx5_device *mdev,
733 			    struct mlx5_context *context,
734 			    struct mlx5_alloc_ucontext_resp resp)
735 {
736 	if (!resp.log_uar_size && !resp.num_uars_per_page) {
737 		/* old kernel */
738 		context->uar_size = mdev->page_size;
739 		context->num_uars_per_page = 1;
740 		return;
741 	}
742 
743 	context->uar_size = 1 << resp.log_uar_size;
744 	context->num_uars_per_page = resp.num_uars_per_page;
745 }
746 
747 static int mlx5_init_context(struct verbs_device *vdev,
748 			     struct ibv_context *ctx, int cmd_fd)
749 {
750 	struct mlx5_context	       *context;
751 	struct mlx5_alloc_ucontext	req;
752 	struct mlx5_alloc_ucontext_resp resp;
753 	int				i;
754 	int				page_size;
755 	int				tot_uuars;
756 	int				low_lat_uuars;
757 	int				gross_uuars;
758 	int				j;
759 	off_t				offset;
760 	struct mlx5_device	       *mdev;
761 	struct verbs_context	       *v_ctx;
762 	struct ibv_port_attr		port_attr;
763 	struct ibv_device_attr_ex	device_attr;
764 	int				k;
765 	int				bfi;
766 	int				num_sys_page_map;
767 
768 	mdev = to_mdev(&vdev->device);
769 	v_ctx = verbs_get_ctx(ctx);
770 	page_size = mdev->page_size;
771 	mlx5_single_threaded = single_threaded_app();
772 
773 	context = to_mctx(ctx);
774 	context->ibv_ctx.cmd_fd = cmd_fd;
775 
776 	open_debug_file(context);
777 	set_debug_mask();
778 	set_freeze_on_error();
779 	if (gethostname(context->hostname, sizeof(context->hostname)))
780 		strcpy(context->hostname, "host_unknown");
781 
782 	tot_uuars = get_total_uuars(page_size);
783 	if (tot_uuars < 0) {
784 		errno = -tot_uuars;
785 		goto err_free;
786 	}
787 
788 	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
789 	if (low_lat_uuars < 0) {
790 		errno = -low_lat_uuars;
791 		goto err_free;
792 	}
793 
794 	if (low_lat_uuars > tot_uuars - 1) {
795 		errno = ENOMEM;
796 		goto err_free;
797 	}
798 
799 	memset(&req, 0, sizeof(req));
800 	memset(&resp, 0, sizeof(resp));
801 
802 	req.total_num_uuars = tot_uuars;
803 	req.num_low_latency_uuars = low_lat_uuars;
804 	req.cqe_version = MLX5_CQE_VERSION_V1;
805 	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;
806 
807 	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
808 				 sizeof(resp)))
809 		goto err_free;
810 
811 	context->max_num_qps		= resp.qp_tab_size;
812 	context->bf_reg_size		= resp.bf_reg_size;
813 	context->tot_uuars		= resp.tot_uuars;
814 	context->low_lat_uuars		= low_lat_uuars;
815 	context->cache_line_size	= resp.cache_line_size;
816 	context->max_sq_desc_sz = resp.max_sq_desc_sz;
817 	context->max_rq_desc_sz = resp.max_rq_desc_sz;
818 	context->max_send_wqebb	= resp.max_send_wqebb;
819 	context->num_ports	= resp.num_ports;
820 	context->max_recv_wr	= resp.max_recv_wr;
821 	context->max_srq_recv_wr = resp.max_srq_recv_wr;
822 
823 	context->cqe_version = resp.cqe_version;
824 	if (context->cqe_version) {
825 		if (context->cqe_version == MLX5_CQE_VERSION_V1)
826 			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
827 		else
828 			goto err_free;
829 	}
830 
831 	adjust_uar_info(mdev, context, resp);
832 
833 	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
834 	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
835 	if (!context->bfs) {
836 		errno = ENOMEM;
837 		goto err_free;
838 	}
839 
840 	context->cmds_supp_uhw = resp.cmds_supp_uhw;
841 	context->vendor_cap_flags = 0;
842 
843 	pthread_mutex_init(&context->qp_table_mutex, NULL);
844 	pthread_mutex_init(&context->srq_table_mutex, NULL);
845 	pthread_mutex_init(&context->uidx_table_mutex, NULL);
846 	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
847 		context->qp_table[i].refcnt = 0;
848 
849 	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
850 		context->uidx_table[i].refcnt = 0;
851 
852 	context->db_list = NULL;
853 
854 	pthread_mutex_init(&context->db_list_mutex, NULL);
855 
856 	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
857 	for (i = 0; i < num_sys_page_map; ++i) {
858 		offset = 0;
859 		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
860 		set_index(i, &offset);
861 		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
862 				       cmd_fd, page_size * offset);
863 		if (context->uar[i] == MAP_FAILED) {
864 			context->uar[i] = NULL;
865 			goto err_free_bf;
866 		}
867 	}
868 
869 	for (i = 0; i < num_sys_page_map; i++) {
870 		for (j = 0; j < context->num_uars_per_page; j++) {
871 			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
872 				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
873 				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
874 							MLX5_BF_OFFSET + k * context->bf_reg_size;
875 				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
876 				mlx5_spinlock_init(&context->bfs[bfi].lock);
877 				context->bfs[bfi].offset = 0;
878 				if (bfi)
879 					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
880 				context->bfs[bfi].uuarn = bfi;
881 			}
882 		}
883 	}
884 	context->hca_core_clock = NULL;
885 	if (resp.response_length + sizeof(resp.ibv_resp) >=
886 	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
887 	    sizeof(resp.hca_core_clock_offset) &&
888 	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
889 		context->core_clock.offset = resp.hca_core_clock_offset;
890 		mlx5_map_internal_clock(mdev, ctx);
891 	}
892 
893 	mlx5_spinlock_init(&context->lock32);
894 
895 	context->prefer_bf = get_always_bf();
896 	context->shut_up_bf = get_shut_up_bf();
897 	mlx5_read_env(&vdev->device, context);
898 
899 	mlx5_spinlock_init(&context->hugetlb_lock);
900 	TAILQ_INIT(&context->hugetlb_list);
901 
902 	context->ibv_ctx.ops = mlx5_ctx_ops;
903 
904 	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
905 	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
906 	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
907 	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
908 	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
909 	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
910 	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
911 	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
912 	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
913 	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
914 	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
915 	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
916 	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
917 	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
918 	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);
919 
920 	memset(&device_attr, 0, sizeof(device_attr));
921 	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
922 				  sizeof(struct ibv_device_attr_ex))) {
923 		context->cached_device_cap_flags =
924 			device_attr.orig_attr.device_cap_flags;
925 		context->atomic_cap = device_attr.orig_attr.atomic_cap;
926 		context->cached_tso_caps = device_attr.tso_caps;
927 	}
928 
929 	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
930 		memset(&port_attr, 0, sizeof(port_attr));
931 		if (!mlx5_query_port(ctx, j + 1, &port_attr))
932 			context->cached_link_layer[j] = port_attr.link_layer;
933 	}
934 
935 	return 0;
936 
937 err_free_bf:
938 	free(context->bfs);
939 
940 err_free:
941 	for (i = 0; i < MLX5_MAX_UARS; ++i) {
942 		if (context->uar[i])
943 			munmap(context->uar[i], page_size);
944 	}
945 	close_debug_file(context);
946 	return errno;
947 }
948 
949 static void mlx5_cleanup_context(struct verbs_device *device,
950 				 struct ibv_context *ibctx)
951 {
952 	struct mlx5_context *context = to_mctx(ibctx);
953 	int page_size = to_mdev(ibctx->device)->page_size;
954 	int i;
955 
956 	free(context->bfs);
957 	for (i = 0; i < MLX5_MAX_UARS; ++i) {
958 		if (context->uar[i])
959 			munmap(context->uar[i], page_size);
960 	}
961 	if (context->hca_core_clock)
962 		munmap(context->hca_core_clock - context->core_clock.offset,
963 		       page_size);
964 	close_debug_file(context);
965 }
966 
967 static struct verbs_device_ops mlx5_dev_ops = {
968 	.init_context = mlx5_init_context,
969 	.uninit_context = mlx5_cleanup_context,
970 };
971 
972 static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
973 					     int abi_version)
974 {
975 	char			value[8];
976 	struct mlx5_device     *dev;
977 	unsigned		vendor, device;
978 	int			i;
979 
980 	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
981 				value, sizeof value) < 0)
982 		return NULL;
983 	sscanf(value, "%i", &vendor);
984 
985 	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
986 				value, sizeof value) < 0)
987 		return NULL;
988 	sscanf(value, "%i", &device);
989 
990 	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
991 		if (vendor == hca_table[i].vendor &&
992 		    device == hca_table[i].device)
993 			goto found;
994 
995 	return NULL;
996 
997 found:
998 	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
999 	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
1000 		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
1001 			"(min supported %d, max supported %d)\n",
1002 			abi_version, uverbs_sys_path,
1003 			MLX5_UVERBS_MIN_ABI_VERSION,
1004 			MLX5_UVERBS_MAX_ABI_VERSION);
1005 		return NULL;
1006 	}
1007 
1008 	dev = calloc(1, sizeof *dev);
1009 	if (!dev) {
1010 		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
1011 			uverbs_sys_path);
1012 		return NULL;
1013 	}
1014 
1015 	dev->page_size   = sysconf(_SC_PAGESIZE);
1016 	dev->driver_abi_ver = abi_version;
1017 
1018 	dev->verbs_dev.ops = &mlx5_dev_ops;
1019 	dev->verbs_dev.sz = sizeof(*dev);
1020 	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
1021 		sizeof(struct ibv_context);
1022 
1023 	return &dev->verbs_dev;
1024 }
1025 
1026 static __attribute__((constructor)) void mlx5_register_driver(void)
1027 {
1028 	verbs_register_driver("mlx5", mlx5_driver_init);
1029 }
1030