xref: /freebsd/contrib/ofed/libmlx5/mlx5.c (revision 7d91d6b83e74edf278dde375e6049aca833cbebd)
1 /*
2  * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 #define _GNU_SOURCE
33 #include <config.h>
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <sys/mman.h>
40 #include <pthread.h>
41 #include <string.h>
42 #include <sched.h>
43 #include <sys/param.h>
44 #include <sys/cpuset.h>
45 
46 #include "mlx5.h"
47 #include "mlx5-abi.h"
48 
#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX			0x15b3
#endif

/*
 * Fallbacks used only when the cpuset headers do not provide these
 * operations: CPU_OR becomes a no-op and CPU_EQUAL always reports equality.
 */
#ifndef CPU_OR
#define CPU_OR(x, y, z) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif


/* Build one hca_table entry from a vendor name suffix and a PCI device ID. */
#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v,			\
	  .device = d }

/*
 * PCI (vendor, device) pairs recognised by this driver.  The table is only
 * ever read (it is matched against the IDs found in sysfs), so it is
 * declared const.
 */
static const struct {
	unsigned		vendor;
	unsigned		device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),	/* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),	/* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 4125),	/* ConnectX-6 DX */
	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
};
87 
88 uint32_t mlx5_debug_mask = 0;
89 int mlx5_freeze_on_error_cqe;
90 
91 static struct ibv_context_ops mlx5_ctx_ops = {
92 	.query_device  = mlx5_query_device,
93 	.query_port    = mlx5_query_port,
94 	.alloc_pd      = mlx5_alloc_pd,
95 	.dealloc_pd    = mlx5_free_pd,
96 	.reg_mr	       = mlx5_reg_mr,
97 	.rereg_mr      = mlx5_rereg_mr,
98 	.dereg_mr      = mlx5_dereg_mr,
99 	.alloc_mw      = mlx5_alloc_mw,
100 	.dealloc_mw    = mlx5_dealloc_mw,
101 	.bind_mw       = mlx5_bind_mw,
102 	.create_cq     = mlx5_create_cq,
103 	.poll_cq       = mlx5_poll_cq,
104 	.req_notify_cq = mlx5_arm_cq,
105 	.cq_event      = mlx5_cq_event,
106 	.resize_cq     = mlx5_resize_cq,
107 	.destroy_cq    = mlx5_destroy_cq,
108 	.create_srq    = mlx5_create_srq,
109 	.modify_srq    = mlx5_modify_srq,
110 	.query_srq     = mlx5_query_srq,
111 	.destroy_srq   = mlx5_destroy_srq,
112 	.post_srq_recv = mlx5_post_srq_recv,
113 	.create_qp     = mlx5_create_qp,
114 	.query_qp      = mlx5_query_qp,
115 	.modify_qp     = mlx5_modify_qp,
116 	.destroy_qp    = mlx5_destroy_qp,
117 	.post_send     = mlx5_post_send,
118 	.post_recv     = mlx5_post_recv,
119 	.create_ah     = mlx5_create_ah,
120 	.destroy_ah    = mlx5_destroy_ah,
121 	.attach_mcast  = mlx5_attach_mcast,
122 	.detach_mcast  = mlx5_detach_mcast
123 };
124 
/*
 * Parse the integer that follows the first ':' in @line.
 *
 * On success the number is stored in *value and 0 is returned.  If the
 * line contains no ':' the function returns 1 and *value is not written.
 */
static int read_number_from_line(const char *line, int *value)
{
	const char *colon = strchr(line, ':');

	if (colon == NULL)
		return 1;

	/* atoi() skips any whitespace between the colon and the digits */
	*value = atoi(colon + 1);
	return 0;
}
138 /**
139  * The function looks for the first free user-index in all the
140  * user-index tables. If all are used, returns -1, otherwise
141  * a valid user-index.
142  * In case the reference count of the table is zero, it means the
143  * table is not in use and wasn't allocated yet, therefore the
144  * mlx5_store_uidx allocates the table, and increment the reference
145  * count on the table.
146  */
147 static int32_t get_free_uidx(struct mlx5_context *ctx)
148 {
149 	int32_t tind;
150 	int32_t i;
151 
152 	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
153 		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
154 			break;
155 	}
156 
157 	if (tind == MLX5_UIDX_TABLE_SIZE)
158 		return -1;
159 
160 	if (!ctx->uidx_table[tind].refcnt)
161 		return tind << MLX5_UIDX_TABLE_SHIFT;
162 
163 	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
164 		if (!ctx->uidx_table[tind].table[i])
165 			break;
166 	}
167 
168 	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
169 }
170 
171 int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
172 {
173 	int32_t tind;
174 	int32_t ret = -1;
175 	int32_t uidx;
176 
177 	pthread_mutex_lock(&ctx->uidx_table_mutex);
178 	uidx = get_free_uidx(ctx);
179 	if (uidx < 0)
180 		goto out;
181 
182 	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
183 
184 	if (!ctx->uidx_table[tind].refcnt) {
185 		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
186 						     sizeof(struct mlx5_resource *));
187 		if (!ctx->uidx_table[tind].table)
188 			goto out;
189 	}
190 
191 	++ctx->uidx_table[tind].refcnt;
192 	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
193 	ret = uidx;
194 
195 out:
196 	pthread_mutex_unlock(&ctx->uidx_table_mutex);
197 	return ret;
198 }
199 
200 void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
201 {
202 	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
203 
204 	pthread_mutex_lock(&ctx->uidx_table_mutex);
205 
206 	if (!--ctx->uidx_table[tind].refcnt)
207 		free(ctx->uidx_table[tind].table);
208 	else
209 		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;
210 
211 	pthread_mutex_unlock(&ctx->uidx_table_mutex);
212 }
213 
/*
 * Scan /proc/cpuinfo for a Sandy Bridge CPU (family 6, model 0x2A or 0x2D).
 *
 * Returns 1 if at least one such processor is listed, 0 otherwise.
 * *num_cores receives the number of "processor" entries seen; it is left
 * unwritten when /proc/cpuinfo cannot be opened (the function then
 * returns 0).
 */
static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	int family = -1;
	int model = -1;
	int found = 0;
	FILE *cpuinfo = fopen("/proc/cpuinfo", "r");

	if (cpuinfo == NULL)
		return 0;

	*num_cores = 0;

	while (fgets(line, sizeof(line), cpuinfo) != NULL) {
		int value;

		if (strncmp(line, "processor", 9) == 0) {
			/* a new processor record starts here */
			*num_cores += 1;
			family = -1;
			model = -1;
		} else if (strncmp(line, "cpu family", 10) == 0) {
			if (family < 0 && read_number_from_line(line, &value) == 0)
				family = value;
		} else if (strncmp(line, "model", 5) == 0) {
			/* only the first "model..." line of a record is taken */
			if (model < 0 && read_number_from_line(line, &value) == 0)
				model = value;
		}

		/* Sandy Bridge: family 6, model 0x2A or 0x2D */
		if (family == 6 && (model == 0x2A || model == 0x2D))
			found = 1;
	}

	fclose(cpuinfo);
	return found;
}
254 
255 /*
256 man cpuset
257 
258   This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
259   are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
260   words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
261   within a word are also in big-endian order.
262 
263   The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
264   the size of the bitmask.
265 
266   Examples of the Mask Format:
267 
268      00000001                        # just bit 0 set
269      40000000,00000000,00000000      # just bit 94 set
270      000000ff,00000000               # bits 32-39 set
271      00000000,000E3862               # 1,5,6,11-13,17-19 set
272 
273   A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:
274 
275      00000001,00000001,00010117
276 
277   The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
278   bit 4, and the "7" is for bits 2, 1, and 0.
279 */
280 static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
281 {
282 	char *p, buf[1024];
283 	char *env_value;
284 	uint32_t word;
285 	int i, k;
286 
287 	env_value = getenv("MLX5_LOCAL_CPUS");
288 	if (env_value)
289 		strncpy(buf, env_value, sizeof(buf));
290 	else {
291 		char fname[MAXPATHLEN];
292 
293 		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
294 			 ibv_get_device_name(ibdev));
295 
296 		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
297 			fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
298 			return;
299 		}
300 	}
301 
302 	p = strrchr(buf, ',');
303 	if (!p)
304 		p = buf;
305 
306 	i = 0;
307 	do {
308 		if (*p == ',') {
309 			*p = 0;
310 			p ++;
311 		}
312 
313 		word = strtoul(p, NULL, 16);
314 
315 		for (k = 0; word; ++k, word >>= 1)
316 			if (word & 1)
317 				CPU_SET(k+i, cpu_set);
318 
319 		if (p == buf)
320 			break;
321 
322 		p = strrchr(buf, ',');
323 		if (!p)
324 			p = buf;
325 
326 		i += 32;
327 	} while (i < CPU_SETSIZE);
328 }
329 
330 static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
331 {
332 	cpuset_t my_cpus, dev_local_cpus, result_set;
333 	int stall_enable;
334 	int ret;
335 	int num_cores;
336 
337 	if (!mlx5_is_sandy_bridge(&num_cores))
338 		return 0;
339 
340 	/* by default enable stall on sandy bridge arch */
341 	stall_enable = 1;
342 
343 	/*
344 	 * check if app is bound to cpu set that is inside
345 	 * of device local cpu set. Disable stalling if true
346 	 */
347 
348 	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
349 	CPU_ZERO(&my_cpus);
350 	CPU_ZERO(&dev_local_cpus);
351 	CPU_ZERO(&result_set);
352 	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
353 	    sizeof(my_cpus), &my_cpus);
354 	if (ret == -1) {
355 		if (errno == EINVAL)
356 			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
357 		else
358 			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
359 		goto out;
360 	}
361 
362 	/* get device local cpu set */
363 	mlx5_local_cpu_set(ibdev, &dev_local_cpus);
364 
365 	/* check if my cpu set is in dev cpu */
366 #if __FreeBSD_version < 1400046
367 	CPU_OR(&result_set, &my_cpus);
368 	CPU_OR(&result_set, &dev_local_cpus);
369 #else
370 	CPU_OR(&result_set, &my_cpus, &dev_local_cpus);
371 #endif
372 	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;
373 
374 out:
375 	return stall_enable;
376 }
377 
378 static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
379 {
380 	char *env_value;
381 
382 	env_value = getenv("MLX5_STALL_CQ_POLL");
383 	if (env_value)
384 		/* check if cq stall is enforced by user */
385 		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
386 	else
387 		/* autodetect if we need to do cq polling */
388 		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);
389 
390 	env_value = getenv("MLX5_STALL_NUM_LOOP");
391 	if (env_value)
392 		mlx5_stall_num_loop = atoi(env_value);
393 
394 	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
395 	if (env_value)
396 		mlx5_stall_cq_poll_min = atoi(env_value);
397 
398 	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
399 	if (env_value)
400 		mlx5_stall_cq_poll_max = atoi(env_value);
401 
402 	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
403 	if (env_value)
404 		mlx5_stall_cq_inc_step = atoi(env_value);
405 
406 	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
407 	if (env_value)
408 		mlx5_stall_cq_dec_step = atoi(env_value);
409 
410 	ctx->stall_adaptive_enable = 0;
411 	ctx->stall_cycles = 0;
412 
413 	if (mlx5_stall_num_loop < 0) {
414 		ctx->stall_adaptive_enable = 1;
415 		ctx->stall_cycles = mlx5_stall_cq_poll_min;
416 	}
417 
418 }
419 
420 static int get_total_uuars(int page_size)
421 {
422 	int size = MLX5_DEF_TOT_UUARS;
423 	int uuars_in_page;
424 	char *env;
425 
426 	env = getenv("MLX5_TOTAL_UUARS");
427 	if (env)
428 		size = atoi(env);
429 
430 	if (size < 1)
431 		return -EINVAL;
432 
433 	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
434 	size = max(uuars_in_page, size);
435 	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
436 	if (size > MLX5_MAX_BFREGS)
437 		return -ENOMEM;
438 
439 	return size;
440 }
441 
442 static void open_debug_file(struct mlx5_context *ctx)
443 {
444 	char *env;
445 
446 	env = getenv("MLX5_DEBUG_FILE");
447 	if (!env) {
448 		ctx->dbg_fp = stderr;
449 		return;
450 	}
451 
452 	ctx->dbg_fp = fopen(env, "aw+");
453 	if (!ctx->dbg_fp) {
454 		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
455 		ctx->dbg_fp = stderr;
456 		return;
457 	}
458 }
459 
460 static void close_debug_file(struct mlx5_context *ctx)
461 {
462 	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
463 		fclose(ctx->dbg_fp);
464 }
465 
466 static void set_debug_mask(void)
467 {
468 	char *env;
469 
470 	env = getenv("MLX5_DEBUG_MASK");
471 	if (env)
472 		mlx5_debug_mask = strtol(env, NULL, 0);
473 }
474 
475 static void set_freeze_on_error(void)
476 {
477 	char *env;
478 
479 	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
480 	if (env)
481 		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
482 }
483 
/*
 * Read MLX5_POST_SEND_PREFER_BF.
 * Returns 0 only when the variable is set to exactly "0"; returns 1 when
 * it is unset or holds any other value.
 */
static int get_always_bf(void)
{
	const char *env = getenv("MLX5_POST_SEND_PREFER_BF");

	return env == NULL || strcmp(env, "0") != 0;
}
494 
/*
 * Read MLX5_SHUT_UP_BF.
 * Returns 1 when the variable is set to anything other than "0";
 * returns 0 when it is unset or set to exactly "0".
 */
static int get_shut_up_bf(void)
{
	const char *env = getenv("MLX5_SHUT_UP_BF");

	return env != NULL && strcmp(env, "0") != 0;
}
505 
506 static int get_num_low_lat_uuars(int tot_uuars)
507 {
508 	char *env;
509 	int num = 4;
510 
511 	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
512 	if (env)
513 		num = atoi(env);
514 
515 	if (num < 0)
516 		return -EINVAL;
517 
518 	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
519 	return num;
520 }
521 
522 /* The library allocates an array of uuar contexts. The one in index zero does
523  * not to execersize odd/even policy so it can avoid a lock but it may not use
524  * blue flame. The upper ones, low_lat_uuars can use blue flame with no lock
525  * since they are assigned to one QP only. The rest can use blue flame but since
526  * they are shared they need a lock
527  */
528 static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
529 {
530 	if (uuarn == 0 || mlx5_single_threaded)
531 		return 0;
532 
533 	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
534 		return 0;
535 
536 	return 1;
537 }
538 
/*
 * Read MLX5_SINGLE_THREADED.
 * Returns 1 only when the variable is set to exactly "1", otherwise 0.
 */
static int single_threaded_app(void)
{
	const char *env = getenv("MLX5_SINGLE_THREADED");

	return env != NULL && strcmp(env, "1") == 0;
}
550 
551 static int mlx5_cmd_get_context(struct mlx5_context *context,
552 				struct mlx5_alloc_ucontext *req,
553 				size_t req_len,
554 				struct mlx5_alloc_ucontext_resp *resp,
555 				size_t resp_len)
556 {
557 	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
558 				 req_len, &resp->ibv_resp, resp_len))
559 		return 0;
560 
561 	/* The ibv_cmd_get_context fails in older kernels when passing
562 	 * a request length that the kernel doesn't know.
563 	 * To avoid breaking compatibility of new libmlx5 and older
564 	 * kernels, when ibv_cmd_get_context fails with the full
565 	 * request length, we try once again with the legacy length.
566 	 * We repeat this process while reducing requested size based
567 	 * on the feature input size. To avoid this in the future, we
568 	 * will remove the check in kernel that requires fields unknown
569 	 * to the kernel to be cleared. This will require that any new
570 	 * feature that involves extending struct mlx5_alloc_ucontext
571 	 * will be accompanied by an indication in the form of one or
572 	 * more fields in struct mlx5_alloc_ucontext_resp. If the
573 	 * response value can be interpreted as feature not supported
574 	 * when the returned value is zero, this will suffice to
575 	 * indicate to the library that the request was ignored by the
576 	 * kernel, either because it is unaware or because it decided
577 	 * to do so. If zero is a valid response, we will add a new
578 	 * field that indicates whether the request was handled.
579 	 */
580 	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
581 				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
582 				 &resp->ibv_resp, resp_len))
583 		return 0;
584 
585 	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
586 				   offsetof(struct mlx5_alloc_ucontext,
587 					    cqe_version),
588 				   &resp->ibv_resp, resp_len);
589 }
590 
591 static int mlx5_map_internal_clock(struct mlx5_device *mdev,
592 				   struct ibv_context *ibv_ctx)
593 {
594 	struct mlx5_context *context = to_mctx(ibv_ctx);
595 	void *hca_clock_page;
596 	off_t offset = 0;
597 
598 	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
599 	hca_clock_page = mmap(NULL, mdev->page_size,
600 			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
601 			      mdev->page_size * offset);
602 
603 	if (hca_clock_page == MAP_FAILED) {
604 		fprintf(stderr, PFX
605 			"Warning: Timestamp available,\n"
606 			"but failed to mmap() hca core clock page.\n");
607 		return -1;
608 	}
609 
610 	context->hca_core_clock = hca_clock_page +
611 		(context->core_clock.offset & (mdev->page_size - 1));
612 	return 0;
613 }
614 
615 int mlx5dv_query_device(struct ibv_context *ctx_in,
616 			 struct mlx5dv_context *attrs_out)
617 {
618 	struct mlx5_context *mctx = to_mctx(ctx_in);
619 	uint64_t comp_mask_out = 0;
620 
621 	attrs_out->version   = 0;
622 	attrs_out->flags     = 0;
623 
624 	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
625 		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;
626 
627 	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
628 		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;
629 
630 	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
631 		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
632 		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
633 	}
634 
635 	attrs_out->comp_mask = comp_mask_out;
636 
637 	return 0;
638 }
639 
640 static int mlx5dv_get_qp(struct ibv_qp *qp_in,
641 			 struct mlx5dv_qp *qp_out)
642 {
643 	struct mlx5_qp *mqp = to_mqp(qp_in);
644 
645 	qp_out->comp_mask = 0;
646 	qp_out->dbrec     = mqp->db;
647 
648 	if (mqp->sq_buf_size)
649 		/* IBV_QPT_RAW_PACKET */
650 		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
651 	else
652 		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
653 	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
654 	qp_out->sq.stride  = 1 << mqp->sq.wqe_shift;
655 
656 	qp_out->rq.buf     = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
657 	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
658 	qp_out->rq.stride  = 1 << mqp->rq.wqe_shift;
659 
660 	qp_out->bf.reg    = mqp->bf->reg;
661 
662 	if (mqp->bf->uuarn > 0)
663 		qp_out->bf.size = mqp->bf->buf_size;
664 	else
665 		qp_out->bf.size = 0;
666 
667 	return 0;
668 }
669 
670 static int mlx5dv_get_cq(struct ibv_cq *cq_in,
671 			 struct mlx5dv_cq *cq_out)
672 {
673 	struct mlx5_cq *mcq = to_mcq(cq_in);
674 	struct mlx5_context *mctx = to_mctx(cq_in->context);
675 
676 	cq_out->comp_mask = 0;
677 	cq_out->cqn       = mcq->cqn;
678 	cq_out->cqe_cnt   = mcq->ibv_cq.cqe + 1;
679 	cq_out->cqe_size  = mcq->cqe_sz;
680 	cq_out->buf       = mcq->active_buf->buf;
681 	cq_out->dbrec     = mcq->dbrec;
682 	cq_out->uar	  = mctx->uar;
683 
684 	mcq->flags	 |= MLX5_CQ_FLAGS_DV_OWNED;
685 
686 	return 0;
687 }
688 
689 static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
690 			  struct mlx5dv_rwq *rwq_out)
691 {
692 	struct mlx5_rwq *mrwq = to_mrwq(wq_in);
693 
694 	rwq_out->comp_mask = 0;
695 	rwq_out->buf       = mrwq->pbuff;
696 	rwq_out->dbrec     = mrwq->recv_db;
697 	rwq_out->wqe_cnt   = mrwq->rq.wqe_cnt;
698 	rwq_out->stride    = 1 << mrwq->rq.wqe_shift;
699 
700 	return 0;
701 }
702 
703 static int mlx5dv_get_srq(struct ibv_srq *srq_in,
704 			  struct mlx5dv_srq *srq_out)
705 {
706 	struct mlx5_srq *msrq;
707 
708 	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);
709 
710 	srq_out->comp_mask = 0;
711 	srq_out->buf       = msrq->buf.buf;
712 	srq_out->dbrec     = msrq->db;
713 	srq_out->stride    = 1 << msrq->wqe_shift;
714 	srq_out->head      = msrq->head;
715 	srq_out->tail      = msrq->tail;
716 
717 	return 0;
718 }
719 
720 int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
721 {
722 	int ret = 0;
723 
724 	if (obj_type & MLX5DV_OBJ_QP)
725 		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
726 	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
727 		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
728 	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
729 		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
730 	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
731 		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);
732 
733 	return ret;
734 }
735 
736 static void adjust_uar_info(struct mlx5_device *mdev,
737 			    struct mlx5_context *context,
738 			    struct mlx5_alloc_ucontext_resp resp)
739 {
740 	if (!resp.log_uar_size && !resp.num_uars_per_page) {
741 		/* old kernel */
742 		context->uar_size = mdev->page_size;
743 		context->num_uars_per_page = 1;
744 		return;
745 	}
746 
747 	context->uar_size = 1 << resp.log_uar_size;
748 	context->num_uars_per_page = resp.num_uars_per_page;
749 }
750 
751 static int mlx5_init_context(struct verbs_device *vdev,
752 			     struct ibv_context *ctx, int cmd_fd)
753 {
754 	struct mlx5_context	       *context;
755 	struct mlx5_alloc_ucontext	req;
756 	struct mlx5_alloc_ucontext_resp resp;
757 	int				i;
758 	int				page_size;
759 	int				tot_uuars;
760 	int				low_lat_uuars;
761 	int				gross_uuars;
762 	int				j;
763 	off_t				offset;
764 	struct mlx5_device	       *mdev;
765 	struct verbs_context	       *v_ctx;
766 	struct ibv_port_attr		port_attr;
767 	struct ibv_device_attr_ex	device_attr;
768 	int				k;
769 	int				bfi;
770 	int				num_sys_page_map;
771 
772 	mdev = to_mdev(&vdev->device);
773 	v_ctx = verbs_get_ctx(ctx);
774 	page_size = mdev->page_size;
775 	mlx5_single_threaded = single_threaded_app();
776 
777 	context = to_mctx(ctx);
778 	context->ibv_ctx.cmd_fd = cmd_fd;
779 
780 	open_debug_file(context);
781 	set_debug_mask();
782 	set_freeze_on_error();
783 	if (gethostname(context->hostname, sizeof(context->hostname)))
784 		strcpy(context->hostname, "host_unknown");
785 
786 	tot_uuars = get_total_uuars(page_size);
787 	if (tot_uuars < 0) {
788 		errno = -tot_uuars;
789 		goto err_free;
790 	}
791 
792 	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
793 	if (low_lat_uuars < 0) {
794 		errno = -low_lat_uuars;
795 		goto err_free;
796 	}
797 
798 	if (low_lat_uuars > tot_uuars - 1) {
799 		errno = ENOMEM;
800 		goto err_free;
801 	}
802 
803 	memset(&req, 0, sizeof(req));
804 	memset(&resp, 0, sizeof(resp));
805 
806 	req.total_num_uuars = tot_uuars;
807 	req.num_low_latency_uuars = low_lat_uuars;
808 	req.cqe_version = MLX5_CQE_VERSION_V1;
809 	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;
810 
811 	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
812 				 sizeof(resp)))
813 		goto err_free;
814 
815 	context->max_num_qps		= resp.qp_tab_size;
816 	context->bf_reg_size		= resp.bf_reg_size;
817 	context->tot_uuars		= resp.tot_uuars;
818 	context->low_lat_uuars		= low_lat_uuars;
819 	context->cache_line_size	= resp.cache_line_size;
820 	context->max_sq_desc_sz = resp.max_sq_desc_sz;
821 	context->max_rq_desc_sz = resp.max_rq_desc_sz;
822 	context->max_send_wqebb	= resp.max_send_wqebb;
823 	context->num_ports	= resp.num_ports;
824 	context->max_recv_wr	= resp.max_recv_wr;
825 	context->max_srq_recv_wr = resp.max_srq_recv_wr;
826 
827 	context->cqe_version = resp.cqe_version;
828 	if (context->cqe_version) {
829 		if (context->cqe_version == MLX5_CQE_VERSION_V1)
830 			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
831 		else
832 			goto err_free;
833 	}
834 
835 	adjust_uar_info(mdev, context, resp);
836 
837 	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
838 	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
839 	if (!context->bfs) {
840 		errno = ENOMEM;
841 		goto err_free;
842 	}
843 
844 	context->cmds_supp_uhw = resp.cmds_supp_uhw;
845 	context->vendor_cap_flags = 0;
846 
847 	pthread_mutex_init(&context->qp_table_mutex, NULL);
848 	pthread_mutex_init(&context->srq_table_mutex, NULL);
849 	pthread_mutex_init(&context->uidx_table_mutex, NULL);
850 	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
851 		context->qp_table[i].refcnt = 0;
852 
853 	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
854 		context->uidx_table[i].refcnt = 0;
855 
856 	context->db_list = NULL;
857 
858 	pthread_mutex_init(&context->db_list_mutex, NULL);
859 
860 	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
861 	for (i = 0; i < num_sys_page_map; ++i) {
862 		offset = 0;
863 		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
864 		set_index(i, &offset);
865 		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
866 				       cmd_fd, page_size * offset);
867 		if (context->uar[i] == MAP_FAILED) {
868 			context->uar[i] = NULL;
869 			goto err_free_bf;
870 		}
871 	}
872 
873 	for (i = 0; i < num_sys_page_map; i++) {
874 		for (j = 0; j < context->num_uars_per_page; j++) {
875 			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
876 				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
877 				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
878 							MLX5_BF_OFFSET + k * context->bf_reg_size;
879 				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
880 				mlx5_spinlock_init(&context->bfs[bfi].lock);
881 				context->bfs[bfi].offset = 0;
882 				if (bfi)
883 					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
884 				context->bfs[bfi].uuarn = bfi;
885 			}
886 		}
887 	}
888 	context->hca_core_clock = NULL;
889 	if (resp.response_length + sizeof(resp.ibv_resp) >=
890 	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
891 	    sizeof(resp.hca_core_clock_offset) &&
892 	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
893 		context->core_clock.offset = resp.hca_core_clock_offset;
894 		mlx5_map_internal_clock(mdev, ctx);
895 	}
896 
897 	mlx5_spinlock_init(&context->lock32);
898 
899 	context->prefer_bf = get_always_bf();
900 	context->shut_up_bf = get_shut_up_bf();
901 	mlx5_read_env(&vdev->device, context);
902 
903 	mlx5_spinlock_init(&context->hugetlb_lock);
904 	TAILQ_INIT(&context->hugetlb_list);
905 
906 	context->ibv_ctx.ops = mlx5_ctx_ops;
907 
908 	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
909 	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
910 	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
911 	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
912 	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
913 	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
914 	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
915 	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
916 	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
917 	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
918 	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
919 	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
920 	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
921 	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
922 	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);
923 
924 	memset(&device_attr, 0, sizeof(device_attr));
925 	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
926 				  sizeof(struct ibv_device_attr_ex))) {
927 		context->cached_device_cap_flags =
928 			device_attr.orig_attr.device_cap_flags;
929 		context->atomic_cap = device_attr.orig_attr.atomic_cap;
930 		context->cached_tso_caps = device_attr.tso_caps;
931 	}
932 
933 	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
934 		memset(&port_attr, 0, sizeof(port_attr));
935 		if (!mlx5_query_port(ctx, j + 1, &port_attr))
936 			context->cached_link_layer[j] = port_attr.link_layer;
937 	}
938 
939 	return 0;
940 
941 err_free_bf:
942 	free(context->bfs);
943 
944 err_free:
945 	for (i = 0; i < MLX5_MAX_UARS; ++i) {
946 		if (context->uar[i])
947 			munmap(context->uar[i], page_size);
948 	}
949 	close_debug_file(context);
950 	return errno;
951 }
952 
953 static void mlx5_cleanup_context(struct verbs_device *device,
954 				 struct ibv_context *ibctx)
955 {
956 	struct mlx5_context *context = to_mctx(ibctx);
957 	int page_size = to_mdev(ibctx->device)->page_size;
958 	int i;
959 
960 	free(context->bfs);
961 	for (i = 0; i < MLX5_MAX_UARS; ++i) {
962 		if (context->uar[i])
963 			munmap(context->uar[i], page_size);
964 	}
965 	if (context->hca_core_clock)
966 		munmap(context->hca_core_clock - context->core_clock.offset,
967 		       page_size);
968 	close_debug_file(context);
969 }
970 
971 static struct verbs_device_ops mlx5_dev_ops = {
972 	.init_context = mlx5_init_context,
973 	.uninit_context = mlx5_cleanup_context,
974 };
975 
976 static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
977 					     int abi_version)
978 {
979 	char			value[8];
980 	struct mlx5_device     *dev;
981 	unsigned		vendor, device;
982 	int			i;
983 
984 	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
985 				value, sizeof value) < 0)
986 		return NULL;
987 	sscanf(value, "%i", &vendor);
988 
989 	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
990 				value, sizeof value) < 0)
991 		return NULL;
992 	sscanf(value, "%i", &device);
993 
994 	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
995 		if (vendor == hca_table[i].vendor &&
996 		    device == hca_table[i].device)
997 			goto found;
998 
999 	return NULL;
1000 
1001 found:
1002 	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
1003 	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
1004 		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
1005 			"(min supported %d, max supported %d)\n",
1006 			abi_version, uverbs_sys_path,
1007 			MLX5_UVERBS_MIN_ABI_VERSION,
1008 			MLX5_UVERBS_MAX_ABI_VERSION);
1009 		return NULL;
1010 	}
1011 
1012 	dev = calloc(1, sizeof *dev);
1013 	if (!dev) {
1014 		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
1015 			uverbs_sys_path);
1016 		return NULL;
1017 	}
1018 
1019 	dev->page_size   = sysconf(_SC_PAGESIZE);
1020 	dev->driver_abi_ver = abi_version;
1021 
1022 	dev->verbs_dev.ops = &mlx5_dev_ops;
1023 	dev->verbs_dev.sz = sizeof(*dev);
1024 	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
1025 		sizeof(struct ibv_context);
1026 
1027 	return &dev->verbs_dev;
1028 }
1029 
1030 static __attribute__((constructor)) void mlx5_register_driver(void)
1031 {
1032 	verbs_register_driver("mlx5", mlx5_driver_init);
1033 }
1034