xref: /freebsd/contrib/ofed/libmlx5/mlx5.c (revision cc426dd31990b8b50b210efc450e404596548ca1)
1 /*
2  * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 #define _GNU_SOURCE
33 #include <config.h>
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <sys/mman.h>
40 #include <pthread.h>
41 #include <string.h>
42 #include <sched.h>
43 #include <sys/param.h>
44 #include <sys/cpuset.h>
45 
46 #include "mlx5.h"
47 #include "mlx5-abi.h"
48 
#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX			0x15b3
#endif

/*
 * Fallbacks used only when the cpuset headers do not provide these macros:
 * CPU_OR becomes a no-op and CPU_EQUAL always evaluates to 1.
 */
#ifndef CPU_OR
#define CPU_OR(x, y, z) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif


/* Build one hca_table entry from a vendor-name suffix and a PCI device id. */
#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v,			\
	  .device = d }

/*
 * PCI vendor/device pairs matched by mlx5_driver_init().  The table is
 * only ever read, so it is declared const.
 */
static const struct {
	unsigned		vendor;
	unsigned		device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),    /* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),    /* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
};
85 
/* Debug mask; overwritten from MLX5_DEBUG_MASK by set_debug_mask(). */
uint32_t mlx5_debug_mask = 0;
/* Overwritten from MLX5_FREEZE_ON_ERROR_CQE by set_freeze_on_error(). */
int mlx5_freeze_on_error_cqe;
88 
/*
 * Verb entry points copied into every context by mlx5_init_context().
 * Not const: mlx5_init_context() replaces .poll_cq with mlx5_poll_cq_v1
 * when the kernel reports CQE version 1.
 */
static struct ibv_context_ops mlx5_ctx_ops = {
	.query_device  = mlx5_query_device,
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr	       = mlx5_reg_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast
};
122 
/*
 * Parse the decimal integer that follows the first ':' on a
 * "key : value" line.
 *
 * line:  NUL-terminated text to scan.
 * value: receives the parsed number; left untouched on failure.
 *
 * Returns 0 on success, 1 when the line has no ':' or when what follows
 * it is not a number that fits in an int.  errno is preserved.
 */
static int read_number_from_line(const char *line, int *value)
{
	const char *ptr;
	char *end;
	long parsed;
	int saved_errno;

	ptr = strchr(line, ':');
	if (!ptr)
		return 1;

	++ptr;

	/* strtol reports "no digits" and overflow, which atoi cannot */
	saved_errno = errno;
	errno = 0;
	parsed = strtol(ptr, &end, 10);
	if (end == ptr || errno == ERANGE || (long)(int)parsed != parsed) {
		errno = saved_errno;
		return 1;
	}
	errno = saved_errno;

	*value = (int)parsed;
	return 0;
}
136 /**
137  * The function looks for the first free user-index in all the
138  * user-index tables. If all are used, returns -1, otherwise
139  * a valid user-index.
140  * In case the reference count of the table is zero, it means the
141  * table is not in use and wasn't allocated yet, therefore the
142  * mlx5_store_uidx allocates the table, and increment the reference
143  * count on the table.
144  */
145 static int32_t get_free_uidx(struct mlx5_context *ctx)
146 {
147 	int32_t tind;
148 	int32_t i;
149 
150 	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
151 		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
152 			break;
153 	}
154 
155 	if (tind == MLX5_UIDX_TABLE_SIZE)
156 		return -1;
157 
158 	if (!ctx->uidx_table[tind].refcnt)
159 		return tind << MLX5_UIDX_TABLE_SHIFT;
160 
161 	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
162 		if (!ctx->uidx_table[tind].table[i])
163 			break;
164 	}
165 
166 	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
167 }
168 
169 int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
170 {
171 	int32_t tind;
172 	int32_t ret = -1;
173 	int32_t uidx;
174 
175 	pthread_mutex_lock(&ctx->uidx_table_mutex);
176 	uidx = get_free_uidx(ctx);
177 	if (uidx < 0)
178 		goto out;
179 
180 	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
181 
182 	if (!ctx->uidx_table[tind].refcnt) {
183 		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
184 						     sizeof(struct mlx5_resource *));
185 		if (!ctx->uidx_table[tind].table)
186 			goto out;
187 	}
188 
189 	++ctx->uidx_table[tind].refcnt;
190 	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
191 	ret = uidx;
192 
193 out:
194 	pthread_mutex_unlock(&ctx->uidx_table_mutex);
195 	return ret;
196 }
197 
198 void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
199 {
200 	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
201 
202 	pthread_mutex_lock(&ctx->uidx_table_mutex);
203 
204 	if (!--ctx->uidx_table[tind].refcnt)
205 		free(ctx->uidx_table[tind].table);
206 	else
207 		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;
208 
209 	pthread_mutex_unlock(&ctx->uidx_table_mutex);
210 }
211 
/*
 * Scan /proc/cpuinfo for a processor entry with cpu family 6 and model
 * 0x2A or 0x2D.
 *
 * num_cores: receives the number of "processor" lines seen; it is set
 *            to 0 when the file cannot be opened.
 *
 * Returns 1 when such an entry is found, 0 otherwise (including when
 * /proc/cpuinfo cannot be opened).
 */
static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	/* initialise the output before any early return */
	*num_cores = 0;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	while (fgets(line, sizeof(line), fd)) {
		int value;

		/* if this is information on new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model  = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			/* only the first "model..." line of an entry is used */
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || (cur_cpu_model == 0x2D) ))
			rc = 1;
	}

	fclose(fd);
	return rc;
}
252 
253 /*
254 man cpuset
255 
256   This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
257   are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
258   words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
259   within a word are also in big-endian order.
260 
261   The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
262   the size of the bitmask.
263 
264   Examples of the Mask Format:
265 
266      00000001                        # just bit 0 set
267      40000000,00000000,00000000      # just bit 94 set
268      000000ff,00000000               # bits 32-39 set
269      00000000,000E3862               # 1,5,6,11-13,17-19 set
270 
271   A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:
272 
273      00000001,00000001,00010117
274 
275   The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
276   bit 4, and the "7" is for bits 2, 1, and 0.
277 */
278 static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
279 {
280 	char *p, buf[1024];
281 	char *env_value;
282 	uint32_t word;
283 	int i, k;
284 
285 	env_value = getenv("MLX5_LOCAL_CPUS");
286 	if (env_value)
287 		strncpy(buf, env_value, sizeof(buf));
288 	else {
289 		char fname[MAXPATHLEN];
290 
291 		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
292 			 ibv_get_device_name(ibdev));
293 
294 		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
295 			fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
296 			return;
297 		}
298 	}
299 
300 	p = strrchr(buf, ',');
301 	if (!p)
302 		p = buf;
303 
304 	i = 0;
305 	do {
306 		if (*p == ',') {
307 			*p = 0;
308 			p ++;
309 		}
310 
311 		word = strtoul(p, NULL, 16);
312 
313 		for (k = 0; word; ++k, word >>= 1)
314 			if (word & 1)
315 				CPU_SET(k+i, cpu_set);
316 
317 		if (p == buf)
318 			break;
319 
320 		p = strrchr(buf, ',');
321 		if (!p)
322 			p = buf;
323 
324 		i += 32;
325 	} while (i < CPU_SETSIZE);
326 }
327 
328 static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
329 {
330 	cpuset_t my_cpus, dev_local_cpus, result_set;
331 	int stall_enable;
332 	int ret;
333 	int num_cores;
334 
335 	if (!mlx5_is_sandy_bridge(&num_cores))
336 		return 0;
337 
338 	/* by default enable stall on sandy bridge arch */
339 	stall_enable = 1;
340 
341 	/*
342 	 * check if app is bound to cpu set that is inside
343 	 * of device local cpu set. Disable stalling if true
344 	 */
345 
346 	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
347 	CPU_ZERO(&my_cpus);
348 	CPU_ZERO(&dev_local_cpus);
349 	CPU_ZERO(&result_set);
350 	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
351 	    sizeof(my_cpus), &my_cpus);
352 	if (ret == -1) {
353 		if (errno == EINVAL)
354 			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
355 		else
356 			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
357 		goto out;
358 	}
359 
360 	/* get device local cpu set */
361 	mlx5_local_cpu_set(ibdev, &dev_local_cpus);
362 
363 	/* check if my cpu set is in dev cpu */
364 	CPU_OR(&result_set, &my_cpus);
365 	CPU_OR(&result_set, &dev_local_cpus);
366 	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;
367 
368 out:
369 	return stall_enable;
370 }
371 
372 static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
373 {
374 	char *env_value;
375 
376 	env_value = getenv("MLX5_STALL_CQ_POLL");
377 	if (env_value)
378 		/* check if cq stall is enforced by user */
379 		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
380 	else
381 		/* autodetect if we need to do cq polling */
382 		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);
383 
384 	env_value = getenv("MLX5_STALL_NUM_LOOP");
385 	if (env_value)
386 		mlx5_stall_num_loop = atoi(env_value);
387 
388 	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
389 	if (env_value)
390 		mlx5_stall_cq_poll_min = atoi(env_value);
391 
392 	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
393 	if (env_value)
394 		mlx5_stall_cq_poll_max = atoi(env_value);
395 
396 	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
397 	if (env_value)
398 		mlx5_stall_cq_inc_step = atoi(env_value);
399 
400 	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
401 	if (env_value)
402 		mlx5_stall_cq_dec_step = atoi(env_value);
403 
404 	ctx->stall_adaptive_enable = 0;
405 	ctx->stall_cycles = 0;
406 
407 	if (mlx5_stall_num_loop < 0) {
408 		ctx->stall_adaptive_enable = 1;
409 		ctx->stall_cycles = mlx5_stall_cq_poll_min;
410 	}
411 
412 }
413 
414 static int get_total_uuars(int page_size)
415 {
416 	int size = MLX5_DEF_TOT_UUARS;
417 	int uuars_in_page;
418 	char *env;
419 
420 	env = getenv("MLX5_TOTAL_UUARS");
421 	if (env)
422 		size = atoi(env);
423 
424 	if (size < 1)
425 		return -EINVAL;
426 
427 	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
428 	size = max(uuars_in_page, size);
429 	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
430 	if (size > MLX5_MAX_BFREGS)
431 		return -ENOMEM;
432 
433 	return size;
434 }
435 
436 static void open_debug_file(struct mlx5_context *ctx)
437 {
438 	char *env;
439 
440 	env = getenv("MLX5_DEBUG_FILE");
441 	if (!env) {
442 		ctx->dbg_fp = stderr;
443 		return;
444 	}
445 
446 	ctx->dbg_fp = fopen(env, "aw+");
447 	if (!ctx->dbg_fp) {
448 		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
449 		ctx->dbg_fp = stderr;
450 		return;
451 	}
452 }
453 
454 static void close_debug_file(struct mlx5_context *ctx)
455 {
456 	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
457 		fclose(ctx->dbg_fp);
458 }
459 
460 static void set_debug_mask(void)
461 {
462 	char *env;
463 
464 	env = getenv("MLX5_DEBUG_MASK");
465 	if (env)
466 		mlx5_debug_mask = strtol(env, NULL, 0);
467 }
468 
469 static void set_freeze_on_error(void)
470 {
471 	char *env;
472 
473 	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
474 	if (env)
475 		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
476 }
477 
/*
 * Read MLX5_POST_SEND_PREFER_BF.
 * Returns 0 only when the variable is exactly "0"; 1 when it is unset
 * or holds anything else.
 */
static int get_always_bf(void)
{
	const char *setting = getenv("MLX5_POST_SEND_PREFER_BF");

	if (setting && strcmp(setting, "0") == 0)
		return 0;

	return 1;
}
488 
/*
 * Read MLX5_SHUT_UP_BF.
 * Returns 0 when the variable is unset or exactly "0"; 1 for any other
 * value.
 */
static int get_shut_up_bf(void)
{
	const char *setting = getenv("MLX5_SHUT_UP_BF");

	if (setting == NULL)
		return 0;

	return strcmp(setting, "0") != 0;
}
499 
500 static int get_num_low_lat_uuars(int tot_uuars)
501 {
502 	char *env;
503 	int num = 4;
504 
505 	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
506 	if (env)
507 		num = atoi(env);
508 
509 	if (num < 0)
510 		return -EINVAL;
511 
512 	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
513 	return num;
514 }
515 
516 /* The library allocates an array of uuar contexts. The one in index zero does
517  * not to execersize odd/even policy so it can avoid a lock but it may not use
518  * blue flame. The upper ones, low_lat_uuars can use blue flame with no lock
519  * since they are assigned to one QP only. The rest can use blue flame but since
520  * they are shared they need a lock
521  */
522 static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
523 {
524 	if (uuarn == 0 || mlx5_single_threaded)
525 		return 0;
526 
527 	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
528 		return 0;
529 
530 	return 1;
531 }
532 
/*
 * Read MLX5_SINGLE_THREADED.
 * Returns 1 only when the variable is exactly "1"; 0 when it is unset
 * or holds anything else.
 */
static int single_threaded_app(void)
{
	const char *setting = getenv("MLX5_SINGLE_THREADED");

	if (setting == NULL)
		return 0;

	return strcmp(setting, "1") == 0;
}
544 
545 static int mlx5_cmd_get_context(struct mlx5_context *context,
546 				struct mlx5_alloc_ucontext *req,
547 				size_t req_len,
548 				struct mlx5_alloc_ucontext_resp *resp,
549 				size_t resp_len)
550 {
551 	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
552 				 req_len, &resp->ibv_resp, resp_len))
553 		return 0;
554 
555 	/* The ibv_cmd_get_context fails in older kernels when passing
556 	 * a request length that the kernel doesn't know.
557 	 * To avoid breaking compatibility of new libmlx5 and older
558 	 * kernels, when ibv_cmd_get_context fails with the full
559 	 * request length, we try once again with the legacy length.
560 	 * We repeat this process while reducing requested size based
561 	 * on the feature input size. To avoid this in the future, we
562 	 * will remove the check in kernel that requires fields unknown
563 	 * to the kernel to be cleared. This will require that any new
564 	 * feature that involves extending struct mlx5_alloc_ucontext
565 	 * will be accompanied by an indication in the form of one or
566 	 * more fields in struct mlx5_alloc_ucontext_resp. If the
567 	 * response value can be interpreted as feature not supported
568 	 * when the returned value is zero, this will suffice to
569 	 * indicate to the library that the request was ignored by the
570 	 * kernel, either because it is unaware or because it decided
571 	 * to do so. If zero is a valid response, we will add a new
572 	 * field that indicates whether the request was handled.
573 	 */
574 	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
575 				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
576 				 &resp->ibv_resp, resp_len))
577 		return 0;
578 
579 	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
580 				   offsetof(struct mlx5_alloc_ucontext,
581 					    cqe_version),
582 				   &resp->ibv_resp, resp_len);
583 }
584 
585 static int mlx5_map_internal_clock(struct mlx5_device *mdev,
586 				   struct ibv_context *ibv_ctx)
587 {
588 	struct mlx5_context *context = to_mctx(ibv_ctx);
589 	void *hca_clock_page;
590 	off_t offset = 0;
591 
592 	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
593 	hca_clock_page = mmap(NULL, mdev->page_size,
594 			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
595 			      mdev->page_size * offset);
596 
597 	if (hca_clock_page == MAP_FAILED) {
598 		fprintf(stderr, PFX
599 			"Warning: Timestamp available,\n"
600 			"but failed to mmap() hca core clock page.\n");
601 		return -1;
602 	}
603 
604 	context->hca_core_clock = hca_clock_page +
605 		(context->core_clock.offset & (mdev->page_size - 1));
606 	return 0;
607 }
608 
609 int mlx5dv_query_device(struct ibv_context *ctx_in,
610 			 struct mlx5dv_context *attrs_out)
611 {
612 	struct mlx5_context *mctx = to_mctx(ctx_in);
613 	uint64_t comp_mask_out = 0;
614 
615 	attrs_out->version   = 0;
616 	attrs_out->flags     = 0;
617 
618 	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
619 		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;
620 
621 	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
622 		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;
623 
624 	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
625 		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
626 		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
627 	}
628 
629 	attrs_out->comp_mask = comp_mask_out;
630 
631 	return 0;
632 }
633 
634 static int mlx5dv_get_qp(struct ibv_qp *qp_in,
635 			 struct mlx5dv_qp *qp_out)
636 {
637 	struct mlx5_qp *mqp = to_mqp(qp_in);
638 
639 	qp_out->comp_mask = 0;
640 	qp_out->dbrec     = mqp->db;
641 
642 	if (mqp->sq_buf_size)
643 		/* IBV_QPT_RAW_PACKET */
644 		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
645 	else
646 		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
647 	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
648 	qp_out->sq.stride  = 1 << mqp->sq.wqe_shift;
649 
650 	qp_out->rq.buf     = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
651 	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
652 	qp_out->rq.stride  = 1 << mqp->rq.wqe_shift;
653 
654 	qp_out->bf.reg    = mqp->bf->reg;
655 
656 	if (mqp->bf->uuarn > 0)
657 		qp_out->bf.size = mqp->bf->buf_size;
658 	else
659 		qp_out->bf.size = 0;
660 
661 	return 0;
662 }
663 
664 static int mlx5dv_get_cq(struct ibv_cq *cq_in,
665 			 struct mlx5dv_cq *cq_out)
666 {
667 	struct mlx5_cq *mcq = to_mcq(cq_in);
668 	struct mlx5_context *mctx = to_mctx(cq_in->context);
669 
670 	cq_out->comp_mask = 0;
671 	cq_out->cqn       = mcq->cqn;
672 	cq_out->cqe_cnt   = mcq->ibv_cq.cqe + 1;
673 	cq_out->cqe_size  = mcq->cqe_sz;
674 	cq_out->buf       = mcq->active_buf->buf;
675 	cq_out->dbrec     = mcq->dbrec;
676 	cq_out->uar	  = mctx->uar;
677 
678 	mcq->flags	 |= MLX5_CQ_FLAGS_DV_OWNED;
679 
680 	return 0;
681 }
682 
683 static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
684 			  struct mlx5dv_rwq *rwq_out)
685 {
686 	struct mlx5_rwq *mrwq = to_mrwq(wq_in);
687 
688 	rwq_out->comp_mask = 0;
689 	rwq_out->buf       = mrwq->pbuff;
690 	rwq_out->dbrec     = mrwq->recv_db;
691 	rwq_out->wqe_cnt   = mrwq->rq.wqe_cnt;
692 	rwq_out->stride    = 1 << mrwq->rq.wqe_shift;
693 
694 	return 0;
695 }
696 
697 static int mlx5dv_get_srq(struct ibv_srq *srq_in,
698 			  struct mlx5dv_srq *srq_out)
699 {
700 	struct mlx5_srq *msrq;
701 
702 	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);
703 
704 	srq_out->comp_mask = 0;
705 	srq_out->buf       = msrq->buf.buf;
706 	srq_out->dbrec     = msrq->db;
707 	srq_out->stride    = 1 << msrq->wqe_shift;
708 	srq_out->head      = msrq->head;
709 	srq_out->tail      = msrq->tail;
710 
711 	return 0;
712 }
713 
714 int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
715 {
716 	int ret = 0;
717 
718 	if (obj_type & MLX5DV_OBJ_QP)
719 		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
720 	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
721 		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
722 	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
723 		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
724 	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
725 		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);
726 
727 	return ret;
728 }
729 
730 static void adjust_uar_info(struct mlx5_device *mdev,
731 			    struct mlx5_context *context,
732 			    struct mlx5_alloc_ucontext_resp resp)
733 {
734 	if (!resp.log_uar_size && !resp.num_uars_per_page) {
735 		/* old kernel */
736 		context->uar_size = mdev->page_size;
737 		context->num_uars_per_page = 1;
738 		return;
739 	}
740 
741 	context->uar_size = 1 << resp.log_uar_size;
742 	context->num_uars_per_page = resp.num_uars_per_page;
743 }
744 
745 static int mlx5_init_context(struct verbs_device *vdev,
746 			     struct ibv_context *ctx, int cmd_fd)
747 {
748 	struct mlx5_context	       *context;
749 	struct mlx5_alloc_ucontext	req;
750 	struct mlx5_alloc_ucontext_resp resp;
751 	int				i;
752 	int				page_size;
753 	int				tot_uuars;
754 	int				low_lat_uuars;
755 	int				gross_uuars;
756 	int				j;
757 	off_t				offset;
758 	struct mlx5_device	       *mdev;
759 	struct verbs_context	       *v_ctx;
760 	struct ibv_port_attr		port_attr;
761 	struct ibv_device_attr_ex	device_attr;
762 	int				k;
763 	int				bfi;
764 	int				num_sys_page_map;
765 
766 	mdev = to_mdev(&vdev->device);
767 	v_ctx = verbs_get_ctx(ctx);
768 	page_size = mdev->page_size;
769 	mlx5_single_threaded = single_threaded_app();
770 
771 	context = to_mctx(ctx);
772 	context->ibv_ctx.cmd_fd = cmd_fd;
773 
774 	open_debug_file(context);
775 	set_debug_mask();
776 	set_freeze_on_error();
777 	if (gethostname(context->hostname, sizeof(context->hostname)))
778 		strcpy(context->hostname, "host_unknown");
779 
780 	tot_uuars = get_total_uuars(page_size);
781 	if (tot_uuars < 0) {
782 		errno = -tot_uuars;
783 		goto err_free;
784 	}
785 
786 	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
787 	if (low_lat_uuars < 0) {
788 		errno = -low_lat_uuars;
789 		goto err_free;
790 	}
791 
792 	if (low_lat_uuars > tot_uuars - 1) {
793 		errno = ENOMEM;
794 		goto err_free;
795 	}
796 
797 	memset(&req, 0, sizeof(req));
798 	memset(&resp, 0, sizeof(resp));
799 
800 	req.total_num_uuars = tot_uuars;
801 	req.num_low_latency_uuars = low_lat_uuars;
802 	req.cqe_version = MLX5_CQE_VERSION_V1;
803 	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;
804 
805 	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
806 				 sizeof(resp)))
807 		goto err_free;
808 
809 	context->max_num_qps		= resp.qp_tab_size;
810 	context->bf_reg_size		= resp.bf_reg_size;
811 	context->tot_uuars		= resp.tot_uuars;
812 	context->low_lat_uuars		= low_lat_uuars;
813 	context->cache_line_size	= resp.cache_line_size;
814 	context->max_sq_desc_sz = resp.max_sq_desc_sz;
815 	context->max_rq_desc_sz = resp.max_rq_desc_sz;
816 	context->max_send_wqebb	= resp.max_send_wqebb;
817 	context->num_ports	= resp.num_ports;
818 	context->max_recv_wr	= resp.max_recv_wr;
819 	context->max_srq_recv_wr = resp.max_srq_recv_wr;
820 
821 	context->cqe_version = resp.cqe_version;
822 	if (context->cqe_version) {
823 		if (context->cqe_version == MLX5_CQE_VERSION_V1)
824 			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
825 		else
826 			goto err_free;
827 	}
828 
829 	adjust_uar_info(mdev, context, resp);
830 
831 	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
832 	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
833 	if (!context->bfs) {
834 		errno = ENOMEM;
835 		goto err_free;
836 	}
837 
838 	context->cmds_supp_uhw = resp.cmds_supp_uhw;
839 	context->vendor_cap_flags = 0;
840 
841 	pthread_mutex_init(&context->qp_table_mutex, NULL);
842 	pthread_mutex_init(&context->srq_table_mutex, NULL);
843 	pthread_mutex_init(&context->uidx_table_mutex, NULL);
844 	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
845 		context->qp_table[i].refcnt = 0;
846 
847 	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
848 		context->uidx_table[i].refcnt = 0;
849 
850 	context->db_list = NULL;
851 
852 	pthread_mutex_init(&context->db_list_mutex, NULL);
853 
854 	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
855 	for (i = 0; i < num_sys_page_map; ++i) {
856 		offset = 0;
857 		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
858 		set_index(i, &offset);
859 		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
860 				       cmd_fd, page_size * offset);
861 		if (context->uar[i] == MAP_FAILED) {
862 			context->uar[i] = NULL;
863 			goto err_free_bf;
864 		}
865 	}
866 
867 	for (i = 0; i < num_sys_page_map; i++) {
868 		for (j = 0; j < context->num_uars_per_page; j++) {
869 			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
870 				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
871 				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
872 							MLX5_BF_OFFSET + k * context->bf_reg_size;
873 				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
874 				mlx5_spinlock_init(&context->bfs[bfi].lock);
875 				context->bfs[bfi].offset = 0;
876 				if (bfi)
877 					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
878 				context->bfs[bfi].uuarn = bfi;
879 			}
880 		}
881 	}
882 	context->hca_core_clock = NULL;
883 	if (resp.response_length + sizeof(resp.ibv_resp) >=
884 	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
885 	    sizeof(resp.hca_core_clock_offset) &&
886 	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
887 		context->core_clock.offset = resp.hca_core_clock_offset;
888 		mlx5_map_internal_clock(mdev, ctx);
889 	}
890 
891 	mlx5_spinlock_init(&context->lock32);
892 
893 	context->prefer_bf = get_always_bf();
894 	context->shut_up_bf = get_shut_up_bf();
895 	mlx5_read_env(&vdev->device, context);
896 
897 	mlx5_spinlock_init(&context->hugetlb_lock);
898 	TAILQ_INIT(&context->hugetlb_list);
899 
900 	context->ibv_ctx.ops = mlx5_ctx_ops;
901 
902 	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
903 	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
904 	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
905 	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
906 	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
907 	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
908 	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
909 	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
910 	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
911 	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
912 	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
913 	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
914 	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
915 	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
916 	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);
917 
918 	memset(&device_attr, 0, sizeof(device_attr));
919 	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
920 				  sizeof(struct ibv_device_attr_ex))) {
921 		context->cached_device_cap_flags =
922 			device_attr.orig_attr.device_cap_flags;
923 		context->atomic_cap = device_attr.orig_attr.atomic_cap;
924 		context->cached_tso_caps = device_attr.tso_caps;
925 	}
926 
927 	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
928 		memset(&port_attr, 0, sizeof(port_attr));
929 		if (!mlx5_query_port(ctx, j + 1, &port_attr))
930 			context->cached_link_layer[j] = port_attr.link_layer;
931 	}
932 
933 	return 0;
934 
935 err_free_bf:
936 	free(context->bfs);
937 
938 err_free:
939 	for (i = 0; i < MLX5_MAX_UARS; ++i) {
940 		if (context->uar[i])
941 			munmap(context->uar[i], page_size);
942 	}
943 	close_debug_file(context);
944 	return errno;
945 }
946 
947 static void mlx5_cleanup_context(struct verbs_device *device,
948 				 struct ibv_context *ibctx)
949 {
950 	struct mlx5_context *context = to_mctx(ibctx);
951 	int page_size = to_mdev(ibctx->device)->page_size;
952 	int i;
953 
954 	free(context->bfs);
955 	for (i = 0; i < MLX5_MAX_UARS; ++i) {
956 		if (context->uar[i])
957 			munmap(context->uar[i], page_size);
958 	}
959 	if (context->hca_core_clock)
960 		munmap(context->hca_core_clock - context->core_clock.offset,
961 		       page_size);
962 	close_debug_file(context);
963 }
964 
/* Context setup/teardown callbacks attached to each device by mlx5_driver_init(). */
static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};
969 
970 static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
971 					     int abi_version)
972 {
973 	char			value[8];
974 	struct mlx5_device     *dev;
975 	unsigned		vendor, device;
976 	int			i;
977 
978 	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
979 				value, sizeof value) < 0)
980 		return NULL;
981 	sscanf(value, "%i", &vendor);
982 
983 	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
984 				value, sizeof value) < 0)
985 		return NULL;
986 	sscanf(value, "%i", &device);
987 
988 	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
989 		if (vendor == hca_table[i].vendor &&
990 		    device == hca_table[i].device)
991 			goto found;
992 
993 	return NULL;
994 
995 found:
996 	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
997 	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
998 		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
999 			"(min supported %d, max supported %d)\n",
1000 			abi_version, uverbs_sys_path,
1001 			MLX5_UVERBS_MIN_ABI_VERSION,
1002 			MLX5_UVERBS_MAX_ABI_VERSION);
1003 		return NULL;
1004 	}
1005 
1006 	dev = calloc(1, sizeof *dev);
1007 	if (!dev) {
1008 		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
1009 			uverbs_sys_path);
1010 		return NULL;
1011 	}
1012 
1013 	dev->page_size   = sysconf(_SC_PAGESIZE);
1014 	dev->driver_abi_ver = abi_version;
1015 
1016 	dev->verbs_dev.ops = &mlx5_dev_ops;
1017 	dev->verbs_dev.sz = sizeof(*dev);
1018 	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
1019 		sizeof(struct ibv_context);
1020 
1021 	return &dev->verbs_dev;
1022 }
1023 
/*
 * Constructor function: registers mlx5_driver_init under the name "mlx5"
 * via verbs_register_driver().
 */
static __attribute__((constructor)) void mlx5_register_driver(void)
{
	verbs_register_driver("mlx5", mlx5_driver_init);
}
1028