xref: /freebsd/sys/dev/ena/ena_sysctl.c (revision e6bfd18d21b225af6a0ed67ceeaf1293b7b9eba5)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015-2021 Amazon.com, Inc. or its affiliates.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 #include <sys/cdefs.h>
31 #include <sys/param.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_rss.h"
35 
36 #include "ena_rss.h"
37 #include "ena_sysctl.h"
38 
/* Builders for the per-device sysctl subtrees. */
static void ena_sysctl_add_wd(struct ena_adapter *);
static void ena_sysctl_add_stats(struct ena_adapter *);
static void ena_sysctl_add_eni_metrics(struct ena_adapter *);
static void ena_sysctl_add_tuneables(struct ena_adapter *);

/* Handlers backing the read-write per-device sysctl entries. */
static int ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS);

/* Kernel option RSS prevents manipulation of key hash and indirection table. */
#ifndef RSS
static void ena_sysctl_add_rss(struct ena_adapter *);
static int ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_rss_indir_table(SYSCTL_HANDLER_ARGS);
#endif

/* Largest accepted ENI metrics sample interval, in seconds (one hour). */
#define ENI_METRICS_MAX_SAMPLE_INTERVAL 3600
/* RSS key rendered as two hex characters per byte plus the terminating NUL. */
#define ENA_HASH_KEY_MSG_SIZE (ENA_HASH_KEY_SIZE * 2 + 1)
59 
60 static SYSCTL_NODE(_hw, OID_AUTO, ena, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
61     "ENA driver parameters");
62 
63 /*
64  * Logging level for changing verbosity of the output
65  */
66 int ena_log_level = ENA_INFO;
67 SYSCTL_INT(_hw_ena, OID_AUTO, log_level, CTLFLAG_RWTUN, &ena_log_level, 0,
68     "Logging level indicating verbosity of the logs");
69 
70 SYSCTL_CONST_STRING(_hw_ena, OID_AUTO, driver_version, CTLFLAG_RD,
71     ENA_DRV_MODULE_VERSION, "ENA driver version");
72 
73 /*
74  * Use 9k mbufs for the Rx buffers. Default to 0 (use page size mbufs instead).
75  * Using 9k mbufs in low memory conditions might cause allocation to take a lot
76  * of time and lead to the OS instability as it needs to look for the contiguous
77  * pages.
78  * However, page size mbufs has a bit smaller throughput than 9k mbufs, so if
79  * the network performance is the priority, the 9k mbufs can be used.
80  */
81 int ena_enable_9k_mbufs = 0;
82 SYSCTL_INT(_hw_ena, OID_AUTO, enable_9k_mbufs, CTLFLAG_RDTUN,
83     &ena_enable_9k_mbufs, 0, "Use 9 kB mbufs for Rx descriptors");
84 
85 /*
86  * Force the driver to use large LLQ (Low Latency Queue) header. Defaults to
87  * false. This option may be important for platforms, which often handle packet
88  * headers on Tx with total header size greater than 96B, as it may
89  * reduce the latency.
90  * It also reduces the maximum Tx queue size by half, so it may cause more Tx
91  * packet drops.
92  */
93 bool ena_force_large_llq_header = false;
94 SYSCTL_BOOL(_hw_ena, OID_AUTO, force_large_llq_header, CTLFLAG_RDTUN,
95     &ena_force_large_llq_header, 0,
96     "Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum Tx queue size by half.\n");
97 
98 int ena_rss_table_size = ENA_RX_RSS_TABLE_SIZE;
99 
100 void
101 ena_sysctl_add_nodes(struct ena_adapter *adapter)
102 {
103 	ena_sysctl_add_wd(adapter);
104 	ena_sysctl_add_stats(adapter);
105 	ena_sysctl_add_eni_metrics(adapter);
106 	ena_sysctl_add_tuneables(adapter);
107 #ifndef RSS
108 	ena_sysctl_add_rss(adapter);
109 #endif
110 }
111 
112 static void
113 ena_sysctl_add_wd(struct ena_adapter *adapter)
114 {
115 	device_t dev;
116 
117 	struct sysctl_ctx_list *ctx;
118 	struct sysctl_oid *tree;
119 	struct sysctl_oid_list *child;
120 
121 	dev = adapter->pdev;
122 
123 	ctx = device_get_sysctl_ctx(dev);
124 	tree = device_get_sysctl_tree(dev);
125 	child = SYSCTL_CHILDREN(tree);
126 
127 	/* Sysctl calls for Watchdog service */
128 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "wd_active", CTLFLAG_RWTUN,
129 	    &adapter->wd_active, 0, "Watchdog is active");
130 
131 	SYSCTL_ADD_QUAD(ctx, child, OID_AUTO, "keep_alive_timeout",
132 	    CTLFLAG_RWTUN, &adapter->keep_alive_timeout,
133 	    "Timeout for Keep Alive messages");
134 
135 	SYSCTL_ADD_QUAD(ctx, child, OID_AUTO, "missing_tx_timeout",
136 	    CTLFLAG_RWTUN, &adapter->missing_tx_timeout,
137 	    "Timeout for TX completion");
138 
139 	SYSCTL_ADD_U32(ctx, child, OID_AUTO, "missing_tx_max_queues",
140 	    CTLFLAG_RWTUN, &adapter->missing_tx_max_queues, 0,
141 	    "Number of TX queues to check per run");
142 
143 	SYSCTL_ADD_U32(ctx, child, OID_AUTO, "missing_tx_threshold",
144 	    CTLFLAG_RWTUN, &adapter->missing_tx_threshold, 0,
145 	    "Max number of timeouted packets");
146 }
147 
148 static void
149 ena_sysctl_add_stats(struct ena_adapter *adapter)
150 {
151 	device_t dev;
152 
153 	struct ena_ring *tx_ring;
154 	struct ena_ring *rx_ring;
155 
156 	struct ena_hw_stats *hw_stats;
157 	struct ena_stats_dev *dev_stats;
158 	struct ena_stats_tx *tx_stats;
159 	struct ena_stats_rx *rx_stats;
160 	struct ena_com_stats_admin *admin_stats;
161 
162 	struct sysctl_ctx_list *ctx;
163 	struct sysctl_oid *tree;
164 	struct sysctl_oid_list *child;
165 
166 	struct sysctl_oid *queue_node, *tx_node, *rx_node, *hw_node;
167 	struct sysctl_oid *admin_node;
168 	struct sysctl_oid_list *queue_list, *tx_list, *rx_list, *hw_list;
169 	struct sysctl_oid_list *admin_list;
170 
171 #define QUEUE_NAME_LEN 32
172 	char namebuf[QUEUE_NAME_LEN];
173 	int i;
174 
175 	dev = adapter->pdev;
176 
177 	ctx = device_get_sysctl_ctx(dev);
178 	tree = device_get_sysctl_tree(dev);
179 	child = SYSCTL_CHILDREN(tree);
180 
181 	tx_ring = adapter->tx_ring;
182 	rx_ring = adapter->rx_ring;
183 
184 	hw_stats = &adapter->hw_stats;
185 	dev_stats = &adapter->dev_stats;
186 	admin_stats = &adapter->ena_dev->admin_queue.stats;
187 
188 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "wd_expired", CTLFLAG_RD,
189 	    &dev_stats->wd_expired, "Watchdog expiry count");
190 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "interface_up", CTLFLAG_RD,
191 	    &dev_stats->interface_up, "Network interface up count");
192 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "interface_down",
193 	    CTLFLAG_RD, &dev_stats->interface_down,
194 	    "Network interface down count");
195 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "admin_q_pause",
196 	    CTLFLAG_RD, &dev_stats->admin_q_pause, "Admin queue pauses");
197 
198 	for (i = 0; i < adapter->num_io_queues; ++i, ++tx_ring, ++rx_ring) {
199 		snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i);
200 
201 		queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
202 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
203 		queue_list = SYSCTL_CHILDREN(queue_node);
204 
205 		adapter->que[i].oid = queue_node;
206 
207 #ifdef RSS
208 		/* Common stats */
209 		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "cpu", CTLFLAG_RD,
210 		    &adapter->que[i].cpu, 0, "CPU affinity");
211 		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "domain", CTLFLAG_RD,
212 		    &adapter->que[i].domain, 0, "NUMA domain");
213 #endif
214 
215 		/* TX specific stats */
216 		tx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, "tx_ring",
217 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX ring");
218 		tx_list = SYSCTL_CHILDREN(tx_node);
219 
220 		tx_stats = &tx_ring->tx_stats;
221 
222 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "count",
223 		    CTLFLAG_RD, &tx_stats->cnt, "Packets sent");
224 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "bytes",
225 		    CTLFLAG_RD, &tx_stats->bytes, "Bytes sent");
226 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
227 		    "prepare_ctx_err", CTLFLAG_RD, &tx_stats->prepare_ctx_err,
228 		    "TX buffer preparation failures");
229 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
230 		    "dma_mapping_err", CTLFLAG_RD, &tx_stats->dma_mapping_err,
231 		    "DMA mapping failures");
232 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "doorbells",
233 		    CTLFLAG_RD, &tx_stats->doorbells, "Queue doorbells");
234 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
235 		    "missing_tx_comp", CTLFLAG_RD, &tx_stats->missing_tx_comp,
236 		    "TX completions missed");
237 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "bad_req_id",
238 		    CTLFLAG_RD, &tx_stats->bad_req_id, "Bad request id count");
239 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "mbuf_collapses",
240 		    CTLFLAG_RD, &tx_stats->collapse, "Mbuf collapse count");
241 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
242 		    "mbuf_collapse_err", CTLFLAG_RD, &tx_stats->collapse_err,
243 		    "Mbuf collapse failures");
244 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_wakeups",
245 		    CTLFLAG_RD, &tx_stats->queue_wakeup, "Queue wakeups");
246 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_stops",
247 		    CTLFLAG_RD, &tx_stats->queue_stop, "Queue stops");
248 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
249 		    "llq_buffer_copy", CTLFLAG_RD, &tx_stats->llq_buffer_copy,
250 		    "Header copies for llq transaction");
251 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
252 		    "unmask_interrupt_num", CTLFLAG_RD,
253 		    &tx_stats->unmask_interrupt_num,
254 		    "Unmasked interrupt count");
255 
256 		/* RX specific stats */
257 		rx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, "rx_ring",
258 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX ring");
259 		rx_list = SYSCTL_CHILDREN(rx_node);
260 
261 		rx_stats = &rx_ring->rx_stats;
262 
263 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "count",
264 		    CTLFLAG_RD, &rx_stats->cnt, "Packets received");
265 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bytes",
266 		    CTLFLAG_RD, &rx_stats->bytes, "Bytes received");
267 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "refil_partial",
268 		    CTLFLAG_RD, &rx_stats->refil_partial,
269 		    "Partial refilled mbufs");
270 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "csum_bad",
271 		    CTLFLAG_RD, &rx_stats->csum_bad, "Bad RX checksum");
272 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
273 		    "mbuf_alloc_fail", CTLFLAG_RD, &rx_stats->mbuf_alloc_fail,
274 		    "Failed mbuf allocs");
275 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
276 		    "mjum_alloc_fail", CTLFLAG_RD, &rx_stats->mjum_alloc_fail,
277 		    "Failed jumbo mbuf allocs");
278 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
279 		    "dma_mapping_err", CTLFLAG_RD, &rx_stats->dma_mapping_err,
280 		    "DMA mapping errors");
281 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bad_desc_num",
282 		    CTLFLAG_RD, &rx_stats->bad_desc_num,
283 		    "Bad descriptor count");
284 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bad_req_id",
285 		    CTLFLAG_RD, &rx_stats->bad_req_id, "Bad request id count");
286 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "empty_rx_ring",
287 		    CTLFLAG_RD, &rx_stats->empty_rx_ring,
288 		    "RX descriptors depletion count");
289 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "csum_good",
290 		    CTLFLAG_RD, &rx_stats->csum_good,
291 		    "Valid RX checksum calculations");
292 	}
293 
294 	/* Stats read from device */
295 	hw_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "hw_stats",
296 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Statistics from hardware");
297 	hw_list = SYSCTL_CHILDREN(hw_node);
298 
299 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_packets", CTLFLAG_RD,
300 	    &hw_stats->rx_packets, "Packets received");
301 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_packets", CTLFLAG_RD,
302 	    &hw_stats->tx_packets, "Packets transmitted");
303 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_bytes", CTLFLAG_RD,
304 	    &hw_stats->rx_bytes, "Bytes received");
305 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_bytes", CTLFLAG_RD,
306 	    &hw_stats->tx_bytes, "Bytes transmitted");
307 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_drops", CTLFLAG_RD,
308 	    &hw_stats->rx_drops, "Receive packet drops");
309 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_drops", CTLFLAG_RD,
310 	    &hw_stats->tx_drops, "Transmit packet drops");
311 
312 	/* ENA Admin queue stats */
313 	admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "admin_stats",
314 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA Admin Queue statistics");
315 	admin_list = SYSCTL_CHILDREN(admin_node);
316 
317 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "aborted_cmd", CTLFLAG_RD,
318 	    &admin_stats->aborted_cmd, 0, "Aborted commands");
319 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "sumbitted_cmd", CTLFLAG_RD,
320 	    &admin_stats->submitted_cmd, 0, "Submitted commands");
321 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "completed_cmd", CTLFLAG_RD,
322 	    &admin_stats->completed_cmd, 0, "Completed commands");
323 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "out_of_space", CTLFLAG_RD,
324 	    &admin_stats->out_of_space, 0, "Queue out of space");
325 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "no_completion", CTLFLAG_RD,
326 	    &admin_stats->no_completion, 0, "Commands not completed");
327 }
328 
329 static void
330 ena_sysctl_add_eni_metrics(struct ena_adapter *adapter)
331 {
332 	device_t dev;
333 	struct ena_admin_eni_stats *eni_metrics;
334 
335 	struct sysctl_ctx_list *ctx;
336 	struct sysctl_oid *tree;
337 	struct sysctl_oid_list *child;
338 
339 	struct sysctl_oid *eni_node;
340 	struct sysctl_oid_list *eni_list;
341 
342 	dev = adapter->pdev;
343 
344 	ctx = device_get_sysctl_ctx(dev);
345 	tree = device_get_sysctl_tree(dev);
346 	child = SYSCTL_CHILDREN(tree);
347 
348 	eni_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "eni_metrics",
349 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA's ENI metrics");
350 	eni_list = SYSCTL_CHILDREN(eni_node);
351 
352 	eni_metrics = &adapter->eni_metrics;
353 
354 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "bw_in_allowance_exceeded",
355 	    CTLFLAG_RD, &eni_metrics->bw_in_allowance_exceeded, 0,
356 	    "Inbound BW allowance exceeded");
357 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "bw_out_allowance_exceeded",
358 	    CTLFLAG_RD, &eni_metrics->bw_out_allowance_exceeded, 0,
359 	    "Outbound BW allowance exceeded");
360 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "pps_allowance_exceeded",
361 	    CTLFLAG_RD, &eni_metrics->pps_allowance_exceeded, 0,
362 	    "PPS allowance exceeded");
363 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "conntrack_allowance_exceeded",
364 	    CTLFLAG_RD, &eni_metrics->conntrack_allowance_exceeded, 0,
365 	    "Connection tracking allowance exceeded");
366 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "linklocal_allowance_exceeded",
367 	    CTLFLAG_RD, &eni_metrics->linklocal_allowance_exceeded, 0,
368 	    "Linklocal packet rate allowance exceeded");
369 
370 	/*
371 	 * Tuneable, which determines how often ENI metrics will be read.
372 	 * 0 means it's turned off. Maximum allowed value is limited by:
373 	 * ENI_METRICS_MAX_SAMPLE_INTERVAL.
374 	 */
375 	SYSCTL_ADD_PROC(ctx, eni_list, OID_AUTO, "sample_interval",
376 	    CTLTYPE_U16 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
377 	    ena_sysctl_eni_metrics_interval, "SU",
378 	    "Interval in seconds for updating ENI emetrics. 0 turns off the update.");
379 }
380 
381 static void
382 ena_sysctl_add_tuneables(struct ena_adapter *adapter)
383 {
384 	device_t dev;
385 
386 	struct sysctl_ctx_list *ctx;
387 	struct sysctl_oid *tree;
388 	struct sysctl_oid_list *child;
389 
390 	dev = adapter->pdev;
391 
392 	ctx = device_get_sysctl_ctx(dev);
393 	tree = device_get_sysctl_tree(dev);
394 	child = SYSCTL_CHILDREN(tree);
395 
396 	/* Tuneable number of buffers in the buf-ring (drbr) */
397 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "buf_ring_size",
398 	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
399 	    ena_sysctl_buf_ring_size, "I",
400 	    "Size of the Tx buffer ring (drbr).");
401 
402 	/* Tuneable number of the Rx ring size */
403 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_queue_size",
404 	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
405 	    ena_sysctl_rx_queue_size, "I",
406 	    "Size of the Rx ring. The size should be a power of 2.");
407 
408 	/* Tuneable number of IO queues */
409 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "io_queues_nb",
410 	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
411 	    ena_sysctl_io_queues_nb, "I", "Number of IO queues.");
412 }
413 
414 /* Kernel option RSS prevents manipulation of key hash and indirection table. */
415 #ifndef RSS
416 static void
417 ena_sysctl_add_rss(struct ena_adapter *adapter)
418 {
419 	device_t dev;
420 
421 	struct sysctl_ctx_list *ctx;
422 	struct sysctl_oid *tree;
423 	struct sysctl_oid_list *child;
424 
425 	dev = adapter->pdev;
426 
427 	ctx = device_get_sysctl_ctx(dev);
428 	tree = device_get_sysctl_tree(dev);
429 	child = SYSCTL_CHILDREN(tree);
430 
431 	/* RSS options */
432 	tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rss",
433 	    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Receive Side Scaling options.");
434 	child = SYSCTL_CHILDREN(tree);
435 
436 	/* RSS hash key */
437 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "key",
438 	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
439 	    ena_sysctl_rss_key, "A", "RSS key.");
440 
441 	/* Tuneable RSS indirection table */
442 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "indir_table",
443 	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
444 	    ena_sysctl_rss_indir_table, "A", "RSS indirection table.");
445 
446 	/* RSS indirection table size */
447 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "indir_table_size",
448 	    CTLFLAG_RD | CTLFLAG_MPSAFE, &ena_rss_table_size, 0,
449 	    "RSS indirection table size.");
450 }
451 #endif /* RSS */
452 
453 
454 /*
455  * ena_sysctl_update_queue_node_nb - Register/unregister sysctl queue nodes.
456  *
457  * Whether the nodes are registered or unregistered depends on a delta between
458  * the `old` and `new` parameters, representing the number of queues.
459  *
460  * This function is used to hide sysctl attributes for queue nodes which aren't
461  * currently used by the HW (e.g. after a call to `ena_sysctl_io_queues_nb`).
462  *
463  * NOTE:
464  * All unregistered nodes must be registered again at detach, i.e. by a call to
465  * this function.
466  */
467 void
468 ena_sysctl_update_queue_node_nb(struct ena_adapter *adapter, int old, int new)
469 {
470 	struct sysctl_oid *oid;
471 	int min, max, i;
472 
473 	min = MIN(old, new);
474 	max = MIN(MAX(old, new), adapter->max_num_io_queues);
475 
476 	for (i = min; i < max; ++i) {
477 		oid = adapter->que[i].oid;
478 
479 		sysctl_wlock();
480 		if (old > new)
481 			sysctl_unregister_oid(oid);
482 		else
483 			sysctl_register_oid(oid);
484 		sysctl_wunlock();
485 	}
486 }
487 
488 static int
489 ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS)
490 {
491 	struct ena_adapter *adapter = arg1;
492 	uint32_t val;
493 	int error;
494 
495 	ENA_LOCK_LOCK();
496 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
497 		error = EINVAL;
498 		goto unlock;
499 	}
500 
501 	val = 0;
502 	error = sysctl_wire_old_buffer(req, sizeof(val));
503 	if (error == 0) {
504 		val = adapter->buf_ring_size;
505 		error = sysctl_handle_32(oidp, &val, 0, req);
506 	}
507 	if (error != 0 || req->newptr == NULL)
508 		goto unlock;
509 
510 	if (!powerof2(val) || val == 0) {
511 		ena_log(adapter->pdev, ERR,
512 		    "Requested new Tx buffer ring size (%u) is not a power of 2\n",
513 		    val);
514 		error = EINVAL;
515 		goto unlock;
516 	}
517 
518 	if (val != adapter->buf_ring_size) {
519 		ena_log(adapter->pdev, INFO,
520 		    "Requested new Tx buffer ring size: %d. Old size: %d\n",
521 		    val, adapter->buf_ring_size);
522 
523 		error = ena_update_buf_ring_size(adapter, val);
524 	} else {
525 		ena_log(adapter->pdev, ERR,
526 		    "New Tx buffer ring size is the same as already used: %u\n",
527 		    adapter->buf_ring_size);
528 	}
529 
530 unlock:
531 	ENA_LOCK_UNLOCK();
532 
533 	return (error);
534 }
535 
536 static int
537 ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS)
538 {
539 	struct ena_adapter *adapter = arg1;
540 	uint32_t val;
541 	int error;
542 
543 	ENA_LOCK_LOCK();
544 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
545 		error = EINVAL;
546 		goto unlock;
547 	}
548 
549 	val = 0;
550 	error = sysctl_wire_old_buffer(req, sizeof(val));
551 	if (error == 0) {
552 		val = adapter->requested_rx_ring_size;
553 		error = sysctl_handle_32(oidp, &val, 0, req);
554 	}
555 	if (error != 0 || req->newptr == NULL)
556 		goto unlock;
557 
558 	if (val < ENA_MIN_RING_SIZE || val > adapter->max_rx_ring_size) {
559 		ena_log(adapter->pdev, ERR,
560 		    "Requested new Rx queue size (%u) is out of range: [%u, %u]\n",
561 		    val, ENA_MIN_RING_SIZE, adapter->max_rx_ring_size);
562 		error = EINVAL;
563 		goto unlock;
564 	}
565 
566 	/* Check if the parameter is power of 2 */
567 	if (!powerof2(val)) {
568 		ena_log(adapter->pdev, ERR,
569 		    "Requested new Rx queue size (%u) is not a power of 2\n",
570 		    val);
571 		error = EINVAL;
572 		goto unlock;
573 	}
574 
575 	if (val != adapter->requested_rx_ring_size) {
576 		ena_log(adapter->pdev, INFO,
577 		    "Requested new Rx queue size: %u. Old size: %u\n", val,
578 		    adapter->requested_rx_ring_size);
579 
580 		error = ena_update_queue_size(adapter,
581 		    adapter->requested_tx_ring_size, val);
582 	} else {
583 		ena_log(adapter->pdev, ERR,
584 		    "New Rx queue size is the same as already used: %u\n",
585 		    adapter->requested_rx_ring_size);
586 	}
587 
588 unlock:
589 	ENA_LOCK_UNLOCK();
590 
591 	return (error);
592 }
593 
594 /*
595  * Change number of effectively used IO queues adapter->num_io_queues
596  */
597 static int
598 ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS)
599 {
600 	struct ena_adapter *adapter = arg1;
601 	uint32_t old_num_queues, tmp = 0;
602 	int error;
603 
604 	ENA_LOCK_LOCK();
605 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
606 		error = EINVAL;
607 		goto unlock;
608 	}
609 
610 	error = sysctl_wire_old_buffer(req, sizeof(tmp));
611 	if (error == 0) {
612 		tmp = adapter->num_io_queues;
613 		error = sysctl_handle_int(oidp, &tmp, 0, req);
614 	}
615 	if (error != 0 || req->newptr == NULL)
616 		goto unlock;
617 
618 	if (tmp == 0) {
619 		ena_log(adapter->pdev, ERR,
620 		    "Requested number of IO queues is zero\n");
621 		error = EINVAL;
622 		goto unlock;
623 	}
624 
625 	/*
626 	 * The adapter::max_num_io_queues is the HW capability. The system
627 	 * resources availability may potentially be a tighter limit. Therefore
628 	 * the relation `adapter::max_num_io_queues >= adapter::msix_vecs`
629 	 * always holds true, while the `adapter::msix_vecs` is variable across
630 	 * device reset (`ena_destroy_device()` + `ena_restore_device()`).
631 	 */
632 	if (tmp > (adapter->msix_vecs - ENA_ADMIN_MSIX_VEC)) {
633 		ena_log(adapter->pdev, ERR,
634 		    "Requested number of IO queues is higher than maximum allowed (%u)\n",
635 		    adapter->msix_vecs - ENA_ADMIN_MSIX_VEC);
636 		error = EINVAL;
637 		goto unlock;
638 	}
639 	if (tmp == adapter->num_io_queues) {
640 		ena_log(adapter->pdev, ERR,
641 		    "Requested number of IO queues is equal to current value "
642 		    "(%u)\n",
643 		    adapter->num_io_queues);
644 	} else {
645 		ena_log(adapter->pdev, INFO,
646 		    "Requested new number of IO queues: %u, current value: "
647 		    "%u\n",
648 		    tmp, adapter->num_io_queues);
649 
650 		old_num_queues = adapter->num_io_queues;
651 		error = ena_update_io_queue_nb(adapter, tmp);
652 		if (error != 0)
653 			return (error);
654 
655 		ena_sysctl_update_queue_node_nb(adapter, old_num_queues, tmp);
656 	}
657 
658 unlock:
659 	ENA_LOCK_UNLOCK();
660 
661 	return (error);
662 }
663 
664 static int
665 ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS)
666 {
667 	struct ena_adapter *adapter = arg1;
668 	uint16_t interval;
669 	int error;
670 
671 	ENA_LOCK_LOCK();
672 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
673 		error = EINVAL;
674 		goto unlock;
675 	}
676 
677 	error = sysctl_wire_old_buffer(req, sizeof(interval));
678 	if (error == 0) {
679 		interval = adapter->eni_metrics_sample_interval;
680 		error = sysctl_handle_16(oidp, &interval, 0, req);
681 	}
682 	if (error != 0 || req->newptr == NULL)
683 		goto unlock;
684 
685 	if (interval > ENI_METRICS_MAX_SAMPLE_INTERVAL) {
686 		ena_log(adapter->pdev, ERR,
687 		    "ENI metrics update interval is out of range - maximum allowed value: %d seconds\n",
688 		    ENI_METRICS_MAX_SAMPLE_INTERVAL);
689 		error = EINVAL;
690 		goto unlock;
691 	}
692 
693 	if (interval == 0) {
694 		ena_log(adapter->pdev, INFO,
695 		    "ENI metrics update is now turned off\n");
696 		bzero(&adapter->eni_metrics, sizeof(adapter->eni_metrics));
697 	} else {
698 		ena_log(adapter->pdev, INFO,
699 		    "ENI metrics update interval is set to: %" PRIu16
700 		    " seconds\n",
701 		    interval);
702 	}
703 
704 	adapter->eni_metrics_sample_interval = interval;
705 
706 unlock:
707 	ENA_LOCK_UNLOCK();
708 
709 	return (0);
710 }
711 
712 #ifndef RSS
713 /*
714  * Change the Receive Side Scaling hash key.
715  */
716 static int
717 ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS)
718 {
719 	struct ena_adapter *adapter = arg1;
720 	struct ena_com_dev *ena_dev = adapter->ena_dev;
721 	enum ena_admin_hash_functions ena_func;
722 	char msg[ENA_HASH_KEY_MSG_SIZE];
723 	char elem[3] = { 0 };
724 	char *endp;
725 	u8 rss_key[ENA_HASH_KEY_SIZE];
726 	int error, i;
727 
728 	ENA_LOCK_LOCK();
729 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
730 		error = EINVAL;
731 		goto unlock;
732 	}
733 
734 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) {
735 		error = ENOTSUP;
736 		goto unlock;
737 	}
738 
739 	error = sysctl_wire_old_buffer(req, sizeof(msg));
740 	if (error != 0)
741 		goto unlock;
742 
743 	error = ena_com_get_hash_function(adapter->ena_dev, &ena_func);
744 	if (error != 0) {
745 		device_printf(adapter->pdev, "Cannot get hash function\n");
746 		goto unlock;
747 	}
748 
749 	if (ena_func != ENA_ADMIN_TOEPLITZ) {
750 		error = EINVAL;
751 		device_printf(adapter->pdev, "Unsupported hash algorithm\n");
752 		goto unlock;
753 	}
754 
755 	error = ena_rss_get_hash_key(ena_dev, rss_key);
756 	if (error != 0) {
757 		device_printf(adapter->pdev, "Cannot get hash key\n");
758 		goto unlock;
759 	}
760 
761 	for (i = 0; i < ENA_HASH_KEY_SIZE; ++i)
762 		snprintf(&msg[i * 2], 3, "%02x", rss_key[i]);
763 
764 	error = sysctl_handle_string(oidp, msg, sizeof(msg), req);
765 	if (error != 0 || req->newptr == NULL)
766 		goto unlock;
767 
768 	if (strlen(msg) != sizeof(msg) - 1) {
769 		error = EINVAL;
770 		device_printf(adapter->pdev, "Invalid key size\n");
771 		goto unlock;
772 	}
773 
774 	for (i = 0; i < ENA_HASH_KEY_SIZE; ++i) {
775 		strncpy(elem, &msg[i * 2], 2);
776 		rss_key[i] = strtol(elem, &endp, 16);
777 
778 		/* Both hex nibbles in the string must be valid to continue. */
779 		if (endp == elem || *endp != '\0' || rss_key[i] < 0) {
780 			error = EINVAL;
781 			device_printf(adapter->pdev,
782 			    "Invalid key hex value: '%c'\n", *endp);
783 			goto unlock;
784 		}
785 	}
786 
787 	error = ena_rss_set_hash(ena_dev, rss_key);
788 	if (error != 0)
789 		device_printf(adapter->pdev, "Cannot fill hash key\n");
790 
791 unlock:
792 	ENA_LOCK_UNLOCK();
793 
794 	return (error);
795 }
796 
797 /*
798  * Change the Receive Side Scaling indirection table.
799  *
800  * The sysctl entry string consists of one or more `x:y` keypairs, where
801  * x stands for the table index and y for its new value.
802  * Table indices that don't need to be updated can be omitted from the string
803  * and will retain their existing values. If an index is entered more than once,
804  * the last value is used.
805  *
806  * Example:
807  * To update two selected indices in the RSS indirection table, e.g. setting
808  * index 0 to queue 5 and then index 5 to queue 0, the below command should be
809  * used:
810  *   sysctl dev.ena.0.rss.indir_table="0:5 5:0"
811  */
812 static int
813 ena_sysctl_rss_indir_table(SYSCTL_HANDLER_ARGS)
814 {
815 	int num_queues, error;
816 	struct ena_adapter *adapter = arg1;
817 	struct ena_indir *indir;
818 	char *msg, *buf, *endp;
819 	uint32_t idx, value;
820 
821 	ENA_LOCK_LOCK();
822 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
823 		error = EINVAL;
824 		goto unlock;
825 	}
826 
827 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) {
828 		error = ENOTSUP;
829 		goto unlock;
830 	}
831 
832 	indir = adapter->rss_indir;
833 	msg = indir->sysctl_buf;
834 
835 	if (unlikely(indir == NULL)) {
836 		error = ENOTSUP;
837 		goto unlock;
838 	}
839 
840 	error = sysctl_handle_string(oidp, msg, sizeof(indir->sysctl_buf), req);
841 	if (error != 0 || req->newptr == NULL)
842 		goto unlock;
843 
844 	num_queues = adapter->num_io_queues;
845 
846 	/*
847 	 * This sysctl expects msg to be a list of `x:y` record pairs,
848 	 * where x is the indirection table index and y is its value.
849 	 */
850 	for (buf = msg; *buf != '\0'; buf = endp) {
851 		idx = strtol(buf, &endp, 10);
852 
853 		if (endp == buf || idx < 0) {
854 			device_printf(adapter->pdev, "Invalid index: %s\n",
855 			    buf);
856 			error = EINVAL;
857 			break;
858 		}
859 
860 		if (idx >= ENA_RX_RSS_TABLE_SIZE) {
861 			device_printf(adapter->pdev, "Index %d out of range\n",
862 			    idx);
863 			error = ERANGE;
864 			break;
865 		}
866 
867 		buf = endp;
868 
869 		if (*buf++ != ':') {
870 			device_printf(adapter->pdev, "Missing ':' separator\n");
871 			error = EINVAL;
872 			break;
873 		}
874 
875 		value = strtol(buf, &endp, 10);
876 
877 		if (endp == buf || value < 0) {
878 			device_printf(adapter->pdev, "Invalid value: %s\n",
879 			    buf);
880 			error = EINVAL;
881 			break;
882 		}
883 
884 		if (value >= num_queues) {
885 			device_printf(adapter->pdev, "Value %d out of range\n",
886 			    value);
887 			error = ERANGE;
888 			break;
889 		}
890 
891 		indir->table[idx] = value;
892 	}
893 
894 	if (error != 0) /* Reload indirection table with last good data. */
895 		ena_rss_indir_get(adapter, indir->table);
896 
897 	/* At this point msg has been clobbered by sysctl_handle_string. */
898 	ena_rss_copy_indir_buf(msg, indir->table);
899 
900 	if (error == 0)
901 		error = ena_rss_indir_set(adapter, indir->table);
902 
903 unlock:
904 	ENA_LOCK_UNLOCK();
905 
906 	return (error);
907 }
908 #endif /* RSS */
909