xref: /freebsd/sys/dev/ena/ena_sysctl.c (revision 0784121c963e39aa9e8b33c4e0a0c181daf75277)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015-2021 Amazon.com, Inc. or its affiliates.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 #include <sys/cdefs.h>
31 #include <sys/param.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_rss.h"
35 
36 #include "ena_sysctl.h"
37 #include "ena_rss.h"
38 
/*
 * Forward declarations: first the helpers which register the sysctl nodes,
 * then the sysctl handler procedures. All of them are defined later in this
 * file.
 */
static void	ena_sysctl_add_wd(struct ena_adapter *);
static void	ena_sysctl_add_stats(struct ena_adapter *);
static void	ena_sysctl_add_eni_metrics(struct ena_adapter *);
static void	ena_sysctl_add_tuneables(struct ena_adapter *);
/* Kernel option RSS prevents manipulation of key hash and indirection table. */
#ifndef RSS
static void	ena_sysctl_add_rss(struct ena_adapter *);
#endif
static int	ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS);
static int	ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS);
static int	ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS);
static int	ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int	ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS);
static int	ena_sysctl_rss_indir_table(SYSCTL_HANDLER_ARGS);
#endif
55 
/*
 * Limit max ENI sample rate to be an hour. The value is expressed in seconds
 * and is the upper bound accepted by the `sample_interval` sysctl handler.
 */
#define ENI_METRICS_MAX_SAMPLE_INTERVAL 3600
/*
 * Size of the buffer holding the RSS hash key printed as a hex string: two
 * characters per key byte plus the terminating NUL.
 */
#define ENA_HASH_KEY_MSG_SIZE		(ENA_HASH_KEY_SIZE * 2 + 1)

/* Parent node (hw.ena) of the driver-wide sysctls declared below. */
static SYSCTL_NODE(_hw, OID_AUTO, ena, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ENA driver parameters");

/*
 * Logging level for changing verbosity of the output
 */
int ena_log_level = ENA_INFO;
SYSCTL_INT(_hw_ena, OID_AUTO, log_level, CTLFLAG_RWTUN,
    &ena_log_level, 0, "Logging level indicating verbosity of the logs");

/* Read-only string exposing DRV_MODULE_VERSION. */
SYSCTL_CONST_STRING(_hw_ena, OID_AUTO, driver_version, CTLFLAG_RD,
    DRV_MODULE_VERSION, "ENA driver version");

/*
 * Use 9k mbufs for the Rx buffers. Default to 0 (use page size mbufs instead).
 * Using 9k mbufs in low memory conditions might cause allocation to take a lot
 * of time and lead to the OS instability as it needs to look for the contiguous
 * pages.
 * However, page size mbufs has a bit smaller throughput than 9k mbufs, so if
 * the network performance is the priority, the 9k mbufs can be used.
 */
int ena_enable_9k_mbufs = 0;
SYSCTL_INT(_hw_ena, OID_AUTO, enable_9k_mbufs, CTLFLAG_RDTUN,
    &ena_enable_9k_mbufs, 0, "Use 9 kB mbufs for Rx descriptors");

/*
 * Force the driver to use large LLQ (Low Latency Queue) header. Defaults to
 * false. This option may be important for platforms, which often handle packet
 * headers on Tx with total header size greater than 96B, as it may
 * reduce the latency.
 * It also reduces the maximum Tx queue size by half, so it may cause more Tx
 * packet drops.
 */
bool ena_force_large_llq_header = false;
SYSCTL_BOOL(_hw_ena, OID_AUTO, force_large_llq_header, CTLFLAG_RDTUN,
    &ena_force_large_llq_header, 0,
    "Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum Tx queue size by half.\n");

/*
 * Variable initialized to ENA_RX_RSS_TABLE_SIZE. Its address is handed to the
 * read-only `indir_table_size` sysctl registered by ena_sysctl_add_rss().
 */
int ena_rss_table_size = ENA_RX_RSS_TABLE_SIZE;
99 
/*
 * Register all per-device sysctl nodes of the adapter: watchdog settings,
 * statistics, ENI metrics and tuneables. The RSS nodes are added only when
 * the kernel is built without the RSS option.
 */
void
ena_sysctl_add_nodes(struct ena_adapter *adapter)
{
	ena_sysctl_add_wd(adapter);
	ena_sysctl_add_stats(adapter);
	ena_sysctl_add_eni_metrics(adapter);
	ena_sysctl_add_tuneables(adapter);
#ifndef RSS
	ena_sysctl_add_rss(adapter);
#endif
}
111 
112 static void
113 ena_sysctl_add_wd(struct ena_adapter *adapter)
114 {
115 	device_t dev;
116 
117 	struct sysctl_ctx_list *ctx;
118 	struct sysctl_oid *tree;
119 	struct sysctl_oid_list *child;
120 
121 	dev = adapter->pdev;
122 
123 	ctx = device_get_sysctl_ctx(dev);
124 	tree = device_get_sysctl_tree(dev);
125 	child = SYSCTL_CHILDREN(tree);
126 
127 	/* Sysctl calls for Watchdog service */
128 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "wd_active",
129 	    CTLFLAG_RWTUN, &adapter->wd_active, 0,
130 	    "Watchdog is active");
131 
132 	SYSCTL_ADD_QUAD(ctx, child, OID_AUTO, "keep_alive_timeout",
133 	    CTLFLAG_RWTUN, &adapter->keep_alive_timeout,
134 	    "Timeout for Keep Alive messages");
135 
136 	SYSCTL_ADD_QUAD(ctx, child, OID_AUTO, "missing_tx_timeout",
137 	    CTLFLAG_RWTUN, &adapter->missing_tx_timeout,
138 	    "Timeout for TX completion");
139 
140 	SYSCTL_ADD_U32(ctx, child, OID_AUTO, "missing_tx_max_queues",
141 	    CTLFLAG_RWTUN, &adapter->missing_tx_max_queues, 0,
142 	    "Number of TX queues to check per run");
143 
144 	SYSCTL_ADD_U32(ctx, child, OID_AUTO, "missing_tx_threshold",
145 	    CTLFLAG_RWTUN, &adapter->missing_tx_threshold, 0,
146 	    "Max number of timeouted packets");
147 }
148 
149 static void
150 ena_sysctl_add_stats(struct ena_adapter *adapter)
151 {
152 	device_t dev;
153 
154 	struct ena_ring *tx_ring;
155 	struct ena_ring *rx_ring;
156 
157 	struct ena_hw_stats *hw_stats;
158 	struct ena_stats_dev *dev_stats;
159 	struct ena_stats_tx *tx_stats;
160 	struct ena_stats_rx *rx_stats;
161 	struct ena_com_stats_admin *admin_stats;
162 
163 	struct sysctl_ctx_list *ctx;
164 	struct sysctl_oid *tree;
165 	struct sysctl_oid_list *child;
166 
167 	struct sysctl_oid *queue_node, *tx_node, *rx_node, *hw_node;
168 	struct sysctl_oid *admin_node;
169 	struct sysctl_oid_list *queue_list, *tx_list, *rx_list, *hw_list;
170 	struct sysctl_oid_list *admin_list;
171 
172 #define QUEUE_NAME_LEN 32
173 	char namebuf[QUEUE_NAME_LEN];
174 	int i;
175 
176 	dev = adapter->pdev;
177 
178 	ctx = device_get_sysctl_ctx(dev);
179 	tree = device_get_sysctl_tree(dev);
180 	child = SYSCTL_CHILDREN(tree);
181 
182 	tx_ring = adapter->tx_ring;
183 	rx_ring = adapter->rx_ring;
184 
185 	hw_stats = &adapter->hw_stats;
186 	dev_stats = &adapter->dev_stats;
187 	admin_stats = &adapter->ena_dev->admin_queue.stats;
188 
189 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "wd_expired",
190 	    CTLFLAG_RD, &dev_stats->wd_expired,
191 	    "Watchdog expiry count");
192 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "interface_up",
193 	    CTLFLAG_RD, &dev_stats->interface_up,
194 	    "Network interface up count");
195 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "interface_down",
196 	    CTLFLAG_RD, &dev_stats->interface_down,
197 	    "Network interface down count");
198 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "admin_q_pause",
199 	    CTLFLAG_RD, &dev_stats->admin_q_pause,
200 	    "Admin queue pauses");
201 
202 	for (i = 0; i < adapter->num_io_queues; ++i, ++tx_ring, ++rx_ring) {
203 		snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i);
204 
205 		queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
206 		    namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
207 		queue_list = SYSCTL_CHILDREN(queue_node);
208 
209 		adapter->que[i].oid = queue_node;
210 
211 #ifdef RSS
212 		/* Common stats */
213 		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "cpu",
214 		    CTLFLAG_RD, &adapter->que[i].cpu, 0, "CPU affinity");
215 		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "domain",
216 		    CTLFLAG_RD, &adapter->que[i].domain, 0, "NUMA domain");
217 #endif
218 
219 		/* TX specific stats */
220 		tx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO,
221 		    "tx_ring", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX ring");
222 		tx_list = SYSCTL_CHILDREN(tx_node);
223 
224 		tx_stats = &tx_ring->tx_stats;
225 
226 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
227 		    "count", CTLFLAG_RD,
228 		    &tx_stats->cnt, "Packets sent");
229 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
230 		    "bytes", CTLFLAG_RD,
231 		    &tx_stats->bytes, "Bytes sent");
232 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
233 		    "prepare_ctx_err", CTLFLAG_RD,
234 		    &tx_stats->prepare_ctx_err,
235 		    "TX buffer preparation failures");
236 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
237 		    "dma_mapping_err", CTLFLAG_RD,
238 		    &tx_stats->dma_mapping_err, "DMA mapping failures");
239 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
240 		    "doorbells", CTLFLAG_RD,
241 		    &tx_stats->doorbells, "Queue doorbells");
242 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
243 		    "missing_tx_comp", CTLFLAG_RD,
244 		    &tx_stats->missing_tx_comp, "TX completions missed");
245 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
246 		    "bad_req_id", CTLFLAG_RD,
247 		    &tx_stats->bad_req_id, "Bad request id count");
248 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
249 		        "mbuf_collapses", CTLFLAG_RD,
250 		        &tx_stats->collapse,
251 		        "Mbuf collapse count");
252 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
253 		        "mbuf_collapse_err", CTLFLAG_RD,
254 		        &tx_stats->collapse_err,
255 		        "Mbuf collapse failures");
256 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
257 		    "queue_wakeups", CTLFLAG_RD,
258 		    &tx_stats->queue_wakeup, "Queue wakeups");
259 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
260 		    "queue_stops", CTLFLAG_RD,
261 		    &tx_stats->queue_stop, "Queue stops");
262 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
263 		    "llq_buffer_copy", CTLFLAG_RD,
264 		    &tx_stats->llq_buffer_copy,
265 		    "Header copies for llq transaction");
266 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
267 		    "unmask_interrupt_num", CTLFLAG_RD,
268 		    &tx_stats->unmask_interrupt_num,
269 		    "Unmasked interrupt count");
270 
271 		/* RX specific stats */
272 		rx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO,
273 		    "rx_ring", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX ring");
274 		rx_list = SYSCTL_CHILDREN(rx_node);
275 
276 		rx_stats = &rx_ring->rx_stats;
277 
278 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
279 		    "count", CTLFLAG_RD,
280 		    &rx_stats->cnt, "Packets received");
281 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
282 		    "bytes", CTLFLAG_RD,
283 		    &rx_stats->bytes, "Bytes received");
284 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
285 		    "refil_partial", CTLFLAG_RD,
286 		    &rx_stats->refil_partial, "Partial refilled mbufs");
287 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
288 		    "csum_bad", CTLFLAG_RD,
289 		    &rx_stats->csum_bad, "Bad RX checksum");
290 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
291 		    "mbuf_alloc_fail", CTLFLAG_RD,
292 		    &rx_stats->mbuf_alloc_fail, "Failed mbuf allocs");
293 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
294 		    "mjum_alloc_fail", CTLFLAG_RD,
295 		    &rx_stats->mjum_alloc_fail, "Failed jumbo mbuf allocs");
296 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
297 		    "dma_mapping_err", CTLFLAG_RD,
298 		    &rx_stats->dma_mapping_err, "DMA mapping errors");
299 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
300 		    "bad_desc_num", CTLFLAG_RD,
301 		    &rx_stats->bad_desc_num, "Bad descriptor count");
302 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
303 		    "bad_req_id", CTLFLAG_RD,
304 		    &rx_stats->bad_req_id, "Bad request id count");
305 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
306 		    "empty_rx_ring", CTLFLAG_RD,
307 		    &rx_stats->empty_rx_ring, "RX descriptors depletion count");
308 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
309 		    "csum_good", CTLFLAG_RD,
310 		    &rx_stats->csum_good, "Valid RX checksum calculations");
311 	}
312 
313 	/* Stats read from device */
314 	hw_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "hw_stats",
315 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Statistics from hardware");
316 	hw_list = SYSCTL_CHILDREN(hw_node);
317 
318 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_packets", CTLFLAG_RD,
319 	    &hw_stats->rx_packets, "Packets received");
320 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_packets", CTLFLAG_RD,
321 	    &hw_stats->tx_packets, "Packets transmitted");
322 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_bytes", CTLFLAG_RD,
323 	    &hw_stats->rx_bytes, "Bytes received");
324 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_bytes", CTLFLAG_RD,
325 	    &hw_stats->tx_bytes, "Bytes transmitted");
326 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_drops", CTLFLAG_RD,
327 	    &hw_stats->rx_drops, "Receive packet drops");
328 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_drops", CTLFLAG_RD,
329 	    &hw_stats->tx_drops, "Transmit packet drops");
330 
331 	/* ENA Admin queue stats */
332 	admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "admin_stats",
333 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA Admin Queue statistics");
334 	admin_list = SYSCTL_CHILDREN(admin_node);
335 
336 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "aborted_cmd", CTLFLAG_RD,
337 	    &admin_stats->aborted_cmd, 0, "Aborted commands");
338 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "sumbitted_cmd", CTLFLAG_RD,
339 	    &admin_stats->submitted_cmd, 0, "Submitted commands");
340 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "completed_cmd", CTLFLAG_RD,
341 	    &admin_stats->completed_cmd, 0, "Completed commands");
342 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "out_of_space", CTLFLAG_RD,
343 	    &admin_stats->out_of_space, 0, "Queue out of space");
344 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "no_completion", CTLFLAG_RD,
345 	    &admin_stats->no_completion, 0, "Commands not completed");
346 }
347 
348 static void
349 ena_sysctl_add_eni_metrics(struct ena_adapter *adapter)
350 {
351 	device_t dev;
352 	struct ena_admin_eni_stats *eni_metrics;
353 
354 	struct sysctl_ctx_list *ctx;
355 	struct sysctl_oid *tree;
356 	struct sysctl_oid_list *child;
357 
358 	struct sysctl_oid *eni_node;
359 	struct sysctl_oid_list *eni_list;
360 
361 	dev = adapter->pdev;
362 
363 	ctx = device_get_sysctl_ctx(dev);
364 	tree = device_get_sysctl_tree(dev);
365 	child = SYSCTL_CHILDREN(tree);
366 
367 	eni_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "eni_metrics",
368 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA's ENI metrics");
369 	eni_list = SYSCTL_CHILDREN(eni_node);
370 
371 	eni_metrics = &adapter->eni_metrics;
372 
373 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "bw_in_allowance_exceeded",
374 	    CTLFLAG_RD, &eni_metrics->bw_in_allowance_exceeded, 0,
375 	    "Inbound BW allowance exceeded");
376 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "bw_out_allowance_exceeded",
377 	    CTLFLAG_RD, &eni_metrics->bw_out_allowance_exceeded, 0,
378 	    "Outbound BW allowance exceeded");
379 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "pps_allowance_exceeded",
380 	    CTLFLAG_RD, &eni_metrics->pps_allowance_exceeded, 0,
381 	    "PPS allowance exceeded");
382 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "conntrack_allowance_exceeded",
383 	    CTLFLAG_RD, &eni_metrics->conntrack_allowance_exceeded, 0,
384 	    "Connection tracking allowance exceeded");
385 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "linklocal_allowance_exceeded",
386 	    CTLFLAG_RD, &eni_metrics->linklocal_allowance_exceeded, 0,
387 	    "Linklocal packet rate allowance exceeded");
388 
389 	/*
390 	 * Tuneable, which determines how often ENI metrics will be read.
391 	 * 0 means it's turned off. Maximum allowed value is limited by:
392 	 * ENI_METRICS_MAX_SAMPLE_INTERVAL.
393 	 */
394 	SYSCTL_ADD_PROC(ctx, eni_list, OID_AUTO, "sample_interval",
395 	    CTLTYPE_U16 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
396 	    ena_sysctl_eni_metrics_interval, "SU",
397 	    "Interval in seconds for updating ENI emetrics. 0 turns off the update.");
398 }
399 
400 static void
401 ena_sysctl_add_tuneables(struct ena_adapter *adapter)
402 {
403 	device_t dev;
404 
405 	struct sysctl_ctx_list *ctx;
406 	struct sysctl_oid *tree;
407 	struct sysctl_oid_list *child;
408 
409 	dev = adapter->pdev;
410 
411 	ctx = device_get_sysctl_ctx(dev);
412 	tree = device_get_sysctl_tree(dev);
413 	child = SYSCTL_CHILDREN(tree);
414 
415 	/* Tuneable number of buffers in the buf-ring (drbr) */
416 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "buf_ring_size",
417 	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
418 	    ena_sysctl_buf_ring_size, "I",
419 	    "Size of the Tx buffer ring (drbr).");
420 
421 	/* Tuneable number of the Rx ring size */
422 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_queue_size",
423 	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
424 	    ena_sysctl_rx_queue_size, "I",
425 	    "Size of the Rx ring. The size should be a power of 2.");
426 
427 	/* Tuneable number of IO queues */
428 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "io_queues_nb",
429 	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
430 	    ena_sysctl_io_queues_nb, "I", "Number of IO queues.");
431 }
432 
433 /* Kernel option RSS prevents manipulation of key hash and indirection table. */
434 #ifndef RSS
435 static void
436 ena_sysctl_add_rss(struct ena_adapter *adapter)
437 {
438 	device_t dev;
439 
440 	struct sysctl_ctx_list *ctx;
441 	struct sysctl_oid *tree;
442 	struct sysctl_oid_list *child;
443 
444 	dev = adapter->pdev;
445 
446 	ctx = device_get_sysctl_ctx(dev);
447 	tree = device_get_sysctl_tree(dev);
448 	child = SYSCTL_CHILDREN(tree);
449 
450 	/* RSS options */
451 	tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rss",
452 	    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Receive Side Scaling options.");
453 	child = SYSCTL_CHILDREN(tree);
454 
455 	/* RSS hash key */
456 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "key",
457 	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
458 	    ena_sysctl_rss_key, "A", "RSS key.");
459 
460 	/* Tuneable RSS indirection table */
461 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "indir_table",
462 	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
463 	    ena_sysctl_rss_indir_table, "A", "RSS indirection table.");
464 
465 	/* RSS indirection table size */
466 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "indir_table_size",
467 	    CTLFLAG_RD | CTLFLAG_MPSAFE, &ena_rss_table_size, 0,
468 	    "RSS indirection table size.");
469 }
470 #endif /* RSS */
471 
472 
473 /*
474  * ena_sysctl_update_queue_node_nb - Register/unregister sysctl queue nodes.
475  *
476  * Whether the nodes are registered or unregistered depends on a delta between
477  * the `old` and `new` parameters, representing the number of queues.
478  *
479  * This function is used to hide sysctl attributes for queue nodes which aren't
480  * currently used by the HW (e.g. after a call to `ena_sysctl_io_queues_nb`).
481  *
482  * NOTE:
483  * All unregistered nodes must be registered again at detach, i.e. by a call to
484  * this function.
485  */
486 void
487 ena_sysctl_update_queue_node_nb(struct ena_adapter *adapter, int old, int new)
488 {
489 	device_t dev;
490 	struct sysctl_oid *oid;
491 	int min, max, i;
492 
493 	dev = adapter->pdev;
494 	min = MIN(old, new);
495 	max = MIN(MAX(old, new), adapter->max_num_io_queues);
496 
497 	for (i = min; i < max; ++i) {
498 		oid = adapter->que[i].oid;
499 
500 		sysctl_wlock();
501 		if (old > new)
502 			sysctl_unregister_oid(oid);
503 		else
504 			sysctl_register_oid(oid);
505 		sysctl_wunlock();
506 	}
507 }
508 
509 static int
510 ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS)
511 {
512 	struct ena_adapter *adapter = arg1;
513 	uint32_t val;
514 	int error;
515 
516 	ENA_LOCK_LOCK();
517 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
518 		error = EINVAL;
519 		goto unlock;
520 	}
521 
522 	val = 0;
523 	error = sysctl_wire_old_buffer(req, sizeof(val));
524 	if (error == 0) {
525 		val = adapter->buf_ring_size;
526 		error = sysctl_handle_32(oidp, &val, 0, req);
527 	}
528 	if (error != 0 || req->newptr == NULL)
529 		goto unlock;
530 
531 	if (!powerof2(val) || val == 0) {
532 		ena_log(adapter->pdev, ERR,
533 		    "Requested new Tx buffer ring size (%u) is not a power of 2\n",
534 		    val);
535 		error = EINVAL;
536 		goto unlock;
537 	}
538 
539 	if (val != adapter->buf_ring_size) {
540 		ena_log(adapter->pdev, INFO,
541 		    "Requested new Tx buffer ring size: %d. Old size: %d\n",
542 		    val, adapter->buf_ring_size);
543 
544 		error = ena_update_buf_ring_size(adapter, val);
545 	} else {
546 		ena_log(adapter->pdev, ERR,
547 		    "New Tx buffer ring size is the same as already used: %u\n",
548 		    adapter->buf_ring_size);
549 	}
550 
551 unlock:
552 	ENA_LOCK_UNLOCK();
553 
554 	return (error);
555 }
556 
557 static int
558 ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS)
559 {
560 	struct ena_adapter *adapter = arg1;
561 	uint32_t val;
562 	int error;
563 
564 	ENA_LOCK_LOCK();
565 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
566 		error = EINVAL;
567 		goto unlock;
568 	}
569 
570 	val = 0;
571 	error = sysctl_wire_old_buffer(req, sizeof(val));
572 	if (error == 0) {
573 		val = adapter->requested_rx_ring_size;
574 		error = sysctl_handle_32(oidp, &val, 0, req);
575 	}
576 	if (error != 0 || req->newptr == NULL)
577 		goto unlock;
578 
579 	if  (val < ENA_MIN_RING_SIZE || val > adapter->max_rx_ring_size) {
580 		ena_log(adapter->pdev, ERR,
581 		    "Requested new Rx queue size (%u) is out of range: [%u, %u]\n",
582 		    val, ENA_MIN_RING_SIZE, adapter->max_rx_ring_size);
583 		error = EINVAL;
584 		goto unlock;
585 	}
586 
587 	/* Check if the parameter is power of 2 */
588 	if (!powerof2(val)) {
589 		ena_log(adapter->pdev, ERR,
590 		    "Requested new Rx queue size (%u) is not a power of 2\n",
591 		    val);
592 		error = EINVAL;
593 		goto unlock;
594 	}
595 
596 	if (val != adapter->requested_rx_ring_size) {
597 		ena_log(adapter->pdev, INFO,
598 		    "Requested new Rx queue size: %u. Old size: %u\n",
599 		    val, adapter->requested_rx_ring_size);
600 
601 		error = ena_update_queue_size(adapter,
602 		    adapter->requested_tx_ring_size, val);
603 	} else {
604 		ena_log(adapter->pdev, ERR,
605 		    "New Rx queue size is the same as already used: %u\n",
606 		    adapter->requested_rx_ring_size);
607 	}
608 
609 unlock:
610 	ENA_LOCK_UNLOCK();
611 
612 	return (error);
613 }
614 
615 /*
616  * Change number of effectively used IO queues adapter->num_io_queues
617  */
618 static int
619 ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS)
620 {
621 	struct ena_adapter *adapter = arg1;
622 	uint32_t old_num_queues, tmp = 0;
623 	int error;
624 
625 	ENA_LOCK_LOCK();
626 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
627 		error = EINVAL;
628 		goto unlock;
629 	}
630 
631 	error = sysctl_wire_old_buffer(req, sizeof(tmp));
632 	if (error == 0) {
633 		tmp = adapter->num_io_queues;
634 		error = sysctl_handle_int(oidp, &tmp, 0, req);
635 	}
636 	if (error != 0 || req->newptr == NULL)
637 		goto unlock;
638 
639 	if (tmp == 0) {
640 		ena_log(adapter->pdev, ERR,
641 		    "Requested number of IO queues is zero\n");
642 		error = EINVAL;
643 		goto unlock;
644 	}
645 
646 	/*
647 	 * The adapter::max_num_io_queues is the HW capability. The system
648 	 * resources availability may potentially be a tighter limit. Therefore
649 	 * the relation `adapter::max_num_io_queues >= adapter::msix_vecs`
650 	 * always holds true, while the `adapter::msix_vecs` is variable across
651 	 * device reset (`ena_destroy_device()` + `ena_restore_device()`).
652 	 */
653 	if (tmp > (adapter->msix_vecs - ENA_ADMIN_MSIX_VEC)) {
654 		ena_log(adapter->pdev, ERR,
655 		    "Requested number of IO queues is higher than maximum "
656 		    "allowed (%u)\n", adapter->msix_vecs - ENA_ADMIN_MSIX_VEC);
657 		error = EINVAL;
658 		goto unlock;
659 	}
660 	if (tmp == adapter->num_io_queues) {
661 		ena_log(adapter->pdev, ERR,
662 		    "Requested number of IO queues is equal to current value "
663 		    "(%u)\n", adapter->num_io_queues);
664 	} else {
665 		ena_log(adapter->pdev, INFO,
666 		    "Requested new number of IO queues: %u, current value: "
667 		    "%u\n", tmp, adapter->num_io_queues);
668 
669 		old_num_queues = adapter->num_io_queues;
670 		error = ena_update_io_queue_nb(adapter, tmp);
671 		if (error != 0)
672 			return (error);
673 
674 		ena_sysctl_update_queue_node_nb(adapter, old_num_queues, tmp);
675 	}
676 
677 unlock:
678 	ENA_LOCK_UNLOCK();
679 
680 	return (error);
681 }
682 
683 static int
684 ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS)
685 {
686 	struct ena_adapter *adapter = arg1;
687 	uint16_t interval;
688 	int error;
689 
690 	ENA_LOCK_LOCK();
691 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
692 		error = EINVAL;
693 		goto unlock;
694 	}
695 
696 	error = sysctl_wire_old_buffer(req, sizeof(interval));
697 	if (error == 0) {
698 		interval = adapter->eni_metrics_sample_interval;
699 		error = sysctl_handle_16(oidp, &interval, 0, req);
700 	}
701 	if (error != 0 || req->newptr == NULL)
702 		goto unlock;
703 
704 	if (interval > ENI_METRICS_MAX_SAMPLE_INTERVAL) {
705 		ena_log(adapter->pdev, ERR,
706 		    "ENI metrics update interval is out of range - maximum allowed value: %d seconds\n",
707 		    ENI_METRICS_MAX_SAMPLE_INTERVAL);
708 		error = EINVAL;
709 		goto unlock;
710 	}
711 
712 	if (interval == 0) {
713 		ena_log(adapter->pdev, INFO,
714 		    "ENI metrics update is now turned off\n");
715 		bzero(&adapter->eni_metrics, sizeof(adapter->eni_metrics));
716 	} else {
717 		ena_log(adapter->pdev, INFO,
718 		    "ENI metrics update interval is set to: %"PRIu16" seconds\n",
719 		    interval);
720 	}
721 
722 	adapter->eni_metrics_sample_interval = interval;
723 
724 unlock:
725 	ENA_LOCK_UNLOCK();
726 
727 	return (0);
728 }
729 
730 #ifndef RSS
731 /*
732  * Change the Receive Side Scaling hash key.
733  */
734 static int
735 ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS)
736 {
737 	struct ena_adapter *adapter = arg1;
738 	struct ena_com_dev *ena_dev = adapter->ena_dev;
739 	enum ena_admin_hash_functions ena_func;
740 	char msg[ENA_HASH_KEY_MSG_SIZE];
741 	char elem[3] = { 0 };
742 	char *endp;
743 	u8 rss_key[ENA_HASH_KEY_SIZE];
744 	int error, i;
745 
746 	ENA_LOCK_LOCK();
747 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
748 		error = EINVAL;
749 		goto unlock;
750 	}
751 
752 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) {
753 		error = ENOTSUP;
754 		goto unlock;
755 	}
756 
757 	error = sysctl_wire_old_buffer(req, sizeof(msg));
758 	if (error != 0)
759 		goto unlock;
760 
761 	error = ena_com_get_hash_function(adapter->ena_dev, &ena_func);
762 	if (error != 0) {
763 		device_printf(adapter->pdev, "Cannot get hash function\n");
764 		goto unlock;
765 	}
766 
767 	if (ena_func != ENA_ADMIN_TOEPLITZ) {
768 		error = EINVAL;
769 		device_printf(adapter->pdev, "Unsupported hash algorithm\n");
770 		goto unlock;
771 	}
772 
773 	error = ena_rss_get_hash_key(ena_dev, rss_key);
774 	if (error != 0) {
775 		device_printf(adapter->pdev, "Cannot get hash key\n");
776 		goto unlock;
777 	}
778 
779 	for (i = 0; i < ENA_HASH_KEY_SIZE; ++i)
780 		snprintf(&msg[i * 2], 3, "%02x", rss_key[i]);
781 
782 	error = sysctl_handle_string(oidp, msg, sizeof(msg), req);
783 	if (error != 0 || req->newptr == NULL)
784 		goto unlock;
785 
786 	if (strlen(msg) != sizeof(msg) - 1) {
787 		error = EINVAL;
788 		device_printf(adapter->pdev, "Invalid key size\n");
789 		goto unlock;
790 	}
791 
792 	for (i = 0; i < ENA_HASH_KEY_SIZE; ++i) {
793 		strncpy(elem, &msg[i * 2], 2);
794 		rss_key[i] = strtol(elem, &endp, 16);
795 
796 		/* Both hex nibbles in the string must be valid to continue. */
797 		if (endp == elem || *endp != '\0' || rss_key[i] < 0) {
798 			error = EINVAL;
799 			device_printf(adapter->pdev,
800 			    "Invalid key hex value: '%c'\n", *endp);
801 			goto unlock;
802 		}
803 	}
804 
805 	error = ena_rss_set_hash(ena_dev, rss_key);
806 	if (error != 0)
807 		device_printf(adapter->pdev, "Cannot fill hash key\n");
808 
809 unlock:
810 	ENA_LOCK_UNLOCK();
811 
812 	return (error);
813 }
814 
815 /*
816  * Change the Receive Side Scaling indirection table.
817  *
818  * The sysctl entry string consists of one or more `x:y` keypairs, where
819  * x stands for the table index and y for its new value.
820  * Table indices that don't need to be updated can be omitted from the string
821  * and will retain their existing values. If an index is entered more than once,
822  * the last value is used.
823  *
824  * Example:
825  * To update two selected indices in the RSS indirection table, e.g. setting
826  * index 0 to queue 5 and then index 5 to queue 0, the below command should be
827  * used:
828  *   sysctl dev.ena.0.rss.indir_table="0:5 5:0"
829  */
830 static int
831 ena_sysctl_rss_indir_table(SYSCTL_HANDLER_ARGS)
832 {
833 	int num_queues, error;
834 	struct ena_adapter *adapter = arg1;
835 	struct ena_com_dev *ena_dev;
836 	struct ena_indir *indir;
837 	char *msg, *buf, *endp;
838 	uint32_t idx, value;
839 
840 	ENA_LOCK_LOCK();
841 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
842 		error = EINVAL;
843 		goto unlock;
844 	}
845 
846 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) {
847 		error = ENOTSUP;
848 		goto unlock;
849 	}
850 
851 	ena_dev = adapter->ena_dev;
852 	indir = adapter->rss_indir;
853 	msg = indir->sysctl_buf;
854 
855 	if (unlikely(indir == NULL)) {
856 		error = ENOTSUP;
857 		goto unlock;
858 	}
859 
860 	error = sysctl_handle_string(oidp, msg, sizeof(indir->sysctl_buf), req);
861 	if (error != 0 || req->newptr == NULL)
862 		goto unlock;
863 
864 	num_queues = adapter->num_io_queues;
865 
866 	/*
867 	 * This sysctl expects msg to be a list of `x:y` record pairs,
868 	 * where x is the indirection table index and y is its value.
869 	 */
870 	for (buf = msg; *buf != '\0'; buf = endp) {
871 		idx = strtol(buf, &endp, 10);
872 
873 		if (endp == buf || idx < 0) {
874 			device_printf(adapter->pdev, "Invalid index: %s\n",
875 			    buf);
876 			error = EINVAL;
877 			break;
878 		}
879 
880 		if (idx >= ENA_RX_RSS_TABLE_SIZE) {
881 			device_printf(adapter->pdev, "Index %d out of range\n",
882 			    idx);
883 			error = ERANGE;
884 			break;
885 		}
886 
887 		buf = endp;
888 
889 		if (*buf++ != ':') {
890 			device_printf(adapter->pdev, "Missing ':' separator\n");
891 			error = EINVAL;
892 			break;
893 		}
894 
895 		value = strtol(buf, &endp, 10);
896 
897 		if (endp == buf || value < 0) {
898 			device_printf(adapter->pdev, "Invalid value: %s\n",
899 			    buf);
900 			error = EINVAL;
901 			break;
902 		}
903 
904 		if (value >= num_queues) {
905 			device_printf(adapter->pdev, "Value %d out of range\n",
906 			    value);
907 			error = ERANGE;
908 			break;
909 		}
910 
911 		indir->table[idx] = value;
912 	}
913 
914 	if (error != 0) /* Reload indirection table with last good data. */
915 		ena_rss_indir_get(adapter, indir->table);
916 
917 	/* At this point msg has been clobbered by sysctl_handle_string. */
918 	ena_rss_copy_indir_buf(msg, indir->table);
919 
920 	if (error == 0)
921 		error = ena_rss_indir_set(adapter, indir->table);
922 
923 unlock:
924 	ENA_LOCK_UNLOCK();
925 
926 	return (error);
927 }
928 #endif /* RSS */
929