xref: /freebsd/sys/dev/ena/ena_sysctl.c (revision 61ba55bcf70f2340f9c943c9571113b3fd8eda69)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015-2021 Amazon.com, Inc. or its affiliates.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 #include <sys/cdefs.h>
31 #include <sys/param.h>
32 #include "opt_rss.h"
33 
34 #include "ena_rss.h"
35 #include "ena_sysctl.h"
36 
/*
 * Forward declarations of the file-local helpers that register sysctl nodes
 * and of the sysctl handler procedures defined later in this file.
 */
static void ena_sysctl_add_wd(struct ena_adapter *);
static void ena_sysctl_add_stats(struct ena_adapter *);
static void ena_sysctl_add_eni_metrics(struct ena_adapter *);
static void ena_sysctl_add_tuneables(struct ena_adapter *);
/* Kernel option RSS prevents manipulation of key hash and indirection table. */
#ifndef RSS
static void ena_sysctl_add_rss(struct ena_adapter *);
#endif
static int ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_rss_indir_table(SYSCTL_HANDLER_ARGS);
#endif

/* Upper bound, in seconds, for the ENI metrics sample interval (one hour). */
#define ENI_METRICS_MAX_SAMPLE_INTERVAL 3600
/*
 * Size of the buffer holding the RSS key as a hex string: two characters per
 * key byte plus the terminating NUL.
 */
#define ENA_HASH_KEY_MSG_SIZE (ENA_HASH_KEY_SIZE * 2 + 1)
57 
/* Root node for the driver-wide (not per-device) knobs: hw.ena.* */
static SYSCTL_NODE(_hw, OID_AUTO, ena, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ENA driver parameters");

/*
 * Logging level for changing verbosity of the output. Read-write and also
 * settable as a tunable (CTLFLAG_RWTUN).
 */
int ena_log_level = ENA_INFO;
SYSCTL_INT(_hw_ena, OID_AUTO, log_level, CTLFLAG_RWTUN, &ena_log_level, 0,
    "Logging level indicating verbosity of the logs");

/* Read-only string exposing the driver version. */
SYSCTL_CONST_STRING(_hw_ena, OID_AUTO, driver_version, CTLFLAG_RD,
    ENA_DRV_MODULE_VERSION, "ENA driver version");

/*
 * Use 9k mbufs for the Rx buffers. Defaults to 0 (use page size mbufs instead).
 * Using 9k mbufs in low memory conditions might cause allocation to take a lot
 * of time and lead to OS instability, as it needs to look for contiguous
 * pages.
 * However, page size mbufs have a slightly smaller throughput than 9k mbufs,
 * so if network performance is the priority, the 9k mbufs can be used.
 */
int ena_enable_9k_mbufs = 0;
SYSCTL_INT(_hw_ena, OID_AUTO, enable_9k_mbufs, CTLFLAG_RDTUN,
    &ena_enable_9k_mbufs, 0, "Use 9 kB mbufs for Rx descriptors");

/*
 * Force the driver to use large LLQ (Low Latency Queue) header. Defaults to
 * false. This option may be important for platforms which often handle packet
 * headers on Tx with a total header size greater than 96B, as it may
 * reduce the latency.
 * It also reduces the maximum Tx queue size by half, so it may cause more Tx
 * packet drops.
 */
bool ena_force_large_llq_header = false;
SYSCTL_BOOL(_hw_ena, OID_AUTO, force_large_llq_header, CTLFLAG_RDTUN,
    &ena_force_large_llq_header, 0,
    "Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum Tx queue size by half.\n");

/*
 * Backing storage for the read-only per-device "rss.indir_table_size" sysctl,
 * which is registered only when the kernel is built without option RSS.
 */
int ena_rss_table_size = ENA_RX_RSS_TABLE_SIZE;
97 
/*
 * ena_sysctl_add_nodes - Register all per-device sysctl groups for an adapter.
 *
 * Adds, in this order, the watchdog knobs, the statistics nodes, the ENI
 * metrics nodes and the runtime tuneables. The RSS key / indirection table
 * nodes are added only when the kernel is built without option RSS.
 */
void
ena_sysctl_add_nodes(struct ena_adapter *adapter)
{
	ena_sysctl_add_wd(adapter);
	ena_sysctl_add_stats(adapter);
	ena_sysctl_add_eni_metrics(adapter);
	ena_sysctl_add_tuneables(adapter);
#ifndef RSS
	ena_sysctl_add_rss(adapter);
#endif
}
109 
110 static void
111 ena_sysctl_add_wd(struct ena_adapter *adapter)
112 {
113 	device_t dev;
114 
115 	struct sysctl_ctx_list *ctx;
116 	struct sysctl_oid *tree;
117 	struct sysctl_oid_list *child;
118 
119 	dev = adapter->pdev;
120 
121 	ctx = device_get_sysctl_ctx(dev);
122 	tree = device_get_sysctl_tree(dev);
123 	child = SYSCTL_CHILDREN(tree);
124 
125 	/* Sysctl calls for Watchdog service */
126 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "wd_active", CTLFLAG_RWTUN,
127 	    &adapter->wd_active, 0, "Watchdog is active");
128 
129 	SYSCTL_ADD_QUAD(ctx, child, OID_AUTO, "keep_alive_timeout",
130 	    CTLFLAG_RWTUN, &adapter->keep_alive_timeout,
131 	    "Timeout for Keep Alive messages");
132 
133 	SYSCTL_ADD_QUAD(ctx, child, OID_AUTO, "missing_tx_timeout",
134 	    CTLFLAG_RWTUN, &adapter->missing_tx_timeout,
135 	    "Timeout for TX completion");
136 
137 	SYSCTL_ADD_U32(ctx, child, OID_AUTO, "missing_tx_max_queues",
138 	    CTLFLAG_RWTUN, &adapter->missing_tx_max_queues, 0,
139 	    "Number of TX queues to check per run");
140 
141 	SYSCTL_ADD_U32(ctx, child, OID_AUTO, "missing_tx_threshold",
142 	    CTLFLAG_RWTUN, &adapter->missing_tx_threshold, 0,
143 	    "Max number of timeouted packets");
144 }
145 
146 static void
147 ena_sysctl_add_stats(struct ena_adapter *adapter)
148 {
149 	device_t dev;
150 
151 	struct ena_ring *tx_ring;
152 	struct ena_ring *rx_ring;
153 
154 	struct ena_hw_stats *hw_stats;
155 	struct ena_stats_dev *dev_stats;
156 	struct ena_stats_tx *tx_stats;
157 	struct ena_stats_rx *rx_stats;
158 	struct ena_com_stats_admin *admin_stats;
159 
160 	struct sysctl_ctx_list *ctx;
161 	struct sysctl_oid *tree;
162 	struct sysctl_oid_list *child;
163 
164 	struct sysctl_oid *queue_node, *tx_node, *rx_node, *hw_node;
165 	struct sysctl_oid *admin_node;
166 	struct sysctl_oid_list *queue_list, *tx_list, *rx_list, *hw_list;
167 	struct sysctl_oid_list *admin_list;
168 
169 #define QUEUE_NAME_LEN 32
170 	char namebuf[QUEUE_NAME_LEN];
171 	int i;
172 
173 	dev = adapter->pdev;
174 
175 	ctx = device_get_sysctl_ctx(dev);
176 	tree = device_get_sysctl_tree(dev);
177 	child = SYSCTL_CHILDREN(tree);
178 
179 	tx_ring = adapter->tx_ring;
180 	rx_ring = adapter->rx_ring;
181 
182 	hw_stats = &adapter->hw_stats;
183 	dev_stats = &adapter->dev_stats;
184 	admin_stats = &adapter->ena_dev->admin_queue.stats;
185 
186 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "wd_expired", CTLFLAG_RD,
187 	    &dev_stats->wd_expired, "Watchdog expiry count");
188 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "interface_up", CTLFLAG_RD,
189 	    &dev_stats->interface_up, "Network interface up count");
190 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "interface_down",
191 	    CTLFLAG_RD, &dev_stats->interface_down,
192 	    "Network interface down count");
193 	SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "admin_q_pause",
194 	    CTLFLAG_RD, &dev_stats->admin_q_pause, "Admin queue pauses");
195 
196 	for (i = 0; i < adapter->num_io_queues; ++i, ++tx_ring, ++rx_ring) {
197 		snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i);
198 
199 		queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
200 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
201 		queue_list = SYSCTL_CHILDREN(queue_node);
202 
203 		adapter->que[i].oid = queue_node;
204 
205 #ifdef RSS
206 		/* Common stats */
207 		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "cpu", CTLFLAG_RD,
208 		    &adapter->que[i].cpu, 0, "CPU affinity");
209 		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "domain", CTLFLAG_RD,
210 		    &adapter->que[i].domain, 0, "NUMA domain");
211 #endif
212 
213 		/* TX specific stats */
214 		tx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, "tx_ring",
215 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX ring");
216 		tx_list = SYSCTL_CHILDREN(tx_node);
217 
218 		tx_stats = &tx_ring->tx_stats;
219 
220 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "count",
221 		    CTLFLAG_RD, &tx_stats->cnt, "Packets sent");
222 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "bytes",
223 		    CTLFLAG_RD, &tx_stats->bytes, "Bytes sent");
224 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
225 		    "prepare_ctx_err", CTLFLAG_RD, &tx_stats->prepare_ctx_err,
226 		    "TX buffer preparation failures");
227 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
228 		    "dma_mapping_err", CTLFLAG_RD, &tx_stats->dma_mapping_err,
229 		    "DMA mapping failures");
230 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "doorbells",
231 		    CTLFLAG_RD, &tx_stats->doorbells, "Queue doorbells");
232 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
233 		    "missing_tx_comp", CTLFLAG_RD, &tx_stats->missing_tx_comp,
234 		    "TX completions missed");
235 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "bad_req_id",
236 		    CTLFLAG_RD, &tx_stats->bad_req_id, "Bad request id count");
237 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "mbuf_collapses",
238 		    CTLFLAG_RD, &tx_stats->collapse, "Mbuf collapse count");
239 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
240 		    "mbuf_collapse_err", CTLFLAG_RD, &tx_stats->collapse_err,
241 		    "Mbuf collapse failures");
242 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_wakeups",
243 		    CTLFLAG_RD, &tx_stats->queue_wakeup, "Queue wakeups");
244 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_stops",
245 		    CTLFLAG_RD, &tx_stats->queue_stop, "Queue stops");
246 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
247 		    "llq_buffer_copy", CTLFLAG_RD, &tx_stats->llq_buffer_copy,
248 		    "Header copies for llq transaction");
249 		SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
250 		    "unmask_interrupt_num", CTLFLAG_RD,
251 		    &tx_stats->unmask_interrupt_num,
252 		    "Unmasked interrupt count");
253 
254 		/* RX specific stats */
255 		rx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, "rx_ring",
256 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX ring");
257 		rx_list = SYSCTL_CHILDREN(rx_node);
258 
259 		rx_stats = &rx_ring->rx_stats;
260 
261 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "count",
262 		    CTLFLAG_RD, &rx_stats->cnt, "Packets received");
263 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bytes",
264 		    CTLFLAG_RD, &rx_stats->bytes, "Bytes received");
265 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "refil_partial",
266 		    CTLFLAG_RD, &rx_stats->refil_partial,
267 		    "Partial refilled mbufs");
268 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "csum_bad",
269 		    CTLFLAG_RD, &rx_stats->csum_bad, "Bad RX checksum");
270 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
271 		    "mbuf_alloc_fail", CTLFLAG_RD, &rx_stats->mbuf_alloc_fail,
272 		    "Failed mbuf allocs");
273 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
274 		    "mjum_alloc_fail", CTLFLAG_RD, &rx_stats->mjum_alloc_fail,
275 		    "Failed jumbo mbuf allocs");
276 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
277 		    "dma_mapping_err", CTLFLAG_RD, &rx_stats->dma_mapping_err,
278 		    "DMA mapping errors");
279 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bad_desc_num",
280 		    CTLFLAG_RD, &rx_stats->bad_desc_num,
281 		    "Bad descriptor count");
282 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bad_req_id",
283 		    CTLFLAG_RD, &rx_stats->bad_req_id, "Bad request id count");
284 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "empty_rx_ring",
285 		    CTLFLAG_RD, &rx_stats->empty_rx_ring,
286 		    "RX descriptors depletion count");
287 		SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "csum_good",
288 		    CTLFLAG_RD, &rx_stats->csum_good,
289 		    "Valid RX checksum calculations");
290 	}
291 
292 	/* Stats read from device */
293 	hw_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "hw_stats",
294 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Statistics from hardware");
295 	hw_list = SYSCTL_CHILDREN(hw_node);
296 
297 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_packets", CTLFLAG_RD,
298 	    &hw_stats->rx_packets, "Packets received");
299 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_packets", CTLFLAG_RD,
300 	    &hw_stats->tx_packets, "Packets transmitted");
301 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_bytes", CTLFLAG_RD,
302 	    &hw_stats->rx_bytes, "Bytes received");
303 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_bytes", CTLFLAG_RD,
304 	    &hw_stats->tx_bytes, "Bytes transmitted");
305 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_drops", CTLFLAG_RD,
306 	    &hw_stats->rx_drops, "Receive packet drops");
307 	SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_drops", CTLFLAG_RD,
308 	    &hw_stats->tx_drops, "Transmit packet drops");
309 
310 	/* ENA Admin queue stats */
311 	admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "admin_stats",
312 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA Admin Queue statistics");
313 	admin_list = SYSCTL_CHILDREN(admin_node);
314 
315 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "aborted_cmd", CTLFLAG_RD,
316 	    &admin_stats->aborted_cmd, 0, "Aborted commands");
317 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "sumbitted_cmd", CTLFLAG_RD,
318 	    &admin_stats->submitted_cmd, 0, "Submitted commands");
319 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "completed_cmd", CTLFLAG_RD,
320 	    &admin_stats->completed_cmd, 0, "Completed commands");
321 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "out_of_space", CTLFLAG_RD,
322 	    &admin_stats->out_of_space, 0, "Queue out of space");
323 	SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "no_completion", CTLFLAG_RD,
324 	    &admin_stats->no_completion, 0, "Commands not completed");
325 }
326 
327 static void
328 ena_sysctl_add_eni_metrics(struct ena_adapter *adapter)
329 {
330 	device_t dev;
331 	struct ena_admin_eni_stats *eni_metrics;
332 
333 	struct sysctl_ctx_list *ctx;
334 	struct sysctl_oid *tree;
335 	struct sysctl_oid_list *child;
336 
337 	struct sysctl_oid *eni_node;
338 	struct sysctl_oid_list *eni_list;
339 
340 	dev = adapter->pdev;
341 
342 	ctx = device_get_sysctl_ctx(dev);
343 	tree = device_get_sysctl_tree(dev);
344 	child = SYSCTL_CHILDREN(tree);
345 
346 	eni_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "eni_metrics",
347 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA's ENI metrics");
348 	eni_list = SYSCTL_CHILDREN(eni_node);
349 
350 	eni_metrics = &adapter->eni_metrics;
351 
352 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "bw_in_allowance_exceeded",
353 	    CTLFLAG_RD, &eni_metrics->bw_in_allowance_exceeded, 0,
354 	    "Inbound BW allowance exceeded");
355 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "bw_out_allowance_exceeded",
356 	    CTLFLAG_RD, &eni_metrics->bw_out_allowance_exceeded, 0,
357 	    "Outbound BW allowance exceeded");
358 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "pps_allowance_exceeded",
359 	    CTLFLAG_RD, &eni_metrics->pps_allowance_exceeded, 0,
360 	    "PPS allowance exceeded");
361 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "conntrack_allowance_exceeded",
362 	    CTLFLAG_RD, &eni_metrics->conntrack_allowance_exceeded, 0,
363 	    "Connection tracking allowance exceeded");
364 	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "linklocal_allowance_exceeded",
365 	    CTLFLAG_RD, &eni_metrics->linklocal_allowance_exceeded, 0,
366 	    "Linklocal packet rate allowance exceeded");
367 
368 	/*
369 	 * Tuneable, which determines how often ENI metrics will be read.
370 	 * 0 means it's turned off. Maximum allowed value is limited by:
371 	 * ENI_METRICS_MAX_SAMPLE_INTERVAL.
372 	 */
373 	SYSCTL_ADD_PROC(ctx, eni_list, OID_AUTO, "sample_interval",
374 	    CTLTYPE_U16 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
375 	    ena_sysctl_eni_metrics_interval, "SU",
376 	    "Interval in seconds for updating ENI emetrics. 0 turns off the update.");
377 }
378 
379 static void
380 ena_sysctl_add_tuneables(struct ena_adapter *adapter)
381 {
382 	device_t dev;
383 
384 	struct sysctl_ctx_list *ctx;
385 	struct sysctl_oid *tree;
386 	struct sysctl_oid_list *child;
387 
388 	dev = adapter->pdev;
389 
390 	ctx = device_get_sysctl_ctx(dev);
391 	tree = device_get_sysctl_tree(dev);
392 	child = SYSCTL_CHILDREN(tree);
393 
394 	/* Tuneable number of buffers in the buf-ring (drbr) */
395 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "buf_ring_size",
396 	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
397 	    ena_sysctl_buf_ring_size, "I",
398 	    "Size of the Tx buffer ring (drbr).");
399 
400 	/* Tuneable number of the Rx ring size */
401 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_queue_size",
402 	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
403 	    ena_sysctl_rx_queue_size, "I",
404 	    "Size of the Rx ring. The size should be a power of 2.");
405 
406 	/* Tuneable number of IO queues */
407 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "io_queues_nb",
408 	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
409 	    ena_sysctl_io_queues_nb, "I", "Number of IO queues.");
410 }
411 
412 /* Kernel option RSS prevents manipulation of key hash and indirection table. */
413 #ifndef RSS
414 static void
415 ena_sysctl_add_rss(struct ena_adapter *adapter)
416 {
417 	device_t dev;
418 
419 	struct sysctl_ctx_list *ctx;
420 	struct sysctl_oid *tree;
421 	struct sysctl_oid_list *child;
422 
423 	dev = adapter->pdev;
424 
425 	ctx = device_get_sysctl_ctx(dev);
426 	tree = device_get_sysctl_tree(dev);
427 	child = SYSCTL_CHILDREN(tree);
428 
429 	/* RSS options */
430 	tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rss",
431 	    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Receive Side Scaling options.");
432 	child = SYSCTL_CHILDREN(tree);
433 
434 	/* RSS hash key */
435 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "key",
436 	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
437 	    ena_sysctl_rss_key, "A", "RSS key.");
438 
439 	/* Tuneable RSS indirection table */
440 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "indir_table",
441 	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
442 	    ena_sysctl_rss_indir_table, "A", "RSS indirection table.");
443 
444 	/* RSS indirection table size */
445 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "indir_table_size",
446 	    CTLFLAG_RD | CTLFLAG_MPSAFE, &ena_rss_table_size, 0,
447 	    "RSS indirection table size.");
448 }
449 #endif /* RSS */
450 
451 
452 /*
453  * ena_sysctl_update_queue_node_nb - Register/unregister sysctl queue nodes.
454  *
455  * Whether the nodes are registered or unregistered depends on a delta between
456  * the `old` and `new` parameters, representing the number of queues.
457  *
458  * This function is used to hide sysctl attributes for queue nodes which aren't
459  * currently used by the HW (e.g. after a call to `ena_sysctl_io_queues_nb`).
460  *
461  * NOTE:
462  * All unregistered nodes must be registered again at detach, i.e. by a call to
463  * this function.
464  */
465 void
466 ena_sysctl_update_queue_node_nb(struct ena_adapter *adapter, int old, int new)
467 {
468 	struct sysctl_oid *oid;
469 	int min, max, i;
470 
471 	min = MIN(old, new);
472 	max = MIN(MAX(old, new), adapter->max_num_io_queues);
473 
474 	for (i = min; i < max; ++i) {
475 		oid = adapter->que[i].oid;
476 
477 		sysctl_wlock();
478 		if (old > new)
479 			sysctl_unregister_oid(oid);
480 		else
481 			sysctl_register_oid(oid);
482 		sysctl_wunlock();
483 	}
484 }
485 
486 static int
487 ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS)
488 {
489 	struct ena_adapter *adapter = arg1;
490 	uint32_t val;
491 	int error;
492 
493 	ENA_LOCK_LOCK();
494 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
495 		error = EINVAL;
496 		goto unlock;
497 	}
498 
499 	val = 0;
500 	error = sysctl_wire_old_buffer(req, sizeof(val));
501 	if (error == 0) {
502 		val = adapter->buf_ring_size;
503 		error = sysctl_handle_32(oidp, &val, 0, req);
504 	}
505 	if (error != 0 || req->newptr == NULL)
506 		goto unlock;
507 
508 	if (!powerof2(val) || val == 0) {
509 		ena_log(adapter->pdev, ERR,
510 		    "Requested new Tx buffer ring size (%u) is not a power of 2\n",
511 		    val);
512 		error = EINVAL;
513 		goto unlock;
514 	}
515 
516 	if (val != adapter->buf_ring_size) {
517 		ena_log(adapter->pdev, INFO,
518 		    "Requested new Tx buffer ring size: %d. Old size: %d\n",
519 		    val, adapter->buf_ring_size);
520 
521 		error = ena_update_buf_ring_size(adapter, val);
522 	} else {
523 		ena_log(adapter->pdev, ERR,
524 		    "New Tx buffer ring size is the same as already used: %u\n",
525 		    adapter->buf_ring_size);
526 	}
527 
528 unlock:
529 	ENA_LOCK_UNLOCK();
530 
531 	return (error);
532 }
533 
534 static int
535 ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS)
536 {
537 	struct ena_adapter *adapter = arg1;
538 	uint32_t val;
539 	int error;
540 
541 	ENA_LOCK_LOCK();
542 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
543 		error = EINVAL;
544 		goto unlock;
545 	}
546 
547 	val = 0;
548 	error = sysctl_wire_old_buffer(req, sizeof(val));
549 	if (error == 0) {
550 		val = adapter->requested_rx_ring_size;
551 		error = sysctl_handle_32(oidp, &val, 0, req);
552 	}
553 	if (error != 0 || req->newptr == NULL)
554 		goto unlock;
555 
556 	if (val < ENA_MIN_RING_SIZE || val > adapter->max_rx_ring_size) {
557 		ena_log(adapter->pdev, ERR,
558 		    "Requested new Rx queue size (%u) is out of range: [%u, %u]\n",
559 		    val, ENA_MIN_RING_SIZE, adapter->max_rx_ring_size);
560 		error = EINVAL;
561 		goto unlock;
562 	}
563 
564 	/* Check if the parameter is power of 2 */
565 	if (!powerof2(val)) {
566 		ena_log(adapter->pdev, ERR,
567 		    "Requested new Rx queue size (%u) is not a power of 2\n",
568 		    val);
569 		error = EINVAL;
570 		goto unlock;
571 	}
572 
573 	if (val != adapter->requested_rx_ring_size) {
574 		ena_log(adapter->pdev, INFO,
575 		    "Requested new Rx queue size: %u. Old size: %u\n", val,
576 		    adapter->requested_rx_ring_size);
577 
578 		error = ena_update_queue_size(adapter,
579 		    adapter->requested_tx_ring_size, val);
580 	} else {
581 		ena_log(adapter->pdev, ERR,
582 		    "New Rx queue size is the same as already used: %u\n",
583 		    adapter->requested_rx_ring_size);
584 	}
585 
586 unlock:
587 	ENA_LOCK_UNLOCK();
588 
589 	return (error);
590 }
591 
592 /*
593  * Change number of effectively used IO queues adapter->num_io_queues
594  */
595 static int
596 ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS)
597 {
598 	struct ena_adapter *adapter = arg1;
599 	uint32_t old_num_queues, tmp = 0;
600 	int error;
601 
602 	ENA_LOCK_LOCK();
603 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
604 		error = EINVAL;
605 		goto unlock;
606 	}
607 
608 	error = sysctl_wire_old_buffer(req, sizeof(tmp));
609 	if (error == 0) {
610 		tmp = adapter->num_io_queues;
611 		error = sysctl_handle_int(oidp, &tmp, 0, req);
612 	}
613 	if (error != 0 || req->newptr == NULL)
614 		goto unlock;
615 
616 	if (tmp == 0) {
617 		ena_log(adapter->pdev, ERR,
618 		    "Requested number of IO queues is zero\n");
619 		error = EINVAL;
620 		goto unlock;
621 	}
622 
623 	/*
624 	 * The adapter::max_num_io_queues is the HW capability. The system
625 	 * resources availability may potentially be a tighter limit. Therefore
626 	 * the relation `adapter::max_num_io_queues >= adapter::msix_vecs`
627 	 * always holds true, while the `adapter::msix_vecs` is variable across
628 	 * device reset (`ena_destroy_device()` + `ena_restore_device()`).
629 	 */
630 	if (tmp > (adapter->msix_vecs - ENA_ADMIN_MSIX_VEC)) {
631 		ena_log(adapter->pdev, ERR,
632 		    "Requested number of IO queues is higher than maximum allowed (%u)\n",
633 		    adapter->msix_vecs - ENA_ADMIN_MSIX_VEC);
634 		error = EINVAL;
635 		goto unlock;
636 	}
637 	if (tmp == adapter->num_io_queues) {
638 		ena_log(adapter->pdev, ERR,
639 		    "Requested number of IO queues is equal to current value "
640 		    "(%u)\n",
641 		    adapter->num_io_queues);
642 	} else {
643 		ena_log(adapter->pdev, INFO,
644 		    "Requested new number of IO queues: %u, current value: "
645 		    "%u\n",
646 		    tmp, adapter->num_io_queues);
647 
648 		old_num_queues = adapter->num_io_queues;
649 		error = ena_update_io_queue_nb(adapter, tmp);
650 		if (error != 0)
651 			return (error);
652 
653 		ena_sysctl_update_queue_node_nb(adapter, old_num_queues, tmp);
654 	}
655 
656 unlock:
657 	ENA_LOCK_UNLOCK();
658 
659 	return (error);
660 }
661 
662 static int
663 ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS)
664 {
665 	struct ena_adapter *adapter = arg1;
666 	uint16_t interval;
667 	int error;
668 
669 	ENA_LOCK_LOCK();
670 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
671 		error = EINVAL;
672 		goto unlock;
673 	}
674 
675 	error = sysctl_wire_old_buffer(req, sizeof(interval));
676 	if (error == 0) {
677 		interval = adapter->eni_metrics_sample_interval;
678 		error = sysctl_handle_16(oidp, &interval, 0, req);
679 	}
680 	if (error != 0 || req->newptr == NULL)
681 		goto unlock;
682 
683 	if (interval > ENI_METRICS_MAX_SAMPLE_INTERVAL) {
684 		ena_log(adapter->pdev, ERR,
685 		    "ENI metrics update interval is out of range - maximum allowed value: %d seconds\n",
686 		    ENI_METRICS_MAX_SAMPLE_INTERVAL);
687 		error = EINVAL;
688 		goto unlock;
689 	}
690 
691 	if (interval == 0) {
692 		ena_log(adapter->pdev, INFO,
693 		    "ENI metrics update is now turned off\n");
694 		bzero(&adapter->eni_metrics, sizeof(adapter->eni_metrics));
695 	} else {
696 		ena_log(adapter->pdev, INFO,
697 		    "ENI metrics update interval is set to: %" PRIu16
698 		    " seconds\n",
699 		    interval);
700 	}
701 
702 	adapter->eni_metrics_sample_interval = interval;
703 
704 unlock:
705 	ENA_LOCK_UNLOCK();
706 
707 	return (0);
708 }
709 
710 #ifndef RSS
711 /*
712  * Change the Receive Side Scaling hash key.
713  */
714 static int
715 ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS)
716 {
717 	struct ena_adapter *adapter = arg1;
718 	struct ena_com_dev *ena_dev = adapter->ena_dev;
719 	enum ena_admin_hash_functions ena_func;
720 	char msg[ENA_HASH_KEY_MSG_SIZE];
721 	char elem[3] = { 0 };
722 	char *endp;
723 	u8 rss_key[ENA_HASH_KEY_SIZE];
724 	int error, i;
725 
726 	ENA_LOCK_LOCK();
727 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
728 		error = EINVAL;
729 		goto unlock;
730 	}
731 
732 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) {
733 		error = ENOTSUP;
734 		goto unlock;
735 	}
736 
737 	error = sysctl_wire_old_buffer(req, sizeof(msg));
738 	if (error != 0)
739 		goto unlock;
740 
741 	error = ena_com_get_hash_function(adapter->ena_dev, &ena_func);
742 	if (error != 0) {
743 		device_printf(adapter->pdev, "Cannot get hash function\n");
744 		goto unlock;
745 	}
746 
747 	if (ena_func != ENA_ADMIN_TOEPLITZ) {
748 		error = EINVAL;
749 		device_printf(adapter->pdev, "Unsupported hash algorithm\n");
750 		goto unlock;
751 	}
752 
753 	error = ena_rss_get_hash_key(ena_dev, rss_key);
754 	if (error != 0) {
755 		device_printf(adapter->pdev, "Cannot get hash key\n");
756 		goto unlock;
757 	}
758 
759 	for (i = 0; i < ENA_HASH_KEY_SIZE; ++i)
760 		snprintf(&msg[i * 2], 3, "%02x", rss_key[i]);
761 
762 	error = sysctl_handle_string(oidp, msg, sizeof(msg), req);
763 	if (error != 0 || req->newptr == NULL)
764 		goto unlock;
765 
766 	if (strlen(msg) != sizeof(msg) - 1) {
767 		error = EINVAL;
768 		device_printf(adapter->pdev, "Invalid key size\n");
769 		goto unlock;
770 	}
771 
772 	for (i = 0; i < ENA_HASH_KEY_SIZE; ++i) {
773 		strncpy(elem, &msg[i * 2], 2);
774 		rss_key[i] = strtol(elem, &endp, 16);
775 
776 		/* Both hex nibbles in the string must be valid to continue. */
777 		if (endp == elem || *endp != '\0' || rss_key[i] < 0) {
778 			error = EINVAL;
779 			device_printf(adapter->pdev,
780 			    "Invalid key hex value: '%c'\n", *endp);
781 			goto unlock;
782 		}
783 	}
784 
785 	error = ena_rss_set_hash(ena_dev, rss_key);
786 	if (error != 0)
787 		device_printf(adapter->pdev, "Cannot fill hash key\n");
788 
789 unlock:
790 	ENA_LOCK_UNLOCK();
791 
792 	return (error);
793 }
794 
795 /*
796  * Change the Receive Side Scaling indirection table.
797  *
798  * The sysctl entry string consists of one or more `x:y` keypairs, where
799  * x stands for the table index and y for its new value.
800  * Table indices that don't need to be updated can be omitted from the string
801  * and will retain their existing values. If an index is entered more than once,
802  * the last value is used.
803  *
804  * Example:
805  * To update two selected indices in the RSS indirection table, e.g. setting
806  * index 0 to queue 5 and then index 5 to queue 0, the below command should be
807  * used:
808  *   sysctl dev.ena.0.rss.indir_table="0:5 5:0"
809  */
810 static int
811 ena_sysctl_rss_indir_table(SYSCTL_HANDLER_ARGS)
812 {
813 	int num_queues, error;
814 	struct ena_adapter *adapter = arg1;
815 	struct ena_indir *indir;
816 	char *msg, *buf, *endp;
817 	uint32_t idx, value;
818 
819 	ENA_LOCK_LOCK();
820 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
821 		error = EINVAL;
822 		goto unlock;
823 	}
824 
825 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) {
826 		error = ENOTSUP;
827 		goto unlock;
828 	}
829 
830 	indir = adapter->rss_indir;
831 	msg = indir->sysctl_buf;
832 
833 	if (unlikely(indir == NULL)) {
834 		error = ENOTSUP;
835 		goto unlock;
836 	}
837 
838 	error = sysctl_handle_string(oidp, msg, sizeof(indir->sysctl_buf), req);
839 	if (error != 0 || req->newptr == NULL)
840 		goto unlock;
841 
842 	num_queues = adapter->num_io_queues;
843 
844 	/*
845 	 * This sysctl expects msg to be a list of `x:y` record pairs,
846 	 * where x is the indirection table index and y is its value.
847 	 */
848 	for (buf = msg; *buf != '\0'; buf = endp) {
849 		idx = strtol(buf, &endp, 10);
850 
851 		if (endp == buf || idx < 0) {
852 			device_printf(adapter->pdev, "Invalid index: %s\n",
853 			    buf);
854 			error = EINVAL;
855 			break;
856 		}
857 
858 		if (idx >= ENA_RX_RSS_TABLE_SIZE) {
859 			device_printf(adapter->pdev, "Index %d out of range\n",
860 			    idx);
861 			error = ERANGE;
862 			break;
863 		}
864 
865 		buf = endp;
866 
867 		if (*buf++ != ':') {
868 			device_printf(adapter->pdev, "Missing ':' separator\n");
869 			error = EINVAL;
870 			break;
871 		}
872 
873 		value = strtol(buf, &endp, 10);
874 
875 		if (endp == buf || value < 0) {
876 			device_printf(adapter->pdev, "Invalid value: %s\n",
877 			    buf);
878 			error = EINVAL;
879 			break;
880 		}
881 
882 		if (value >= num_queues) {
883 			device_printf(adapter->pdev, "Value %d out of range\n",
884 			    value);
885 			error = ERANGE;
886 			break;
887 		}
888 
889 		indir->table[idx] = value;
890 	}
891 
892 	if (error != 0) /* Reload indirection table with last good data. */
893 		ena_rss_indir_get(adapter, indir->table);
894 
895 	/* At this point msg has been clobbered by sysctl_handle_string. */
896 	ena_rss_copy_indir_buf(msg, indir->table);
897 
898 	if (error == 0)
899 		error = ena_rss_indir_set(adapter, indir->table);
900 
901 unlock:
902 	ENA_LOCK_UNLOCK();
903 
904 	return (error);
905 }
906 #endif /* RSS */
907