xref: /linux/drivers/net/ethernet/google/gve/gve_main.c (revision e7e86d7697c6ed1dbbde18d7185c35b6967945ed)
1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
3  *
4  * Copyright (C) 2015-2024 Google LLC
5  */
6 
7 #include <linux/bitmap.h>
8 #include <linux/bpf.h>
9 #include <linux/cpumask.h>
10 #include <linux/etherdevice.h>
11 #include <linux/filter.h>
12 #include <linux/interrupt.h>
13 #include <linux/irq.h>
14 #include <linux/module.h>
15 #include <linux/pci.h>
16 #include <linux/sched.h>
17 #include <linux/timer.h>
18 #include <linux/workqueue.h>
19 #include <linux/utsname.h>
20 #include <linux/version.h>
21 #include <net/netdev_queues.h>
22 #include <net/sch_generic.h>
23 #include <net/xdp_sock_drv.h>
24 #include "gve.h"
25 #include "gve_dqo.h"
26 #include "gve_adminq.h"
27 #include "gve_register.h"
28 #include "gve_utils.h"
29 
/* Default RX copybreak value (presumably a byte threshold - confirm against
 * its users, which are not visible here).
 */
#define GVE_DEFAULT_RX_COPYBREAK	(256)

/* Message-level mask combining the driver and link message classes. */
#define DEFAULT_MSG_LEVEL	(NETIF_MSG_DRV | NETIF_MSG_LINK)

/* Driver version string and the prefix string that accompanies it. */
#define GVE_VERSION		"1.0.0"
#define GVE_VERSION_PREFIX	"GVE-"

/* Minimum amount of time between queue kicks in msec (10 seconds). */
#define MIN_TX_TIMEOUT_GAP (10 * 1000)

/* Name and version strings; the first two have external linkage. */
char gve_driver_name[] = "gve";
const char gve_version_str[] = GVE_VERSION;
static const char gve_version_prefix[] = GVE_VERSION_PREFIX;
42 
43 static int gve_verify_driver_compatibility(struct gve_priv *priv)
44 {
45 	int err;
46 	struct gve_driver_info *driver_info;
47 	dma_addr_t driver_info_bus;
48 
49 	driver_info = dma_alloc_coherent(&priv->pdev->dev,
50 					 sizeof(struct gve_driver_info),
51 					 &driver_info_bus, GFP_KERNEL);
52 	if (!driver_info)
53 		return -ENOMEM;
54 
55 	*driver_info = (struct gve_driver_info) {
56 		.os_type = 1, /* Linux */
57 		.os_version_major = cpu_to_be32(LINUX_VERSION_MAJOR),
58 		.os_version_minor = cpu_to_be32(LINUX_VERSION_SUBLEVEL),
59 		.os_version_sub = cpu_to_be32(LINUX_VERSION_PATCHLEVEL),
60 		.driver_capability_flags = {
61 			cpu_to_be64(GVE_DRIVER_CAPABILITY_FLAGS1),
62 			cpu_to_be64(GVE_DRIVER_CAPABILITY_FLAGS2),
63 			cpu_to_be64(GVE_DRIVER_CAPABILITY_FLAGS3),
64 			cpu_to_be64(GVE_DRIVER_CAPABILITY_FLAGS4),
65 		},
66 	};
67 	strscpy(driver_info->os_version_str1, utsname()->release,
68 		sizeof(driver_info->os_version_str1));
69 	strscpy(driver_info->os_version_str2, utsname()->version,
70 		sizeof(driver_info->os_version_str2));
71 
72 	err = gve_adminq_verify_driver_compatibility(priv,
73 						     sizeof(struct gve_driver_info),
74 						     driver_info_bus);
75 
76 	/* It's ok if the device doesn't support this */
77 	if (err == -EOPNOTSUPP)
78 		err = 0;
79 
80 	dma_free_coherent(&priv->pdev->dev,
81 			  sizeof(struct gve_driver_info),
82 			  driver_info, driver_info_bus);
83 	return err;
84 }
85 
86 static netdev_features_t gve_features_check(struct sk_buff *skb,
87 					    struct net_device *dev,
88 					    netdev_features_t features)
89 {
90 	struct gve_priv *priv = netdev_priv(dev);
91 
92 	if (!gve_is_gqi(priv))
93 		return gve_features_check_dqo(skb, dev, features);
94 
95 	return features;
96 }
97 
98 static netdev_tx_t gve_start_xmit(struct sk_buff *skb, struct net_device *dev)
99 {
100 	struct gve_priv *priv = netdev_priv(dev);
101 
102 	if (gve_is_gqi(priv))
103 		return gve_tx(skb, dev);
104 	else
105 		return gve_tx_dqo(skb, dev);
106 }
107 
108 static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
109 {
110 	struct gve_priv *priv = netdev_priv(dev);
111 	unsigned int start;
112 	u64 packets, bytes;
113 	int num_tx_queues;
114 	int ring;
115 
116 	num_tx_queues = gve_num_tx_queues(priv);
117 	if (priv->rx) {
118 		for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {
119 			do {
120 				start =
121 				  u64_stats_fetch_begin(&priv->rx[ring].statss);
122 				packets = priv->rx[ring].rpackets;
123 				bytes = priv->rx[ring].rbytes;
124 			} while (u64_stats_fetch_retry(&priv->rx[ring].statss,
125 						       start));
126 			s->rx_packets += packets;
127 			s->rx_bytes += bytes;
128 		}
129 	}
130 	if (priv->tx) {
131 		for (ring = 0; ring < num_tx_queues; ring++) {
132 			do {
133 				start =
134 				  u64_stats_fetch_begin(&priv->tx[ring].statss);
135 				packets = priv->tx[ring].pkt_done;
136 				bytes = priv->tx[ring].bytes_done;
137 			} while (u64_stats_fetch_retry(&priv->tx[ring].statss,
138 						       start));
139 			s->tx_packets += packets;
140 			s->tx_bytes += bytes;
141 		}
142 	}
143 }
144 
145 static int gve_alloc_flow_rule_caches(struct gve_priv *priv)
146 {
147 	struct gve_flow_rules_cache *flow_rules_cache = &priv->flow_rules_cache;
148 	int err = 0;
149 
150 	if (!priv->max_flow_rules)
151 		return 0;
152 
153 	flow_rules_cache->rules_cache =
154 		kvcalloc(GVE_FLOW_RULES_CACHE_SIZE, sizeof(*flow_rules_cache->rules_cache),
155 			 GFP_KERNEL);
156 	if (!flow_rules_cache->rules_cache) {
157 		dev_err(&priv->pdev->dev, "Cannot alloc flow rules cache\n");
158 		return -ENOMEM;
159 	}
160 
161 	flow_rules_cache->rule_ids_cache =
162 		kvcalloc(GVE_FLOW_RULE_IDS_CACHE_SIZE, sizeof(*flow_rules_cache->rule_ids_cache),
163 			 GFP_KERNEL);
164 	if (!flow_rules_cache->rule_ids_cache) {
165 		dev_err(&priv->pdev->dev, "Cannot alloc flow rule ids cache\n");
166 		err = -ENOMEM;
167 		goto free_rules_cache;
168 	}
169 
170 	return 0;
171 
172 free_rules_cache:
173 	kvfree(flow_rules_cache->rules_cache);
174 	flow_rules_cache->rules_cache = NULL;
175 	return err;
176 }
177 
178 static void gve_free_flow_rule_caches(struct gve_priv *priv)
179 {
180 	struct gve_flow_rules_cache *flow_rules_cache = &priv->flow_rules_cache;
181 
182 	kvfree(flow_rules_cache->rule_ids_cache);
183 	flow_rules_cache->rule_ids_cache = NULL;
184 	kvfree(flow_rules_cache->rules_cache);
185 	flow_rules_cache->rules_cache = NULL;
186 }
187 
188 static int gve_alloc_rss_config_cache(struct gve_priv *priv)
189 {
190 	struct gve_rss_config *rss_config = &priv->rss_config;
191 
192 	if (!priv->cache_rss_config)
193 		return 0;
194 
195 	rss_config->hash_key = kcalloc(priv->rss_key_size,
196 				       sizeof(rss_config->hash_key[0]),
197 				       GFP_KERNEL);
198 	if (!rss_config->hash_key)
199 		return -ENOMEM;
200 
201 	rss_config->hash_lut = kcalloc(priv->rss_lut_size,
202 				       sizeof(rss_config->hash_lut[0]),
203 				       GFP_KERNEL);
204 	if (!rss_config->hash_lut)
205 		goto free_rss_key_cache;
206 
207 	return 0;
208 
209 free_rss_key_cache:
210 	kfree(rss_config->hash_key);
211 	rss_config->hash_key = NULL;
212 	return -ENOMEM;
213 }
214 
215 static void gve_free_rss_config_cache(struct gve_priv *priv)
216 {
217 	struct gve_rss_config *rss_config = &priv->rss_config;
218 
219 	kfree(rss_config->hash_key);
220 	kfree(rss_config->hash_lut);
221 
222 	memset(rss_config, 0, sizeof(*rss_config));
223 }
224 
225 static int gve_alloc_counter_array(struct gve_priv *priv)
226 {
227 	priv->counter_array =
228 		dma_alloc_coherent(&priv->pdev->dev,
229 				   priv->num_event_counters *
230 				   sizeof(*priv->counter_array),
231 				   &priv->counter_array_bus, GFP_KERNEL);
232 	if (!priv->counter_array)
233 		return -ENOMEM;
234 
235 	return 0;
236 }
237 
238 static void gve_free_counter_array(struct gve_priv *priv)
239 {
240 	if (!priv->counter_array)
241 		return;
242 
243 	dma_free_coherent(&priv->pdev->dev,
244 			  priv->num_event_counters *
245 			  sizeof(*priv->counter_array),
246 			  priv->counter_array, priv->counter_array_bus);
247 	priv->counter_array = NULL;
248 }
249 
250 /* NIC requests to report stats */
251 static void gve_stats_report_task(struct work_struct *work)
252 {
253 	struct gve_priv *priv = container_of(work, struct gve_priv,
254 					     stats_report_task);
255 	if (gve_get_do_report_stats(priv)) {
256 		gve_handle_report_stats(priv);
257 		gve_clear_do_report_stats(priv);
258 	}
259 }
260 
261 static void gve_stats_report_schedule(struct gve_priv *priv)
262 {
263 	if (!gve_get_probe_in_progress(priv) &&
264 	    !gve_get_reset_in_progress(priv)) {
265 		gve_set_do_report_stats(priv);
266 		queue_work(priv->gve_wq, &priv->stats_report_task);
267 	}
268 }
269 
270 static void gve_stats_report_timer(struct timer_list *t)
271 {
272 	struct gve_priv *priv = timer_container_of(priv, t,
273 						   stats_report_timer);
274 
275 	mod_timer(&priv->stats_report_timer,
276 		  round_jiffies(jiffies +
277 		  msecs_to_jiffies(priv->stats_report_timer_period)));
278 	gve_stats_report_schedule(priv);
279 }
280 
281 static int gve_alloc_stats_report(struct gve_priv *priv)
282 {
283 	int tx_stats_num, rx_stats_num;
284 
285 	tx_stats_num = (GVE_TX_STATS_REPORT_NUM + NIC_TX_STATS_REPORT_NUM) *
286 		       gve_num_tx_queues(priv);
287 	rx_stats_num = (GVE_RX_STATS_REPORT_NUM + NIC_RX_STATS_REPORT_NUM) *
288 		       priv->rx_cfg.num_queues;
289 	priv->stats_report_len = struct_size(priv->stats_report, stats,
290 					     size_add(tx_stats_num, rx_stats_num));
291 	priv->stats_report =
292 		dma_alloc_coherent(&priv->pdev->dev, priv->stats_report_len,
293 				   &priv->stats_report_bus, GFP_KERNEL);
294 	if (!priv->stats_report)
295 		return -ENOMEM;
296 	/* Set up timer for the report-stats task */
297 	timer_setup(&priv->stats_report_timer, gve_stats_report_timer, 0);
298 	priv->stats_report_timer_period = GVE_STATS_REPORT_TIMER_PERIOD;
299 	return 0;
300 }
301 
302 static void gve_free_stats_report(struct gve_priv *priv)
303 {
304 	if (!priv->stats_report)
305 		return;
306 
307 	timer_delete_sync(&priv->stats_report_timer);
308 	dma_free_coherent(&priv->pdev->dev, priv->stats_report_len,
309 			  priv->stats_report, priv->stats_report_bus);
310 	priv->stats_report = NULL;
311 }
312 
313 static irqreturn_t gve_mgmnt_intr(int irq, void *arg)
314 {
315 	struct gve_priv *priv = arg;
316 
317 	queue_work(priv->gve_wq, &priv->service_task);
318 	return IRQ_HANDLED;
319 }
320 
321 static irqreturn_t gve_intr(int irq, void *arg)
322 {
323 	struct gve_notify_block *block = arg;
324 	struct gve_priv *priv = block->priv;
325 
326 	iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block));
327 	napi_schedule_irqoff(&block->napi);
328 	return IRQ_HANDLED;
329 }
330 
331 static irqreturn_t gve_intr_dqo(int irq, void *arg)
332 {
333 	struct gve_notify_block *block = arg;
334 
335 	/* Interrupts are automatically masked */
336 	napi_schedule_irqoff(&block->napi);
337 	return IRQ_HANDLED;
338 }
339 
340 static int gve_is_napi_on_home_cpu(struct gve_priv *priv, u32 irq)
341 {
342 	int cpu_curr = smp_processor_id();
343 	const struct cpumask *aff_mask;
344 
345 	aff_mask = irq_get_effective_affinity_mask(irq);
346 	if (unlikely(!aff_mask))
347 		return 1;
348 
349 	return cpumask_test_cpu(cpu_curr, aff_mask);
350 }
351 
352 int gve_napi_poll(struct napi_struct *napi, int budget)
353 {
354 	struct gve_notify_block *block;
355 	__be32 __iomem *irq_doorbell;
356 	bool reschedule = false;
357 	struct gve_priv *priv;
358 	int work_done = 0;
359 
360 	block = container_of(napi, struct gve_notify_block, napi);
361 	priv = block->priv;
362 
363 	if (block->tx) {
364 		if (block->tx->q_num < priv->tx_cfg.num_queues)
365 			reschedule |= gve_tx_poll(block, budget);
366 		else if (budget)
367 			reschedule |= gve_xdp_poll(block, budget);
368 	}
369 
370 	if (!budget)
371 		return 0;
372 
373 	if (block->rx) {
374 		work_done = gve_rx_poll(block, budget);
375 
376 		/* Poll XSK TX as part of RX NAPI. Setup re-poll based on max of
377 		 * TX and RX work done.
378 		 */
379 		if (priv->xdp_prog)
380 			work_done = max_t(int, work_done,
381 					  gve_xsk_tx_poll(block, budget));
382 
383 		reschedule |= work_done == budget;
384 	}
385 
386 	if (reschedule)
387 		return budget;
388 
389        /* Complete processing - don't unmask irq if busy polling is enabled */
390 	if (likely(napi_complete_done(napi, work_done))) {
391 		irq_doorbell = gve_irq_doorbell(priv, block);
392 		iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell);
393 
394 		/* Ensure IRQ ACK is visible before we check pending work.
395 		 * If queue had issued updates, it would be truly visible.
396 		 */
397 		mb();
398 
399 		if (block->tx)
400 			reschedule |= gve_tx_clean_pending(priv, block->tx);
401 		if (block->rx)
402 			reschedule |= gve_rx_work_pending(block->rx);
403 
404 		if (reschedule && napi_schedule(napi))
405 			iowrite32be(GVE_IRQ_MASK, irq_doorbell);
406 	}
407 	return work_done;
408 }
409 
410 int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
411 {
412 	struct gve_notify_block *block =
413 		container_of(napi, struct gve_notify_block, napi);
414 	struct gve_priv *priv = block->priv;
415 	bool reschedule = false;
416 	int work_done = 0;
417 
418 	if (block->tx) {
419 		if (block->tx->q_num < priv->tx_cfg.num_queues)
420 			reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true);
421 		else
422 			reschedule |= gve_xdp_poll_dqo(block);
423 	}
424 
425 	if (!budget)
426 		return 0;
427 
428 	if (block->rx) {
429 		work_done = gve_rx_poll_dqo(block, budget);
430 
431 		/* Poll XSK TX as part of RX NAPI. Setup re-poll based on if
432 		 * either datapath has more work to do.
433 		 */
434 		if (priv->xdp_prog)
435 			reschedule |= gve_xsk_tx_poll_dqo(block, budget);
436 		reschedule |= work_done == budget;
437 	}
438 
439 	if (reschedule) {
440 		/* Reschedule by returning budget only if already on the correct
441 		 * cpu.
442 		 */
443 		if (likely(gve_is_napi_on_home_cpu(priv, block->irq)))
444 			return budget;
445 
446 		/* If not on the cpu with which this queue's irq has affinity
447 		 * with, we avoid rescheduling napi and arm the irq instead so
448 		 * that napi gets rescheduled back eventually onto the right
449 		 * cpu.
450 		 */
451 		if (work_done == budget)
452 			work_done--;
453 	}
454 
455 	if (likely(napi_complete_done(napi, work_done))) {
456 		/* Enable interrupts again.
457 		 *
458 		 * We don't need to repoll afterwards because HW supports the
459 		 * PCI MSI-X PBA feature.
460 		 *
461 		 * Another interrupt would be triggered if a new event came in
462 		 * since the last one.
463 		 */
464 		gve_write_irq_doorbell_dqo(priv, block,
465 					   GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
466 	}
467 
468 	return work_done;
469 }
470 
471 static const struct cpumask *gve_get_node_mask(struct gve_priv *priv)
472 {
473 	if (priv->numa_node == NUMA_NO_NODE)
474 		return cpu_all_mask;
475 	else
476 		return cpumask_of_node(priv->numa_node);
477 }
478 
479 static int gve_alloc_notify_blocks(struct gve_priv *priv)
480 {
481 	int num_vecs_requested = priv->num_ntfy_blks + 1;
482 	const struct cpumask *node_mask;
483 	unsigned int cur_cpu;
484 	int vecs_enabled;
485 	int i, j;
486 	int err;
487 
488 	priv->msix_vectors = kvcalloc(num_vecs_requested,
489 				      sizeof(*priv->msix_vectors), GFP_KERNEL);
490 	if (!priv->msix_vectors)
491 		return -ENOMEM;
492 	for (i = 0; i < num_vecs_requested; i++)
493 		priv->msix_vectors[i].entry = i;
494 	vecs_enabled = pci_enable_msix_range(priv->pdev, priv->msix_vectors,
495 					     GVE_MIN_MSIX, num_vecs_requested);
496 	if (vecs_enabled < 0) {
497 		dev_err(&priv->pdev->dev, "Could not enable min msix %d/%d\n",
498 			GVE_MIN_MSIX, vecs_enabled);
499 		err = vecs_enabled;
500 		goto abort_with_msix_vectors;
501 	}
502 	if (vecs_enabled != num_vecs_requested) {
503 		int new_num_ntfy_blks = (vecs_enabled - 1) & ~0x1;
504 		int vecs_per_type = new_num_ntfy_blks / 2;
505 		int vecs_left = new_num_ntfy_blks % 2;
506 
507 		priv->num_ntfy_blks = new_num_ntfy_blks;
508 		priv->mgmt_msix_idx = priv->num_ntfy_blks;
509 		priv->tx_cfg.max_queues = min_t(int, priv->tx_cfg.max_queues,
510 						vecs_per_type);
511 		priv->rx_cfg.max_queues = min_t(int, priv->rx_cfg.max_queues,
512 						vecs_per_type + vecs_left);
513 		dev_err(&priv->pdev->dev,
514 			"Could not enable desired msix, only enabled %d, adjusting tx max queues to %d, and rx max queues to %d\n",
515 			vecs_enabled, priv->tx_cfg.max_queues,
516 			priv->rx_cfg.max_queues);
517 		if (priv->tx_cfg.num_queues > priv->tx_cfg.max_queues)
518 			priv->tx_cfg.num_queues = priv->tx_cfg.max_queues;
519 		if (priv->rx_cfg.num_queues > priv->rx_cfg.max_queues)
520 			priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
521 	}
522 
523 	/* Setup Management Vector  - the last vector */
524 	snprintf(priv->mgmt_msix_name, sizeof(priv->mgmt_msix_name), "gve-mgmnt@pci:%s",
525 		 pci_name(priv->pdev));
526 	err = request_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector,
527 			  gve_mgmnt_intr, 0, priv->mgmt_msix_name, priv);
528 	if (err) {
529 		dev_err(&priv->pdev->dev, "Did not receive management vector.\n");
530 		goto abort_with_msix_enabled;
531 	}
532 	priv->irq_db_indices =
533 		dma_alloc_coherent(&priv->pdev->dev,
534 				   priv->num_ntfy_blks *
535 				   sizeof(*priv->irq_db_indices),
536 				   &priv->irq_db_indices_bus, GFP_KERNEL);
537 	if (!priv->irq_db_indices) {
538 		err = -ENOMEM;
539 		goto abort_with_mgmt_vector;
540 	}
541 
542 	priv->ntfy_blocks = kvzalloc(priv->num_ntfy_blks *
543 				     sizeof(*priv->ntfy_blocks), GFP_KERNEL);
544 	if (!priv->ntfy_blocks) {
545 		err = -ENOMEM;
546 		goto abort_with_irq_db_indices;
547 	}
548 
549 	/* Setup the other blocks - the first n-1 vectors */
550 	node_mask = gve_get_node_mask(priv);
551 	cur_cpu = cpumask_first(node_mask);
552 	for (i = 0; i < priv->num_ntfy_blks; i++) {
553 		struct gve_notify_block *block = &priv->ntfy_blocks[i];
554 		int msix_idx = i;
555 
556 		snprintf(block->name, sizeof(block->name), "gve-ntfy-blk%d@pci:%s",
557 			 i, pci_name(priv->pdev));
558 		block->priv = priv;
559 		err = request_irq(priv->msix_vectors[msix_idx].vector,
560 				  gve_is_gqi(priv) ? gve_intr : gve_intr_dqo,
561 				  0, block->name, block);
562 		if (err) {
563 			dev_err(&priv->pdev->dev,
564 				"Failed to receive msix vector %d\n", i);
565 			goto abort_with_some_ntfy_blocks;
566 		}
567 		block->irq = priv->msix_vectors[msix_idx].vector;
568 		irq_set_affinity_and_hint(block->irq,
569 					  cpumask_of(cur_cpu));
570 		block->irq_db_index = &priv->irq_db_indices[i].index;
571 
572 		cur_cpu = cpumask_next(cur_cpu, node_mask);
573 		/* Wrap once CPUs in the node have been exhausted, or when
574 		 * starting RX queue affinities. TX and RX queues of the same
575 		 * index share affinity.
576 		 */
577 		if (cur_cpu >= nr_cpu_ids || (i + 1) == priv->tx_cfg.max_queues)
578 			cur_cpu = cpumask_first(node_mask);
579 	}
580 	return 0;
581 abort_with_some_ntfy_blocks:
582 	for (j = 0; j < i; j++) {
583 		struct gve_notify_block *block = &priv->ntfy_blocks[j];
584 		int msix_idx = j;
585 
586 		irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
587 				      NULL);
588 		free_irq(priv->msix_vectors[msix_idx].vector, block);
589 		block->irq = 0;
590 	}
591 	kvfree(priv->ntfy_blocks);
592 	priv->ntfy_blocks = NULL;
593 abort_with_irq_db_indices:
594 	dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
595 			  sizeof(*priv->irq_db_indices),
596 			  priv->irq_db_indices, priv->irq_db_indices_bus);
597 	priv->irq_db_indices = NULL;
598 abort_with_mgmt_vector:
599 	free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);
600 abort_with_msix_enabled:
601 	pci_disable_msix(priv->pdev);
602 abort_with_msix_vectors:
603 	kvfree(priv->msix_vectors);
604 	priv->msix_vectors = NULL;
605 	return err;
606 }
607 
608 static void gve_free_notify_blocks(struct gve_priv *priv)
609 {
610 	int i;
611 
612 	if (!priv->msix_vectors)
613 		return;
614 
615 	/* Free the irqs */
616 	for (i = 0; i < priv->num_ntfy_blks; i++) {
617 		struct gve_notify_block *block = &priv->ntfy_blocks[i];
618 		int msix_idx = i;
619 
620 		irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
621 				      NULL);
622 		free_irq(priv->msix_vectors[msix_idx].vector, block);
623 		block->irq = 0;
624 	}
625 	free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);
626 	kvfree(priv->ntfy_blocks);
627 	priv->ntfy_blocks = NULL;
628 	dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
629 			  sizeof(*priv->irq_db_indices),
630 			  priv->irq_db_indices, priv->irq_db_indices_bus);
631 	priv->irq_db_indices = NULL;
632 	pci_disable_msix(priv->pdev);
633 	kvfree(priv->msix_vectors);
634 	priv->msix_vectors = NULL;
635 }
636 
637 static int gve_setup_device_resources(struct gve_priv *priv)
638 {
639 	int err;
640 
641 	err = gve_alloc_flow_rule_caches(priv);
642 	if (err)
643 		return err;
644 	err = gve_alloc_rss_config_cache(priv);
645 	if (err)
646 		goto abort_with_flow_rule_caches;
647 	err = gve_alloc_counter_array(priv);
648 	if (err)
649 		goto abort_with_rss_config_cache;
650 	err = gve_init_clock(priv);
651 	if (err)
652 		goto abort_with_counter;
653 	err = gve_alloc_notify_blocks(priv);
654 	if (err)
655 		goto abort_with_clock;
656 	err = gve_alloc_stats_report(priv);
657 	if (err)
658 		goto abort_with_ntfy_blocks;
659 	err = gve_adminq_configure_device_resources(priv,
660 						    priv->counter_array_bus,
661 						    priv->num_event_counters,
662 						    priv->irq_db_indices_bus,
663 						    priv->num_ntfy_blks);
664 	if (unlikely(err)) {
665 		dev_err(&priv->pdev->dev,
666 			"could not setup device_resources: err=%d\n", err);
667 		err = -ENXIO;
668 		goto abort_with_stats_report;
669 	}
670 
671 	if (!gve_is_gqi(priv)) {
672 		priv->ptype_lut_dqo = kvzalloc(sizeof(*priv->ptype_lut_dqo),
673 					       GFP_KERNEL);
674 		if (!priv->ptype_lut_dqo) {
675 			err = -ENOMEM;
676 			goto abort_with_stats_report;
677 		}
678 		err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo);
679 		if (err) {
680 			dev_err(&priv->pdev->dev,
681 				"Failed to get ptype map: err=%d\n", err);
682 			goto abort_with_ptype_lut;
683 		}
684 	}
685 
686 	err = gve_init_rss_config(priv, priv->rx_cfg.num_queues);
687 	if (err) {
688 		dev_err(&priv->pdev->dev, "Failed to init RSS config");
689 		goto abort_with_ptype_lut;
690 	}
691 
692 	err = gve_adminq_report_stats(priv, priv->stats_report_len,
693 				      priv->stats_report_bus,
694 				      GVE_STATS_REPORT_TIMER_PERIOD);
695 	if (err)
696 		dev_err(&priv->pdev->dev,
697 			"Failed to report stats: err=%d\n", err);
698 	gve_set_device_resources_ok(priv);
699 	return 0;
700 
701 abort_with_ptype_lut:
702 	kvfree(priv->ptype_lut_dqo);
703 	priv->ptype_lut_dqo = NULL;
704 abort_with_stats_report:
705 	gve_free_stats_report(priv);
706 abort_with_ntfy_blocks:
707 	gve_free_notify_blocks(priv);
708 abort_with_clock:
709 	gve_teardown_clock(priv);
710 abort_with_counter:
711 	gve_free_counter_array(priv);
712 abort_with_rss_config_cache:
713 	gve_free_rss_config_cache(priv);
714 abort_with_flow_rule_caches:
715 	gve_free_flow_rule_caches(priv);
716 
717 	return err;
718 }
719 
720 static void gve_trigger_reset(struct gve_priv *priv);
721 
722 static void gve_teardown_device_resources(struct gve_priv *priv)
723 {
724 	int err;
725 
726 	/* Tell device its resources are being freed */
727 	if (gve_get_device_resources_ok(priv)) {
728 		err = gve_flow_rules_reset(priv);
729 		if (err) {
730 			dev_err(&priv->pdev->dev,
731 				"Failed to reset flow rules: err=%d\n", err);
732 			gve_trigger_reset(priv);
733 		}
734 		/* detach the stats report */
735 		err = gve_adminq_report_stats(priv, 0, 0x0, GVE_STATS_REPORT_TIMER_PERIOD);
736 		if (err) {
737 			dev_err(&priv->pdev->dev,
738 				"Failed to detach stats report: err=%d\n", err);
739 			gve_trigger_reset(priv);
740 		}
741 		err = gve_adminq_deconfigure_device_resources(priv);
742 		if (err) {
743 			dev_err(&priv->pdev->dev,
744 				"Could not deconfigure device resources: err=%d\n",
745 				err);
746 			gve_trigger_reset(priv);
747 		}
748 	}
749 
750 	kvfree(priv->ptype_lut_dqo);
751 	priv->ptype_lut_dqo = NULL;
752 
753 	gve_free_flow_rule_caches(priv);
754 	gve_free_rss_config_cache(priv);
755 	gve_free_counter_array(priv);
756 	gve_free_notify_blocks(priv);
757 	gve_free_stats_report(priv);
758 	gve_teardown_clock(priv);
759 	gve_clear_device_resources_ok(priv);
760 }
761 
762 static int gve_unregister_qpl(struct gve_priv *priv,
763 			      struct gve_queue_page_list *qpl)
764 {
765 	int err;
766 
767 	if (!qpl)
768 		return 0;
769 
770 	err = gve_adminq_unregister_page_list(priv, qpl->id);
771 	if (err) {
772 		netif_err(priv, drv, priv->dev,
773 			  "Failed to unregister queue page list %d\n",
774 			  qpl->id);
775 		return err;
776 	}
777 
778 	priv->num_registered_pages -= qpl->num_entries;
779 	return 0;
780 }
781 
782 static int gve_register_qpl(struct gve_priv *priv,
783 			    struct gve_queue_page_list *qpl)
784 {
785 	int pages;
786 	int err;
787 
788 	if (!qpl)
789 		return 0;
790 
791 	pages = qpl->num_entries;
792 
793 	if (pages + priv->num_registered_pages > priv->max_registered_pages) {
794 		netif_err(priv, drv, priv->dev,
795 			  "Reached max number of registered pages %llu > %llu\n",
796 			  pages + priv->num_registered_pages,
797 			  priv->max_registered_pages);
798 		return -EINVAL;
799 	}
800 
801 	err = gve_adminq_register_page_list(priv, qpl);
802 	if (err) {
803 		netif_err(priv, drv, priv->dev,
804 			  "failed to register queue page list %d\n",
805 			  qpl->id);
806 		return err;
807 	}
808 
809 	priv->num_registered_pages += pages;
810 	return 0;
811 }
812 
813 static struct gve_queue_page_list *gve_tx_get_qpl(struct gve_priv *priv, int idx)
814 {
815 	struct gve_tx_ring *tx = &priv->tx[idx];
816 
817 	if (gve_is_gqi(priv))
818 		return tx->tx_fifo.qpl;
819 	else
820 		return tx->dqo.qpl;
821 }
822 
823 static struct gve_queue_page_list *gve_rx_get_qpl(struct gve_priv *priv, int idx)
824 {
825 	struct gve_rx_ring *rx = &priv->rx[idx];
826 
827 	if (gve_is_gqi(priv))
828 		return rx->data.qpl;
829 	else
830 		return rx->dqo.qpl;
831 }
832 
833 static int gve_register_qpls(struct gve_priv *priv)
834 {
835 	int num_tx_qpls, num_rx_qpls;
836 	int err;
837 	int i;
838 
839 	num_tx_qpls = gve_num_tx_qpls(&priv->tx_cfg, gve_is_qpl(priv));
840 	num_rx_qpls = gve_num_rx_qpls(&priv->rx_cfg, gve_is_qpl(priv));
841 
842 	for (i = 0; i < num_tx_qpls; i++) {
843 		err = gve_register_qpl(priv, gve_tx_get_qpl(priv, i));
844 		if (err)
845 			return err;
846 	}
847 
848 	for (i = 0; i < num_rx_qpls; i++) {
849 		err = gve_register_qpl(priv, gve_rx_get_qpl(priv, i));
850 		if (err)
851 			return err;
852 	}
853 
854 	return 0;
855 }
856 
857 static int gve_unregister_qpls(struct gve_priv *priv)
858 {
859 	int num_tx_qpls, num_rx_qpls;
860 	int err;
861 	int i;
862 
863 	num_tx_qpls = gve_num_tx_qpls(&priv->tx_cfg, gve_is_qpl(priv));
864 	num_rx_qpls = gve_num_rx_qpls(&priv->rx_cfg, gve_is_qpl(priv));
865 
866 	for (i = 0; i < num_tx_qpls; i++) {
867 		err = gve_unregister_qpl(priv, gve_tx_get_qpl(priv, i));
868 		/* This failure will trigger a reset - no need to clean */
869 		if (err)
870 			return err;
871 	}
872 
873 	for (i = 0; i < num_rx_qpls; i++) {
874 		err = gve_unregister_qpl(priv, gve_rx_get_qpl(priv, i));
875 		/* This failure will trigger a reset - no need to clean */
876 		if (err)
877 			return err;
878 	}
879 	return 0;
880 }
881 
882 static int gve_create_rings(struct gve_priv *priv)
883 {
884 	int num_tx_queues = gve_num_tx_queues(priv);
885 	int err;
886 	int i;
887 
888 	err = gve_adminq_create_tx_queues(priv, 0, num_tx_queues);
889 	if (err) {
890 		netif_err(priv, drv, priv->dev, "failed to create %d tx queues\n",
891 			  num_tx_queues);
892 		/* This failure will trigger a reset - no need to clean
893 		 * up
894 		 */
895 		return err;
896 	}
897 	netif_dbg(priv, drv, priv->dev, "created %d tx queues\n",
898 		  num_tx_queues);
899 
900 	err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues);
901 	if (err) {
902 		netif_err(priv, drv, priv->dev, "failed to create %d rx queues\n",
903 			  priv->rx_cfg.num_queues);
904 		/* This failure will trigger a reset - no need to clean
905 		 * up
906 		 */
907 		return err;
908 	}
909 	netif_dbg(priv, drv, priv->dev, "created %d rx queues\n",
910 		  priv->rx_cfg.num_queues);
911 
912 	if (gve_is_gqi(priv)) {
913 		/* Rx data ring has been prefilled with packet buffers at queue
914 		 * allocation time.
915 		 *
916 		 * Write the doorbell to provide descriptor slots and packet
917 		 * buffers to the NIC.
918 		 */
919 		for (i = 0; i < priv->rx_cfg.num_queues; i++)
920 			gve_rx_write_doorbell(priv, &priv->rx[i]);
921 	} else {
922 		for (i = 0; i < priv->rx_cfg.num_queues; i++) {
923 			/* Post buffers and ring doorbell. */
924 			gve_rx_post_buffers_dqo(&priv->rx[i]);
925 		}
926 	}
927 
928 	return 0;
929 }
930 
931 static void init_xdp_sync_stats(struct gve_priv *priv)
932 {
933 	int start_id = gve_xdp_tx_start_queue_id(priv);
934 	int i;
935 
936 	/* Init stats */
937 	for (i = start_id; i < start_id + priv->tx_cfg.num_xdp_queues; i++) {
938 		int ntfy_idx = gve_tx_idx_to_ntfy(priv, i);
939 
940 		u64_stats_init(&priv->tx[i].statss);
941 		priv->tx[i].ntfy_id = ntfy_idx;
942 	}
943 }
944 
945 static void gve_init_sync_stats(struct gve_priv *priv)
946 {
947 	int i;
948 
949 	for (i = 0; i < priv->tx_cfg.num_queues; i++)
950 		u64_stats_init(&priv->tx[i].statss);
951 
952 	/* Init stats for XDP TX queues */
953 	init_xdp_sync_stats(priv);
954 
955 	for (i = 0; i < priv->rx_cfg.num_queues; i++)
956 		u64_stats_init(&priv->rx[i].statss);
957 }
958 
959 static void gve_tx_get_curr_alloc_cfg(struct gve_priv *priv,
960 				      struct gve_tx_alloc_rings_cfg *cfg)
961 {
962 	cfg->qcfg = &priv->tx_cfg;
963 	cfg->raw_addressing = !gve_is_qpl(priv);
964 	cfg->ring_size = priv->tx_desc_cnt;
965 	cfg->num_xdp_rings = cfg->qcfg->num_xdp_queues;
966 	cfg->tx = priv->tx;
967 }
968 
969 static void gve_tx_stop_rings(struct gve_priv *priv, int num_rings)
970 {
971 	int i;
972 
973 	if (!priv->tx)
974 		return;
975 
976 	for (i = 0; i < num_rings; i++) {
977 		if (gve_is_gqi(priv))
978 			gve_tx_stop_ring_gqi(priv, i);
979 		else
980 			gve_tx_stop_ring_dqo(priv, i);
981 	}
982 }
983 
/* Start the first @num_rings TX rings using the routine matching the
 * queue format.
 */
static void gve_tx_start_rings(struct gve_priv *priv, int num_rings)
{
	int q;

	for (q = 0; q < num_rings; q++) {
		if (!gve_is_gqi(priv))
			gve_tx_start_ring_dqo(priv, q);
		else
			gve_tx_start_ring_gqi(priv, q);
	}
}
995 
/* Allocate memory for the TX rings and then the RX rings, using the
 * allocator matching the queue format.  If the RX allocation fails the TX
 * rings are freed again.
 *
 * Return: 0 on success, otherwise the allocator's error code.
 */
static int gve_queues_mem_alloc(struct gve_priv *priv,
				struct gve_tx_alloc_rings_cfg *tx_alloc_cfg,
				struct gve_rx_alloc_rings_cfg *rx_alloc_cfg)
{
	int ret;

	ret = gve_is_gqi(priv) ?
		gve_tx_alloc_rings_gqi(priv, tx_alloc_cfg) :
		gve_tx_alloc_rings_dqo(priv, tx_alloc_cfg);
	if (ret)
		return ret;

	ret = gve_is_gqi(priv) ?
		gve_rx_alloc_rings_gqi(priv, rx_alloc_cfg) :
		gve_rx_alloc_rings_dqo(priv, rx_alloc_cfg);
	if (!ret)
		return 0;

	/* RX allocation failed: undo the TX allocation. */
	if (gve_is_gqi(priv))
		gve_tx_free_rings_gqi(priv, tx_alloc_cfg);
	else
		gve_tx_free_rings_dqo(priv, tx_alloc_cfg);

	return ret;
}
1025 
1026 static int gve_destroy_rings(struct gve_priv *priv)
1027 {
1028 	int num_tx_queues = gve_num_tx_queues(priv);
1029 	int err;
1030 
1031 	err = gve_adminq_destroy_tx_queues(priv, 0, num_tx_queues);
1032 	if (err) {
1033 		netif_err(priv, drv, priv->dev,
1034 			  "failed to destroy tx queues\n");
1035 		/* This failure will trigger a reset - no need to clean up */
1036 		return err;
1037 	}
1038 	netif_dbg(priv, drv, priv->dev, "destroyed tx queues\n");
1039 	err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues);
1040 	if (err) {
1041 		netif_err(priv, drv, priv->dev,
1042 			  "failed to destroy rx queues\n");
1043 		/* This failure will trigger a reset - no need to clean up */
1044 		return err;
1045 	}
1046 	netif_dbg(priv, drv, priv->dev, "destroyed rx queues\n");
1047 	return 0;
1048 }
1049 
/* Free the TX and RX ring memory described by @tx_cfg and @rx_cfg, using
 * the routines matching the queue format.
 */
static void gve_queues_mem_free(struct gve_priv *priv,
				struct gve_tx_alloc_rings_cfg *tx_cfg,
				struct gve_rx_alloc_rings_cfg *rx_cfg)
{
	if (!gve_is_gqi(priv)) {
		gve_tx_free_rings_dqo(priv, tx_cfg);
		gve_rx_free_rings_dqo(priv, rx_cfg);
		return;
	}

	gve_tx_free_rings_gqi(priv, tx_cfg);
	gve_rx_free_rings_gqi(priv, rx_cfg);
}
1062 
1063 int gve_alloc_page(struct gve_priv *priv, struct device *dev,
1064 		   struct page **page, dma_addr_t *dma,
1065 		   enum dma_data_direction dir, gfp_t gfp_flags)
1066 {
1067 	*page = alloc_pages_node(priv->numa_node, gfp_flags, 0);
1068 	if (!*page) {
1069 		priv->page_alloc_fail++;
1070 		return -ENOMEM;
1071 	}
1072 	*dma = dma_map_page(dev, *page, 0, PAGE_SIZE, dir);
1073 	if (dma_mapping_error(dev, *dma)) {
1074 		priv->dma_mapping_error++;
1075 		put_page(*page);
1076 		return -ENOMEM;
1077 	}
1078 	return 0;
1079 }
1080 
1081 struct gve_queue_page_list *gve_alloc_queue_page_list(struct gve_priv *priv,
1082 						      u32 id, int pages)
1083 {
1084 	struct gve_queue_page_list *qpl;
1085 	int err;
1086 	int i;
1087 
1088 	qpl = kvzalloc(sizeof(*qpl), GFP_KERNEL);
1089 	if (!qpl)
1090 		return NULL;
1091 
1092 	qpl->id = id;
1093 	qpl->num_entries = 0;
1094 	qpl->pages = kvcalloc(pages, sizeof(*qpl->pages), GFP_KERNEL);
1095 	if (!qpl->pages)
1096 		goto abort;
1097 
1098 	qpl->page_buses = kvcalloc(pages, sizeof(*qpl->page_buses), GFP_KERNEL);
1099 	if (!qpl->page_buses)
1100 		goto abort;
1101 
1102 	for (i = 0; i < pages; i++) {
1103 		err = gve_alloc_page(priv, &priv->pdev->dev, &qpl->pages[i],
1104 				     &qpl->page_buses[i],
1105 				     gve_qpl_dma_dir(priv, id), GFP_KERNEL);
1106 		if (err)
1107 			goto abort;
1108 		qpl->num_entries++;
1109 	}
1110 
1111 	return qpl;
1112 
1113 abort:
1114 	gve_free_queue_page_list(priv, qpl, id);
1115 	return NULL;
1116 }
1117 
1118 void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma,
1119 		   enum dma_data_direction dir)
1120 {
1121 	if (!dma_mapping_error(dev, dma))
1122 		dma_unmap_page(dev, dma, PAGE_SIZE, dir);
1123 	if (page)
1124 		put_page(page);
1125 }
1126 
1127 void gve_free_queue_page_list(struct gve_priv *priv,
1128 			      struct gve_queue_page_list *qpl,
1129 			      u32 id)
1130 {
1131 	int i;
1132 
1133 	if (!qpl)
1134 		return;
1135 	if (!qpl->pages)
1136 		goto free_qpl;
1137 	if (!qpl->page_buses)
1138 		goto free_pages;
1139 
1140 	for (i = 0; i < qpl->num_entries; i++)
1141 		gve_free_page(&priv->pdev->dev, qpl->pages[i],
1142 			      qpl->page_buses[i], gve_qpl_dma_dir(priv, id));
1143 
1144 	kvfree(qpl->page_buses);
1145 	qpl->page_buses = NULL;
1146 free_pages:
1147 	kvfree(qpl->pages);
1148 	qpl->pages = NULL;
1149 free_qpl:
1150 	kvfree(qpl);
1151 }
1152 
1153 /* Use this to schedule a reset when the device is capable of continuing
1154  * to handle other requests in its current state. If it is not, do a reset
1155  * in thread instead.
1156  */
1157 void gve_schedule_reset(struct gve_priv *priv)
1158 {
1159 	gve_set_do_reset(priv);
1160 	queue_work(priv->gve_wq, &priv->service_task);
1161 }
1162 
1163 static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up);
1164 static int gve_reset_recovery(struct gve_priv *priv, bool was_up);
1165 static void gve_turndown(struct gve_priv *priv);
1166 static void gve_turnup(struct gve_priv *priv);
1167 
1168 static void gve_unreg_xsk_pool(struct gve_priv *priv, u16 qid)
1169 {
1170 	struct gve_rx_ring *rx;
1171 
1172 	if (!priv->rx)
1173 		return;
1174 
1175 	rx = &priv->rx[qid];
1176 	rx->xsk_pool = NULL;
1177 	if (xdp_rxq_info_is_reg(&rx->xdp_rxq))
1178 		xdp_rxq_info_unreg_mem_model(&rx->xdp_rxq);
1179 
1180 	if (!priv->tx)
1181 		return;
1182 	priv->tx[gve_xdp_tx_queue_id(priv, qid)].xsk_pool = NULL;
1183 }
1184 
1185 static int gve_reg_xsk_pool(struct gve_priv *priv, struct net_device *dev,
1186 			    struct xsk_buff_pool *pool, u16 qid)
1187 {
1188 	struct gve_rx_ring *rx;
1189 	u16 tx_qid;
1190 	int err;
1191 
1192 	rx = &priv->rx[qid];
1193 	err = xdp_rxq_info_reg_mem_model(&rx->xdp_rxq,
1194 					 MEM_TYPE_XSK_BUFF_POOL, pool);
1195 	if (err) {
1196 		gve_unreg_xsk_pool(priv, qid);
1197 		return err;
1198 	}
1199 
1200 	rx->xsk_pool = pool;
1201 
1202 	tx_qid = gve_xdp_tx_queue_id(priv, qid);
1203 	priv->tx[tx_qid].xsk_pool = pool;
1204 
1205 	return 0;
1206 }
1207 
1208 static void gve_unreg_xdp_info(struct gve_priv *priv)
1209 {
1210 	int i;
1211 
1212 	if (!priv->tx_cfg.num_xdp_queues || !priv->rx)
1213 		return;
1214 
1215 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
1216 		struct gve_rx_ring *rx = &priv->rx[i];
1217 
1218 		if (xdp_rxq_info_is_reg(&rx->xdp_rxq))
1219 			xdp_rxq_info_unreg(&rx->xdp_rxq);
1220 
1221 		gve_unreg_xsk_pool(priv, i);
1222 	}
1223 }
1224 
1225 static struct xsk_buff_pool *gve_get_xsk_pool(struct gve_priv *priv, int qid)
1226 {
1227 	if (!test_bit(qid, priv->xsk_pools))
1228 		return NULL;
1229 
1230 	return xsk_get_pool_from_qid(priv->dev, qid);
1231 }
1232 
1233 static int gve_reg_xdp_info(struct gve_priv *priv, struct net_device *dev)
1234 {
1235 	struct napi_struct *napi;
1236 	struct gve_rx_ring *rx;
1237 	int err = 0;
1238 	int i;
1239 
1240 	if (!priv->tx_cfg.num_xdp_queues)
1241 		return 0;
1242 
1243 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
1244 		struct xsk_buff_pool *xsk_pool;
1245 
1246 		rx = &priv->rx[i];
1247 		napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
1248 
1249 		err = xdp_rxq_info_reg(&rx->xdp_rxq, dev, i,
1250 				       napi->napi_id);
1251 		if (err)
1252 			goto err;
1253 
1254 		xsk_pool = gve_get_xsk_pool(priv, i);
1255 		if (xsk_pool)
1256 			err = gve_reg_xsk_pool(priv, dev, xsk_pool, i);
1257 		else if (gve_is_qpl(priv))
1258 			err = xdp_rxq_info_reg_mem_model(&rx->xdp_rxq,
1259 							 MEM_TYPE_PAGE_SHARED,
1260 							 NULL);
1261 		else
1262 			err = xdp_rxq_info_reg_mem_model(&rx->xdp_rxq,
1263 							 MEM_TYPE_PAGE_POOL,
1264 							 rx->dqo.page_pool);
1265 		if (err)
1266 			goto err;
1267 	}
1268 	return 0;
1269 
1270 err:
1271 	gve_unreg_xdp_info(priv);
1272 	return err;
1273 }
1274 
1275 
1276 static void gve_drain_page_cache(struct gve_priv *priv)
1277 {
1278 	int i;
1279 
1280 	for (i = 0; i < priv->rx_cfg.num_queues; i++)
1281 		page_frag_cache_drain(&priv->rx[i].page_cache);
1282 }
1283 
1284 static void gve_rx_get_curr_alloc_cfg(struct gve_priv *priv,
1285 				      struct gve_rx_alloc_rings_cfg *cfg)
1286 {
1287 	cfg->qcfg_rx = &priv->rx_cfg;
1288 	cfg->qcfg_tx = &priv->tx_cfg;
1289 	cfg->raw_addressing = !gve_is_qpl(priv);
1290 	cfg->enable_header_split = priv->header_split_enabled;
1291 	cfg->ring_size = priv->rx_desc_cnt;
1292 	cfg->packet_buffer_size = priv->rx_cfg.packet_buffer_size;
1293 	cfg->rx = priv->rx;
1294 	cfg->xdp = !!cfg->qcfg_tx->num_xdp_queues;
1295 }
1296 
/* Populate both the TX and the RX ring allocation configs from the
 * settings currently stored in priv.
 */
void gve_get_curr_alloc_cfgs(struct gve_priv *priv,
			     struct gve_tx_alloc_rings_cfg *tx_alloc_cfg,
			     struct gve_rx_alloc_rings_cfg *rx_alloc_cfg)
{
	/* TX first, then RX. */
	gve_tx_get_curr_alloc_cfg(priv, tx_alloc_cfg);
	gve_rx_get_curr_alloc_cfg(priv, rx_alloc_cfg);
}
1304 
/* Start RX ring @i using the routine for the active queue format. */
static void gve_rx_start_ring(struct gve_priv *priv, int i)
{
	if (!gve_is_gqi(priv)) {
		gve_rx_start_ring_dqo(priv, i);
		return;
	}

	gve_rx_start_ring_gqi(priv, i);
}
1312 
/* Start the first @num_rings RX rings, in index order. */
static void gve_rx_start_rings(struct gve_priv *priv, int num_rings)
{
	int ring;

	for (ring = 0; ring < num_rings; ring++)
		gve_rx_start_ring(priv, ring);
}
1320 
/* Stop RX ring @i using the routine for the active queue format. */
static void gve_rx_stop_ring(struct gve_priv *priv, int i)
{
	if (!gve_is_gqi(priv)) {
		gve_rx_stop_ring_dqo(priv, i);
		return;
	}

	gve_rx_stop_ring_gqi(priv, i);
}
1328 
1329 static void gve_rx_stop_rings(struct gve_priv *priv, int num_rings)
1330 {
1331 	int i;
1332 
1333 	if (!priv->rx)
1334 		return;
1335 
1336 	for (i = 0; i < num_rings; i++)
1337 		gve_rx_stop_ring(priv, i);
1338 }
1339 
1340 static void gve_queues_mem_remove(struct gve_priv *priv)
1341 {
1342 	struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0};
1343 	struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0};
1344 
1345 	gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg);
1346 	gve_queues_mem_free(priv, &tx_alloc_cfg, &rx_alloc_cfg);
1347 	priv->tx = NULL;
1348 	priv->rx = NULL;
1349 }
1350 
1351 /* The passed-in queue memory is stored into priv and the queues are made live.
1352  * No memory is allocated. Passed-in memory is freed on errors.
1353  */
1354 static int gve_queues_start(struct gve_priv *priv,
1355 			    struct gve_tx_alloc_rings_cfg *tx_alloc_cfg,
1356 			    struct gve_rx_alloc_rings_cfg *rx_alloc_cfg)
1357 {
1358 	struct net_device *dev = priv->dev;
1359 	int err;
1360 
1361 	/* Record new resources into priv */
1362 	priv->tx = tx_alloc_cfg->tx;
1363 	priv->rx = rx_alloc_cfg->rx;
1364 
1365 	/* Record new configs into priv */
1366 	priv->tx_cfg = *tx_alloc_cfg->qcfg;
1367 	priv->tx_cfg.num_xdp_queues = tx_alloc_cfg->num_xdp_rings;
1368 	priv->rx_cfg = *rx_alloc_cfg->qcfg_rx;
1369 	priv->tx_desc_cnt = tx_alloc_cfg->ring_size;
1370 	priv->rx_desc_cnt = rx_alloc_cfg->ring_size;
1371 
1372 	gve_tx_start_rings(priv, gve_num_tx_queues(priv));
1373 	gve_rx_start_rings(priv, rx_alloc_cfg->qcfg_rx->num_queues);
1374 	gve_init_sync_stats(priv);
1375 
1376 	err = netif_set_real_num_tx_queues(dev, priv->tx_cfg.num_queues);
1377 	if (err)
1378 		goto stop_and_free_rings;
1379 	err = netif_set_real_num_rx_queues(dev, priv->rx_cfg.num_queues);
1380 	if (err)
1381 		goto stop_and_free_rings;
1382 
1383 	err = gve_reg_xdp_info(priv, dev);
1384 	if (err)
1385 		goto stop_and_free_rings;
1386 
1387 	if (rx_alloc_cfg->reset_rss) {
1388 		err = gve_init_rss_config(priv, priv->rx_cfg.num_queues);
1389 		if (err)
1390 			goto reset;
1391 	}
1392 
1393 	err = gve_register_qpls(priv);
1394 	if (err)
1395 		goto reset;
1396 
1397 	priv->header_split_enabled = rx_alloc_cfg->enable_header_split;
1398 	priv->rx_cfg.packet_buffer_size = rx_alloc_cfg->packet_buffer_size;
1399 
1400 	err = gve_create_rings(priv);
1401 	if (err)
1402 		goto reset;
1403 
1404 	gve_set_device_rings_ok(priv);
1405 
1406 	if (gve_get_report_stats(priv))
1407 		mod_timer(&priv->stats_report_timer,
1408 			  round_jiffies(jiffies +
1409 				msecs_to_jiffies(priv->stats_report_timer_period)));
1410 
1411 	gve_turnup(priv);
1412 	queue_work(priv->gve_wq, &priv->service_task);
1413 	priv->interface_up_cnt++;
1414 	return 0;
1415 
1416 reset:
1417 	if (gve_get_reset_in_progress(priv))
1418 		goto stop_and_free_rings;
1419 	gve_reset_and_teardown(priv, true);
1420 	/* if this fails there is nothing we can do so just ignore the return */
1421 	gve_reset_recovery(priv, false);
1422 	/* return the original error */
1423 	return err;
1424 stop_and_free_rings:
1425 	gve_tx_stop_rings(priv, gve_num_tx_queues(priv));
1426 	gve_rx_stop_rings(priv, priv->rx_cfg.num_queues);
1427 	gve_queues_mem_remove(priv);
1428 	return err;
1429 }
1430 
1431 static int gve_open(struct net_device *dev)
1432 {
1433 	struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0};
1434 	struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0};
1435 	struct gve_priv *priv = netdev_priv(dev);
1436 	int err;
1437 
1438 	gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg);
1439 
1440 	err = gve_queues_mem_alloc(priv, &tx_alloc_cfg, &rx_alloc_cfg);
1441 	if (err)
1442 		return err;
1443 
1444 	/* No need to free on error: ownership of resources is lost after
1445 	 * calling gve_queues_start.
1446 	 */
1447 	err = gve_queues_start(priv, &tx_alloc_cfg, &rx_alloc_cfg);
1448 	if (err)
1449 		return err;
1450 
1451 	return 0;
1452 }
1453 
1454 static int gve_queues_stop(struct gve_priv *priv)
1455 {
1456 	int err;
1457 
1458 	netif_carrier_off(priv->dev);
1459 	if (gve_get_device_rings_ok(priv)) {
1460 		gve_turndown(priv);
1461 		gve_drain_page_cache(priv);
1462 		err = gve_destroy_rings(priv);
1463 		if (err)
1464 			goto err;
1465 		err = gve_unregister_qpls(priv);
1466 		if (err)
1467 			goto err;
1468 		gve_clear_device_rings_ok(priv);
1469 	}
1470 	timer_delete_sync(&priv->stats_report_timer);
1471 
1472 	gve_unreg_xdp_info(priv);
1473 
1474 	gve_tx_stop_rings(priv, gve_num_tx_queues(priv));
1475 	gve_rx_stop_rings(priv, priv->rx_cfg.num_queues);
1476 
1477 	priv->interface_down_cnt++;
1478 	return 0;
1479 
1480 err:
1481 	/* This must have been called from a reset due to the rtnl lock
1482 	 * so just return at this point.
1483 	 */
1484 	if (gve_get_reset_in_progress(priv))
1485 		return err;
1486 	/* Otherwise reset before returning */
1487 	gve_reset_and_teardown(priv, true);
1488 	return gve_reset_recovery(priv, false);
1489 }
1490 
/* ndo_stop handler: stop the queues and, only if that succeeded, free the
 * queue memory recorded in priv. Returns the result of gve_queues_stop().
 */
static int gve_close(struct net_device *dev)
{
	struct gve_priv *priv = netdev_priv(dev);
	int ret = gve_queues_stop(priv);

	if (!ret)
		gve_queues_mem_remove(priv);

	return ret;
}
1503 
1504 static void gve_handle_link_status(struct gve_priv *priv, bool link_status)
1505 {
1506 	if (!gve_get_napi_enabled(priv))
1507 		return;
1508 
1509 	if (link_status == netif_carrier_ok(priv->dev))
1510 		return;
1511 
1512 	if (link_status) {
1513 		netdev_info(priv->dev, "Device link is up.\n");
1514 		netif_carrier_on(priv->dev);
1515 	} else {
1516 		netdev_info(priv->dev, "Device link is down.\n");
1517 		netif_carrier_off(priv->dev);
1518 	}
1519 }
1520 
1521 static int gve_configure_rings_xdp(struct gve_priv *priv,
1522 				   u16 num_xdp_rings)
1523 {
1524 	struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0};
1525 	struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0};
1526 
1527 	gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg);
1528 	tx_alloc_cfg.num_xdp_rings = num_xdp_rings;
1529 
1530 	rx_alloc_cfg.xdp = !!num_xdp_rings;
1531 	return gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg);
1532 }
1533 
1534 static int gve_set_xdp(struct gve_priv *priv, struct bpf_prog *prog,
1535 		       struct netlink_ext_ack *extack)
1536 {
1537 	struct bpf_prog *old_prog;
1538 	int err = 0;
1539 	u32 status;
1540 
1541 	old_prog = READ_ONCE(priv->xdp_prog);
1542 	if (!netif_running(priv->dev)) {
1543 		WRITE_ONCE(priv->xdp_prog, prog);
1544 		if (old_prog)
1545 			bpf_prog_put(old_prog);
1546 
1547 		/* Update priv XDP queue configuration */
1548 		priv->tx_cfg.num_xdp_queues = priv->xdp_prog ?
1549 			priv->rx_cfg.num_queues : 0;
1550 		return 0;
1551 	}
1552 
1553 	if (!old_prog && prog)
1554 		err = gve_configure_rings_xdp(priv, priv->rx_cfg.num_queues);
1555 	else if (old_prog && !prog)
1556 		err = gve_configure_rings_xdp(priv, 0);
1557 
1558 	if (err)
1559 		goto out;
1560 
1561 	WRITE_ONCE(priv->xdp_prog, prog);
1562 	if (old_prog)
1563 		bpf_prog_put(old_prog);
1564 
1565 out:
1566 	status = ioread32be(&priv->reg_bar0->device_status);
1567 	gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status);
1568 	return err;
1569 }
1570 
1571 static int gve_xdp_xmit(struct net_device *dev, int n,
1572 			struct xdp_frame **frames, u32 flags)
1573 {
1574 	struct gve_priv *priv = netdev_priv(dev);
1575 
1576 	if (priv->queue_format == GVE_GQI_QPL_FORMAT)
1577 		return gve_xdp_xmit_gqi(dev, n, frames, flags);
1578 	else if (priv->queue_format == GVE_DQO_RDA_FORMAT)
1579 		return gve_xdp_xmit_dqo(dev, n, frames, flags);
1580 
1581 	return -EOPNOTSUPP;
1582 }
1583 
1584 static int gve_xsk_pool_enable(struct net_device *dev,
1585 			       struct xsk_buff_pool *pool,
1586 			       u16 qid)
1587 {
1588 	struct gve_priv *priv = netdev_priv(dev);
1589 	int err;
1590 
1591 	if (qid >= priv->rx_cfg.num_queues) {
1592 		dev_err(&priv->pdev->dev, "xsk pool invalid qid %d", qid);
1593 		return -EINVAL;
1594 	}
1595 	if (xsk_pool_get_rx_frame_size(pool) <
1596 	     priv->dev->max_mtu + sizeof(struct ethhdr)) {
1597 		dev_err(&priv->pdev->dev, "xsk pool frame_len too small");
1598 		return -EINVAL;
1599 	}
1600 
1601 	err = xsk_pool_dma_map(pool, &priv->pdev->dev,
1602 			       DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
1603 	if (err)
1604 		return err;
1605 
1606 	set_bit(qid, priv->xsk_pools);
1607 
1608 	/* If XDP prog is not installed or interface is down, return. */
1609 	if (!priv->xdp_prog || !netif_running(dev))
1610 		return 0;
1611 
1612 	err = gve_reg_xsk_pool(priv, dev, pool, qid);
1613 	if (err)
1614 		goto err_xsk_pool_dma_mapped;
1615 
1616 	/* Stop and start RDA queues to repost buffers. */
1617 	if (!gve_is_qpl(priv)) {
1618 		err = gve_configure_rings_xdp(priv, priv->rx_cfg.num_queues);
1619 		if (err)
1620 			goto err_xsk_pool_registered;
1621 	}
1622 	return 0;
1623 
1624 err_xsk_pool_registered:
1625 	gve_unreg_xsk_pool(priv, qid);
1626 err_xsk_pool_dma_mapped:
1627 	clear_bit(qid, priv->xsk_pools);
1628 	xsk_pool_dma_unmap(pool,
1629 			   DMA_ATTR_SKIP_CPU_SYNC |
1630 			   DMA_ATTR_WEAK_ORDERING);
1631 	return err;
1632 }
1633 
1634 static int gve_xsk_pool_disable(struct net_device *dev,
1635 				u16 qid)
1636 {
1637 	struct gve_priv *priv = netdev_priv(dev);
1638 	struct napi_struct *napi_rx;
1639 	struct napi_struct *napi_tx;
1640 	struct xsk_buff_pool *pool;
1641 	int tx_qid;
1642 	int err;
1643 
1644 	if (qid >= priv->rx_cfg.num_queues)
1645 		return -EINVAL;
1646 
1647 	clear_bit(qid, priv->xsk_pools);
1648 
1649 	pool = xsk_get_pool_from_qid(dev, qid);
1650 	if (pool)
1651 		xsk_pool_dma_unmap(pool,
1652 				   DMA_ATTR_SKIP_CPU_SYNC |
1653 				   DMA_ATTR_WEAK_ORDERING);
1654 
1655 	if (!netif_running(dev) || !priv->tx_cfg.num_xdp_queues)
1656 		return 0;
1657 
1658 	/* Stop and start RDA queues to repost buffers. */
1659 	if (!gve_is_qpl(priv) && priv->xdp_prog) {
1660 		err = gve_configure_rings_xdp(priv, priv->rx_cfg.num_queues);
1661 		if (err)
1662 			return err;
1663 	}
1664 
1665 	napi_rx = &priv->ntfy_blocks[priv->rx[qid].ntfy_id].napi;
1666 	napi_disable(napi_rx); /* make sure current rx poll is done */
1667 
1668 	tx_qid = gve_xdp_tx_queue_id(priv, qid);
1669 	napi_tx = &priv->ntfy_blocks[priv->tx[tx_qid].ntfy_id].napi;
1670 	napi_disable(napi_tx); /* make sure current tx poll is done */
1671 
1672 	gve_unreg_xsk_pool(priv, qid);
1673 	smp_mb(); /* Make sure it is visible to the workers on datapath */
1674 
1675 	napi_enable(napi_rx);
1676 	napi_enable(napi_tx);
1677 	if (gve_is_gqi(priv)) {
1678 		if (gve_rx_work_pending(&priv->rx[qid]))
1679 			napi_schedule(napi_rx);
1680 
1681 		if (gve_tx_clean_pending(priv, &priv->tx[tx_qid]))
1682 			napi_schedule(napi_tx);
1683 	}
1684 
1685 	return 0;
1686 }
1687 
1688 static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
1689 {
1690 	struct gve_priv *priv = netdev_priv(dev);
1691 	struct napi_struct *napi;
1692 
1693 	if (!gve_get_napi_enabled(priv))
1694 		return -ENETDOWN;
1695 
1696 	if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog)
1697 		return -EINVAL;
1698 
1699 	napi = &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_id)].napi;
1700 	if (!napi_if_scheduled_mark_missed(napi)) {
1701 		/* Call local_bh_enable to trigger SoftIRQ processing */
1702 		local_bh_disable();
1703 		napi_schedule(napi);
1704 		local_bh_enable();
1705 	}
1706 
1707 	return 0;
1708 }
1709 
1710 static int verify_xdp_configuration(struct net_device *dev)
1711 {
1712 	struct gve_priv *priv = netdev_priv(dev);
1713 	u16 max_xdp_mtu;
1714 
1715 	if (dev->features & NETIF_F_LRO) {
1716 		netdev_warn(dev, "XDP is not supported when LRO is on.\n");
1717 		return -EOPNOTSUPP;
1718 	}
1719 
1720 	if (priv->header_split_enabled) {
1721 		netdev_warn(dev, "XDP is not supported when header-data split is enabled.\n");
1722 		return -EOPNOTSUPP;
1723 	}
1724 
1725 	max_xdp_mtu = priv->rx_cfg.packet_buffer_size - sizeof(struct ethhdr);
1726 	if (priv->queue_format == GVE_GQI_QPL_FORMAT)
1727 		max_xdp_mtu -= GVE_RX_PAD;
1728 
1729 	if (dev->mtu > max_xdp_mtu) {
1730 		netdev_warn(dev, "XDP is not supported for mtu %d.\n",
1731 			    dev->mtu);
1732 		return -EOPNOTSUPP;
1733 	}
1734 
1735 	if (priv->rx_cfg.num_queues != priv->tx_cfg.num_queues ||
1736 	    (2 * priv->tx_cfg.num_queues > priv->tx_cfg.max_queues)) {
1737 		netdev_warn(dev, "XDP load failed: The number of configured RX queues %d should be equal to the number of configured TX queues %d and the number of configured RX/TX queues should be less than or equal to half the maximum number of RX/TX queues %d",
1738 			    priv->rx_cfg.num_queues,
1739 			    priv->tx_cfg.num_queues,
1740 			    priv->tx_cfg.max_queues);
1741 		return -EINVAL;
1742 	}
1743 	return 0;
1744 }
1745 
1746 static int gve_xdp(struct net_device *dev, struct netdev_bpf *xdp)
1747 {
1748 	struct gve_priv *priv = netdev_priv(dev);
1749 	int err;
1750 
1751 	err = verify_xdp_configuration(dev);
1752 	if (err)
1753 		return err;
1754 	switch (xdp->command) {
1755 	case XDP_SETUP_PROG:
1756 		return gve_set_xdp(priv, xdp->prog, xdp->extack);
1757 	case XDP_SETUP_XSK_POOL:
1758 		if (xdp->xsk.pool)
1759 			return gve_xsk_pool_enable(dev, xdp->xsk.pool, xdp->xsk.queue_id);
1760 		else
1761 			return gve_xsk_pool_disable(dev, xdp->xsk.queue_id);
1762 	default:
1763 		return -EINVAL;
1764 	}
1765 }
1766 
1767 int gve_init_rss_config(struct gve_priv *priv, u16 num_queues)
1768 {
1769 	struct gve_rss_config *rss_config = &priv->rss_config;
1770 	struct ethtool_rxfh_param rxfh = {0};
1771 	u16 i;
1772 
1773 	if (!priv->cache_rss_config)
1774 		return 0;
1775 
1776 	for (i = 0; i < priv->rss_lut_size; i++)
1777 		rss_config->hash_lut[i] =
1778 			ethtool_rxfh_indir_default(i, num_queues);
1779 
1780 	netdev_rss_key_fill(rss_config->hash_key, priv->rss_key_size);
1781 
1782 	rxfh.hfunc = ETH_RSS_HASH_TOP;
1783 
1784 	return gve_adminq_configure_rss(priv, &rxfh);
1785 }
1786 
1787 int gve_flow_rules_reset(struct gve_priv *priv)
1788 {
1789 	if (!priv->max_flow_rules)
1790 		return 0;
1791 
1792 	return gve_adminq_reset_flow_rules(priv);
1793 }
1794 
1795 int gve_adjust_config(struct gve_priv *priv,
1796 		      struct gve_tx_alloc_rings_cfg *tx_alloc_cfg,
1797 		      struct gve_rx_alloc_rings_cfg *rx_alloc_cfg)
1798 {
1799 	int err;
1800 
1801 	/* Allocate resources for the new configuration */
1802 	err = gve_queues_mem_alloc(priv, tx_alloc_cfg, rx_alloc_cfg);
1803 	if (err) {
1804 		netif_err(priv, drv, priv->dev,
1805 			  "Adjust config failed to alloc new queues");
1806 		return err;
1807 	}
1808 
1809 	/* Teardown the device and free existing resources */
1810 	err = gve_close(priv->dev);
1811 	if (err) {
1812 		netif_err(priv, drv, priv->dev,
1813 			  "Adjust config failed to close old queues");
1814 		gve_queues_mem_free(priv, tx_alloc_cfg, rx_alloc_cfg);
1815 		return err;
1816 	}
1817 
1818 	/* Bring the device back up again with the new resources. */
1819 	err = gve_queues_start(priv, tx_alloc_cfg, rx_alloc_cfg);
1820 	if (err) {
1821 		netif_err(priv, drv, priv->dev,
1822 			  "Adjust config failed to start new queues, !!! DISABLING ALL QUEUES !!!\n");
1823 		/* No need to free on error: ownership of resources is lost after
1824 		 * calling gve_queues_start.
1825 		 */
1826 		gve_turndown(priv);
1827 		return err;
1828 	}
1829 
1830 	return 0;
1831 }
1832 
1833 int gve_adjust_queues(struct gve_priv *priv,
1834 		      struct gve_rx_queue_config new_rx_config,
1835 		      struct gve_tx_queue_config new_tx_config,
1836 		      bool reset_rss)
1837 {
1838 	struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0};
1839 	struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0};
1840 	int err;
1841 
1842 	gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg);
1843 
1844 	/* Relay the new config from ethtool */
1845 	tx_alloc_cfg.qcfg = &new_tx_config;
1846 	rx_alloc_cfg.qcfg_tx = &new_tx_config;
1847 	rx_alloc_cfg.qcfg_rx = &new_rx_config;
1848 	rx_alloc_cfg.reset_rss = reset_rss;
1849 
1850 	if (netif_running(priv->dev)) {
1851 		err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg);
1852 		return err;
1853 	}
1854 	/* Set the config for the next up. */
1855 	if (reset_rss) {
1856 		err = gve_init_rss_config(priv, new_rx_config.num_queues);
1857 		if (err)
1858 			return err;
1859 	}
1860 	priv->tx_cfg = new_tx_config;
1861 	priv->rx_cfg = new_rx_config;
1862 
1863 	return 0;
1864 }
1865 
1866 static void gve_turndown(struct gve_priv *priv)
1867 {
1868 	int idx;
1869 
1870 	if (netif_carrier_ok(priv->dev))
1871 		netif_carrier_off(priv->dev);
1872 
1873 	if (!gve_get_napi_enabled(priv))
1874 		return;
1875 
1876 	/* Disable napi to prevent more work from coming in */
1877 	for (idx = 0; idx < gve_num_tx_queues(priv); idx++) {
1878 		int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
1879 		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1880 
1881 		if (!gve_tx_was_added_to_block(priv, idx))
1882 			continue;
1883 
1884 		if (idx < priv->tx_cfg.num_queues)
1885 			netif_queue_set_napi(priv->dev, idx,
1886 					     NETDEV_QUEUE_TYPE_TX, NULL);
1887 
1888 		napi_disable_locked(&block->napi);
1889 	}
1890 	for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
1891 		int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
1892 		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1893 
1894 		if (!gve_rx_was_added_to_block(priv, idx))
1895 			continue;
1896 
1897 		netif_queue_set_napi(priv->dev, idx, NETDEV_QUEUE_TYPE_RX,
1898 				     NULL);
1899 		napi_disable_locked(&block->napi);
1900 	}
1901 
1902 	/* Stop tx queues */
1903 	netif_tx_disable(priv->dev);
1904 
1905 	xdp_features_clear_redirect_target_locked(priv->dev);
1906 
1907 	gve_clear_napi_enabled(priv);
1908 	gve_clear_report_stats(priv);
1909 
1910 	/* Make sure that all traffic is finished processing. */
1911 	synchronize_net();
1912 }
1913 
1914 static void gve_turnup(struct gve_priv *priv)
1915 {
1916 	int idx;
1917 
1918 	/* Start the tx queues */
1919 	netif_tx_start_all_queues(priv->dev);
1920 
1921 	/* Enable napi and unmask interrupts for all queues */
1922 	for (idx = 0; idx < gve_num_tx_queues(priv); idx++) {
1923 		int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
1924 		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1925 
1926 		if (!gve_tx_was_added_to_block(priv, idx))
1927 			continue;
1928 
1929 		napi_enable_locked(&block->napi);
1930 
1931 		if (idx < priv->tx_cfg.num_queues)
1932 			netif_queue_set_napi(priv->dev, idx,
1933 					     NETDEV_QUEUE_TYPE_TX,
1934 					     &block->napi);
1935 
1936 		if (gve_is_gqi(priv)) {
1937 			iowrite32be(0, gve_irq_doorbell(priv, block));
1938 		} else {
1939 			gve_set_itr_coalesce_usecs_dqo(priv, block,
1940 						       priv->tx_coalesce_usecs);
1941 		}
1942 
1943 		/* Any descs written by the NIC before this barrier will be
1944 		 * handled by the one-off napi schedule below. Whereas any
1945 		 * descs after the barrier will generate interrupts.
1946 		 */
1947 		mb();
1948 		napi_schedule(&block->napi);
1949 	}
1950 	for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
1951 		int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
1952 		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1953 
1954 		if (!gve_rx_was_added_to_block(priv, idx))
1955 			continue;
1956 
1957 		napi_enable_locked(&block->napi);
1958 		netif_queue_set_napi(priv->dev, idx, NETDEV_QUEUE_TYPE_RX,
1959 				     &block->napi);
1960 
1961 		if (gve_is_gqi(priv)) {
1962 			iowrite32be(0, gve_irq_doorbell(priv, block));
1963 		} else {
1964 			gve_set_itr_coalesce_usecs_dqo(priv, block,
1965 						       priv->rx_coalesce_usecs);
1966 		}
1967 
1968 		/* Any descs written by the NIC before this barrier will be
1969 		 * handled by the one-off napi schedule below. Whereas any
1970 		 * descs after the barrier will generate interrupts.
1971 		 */
1972 		mb();
1973 		napi_schedule(&block->napi);
1974 	}
1975 
1976 	if (priv->tx_cfg.num_xdp_queues && gve_supports_xdp_xmit(priv))
1977 		xdp_features_set_redirect_target_locked(priv->dev, false);
1978 
1979 	gve_set_napi_enabled(priv);
1980 }
1981 
1982 static void gve_turnup_and_check_status(struct gve_priv *priv)
1983 {
1984 	u32 status;
1985 
1986 	gve_turnup(priv);
1987 	status = ioread32be(&priv->reg_bar0->device_status);
1988 	gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status);
1989 }
1990 
1991 static struct gve_notify_block *gve_get_tx_notify_block(struct gve_priv *priv,
1992 							unsigned int txqueue)
1993 {
1994 	u32 ntfy_idx;
1995 
1996 	if (txqueue > priv->tx_cfg.num_queues)
1997 		return NULL;
1998 
1999 	ntfy_idx = gve_tx_idx_to_ntfy(priv, txqueue);
2000 	if (ntfy_idx >= priv->num_ntfy_blks)
2001 		return NULL;
2002 
2003 	return &priv->ntfy_blocks[ntfy_idx];
2004 }
2005 
2006 static bool gve_tx_timeout_try_q_kick(struct gve_priv *priv,
2007 				      unsigned int txqueue)
2008 {
2009 	struct gve_notify_block *block;
2010 	u32 current_time;
2011 
2012 	block = gve_get_tx_notify_block(priv, txqueue);
2013 
2014 	if (!block)
2015 		return false;
2016 
2017 	current_time = jiffies_to_msecs(jiffies);
2018 	if (block->tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time)
2019 		return false;
2020 
2021 	netdev_info(priv->dev, "Kicking queue %d", txqueue);
2022 	napi_schedule(&block->napi);
2023 	block->tx->last_kick_msec = current_time;
2024 	return true;
2025 }
2026 
2027 static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue)
2028 {
2029 	struct gve_notify_block *block;
2030 	struct gve_priv *priv;
2031 
2032 	netdev_info(dev, "Timeout on tx queue, %d", txqueue);
2033 	priv = netdev_priv(dev);
2034 
2035 	if (!gve_tx_timeout_try_q_kick(priv, txqueue))
2036 		gve_schedule_reset(priv);
2037 
2038 	block = gve_get_tx_notify_block(priv, txqueue);
2039 	if (block)
2040 		block->tx->queue_timeout++;
2041 	priv->tx_timeo_cnt++;
2042 }
2043 
2044 u16 gve_get_pkt_buf_size(const struct gve_priv *priv, bool enable_hsplit)
2045 {
2046 	if (enable_hsplit && priv->max_rx_buffer_size >= GVE_MAX_RX_BUFFER_SIZE)
2047 		return GVE_MAX_RX_BUFFER_SIZE;
2048 	else
2049 		return GVE_DEFAULT_RX_BUFFER_SIZE;
2050 }
2051 
2052 /* Header split is only supported on DQ RDA queue format. If XDP is enabled,
2053  * header split is not allowed.
2054  */
2055 bool gve_header_split_supported(const struct gve_priv *priv)
2056 {
2057 	return priv->header_buf_size &&
2058 		priv->queue_format == GVE_DQO_RDA_FORMAT && !priv->xdp_prog;
2059 }
2060 
2061 int gve_set_hsplit_config(struct gve_priv *priv, u8 tcp_data_split)
2062 {
2063 	struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0};
2064 	struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0};
2065 	bool enable_hdr_split;
2066 	int err = 0;
2067 
2068 	if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_UNKNOWN)
2069 		return 0;
2070 
2071 	if (!gve_header_split_supported(priv)) {
2072 		dev_err(&priv->pdev->dev, "Header-split not supported\n");
2073 		return -EOPNOTSUPP;
2074 	}
2075 
2076 	if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED)
2077 		enable_hdr_split = true;
2078 	else
2079 		enable_hdr_split = false;
2080 
2081 	if (enable_hdr_split == priv->header_split_enabled)
2082 		return 0;
2083 
2084 	gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg);
2085 
2086 	rx_alloc_cfg.enable_header_split = enable_hdr_split;
2087 	rx_alloc_cfg.packet_buffer_size = gve_get_pkt_buf_size(priv, enable_hdr_split);
2088 
2089 	if (netif_running(priv->dev))
2090 		err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg);
2091 	return err;
2092 }
2093 
2094 static int gve_set_features(struct net_device *netdev,
2095 			    netdev_features_t features)
2096 {
2097 	const netdev_features_t orig_features = netdev->features;
2098 	struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0};
2099 	struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0};
2100 	struct gve_priv *priv = netdev_priv(netdev);
2101 	int err;
2102 
2103 	gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg);
2104 
2105 	if ((netdev->features & NETIF_F_LRO) != (features & NETIF_F_LRO)) {
2106 		netdev->features ^= NETIF_F_LRO;
2107 		if (priv->xdp_prog && (netdev->features & NETIF_F_LRO)) {
2108 			netdev_warn(netdev,
2109 				    "XDP is not supported when LRO is on.\n");
2110 			err =  -EOPNOTSUPP;
2111 			goto revert_features;
2112 		}
2113 		if (netif_running(netdev)) {
2114 			err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg);
2115 			if (err)
2116 				goto revert_features;
2117 		}
2118 	}
2119 	if ((netdev->features & NETIF_F_NTUPLE) && !(features & NETIF_F_NTUPLE)) {
2120 		err = gve_flow_rules_reset(priv);
2121 		if (err)
2122 			goto revert_features;
2123 	}
2124 
2125 	return 0;
2126 
2127 revert_features:
2128 	netdev->features = orig_features;
2129 	return err;
2130 }
2131 
2132 static int gve_get_ts_config(struct net_device *dev,
2133 			     struct kernel_hwtstamp_config *kernel_config)
2134 {
2135 	struct gve_priv *priv = netdev_priv(dev);
2136 
2137 	*kernel_config = priv->ts_config;
2138 	return 0;
2139 }
2140 
2141 static int gve_set_ts_config(struct net_device *dev,
2142 			     struct kernel_hwtstamp_config *kernel_config,
2143 			     struct netlink_ext_ack *extack)
2144 {
2145 	struct gve_priv *priv = netdev_priv(dev);
2146 
2147 	if (kernel_config->tx_type != HWTSTAMP_TX_OFF) {
2148 		NL_SET_ERR_MSG_MOD(extack, "TX timestamping is not supported");
2149 		return -ERANGE;
2150 	}
2151 
2152 	if (kernel_config->rx_filter != HWTSTAMP_FILTER_NONE) {
2153 		if (!priv->nic_ts_report) {
2154 			NL_SET_ERR_MSG_MOD(extack,
2155 					   "RX timestamping is not supported");
2156 			kernel_config->rx_filter = HWTSTAMP_FILTER_NONE;
2157 			return -EOPNOTSUPP;
2158 		}
2159 
2160 		kernel_config->rx_filter = HWTSTAMP_FILTER_ALL;
2161 		gve_clock_nic_ts_read(priv);
2162 		ptp_schedule_worker(priv->ptp->clock, 0);
2163 	} else {
2164 		ptp_cancel_worker_sync(priv->ptp->clock);
2165 	}
2166 
2167 	priv->ts_config.rx_filter = kernel_config->rx_filter;
2168 
2169 	return 0;
2170 }
2171 
2172 static const struct net_device_ops gve_netdev_ops = {
2173 	.ndo_start_xmit		=	gve_start_xmit,
2174 	.ndo_features_check	=	gve_features_check,
2175 	.ndo_open		=	gve_open,
2176 	.ndo_stop		=	gve_close,
2177 	.ndo_get_stats64	=	gve_get_stats,
2178 	.ndo_tx_timeout         =       gve_tx_timeout,
2179 	.ndo_set_features	=	gve_set_features,
2180 	.ndo_bpf		=	gve_xdp,
2181 	.ndo_xdp_xmit		=	gve_xdp_xmit,
2182 	.ndo_xsk_wakeup		=	gve_xsk_wakeup,
2183 	.ndo_hwtstamp_get	=	gve_get_ts_config,
2184 	.ndo_hwtstamp_set	=	gve_set_ts_config,
2185 };
2186 
2187 static void gve_handle_status(struct gve_priv *priv, u32 status)
2188 {
2189 	if (GVE_DEVICE_STATUS_RESET_MASK & status) {
2190 		dev_info(&priv->pdev->dev, "Device requested reset.\n");
2191 		gve_set_do_reset(priv);
2192 	}
2193 	if (GVE_DEVICE_STATUS_REPORT_STATS_MASK & status) {
2194 		priv->stats_report_trigger_cnt++;
2195 		gve_set_do_report_stats(priv);
2196 	}
2197 }
2198 
2199 static void gve_handle_reset(struct gve_priv *priv)
2200 {
2201 	/* A service task will be scheduled at the end of probe to catch any
2202 	 * resets that need to happen, and we don't want to reset until
2203 	 * probe is done.
2204 	 */
2205 	if (gve_get_probe_in_progress(priv))
2206 		return;
2207 
2208 	if (gve_get_do_reset(priv)) {
2209 		rtnl_lock();
2210 		netdev_lock(priv->dev);
2211 		gve_reset(priv, false);
2212 		netdev_unlock(priv->dev);
2213 		rtnl_unlock();
2214 	}
2215 }
2216 
2217 void gve_handle_report_stats(struct gve_priv *priv)
2218 {
2219 	struct stats *stats = priv->stats_report->stats;
2220 	int idx, stats_idx = 0;
2221 	unsigned int start = 0;
2222 	u64 tx_bytes;
2223 
2224 	if (!gve_get_report_stats(priv))
2225 		return;
2226 
2227 	be64_add_cpu(&priv->stats_report->written_count, 1);
2228 	/* tx stats */
2229 	if (priv->tx) {
2230 		for (idx = 0; idx < gve_num_tx_queues(priv); idx++) {
2231 			u32 last_completion = 0;
2232 			u32 tx_frames = 0;
2233 
2234 			/* DQO doesn't currently support these metrics. */
2235 			if (gve_is_gqi(priv)) {
2236 				last_completion = priv->tx[idx].done;
2237 				tx_frames = priv->tx[idx].req;
2238 			}
2239 
2240 			do {
2241 				start = u64_stats_fetch_begin(&priv->tx[idx].statss);
2242 				tx_bytes = priv->tx[idx].bytes_done;
2243 			} while (u64_stats_fetch_retry(&priv->tx[idx].statss, start));
2244 			stats[stats_idx++] = (struct stats) {
2245 				.stat_name = cpu_to_be32(TX_WAKE_CNT),
2246 				.value = cpu_to_be64(priv->tx[idx].wake_queue),
2247 				.queue_id = cpu_to_be32(idx),
2248 			};
2249 			stats[stats_idx++] = (struct stats) {
2250 				.stat_name = cpu_to_be32(TX_STOP_CNT),
2251 				.value = cpu_to_be64(priv->tx[idx].stop_queue),
2252 				.queue_id = cpu_to_be32(idx),
2253 			};
2254 			stats[stats_idx++] = (struct stats) {
2255 				.stat_name = cpu_to_be32(TX_FRAMES_SENT),
2256 				.value = cpu_to_be64(tx_frames),
2257 				.queue_id = cpu_to_be32(idx),
2258 			};
2259 			stats[stats_idx++] = (struct stats) {
2260 				.stat_name = cpu_to_be32(TX_BYTES_SENT),
2261 				.value = cpu_to_be64(tx_bytes),
2262 				.queue_id = cpu_to_be32(idx),
2263 			};
2264 			stats[stats_idx++] = (struct stats) {
2265 				.stat_name = cpu_to_be32(TX_LAST_COMPLETION_PROCESSED),
2266 				.value = cpu_to_be64(last_completion),
2267 				.queue_id = cpu_to_be32(idx),
2268 			};
2269 			stats[stats_idx++] = (struct stats) {
2270 				.stat_name = cpu_to_be32(TX_TIMEOUT_CNT),
2271 				.value = cpu_to_be64(priv->tx[idx].queue_timeout),
2272 				.queue_id = cpu_to_be32(idx),
2273 			};
2274 		}
2275 	}
2276 	/* rx stats */
2277 	if (priv->rx) {
2278 		for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
2279 			stats[stats_idx++] = (struct stats) {
2280 				.stat_name = cpu_to_be32(RX_NEXT_EXPECTED_SEQUENCE),
2281 				.value = cpu_to_be64(priv->rx[idx].desc.seqno),
2282 				.queue_id = cpu_to_be32(idx),
2283 			};
2284 			stats[stats_idx++] = (struct stats) {
2285 				.stat_name = cpu_to_be32(RX_BUFFERS_POSTED),
2286 				.value = cpu_to_be64(priv->rx[idx].fill_cnt),
2287 				.queue_id = cpu_to_be32(idx),
2288 			};
2289 		}
2290 	}
2291 }
2292 
2293 /* Handle NIC status register changes, reset requests and report stats */
2294 static void gve_service_task(struct work_struct *work)
2295 {
2296 	struct gve_priv *priv = container_of(work, struct gve_priv,
2297 					     service_task);
2298 	u32 status = ioread32be(&priv->reg_bar0->device_status);
2299 
2300 	gve_handle_status(priv, status);
2301 
2302 	gve_handle_reset(priv);
2303 	gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status);
2304 }
2305 
2306 static void gve_set_netdev_xdp_features(struct gve_priv *priv)
2307 {
2308 	xdp_features_t xdp_features;
2309 
2310 	if (priv->queue_format == GVE_GQI_QPL_FORMAT) {
2311 		xdp_features = NETDEV_XDP_ACT_BASIC;
2312 		xdp_features |= NETDEV_XDP_ACT_REDIRECT;
2313 		xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY;
2314 	} else if (priv->queue_format == GVE_DQO_RDA_FORMAT) {
2315 		xdp_features = NETDEV_XDP_ACT_BASIC;
2316 		xdp_features |= NETDEV_XDP_ACT_REDIRECT;
2317 		xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY;
2318 	} else {
2319 		xdp_features = 0;
2320 	}
2321 
2322 	xdp_set_features_flag_locked(priv->dev, xdp_features);
2323 }
2324 
2325 static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
2326 {
2327 	int num_ntfy;
2328 	int err;
2329 
2330 	/* Set up the adminq */
2331 	err = gve_adminq_alloc(&priv->pdev->dev, priv);
2332 	if (err) {
2333 		dev_err(&priv->pdev->dev,
2334 			"Failed to alloc admin queue: err=%d\n", err);
2335 		return err;
2336 	}
2337 
2338 	err = gve_verify_driver_compatibility(priv);
2339 	if (err) {
2340 		dev_err(&priv->pdev->dev,
2341 			"Could not verify driver compatibility: err=%d\n", err);
2342 		goto err;
2343 	}
2344 
2345 	priv->num_registered_pages = 0;
2346 
2347 	if (skip_describe_device)
2348 		goto setup_device;
2349 
2350 	priv->queue_format = GVE_QUEUE_FORMAT_UNSPECIFIED;
2351 	/* Get the initial information we need from the device */
2352 	err = gve_adminq_describe_device(priv);
2353 	if (err) {
2354 		dev_err(&priv->pdev->dev,
2355 			"Could not get device information: err=%d\n", err);
2356 		goto err;
2357 	}
2358 	priv->dev->mtu = priv->dev->max_mtu;
2359 	num_ntfy = pci_msix_vec_count(priv->pdev);
2360 	if (num_ntfy <= 0) {
2361 		dev_err(&priv->pdev->dev,
2362 			"could not count MSI-x vectors: err=%d\n", num_ntfy);
2363 		err = num_ntfy;
2364 		goto err;
2365 	} else if (num_ntfy < GVE_MIN_MSIX) {
2366 		dev_err(&priv->pdev->dev, "gve needs at least %d MSI-x vectors, but only has %d\n",
2367 			GVE_MIN_MSIX, num_ntfy);
2368 		err = -EINVAL;
2369 		goto err;
2370 	}
2371 
2372 	/* Big TCP is only supported on DQO */
2373 	if (!gve_is_gqi(priv))
2374 		netif_set_tso_max_size(priv->dev, GVE_DQO_TX_MAX);
2375 
2376 	priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK;
2377 	/* gvnic has one Notification Block per MSI-x vector, except for the
2378 	 * management vector
2379 	 */
2380 	priv->num_ntfy_blks = (num_ntfy - 1) & ~0x1;
2381 	priv->mgmt_msix_idx = priv->num_ntfy_blks;
2382 	priv->numa_node = dev_to_node(&priv->pdev->dev);
2383 
2384 	priv->tx_cfg.max_queues =
2385 		min_t(int, priv->tx_cfg.max_queues, priv->num_ntfy_blks / 2);
2386 	priv->rx_cfg.max_queues =
2387 		min_t(int, priv->rx_cfg.max_queues, priv->num_ntfy_blks / 2);
2388 
2389 	priv->tx_cfg.num_queues = priv->tx_cfg.max_queues;
2390 	priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
2391 	if (priv->default_num_queues > 0) {
2392 		priv->tx_cfg.num_queues = min_t(int, priv->default_num_queues,
2393 						priv->tx_cfg.num_queues);
2394 		priv->rx_cfg.num_queues = min_t(int, priv->default_num_queues,
2395 						priv->rx_cfg.num_queues);
2396 	}
2397 	priv->tx_cfg.num_xdp_queues = 0;
2398 
2399 	dev_info(&priv->pdev->dev, "TX queues %d, RX queues %d\n",
2400 		 priv->tx_cfg.num_queues, priv->rx_cfg.num_queues);
2401 	dev_info(&priv->pdev->dev, "Max TX queues %d, Max RX queues %d\n",
2402 		 priv->tx_cfg.max_queues, priv->rx_cfg.max_queues);
2403 
2404 	if (!gve_is_gqi(priv)) {
2405 		priv->tx_coalesce_usecs = GVE_TX_IRQ_RATELIMIT_US_DQO;
2406 		priv->rx_coalesce_usecs = GVE_RX_IRQ_RATELIMIT_US_DQO;
2407 	}
2408 
2409 	priv->ts_config.tx_type = HWTSTAMP_TX_OFF;
2410 	priv->ts_config.rx_filter = HWTSTAMP_FILTER_NONE;
2411 
2412 setup_device:
2413 	priv->xsk_pools = bitmap_zalloc(priv->rx_cfg.max_queues, GFP_KERNEL);
2414 	if (!priv->xsk_pools) {
2415 		err = -ENOMEM;
2416 		goto err;
2417 	}
2418 
2419 	gve_set_netdev_xdp_features(priv);
2420 	err = gve_setup_device_resources(priv);
2421 	if (err)
2422 		goto err_free_xsk_bitmap;
2423 
2424 	return 0;
2425 
2426 err_free_xsk_bitmap:
2427 	bitmap_free(priv->xsk_pools);
2428 	priv->xsk_pools = NULL;
2429 err:
2430 	gve_adminq_free(&priv->pdev->dev, priv);
2431 	return err;
2432 }
2433 
2434 static void gve_teardown_priv_resources(struct gve_priv *priv)
2435 {
2436 	gve_teardown_device_resources(priv);
2437 	gve_adminq_free(&priv->pdev->dev, priv);
2438 	bitmap_free(priv->xsk_pools);
2439 	priv->xsk_pools = NULL;
2440 }
2441 
/* Reset the device. Releasing the admin queue is what triggers the reset. */
static void gve_trigger_reset(struct gve_priv *priv)
{
	gve_adminq_release(priv);
}
2447 
2448 static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up)
2449 {
2450 	gve_trigger_reset(priv);
2451 	/* With the reset having already happened, close cannot fail */
2452 	if (was_up)
2453 		gve_close(priv->dev);
2454 	gve_teardown_priv_resources(priv);
2455 }
2456 
2457 static int gve_reset_recovery(struct gve_priv *priv, bool was_up)
2458 {
2459 	int err;
2460 
2461 	err = gve_init_priv(priv, true);
2462 	if (err)
2463 		goto err;
2464 	if (was_up) {
2465 		err = gve_open(priv->dev);
2466 		if (err)
2467 			goto err;
2468 	}
2469 	return 0;
2470 err:
2471 	dev_err(&priv->pdev->dev, "Reset failed! !!! DISABLING ALL QUEUES !!!\n");
2472 	gve_turndown(priv);
2473 	return err;
2474 }
2475 
2476 int gve_reset(struct gve_priv *priv, bool attempt_teardown)
2477 {
2478 	bool was_up = netif_running(priv->dev);
2479 	int err;
2480 
2481 	dev_info(&priv->pdev->dev, "Performing reset\n");
2482 	gve_clear_do_reset(priv);
2483 	gve_set_reset_in_progress(priv);
2484 	/* If we aren't attempting to teardown normally, just go turndown and
2485 	 * reset right away.
2486 	 */
2487 	if (!attempt_teardown) {
2488 		gve_turndown(priv);
2489 		gve_reset_and_teardown(priv, was_up);
2490 	} else {
2491 		/* Otherwise attempt to close normally */
2492 		if (was_up) {
2493 			err = gve_close(priv->dev);
2494 			/* If that fails reset as we did above */
2495 			if (err)
2496 				gve_reset_and_teardown(priv, was_up);
2497 		}
2498 		/* Clean up any remaining resources */
2499 		gve_teardown_priv_resources(priv);
2500 	}
2501 
2502 	/* Set it all back up */
2503 	err = gve_reset_recovery(priv, was_up);
2504 	gve_clear_reset_in_progress(priv);
2505 	priv->reset_cnt++;
2506 	priv->interface_up_cnt = 0;
2507 	priv->interface_down_cnt = 0;
2508 	priv->stats_report_trigger_cnt = 0;
2509 	return err;
2510 }
2511 
2512 static void gve_write_version(u8 __iomem *driver_version_register)
2513 {
2514 	const char *c = gve_version_prefix;
2515 
2516 	while (*c) {
2517 		writeb(*c, driver_version_register);
2518 		c++;
2519 	}
2520 
2521 	c = gve_version_str;
2522 	while (*c) {
2523 		writeb(*c, driver_version_register);
2524 		c++;
2525 	}
2526 	writeb('\n', driver_version_register);
2527 }
2528 
2529 static int gve_rx_queue_stop(struct net_device *dev, void *per_q_mem, int idx)
2530 {
2531 	struct gve_priv *priv = netdev_priv(dev);
2532 	struct gve_rx_ring *gve_per_q_mem;
2533 	int err;
2534 
2535 	if (!priv->rx)
2536 		return -EAGAIN;
2537 
2538 	/* Destroying queue 0 while other queues exist is not supported in DQO */
2539 	if (!gve_is_gqi(priv) && idx == 0)
2540 		return -ERANGE;
2541 
2542 	/* Single-queue destruction requires quiescence on all queues */
2543 	gve_turndown(priv);
2544 
2545 	/* This failure will trigger a reset - no need to clean up */
2546 	err = gve_adminq_destroy_single_rx_queue(priv, idx);
2547 	if (err)
2548 		return err;
2549 
2550 	if (gve_is_qpl(priv)) {
2551 		/* This failure will trigger a reset - no need to clean up */
2552 		err = gve_unregister_qpl(priv, gve_rx_get_qpl(priv, idx));
2553 		if (err)
2554 			return err;
2555 	}
2556 
2557 	gve_rx_stop_ring(priv, idx);
2558 
2559 	/* Turn the unstopped queues back up */
2560 	gve_turnup_and_check_status(priv);
2561 
2562 	gve_per_q_mem = (struct gve_rx_ring *)per_q_mem;
2563 	*gve_per_q_mem = priv->rx[idx];
2564 	memset(&priv->rx[idx], 0, sizeof(priv->rx[idx]));
2565 	return 0;
2566 }
2567 
2568 static void gve_rx_queue_mem_free(struct net_device *dev, void *per_q_mem)
2569 {
2570 	struct gve_priv *priv = netdev_priv(dev);
2571 	struct gve_rx_alloc_rings_cfg cfg = {0};
2572 	struct gve_rx_ring *gve_per_q_mem;
2573 
2574 	gve_per_q_mem = (struct gve_rx_ring *)per_q_mem;
2575 	gve_rx_get_curr_alloc_cfg(priv, &cfg);
2576 
2577 	if (gve_is_gqi(priv))
2578 		gve_rx_free_ring_gqi(priv, gve_per_q_mem, &cfg);
2579 	else
2580 		gve_rx_free_ring_dqo(priv, gve_per_q_mem, &cfg);
2581 }
2582 
2583 static int gve_rx_queue_mem_alloc(struct net_device *dev, void *per_q_mem,
2584 				  int idx)
2585 {
2586 	struct gve_priv *priv = netdev_priv(dev);
2587 	struct gve_rx_alloc_rings_cfg cfg = {0};
2588 	struct gve_rx_ring *gve_per_q_mem;
2589 	int err;
2590 
2591 	if (!priv->rx)
2592 		return -EAGAIN;
2593 
2594 	gve_per_q_mem = (struct gve_rx_ring *)per_q_mem;
2595 	gve_rx_get_curr_alloc_cfg(priv, &cfg);
2596 
2597 	if (gve_is_gqi(priv))
2598 		err = gve_rx_alloc_ring_gqi(priv, &cfg, gve_per_q_mem, idx);
2599 	else
2600 		err = gve_rx_alloc_ring_dqo(priv, &cfg, gve_per_q_mem, idx);
2601 
2602 	return err;
2603 }
2604 
2605 static int gve_rx_queue_start(struct net_device *dev, void *per_q_mem, int idx)
2606 {
2607 	struct gve_priv *priv = netdev_priv(dev);
2608 	struct gve_rx_ring *gve_per_q_mem;
2609 	int err;
2610 
2611 	if (!priv->rx)
2612 		return -EAGAIN;
2613 
2614 	gve_per_q_mem = (struct gve_rx_ring *)per_q_mem;
2615 	priv->rx[idx] = *gve_per_q_mem;
2616 
2617 	/* Single-queue creation requires quiescence on all queues */
2618 	gve_turndown(priv);
2619 
2620 	gve_rx_start_ring(priv, idx);
2621 
2622 	if (gve_is_qpl(priv)) {
2623 		/* This failure will trigger a reset - no need to clean up */
2624 		err = gve_register_qpl(priv, gve_rx_get_qpl(priv, idx));
2625 		if (err)
2626 			goto abort;
2627 	}
2628 
2629 	/* This failure will trigger a reset - no need to clean up */
2630 	err = gve_adminq_create_single_rx_queue(priv, idx);
2631 	if (err)
2632 		goto abort;
2633 
2634 	if (gve_is_gqi(priv))
2635 		gve_rx_write_doorbell(priv, &priv->rx[idx]);
2636 	else
2637 		gve_rx_post_buffers_dqo(&priv->rx[idx]);
2638 
2639 	/* Turn the unstopped queues back up */
2640 	gve_turnup_and_check_status(priv);
2641 	return 0;
2642 
2643 abort:
2644 	gve_rx_stop_ring(priv, idx);
2645 
2646 	/* All failures in this func result in a reset, by clearing the struct
2647 	 * at idx, we prevent a double free when that reset runs. The reset,
2648 	 * which needs the rtnl lock, will not run till this func returns and
2649 	 * its caller gives up the lock.
2650 	 */
2651 	memset(&priv->rx[idx], 0, sizeof(priv->rx[idx]));
2652 	return err;
2653 }
2654 
2655 static const struct netdev_queue_mgmt_ops gve_queue_mgmt_ops = {
2656 	.ndo_queue_mem_size	=	sizeof(struct gve_rx_ring),
2657 	.ndo_queue_mem_alloc	=	gve_rx_queue_mem_alloc,
2658 	.ndo_queue_mem_free	=	gve_rx_queue_mem_free,
2659 	.ndo_queue_start	=	gve_rx_queue_start,
2660 	.ndo_queue_stop		=	gve_rx_queue_stop,
2661 };
2662 
2663 static void gve_get_rx_queue_stats(struct net_device *dev, int idx,
2664 				   struct netdev_queue_stats_rx *rx_stats)
2665 {
2666 	struct gve_priv *priv = netdev_priv(dev);
2667 	struct gve_rx_ring *rx = &priv->rx[idx];
2668 	unsigned int start;
2669 
2670 	do {
2671 		start = u64_stats_fetch_begin(&rx->statss);
2672 		rx_stats->packets = rx->rpackets;
2673 		rx_stats->bytes = rx->rbytes;
2674 		rx_stats->alloc_fail = rx->rx_skb_alloc_fail +
2675 				       rx->rx_buf_alloc_fail;
2676 	} while (u64_stats_fetch_retry(&rx->statss, start));
2677 }
2678 
2679 static void gve_get_tx_queue_stats(struct net_device *dev, int idx,
2680 				   struct netdev_queue_stats_tx *tx_stats)
2681 {
2682 	struct gve_priv *priv = netdev_priv(dev);
2683 	struct gve_tx_ring *tx = &priv->tx[idx];
2684 	unsigned int start;
2685 
2686 	do {
2687 		start = u64_stats_fetch_begin(&tx->statss);
2688 		tx_stats->packets = tx->pkt_done;
2689 		tx_stats->bytes = tx->bytes_done;
2690 	} while (u64_stats_fetch_retry(&tx->statss, start));
2691 }
2692 
2693 static void gve_get_base_stats(struct net_device *dev,
2694 			       struct netdev_queue_stats_rx *rx,
2695 			       struct netdev_queue_stats_tx *tx)
2696 {
2697 	rx->packets = 0;
2698 	rx->bytes = 0;
2699 	rx->alloc_fail = 0;
2700 
2701 	tx->packets = 0;
2702 	tx->bytes = 0;
2703 }
2704 
2705 static const struct netdev_stat_ops gve_stat_ops = {
2706 	.get_queue_stats_rx	= gve_get_rx_queue_stats,
2707 	.get_queue_stats_tx	= gve_get_tx_queue_stats,
2708 	.get_base_stats		= gve_get_base_stats,
2709 };
2710 
2711 static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
2712 {
2713 	int max_tx_queues, max_rx_queues;
2714 	struct net_device *dev;
2715 	__be32 __iomem *db_bar;
2716 	struct gve_registers __iomem *reg_bar;
2717 	struct gve_priv *priv;
2718 	int err;
2719 
2720 	err = pci_enable_device(pdev);
2721 	if (err)
2722 		return err;
2723 
2724 	err = pci_request_regions(pdev, gve_driver_name);
2725 	if (err)
2726 		goto abort_with_enabled;
2727 
2728 	pci_set_master(pdev);
2729 
2730 	err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
2731 	if (err) {
2732 		dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err);
2733 		goto abort_with_pci_region;
2734 	}
2735 
2736 	reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0);
2737 	if (!reg_bar) {
2738 		dev_err(&pdev->dev, "Failed to map pci bar!\n");
2739 		err = -ENOMEM;
2740 		goto abort_with_pci_region;
2741 	}
2742 
2743 	db_bar = pci_iomap(pdev, GVE_DOORBELL_BAR, 0);
2744 	if (!db_bar) {
2745 		dev_err(&pdev->dev, "Failed to map doorbell bar!\n");
2746 		err = -ENOMEM;
2747 		goto abort_with_reg_bar;
2748 	}
2749 
2750 	gve_write_version(&reg_bar->driver_version);
2751 	/* Get max queues to alloc etherdev */
2752 	max_tx_queues = ioread32be(&reg_bar->max_tx_queues);
2753 	max_rx_queues = ioread32be(&reg_bar->max_rx_queues);
2754 	/* Alloc and setup the netdev and priv */
2755 	dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues);
2756 	if (!dev) {
2757 		dev_err(&pdev->dev, "could not allocate netdev\n");
2758 		err = -ENOMEM;
2759 		goto abort_with_db_bar;
2760 	}
2761 	SET_NETDEV_DEV(dev, &pdev->dev);
2762 	pci_set_drvdata(pdev, dev);
2763 	dev->ethtool_ops = &gve_ethtool_ops;
2764 	dev->netdev_ops = &gve_netdev_ops;
2765 	dev->queue_mgmt_ops = &gve_queue_mgmt_ops;
2766 	dev->stat_ops = &gve_stat_ops;
2767 
2768 	/* Set default and supported features.
2769 	 *
2770 	 * Features might be set in other locations as well (such as
2771 	 * `gve_adminq_describe_device`).
2772 	 */
2773 	dev->hw_features = NETIF_F_HIGHDMA;
2774 	dev->hw_features |= NETIF_F_SG;
2775 	dev->hw_features |= NETIF_F_HW_CSUM;
2776 	dev->hw_features |= NETIF_F_TSO;
2777 	dev->hw_features |= NETIF_F_TSO6;
2778 	dev->hw_features |= NETIF_F_TSO_ECN;
2779 	dev->hw_features |= NETIF_F_RXCSUM;
2780 	dev->hw_features |= NETIF_F_RXHASH;
2781 	dev->features = dev->hw_features;
2782 	dev->watchdog_timeo = 5 * HZ;
2783 	dev->min_mtu = ETH_MIN_MTU;
2784 	netif_carrier_off(dev);
2785 
2786 	priv = netdev_priv(dev);
2787 	priv->dev = dev;
2788 	priv->pdev = pdev;
2789 	priv->msg_enable = DEFAULT_MSG_LEVEL;
2790 	priv->reg_bar0 = reg_bar;
2791 	priv->db_bar2 = db_bar;
2792 	priv->service_task_flags = 0x0;
2793 	priv->state_flags = 0x0;
2794 	priv->ethtool_flags = 0x0;
2795 	priv->rx_cfg.packet_buffer_size = GVE_DEFAULT_RX_BUFFER_SIZE;
2796 	priv->max_rx_buffer_size = GVE_DEFAULT_RX_BUFFER_SIZE;
2797 
2798 	gve_set_probe_in_progress(priv);
2799 	priv->gve_wq = alloc_ordered_workqueue("gve", 0);
2800 	if (!priv->gve_wq) {
2801 		dev_err(&pdev->dev, "Could not allocate workqueue");
2802 		err = -ENOMEM;
2803 		goto abort_with_netdev;
2804 	}
2805 	INIT_WORK(&priv->service_task, gve_service_task);
2806 	INIT_WORK(&priv->stats_report_task, gve_stats_report_task);
2807 	priv->tx_cfg.max_queues = max_tx_queues;
2808 	priv->rx_cfg.max_queues = max_rx_queues;
2809 
2810 	err = gve_init_priv(priv, false);
2811 	if (err)
2812 		goto abort_with_wq;
2813 
2814 	if (!gve_is_gqi(priv) && !gve_is_qpl(priv))
2815 		dev->netmem_tx = true;
2816 
2817 	err = register_netdev(dev);
2818 	if (err)
2819 		goto abort_with_gve_init;
2820 
2821 	dev_info(&pdev->dev, "GVE version %s\n", gve_version_str);
2822 	dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format);
2823 	gve_clear_probe_in_progress(priv);
2824 	queue_work(priv->gve_wq, &priv->service_task);
2825 	return 0;
2826 
2827 abort_with_gve_init:
2828 	gve_teardown_priv_resources(priv);
2829 
2830 abort_with_wq:
2831 	destroy_workqueue(priv->gve_wq);
2832 
2833 abort_with_netdev:
2834 	free_netdev(dev);
2835 
2836 abort_with_db_bar:
2837 	pci_iounmap(pdev, db_bar);
2838 
2839 abort_with_reg_bar:
2840 	pci_iounmap(pdev, reg_bar);
2841 
2842 abort_with_pci_region:
2843 	pci_release_regions(pdev);
2844 
2845 abort_with_enabled:
2846 	pci_disable_device(pdev);
2847 	return err;
2848 }
2849 
2850 static void gve_remove(struct pci_dev *pdev)
2851 {
2852 	struct net_device *netdev = pci_get_drvdata(pdev);
2853 	struct gve_priv *priv = netdev_priv(netdev);
2854 	__be32 __iomem *db_bar = priv->db_bar2;
2855 	void __iomem *reg_bar = priv->reg_bar0;
2856 
2857 	unregister_netdev(netdev);
2858 	gve_teardown_priv_resources(priv);
2859 	destroy_workqueue(priv->gve_wq);
2860 	free_netdev(netdev);
2861 	pci_iounmap(pdev, db_bar);
2862 	pci_iounmap(pdev, reg_bar);
2863 	pci_release_regions(pdev);
2864 	pci_disable_device(pdev);
2865 }
2866 
2867 static void gve_shutdown(struct pci_dev *pdev)
2868 {
2869 	struct net_device *netdev = pci_get_drvdata(pdev);
2870 	struct gve_priv *priv = netdev_priv(netdev);
2871 	bool was_up = netif_running(priv->dev);
2872 
2873 	netif_device_detach(netdev);
2874 
2875 	rtnl_lock();
2876 	netdev_lock(netdev);
2877 	if (was_up && gve_close(priv->dev)) {
2878 		/* If the dev was up, attempt to close, if close fails, reset */
2879 		gve_reset_and_teardown(priv, was_up);
2880 	} else {
2881 		/* If the dev wasn't up or close worked, finish tearing down */
2882 		gve_teardown_priv_resources(priv);
2883 	}
2884 	netdev_unlock(netdev);
2885 	rtnl_unlock();
2886 }
2887 
2888 #ifdef CONFIG_PM
2889 static int gve_suspend(struct pci_dev *pdev, pm_message_t state)
2890 {
2891 	struct net_device *netdev = pci_get_drvdata(pdev);
2892 	struct gve_priv *priv = netdev_priv(netdev);
2893 	bool was_up = netif_running(priv->dev);
2894 
2895 	priv->suspend_cnt++;
2896 	rtnl_lock();
2897 	netdev_lock(netdev);
2898 	if (was_up && gve_close(priv->dev)) {
2899 		/* If the dev was up, attempt to close, if close fails, reset */
2900 		gve_reset_and_teardown(priv, was_up);
2901 	} else {
2902 		/* If the dev wasn't up or close worked, finish tearing down */
2903 		gve_teardown_priv_resources(priv);
2904 	}
2905 	priv->up_before_suspend = was_up;
2906 	netdev_unlock(netdev);
2907 	rtnl_unlock();
2908 	return 0;
2909 }
2910 
2911 static int gve_resume(struct pci_dev *pdev)
2912 {
2913 	struct net_device *netdev = pci_get_drvdata(pdev);
2914 	struct gve_priv *priv = netdev_priv(netdev);
2915 	int err;
2916 
2917 	priv->resume_cnt++;
2918 	rtnl_lock();
2919 	netdev_lock(netdev);
2920 	err = gve_reset_recovery(priv, priv->up_before_suspend);
2921 	netdev_unlock(netdev);
2922 	rtnl_unlock();
2923 	return err;
2924 }
2925 #endif /* CONFIG_PM */
2926 
2927 static const struct pci_device_id gve_id_table[] = {
2928 	{ PCI_DEVICE(PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC) },
2929 	{ }
2930 };
2931 
2932 static struct pci_driver gve_driver = {
2933 	.name		= gve_driver_name,
2934 	.id_table	= gve_id_table,
2935 	.probe		= gve_probe,
2936 	.remove		= gve_remove,
2937 	.shutdown	= gve_shutdown,
2938 #ifdef CONFIG_PM
2939 	.suspend        = gve_suspend,
2940 	.resume         = gve_resume,
2941 #endif
2942 };
2943 
2944 module_pci_driver(gve_driver);
2945 
2946 MODULE_DEVICE_TABLE(pci, gve_id_table);
2947 MODULE_AUTHOR("Google, Inc.");
2948 MODULE_DESCRIPTION("Google Virtual NIC Driver");
2949 MODULE_LICENSE("Dual MIT/GPL");
2950 MODULE_VERSION(GVE_VERSION);
2951