xref: /linux/drivers/net/ethernet/intel/igb/igb_xsk.c (revision 2ee738e90e80850582cbe10f34c6447965c1d87b)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright(c) 2018 Intel Corporation. */
3 
4 #include <linux/bpf_trace.h>
5 #include <net/xdp_sock_drv.h>
6 #include <net/xdp.h>
7 
8 #include "e1000_hw.h"
9 #include "igb.h"
10 
11 static int igb_realloc_rx_buffer_info(struct igb_ring *ring, bool pool_present)
12 {
13 	int size = pool_present ?
14 		sizeof(*ring->rx_buffer_info_zc) * ring->count :
15 		sizeof(*ring->rx_buffer_info) * ring->count;
16 	void *buff_info = vmalloc(size);
17 
18 	if (!buff_info)
19 		return -ENOMEM;
20 
21 	if (pool_present) {
22 		vfree(ring->rx_buffer_info);
23 		ring->rx_buffer_info = NULL;
24 		ring->rx_buffer_info_zc = buff_info;
25 	} else {
26 		vfree(ring->rx_buffer_info_zc);
27 		ring->rx_buffer_info_zc = NULL;
28 		ring->rx_buffer_info = buff_info;
29 	}
30 
31 	return 0;
32 }
33 
34 static void igb_txrx_ring_disable(struct igb_adapter *adapter, u16 qid)
35 {
36 	struct igb_ring *tx_ring = adapter->tx_ring[qid];
37 	struct igb_ring *rx_ring = adapter->rx_ring[qid];
38 	struct e1000_hw *hw = &adapter->hw;
39 
40 	set_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags);
41 
42 	wr32(E1000_TXDCTL(tx_ring->reg_idx), 0);
43 	wr32(E1000_RXDCTL(rx_ring->reg_idx), 0);
44 
45 	synchronize_net();
46 
47 	/* Rx/Tx share the same napi context. */
48 	napi_disable(&rx_ring->q_vector->napi);
49 
50 	igb_clean_tx_ring(tx_ring);
51 	igb_clean_rx_ring(rx_ring);
52 
53 	memset(&rx_ring->rx_stats, 0, sizeof(rx_ring->rx_stats));
54 	memset(&tx_ring->tx_stats, 0, sizeof(tx_ring->tx_stats));
55 }
56 
57 static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid)
58 {
59 	struct igb_ring *tx_ring = adapter->tx_ring[qid];
60 	struct igb_ring *rx_ring = adapter->rx_ring[qid];
61 
62 	igb_configure_tx_ring(adapter, tx_ring);
63 	igb_configure_rx_ring(adapter, rx_ring);
64 
65 	synchronize_net();
66 
67 	clear_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags);
68 
69 	/* call igb_desc_unused which always leaves
70 	 * at least 1 descriptor unused to make sure
71 	 * next_to_use != next_to_clean
72 	 */
73 	if (rx_ring->xsk_pool)
74 		igb_alloc_rx_buffers_zc(rx_ring, rx_ring->xsk_pool,
75 					igb_desc_unused(rx_ring));
76 	else
77 		igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring));
78 
79 	/* Rx/Tx share the same napi context. */
80 	napi_enable(&rx_ring->q_vector->napi);
81 }
82 
83 struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter,
84 				   struct igb_ring *ring)
85 {
86 	int qid = ring->queue_index;
87 	struct xsk_buff_pool *pool;
88 
89 	pool = xsk_get_pool_from_qid(adapter->netdev, qid);
90 
91 	if (!igb_xdp_is_enabled(adapter))
92 		return NULL;
93 
94 	return (pool && pool->dev) ? pool : NULL;
95 }
96 
97 static int igb_xsk_pool_enable(struct igb_adapter *adapter,
98 			       struct xsk_buff_pool *pool,
99 			       u16 qid)
100 {
101 	struct net_device *netdev = adapter->netdev;
102 	struct igb_ring *rx_ring;
103 	bool if_running;
104 	int err;
105 
106 	if (qid >= adapter->num_rx_queues)
107 		return -EINVAL;
108 
109 	if (qid >= netdev->real_num_rx_queues ||
110 	    qid >= netdev->real_num_tx_queues)
111 		return -EINVAL;
112 
113 	err = xsk_pool_dma_map(pool, &adapter->pdev->dev, IGB_RX_DMA_ATTR);
114 	if (err)
115 		return err;
116 
117 	rx_ring = adapter->rx_ring[qid];
118 	if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter);
119 	if (if_running)
120 		igb_txrx_ring_disable(adapter, qid);
121 
122 	if (if_running) {
123 		err = igb_realloc_rx_buffer_info(rx_ring, true);
124 		if (!err) {
125 			igb_txrx_ring_enable(adapter, qid);
126 			/* Kick start the NAPI context so that receiving will start */
127 			err = igb_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX);
128 		}
129 
130 		if (err) {
131 			xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR);
132 			return err;
133 		}
134 	}
135 
136 	return 0;
137 }
138 
139 static int igb_xsk_pool_disable(struct igb_adapter *adapter, u16 qid)
140 {
141 	struct xsk_buff_pool *pool;
142 	struct igb_ring *rx_ring;
143 	bool if_running;
144 	int err;
145 
146 	pool = xsk_get_pool_from_qid(adapter->netdev, qid);
147 	if (!pool)
148 		return -EINVAL;
149 
150 	rx_ring = adapter->rx_ring[qid];
151 	if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter);
152 	if (if_running)
153 		igb_txrx_ring_disable(adapter, qid);
154 
155 	xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR);
156 
157 	if (if_running) {
158 		err = igb_realloc_rx_buffer_info(rx_ring, false);
159 		if (err)
160 			return err;
161 
162 		igb_txrx_ring_enable(adapter, qid);
163 	}
164 
165 	return 0;
166 }
167 
168 int igb_xsk_pool_setup(struct igb_adapter *adapter,
169 		       struct xsk_buff_pool *pool,
170 		       u16 qid)
171 {
172 	return pool ? igb_xsk_pool_enable(adapter, pool, qid) :
173 		igb_xsk_pool_disable(adapter, qid);
174 }
175 
176 static u16 igb_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
177 			     union e1000_adv_rx_desc *rx_desc, u16 count)
178 {
179 	dma_addr_t dma;
180 	u16 buffs;
181 	int i;
182 
183 	/* nothing to do */
184 	if (!count)
185 		return 0;
186 
187 	buffs = xsk_buff_alloc_batch(pool, xdp, count);
188 	for (i = 0; i < buffs; i++) {
189 		dma = xsk_buff_xdp_get_dma(*xdp);
190 		rx_desc->read.pkt_addr = cpu_to_le64(dma);
191 		rx_desc->wb.upper.length = 0;
192 
193 		rx_desc++;
194 		xdp++;
195 	}
196 
197 	return buffs;
198 }
199 
200 bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring,
201 			     struct xsk_buff_pool *xsk_pool, u16 count)
202 {
203 	u32 nb_buffs_extra = 0, nb_buffs = 0;
204 	union e1000_adv_rx_desc *rx_desc;
205 	u16 ntu = rx_ring->next_to_use;
206 	u16 total_count = count;
207 	struct xdp_buff **xdp;
208 
209 	rx_desc = IGB_RX_DESC(rx_ring, ntu);
210 	xdp = &rx_ring->rx_buffer_info_zc[ntu];
211 
212 	if (ntu + count >= rx_ring->count) {
213 		nb_buffs_extra = igb_fill_rx_descs(xsk_pool, xdp, rx_desc,
214 						   rx_ring->count - ntu);
215 		if (nb_buffs_extra != rx_ring->count - ntu) {
216 			ntu += nb_buffs_extra;
217 			goto exit;
218 		}
219 		rx_desc = IGB_RX_DESC(rx_ring, 0);
220 		xdp = rx_ring->rx_buffer_info_zc;
221 		ntu = 0;
222 		count -= nb_buffs_extra;
223 	}
224 
225 	nb_buffs = igb_fill_rx_descs(xsk_pool, xdp, rx_desc, count);
226 	ntu += nb_buffs;
227 	if (ntu == rx_ring->count)
228 		ntu = 0;
229 
230 	/* clear the length for the next_to_use descriptor */
231 	rx_desc = IGB_RX_DESC(rx_ring, ntu);
232 	rx_desc->wb.upper.length = 0;
233 
234 exit:
235 	if (rx_ring->next_to_use != ntu) {
236 		rx_ring->next_to_use = ntu;
237 
238 		/* Force memory writes to complete before letting h/w
239 		 * know there are new descriptors to fetch.  (Only
240 		 * applicable for weak-ordered memory model archs,
241 		 * such as IA-64).
242 		 */
243 		wmb();
244 		writel(ntu, rx_ring->tail);
245 	}
246 
247 	return total_count == (nb_buffs + nb_buffs_extra);
248 }
249 
250 void igb_clean_rx_ring_zc(struct igb_ring *rx_ring)
251 {
252 	u16 ntc = rx_ring->next_to_clean;
253 	u16 ntu = rx_ring->next_to_use;
254 
255 	while (ntc != ntu) {
256 		struct xdp_buff *xdp = rx_ring->rx_buffer_info_zc[ntc];
257 
258 		xsk_buff_free(xdp);
259 		ntc++;
260 		if (ntc >= rx_ring->count)
261 			ntc = 0;
262 	}
263 }
264 
265 static struct sk_buff *igb_construct_skb_zc(struct igb_ring *rx_ring,
266 					    struct xdp_buff *xdp,
267 					    ktime_t timestamp)
268 {
269 	unsigned int totalsize = xdp->data_end - xdp->data_meta;
270 	unsigned int metasize = xdp->data - xdp->data_meta;
271 	struct sk_buff *skb;
272 
273 	net_prefetch(xdp->data_meta);
274 
275 	/* allocate a skb to store the frags */
276 	skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize);
277 	if (unlikely(!skb))
278 		return NULL;
279 
280 	if (timestamp)
281 		skb_hwtstamps(skb)->hwtstamp = timestamp;
282 
283 	memcpy(__skb_put(skb, totalsize), xdp->data_meta,
284 	       ALIGN(totalsize, sizeof(long)));
285 
286 	if (metasize) {
287 		skb_metadata_set(skb, metasize);
288 		__skb_pull(skb, metasize);
289 	}
290 
291 	return skb;
292 }
293 
294 static int igb_run_xdp_zc(struct igb_adapter *adapter, struct igb_ring *rx_ring,
295 			  struct xdp_buff *xdp, struct xsk_buff_pool *xsk_pool,
296 			  struct bpf_prog *xdp_prog)
297 {
298 	int err, result = IGB_XDP_PASS;
299 	u32 act;
300 
301 	prefetchw(xdp->data_hard_start); /* xdp_frame write */
302 
303 	act = bpf_prog_run_xdp(xdp_prog, xdp);
304 
305 	if (likely(act == XDP_REDIRECT)) {
306 		err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
307 		if (!err)
308 			return IGB_XDP_REDIR;
309 
310 		if (xsk_uses_need_wakeup(xsk_pool) &&
311 		    err == -ENOBUFS)
312 			result = IGB_XDP_EXIT;
313 		else
314 			result = IGB_XDP_CONSUMED;
315 		goto out_failure;
316 	}
317 
318 	switch (act) {
319 	case XDP_PASS:
320 		break;
321 	case XDP_TX:
322 		result = igb_xdp_xmit_back(adapter, xdp);
323 		if (result == IGB_XDP_CONSUMED)
324 			goto out_failure;
325 		break;
326 	default:
327 		bpf_warn_invalid_xdp_action(adapter->netdev, xdp_prog, act);
328 		fallthrough;
329 	case XDP_ABORTED:
330 out_failure:
331 		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
332 		fallthrough;
333 	case XDP_DROP:
334 		result = IGB_XDP_CONSUMED;
335 		break;
336 	}
337 
338 	return result;
339 }
340 
341 int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector,
342 			struct xsk_buff_pool *xsk_pool, const int budget)
343 {
344 	struct igb_adapter *adapter = q_vector->adapter;
345 	unsigned int total_bytes = 0, total_packets = 0;
346 	struct igb_ring *rx_ring = q_vector->rx.ring;
347 	u32 ntc = rx_ring->next_to_clean;
348 	struct bpf_prog *xdp_prog;
349 	unsigned int xdp_xmit = 0;
350 	bool failure = false;
351 	u16 entries_to_alloc;
352 	struct sk_buff *skb;
353 
354 	/* xdp_prog cannot be NULL in the ZC path */
355 	xdp_prog = READ_ONCE(rx_ring->xdp_prog);
356 
357 	while (likely(total_packets < budget)) {
358 		union e1000_adv_rx_desc *rx_desc;
359 		ktime_t timestamp = 0;
360 		struct xdp_buff *xdp;
361 		unsigned int size;
362 		int xdp_res = 0;
363 
364 		rx_desc = IGB_RX_DESC(rx_ring, ntc);
365 		size = le16_to_cpu(rx_desc->wb.upper.length);
366 		if (!size)
367 			break;
368 
369 		/* This memory barrier is needed to keep us from reading
370 		 * any other fields out of the rx_desc until we know the
371 		 * descriptor has been written back
372 		 */
373 		dma_rmb();
374 
375 		xdp = rx_ring->rx_buffer_info_zc[ntc];
376 		xsk_buff_set_size(xdp, size);
377 		xsk_buff_dma_sync_for_cpu(xdp);
378 
379 		/* pull rx packet timestamp if available and valid */
380 		if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
381 			int ts_hdr_len;
382 
383 			ts_hdr_len = igb_ptp_rx_pktstamp(rx_ring->q_vector,
384 							 xdp->data,
385 							 &timestamp);
386 
387 			xdp->data += ts_hdr_len;
388 			xdp->data_meta += ts_hdr_len;
389 			size -= ts_hdr_len;
390 		}
391 
392 		xdp_res = igb_run_xdp_zc(adapter, rx_ring, xdp, xsk_pool,
393 					 xdp_prog);
394 
395 		if (xdp_res) {
396 			if (likely(xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR))) {
397 				xdp_xmit |= xdp_res;
398 			} else if (xdp_res == IGB_XDP_EXIT) {
399 				failure = true;
400 				break;
401 			} else if (xdp_res == IGB_XDP_CONSUMED) {
402 				xsk_buff_free(xdp);
403 			}
404 
405 			total_packets++;
406 			total_bytes += size;
407 			ntc++;
408 			if (ntc == rx_ring->count)
409 				ntc = 0;
410 			continue;
411 		}
412 
413 		skb = igb_construct_skb_zc(rx_ring, xdp, timestamp);
414 
415 		/* exit if we failed to retrieve a buffer */
416 		if (!skb) {
417 			rx_ring->rx_stats.alloc_failed++;
418 			break;
419 		}
420 
421 		xsk_buff_free(xdp);
422 		ntc++;
423 		if (ntc == rx_ring->count)
424 			ntc = 0;
425 
426 		if (eth_skb_pad(skb))
427 			continue;
428 
429 		/* probably a little skewed due to removing CRC */
430 		total_bytes += skb->len;
431 
432 		/* populate checksum, timestamp, VLAN, and protocol */
433 		igb_process_skb_fields(rx_ring, rx_desc, skb);
434 
435 		napi_gro_receive(&q_vector->napi, skb);
436 
437 		/* update budget accounting */
438 		total_packets++;
439 	}
440 
441 	rx_ring->next_to_clean = ntc;
442 
443 	if (xdp_xmit)
444 		igb_finalize_xdp(adapter, xdp_xmit);
445 
446 	igb_update_rx_stats(q_vector, total_packets, total_bytes);
447 
448 	entries_to_alloc = igb_desc_unused(rx_ring);
449 	if (entries_to_alloc >= IGB_RX_BUFFER_WRITE)
450 		failure |= !igb_alloc_rx_buffers_zc(rx_ring, xsk_pool,
451 						    entries_to_alloc);
452 
453 	if (xsk_uses_need_wakeup(xsk_pool)) {
454 		if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
455 			xsk_set_rx_need_wakeup(xsk_pool);
456 		else
457 			xsk_clear_rx_need_wakeup(xsk_pool);
458 
459 		return (int)total_packets;
460 	}
461 	return failure ? budget : (int)total_packets;
462 }
463 
464 bool igb_xmit_zc(struct igb_ring *tx_ring, struct xsk_buff_pool *xsk_pool)
465 {
466 	unsigned int budget = igb_desc_unused(tx_ring);
467 	u32 cmd_type, olinfo_status, nb_pkts, i = 0;
468 	struct xdp_desc *descs = xsk_pool->tx_descs;
469 	union e1000_adv_tx_desc *tx_desc = NULL;
470 	struct igb_tx_buffer *tx_buffer_info;
471 	unsigned int total_bytes = 0;
472 	dma_addr_t dma;
473 
474 	if (!netif_carrier_ok(tx_ring->netdev))
475 		return true;
476 
477 	if (test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))
478 		return true;
479 
480 	nb_pkts = xsk_tx_peek_release_desc_batch(xsk_pool, budget);
481 	if (!nb_pkts)
482 		return true;
483 
484 	while (nb_pkts-- > 0) {
485 		dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr);
486 		xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, descs[i].len);
487 
488 		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
489 		tx_buffer_info->bytecount = descs[i].len;
490 		tx_buffer_info->type = IGB_TYPE_XSK;
491 		tx_buffer_info->xdpf = NULL;
492 		tx_buffer_info->gso_segs = 1;
493 		tx_buffer_info->time_stamp = jiffies;
494 
495 		tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
496 		tx_desc->read.buffer_addr = cpu_to_le64(dma);
497 
498 		/* put descriptor type bits */
499 		cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
500 			   E1000_ADVTXD_DCMD_IFCS;
501 		olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;
502 
503 		/* FIXME: This sets the Report Status (RS) bit for every
504 		 * descriptor. One nice to have optimization would be to set it
505 		 * only for the last descriptor in the whole batch. See Intel
506 		 * ice driver for an example on how to do it.
507 		 */
508 		cmd_type |= descs[i].len | IGB_TXD_DCMD;
509 		tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
510 		tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);
511 
512 		total_bytes += descs[i].len;
513 
514 		i++;
515 		tx_ring->next_to_use++;
516 		tx_buffer_info->next_to_watch = tx_desc;
517 		if (tx_ring->next_to_use == tx_ring->count)
518 			tx_ring->next_to_use = 0;
519 	}
520 
521 	netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes);
522 	igb_xdp_ring_update_tail(tx_ring);
523 
524 	return nb_pkts < budget;
525 }
526 
527 int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
528 {
529 	struct igb_adapter *adapter = netdev_priv(dev);
530 	struct e1000_hw *hw = &adapter->hw;
531 	struct igb_ring *ring;
532 	u32 eics = 0;
533 
534 	if (test_bit(__IGB_DOWN, &adapter->state))
535 		return -ENETDOWN;
536 
537 	if (!igb_xdp_is_enabled(adapter))
538 		return -EINVAL;
539 
540 	if (qid >= adapter->num_tx_queues)
541 		return -EINVAL;
542 
543 	ring = adapter->tx_ring[qid];
544 
545 	if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags))
546 		return -ENETDOWN;
547 
548 	if (!READ_ONCE(ring->xsk_pool))
549 		return -EINVAL;
550 
551 	if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) {
552 		/* Cause software interrupt */
553 		if (adapter->flags & IGB_FLAG_HAS_MSIX) {
554 			eics |= ring->q_vector->eims_value;
555 			wr32(E1000_EICS, eics);
556 		} else {
557 			wr32(E1000_ICS, E1000_ICS_RXDMT0);
558 		}
559 	}
560 
561 	return 0;
562 }
563