xref: /freebsd/sys/dev/netmap/netmap_kloop.c (revision c7046f76c2c027b00c0e6ba57cfd28f1a78f5e23)
1 /*
2  * Copyright (C) 2016-2018 Vincenzo Maffione
3  * Copyright (C) 2015 Stefano Garzarella
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *   1. Redistributions of source code must retain the above copyright
10  *      notice, this list of conditions and the following disclaimer.
11  *   2. Redistributions in binary form must reproduce the above copyright
12  *      notice, this list of conditions and the following disclaimer in the
13  *      documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 /*
31  * common headers
32  */
33 #if defined(__FreeBSD__)
34 #include <sys/cdefs.h>
35 #include <sys/param.h>
36 #include <sys/kernel.h>
37 #include <sys/types.h>
38 #include <sys/selinfo.h>
39 #include <sys/socket.h>
40 #include <net/if.h>
41 #include <net/if_var.h>
42 #include <machine/bus.h>
43 
44 #define usleep_range(_1, _2) \
45         pause_sbt("sync-kloop-sleep", SBT_1US * _1, SBT_1US * 1, C_ABSOLUTE)
46 
47 #elif defined(linux)
48 #include <bsd_glue.h>
49 #include <linux/file.h>
50 #include <linux/eventfd.h>
51 #endif
52 
53 #include <net/netmap.h>
54 #include <dev/netmap/netmap_kern.h>
55 #include <net/netmap_virt.h>
56 #include <dev/netmap/netmap_mem2.h>
57 
58 /* Support for eventfd-based notifications. */
59 #if defined(linux)
60 #define SYNC_KLOOP_POLL
61 #endif
62 
63 /* Write kring pointers (hwcur, hwtail) to the CSB.
64  * This routine is coupled with ptnetmap_guest_read_kring_csb(). */
65 static inline void
66 sync_kloop_kernel_write(struct nm_csb_ktoa __user *ptr, uint32_t hwcur,
67 			   uint32_t hwtail)
68 {
69 	/* Issue a first store-store barrier to make sure writes to the
70 	 * netmap ring do not overcome updates on ktoa->hwcur and ktoa->hwtail. */
71 	nm_stst_barrier();
72 
73 	/*
74 	 * The same scheme used in nm_sync_kloop_appl_write() applies here.
75 	 * We allow the application to read a value of hwcur more recent than the value
76 	 * of hwtail, since this would anyway result in a consistent view of the
77 	 * ring state (and hwcur can never wraparound hwtail, since hwcur must be
78 	 * behind head).
79 	 *
80 	 * The following memory barrier scheme is used to make this happen:
81 	 *
82 	 *          Application            Kernel
83 	 *
84 	 *          STORE(hwcur)           LOAD(hwtail)
85 	 *          wmb() <------------->  rmb()
86 	 *          STORE(hwtail)          LOAD(hwcur)
87 	 */
88 	CSB_WRITE(ptr, hwcur, hwcur);
89 	nm_stst_barrier();
90 	CSB_WRITE(ptr, hwtail, hwtail);
91 }
92 
93 /* Read kring pointers (head, cur, sync_flags) from the CSB.
94  * This routine is coupled with ptnetmap_guest_write_kring_csb(). */
95 static inline void
96 sync_kloop_kernel_read(struct nm_csb_atok __user *ptr,
97 			  struct netmap_ring *shadow_ring,
98 			  uint32_t num_slots)
99 {
100 	/*
101 	 * We place a memory barrier to make sure that the update of head never
102 	 * overtakes the update of cur.
103 	 * (see explanation in sync_kloop_kernel_write).
104 	 */
105 	CSB_READ(ptr, head, shadow_ring->head);
106 	nm_ldld_barrier();
107 	CSB_READ(ptr, cur, shadow_ring->cur);
108 	CSB_READ(ptr, sync_flags, shadow_ring->flags);
109 
110 	/* Make sure that loads from atok->head and atok->cur are not delayed
111 	 * after the loads from the netmap ring. */
112 	nm_ldld_barrier();
113 }
114 
115 /* Enable or disable application --> kernel kicks. */
116 static inline void
117 csb_ktoa_kick_enable(struct nm_csb_ktoa __user *csb_ktoa, uint32_t val)
118 {
119 	CSB_WRITE(csb_ktoa, kern_need_kick, val);
120 }
121 
122 #ifdef SYNC_KLOOP_POLL
123 /* Are application interrupt enabled or disabled? */
124 static inline uint32_t
125 csb_atok_intr_enabled(struct nm_csb_atok __user *csb_atok)
126 {
127 	uint32_t v;
128 
129 	CSB_READ(csb_atok, appl_need_kick, v);
130 
131 	return v;
132 }
133 #endif  /* SYNC_KLOOP_POLL */
134 
135 static inline void
136 sync_kloop_kring_dump(const char *title, const struct netmap_kring *kring)
137 {
138 	nm_prinf("%s, kring %s, hwcur %d, rhead %d, "
139 		"rcur %d, rtail %d, hwtail %d",
140 		title, kring->name, kring->nr_hwcur, kring->rhead,
141 		kring->rcur, kring->rtail, kring->nr_hwtail);
142 }
143 
144 /* Arguments for netmap_sync_kloop_tx_ring() and
145  * netmap_sync_kloop_rx_ring().
146  */
147 struct sync_kloop_ring_args {
148 	struct netmap_kring *kring;
149 	struct nm_csb_atok *csb_atok;
150 	struct nm_csb_ktoa *csb_ktoa;
151 #ifdef SYNC_KLOOP_POLL
152 	struct eventfd_ctx *irq_ctx;
153 #endif /* SYNC_KLOOP_POLL */
154 	/* Are we busy waiting rather than using a schedule() loop ? */
155 	bool busy_wait;
156 	/* Are we processing in the context of VM exit ? */
157 	bool direct;
158 };
159 
160 static void
161 netmap_sync_kloop_tx_ring(const struct sync_kloop_ring_args *a)
162 {
163 	struct netmap_kring *kring = a->kring;
164 	struct nm_csb_atok *csb_atok = a->csb_atok;
165 	struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa;
166 	struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */
167 #ifdef SYNC_KLOOP_POLL
168 	bool more_txspace = false;
169 #endif /* SYNC_KLOOP_POLL */
170 	uint32_t num_slots;
171 	int batch;
172 
173 	if (unlikely(nm_kr_tryget(kring, 1, NULL))) {
174 		return;
175 	}
176 
177 	num_slots = kring->nkr_num_slots;
178 
179 	/* Disable application --> kernel notifications. */
180 	if (!a->direct) {
181 		csb_ktoa_kick_enable(csb_ktoa, 0);
182 	}
183 	/* Copy the application kring pointers from the CSB */
184 	sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
185 
186 	for (;;) {
187 		batch = shadow_ring.head - kring->nr_hwcur;
188 		if (batch < 0)
189 			batch += num_slots;
190 
191 #ifdef PTN_TX_BATCH_LIM
192 		if (batch > PTN_TX_BATCH_LIM(num_slots)) {
193 			/* If application moves ahead too fast, let's cut the move so
194 			 * that we don't exceed our batch limit. */
195 			uint32_t head_lim = kring->nr_hwcur + PTN_TX_BATCH_LIM(num_slots);
196 
197 			if (head_lim >= num_slots)
198 				head_lim -= num_slots;
199 			nm_prdis(1, "batch: %d head: %d head_lim: %d", batch, shadow_ring.head,
200 					head_lim);
201 			shadow_ring.head = head_lim;
202 			batch = PTN_TX_BATCH_LIM(num_slots);
203 		}
204 #endif /* PTN_TX_BATCH_LIM */
205 
206 		if (nm_kr_txspace(kring) <= (num_slots >> 1)) {
207 			shadow_ring.flags |= NAF_FORCE_RECLAIM;
208 		}
209 
210 		/* Netmap prologue */
211 		shadow_ring.tail = kring->rtail;
212 		if (unlikely(nm_txsync_prologue(kring, &shadow_ring) >= num_slots)) {
213 			/* Reinit ring and enable notifications. */
214 			netmap_ring_reinit(kring);
215 			if (!a->busy_wait) {
216 				csb_ktoa_kick_enable(csb_ktoa, 1);
217 			}
218 			break;
219 		}
220 
221 		if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) {
222 			sync_kloop_kring_dump("pre txsync", kring);
223 		}
224 
225 		if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) {
226 			if (!a->busy_wait) {
227 				/* Re-enable notifications. */
228 				csb_ktoa_kick_enable(csb_ktoa, 1);
229 			}
230 			nm_prerr("txsync() failed");
231 			break;
232 		}
233 
234 		/*
235 		 * Finalize
236 		 * Copy kernel hwcur and hwtail into the CSB for the application sync(), and
237 		 * do the nm_sync_finalize.
238 		 */
239 		sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur,
240 				kring->nr_hwtail);
241 		if (kring->rtail != kring->nr_hwtail) {
242 			/* Some more room available in the parent adapter. */
243 			kring->rtail = kring->nr_hwtail;
244 #ifdef SYNC_KLOOP_POLL
245 			more_txspace = true;
246 #endif /* SYNC_KLOOP_POLL */
247 		}
248 
249 		if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) {
250 			sync_kloop_kring_dump("post txsync", kring);
251 		}
252 
253 		/* Interrupt the application if needed. */
254 #ifdef SYNC_KLOOP_POLL
255 		if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) {
256 			/* We could disable kernel --> application kicks here,
257 			 * to avoid spurious interrupts. */
258 			eventfd_signal(a->irq_ctx, 1);
259 			more_txspace = false;
260 		}
261 #endif /* SYNC_KLOOP_POLL */
262 
263 		/* Read CSB to see if there is more work to do. */
264 		sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
265 		if (shadow_ring.head == kring->rhead) {
266 			if (a->busy_wait) {
267 				break;
268 			}
269 			/*
270 			 * No more packets to transmit. We enable notifications and
271 			 * go to sleep, waiting for a kick from the application when new
272 			 * new slots are ready for transmission.
273 			 */
274 			/* Re-enable notifications. */
275 			csb_ktoa_kick_enable(csb_ktoa, 1);
276 			/* Double check, with store-load memory barrier. */
277 			nm_stld_barrier();
278 			sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
279 			if (shadow_ring.head != kring->rhead) {
280 				/* We won the race condition, there are more packets to
281 				 * transmit. Disable notifications and do another cycle */
282 				csb_ktoa_kick_enable(csb_ktoa, 0);
283 				continue;
284 			}
285 			break;
286 		}
287 
288 		if (nm_kr_txempty(kring)) {
289 			/* No more available TX slots. We stop waiting for a notification
290 			 * from the backend (netmap_tx_irq). */
291 			nm_prdis(1, "TX ring");
292 			break;
293 		}
294 	}
295 
296 	nm_kr_put(kring);
297 
298 #ifdef SYNC_KLOOP_POLL
299 	if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) {
300 		eventfd_signal(a->irq_ctx, 1);
301 	}
302 #endif /* SYNC_KLOOP_POLL */
303 }
304 
305 /* RX cycle without receive any packets */
306 #define SYNC_LOOP_RX_DRY_CYCLES_MAX	2
307 
308 static inline int
309 sync_kloop_norxslots(struct netmap_kring *kring, uint32_t g_head)
310 {
311 	return (NM_ACCESS_ONCE(kring->nr_hwtail) == nm_prev(g_head,
312 				kring->nkr_num_slots - 1));
313 }
314 
315 static void
316 netmap_sync_kloop_rx_ring(const struct sync_kloop_ring_args *a)
317 {
318 
319 	struct netmap_kring *kring = a->kring;
320 	struct nm_csb_atok *csb_atok = a->csb_atok;
321 	struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa;
322 	struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */
323 	int dry_cycles = 0;
324 #ifdef SYNC_KLOOP_POLL
325 	bool some_recvd = false;
326 #endif /* SYNC_KLOOP_POLL */
327 	uint32_t num_slots;
328 
329 	if (unlikely(nm_kr_tryget(kring, 1, NULL))) {
330 		return;
331 	}
332 
333 	num_slots = kring->nkr_num_slots;
334 
335 	/* Get RX csb_atok and csb_ktoa pointers from the CSB. */
336 	num_slots = kring->nkr_num_slots;
337 
338 	/* Disable notifications. */
339 	if (!a->direct) {
340 		csb_ktoa_kick_enable(csb_ktoa, 0);
341 	}
342 	/* Copy the application kring pointers from the CSB */
343 	sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
344 
345 	for (;;) {
346 		uint32_t hwtail;
347 
348 		/* Netmap prologue */
349 		shadow_ring.tail = kring->rtail;
350 		if (unlikely(nm_rxsync_prologue(kring, &shadow_ring) >= num_slots)) {
351 			/* Reinit ring and enable notifications. */
352 			netmap_ring_reinit(kring);
353 			if (!a->busy_wait) {
354 				csb_ktoa_kick_enable(csb_ktoa, 1);
355 			}
356 			break;
357 		}
358 
359 		if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) {
360 			sync_kloop_kring_dump("pre rxsync", kring);
361 		}
362 
363 		if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) {
364 			if (!a->busy_wait) {
365 				/* Re-enable notifications. */
366 				csb_ktoa_kick_enable(csb_ktoa, 1);
367 			}
368 			nm_prerr("rxsync() failed");
369 			break;
370 		}
371 
372 		/*
373 		 * Finalize
374 		 * Copy kernel hwcur and hwtail into the CSB for the application sync()
375 		 */
376 		hwtail = NM_ACCESS_ONCE(kring->nr_hwtail);
377 		sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur, hwtail);
378 		if (kring->rtail != hwtail) {
379 			kring->rtail = hwtail;
380 #ifdef SYNC_KLOOP_POLL
381 			some_recvd = true;
382 #endif /* SYNC_KLOOP_POLL */
383 			dry_cycles = 0;
384 		} else {
385 			dry_cycles++;
386 		}
387 
388 		if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) {
389 			sync_kloop_kring_dump("post rxsync", kring);
390 		}
391 
392 #ifdef SYNC_KLOOP_POLL
393 		/* Interrupt the application if needed. */
394 		if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) {
395 			/* We could disable kernel --> application kicks here,
396 			 * to avoid spurious interrupts. */
397 			eventfd_signal(a->irq_ctx, 1);
398 			some_recvd = false;
399 		}
400 #endif /* SYNC_KLOOP_POLL */
401 
402 		/* Read CSB to see if there is more work to do. */
403 		sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
404 		if (sync_kloop_norxslots(kring, shadow_ring.head)) {
405 			if (a->busy_wait) {
406 				break;
407 			}
408 			/*
409 			 * No more slots available for reception. We enable notification and
410 			 * go to sleep, waiting for a kick from the application when new receive
411 			 * slots are available.
412 			 */
413 			/* Re-enable notifications. */
414 			csb_ktoa_kick_enable(csb_ktoa, 1);
415 			/* Double check, with store-load memory barrier. */
416 			nm_stld_barrier();
417 			sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
418 			if (!sync_kloop_norxslots(kring, shadow_ring.head)) {
419 				/* We won the race condition, more slots are available. Disable
420 				 * notifications and do another cycle. */
421 				csb_ktoa_kick_enable(csb_ktoa, 0);
422 				continue;
423 			}
424 			break;
425 		}
426 
427 		hwtail = NM_ACCESS_ONCE(kring->nr_hwtail);
428 		if (unlikely(hwtail == kring->rhead ||
429 					dry_cycles >= SYNC_LOOP_RX_DRY_CYCLES_MAX)) {
430 			/* No more packets to be read from the backend. We stop and
431 			 * wait for a notification from the backend (netmap_rx_irq). */
432 			nm_prdis(1, "nr_hwtail: %d rhead: %d dry_cycles: %d",
433 					hwtail, kring->rhead, dry_cycles);
434 			break;
435 		}
436 	}
437 
438 	nm_kr_put(kring);
439 
440 #ifdef SYNC_KLOOP_POLL
441 	/* Interrupt the application if needed. */
442 	if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) {
443 		eventfd_signal(a->irq_ctx, 1);
444 	}
445 #endif /* SYNC_KLOOP_POLL */
446 }
447 
448 #ifdef SYNC_KLOOP_POLL
449 struct sync_kloop_poll_ctx;
450 struct sync_kloop_poll_entry {
451 	/* Support for receiving notifications from
452 	 * a netmap ring or from the application. */
453 	struct file *filp;
454 	wait_queue_t wait;
455 	wait_queue_head_t *wqh;
456 
457 	/* Support for sending notifications to the application. */
458 	struct eventfd_ctx *irq_ctx;
459 	struct file *irq_filp;
460 
461 	/* Arguments for the ring processing function. Useful
462 	 * in case of custom wake-up function. */
463 	struct sync_kloop_ring_args *args;
464 	struct sync_kloop_poll_ctx *parent;
465 
466 };
467 
468 struct sync_kloop_poll_ctx {
469 	poll_table wait_table;
470 	unsigned int next_entry;
471 	int (*next_wake_fun)(wait_queue_t *, unsigned, int, void *);
472 	unsigned int num_entries;
473 	unsigned int num_tx_rings;
474 	unsigned int num_rings;
475 	/* First num_tx_rings entries are for the TX kicks.
476 	 * Then the RX kicks entries follow. The last two
477 	 * entries are for TX irq, and RX irq. */
478 	struct sync_kloop_poll_entry entries[0];
479 };
480 
481 static void
482 sync_kloop_poll_table_queue_proc(struct file *file, wait_queue_head_t *wqh,
483 				poll_table *pt)
484 {
485 	struct sync_kloop_poll_ctx *poll_ctx =
486 		container_of(pt, struct sync_kloop_poll_ctx, wait_table);
487 	struct sync_kloop_poll_entry *entry = poll_ctx->entries +
488 						poll_ctx->next_entry;
489 
490 	BUG_ON(poll_ctx->next_entry >= poll_ctx->num_entries);
491 	entry->wqh = wqh;
492 	entry->filp = file;
493 	/* Use the default wake up function. */
494 	if (poll_ctx->next_wake_fun == NULL) {
495 		init_waitqueue_entry(&entry->wait, current);
496 	} else {
497 		init_waitqueue_func_entry(&entry->wait,
498 		    poll_ctx->next_wake_fun);
499 	}
500 	add_wait_queue(wqh, &entry->wait);
501 }
502 
503 static int
504 sync_kloop_tx_kick_wake_fun(wait_queue_t *wait, unsigned mode,
505     int wake_flags, void *key)
506 {
507 	struct sync_kloop_poll_entry *entry =
508 	    container_of(wait, struct sync_kloop_poll_entry, wait);
509 
510 	netmap_sync_kloop_tx_ring(entry->args);
511 
512 	return 0;
513 }
514 
515 static int
516 sync_kloop_tx_irq_wake_fun(wait_queue_t *wait, unsigned mode,
517     int wake_flags, void *key)
518 {
519 	struct sync_kloop_poll_entry *entry =
520 	    container_of(wait, struct sync_kloop_poll_entry, wait);
521 	struct sync_kloop_poll_ctx *poll_ctx = entry->parent;
522 	int i;
523 
524 	for (i = 0; i < poll_ctx->num_tx_rings; i++) {
525 		struct eventfd_ctx *irq_ctx = poll_ctx->entries[i].irq_ctx;
526 
527 		if (irq_ctx) {
528 			eventfd_signal(irq_ctx, 1);
529 		}
530 	}
531 
532 	return 0;
533 }
534 
535 static int
536 sync_kloop_rx_kick_wake_fun(wait_queue_t *wait, unsigned mode,
537     int wake_flags, void *key)
538 {
539 	struct sync_kloop_poll_entry *entry =
540 	    container_of(wait, struct sync_kloop_poll_entry, wait);
541 
542 	netmap_sync_kloop_rx_ring(entry->args);
543 
544 	return 0;
545 }
546 
547 static int
548 sync_kloop_rx_irq_wake_fun(wait_queue_t *wait, unsigned mode,
549     int wake_flags, void *key)
550 {
551 	struct sync_kloop_poll_entry *entry =
552 	    container_of(wait, struct sync_kloop_poll_entry, wait);
553 	struct sync_kloop_poll_ctx *poll_ctx = entry->parent;
554 	int i;
555 
556 	for (i = poll_ctx->num_tx_rings; i < poll_ctx->num_rings; i++) {
557 		struct eventfd_ctx *irq_ctx = poll_ctx->entries[i].irq_ctx;
558 
559 		if (irq_ctx) {
560 			eventfd_signal(irq_ctx, 1);
561 		}
562 	}
563 
564 	return 0;
565 }
566 #endif  /* SYNC_KLOOP_POLL */
567 
568 int
569 netmap_sync_kloop(struct netmap_priv_d *priv, struct nmreq_header *hdr)
570 {
571 	struct nmreq_sync_kloop_start *req =
572 		(struct nmreq_sync_kloop_start *)(uintptr_t)hdr->nr_body;
573 	struct nmreq_opt_sync_kloop_eventfds *eventfds_opt = NULL;
574 #ifdef SYNC_KLOOP_POLL
575 	struct sync_kloop_poll_ctx *poll_ctx = NULL;
576 #endif  /* SYNC_KLOOP_POLL */
577 	int num_rx_rings, num_tx_rings, num_rings;
578 	struct sync_kloop_ring_args *args = NULL;
579 	uint32_t sleep_us = req->sleep_us;
580 	struct nm_csb_atok* csb_atok_base;
581 	struct nm_csb_ktoa* csb_ktoa_base;
582 	struct netmap_adapter *na;
583 	struct nmreq_option *opt;
584 	bool na_could_sleep = false;
585 	bool busy_wait = true;
586 	bool direct_tx = false;
587 	bool direct_rx = false;
588 	int err = 0;
589 	int i;
590 
591 	if (sleep_us > 1000000) {
592 		/* We do not accept sleeping for more than a second. */
593 		return EINVAL;
594 	}
595 
596 	if (priv->np_nifp == NULL) {
597 		return ENXIO;
598 	}
599 	mb(); /* make sure following reads are not from cache */
600 
601 	na = priv->np_na;
602 	if (!nm_netmap_on(na)) {
603 		return ENXIO;
604 	}
605 
606 	NMG_LOCK();
607 	/* Make sure the application is working in CSB mode. */
608 	if (!priv->np_csb_atok_base || !priv->np_csb_ktoa_base) {
609 		NMG_UNLOCK();
610 		nm_prerr("sync-kloop on %s requires "
611 				"NETMAP_REQ_OPT_CSB option", na->name);
612 		return EINVAL;
613 	}
614 
615 	csb_atok_base = priv->np_csb_atok_base;
616 	csb_ktoa_base = priv->np_csb_ktoa_base;
617 
618 	/* Make sure that no kloop is currently running. */
619 	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
620 		err = EBUSY;
621 	}
622 	priv->np_kloop_state |= NM_SYNC_KLOOP_RUNNING;
623 	NMG_UNLOCK();
624 	if (err) {
625 		return err;
626 	}
627 
628 	num_rx_rings = priv->np_qlast[NR_RX] - priv->np_qfirst[NR_RX];
629 	num_tx_rings = priv->np_qlast[NR_TX] - priv->np_qfirst[NR_TX];
630 	num_rings = num_tx_rings + num_rx_rings;
631 
632 	args = nm_os_malloc(num_rings * sizeof(args[0]));
633 	if (!args) {
634 		err = ENOMEM;
635 		goto out;
636 	}
637 
638 	/* Prepare the arguments for netmap_sync_kloop_tx_ring()
639 	 * and netmap_sync_kloop_rx_ring(). */
640 	for (i = 0; i < num_tx_rings; i++) {
641 		struct sync_kloop_ring_args *a = args + i;
642 
643 		a->kring = NMR(na, NR_TX)[i + priv->np_qfirst[NR_TX]];
644 		a->csb_atok = csb_atok_base + i;
645 		a->csb_ktoa = csb_ktoa_base + i;
646 		a->busy_wait = busy_wait;
647 		a->direct = direct_tx;
648 	}
649 	for (i = 0; i < num_rx_rings; i++) {
650 		struct sync_kloop_ring_args *a = args + num_tx_rings + i;
651 
652 		a->kring = NMR(na, NR_RX)[i + priv->np_qfirst[NR_RX]];
653 		a->csb_atok = csb_atok_base + num_tx_rings + i;
654 		a->csb_ktoa = csb_ktoa_base + num_tx_rings + i;
655 		a->busy_wait = busy_wait;
656 		a->direct = direct_rx;
657 	}
658 
659 	/* Validate notification options. */
660 	opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_SYNC_KLOOP_MODE);
661 	if (opt != NULL) {
662 		struct nmreq_opt_sync_kloop_mode *mode_opt =
663 		    (struct nmreq_opt_sync_kloop_mode *)opt;
664 
665 		direct_tx = !!(mode_opt->mode & NM_OPT_SYNC_KLOOP_DIRECT_TX);
666 		direct_rx = !!(mode_opt->mode & NM_OPT_SYNC_KLOOP_DIRECT_RX);
667 		if (mode_opt->mode & ~(NM_OPT_SYNC_KLOOP_DIRECT_TX |
668 		    NM_OPT_SYNC_KLOOP_DIRECT_RX)) {
669 			opt->nro_status = err = EINVAL;
670 			goto out;
671 		}
672 		opt->nro_status = 0;
673 	}
674 	opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS);
675 	if (opt != NULL) {
676 		if (opt->nro_size != sizeof(*eventfds_opt) +
677 			sizeof(eventfds_opt->eventfds[0]) * num_rings) {
678 			/* Option size not consistent with the number of
679 			 * entries. */
680 			opt->nro_status = err = EINVAL;
681 			goto out;
682 		}
683 #ifdef SYNC_KLOOP_POLL
684 		eventfds_opt = (struct nmreq_opt_sync_kloop_eventfds *)opt;
685 		opt->nro_status = 0;
686 
687 		/* Check if some ioeventfd entry is not defined, and force sleep
688 		 * synchronization in that case. */
689 		busy_wait = false;
690 		for (i = 0; i < num_rings; i++) {
691 			if (eventfds_opt->eventfds[i].ioeventfd < 0) {
692 				busy_wait = true;
693 				break;
694 			}
695 		}
696 
697 		if (busy_wait && (direct_tx || direct_rx)) {
698 			/* For direct processing we need all the
699 			 * ioeventfds to be valid. */
700 			opt->nro_status = err = EINVAL;
701 			goto out;
702 		}
703 
704 		/* We need 2 poll entries for TX and RX notifications coming
705 		 * from the netmap adapter, plus one entries per ring for the
706 		 * notifications coming from the application. */
707 		poll_ctx = nm_os_malloc(sizeof(*poll_ctx) +
708 				(num_rings + 2) * sizeof(poll_ctx->entries[0]));
709 		init_poll_funcptr(&poll_ctx->wait_table,
710 					sync_kloop_poll_table_queue_proc);
711 		poll_ctx->num_entries = 2 + num_rings;
712 		poll_ctx->num_tx_rings = num_tx_rings;
713 		poll_ctx->num_rings = num_rings;
714 		poll_ctx->next_entry = 0;
715 		poll_ctx->next_wake_fun = NULL;
716 
717 		if (direct_tx && (na->na_flags & NAF_BDG_MAYSLEEP)) {
718 			/* In direct mode, VALE txsync is called from
719 			 * wake-up context, where it is not possible
720 			 * to sleep.
721 			 */
722 			na->na_flags &= ~NAF_BDG_MAYSLEEP;
723 			na_could_sleep = true;
724 		}
725 
726 		for (i = 0; i < num_rings + 2; i++) {
727 			poll_ctx->entries[i].args = args + i;
728 			poll_ctx->entries[i].parent = poll_ctx;
729 		}
730 
731 		/* Poll for notifications coming from the applications through
732 		 * eventfds. */
733 		for (i = 0; i < num_rings; i++, poll_ctx->next_entry++) {
734 			struct eventfd_ctx *irq = NULL;
735 			struct file *filp = NULL;
736 			unsigned long mask;
737 			bool tx_ring = (i < num_tx_rings);
738 
739 			if (eventfds_opt->eventfds[i].irqfd >= 0) {
740 				filp = eventfd_fget(
741 				    eventfds_opt->eventfds[i].irqfd);
742 				if (IS_ERR(filp)) {
743 					err = PTR_ERR(filp);
744 					goto out;
745 				}
746 				irq = eventfd_ctx_fileget(filp);
747 				if (IS_ERR(irq)) {
748 					err = PTR_ERR(irq);
749 					goto out;
750 				}
751 			}
752 			poll_ctx->entries[i].irq_filp = filp;
753 			poll_ctx->entries[i].irq_ctx = irq;
754 			poll_ctx->entries[i].args->busy_wait = busy_wait;
755 			/* Don't let netmap_sync_kloop_*x_ring() use
756 			 * IRQs in direct mode. */
757 			poll_ctx->entries[i].args->irq_ctx =
758 			    ((tx_ring && direct_tx) ||
759 			    (!tx_ring && direct_rx)) ? NULL :
760 			    poll_ctx->entries[i].irq_ctx;
761 			poll_ctx->entries[i].args->direct =
762 			    (tx_ring ? direct_tx : direct_rx);
763 
764 			if (!busy_wait) {
765 				filp = eventfd_fget(
766 				    eventfds_opt->eventfds[i].ioeventfd);
767 				if (IS_ERR(filp)) {
768 					err = PTR_ERR(filp);
769 					goto out;
770 				}
771 				if (tx_ring && direct_tx) {
772 					/* Override the wake up function
773 					 * so that it can directly call
774 					 * netmap_sync_kloop_tx_ring().
775 					 */
776 					poll_ctx->next_wake_fun =
777 					    sync_kloop_tx_kick_wake_fun;
778 				} else if (!tx_ring && direct_rx) {
779 					/* Same for direct RX. */
780 					poll_ctx->next_wake_fun =
781 					    sync_kloop_rx_kick_wake_fun;
782 				} else {
783 					poll_ctx->next_wake_fun = NULL;
784 				}
785 				mask = filp->f_op->poll(filp,
786 				    &poll_ctx->wait_table);
787 				if (mask & POLLERR) {
788 					err = EINVAL;
789 					goto out;
790 				}
791 			}
792 		}
793 
794 		/* Poll for notifications coming from the netmap rings bound to
795 		 * this file descriptor. */
796 		if (!busy_wait) {
797 			NMG_LOCK();
798 			/* In direct mode, override the wake up function so
799 			 * that it can forward the netmap_tx_irq() to the
800 			 * guest. */
801 			poll_ctx->next_wake_fun = direct_tx ?
802 			    sync_kloop_tx_irq_wake_fun : NULL;
803 			poll_wait(priv->np_filp, priv->np_si[NR_TX],
804 			    &poll_ctx->wait_table);
805 			poll_ctx->next_entry++;
806 
807 			poll_ctx->next_wake_fun = direct_rx ?
808 			    sync_kloop_rx_irq_wake_fun : NULL;
809 			poll_wait(priv->np_filp, priv->np_si[NR_RX],
810 			    &poll_ctx->wait_table);
811 			poll_ctx->next_entry++;
812 			NMG_UNLOCK();
813 		}
814 #else   /* SYNC_KLOOP_POLL */
815 		opt->nro_status = EOPNOTSUPP;
816 		goto out;
817 #endif  /* SYNC_KLOOP_POLL */
818 	}
819 
820 	nm_prinf("kloop busy_wait %u, direct_tx %u, direct_rx %u, "
821 	    "na_could_sleep %u", busy_wait, direct_tx, direct_rx,
822 	    na_could_sleep);
823 
824 	/* Main loop. */
825 	for (;;) {
826 		if (unlikely(NM_ACCESS_ONCE(priv->np_kloop_state) & NM_SYNC_KLOOP_STOPPING)) {
827 			break;
828 		}
829 
830 #ifdef SYNC_KLOOP_POLL
831 		if (!busy_wait) {
832 			/* It is important to set the task state as
833 			 * interruptible before processing any TX/RX ring,
834 			 * so that if a notification on ring Y comes after
835 			 * we have processed ring Y, but before we call
836 			 * schedule(), we don't miss it. This is true because
837 			 * the wake up function will change the task state,
838 			 * and therefore the schedule_timeout() call below
839 			 * will observe the change).
840 			 */
841 			set_current_state(TASK_INTERRUPTIBLE);
842 		}
843 #endif  /* SYNC_KLOOP_POLL */
844 
845 		/* Process all the TX rings bound to this file descriptor. */
846 		for (i = 0; !direct_tx && i < num_tx_rings; i++) {
847 			struct sync_kloop_ring_args *a = args + i;
848 			netmap_sync_kloop_tx_ring(a);
849 		}
850 
851 		/* Process all the RX rings bound to this file descriptor. */
852 		for (i = 0; !direct_rx && i < num_rx_rings; i++) {
853 			struct sync_kloop_ring_args *a = args + num_tx_rings + i;
854 			netmap_sync_kloop_rx_ring(a);
855 		}
856 
857 		if (busy_wait) {
858 			/* Default synchronization method: sleep for a while. */
859 			usleep_range(sleep_us, sleep_us);
860 		}
861 #ifdef SYNC_KLOOP_POLL
862 		else {
863 			/* Yield to the scheduler waiting for a notification
864 			 * to come either from netmap or the application. */
865 			schedule_timeout(msecs_to_jiffies(3000));
866 		}
867 #endif /* SYNC_KLOOP_POLL */
868 	}
869 out:
870 #ifdef SYNC_KLOOP_POLL
871 	if (poll_ctx) {
872 		/* Stop polling from netmap and the eventfds, and deallocate
873 		 * the poll context. */
874 		if (!busy_wait) {
875 			__set_current_state(TASK_RUNNING);
876 		}
877 		for (i = 0; i < poll_ctx->next_entry; i++) {
878 			struct sync_kloop_poll_entry *entry =
879 						poll_ctx->entries + i;
880 
881 			if (entry->wqh)
882 				remove_wait_queue(entry->wqh, &entry->wait);
883 			/* We did not get a reference to the eventfds, but
884 			 * don't do that on netmap file descriptors (since
885 			 * a reference was not taken. */
886 			if (entry->filp && entry->filp != priv->np_filp)
887 				fput(entry->filp);
888 			if (entry->irq_ctx)
889 				eventfd_ctx_put(entry->irq_ctx);
890 			if (entry->irq_filp)
891 				fput(entry->irq_filp);
892 		}
893 		nm_os_free(poll_ctx);
894 		poll_ctx = NULL;
895 	}
896 #endif /* SYNC_KLOOP_POLL */
897 
898 	if (args) {
899 		nm_os_free(args);
900 		args = NULL;
901 	}
902 
903 	/* Reset the kloop state. */
904 	NMG_LOCK();
905 	priv->np_kloop_state = 0;
906 	if (na_could_sleep) {
907 		na->na_flags |= NAF_BDG_MAYSLEEP;
908 	}
909 	NMG_UNLOCK();
910 
911 	return err;
912 }
913 
914 int
915 netmap_sync_kloop_stop(struct netmap_priv_d *priv)
916 {
917 	struct netmap_adapter *na;
918 	bool running = true;
919 	int err = 0;
920 
921 	if (priv->np_nifp == NULL) {
922 		return ENXIO;
923 	}
924 	mb(); /* make sure following reads are not from cache */
925 
926 	na = priv->np_na;
927 	if (!nm_netmap_on(na)) {
928 		return ENXIO;
929 	}
930 
931 	/* Set the kloop stopping flag. */
932 	NMG_LOCK();
933 	priv->np_kloop_state |= NM_SYNC_KLOOP_STOPPING;
934 	NMG_UNLOCK();
935 
936 	/* Send a notification to the kloop, in case it is blocked in
937 	 * schedule_timeout(). We can use either RX or TX, because the
938 	 * kloop is waiting on both. */
939 	nm_os_selwakeup(priv->np_si[NR_RX]);
940 
941 	/* Wait for the kloop to actually terminate. */
942 	while (running) {
943 		usleep_range(1000, 1500);
944 		NMG_LOCK();
945 		running = (NM_ACCESS_ONCE(priv->np_kloop_state)
946 				& NM_SYNC_KLOOP_RUNNING);
947 		NMG_UNLOCK();
948 	}
949 
950 	return err;
951 }
952 
953 #ifdef WITH_PTNETMAP
954 /*
955  * Guest ptnetmap txsync()/rxsync() routines, used in ptnet device drivers.
956  * These routines are reused across the different operating systems supported
957  * by netmap.
958  */
959 
960 /*
961  * Reconcile host and guest views of the transmit ring.
962  *
963  * Guest user wants to transmit packets up to the one before ring->head,
964  * and guest kernel knows tx_ring->hwcur is the first packet unsent
965  * by the host kernel.
966  *
967  * We push out as many packets as possible, and possibly
968  * reclaim buffers from previously completed transmission.
969  *
970  * Notifications from the host are enabled only if the user guest would
971  * block (no space in the ring).
972  */
973 bool
974 netmap_pt_guest_txsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa,
975 			struct netmap_kring *kring, int flags)
976 {
977 	bool notify = false;
978 
979 	/* Disable notifications */
980 	atok->appl_need_kick = 0;
981 
982 	/*
983 	 * First part: tell the host to process the new packets,
984 	 * updating the CSB.
985 	 */
986 	kring->nr_hwcur = ktoa->hwcur;
987 	nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead);
988 
989         /* Ask for a kick from a guest to the host if needed. */
990 	if (((kring->rhead != kring->nr_hwcur || nm_kr_wouldblock(kring))
991 		&& NM_ACCESS_ONCE(ktoa->kern_need_kick)) ||
992 			(flags & NAF_FORCE_RECLAIM)) {
993 		atok->sync_flags = flags;
994 		notify = true;
995 	}
996 
997 	/*
998 	 * Second part: reclaim buffers for completed transmissions.
999 	 */
1000 	if (nm_kr_wouldblock(kring) || (flags & NAF_FORCE_RECLAIM)) {
1001 		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
1002 					&kring->nr_hwcur);
1003 	}
1004 
1005         /*
1006          * No more room in the ring for new transmissions. The user thread will
1007 	 * go to sleep and we need to be notified by the host when more free
1008 	 * space is available.
1009          */
1010 	if (nm_kr_wouldblock(kring) && !(kring->nr_kflags & NKR_NOINTR)) {
1011 		/* Re-enable notifications. */
1012 		atok->appl_need_kick = 1;
1013                 /* Double check, with store-load memory barrier. */
1014 		nm_stld_barrier();
1015 		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
1016 					&kring->nr_hwcur);
1017                 /* If there is new free space, disable notifications */
1018 		if (unlikely(!nm_kr_wouldblock(kring))) {
1019 			atok->appl_need_kick = 0;
1020 		}
1021 	}
1022 
1023 	nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)",
1024 		kring->name, atok->head, atok->cur, ktoa->hwtail,
1025 		kring->rhead, kring->rcur, kring->nr_hwtail);
1026 
1027 	return notify;
1028 }
1029 
1030 /*
1031  * Reconcile host and guest view of the receive ring.
1032  *
1033  * Update hwcur/hwtail from host (reading from CSB).
1034  *
1035  * If guest user has released buffers up to the one before ring->head, we
1036  * also give them to the host.
1037  *
1038  * Notifications from the host are enabled only if the user guest would
1039  * block (no more completed slots in the ring).
1040  */
1041 bool
1042 netmap_pt_guest_rxsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa,
1043 			struct netmap_kring *kring, int flags)
1044 {
1045 	bool notify = false;
1046 
1047         /* Disable notifications */
1048 	atok->appl_need_kick = 0;
1049 
1050 	/*
1051 	 * First part: import newly received packets, by updating the kring
1052 	 * hwtail to the hwtail known from the host (read from the CSB).
1053 	 * This also updates the kring hwcur.
1054 	 */
1055 	nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur);
1056 	kring->nr_kflags &= ~NKR_PENDINTR;
1057 
1058 	/*
1059 	 * Second part: tell the host about the slots that guest user has
1060 	 * released, by updating cur and head in the CSB.
1061 	 */
1062 	if (kring->rhead != kring->nr_hwcur) {
1063 		nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead);
1064 	}
1065 
1066         /*
1067          * No more completed RX slots. The user thread will go to sleep and
1068 	 * we need to be notified by the host when more RX slots have been
1069 	 * completed.
1070          */
1071 	if (nm_kr_wouldblock(kring) && !(kring->nr_kflags & NKR_NOINTR)) {
1072 		/* Re-enable notifications. */
1073                 atok->appl_need_kick = 1;
1074                 /* Double check, with store-load memory barrier. */
1075 		nm_stld_barrier();
1076 		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
1077 					&kring->nr_hwcur);
1078                 /* If there are new slots, disable notifications. */
1079 		if (!nm_kr_wouldblock(kring)) {
1080                         atok->appl_need_kick = 0;
1081                 }
1082         }
1083 
1084 	/* Ask for a kick from the guest to the host if needed. */
1085 	if ((kring->rhead != kring->nr_hwcur || nm_kr_wouldblock(kring))
1086 		&& NM_ACCESS_ONCE(ktoa->kern_need_kick)) {
1087 		atok->sync_flags = flags;
1088 		notify = true;
1089 	}
1090 
1091 	nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)",
1092 		kring->name, atok->head, atok->cur, ktoa->hwtail,
1093 		kring->rhead, kring->rcur, kring->nr_hwtail);
1094 
1095 	return notify;
1096 }
1097 
1098 /*
1099  * Callbacks for ptnet drivers: nm_krings_create, nm_krings_delete, nm_dtor.
1100  */
1101 int
1102 ptnet_nm_krings_create(struct netmap_adapter *na)
1103 {
1104 	struct netmap_pt_guest_adapter *ptna =
1105 			(struct netmap_pt_guest_adapter *)na; /* Upcast. */
1106 	struct netmap_adapter *na_nm = &ptna->hwup.up;
1107 	struct netmap_adapter *na_dr = &ptna->dr.up;
1108 	int ret;
1109 
1110 	if (ptna->backend_users) {
1111 		return 0;
1112 	}
1113 
1114 	/* Create krings on the public netmap adapter. */
1115 	ret = netmap_hw_krings_create(na_nm);
1116 	if (ret) {
1117 		return ret;
1118 	}
1119 
1120 	/* Copy krings into the netmap adapter private to the driver. */
1121 	na_dr->tx_rings = na_nm->tx_rings;
1122 	na_dr->rx_rings = na_nm->rx_rings;
1123 
1124 	return 0;
1125 }
1126 
1127 void
1128 ptnet_nm_krings_delete(struct netmap_adapter *na)
1129 {
1130 	struct netmap_pt_guest_adapter *ptna =
1131 			(struct netmap_pt_guest_adapter *)na; /* Upcast. */
1132 	struct netmap_adapter *na_nm = &ptna->hwup.up;
1133 	struct netmap_adapter *na_dr = &ptna->dr.up;
1134 
1135 	if (ptna->backend_users) {
1136 		return;
1137 	}
1138 
1139 	na_dr->tx_rings = NULL;
1140 	na_dr->rx_rings = NULL;
1141 
1142 	netmap_hw_krings_delete(na_nm);
1143 }
1144 
1145 void
1146 ptnet_nm_dtor(struct netmap_adapter *na)
1147 {
1148 	struct netmap_pt_guest_adapter *ptna =
1149 			(struct netmap_pt_guest_adapter *)na;
1150 
1151 	netmap_mem_put(ptna->dr.up.nm_mem);
1152 	memset(&ptna->dr, 0, sizeof(ptna->dr));
1153 	netmap_mem_pt_guest_ifp_del(na->nm_mem, na->ifp);
1154 }
1155 
1156 int
1157 netmap_pt_guest_attach(struct netmap_adapter *arg,
1158 		       unsigned int nifp_offset, unsigned int memid)
1159 {
1160 	struct netmap_pt_guest_adapter *ptna;
1161 	struct ifnet *ifp = arg ? arg->ifp : NULL;
1162 	int error;
1163 
1164 	/* get allocator */
1165 	arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, memid);
1166 	if (arg->nm_mem == NULL)
1167 		return ENOMEM;
1168 	arg->na_flags |= NAF_MEM_OWNER;
1169 	error = netmap_attach_ext(arg, sizeof(struct netmap_pt_guest_adapter), 1);
1170 	if (error)
1171 		return error;
1172 
1173 	/* get the netmap_pt_guest_adapter */
1174 	ptna = (struct netmap_pt_guest_adapter *) NA(ifp);
1175 
1176 	/* Initialize a separate pass-through netmap adapter that is going to
1177 	 * be used by the ptnet driver only, and so never exposed to netmap
1178          * applications. We only need a subset of the available fields. */
1179 	memset(&ptna->dr, 0, sizeof(ptna->dr));
1180 	ptna->dr.up.ifp = ifp;
1181 	ptna->dr.up.nm_mem = netmap_mem_get(ptna->hwup.up.nm_mem);
1182         ptna->dr.up.nm_config = ptna->hwup.up.nm_config;
1183 
1184 	ptna->backend_users = 0;
1185 
1186 	return 0;
1187 }
1188 
1189 #endif /* WITH_PTNETMAP */
1190