xref: /freebsd/sys/dev/netmap/netmap_kloop.c (revision 7fdf597e96a02165cfe22ff357b857d5fa15ed8a)
1 /*
2  * Copyright (C) 2016-2018 Vincenzo Maffione
3  * Copyright (C) 2015 Stefano Garzarella
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *   1. Redistributions of source code must retain the above copyright
10  *      notice, this list of conditions and the following disclaimer.
11  *   2. Redistributions in binary form must reproduce the above copyright
12  *      notice, this list of conditions and the following disclaimer in the
13  *      documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * common headers
30  */
31 #if defined(__FreeBSD__)
32 
33 #include <sys/param.h>
34 #include <sys/kernel.h>
35 #include <sys/types.h>
36 #include <sys/selinfo.h>
37 #include <sys/socket.h>
38 #include <net/if.h>
39 #include <net/if_var.h>
40 #include <machine/bus.h>
41 
/*
 * Linux-like usleep_range() implemented on top of pause_sbt().
 * _1 is the sleep length in microseconds; _2 (the upper bound of the range
 * on Linux) is accepted for source compatibility and ignored.
 * The argument is parenthesized so that expressions such as "a + b" are
 * scaled as a whole.
 * NOTE(review): C_ABSOLUTE is passed together with what looks like a
 * relative interval -- confirm against pause_sbt(9) that this is intended.
 */
#define usleep_range(_1, _2) \
        pause_sbt("sync-kloop-sleep", SBT_1US * (_1), SBT_1US * 1, C_ABSOLUTE)
44 
45 #elif defined(linux)
46 #include <bsd_glue.h>
47 #include <linux/file.h>
48 #include <linux/eventfd.h>
49 #endif
50 
51 #include <net/netmap.h>
52 #include <dev/netmap/netmap_kern.h>
53 #include <net/netmap_virt.h>
54 #include <dev/netmap/netmap_mem2.h>
55 
56 /* Support for eventfd-based notifications. */
57 #if defined(linux)
58 #define SYNC_KLOOP_POLL
59 #endif
60 
61 /* Write kring pointers (hwcur, hwtail) to the CSB.
62  * This routine is coupled with ptnetmap_guest_read_kring_csb(). */
63 static inline void
64 sync_kloop_kernel_write(struct nm_csb_ktoa __user *ptr, uint32_t hwcur,
65 			   uint32_t hwtail)
66 {
67 	/* Issue a first store-store barrier to make sure writes to the
68 	 * netmap ring do not overcome updates on ktoa->hwcur and ktoa->hwtail. */
69 	nm_stst_barrier();
70 
71 	/*
72 	 * The same scheme used in nm_sync_kloop_appl_write() applies here.
73 	 * We allow the application to read a value of hwcur more recent than the value
74 	 * of hwtail, since this would anyway result in a consistent view of the
75 	 * ring state (and hwcur can never wraparound hwtail, since hwcur must be
76 	 * behind head).
77 	 *
78 	 * The following memory barrier scheme is used to make this happen:
79 	 *
80 	 *          Application            Kernel
81 	 *
82 	 *          STORE(hwcur)           LOAD(hwtail)
83 	 *          wmb() <------------->  rmb()
84 	 *          STORE(hwtail)          LOAD(hwcur)
85 	 */
86 	CSB_WRITE(ptr, hwcur, hwcur);
87 	nm_stst_barrier();
88 	CSB_WRITE(ptr, hwtail, hwtail);
89 }
90 
91 /* Read kring pointers (head, cur, sync_flags) from the CSB.
92  * This routine is coupled with ptnetmap_guest_write_kring_csb(). */
93 static inline void
94 sync_kloop_kernel_read(struct nm_csb_atok __user *ptr,
95 			  struct netmap_ring *shadow_ring,
96 			  uint32_t num_slots)
97 {
98 	/*
99 	 * We place a memory barrier to make sure that the update of head never
100 	 * overtakes the update of cur.
101 	 * (see explanation in sync_kloop_kernel_write).
102 	 */
103 	CSB_READ(ptr, head, shadow_ring->head);
104 	nm_ldld_barrier();
105 	CSB_READ(ptr, cur, shadow_ring->cur);
106 	CSB_READ(ptr, sync_flags, shadow_ring->flags);
107 
108 	/* Make sure that loads from atok->head and atok->cur are not delayed
109 	 * after the loads from the netmap ring. */
110 	nm_ldld_barrier();
111 }
112 
113 /* Enable or disable application --> kernel kicks. */
114 static inline void
115 csb_ktoa_kick_enable(struct nm_csb_ktoa __user *csb_ktoa, uint32_t val)
116 {
117 	CSB_WRITE(csb_ktoa, kern_need_kick, val);
118 }
119 
120 #ifdef SYNC_KLOOP_POLL
121 /* Are application interrupt enabled or disabled? */
122 static inline uint32_t
123 csb_atok_intr_enabled(struct nm_csb_atok __user *csb_atok)
124 {
125 	uint32_t v;
126 
127 	CSB_READ(csb_atok, appl_need_kick, v);
128 
129 	return v;
130 }
131 #endif  /* SYNC_KLOOP_POLL */
132 
133 static inline void
134 sync_kloop_kring_dump(const char *title, const struct netmap_kring *kring)
135 {
136 	nm_prinf("%s, kring %s, hwcur %d, rhead %d, "
137 		"rcur %d, rtail %d, hwtail %d",
138 		title, kring->name, kring->nr_hwcur, kring->rhead,
139 		kring->rcur, kring->rtail, kring->nr_hwtail);
140 }
141 
/*
 * Per-ring argument bundle handed to netmap_sync_kloop_tx_ring() and
 * netmap_sync_kloop_rx_ring().
 */
struct sync_kloop_ring_args {
	/* The kring to be synchronized. */
	struct netmap_kring *kring;
	/* CSB entry written by the application and read by the kernel. */
	struct nm_csb_atok *csb_atok;
	/* CSB entry written by the kernel and read by the application. */
	struct nm_csb_ktoa *csb_ktoa;
#ifdef SYNC_KLOOP_POLL
	/* Eventfd used to interrupt the application (may be NULL). */
	struct eventfd_ctx *irq_ctx;
#endif /* SYNC_KLOOP_POLL */
	/* True if the kloop busy waits instead of using a schedule() loop. */
	bool busy_wait;
	/* True if the ring is processed in the context of a VM exit. */
	bool direct;
};
157 
158 static void
159 netmap_sync_kloop_tx_ring(const struct sync_kloop_ring_args *a)
160 {
161 	struct netmap_kring *kring = a->kring;
162 	struct nm_csb_atok *csb_atok = a->csb_atok;
163 	struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa;
164 	struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */
165 #ifdef SYNC_KLOOP_POLL
166 	bool more_txspace = false;
167 #endif /* SYNC_KLOOP_POLL */
168 	uint32_t num_slots;
169 	int batch;
170 
171 	if (unlikely(nm_kr_tryget(kring, 1, NULL))) {
172 		return;
173 	}
174 
175 	num_slots = kring->nkr_num_slots;
176 
177 	/* Disable application --> kernel notifications. */
178 	if (!a->direct) {
179 		csb_ktoa_kick_enable(csb_ktoa, 0);
180 	}
181 	/* Copy the application kring pointers from the CSB */
182 	sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
183 
184 	for (;;) {
185 		batch = shadow_ring.head - kring->nr_hwcur;
186 		if (batch < 0)
187 			batch += num_slots;
188 
189 #ifdef PTN_TX_BATCH_LIM
190 		if (batch > PTN_TX_BATCH_LIM(num_slots)) {
191 			/* If application moves ahead too fast, let's cut the move so
192 			 * that we don't exceed our batch limit. */
193 			uint32_t head_lim = kring->nr_hwcur + PTN_TX_BATCH_LIM(num_slots);
194 
195 			if (head_lim >= num_slots)
196 				head_lim -= num_slots;
197 			nm_prdis(1, "batch: %d head: %d head_lim: %d", batch, shadow_ring.head,
198 					head_lim);
199 			shadow_ring.head = head_lim;
200 			batch = PTN_TX_BATCH_LIM(num_slots);
201 		}
202 #endif /* PTN_TX_BATCH_LIM */
203 
204 		if (nm_kr_txspace(kring) <= (num_slots >> 1)) {
205 			shadow_ring.flags |= NAF_FORCE_RECLAIM;
206 		}
207 
208 		/* Netmap prologue */
209 		shadow_ring.tail = kring->rtail;
210 		if (unlikely(nm_txsync_prologue(kring, &shadow_ring) >= num_slots)) {
211 			/* Reinit ring and enable notifications. */
212 			netmap_ring_reinit(kring);
213 			if (!a->busy_wait) {
214 				csb_ktoa_kick_enable(csb_ktoa, 1);
215 			}
216 			break;
217 		}
218 
219 		if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) {
220 			sync_kloop_kring_dump("pre txsync", kring);
221 		}
222 
223 		if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) {
224 			if (!a->busy_wait) {
225 				/* Re-enable notifications. */
226 				csb_ktoa_kick_enable(csb_ktoa, 1);
227 			}
228 			nm_prerr("txsync() failed");
229 			break;
230 		}
231 
232 		/*
233 		 * Finalize
234 		 * Copy kernel hwcur and hwtail into the CSB for the application sync(), and
235 		 * do the nm_sync_finalize.
236 		 */
237 		sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur,
238 				kring->nr_hwtail);
239 		if (kring->rtail != kring->nr_hwtail) {
240 			/* Some more room available in the parent adapter. */
241 			kring->rtail = kring->nr_hwtail;
242 #ifdef SYNC_KLOOP_POLL
243 			more_txspace = true;
244 #endif /* SYNC_KLOOP_POLL */
245 		}
246 
247 		if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) {
248 			sync_kloop_kring_dump("post txsync", kring);
249 		}
250 
251 		/* Interrupt the application if needed. */
252 #ifdef SYNC_KLOOP_POLL
253 		if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) {
254 			/* We could disable kernel --> application kicks here,
255 			 * to avoid spurious interrupts. */
256 			eventfd_signal(a->irq_ctx, 1);
257 			more_txspace = false;
258 		}
259 #endif /* SYNC_KLOOP_POLL */
260 
261 		/* Read CSB to see if there is more work to do. */
262 		sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
263 		if (shadow_ring.head == kring->rhead) {
264 			if (a->busy_wait) {
265 				break;
266 			}
267 			/*
268 			 * No more packets to transmit. We enable notifications and
269 			 * go to sleep, waiting for a kick from the application when new
270 			 * new slots are ready for transmission.
271 			 */
272 			/* Re-enable notifications. */
273 			csb_ktoa_kick_enable(csb_ktoa, 1);
274 			/* Double check, with store-load memory barrier. */
275 			nm_stld_barrier();
276 			sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
277 			if (shadow_ring.head != kring->rhead) {
278 				/* We won the race condition, there are more packets to
279 				 * transmit. Disable notifications and do another cycle */
280 				csb_ktoa_kick_enable(csb_ktoa, 0);
281 				continue;
282 			}
283 			break;
284 		}
285 
286 		if (nm_kr_txempty(kring)) {
287 			/* No more available TX slots. We stop waiting for a notification
288 			 * from the backend (netmap_tx_irq). */
289 			nm_prdis(1, "TX ring");
290 			break;
291 		}
292 	}
293 
294 	nm_kr_put(kring);
295 
296 #ifdef SYNC_KLOOP_POLL
297 	if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) {
298 		eventfd_signal(a->irq_ctx, 1);
299 	}
300 #endif /* SYNC_KLOOP_POLL */
301 }
302 
/* Maximum number of consecutive RX cycles that receive no packets. */
304 #define SYNC_LOOP_RX_DRY_CYCLES_MAX	2
305 
306 static inline int
307 sync_kloop_norxslots(struct netmap_kring *kring, uint32_t g_head)
308 {
309 	return (NM_ACCESS_ONCE(kring->nr_hwtail) == nm_prev(g_head,
310 				kring->nkr_num_slots - 1));
311 }
312 
313 static void
314 netmap_sync_kloop_rx_ring(const struct sync_kloop_ring_args *a)
315 {
316 
317 	struct netmap_kring *kring = a->kring;
318 	struct nm_csb_atok *csb_atok = a->csb_atok;
319 	struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa;
320 	struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */
321 	int dry_cycles = 0;
322 #ifdef SYNC_KLOOP_POLL
323 	bool some_recvd = false;
324 #endif /* SYNC_KLOOP_POLL */
325 	uint32_t num_slots;
326 
327 	if (unlikely(nm_kr_tryget(kring, 1, NULL))) {
328 		return;
329 	}
330 
331 	num_slots = kring->nkr_num_slots;
332 
333 	/* Get RX csb_atok and csb_ktoa pointers from the CSB. */
334 	num_slots = kring->nkr_num_slots;
335 
336 	/* Disable notifications. */
337 	if (!a->direct) {
338 		csb_ktoa_kick_enable(csb_ktoa, 0);
339 	}
340 	/* Copy the application kring pointers from the CSB */
341 	sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
342 
343 	for (;;) {
344 		uint32_t hwtail;
345 
346 		/* Netmap prologue */
347 		shadow_ring.tail = kring->rtail;
348 		if (unlikely(nm_rxsync_prologue(kring, &shadow_ring) >= num_slots)) {
349 			/* Reinit ring and enable notifications. */
350 			netmap_ring_reinit(kring);
351 			if (!a->busy_wait) {
352 				csb_ktoa_kick_enable(csb_ktoa, 1);
353 			}
354 			break;
355 		}
356 
357 		if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) {
358 			sync_kloop_kring_dump("pre rxsync", kring);
359 		}
360 
361 		if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) {
362 			if (!a->busy_wait) {
363 				/* Re-enable notifications. */
364 				csb_ktoa_kick_enable(csb_ktoa, 1);
365 			}
366 			nm_prerr("rxsync() failed");
367 			break;
368 		}
369 
370 		/*
371 		 * Finalize
372 		 * Copy kernel hwcur and hwtail into the CSB for the application sync()
373 		 */
374 		hwtail = NM_ACCESS_ONCE(kring->nr_hwtail);
375 		sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur, hwtail);
376 		if (kring->rtail != hwtail) {
377 			kring->rtail = hwtail;
378 #ifdef SYNC_KLOOP_POLL
379 			some_recvd = true;
380 #endif /* SYNC_KLOOP_POLL */
381 			dry_cycles = 0;
382 		} else {
383 			dry_cycles++;
384 		}
385 
386 		if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) {
387 			sync_kloop_kring_dump("post rxsync", kring);
388 		}
389 
390 #ifdef SYNC_KLOOP_POLL
391 		/* Interrupt the application if needed. */
392 		if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) {
393 			/* We could disable kernel --> application kicks here,
394 			 * to avoid spurious interrupts. */
395 			eventfd_signal(a->irq_ctx, 1);
396 			some_recvd = false;
397 		}
398 #endif /* SYNC_KLOOP_POLL */
399 
400 		/* Read CSB to see if there is more work to do. */
401 		sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
402 		if (sync_kloop_norxslots(kring, shadow_ring.head)) {
403 			if (a->busy_wait) {
404 				break;
405 			}
406 			/*
407 			 * No more slots available for reception. We enable notification and
408 			 * go to sleep, waiting for a kick from the application when new receive
409 			 * slots are available.
410 			 */
411 			/* Re-enable notifications. */
412 			csb_ktoa_kick_enable(csb_ktoa, 1);
413 			/* Double check, with store-load memory barrier. */
414 			nm_stld_barrier();
415 			sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
416 			if (!sync_kloop_norxslots(kring, shadow_ring.head)) {
417 				/* We won the race condition, more slots are available. Disable
418 				 * notifications and do another cycle. */
419 				csb_ktoa_kick_enable(csb_ktoa, 0);
420 				continue;
421 			}
422 			break;
423 		}
424 
425 		hwtail = NM_ACCESS_ONCE(kring->nr_hwtail);
426 		if (unlikely(hwtail == kring->rhead ||
427 					dry_cycles >= SYNC_LOOP_RX_DRY_CYCLES_MAX)) {
428 			/* No more packets to be read from the backend. We stop and
429 			 * wait for a notification from the backend (netmap_rx_irq). */
430 			nm_prdis(1, "nr_hwtail: %d rhead: %d dry_cycles: %d",
431 					hwtail, kring->rhead, dry_cycles);
432 			break;
433 		}
434 	}
435 
436 	nm_kr_put(kring);
437 
438 #ifdef SYNC_KLOOP_POLL
439 	/* Interrupt the application if needed. */
440 	if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) {
441 		eventfd_signal(a->irq_ctx, 1);
442 	}
443 #endif /* SYNC_KLOOP_POLL */
444 }
445 
446 #ifdef SYNC_KLOOP_POLL
447 struct sync_kloop_poll_ctx;
448 struct sync_kloop_poll_entry {
449 	/* Support for receiving notifications from
450 	 * a netmap ring or from the application. */
451 	struct file *filp;
452 	wait_queue_t wait;
453 	wait_queue_head_t *wqh;
454 
455 	/* Support for sending notifications to the application. */
456 	struct eventfd_ctx *irq_ctx;
457 	struct file *irq_filp;
458 
459 	/* Arguments for the ring processing function. Useful
460 	 * in case of custom wake-up function. */
461 	struct sync_kloop_ring_args *args;
462 	struct sync_kloop_poll_ctx *parent;
463 
464 };
465 
466 struct sync_kloop_poll_ctx {
467 	poll_table wait_table;
468 	unsigned int next_entry;
469 	int (*next_wake_fun)(wait_queue_t *, unsigned, int, void *);
470 	unsigned int num_entries;
471 	unsigned int num_tx_rings;
472 	unsigned int num_rings;
473 	/* First num_tx_rings entries are for the TX kicks.
474 	 * Then the RX kicks entries follow. The last two
475 	 * entries are for TX irq, and RX irq. */
476 	struct sync_kloop_poll_entry entries[0];
477 };
478 
479 static void
480 sync_kloop_poll_table_queue_proc(struct file *file, wait_queue_head_t *wqh,
481 				poll_table *pt)
482 {
483 	struct sync_kloop_poll_ctx *poll_ctx =
484 		container_of(pt, struct sync_kloop_poll_ctx, wait_table);
485 	struct sync_kloop_poll_entry *entry = poll_ctx->entries +
486 						poll_ctx->next_entry;
487 
488 	BUG_ON(poll_ctx->next_entry >= poll_ctx->num_entries);
489 	entry->wqh = wqh;
490 	entry->filp = file;
491 	/* Use the default wake up function. */
492 	if (poll_ctx->next_wake_fun == NULL) {
493 		init_waitqueue_entry(&entry->wait, current);
494 	} else {
495 		init_waitqueue_func_entry(&entry->wait,
496 		    poll_ctx->next_wake_fun);
497 	}
498 	add_wait_queue(wqh, &entry->wait);
499 }
500 
501 static int
502 sync_kloop_tx_kick_wake_fun(wait_queue_t *wait, unsigned mode,
503     int wake_flags, void *key)
504 {
505 	struct sync_kloop_poll_entry *entry =
506 	    container_of(wait, struct sync_kloop_poll_entry, wait);
507 
508 	netmap_sync_kloop_tx_ring(entry->args);
509 
510 	return 0;
511 }
512 
513 static int
514 sync_kloop_tx_irq_wake_fun(wait_queue_t *wait, unsigned mode,
515     int wake_flags, void *key)
516 {
517 	struct sync_kloop_poll_entry *entry =
518 	    container_of(wait, struct sync_kloop_poll_entry, wait);
519 	struct sync_kloop_poll_ctx *poll_ctx = entry->parent;
520 	int i;
521 
522 	for (i = 0; i < poll_ctx->num_tx_rings; i++) {
523 		struct eventfd_ctx *irq_ctx = poll_ctx->entries[i].irq_ctx;
524 
525 		if (irq_ctx) {
526 			eventfd_signal(irq_ctx, 1);
527 		}
528 	}
529 
530 	return 0;
531 }
532 
533 static int
534 sync_kloop_rx_kick_wake_fun(wait_queue_t *wait, unsigned mode,
535     int wake_flags, void *key)
536 {
537 	struct sync_kloop_poll_entry *entry =
538 	    container_of(wait, struct sync_kloop_poll_entry, wait);
539 
540 	netmap_sync_kloop_rx_ring(entry->args);
541 
542 	return 0;
543 }
544 
545 static int
546 sync_kloop_rx_irq_wake_fun(wait_queue_t *wait, unsigned mode,
547     int wake_flags, void *key)
548 {
549 	struct sync_kloop_poll_entry *entry =
550 	    container_of(wait, struct sync_kloop_poll_entry, wait);
551 	struct sync_kloop_poll_ctx *poll_ctx = entry->parent;
552 	int i;
553 
554 	for (i = poll_ctx->num_tx_rings; i < poll_ctx->num_rings; i++) {
555 		struct eventfd_ctx *irq_ctx = poll_ctx->entries[i].irq_ctx;
556 
557 		if (irq_ctx) {
558 			eventfd_signal(irq_ctx, 1);
559 		}
560 	}
561 
562 	return 0;
563 }
564 #endif  /* SYNC_KLOOP_POLL */
565 
566 int
567 netmap_sync_kloop(struct netmap_priv_d *priv, struct nmreq_header *hdr)
568 {
569 	struct nmreq_sync_kloop_start *req =
570 		(struct nmreq_sync_kloop_start *)(uintptr_t)hdr->nr_body;
571 	struct nmreq_opt_sync_kloop_eventfds *eventfds_opt = NULL;
572 #ifdef SYNC_KLOOP_POLL
573 	struct sync_kloop_poll_ctx *poll_ctx = NULL;
574 #endif  /* SYNC_KLOOP_POLL */
575 	int num_rx_rings, num_tx_rings, num_rings;
576 	struct sync_kloop_ring_args *args = NULL;
577 	uint32_t sleep_us = req->sleep_us;
578 	struct nm_csb_atok* csb_atok_base;
579 	struct nm_csb_ktoa* csb_ktoa_base;
580 	struct netmap_adapter *na;
581 	struct nmreq_option *opt;
582 	bool na_could_sleep = false;
583 	bool busy_wait = true;
584 	bool direct_tx = false;
585 	bool direct_rx = false;
586 	int err = 0;
587 	int i;
588 
589 	if (sleep_us > 1000000) {
590 		/* We do not accept sleeping for more than a second. */
591 		return EINVAL;
592 	}
593 
594 	if (priv->np_nifp == NULL) {
595 		return ENXIO;
596 	}
597 	mb(); /* make sure following reads are not from cache */
598 
599 	na = priv->np_na;
600 	if (!nm_netmap_on(na)) {
601 		return ENXIO;
602 	}
603 
604 	NMG_LOCK();
605 	/* Make sure the application is working in CSB mode. */
606 	if (!priv->np_csb_atok_base || !priv->np_csb_ktoa_base) {
607 		NMG_UNLOCK();
608 		nm_prerr("sync-kloop on %s requires "
609 				"NETMAP_REQ_OPT_CSB option", na->name);
610 		return EINVAL;
611 	}
612 
613 	csb_atok_base = priv->np_csb_atok_base;
614 	csb_ktoa_base = priv->np_csb_ktoa_base;
615 
616 	/* Make sure that no kloop is currently running. */
617 	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
618 		err = EBUSY;
619 	}
620 	priv->np_kloop_state |= NM_SYNC_KLOOP_RUNNING;
621 	NMG_UNLOCK();
622 	if (err) {
623 		return err;
624 	}
625 
626 	num_rx_rings = priv->np_qlast[NR_RX] - priv->np_qfirst[NR_RX];
627 	num_tx_rings = priv->np_qlast[NR_TX] - priv->np_qfirst[NR_TX];
628 	num_rings = num_tx_rings + num_rx_rings;
629 
630 	args = nm_os_malloc(num_rings * sizeof(args[0]));
631 	if (!args) {
632 		err = ENOMEM;
633 		goto out;
634 	}
635 
636 	/* Prepare the arguments for netmap_sync_kloop_tx_ring()
637 	 * and netmap_sync_kloop_rx_ring(). */
638 	for (i = 0; i < num_tx_rings; i++) {
639 		struct sync_kloop_ring_args *a = args + i;
640 
641 		a->kring = NMR(na, NR_TX)[i + priv->np_qfirst[NR_TX]];
642 		a->csb_atok = csb_atok_base + i;
643 		a->csb_ktoa = csb_ktoa_base + i;
644 		a->busy_wait = busy_wait;
645 		a->direct = direct_tx;
646 	}
647 	for (i = 0; i < num_rx_rings; i++) {
648 		struct sync_kloop_ring_args *a = args + num_tx_rings + i;
649 
650 		a->kring = NMR(na, NR_RX)[i + priv->np_qfirst[NR_RX]];
651 		a->csb_atok = csb_atok_base + num_tx_rings + i;
652 		a->csb_ktoa = csb_ktoa_base + num_tx_rings + i;
653 		a->busy_wait = busy_wait;
654 		a->direct = direct_rx;
655 	}
656 
657 	/* Validate notification options. */
658 	opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_SYNC_KLOOP_MODE);
659 	if (opt != NULL) {
660 		struct nmreq_opt_sync_kloop_mode *mode_opt =
661 		    (struct nmreq_opt_sync_kloop_mode *)opt;
662 
663 		direct_tx = !!(mode_opt->mode & NM_OPT_SYNC_KLOOP_DIRECT_TX);
664 		direct_rx = !!(mode_opt->mode & NM_OPT_SYNC_KLOOP_DIRECT_RX);
665 		if (mode_opt->mode & ~(NM_OPT_SYNC_KLOOP_DIRECT_TX |
666 		    NM_OPT_SYNC_KLOOP_DIRECT_RX)) {
667 			opt->nro_status = err = EINVAL;
668 			goto out;
669 		}
670 		opt->nro_status = 0;
671 	}
672 	opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS);
673 	if (opt != NULL) {
674 		if (opt->nro_size != sizeof(*eventfds_opt) +
675 			sizeof(eventfds_opt->eventfds[0]) * num_rings) {
676 			/* Option size not consistent with the number of
677 			 * entries. */
678 			opt->nro_status = err = EINVAL;
679 			goto out;
680 		}
681 #ifdef SYNC_KLOOP_POLL
682 		eventfds_opt = (struct nmreq_opt_sync_kloop_eventfds *)opt;
683 		opt->nro_status = 0;
684 
685 		/* Check if some ioeventfd entry is not defined, and force sleep
686 		 * synchronization in that case. */
687 		busy_wait = false;
688 		for (i = 0; i < num_rings; i++) {
689 			if (eventfds_opt->eventfds[i].ioeventfd < 0) {
690 				busy_wait = true;
691 				break;
692 			}
693 		}
694 
695 		if (busy_wait && (direct_tx || direct_rx)) {
696 			/* For direct processing we need all the
697 			 * ioeventfds to be valid. */
698 			opt->nro_status = err = EINVAL;
699 			goto out;
700 		}
701 
702 		/* We need 2 poll entries for TX and RX notifications coming
703 		 * from the netmap adapter, plus one entries per ring for the
704 		 * notifications coming from the application. */
705 		poll_ctx = nm_os_malloc(sizeof(*poll_ctx) +
706 				(num_rings + 2) * sizeof(poll_ctx->entries[0]));
707 		init_poll_funcptr(&poll_ctx->wait_table,
708 					sync_kloop_poll_table_queue_proc);
709 		poll_ctx->num_entries = 2 + num_rings;
710 		poll_ctx->num_tx_rings = num_tx_rings;
711 		poll_ctx->num_rings = num_rings;
712 		poll_ctx->next_entry = 0;
713 		poll_ctx->next_wake_fun = NULL;
714 
715 		if (direct_tx && (na->na_flags & NAF_BDG_MAYSLEEP)) {
716 			/* In direct mode, VALE txsync is called from
717 			 * wake-up context, where it is not possible
718 			 * to sleep.
719 			 */
720 			na->na_flags &= ~NAF_BDG_MAYSLEEP;
721 			na_could_sleep = true;
722 		}
723 
724 		for (i = 0; i < num_rings + 2; i++) {
725 			poll_ctx->entries[i].args = args + i;
726 			poll_ctx->entries[i].parent = poll_ctx;
727 		}
728 
729 		/* Poll for notifications coming from the applications through
730 		 * eventfds. */
731 		for (i = 0; i < num_rings; i++, poll_ctx->next_entry++) {
732 			struct eventfd_ctx *irq = NULL;
733 			struct file *filp = NULL;
734 			unsigned long mask;
735 			bool tx_ring = (i < num_tx_rings);
736 
737 			if (eventfds_opt->eventfds[i].irqfd >= 0) {
738 				filp = eventfd_fget(
739 				    eventfds_opt->eventfds[i].irqfd);
740 				if (IS_ERR(filp)) {
741 					err = PTR_ERR(filp);
742 					goto out;
743 				}
744 				irq = eventfd_ctx_fileget(filp);
745 				if (IS_ERR(irq)) {
746 					err = PTR_ERR(irq);
747 					goto out;
748 				}
749 			}
750 			poll_ctx->entries[i].irq_filp = filp;
751 			poll_ctx->entries[i].irq_ctx = irq;
752 			poll_ctx->entries[i].args->busy_wait = busy_wait;
753 			/* Don't let netmap_sync_kloop_*x_ring() use
754 			 * IRQs in direct mode. */
755 			poll_ctx->entries[i].args->irq_ctx =
756 			    ((tx_ring && direct_tx) ||
757 			    (!tx_ring && direct_rx)) ? NULL :
758 			    poll_ctx->entries[i].irq_ctx;
759 			poll_ctx->entries[i].args->direct =
760 			    (tx_ring ? direct_tx : direct_rx);
761 
762 			if (!busy_wait) {
763 				filp = eventfd_fget(
764 				    eventfds_opt->eventfds[i].ioeventfd);
765 				if (IS_ERR(filp)) {
766 					err = PTR_ERR(filp);
767 					goto out;
768 				}
769 				if (tx_ring && direct_tx) {
770 					/* Override the wake up function
771 					 * so that it can directly call
772 					 * netmap_sync_kloop_tx_ring().
773 					 */
774 					poll_ctx->next_wake_fun =
775 					    sync_kloop_tx_kick_wake_fun;
776 				} else if (!tx_ring && direct_rx) {
777 					/* Same for direct RX. */
778 					poll_ctx->next_wake_fun =
779 					    sync_kloop_rx_kick_wake_fun;
780 				} else {
781 					poll_ctx->next_wake_fun = NULL;
782 				}
783 				mask = filp->f_op->poll(filp,
784 				    &poll_ctx->wait_table);
785 				if (mask & POLLERR) {
786 					err = EINVAL;
787 					goto out;
788 				}
789 			}
790 		}
791 
792 		/* Poll for notifications coming from the netmap rings bound to
793 		 * this file descriptor. */
794 		if (!busy_wait) {
795 			NMG_LOCK();
796 			/* In direct mode, override the wake up function so
797 			 * that it can forward the netmap_tx_irq() to the
798 			 * guest. */
799 			poll_ctx->next_wake_fun = direct_tx ?
800 			    sync_kloop_tx_irq_wake_fun : NULL;
801 			poll_wait(priv->np_filp, priv->np_si[NR_TX],
802 			    &poll_ctx->wait_table);
803 			poll_ctx->next_entry++;
804 
805 			poll_ctx->next_wake_fun = direct_rx ?
806 			    sync_kloop_rx_irq_wake_fun : NULL;
807 			poll_wait(priv->np_filp, priv->np_si[NR_RX],
808 			    &poll_ctx->wait_table);
809 			poll_ctx->next_entry++;
810 			NMG_UNLOCK();
811 		}
812 #else   /* SYNC_KLOOP_POLL */
813 		opt->nro_status = EOPNOTSUPP;
814 		goto out;
815 #endif  /* SYNC_KLOOP_POLL */
816 	}
817 
818 	nm_prinf("kloop busy_wait %u, direct_tx %u, direct_rx %u, "
819 	    "na_could_sleep %u", busy_wait, direct_tx, direct_rx,
820 	    na_could_sleep);
821 
822 	/* Main loop. */
823 	for (;;) {
824 		if (unlikely(NM_ACCESS_ONCE(priv->np_kloop_state) & NM_SYNC_KLOOP_STOPPING)) {
825 			break;
826 		}
827 
828 #ifdef SYNC_KLOOP_POLL
829 		if (!busy_wait) {
830 			/* It is important to set the task state as
831 			 * interruptible before processing any TX/RX ring,
832 			 * so that if a notification on ring Y comes after
833 			 * we have processed ring Y, but before we call
834 			 * schedule(), we don't miss it. This is true because
835 			 * the wake up function will change the task state,
836 			 * and therefore the schedule_timeout() call below
837 			 * will observe the change).
838 			 */
839 			set_current_state(TASK_INTERRUPTIBLE);
840 		}
841 #endif  /* SYNC_KLOOP_POLL */
842 
843 		/* Process all the TX rings bound to this file descriptor. */
844 		for (i = 0; !direct_tx && i < num_tx_rings; i++) {
845 			struct sync_kloop_ring_args *a = args + i;
846 			netmap_sync_kloop_tx_ring(a);
847 		}
848 
849 		/* Process all the RX rings bound to this file descriptor. */
850 		for (i = 0; !direct_rx && i < num_rx_rings; i++) {
851 			struct sync_kloop_ring_args *a = args + num_tx_rings + i;
852 			netmap_sync_kloop_rx_ring(a);
853 		}
854 
855 		if (busy_wait) {
856 			/* Default synchronization method: sleep for a while. */
857 			usleep_range(sleep_us, sleep_us);
858 		}
859 #ifdef SYNC_KLOOP_POLL
860 		else {
861 			/* Yield to the scheduler waiting for a notification
862 			 * to come either from netmap or the application. */
863 			schedule_timeout(msecs_to_jiffies(3000));
864 		}
865 #endif /* SYNC_KLOOP_POLL */
866 	}
867 out:
868 #ifdef SYNC_KLOOP_POLL
869 	if (poll_ctx) {
870 		/* Stop polling from netmap and the eventfds, and deallocate
871 		 * the poll context. */
872 		if (!busy_wait) {
873 			__set_current_state(TASK_RUNNING);
874 		}
875 		for (i = 0; i < poll_ctx->next_entry; i++) {
876 			struct sync_kloop_poll_entry *entry =
877 						poll_ctx->entries + i;
878 
879 			if (entry->wqh)
880 				remove_wait_queue(entry->wqh, &entry->wait);
881 			/* We did not get a reference to the eventfds, but
882 			 * don't do that on netmap file descriptors (since
883 			 * a reference was not taken. */
884 			if (entry->filp && entry->filp != priv->np_filp)
885 				fput(entry->filp);
886 			if (entry->irq_ctx)
887 				eventfd_ctx_put(entry->irq_ctx);
888 			if (entry->irq_filp)
889 				fput(entry->irq_filp);
890 		}
891 		nm_os_free(poll_ctx);
892 		poll_ctx = NULL;
893 	}
894 #endif /* SYNC_KLOOP_POLL */
895 
896 	if (args) {
897 		nm_os_free(args);
898 		args = NULL;
899 	}
900 
901 	/* Reset the kloop state. */
902 	NMG_LOCK();
903 	priv->np_kloop_state = 0;
904 	if (na_could_sleep) {
905 		na->na_flags |= NAF_BDG_MAYSLEEP;
906 	}
907 	NMG_UNLOCK();
908 
909 	return err;
910 }
911 
912 int
913 netmap_sync_kloop_stop(struct netmap_priv_d *priv)
914 {
915 	struct netmap_adapter *na;
916 	bool running = true;
917 	int err = 0;
918 
919 	if (priv->np_nifp == NULL) {
920 		return ENXIO;
921 	}
922 	mb(); /* make sure following reads are not from cache */
923 
924 	na = priv->np_na;
925 	if (!nm_netmap_on(na)) {
926 		return ENXIO;
927 	}
928 
929 	/* Set the kloop stopping flag. */
930 	NMG_LOCK();
931 	priv->np_kloop_state |= NM_SYNC_KLOOP_STOPPING;
932 	NMG_UNLOCK();
933 
934 	/* Send a notification to the kloop, in case it is blocked in
935 	 * schedule_timeout(). We can use either RX or TX, because the
936 	 * kloop is waiting on both. */
937 	nm_os_selwakeup(priv->np_si[NR_RX]);
938 
939 	/* Wait for the kloop to actually terminate. */
940 	while (running) {
941 		usleep_range(1000, 1500);
942 		NMG_LOCK();
943 		running = (NM_ACCESS_ONCE(priv->np_kloop_state)
944 				& NM_SYNC_KLOOP_RUNNING);
945 		NMG_UNLOCK();
946 	}
947 
948 	return err;
949 }
950 
951 #ifdef WITH_PTNETMAP
952 /*
953  * Guest ptnetmap txsync()/rxsync() routines, used in ptnet device drivers.
954  * These routines are reused across the different operating systems supported
955  * by netmap.
956  */
957 
958 /*
959  * Reconcile host and guest views of the transmit ring.
960  *
961  * Guest user wants to transmit packets up to the one before ring->head,
962  * and guest kernel knows tx_ring->hwcur is the first packet unsent
963  * by the host kernel.
964  *
965  * We push out as many packets as possible, and possibly
966  * reclaim buffers from previously completed transmission.
967  *
968  * Notifications from the host are enabled only if the user guest would
969  * block (no space in the ring).
970  */
971 bool
972 netmap_pt_guest_txsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa,
973 			struct netmap_kring *kring, int flags)
974 {
975 	bool notify = false;
976 
977 	/* Disable notifications */
978 	atok->appl_need_kick = 0;
979 
980 	/*
981 	 * First part: tell the host to process the new packets,
982 	 * updating the CSB.
983 	 */
984 	kring->nr_hwcur = ktoa->hwcur;
985 	nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead);
986 
987         /* Ask for a kick from a guest to the host if needed. */
988 	if (((kring->rhead != kring->nr_hwcur || nm_kr_wouldblock(kring))
989 		&& NM_ACCESS_ONCE(ktoa->kern_need_kick)) ||
990 			(flags & NAF_FORCE_RECLAIM)) {
991 		atok->sync_flags = flags;
992 		notify = true;
993 	}
994 
995 	/*
996 	 * Second part: reclaim buffers for completed transmissions.
997 	 */
998 	if (nm_kr_wouldblock(kring) || (flags & NAF_FORCE_RECLAIM)) {
999 		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
1000 					&kring->nr_hwcur);
1001 	}
1002 
1003         /*
1004          * No more room in the ring for new transmissions. The user thread will
1005 	 * go to sleep and we need to be notified by the host when more free
1006 	 * space is available.
1007          */
1008 	if (nm_kr_wouldblock(kring) && !(kring->nr_kflags & NKR_NOINTR)) {
1009 		/* Re-enable notifications. */
1010 		atok->appl_need_kick = 1;
1011                 /* Double check, with store-load memory barrier. */
1012 		nm_stld_barrier();
1013 		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
1014 					&kring->nr_hwcur);
1015                 /* If there is new free space, disable notifications */
1016 		if (unlikely(!nm_kr_wouldblock(kring))) {
1017 			atok->appl_need_kick = 0;
1018 		}
1019 	}
1020 
1021 	nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)",
1022 		kring->name, atok->head, atok->cur, ktoa->hwtail,
1023 		kring->rhead, kring->rcur, kring->nr_hwtail);
1024 
1025 	return notify;
1026 }
1027 
1028 /*
1029  * Reconcile host and guest view of the receive ring.
1030  *
1031  * Update hwcur/hwtail from host (reading from CSB).
1032  *
1033  * If guest user has released buffers up to the one before ring->head, we
1034  * also give them to the host.
1035  *
1036  * Notifications from the host are enabled only if the user guest would
1037  * block (no more completed slots in the ring).
1038  */
1039 bool
1040 netmap_pt_guest_rxsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa,
1041 			struct netmap_kring *kring, int flags)
1042 {
1043 	bool notify = false;
1044 
1045         /* Disable notifications */
1046 	atok->appl_need_kick = 0;
1047 
1048 	/*
1049 	 * First part: import newly received packets, by updating the kring
1050 	 * hwtail to the hwtail known from the host (read from the CSB).
1051 	 * This also updates the kring hwcur.
1052 	 */
1053 	nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur);
1054 	kring->nr_kflags &= ~NKR_PENDINTR;
1055 
1056 	/*
1057 	 * Second part: tell the host about the slots that guest user has
1058 	 * released, by updating cur and head in the CSB.
1059 	 */
1060 	if (kring->rhead != kring->nr_hwcur) {
1061 		nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead);
1062 	}
1063 
1064         /*
1065          * No more completed RX slots. The user thread will go to sleep and
1066 	 * we need to be notified by the host when more RX slots have been
1067 	 * completed.
1068          */
1069 	if (nm_kr_wouldblock(kring) && !(kring->nr_kflags & NKR_NOINTR)) {
1070 		/* Re-enable notifications. */
1071                 atok->appl_need_kick = 1;
1072                 /* Double check, with store-load memory barrier. */
1073 		nm_stld_barrier();
1074 		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
1075 					&kring->nr_hwcur);
1076                 /* If there are new slots, disable notifications. */
1077 		if (!nm_kr_wouldblock(kring)) {
1078                         atok->appl_need_kick = 0;
1079                 }
1080         }
1081 
1082 	/* Ask for a kick from the guest to the host if needed. */
1083 	if ((kring->rhead != kring->nr_hwcur || nm_kr_wouldblock(kring))
1084 		&& NM_ACCESS_ONCE(ktoa->kern_need_kick)) {
1085 		atok->sync_flags = flags;
1086 		notify = true;
1087 	}
1088 
1089 	nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)",
1090 		kring->name, atok->head, atok->cur, ktoa->hwtail,
1091 		kring->rhead, kring->rcur, kring->nr_hwtail);
1092 
1093 	return notify;
1094 }
1095 
1096 /*
1097  * Callbacks for ptnet drivers: nm_krings_create, nm_krings_delete, nm_dtor.
1098  */
1099 int
1100 ptnet_nm_krings_create(struct netmap_adapter *na)
1101 {
1102 	struct netmap_pt_guest_adapter *ptna =
1103 			(struct netmap_pt_guest_adapter *)na; /* Upcast. */
1104 	struct netmap_adapter *na_nm = &ptna->hwup.up;
1105 	struct netmap_adapter *na_dr = &ptna->dr.up;
1106 	int ret;
1107 
1108 	if (ptna->backend_users) {
1109 		return 0;
1110 	}
1111 
1112 	/* Create krings on the public netmap adapter. */
1113 	ret = netmap_hw_krings_create(na_nm);
1114 	if (ret) {
1115 		return ret;
1116 	}
1117 
1118 	/* Copy krings into the netmap adapter private to the driver. */
1119 	na_dr->tx_rings = na_nm->tx_rings;
1120 	na_dr->rx_rings = na_nm->rx_rings;
1121 
1122 	return 0;
1123 }
1124 
1125 void
1126 ptnet_nm_krings_delete(struct netmap_adapter *na)
1127 {
1128 	struct netmap_pt_guest_adapter *ptna =
1129 			(struct netmap_pt_guest_adapter *)na; /* Upcast. */
1130 	struct netmap_adapter *na_nm = &ptna->hwup.up;
1131 	struct netmap_adapter *na_dr = &ptna->dr.up;
1132 
1133 	if (ptna->backend_users) {
1134 		return;
1135 	}
1136 
1137 	na_dr->tx_rings = NULL;
1138 	na_dr->rx_rings = NULL;
1139 
1140 	netmap_hw_krings_delete(na_nm);
1141 }
1142 
1143 void
1144 ptnet_nm_dtor(struct netmap_adapter *na)
1145 {
1146 	struct netmap_pt_guest_adapter *ptna =
1147 			(struct netmap_pt_guest_adapter *)na;
1148 
1149 	netmap_mem_put(ptna->dr.up.nm_mem);
1150 	memset(&ptna->dr, 0, sizeof(ptna->dr));
1151 	netmap_mem_pt_guest_ifp_del(na->nm_mem, na->ifp);
1152 }
1153 
1154 int
1155 netmap_pt_guest_attach(struct netmap_adapter *arg,
1156 		       unsigned int nifp_offset, unsigned int memid)
1157 {
1158 	struct netmap_pt_guest_adapter *ptna;
1159 	if_t ifp = arg ? arg->ifp : NULL;
1160 	int error;
1161 
1162 	/* get allocator */
1163 	arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, memid);
1164 	if (arg->nm_mem == NULL)
1165 		return ENOMEM;
1166 	arg->na_flags |= NAF_MEM_OWNER;
1167 	error = netmap_attach_ext(arg, sizeof(struct netmap_pt_guest_adapter), 1);
1168 	if (error)
1169 		return error;
1170 
1171 	/* get the netmap_pt_guest_adapter */
1172 	ptna = (struct netmap_pt_guest_adapter *) NA(ifp);
1173 
1174 	/* Initialize a separate pass-through netmap adapter that is going to
1175 	 * be used by the ptnet driver only, and so never exposed to netmap
1176          * applications. We only need a subset of the available fields. */
1177 	memset(&ptna->dr, 0, sizeof(ptna->dr));
1178 	ptna->dr.up.ifp = ifp;
1179 	ptna->dr.up.nm_mem = netmap_mem_get(ptna->hwup.up.nm_mem);
1180         ptna->dr.up.nm_config = ptna->hwup.up.nm_config;
1181 
1182 	ptna->backend_users = 0;
1183 
1184 	return 0;
1185 }
1186 
1187 #endif /* WITH_PTNETMAP */
1188