1 /*
2  * Copyright (C) 2016-2018 Vincenzo Maffione
3  * Copyright (C) 2015 Stefano Garzarella
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *   1. Redistributions of source code must retain the above copyright
10  *      notice, this list of conditions and the following disclaimer.
11  *   2. Redistributions in binary form must reproduce the above copyright
12  *      notice, this list of conditions and the following disclaimer in the
13  *      documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 /*
31  * common headers
32  */
33 #if defined(__FreeBSD__)
34 #include <sys/cdefs.h>
35 #include <sys/param.h>
36 #include <sys/kernel.h>
37 #include <sys/types.h>
38 #include <sys/selinfo.h>
39 #include <sys/socket.h>
40 #include <net/if.h>
41 #include <net/if_var.h>
42 #include <machine/bus.h>
43 
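/*
 * FreeBSD replacement for the Linux usleep_range() API used by the kloop
 * below; only the lower bound (_1) is used to size the sleep.
 */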
44 #define usleep_range(_1, _2) \
45         pause_sbt("sync-kloop-sleep", SBT_1US * _1, SBT_1US * 1, C_ABSOLUTE)
46 
47 #elif defined(linux)
48 #include <bsd_glue.h>
49 #include <linux/file.h>
50 #include <linux/eventfd.h>
51 #endif
52 
53 #include <net/netmap.h>
54 #include <dev/netmap/netmap_kern.h>
55 #include <net/netmap_virt.h>
56 #include <dev/netmap/netmap_mem2.h>
57 
58 /* Support for eventfd-based notifications. */
59 #if defined(linux)
60 #define SYNC_KLOOP_POLL
61 #endif
62 
/* Write kring pointers (hwcur, hwtail) to the CSB.
 * This routine is coupled with nm_sync_kloop_appl_read(). */
65 static inline void
66 sync_kloop_kernel_write(struct nm_csb_ktoa __user *ptr, uint32_t hwcur,
67 			   uint32_t hwtail)
68 {
	/* Issue a first store-store barrier to make sure that writes to the
	 * netmap ring are not reordered after the updates to ktoa->hwcur and
	 * ktoa->hwtail. */
71 	nm_stst_barrier();
72 
	/*
	 * The same scheme used in nm_sync_kloop_appl_write() applies here.
	 * We allow the application to read a value of hwcur that is more recent
	 * than the value of hwtail, since this still results in a consistent
	 * view of the ring state (hwcur can never wrap around hwtail, because
	 * hwcur must stay behind head).
	 *
	 * The following memory barrier scheme is used to make this happen:
	 *
	 *          Kernel                 Application
	 *
	 *          STORE(hwcur)           LOAD(hwtail)
	 *          wmb() <------------->  rmb()
	 *          STORE(hwtail)          LOAD(hwcur)
	 */
88 	CSB_WRITE(ptr, hwcur, hwcur);
89 	nm_stst_barrier();
90 	CSB_WRITE(ptr, hwtail, hwtail);
91 }
92 
/* Read kring pointers (head, cur, sync_flags) from the CSB.
 * This routine is coupled with nm_sync_kloop_appl_write(). */
95 static inline void
96 sync_kloop_kernel_read(struct nm_csb_atok __user *ptr,
97 			  struct netmap_ring *shadow_ring,
98 			  uint32_t num_slots)
99 {
	/*
	 * We place a load-load memory barrier to make sure that the load of
	 * head is never reordered after the load of cur, so that we cannot
	 * observe a value of head more recent than the value of cur
	 * (see the explanation in sync_kloop_kernel_write()).
	 */
105 	CSB_READ(ptr, head, shadow_ring->head);
106 	nm_ldld_barrier();
107 	CSB_READ(ptr, cur, shadow_ring->cur);
108 	CSB_READ(ptr, sync_flags, shadow_ring->flags);
109 
110 	/* Make sure that loads from atok->head and atok->cur are not delayed
111 	 * after the loads from the netmap ring. */
112 	nm_ldld_barrier();
113 }
114 
115 /* Enable or disable application --> kernel kicks. */
116 static inline void
117 csb_ktoa_kick_enable(struct nm_csb_ktoa __user *csb_ktoa, uint32_t val)
118 {
119 	CSB_WRITE(csb_ktoa, kern_need_kick, val);
120 }
121 
122 #ifdef SYNC_KLOOP_POLL
/* Are application interrupts enabled or disabled? */
124 static inline uint32_t
125 csb_atok_intr_enabled(struct nm_csb_atok __user *csb_atok)
126 {
127 	uint32_t v;
128 
129 	CSB_READ(csb_atok, appl_need_kick, v);
130 
131 	return v;
132 }
133 #endif  /* SYNC_KLOOP_POLL */
134 
135 static inline void
136 sync_kloop_kring_dump(const char *title, const struct netmap_kring *kring)
137 {
138 	nm_prinf("%s, kring %s, hwcur %d, rhead %d, "
139 		"rcur %d, rtail %d, hwtail %d",
140 		title, kring->name, kring->nr_hwcur, kring->rhead,
141 		kring->rcur, kring->rtail, kring->nr_hwtail);
142 }
143 
144 /* Arguments for netmap_sync_kloop_tx_ring() and
145  * netmap_sync_kloop_rx_ring().
146  */
147 struct sync_kloop_ring_args {
148 	struct netmap_kring *kring;
149 	struct nm_csb_atok *csb_atok;
150 	struct nm_csb_ktoa *csb_ktoa;
151 #ifdef SYNC_KLOOP_POLL
152 	struct eventfd_ctx *irq_ctx;
153 #endif /* SYNC_KLOOP_POLL */
	/* Are we busy waiting rather than using a schedule() loop? */
	bool busy_wait;
	/* Are we processing in the context of a VM exit? */
	bool direct;
158 };
159 
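/*
 * Process a TX ring on behalf of the application: read head/cur from the
 * CSB, run the kring txsync, publish the new hwcur/hwtail back to the CSB
 * and, if needed, notify the application through its irqfd. The loop below
 * exits when there is no more work to do or no TX space is left.
 */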
160 static void
161 netmap_sync_kloop_tx_ring(const struct sync_kloop_ring_args *a)
162 {
163 	struct netmap_kring *kring = a->kring;
164 	struct nm_csb_atok *csb_atok = a->csb_atok;
165 	struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa;
166 	struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */
167 	bool more_txspace = false;
168 	uint32_t num_slots;
169 	int batch;
170 
171 	if (unlikely(nm_kr_tryget(kring, 1, NULL))) {
172 		return;
173 	}
174 
175 	num_slots = kring->nkr_num_slots;
176 
177 	/* Disable application --> kernel notifications. */
178 	if (!a->direct) {
179 		csb_ktoa_kick_enable(csb_ktoa, 0);
180 	}
181 	/* Copy the application kring pointers from the CSB */
182 	sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
183 
184 	for (;;) {
185 		batch = shadow_ring.head - kring->nr_hwcur;
186 		if (batch < 0)
187 			batch += num_slots;
188 
189 #ifdef PTN_TX_BATCH_LIM
190 		if (batch > PTN_TX_BATCH_LIM(num_slots)) {
			/* If the application moves ahead too fast, cut the move short
			 * so that we don't exceed our batch limit. */
193 			uint32_t head_lim = kring->nr_hwcur + PTN_TX_BATCH_LIM(num_slots);
194 
195 			if (head_lim >= num_slots)
196 				head_lim -= num_slots;
197 			nm_prdis(1, "batch: %d head: %d head_lim: %d", batch, shadow_ring.head,
198 					head_lim);
199 			shadow_ring.head = head_lim;
200 			batch = PTN_TX_BATCH_LIM(num_slots);
201 		}
202 #endif /* PTN_TX_BATCH_LIM */
203 
204 		if (nm_kr_txspace(kring) <= (num_slots >> 1)) {
205 			shadow_ring.flags |= NAF_FORCE_RECLAIM;
206 		}
207 
208 		/* Netmap prologue */
209 		shadow_ring.tail = kring->rtail;
210 		if (unlikely(nm_txsync_prologue(kring, &shadow_ring) >= num_slots)) {
211 			/* Reinit ring and enable notifications. */
212 			netmap_ring_reinit(kring);
213 			if (!a->busy_wait) {
214 				csb_ktoa_kick_enable(csb_ktoa, 1);
215 			}
216 			break;
217 		}
218 
219 		if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) {
220 			sync_kloop_kring_dump("pre txsync", kring);
221 		}
222 
223 		if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) {
224 			if (!a->busy_wait) {
225 				/* Reenable notifications. */
226 				csb_ktoa_kick_enable(csb_ktoa, 1);
227 			}
228 			nm_prerr("txsync() failed");
229 			break;
230 		}
231 
		/*
		 * Finalize
		 * Copy kernel hwcur and hwtail into the CSB for the application sync().
		 */
237 		sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur,
238 				kring->nr_hwtail);
239 		if (kring->rtail != kring->nr_hwtail) {
240 			/* Some more room available in the parent adapter. */
241 			kring->rtail = kring->nr_hwtail;
242 			more_txspace = true;
243 		}
244 
245 		if (unlikely(netmap_debug & NM_DEBUG_TXSYNC)) {
246 			sync_kloop_kring_dump("post txsync", kring);
247 		}
248 
249 		/* Interrupt the application if needed. */
250 #ifdef SYNC_KLOOP_POLL
251 		if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) {
252 			/* We could disable kernel --> application kicks here,
253 			 * to avoid spurious interrupts. */
254 			eventfd_signal(a->irq_ctx, 1);
255 			more_txspace = false;
256 		}
257 #endif /* SYNC_KLOOP_POLL */
258 
259 		/* Read CSB to see if there is more work to do. */
260 		sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
261 		if (shadow_ring.head == kring->rhead) {
262 			if (a->busy_wait) {
263 				break;
264 			}
265 			/*
			 * No more packets to transmit. We enable notifications and
			 * go to sleep, waiting for a kick from the application when
			 * new slots are ready for transmission.
269 			 */
270 			/* Reenable notifications. */
271 			csb_ktoa_kick_enable(csb_ktoa, 1);
272 			/* Double check, with store-load memory barrier. */
273 			nm_stld_barrier();
274 			sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
275 			if (shadow_ring.head != kring->rhead) {
				/* We won the race: there are more packets to
				 * transmit. Disable notifications and do another cycle. */
278 				csb_ktoa_kick_enable(csb_ktoa, 0);
279 				continue;
280 			}
281 			break;
282 		}
283 
284 		if (nm_kr_txempty(kring)) {
			/* No more available TX slots. We stop and wait for a notification
			 * from the backend (netmap_tx_irq). */
287 			nm_prdis(1, "TX ring");
288 			break;
289 		}
290 	}
291 
292 	nm_kr_put(kring);
293 
294 #ifdef SYNC_KLOOP_POLL
295 	if (a->irq_ctx && more_txspace && csb_atok_intr_enabled(csb_atok)) {
296 		eventfd_signal(a->irq_ctx, 1);
297 	}
298 #endif /* SYNC_KLOOP_POLL */
299 }
300 
/* Maximum number of consecutive RX cycles without receiving any packet. */
302 #define SYNC_LOOP_RX_DRY_CYCLES_MAX	2
303 
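/* Return true when the kernel has no RX slots left to fill, i.e. when
 * hwtail has caught up with the slot right before the application head. */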
304 static inline int
305 sync_kloop_norxslots(struct netmap_kring *kring, uint32_t g_head)
306 {
307 	return (NM_ACCESS_ONCE(kring->nr_hwtail) == nm_prev(g_head,
308 				kring->nkr_num_slots - 1));
309 }
310 
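/*
 * Process an RX ring on behalf of the application: mirror of
 * netmap_sync_kloop_tx_ring(), using the dry-cycle counter to stop when
 * nothing has been received for a while.
 */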
311 static void
312 netmap_sync_kloop_rx_ring(const struct sync_kloop_ring_args *a)
313 {
314 
315 	struct netmap_kring *kring = a->kring;
316 	struct nm_csb_atok *csb_atok = a->csb_atok;
317 	struct nm_csb_ktoa *csb_ktoa = a->csb_ktoa;
318 	struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */
319 	int dry_cycles = 0;
320 	bool some_recvd = false;
321 	uint32_t num_slots;
322 
323 	if (unlikely(nm_kr_tryget(kring, 1, NULL))) {
324 		return;
325 	}
326 
327 	num_slots = kring->nkr_num_slots;
331 
332 	/* Disable notifications. */
333 	if (!a->direct) {
334 		csb_ktoa_kick_enable(csb_ktoa, 0);
335 	}
336 	/* Copy the application kring pointers from the CSB */
337 	sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
338 
339 	for (;;) {
340 		uint32_t hwtail;
341 
342 		/* Netmap prologue */
343 		shadow_ring.tail = kring->rtail;
344 		if (unlikely(nm_rxsync_prologue(kring, &shadow_ring) >= num_slots)) {
345 			/* Reinit ring and enable notifications. */
346 			netmap_ring_reinit(kring);
347 			if (!a->busy_wait) {
348 				csb_ktoa_kick_enable(csb_ktoa, 1);
349 			}
350 			break;
351 		}
352 
353 		if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) {
354 			sync_kloop_kring_dump("pre rxsync", kring);
355 		}
356 
357 		if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) {
358 			if (!a->busy_wait) {
359 				/* Reenable notifications. */
360 				csb_ktoa_kick_enable(csb_ktoa, 1);
361 			}
362 			nm_prerr("rxsync() failed");
363 			break;
364 		}
365 
366 		/*
367 		 * Finalize
368 		 * Copy kernel hwcur and hwtail into the CSB for the application sync()
369 		 */
370 		hwtail = NM_ACCESS_ONCE(kring->nr_hwtail);
371 		sync_kloop_kernel_write(csb_ktoa, kring->nr_hwcur, hwtail);
372 		if (kring->rtail != hwtail) {
373 			kring->rtail = hwtail;
374 			some_recvd = true;
375 			dry_cycles = 0;
376 		} else {
377 			dry_cycles++;
378 		}
379 
380 		if (unlikely(netmap_debug & NM_DEBUG_RXSYNC)) {
381 			sync_kloop_kring_dump("post rxsync", kring);
382 		}
383 
384 #ifdef SYNC_KLOOP_POLL
385 		/* Interrupt the application if needed. */
386 		if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) {
387 			/* We could disable kernel --> application kicks here,
388 			 * to avoid spurious interrupts. */
389 			eventfd_signal(a->irq_ctx, 1);
390 			some_recvd = false;
391 		}
392 #endif /* SYNC_KLOOP_POLL */
393 
394 		/* Read CSB to see if there is more work to do. */
395 		sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
396 		if (sync_kloop_norxslots(kring, shadow_ring.head)) {
397 			if (a->busy_wait) {
398 				break;
399 			}
400 			/*
			 * No more slots available for reception. We enable notifications and
402 			 * go to sleep, waiting for a kick from the application when new receive
403 			 * slots are available.
404 			 */
405 			/* Reenable notifications. */
406 			csb_ktoa_kick_enable(csb_ktoa, 1);
407 			/* Double check, with store-load memory barrier. */
408 			nm_stld_barrier();
409 			sync_kloop_kernel_read(csb_atok, &shadow_ring, num_slots);
410 			if (!sync_kloop_norxslots(kring, shadow_ring.head)) {
			/* We won the race: more slots are available. Disable
			 * notifications and do another cycle. */
413 				csb_ktoa_kick_enable(csb_ktoa, 0);
414 				continue;
415 			}
416 			break;
417 		}
418 
419 		hwtail = NM_ACCESS_ONCE(kring->nr_hwtail);
420 		if (unlikely(hwtail == kring->rhead ||
421 					dry_cycles >= SYNC_LOOP_RX_DRY_CYCLES_MAX)) {
422 			/* No more packets to be read from the backend. We stop and
423 			 * wait for a notification from the backend (netmap_rx_irq). */
424 			nm_prdis(1, "nr_hwtail: %d rhead: %d dry_cycles: %d",
425 					hwtail, kring->rhead, dry_cycles);
426 			break;
427 		}
428 	}
429 
430 	nm_kr_put(kring);
431 
432 #ifdef SYNC_KLOOP_POLL
433 	/* Interrupt the application if needed. */
434 	if (a->irq_ctx && some_recvd && csb_atok_intr_enabled(csb_atok)) {
435 		eventfd_signal(a->irq_ctx, 1);
436 	}
437 #endif /* SYNC_KLOOP_POLL */
438 }
439 
440 #ifdef SYNC_KLOOP_POLL
441 struct sync_kloop_poll_ctx;
442 struct sync_kloop_poll_entry {
443 	/* Support for receiving notifications from
444 	 * a netmap ring or from the application. */
445 	struct file *filp;
446 	wait_queue_t wait;
447 	wait_queue_head_t *wqh;
448 
449 	/* Support for sending notifications to the application. */
450 	struct eventfd_ctx *irq_ctx;
451 	struct file *irq_filp;
452 
	/* Arguments for the ring processing function. Useful
	 * in case of a custom wake-up function. */
455 	struct sync_kloop_ring_args *args;
456 	struct sync_kloop_poll_ctx *parent;
458 };
459 
460 struct sync_kloop_poll_ctx {
461 	poll_table wait_table;
462 	unsigned int next_entry;
463 	int (*next_wake_fun)(wait_queue_t *, unsigned, int, void *);
464 	unsigned int num_entries;
465 	unsigned int num_tx_rings;
466 	unsigned int num_rings;
	/* The first num_tx_rings entries are for the TX kicks.
	 * The RX kick entries follow. The last two entries
	 * are for the TX irq and the RX irq. */
470 	struct sync_kloop_poll_entry entries[0];
471 };
472 
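/* poll_table callback, invoked through poll_wait() for each wait queue of
 * the files we poll. It records the file and the wait queue head in the
 * next free poll entry, and installs either the default wake-up behaviour
 * (wake up the kloop task) or the custom function selected in next_wake_fun. */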
473 static void
474 sync_kloop_poll_table_queue_proc(struct file *file, wait_queue_head_t *wqh,
475 				poll_table *pt)
476 {
477 	struct sync_kloop_poll_ctx *poll_ctx =
478 		container_of(pt, struct sync_kloop_poll_ctx, wait_table);
479 	struct sync_kloop_poll_entry *entry = poll_ctx->entries +
480 						poll_ctx->next_entry;
481 
482 	BUG_ON(poll_ctx->next_entry >= poll_ctx->num_entries);
483 	entry->wqh = wqh;
484 	entry->filp = file;
485 	/* Use the default wake up function. */
486 	if (poll_ctx->next_wake_fun == NULL) {
487 		init_waitqueue_entry(&entry->wait, current);
488 	} else {
489 		init_waitqueue_func_entry(&entry->wait,
490 		    poll_ctx->next_wake_fun);
491 	}
492 	add_wait_queue(wqh, &entry->wait);
493 }
494 
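/* Custom wake-up functions used in direct mode. A kick from the application
 * (ioeventfd write) triggers TX/RX ring processing directly from wake-up
 * context, while a notification coming from the netmap adapter is forwarded
 * to the application by signalling the corresponding irqfds. */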
495 static int
496 sync_kloop_tx_kick_wake_fun(wait_queue_t *wait, unsigned mode,
497     int wake_flags, void *key)
498 {
499 	struct sync_kloop_poll_entry *entry =
500 	    container_of(wait, struct sync_kloop_poll_entry, wait);
501 
502 	netmap_sync_kloop_tx_ring(entry->args);
503 
504 	return 0;
505 }
506 
507 static int
508 sync_kloop_tx_irq_wake_fun(wait_queue_t *wait, unsigned mode,
509     int wake_flags, void *key)
510 {
511 	struct sync_kloop_poll_entry *entry =
512 	    container_of(wait, struct sync_kloop_poll_entry, wait);
513 	struct sync_kloop_poll_ctx *poll_ctx = entry->parent;
514 	int i;
515 
516 	for (i = 0; i < poll_ctx->num_tx_rings; i++) {
517 		struct eventfd_ctx *irq_ctx = poll_ctx->entries[i].irq_ctx;
518 
519 		if (irq_ctx) {
520 			eventfd_signal(irq_ctx, 1);
521 		}
522 	}
523 
524 	return 0;
525 }
526 
527 static int
528 sync_kloop_rx_kick_wake_fun(wait_queue_t *wait, unsigned mode,
529     int wake_flags, void *key)
530 {
531 	struct sync_kloop_poll_entry *entry =
532 	    container_of(wait, struct sync_kloop_poll_entry, wait);
533 
534 	netmap_sync_kloop_rx_ring(entry->args);
535 
536 	return 0;
537 }
538 
539 static int
540 sync_kloop_rx_irq_wake_fun(wait_queue_t *wait, unsigned mode,
541     int wake_flags, void *key)
542 {
543 	struct sync_kloop_poll_entry *entry =
544 	    container_of(wait, struct sync_kloop_poll_entry, wait);
545 	struct sync_kloop_poll_ctx *poll_ctx = entry->parent;
546 	int i;
547 
548 	for (i = poll_ctx->num_tx_rings; i < poll_ctx->num_rings; i++) {
549 		struct eventfd_ctx *irq_ctx = poll_ctx->entries[i].irq_ctx;
550 
551 		if (irq_ctx) {
552 			eventfd_signal(irq_ctx, 1);
553 		}
554 	}
555 
556 	return 0;
557 }
558 #endif  /* SYNC_KLOOP_POLL */
559 
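/*
 * Main body of the sync kloop, reached through the NETMAP_REQ_SYNC_KLOOP_START
 * request. It keeps synchronizing the TX/RX rings bound to this file
 * descriptor, either by sleeping periodically (busy wait mode) or by waiting
 * on the eventfds provided by the application, until a
 * NETMAP_REQ_SYNC_KLOOP_STOP request sets the NM_SYNC_KLOOP_STOPPING flag.
 * The port must have been bound in CSB mode (NETMAP_REQ_OPT_CSB).
 */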
560 int
561 netmap_sync_kloop(struct netmap_priv_d *priv, struct nmreq_header *hdr)
562 {
563 	struct nmreq_sync_kloop_start *req =
564 		(struct nmreq_sync_kloop_start *)(uintptr_t)hdr->nr_body;
565 	struct nmreq_opt_sync_kloop_eventfds *eventfds_opt = NULL;
566 #ifdef SYNC_KLOOP_POLL
567 	struct sync_kloop_poll_ctx *poll_ctx = NULL;
568 #endif  /* SYNC_KLOOP_POLL */
569 	int num_rx_rings, num_tx_rings, num_rings;
570 	struct sync_kloop_ring_args *args = NULL;
571 	uint32_t sleep_us = req->sleep_us;
572 	struct nm_csb_atok* csb_atok_base;
573 	struct nm_csb_ktoa* csb_ktoa_base;
574 	struct netmap_adapter *na;
575 	struct nmreq_option *opt;
576 	bool na_could_sleep = false;
577 	bool busy_wait = true;
578 	bool direct_tx = false;
579 	bool direct_rx = false;
580 	int err = 0;
581 	int i;
582 
583 	if (sleep_us > 1000000) {
584 		/* We do not accept sleeping for more than a second. */
585 		return EINVAL;
586 	}
587 
588 	if (priv->np_nifp == NULL) {
589 		return ENXIO;
590 	}
591 	mb(); /* make sure following reads are not from cache */
592 
593 	na = priv->np_na;
594 	if (!nm_netmap_on(na)) {
595 		return ENXIO;
596 	}
597 
598 	NMG_LOCK();
599 	/* Make sure the application is working in CSB mode. */
600 	if (!priv->np_csb_atok_base || !priv->np_csb_ktoa_base) {
601 		NMG_UNLOCK();
602 		nm_prerr("sync-kloop on %s requires "
603 				"NETMAP_REQ_OPT_CSB option", na->name);
604 		return EINVAL;
605 	}
606 
607 	csb_atok_base = priv->np_csb_atok_base;
608 	csb_ktoa_base = priv->np_csb_ktoa_base;
609 
610 	/* Make sure that no kloop is currently running. */
611 	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
612 		err = EBUSY;
613 	}
614 	priv->np_kloop_state |= NM_SYNC_KLOOP_RUNNING;
615 	NMG_UNLOCK();
616 	if (err) {
617 		return err;
618 	}
619 
620 	num_rx_rings = priv->np_qlast[NR_RX] - priv->np_qfirst[NR_RX];
621 	num_tx_rings = priv->np_qlast[NR_TX] - priv->np_qfirst[NR_TX];
622 	num_rings = num_tx_rings + num_rx_rings;
623 
624 	args = nm_os_malloc(num_rings * sizeof(args[0]));
625 	if (!args) {
626 		err = ENOMEM;
627 		goto out;
628 	}
629 
630 	/* Prepare the arguments for netmap_sync_kloop_tx_ring()
631 	 * and netmap_sync_kloop_rx_ring(). */
632 	for (i = 0; i < num_tx_rings; i++) {
633 		struct sync_kloop_ring_args *a = args + i;
634 
635 		a->kring = NMR(na, NR_TX)[i + priv->np_qfirst[NR_TX]];
636 		a->csb_atok = csb_atok_base + i;
637 		a->csb_ktoa = csb_ktoa_base + i;
638 		a->busy_wait = busy_wait;
639 		a->direct = direct_tx;
640 	}
641 	for (i = 0; i < num_rx_rings; i++) {
642 		struct sync_kloop_ring_args *a = args + num_tx_rings + i;
643 
644 		a->kring = NMR(na, NR_RX)[i + priv->np_qfirst[NR_RX]];
645 		a->csb_atok = csb_atok_base + num_tx_rings + i;
646 		a->csb_ktoa = csb_ktoa_base + num_tx_rings + i;
647 		a->busy_wait = busy_wait;
648 		a->direct = direct_rx;
649 	}
650 
651 	/* Validate notification options. */
652 	opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
653 				NETMAP_REQ_OPT_SYNC_KLOOP_MODE);
654 	if (opt != NULL) {
655 		struct nmreq_opt_sync_kloop_mode *mode_opt =
656 		    (struct nmreq_opt_sync_kloop_mode *)opt;
657 
658 		direct_tx = !!(mode_opt->mode & NM_OPT_SYNC_KLOOP_DIRECT_TX);
659 		direct_rx = !!(mode_opt->mode & NM_OPT_SYNC_KLOOP_DIRECT_RX);
660 		if (mode_opt->mode & ~(NM_OPT_SYNC_KLOOP_DIRECT_TX |
661 		    NM_OPT_SYNC_KLOOP_DIRECT_RX)) {
662 			opt->nro_status = err = EINVAL;
663 			goto out;
664 		}
665 		opt->nro_status = 0;
666 	}
667 	opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
668 				NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS);
669 	if (opt != NULL) {
670 		err = nmreq_checkduplicate(opt);
671 		if (err) {
672 			opt->nro_status = err;
673 			goto out;
674 		}
675 		if (opt->nro_size != sizeof(*eventfds_opt) +
676 			sizeof(eventfds_opt->eventfds[0]) * num_rings) {
677 			/* Option size not consistent with the number of
678 			 * entries. */
679 			opt->nro_status = err = EINVAL;
680 			goto out;
681 		}
682 #ifdef SYNC_KLOOP_POLL
683 		eventfds_opt = (struct nmreq_opt_sync_kloop_eventfds *)opt;
684 		opt->nro_status = 0;
685 
686 		/* Check if some ioeventfd entry is not defined, and force sleep
687 		 * synchronization in that case. */
688 		busy_wait = false;
689 		for (i = 0; i < num_rings; i++) {
690 			if (eventfds_opt->eventfds[i].ioeventfd < 0) {
691 				busy_wait = true;
692 				break;
693 			}
694 		}
695 
696 		if (busy_wait && (direct_tx || direct_rx)) {
697 			/* For direct processing we need all the
698 			 * ioeventfds to be valid. */
699 			opt->nro_status = err = EINVAL;
700 			goto out;
701 		}
702 
		/* We need 2 poll entries for TX and RX notifications coming
		 * from the netmap adapter, plus one entry per ring for the
		 * notifications coming from the application. */
		poll_ctx = nm_os_malloc(sizeof(*poll_ctx) +
				(num_rings + 2) * sizeof(poll_ctx->entries[0]));
		if (poll_ctx == NULL) {
			err = ENOMEM;
			goto out;
		}
		init_poll_funcptr(&poll_ctx->wait_table,
					sync_kloop_poll_table_queue_proc);
710 		poll_ctx->num_entries = 2 + num_rings;
711 		poll_ctx->num_tx_rings = num_tx_rings;
712 		poll_ctx->num_rings = num_rings;
713 		poll_ctx->next_entry = 0;
714 		poll_ctx->next_wake_fun = NULL;
715 
716 		if (direct_tx && (na->na_flags & NAF_BDG_MAYSLEEP)) {
717 			/* In direct mode, VALE txsync is called from
718 			 * wake-up context, where it is not possible
719 			 * to sleep.
720 			 */
721 			na->na_flags &= ~NAF_BDG_MAYSLEEP;
722 			na_could_sleep = true;
723 		}
724 
725 		for (i = 0; i < num_rings + 2; i++) {
726 			poll_ctx->entries[i].args = args + i;
727 			poll_ctx->entries[i].parent = poll_ctx;
728 		}
729 
730 		/* Poll for notifications coming from the applications through
731 		 * eventfds. */
732 		for (i = 0; i < num_rings; i++, poll_ctx->next_entry++) {
733 			struct eventfd_ctx *irq = NULL;
734 			struct file *filp = NULL;
735 			unsigned long mask;
736 			bool tx_ring = (i < num_tx_rings);
737 
738 			if (eventfds_opt->eventfds[i].irqfd >= 0) {
739 				filp = eventfd_fget(
740 				    eventfds_opt->eventfds[i].irqfd);
741 				if (IS_ERR(filp)) {
742 					err = PTR_ERR(filp);
743 					goto out;
744 				}
745 				irq = eventfd_ctx_fileget(filp);
746 				if (IS_ERR(irq)) {
747 					err = PTR_ERR(irq);
748 					goto out;
749 				}
750 			}
751 			poll_ctx->entries[i].irq_filp = filp;
752 			poll_ctx->entries[i].irq_ctx = irq;
753 			poll_ctx->entries[i].args->busy_wait = busy_wait;
754 			/* Don't let netmap_sync_kloop_*x_ring() use
755 			 * IRQs in direct mode. */
756 			poll_ctx->entries[i].args->irq_ctx =
757 			    ((tx_ring && direct_tx) ||
758 			    (!tx_ring && direct_rx)) ? NULL :
759 			    poll_ctx->entries[i].irq_ctx;
760 			poll_ctx->entries[i].args->direct =
761 			    (tx_ring ? direct_tx : direct_rx);
762 
763 			if (!busy_wait) {
764 				filp = eventfd_fget(
765 				    eventfds_opt->eventfds[i].ioeventfd);
766 				if (IS_ERR(filp)) {
767 					err = PTR_ERR(filp);
768 					goto out;
769 				}
770 				if (tx_ring && direct_tx) {
771 					/* Override the wake up function
772 					 * so that it can directly call
773 					 * netmap_sync_kloop_tx_ring().
774 					 */
775 					poll_ctx->next_wake_fun =
776 					    sync_kloop_tx_kick_wake_fun;
777 				} else if (!tx_ring && direct_rx) {
778 					/* Same for direct RX. */
779 					poll_ctx->next_wake_fun =
780 					    sync_kloop_rx_kick_wake_fun;
781 				} else {
782 					poll_ctx->next_wake_fun = NULL;
783 				}
784 				mask = filp->f_op->poll(filp,
785 				    &poll_ctx->wait_table);
786 				if (mask & POLLERR) {
787 					err = EINVAL;
788 					goto out;
789 				}
790 			}
791 		}
792 
793 		/* Poll for notifications coming from the netmap rings bound to
794 		 * this file descriptor. */
795 		if (!busy_wait) {
796 			NMG_LOCK();
797 			/* In direct mode, override the wake up function so
798 			 * that it can forward the netmap_tx_irq() to the
799 			 * guest. */
800 			poll_ctx->next_wake_fun = direct_tx ?
801 			    sync_kloop_tx_irq_wake_fun : NULL;
802 			poll_wait(priv->np_filp, priv->np_si[NR_TX],
803 			    &poll_ctx->wait_table);
804 			poll_ctx->next_entry++;
805 
806 			poll_ctx->next_wake_fun = direct_rx ?
807 			    sync_kloop_rx_irq_wake_fun : NULL;
808 			poll_wait(priv->np_filp, priv->np_si[NR_RX],
809 			    &poll_ctx->wait_table);
810 			poll_ctx->next_entry++;
811 			NMG_UNLOCK();
812 		}
813 #else   /* SYNC_KLOOP_POLL */
		opt->nro_status = err = EOPNOTSUPP;
815 		goto out;
816 #endif  /* SYNC_KLOOP_POLL */
817 	}
818 
819 	nm_prinf("kloop busy_wait %u, direct_tx %u, direct_rx %u, "
820 	    "na_could_sleep %u", busy_wait, direct_tx, direct_rx,
821 	    na_could_sleep);
822 
823 	/* Main loop. */
824 	for (;;) {
825 		if (unlikely(NM_ACCESS_ONCE(priv->np_kloop_state) & NM_SYNC_KLOOP_STOPPING)) {
826 			break;
827 		}
828 
829 #ifdef SYNC_KLOOP_POLL
830 		if (!busy_wait) {
			/* It is important to set the task state as
			 * interruptible before processing any TX/RX ring,
			 * so that if a notification on ring Y comes after
			 * we have processed ring Y, but before we call
			 * schedule(), we don't miss it. This is true because
			 * the wake-up function will change the task state,
			 * and therefore the schedule_timeout() call below
			 * will observe the change.
			 */
840 			set_current_state(TASK_INTERRUPTIBLE);
841 		}
842 #endif  /* SYNC_KLOOP_POLL */
843 
844 		/* Process all the TX rings bound to this file descriptor. */
845 		for (i = 0; !direct_tx && i < num_tx_rings; i++) {
846 			struct sync_kloop_ring_args *a = args + i;
847 			netmap_sync_kloop_tx_ring(a);
848 		}
849 
850 		/* Process all the RX rings bound to this file descriptor. */
851 		for (i = 0; !direct_rx && i < num_rx_rings; i++) {
852 			struct sync_kloop_ring_args *a = args + num_tx_rings + i;
853 			netmap_sync_kloop_rx_ring(a);
854 		}
855 
856 		if (busy_wait) {
857 			/* Default synchronization method: sleep for a while. */
858 			usleep_range(sleep_us, sleep_us);
859 		}
860 #ifdef SYNC_KLOOP_POLL
861 		else {
862 			/* Yield to the scheduler waiting for a notification
863 			 * to come either from netmap or the application. */
864 			schedule_timeout(msecs_to_jiffies(3000));
865 		}
866 #endif /* SYNC_KLOOP_POLL */
867 	}
868 out:
869 #ifdef SYNC_KLOOP_POLL
870 	if (poll_ctx) {
871 		/* Stop polling from netmap and the eventfds, and deallocate
872 		 * the poll context. */
873 		if (!busy_wait) {
874 			__set_current_state(TASK_RUNNING);
875 		}
876 		for (i = 0; i < poll_ctx->next_entry; i++) {
877 			struct sync_kloop_poll_entry *entry =
878 						poll_ctx->entries + i;
879 
880 			if (entry->wqh)
881 				remove_wait_queue(entry->wqh, &entry->wait);
			/* We took a reference to the eventfd files, so we
			 * must release them here; do not do that on the netmap
			 * file descriptor, since no reference was taken on it. */
885 			if (entry->filp && entry->filp != priv->np_filp)
886 				fput(entry->filp);
887 			if (entry->irq_ctx)
888 				eventfd_ctx_put(entry->irq_ctx);
889 			if (entry->irq_filp)
890 				fput(entry->irq_filp);
891 		}
892 		nm_os_free(poll_ctx);
893 		poll_ctx = NULL;
894 	}
895 #endif /* SYNC_KLOOP_POLL */
896 
897 	if (args) {
898 		nm_os_free(args);
899 		args = NULL;
900 	}
901 
902 	/* Reset the kloop state. */
903 	NMG_LOCK();
904 	priv->np_kloop_state = 0;
905 	if (na_could_sleep) {
906 		na->na_flags |= NAF_BDG_MAYSLEEP;
907 	}
908 	NMG_UNLOCK();
909 
910 	return err;
911 }
912 
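/*
 * Handler for the NETMAP_REQ_SYNC_KLOOP_STOP request: set the stopping flag,
 * wake up the kloop in case it is blocked in schedule_timeout(), and wait
 * until it clears NM_SYNC_KLOOP_RUNNING before returning.
 */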
913 int
914 netmap_sync_kloop_stop(struct netmap_priv_d *priv)
915 {
916 	struct netmap_adapter *na;
917 	bool running = true;
918 	int err = 0;
919 
920 	if (priv->np_nifp == NULL) {
921 		return ENXIO;
922 	}
923 	mb(); /* make sure following reads are not from cache */
924 
925 	na = priv->np_na;
926 	if (!nm_netmap_on(na)) {
927 		return ENXIO;
928 	}
929 
930 	/* Set the kloop stopping flag. */
931 	NMG_LOCK();
932 	priv->np_kloop_state |= NM_SYNC_KLOOP_STOPPING;
933 	NMG_UNLOCK();
934 
935 	/* Send a notification to the kloop, in case it is blocked in
936 	 * schedule_timeout(). We can use either RX or TX, because the
937 	 * kloop is waiting on both. */
938 	nm_os_selwakeup(priv->np_si[NR_RX]);
939 
940 	/* Wait for the kloop to actually terminate. */
941 	while (running) {
942 		usleep_range(1000, 1500);
943 		NMG_LOCK();
944 		running = (NM_ACCESS_ONCE(priv->np_kloop_state)
945 				& NM_SYNC_KLOOP_RUNNING);
946 		NMG_UNLOCK();
947 	}
948 
949 	return err;
950 }
951 
952 #ifdef WITH_PTNETMAP
953 /*
954  * Guest ptnetmap txsync()/rxsync() routines, used in ptnet device drivers.
955  * These routines are reused across the different operating systems supported
956  * by netmap.
957  */
958 
959 /*
960  * Reconcile host and guest views of the transmit ring.
961  *
962  * Guest user wants to transmit packets up to the one before ring->head,
963  * and guest kernel knows tx_ring->hwcur is the first packet unsent
964  * by the host kernel.
965  *
966  * We push out as many packets as possible, and possibly
967  * reclaim buffers from previously completed transmission.
968  *
 * Notifications from the host are enabled only if the guest user would
970  * block (no space in the ring).
971  */
972 bool
973 netmap_pt_guest_txsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa,
974 			struct netmap_kring *kring, int flags)
975 {
976 	bool notify = false;
977 
978 	/* Disable notifications */
979 	atok->appl_need_kick = 0;
980 
981 	/*
982 	 * First part: tell the host to process the new packets,
983 	 * updating the CSB.
984 	 */
985 	kring->nr_hwcur = ktoa->hwcur;
986 	nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead);
987 
	/* Ask for a kick from the guest to the host if needed. */
989 	if (((kring->rhead != kring->nr_hwcur || nm_kr_wouldblock(kring))
990 		&& NM_ACCESS_ONCE(ktoa->kern_need_kick)) ||
991 			(flags & NAF_FORCE_RECLAIM)) {
992 		atok->sync_flags = flags;
993 		notify = true;
994 	}
995 
996 	/*
997 	 * Second part: reclaim buffers for completed transmissions.
998 	 */
999 	if (nm_kr_wouldblock(kring) || (flags & NAF_FORCE_RECLAIM)) {
1000 		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
1001 					&kring->nr_hwcur);
1002 	}
1003 
	/*
	 * No more room in the ring for new transmissions. The user thread will
	 * go to sleep and we need to be notified by the host when more free
	 * space is available.
	 */
	if (nm_kr_wouldblock(kring) && !(kring->nr_kflags & NKR_NOINTR)) {
		/* Reenable notifications. */
		atok->appl_need_kick = 1;
		/* Double check, with store-load memory barrier. */
		nm_stld_barrier();
		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
					&kring->nr_hwcur);
		/* If there is new free space, disable notifications. */
		if (unlikely(!nm_kr_wouldblock(kring))) {
			atok->appl_need_kick = 0;
		}
	}
1021 
1022 	nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)",
1023 		kring->name, atok->head, atok->cur, ktoa->hwtail,
1024 		kring->rhead, kring->rcur, kring->nr_hwtail);
1025 
1026 	return notify;
1027 }
1028 
1029 /*
1030  * Reconcile host and guest view of the receive ring.
1031  *
1032  * Update hwcur/hwtail from host (reading from CSB).
1033  *
1034  * If guest user has released buffers up to the one before ring->head, we
1035  * also give them to the host.
1036  *
 * Notifications from the host are enabled only if the guest user would
1038  * block (no more completed slots in the ring).
1039  */
1040 bool
1041 netmap_pt_guest_rxsync(struct nm_csb_atok *atok, struct nm_csb_ktoa *ktoa,
1042 			struct netmap_kring *kring, int flags)
1043 {
1044 	bool notify = false;
1045 
	/* Disable notifications. */
1047 	atok->appl_need_kick = 0;
1048 
1049 	/*
1050 	 * First part: import newly received packets, by updating the kring
1051 	 * hwtail to the hwtail known from the host (read from the CSB).
1052 	 * This also updates the kring hwcur.
1053 	 */
1054 	nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail, &kring->nr_hwcur);
1055 	kring->nr_kflags &= ~NKR_PENDINTR;
1056 
1057 	/*
1058 	 * Second part: tell the host about the slots that guest user has
1059 	 * released, by updating cur and head in the CSB.
1060 	 */
1061 	if (kring->rhead != kring->nr_hwcur) {
1062 		nm_sync_kloop_appl_write(atok, kring->rcur, kring->rhead);
1063 	}
1064 
	/*
	 * No more completed RX slots. The user thread will go to sleep and
	 * we need to be notified by the host when more RX slots have been
	 * completed.
	 */
	if (nm_kr_wouldblock(kring) && !(kring->nr_kflags & NKR_NOINTR)) {
		/* Reenable notifications. */
		atok->appl_need_kick = 1;
		/* Double check, with store-load memory barrier. */
		nm_stld_barrier();
		nm_sync_kloop_appl_read(ktoa, &kring->nr_hwtail,
					&kring->nr_hwcur);
		/* If there are new slots, disable notifications. */
		if (!nm_kr_wouldblock(kring)) {
			atok->appl_need_kick = 0;
		}
	}
1082 
1083 	/* Ask for a kick from the guest to the host if needed. */
1084 	if ((kring->rhead != kring->nr_hwcur || nm_kr_wouldblock(kring))
1085 		&& NM_ACCESS_ONCE(ktoa->kern_need_kick)) {
1086 		atok->sync_flags = flags;
1087 		notify = true;
1088 	}
1089 
1090 	nm_prdis(1, "%s CSB(head:%u cur:%u hwtail:%u) KRING(head:%u cur:%u tail:%u)",
1091 		kring->name, atok->head, atok->cur, ktoa->hwtail,
1092 		kring->rhead, kring->rcur, kring->nr_hwtail);
1093 
1094 	return notify;
1095 }
1096 
1097 /*
1098  * Callbacks for ptnet drivers: nm_krings_create, nm_krings_delete, nm_dtor.
1099  */
1100 int
1101 ptnet_nm_krings_create(struct netmap_adapter *na)
1102 {
1103 	struct netmap_pt_guest_adapter *ptna =
1104 			(struct netmap_pt_guest_adapter *)na; /* Upcast. */
1105 	struct netmap_adapter *na_nm = &ptna->hwup.up;
1106 	struct netmap_adapter *na_dr = &ptna->dr.up;
1107 	int ret;
1108 
1109 	if (ptna->backend_users) {
1110 		return 0;
1111 	}
1112 
1113 	/* Create krings on the public netmap adapter. */
1114 	ret = netmap_hw_krings_create(na_nm);
1115 	if (ret) {
1116 		return ret;
1117 	}
1118 
1119 	/* Copy krings into the netmap adapter private to the driver. */
1120 	na_dr->tx_rings = na_nm->tx_rings;
1121 	na_dr->rx_rings = na_nm->rx_rings;
1122 
1123 	return 0;
1124 }
1125 
1126 void
1127 ptnet_nm_krings_delete(struct netmap_adapter *na)
1128 {
1129 	struct netmap_pt_guest_adapter *ptna =
1130 			(struct netmap_pt_guest_adapter *)na; /* Upcast. */
1131 	struct netmap_adapter *na_nm = &ptna->hwup.up;
1132 	struct netmap_adapter *na_dr = &ptna->dr.up;
1133 
1134 	if (ptna->backend_users) {
1135 		return;
1136 	}
1137 
1138 	na_dr->tx_rings = NULL;
1139 	na_dr->rx_rings = NULL;
1140 
1141 	netmap_hw_krings_delete(na_nm);
1142 }
1143 
1144 void
1145 ptnet_nm_dtor(struct netmap_adapter *na)
1146 {
1147 	struct netmap_pt_guest_adapter *ptna =
1148 			(struct netmap_pt_guest_adapter *)na;
1149 
1150 	netmap_mem_put(ptna->dr.up.nm_mem);
1151 	memset(&ptna->dr, 0, sizeof(ptna->dr));
1152 	netmap_mem_pt_guest_ifp_del(na->nm_mem, na->ifp);
1153 }
1154 
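/*
 * Attach routine used by ptnet device drivers: create the pass-through
 * memory allocator for the guest, attach the hardware adapter with enough
 * room for a netmap_pt_guest_adapter, and initialize the additional
 * driver-private adapter (ptna->dr) backed by the same allocator.
 */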
1155 int
1156 netmap_pt_guest_attach(struct netmap_adapter *arg,
1157 		       unsigned int nifp_offset, unsigned int memid)
1158 {
1159 	struct netmap_pt_guest_adapter *ptna;
1160 	struct ifnet *ifp = arg ? arg->ifp : NULL;
1161 	int error;
1162 
1163 	/* get allocator */
1164 	arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, memid);
1165 	if (arg->nm_mem == NULL)
1166 		return ENOMEM;
1167 	arg->na_flags |= NAF_MEM_OWNER;
1168 	error = netmap_attach_ext(arg, sizeof(struct netmap_pt_guest_adapter), 1);
1169 	if (error)
1170 		return error;
1171 
1172 	/* get the netmap_pt_guest_adapter */
1173 	ptna = (struct netmap_pt_guest_adapter *) NA(ifp);
1174 
	/* Initialize a separate pass-through netmap adapter that is going to
	 * be used by the ptnet driver only, and so never exposed to netmap
	 * applications. We only need a subset of the available fields. */
	memset(&ptna->dr, 0, sizeof(ptna->dr));
	ptna->dr.up.ifp = ifp;
	ptna->dr.up.nm_mem = netmap_mem_get(ptna->hwup.up.nm_mem);
	ptna->dr.up.nm_config = ptna->hwup.up.nm_config;
1182 
1183 	ptna->backend_users = 0;
1184 
1185 	return 0;
1186 }
1187 
1188 #endif /* WITH_PTNETMAP */
1189