/*
 * Copyright (C) 2013 Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef NET_PARAVIRT_H
#define NET_PARAVIRT_H

/*
 * Support for virtio-like communication between host (H) and guest (G) NICs.
 *
 * THIS IS EXPERIMENTAL CODE AND SUBJECT TO CHANGE.
 *
 * NOTE: this header includes nothing itself; uint32_t must already be
 * declared by whoever includes it.
 *
 * The guest allocates the shared Communication Status Block (csb) and
 * writes its physical address to CSBAL and CSBAH (data is little endian).
 * csb->guest_csb_on enables the mode. If disabled, the device acts as a
 * regular one.
 *
 * Notifications for tx and rx are exchanged without vm exits if possible.
 * In particular (only mentioning csb mode below), the following actions
 * are performed.
 *
 * In the description below, "double check" means verifying again the
 * condition that caused the previous action, and reverting the action if
 * the condition has changed. The condition typically depends on a variable
 * set by the other party, and the double check is done to avoid races.
 * E.g.
 *
 *      // start with A=0
 *    again:
 *      // do something
 *      if ( cond(C) ) {        // C is written by the other side
 *          A = 1;
 *          // barrier
 *          if ( !cond(C) ) {
 *              A = 0;
 *              goto again;
 *          }
 *      }
 *
 * TX: start from idle:
 *    H starts with host_need_txkick=1 when the I/O thread bh is idle.
 *    Upon new transmissions, G always updates guest_tdt.
 *    If host_need_txkick == 1, G also writes to the TDT, which acts as a
 *    kick to H (so pending writes are always dispatched to H as soon as
 *    possible.)
 *
 * TX: active state:
 *    On the kick (TDT write) H sets host_need_txkick = 0 (if not done
 *    already by G), and starts an I/O thread trying to consume packets
 *    from TDH to guest_tdt, periodically refreshing host_tdh and TDH.
 *    When host_tdh == guest_tdt, H sets host_need_txkick=1, and then
 *    does the "double check" for race avoidance.
 *
 * TX: G runs out of buffers
 *    XXX there are two mechanisms, one boolean (using guest_need_txkick)
 *    and one with a threshold (using guest_txkick_at). They are mutually
 *    exclusive.
 *    BOOLEAN:   when G has no space, it sets guest_need_txkick=1 and does
 *               the double check. If H finds guest_need_txkick == 1 on a
 *               write to TDH, it also generates an interrupt.
 *    THRESHOLD: G sets guest_txkick_at to the TDH value for which it
 *               wants to receive an interrupt. When H detects that TDH
 *               moves across guest_txkick_at, it generates an interrupt.
 *               This second mechanism reduces the number of interrupts
 *               and TDT writes on the transmit side when the host is too
 *               slow.
 *
 * RX: start from idle
 *    G starts with guest_need_rxkick = 1 when the receive ring is empty.
 *    As packets arrive, H updates host_rdh (and RDH) and also generates an
 *    interrupt when guest_need_rxkick == 1 (so incoming packets are
 *    always reported to G as soon as possible, apart from interrupt
 *    moderation delays). It also tracks guest_rdt for new buffers.
 *
 * RX: active state
 *    As the interrupt arrives, G sets guest_need_rxkick = 0 and starts
 *    draining packets from the receive ring, while updating guest_rdt.
 *    When G runs out of packets it sets guest_need_rxkick=1 and does the
 *    double check.
 *
 * RX: H runs out of buffers
 *    XXX there are two mechanisms, one boolean (using host_need_rxkick)
 *    and one with a threshold (using host_rxkick_at). They are mutually
 *    exclusive.
 *    BOOLEAN:   when H has no space, it sets host_need_rxkick=1 and does
 *               the double check. If G finds host_need_rxkick==1 on
 *               updating guest_rdt, it also writes to RDT causing a kick
 *               to H.
 *    THRESHOLD: H sets host_rxkick_at to the RDT value for which it wants
 *               to receive a kick. When G detects that guest_rdt moves
 *               across host_rxkick_at, it writes to RDT, thus generating
 *               a kick.
 *               This second mechanism reduces the number of kicks and
 *               RDT writes on the receive side when the guest is too slow
 *               and would free only a few buffers at a time.
 */

/*
 * Communication Status Block shared between guest and host.
 *
 * XXX revise the layout to minimize cache bounces.
 *
 * Each field is tagged with its usage, written as
 *      [GH][RW][+-0]
 * i.e. guest/host reads/writes frequently/rarely/almost never.
 *
 * The field order and sizes are the shared layout: do not reorder.
 */
struct paravirt_csb {
    /*
     * Fields (mostly) written by the guest.
     */

    /* GW+ HR+ pkt to transmit */
    uint32_t guest_tdt;
    /* GW- HR+ G ran out of tx bufs, request kick */
    uint32_t guest_need_txkick;
    /* GW- HR+ G ran out of rx pkts, request kick */
    uint32_t guest_need_rxkick;
    /* GW- HR+ enable paravirtual mode */
    uint32_t guest_csb_on;
    /* GW+ HR+ rx buffers available */
    uint32_t guest_rdt;
    /* GW- HR+ tx ring pos. where G expects an intr */
    uint32_t guest_txkick_at;
    /* GW0 HR0 guest uses MSI-X interrupts. */
    uint32_t guest_use_msix;
    /*
     * Rounds the seven guest fields above up to sixteen 32-bit words.
     * NOTE(review): presumably keeps the host-written fields off the
     * guest-written cache line (see the XXX above) -- confirm.
     */
    uint32_t pad[9];

    /*
     * Fields (mostly) written by the host.
     */

    /* GR0 HW- shadow register, mostly unused */
    uint32_t host_tdh;
    /* GR+ HW- start the iothread */
    uint32_t host_need_txkick;
    /* GW- HR- how much to spin before sleep; set by the guest */
    uint32_t host_txcycles_lim;
    /* GR0 HW- counter, but no need to be exported */
    uint32_t host_txcycles;
    /* GR0 HW- shadow register, mostly unused */
    uint32_t host_rdh;
    /* GR+ HW- flush rx queued packets */
    uint32_t host_need_rxkick;
    /* GR* HW* shadow copy of ISR ('*' is not defined by the legend) */
    uint32_t host_isr;
    /* GR+ HW- rx ring pos where H expects a kick */
    uint32_t host_rxkick_at;
    /* Vnet ring physical address high. */
    uint32_t vnet_ring_high;
    /* Vnet ring physical address low. */
    uint32_t vnet_ring_low;
};

/*
 * NOTE(review): presumably the size in bytes reserved for the CSB area;
 * its users are outside this header -- confirm.
 */
#define NET_PARAVIRT_CSB_SIZE   4096

/*
 * All-ones 32-bit value.
 * NOTE(review): the name suggests a "no value" marker; its users are
 * outside this header -- confirm.
 */
#define NET_PARAVIRT_NONE       (~(uint32_t)0)

#ifdef QEMU_PCI_H

/*
 * API functions only available within QEMU
 */

/*
 * Declared here, defined elsewhere.
 * NOTE(review): csbbal/csbbah are presumably the low/high halves of the
 * CSB physical address as written to CSBAL/CSBAH -- confirm against the
 * definition.
 */
void paravirt_configure_csb(struct paravirt_csb **csb, uint32_t csbbal,
                            uint32_t csbbah, QEMUBH *tx_bh,
                            AddressSpace *as);

#endif /* QEMU_PCI_H */

#endif /* NET_PARAVIRT_H */