/* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright 2016 Joyent, Inc. * Copyright 2022 MNX Cloud, Inc. */ /* * Overlay Devices * * Overlay devices provide a means for creating overlay networks, a means of * multiplexing multiple logical, isolated, and discrete layer two and layer * three networks on top of one physical network. * * In general, these overlay devices encapsulate the logic to answer two * different questions: * * 1) How should I transform a packet to put it on the wire? * 2) Where should I send a transformed packet? * * Each overlay device is presented to the user as a GLDv3 device. While the * link itself cannot have an IP interface created on top of it, it allows for * additional GLDv3 devices, such as a VNIC, to be created on top of it which * can be plumbed up with IP interfaces. * * * -------------------- * General Architecture * -------------------- * * The logical overlay device that a user sees in dladm(8) is a combination of * two different components that work together. The first component is this * kernel module, which is responsible for answering question one -- how should * I transform a packet to put it on the wire. * * The second component is what we call the virtual ARP daemon, or varpd. It is * a userland component that is responsible for answering the second question -- * Where should I send a transformed packet. Instances of the kernel overlay * GLDv3 device ask varpd the question of where should a packet go. * * The split was done for a few reasons. 
Importantly, we wanted to keep the act * of generating encapsulated packets in the kernel so as to ensure that the * general data path was fast and also kept simple. On the flip side, while the * question of where should something go may be simple, it may often be * complicated and need to interface with several different external or * distributed systems. In those cases, it's simpler to allow for the full * flexibility of userland to be brought to bear to solve that problem and in * general, the path isn't very common. * * The following is what makes up the logical overlay device that a user would * create with dladm(8). * * Kernel Userland * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . * . +--------+ +--------+ +--------+ . . . * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . . * . +--------+ +--------+ +--------+ . . . * . | | | . . . * . | | | . . . * . +------------+-----------+ . . . * . | . . /dev/overlay . * . +--------------+ . . . +------------+ . * . | | . . . | | . * . | Overlay |======*=================| Virtual | . * . | GLDv3 Device |========================| ARP Daemon | . * . | | . . | | . * . +--------------+ . . +------------+ . * . | . . | . * . | . . | . * . +----------------+ . . +--------+ . * . | Overlay | . . | varpd | . * . | Encapsulation | . . | Lookup | . * . | Plugin | . . | Plugin | . * . +----------------+ . . +--------+ . * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . * * * This image shows the two different components and where they live. * Importantly, it also shows that both the kernel overlay device and the * userland varpd both support plugins. The plugins actually implement the * things that users care about and the APIs have been designed to try to * minimize the amount of things that a module writer needs to worry about it. * * IDENTIFIERS * * Every overlay device is defined by a unique identifier which is the overlay * identifier. 
Its purpose is similar to that of a VLAN identifier: it's a * unique number that is used to differentiate between different entries on the * wire. * * ENCAPSULATION * * An overlay encapsulation plugin is a kernel miscellaneous module whose * purpose is to contain knowledge about how to transform packets to put them * onto the wire and to take them off. An example of an encapsulation plugin is * vxlan. It's also how support for things like nvgre or geneve would be brought * into the system. * * Each encapsulation plugin defines a series of operation vectors and * properties. For the full details on everything they should provide, please * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible * for telling the system what information is required to send a packet. For * example, vxlan is defined to send everything over a UDP packet and therefore * requires a port and an IP address, while nvgre on the other hand is its own * IP type and therefore just requires an IP address. In addition, it also * provides information about the kind of socket that should be created. This is * used by the kernel multiplexor; more on that in the Kernel Components * section. * * LOOKUPS * * The kernel communicates requests for lookups over the character device * /dev/overlay. varpd is responsible for listening for requests on that device * and answering them. The character device is specific to the target path and * varpd. * * Much as the kernel overlay module handles the bulk of the scaffolding but * leaves the important work to the encapsulation plugin, varpd provides a * similar role and leaves the full brunt of lookups to a userland dynamic * shared object which implements the logic of lookups. * * Each lookup plugin defines a series of operation vectors and properties. For * the full details on everything that they should provide, please read * lib/varpd/libvarpd/libvarpd_provider.h. 
Essentially, they are given a MAC * address and asked to give an address on the physical network that the packet * should be sent to. In addition, they handle questions related to how to handle * things like broadcast and multicast traffic, etc. * * ---------- * Properties * ---------- * * A device from a dladm perspective has a unique set of properties that are * combined from three different sources: * * 1) Generic properties that every overlay device has * 2) Properties that are specific to the encapsulation plugin * 3) Properties that are specific to the lookup plugin * * All of these are exposed in a single set of properties in dladm. Note that * these are not necessarily traditional link properties. However, if something * is both a traditional GLDv3 link property, say the MTU of a device, and a * specific property here, then the driver ensures that all existing GLDv3 * specific means of manipulating it are used and wraps up its private property * interfaces to ensure that works. * * Properties in the second and third categories are prefixed with the name of * their module. For example, the vxlan encapsulation module has a property * called 'listen_ip'. This property would show up in dladm as * 'vxlan/listen_ip'. This allows different plugins to both use similar names * for similar properties and to also have independent name spaces so that * overlapping names do not conflict with anything else. * * While the kernel combines both sets one and two into a single coherent view, * it does not do anything with respect to the properties that are owned by the * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in * charge of bridging these two worlds into one magical experience for the user. * It carries the burden of knowing about both overlay specific and varpd * specific properties. Importantly, we want to maintain this distinction. 
We * don't want to treat the kernel as an arbitrary key/value store for varpd and * we want the kernel to own its own data and not have to ask userland for * information that it owns. * * Every property in the system has the following attributes: * * o A name * o A type * o A size * o Permissions * o Default value * o Valid value ranges * o A value * * Everything except for the value is obtained by callers through the propinfo * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX, * currently 256 bytes. * * The following are the supported types of properties: * * OVERLAY_PROP_T_INT * * A signed integer, its length is 8 bytes, corresponding to an * int64_t. * * OVERLAY_PROP_T_UINT * * An unsigned integer, its length is 8 bytes, corresponding to a * uint64_t. * * OVERLAY_PROP_T_IP * * A struct in6_addr, it has a fixed size. * * OVERLAY_PROP_T_STRING * * A null-terminated character string encoded in either ASCII or * UTF-8. Note that the size of the string includes the null * terminator. * * The next thing that we apply to a property is its permission. The permissions * are put together by the bitwise or of the following flags and values. * * OVERLAY_PROP_PERM_REQ * * This indicates a required property. A property that is required * must be set by a consumer before the device can be created. If a * required property has a default value, this constraint is * loosened because the default value defines the value. * * OVERLAY_PROP_PERM_READ * * This indicates that a property can be read. All properties will * have this value set. * * OVERLAY_PROP_PERM_WRITE * * This indicates that a property can be written to and thus * updated by userland. Properties that are only intended to * display information will not have OVERLAY_PROP_PERM_WRITE set. * * In addition, a few additional values are defined as a convenience to * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. 
The second, * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ, * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a * property should generally be a constant across its lifetime. * * A property may optionally have a default value. If it does have a default * value, and that property is not set to be a different value, then the default * value is inherited automatically. It also means that if the default value is * acceptable, there is no need to set the value for a required property. For * example, the vxlan module has the vxlan/listen_port property which is * required, but has a default value of 4789 (the IANA assigned port). Because * of that default value, there is no need for it to be set. * * Finally, a property may declare a list of valid values. These valid values * are used for display purposes; they are not enforced by the broader system, * but merely allow a means for the information to be communicated to the user * through dladm(8). Like a default value, this is optional. * * The general scaffolding does not do very much with respect to the getting and * setting of properties. That is really owned by the individual plugins * themselves. * * ----------------------------- * Destinations and Plugin Types * ----------------------------- * * Both encapsulation and lookup plugins define the kinds of destinations that * they know how to support. There are three different pieces of information * that can be used to address a destination currently, all of which are * summarized in the type overlay_point_t. Any combination of these is * supported. * * OVERLAY_PLUGIN_D_ETHERNET * * An Ethernet MAC address is required. * * OVERLAY_PLUGIN_D_IP * * An IP address is required. All IP addresses used by the overlay * system are transmitted as IPv6 addresses. IPv4 addresses can be * represented by using IPv4-mapped IPv6 addresses. * * OVERLAY_PLUGIN_D_PORT * * A TCP/UDP port is required. 
* * A kernel encapsulation plugin declares which of these that it requires, it's * a static set. On the other hand, a userland lookup plugin can be built to * support all of these or any combination thereof. It gets passed the required * destination type, based on the kernel encapsulation method, and then it makes * the determination as to whether or not it supports it. For example, the * direct plugin can support either an IP or both an IP and a port, it simply * doesn't display the direct/dest_port property in the cases where a port is * not required to support this. * * The user lookup plugins have two different modes of operation which * determines how they interact with the broader system and how look ups are * performed. These types are: * * OVERLAY_TARGET_POINT * * A point to point plugin has a single static definition for where * to send all traffic. Every packet in the system always gets sent * to the exact same destination which is programmed into the * kernel when the general device is activated. * * OVERLAY_TARGET_DYNAMIC * * A dynamic plugin does not have a single static definition. * Instead, for each destination, the kernel makes an asynchronous * request to varpd to determine where the packet should be routed, * and if a specific destination is found, then that destination is * cached in the overlay device's target cache. * * This distinction, while important for the general overlay device's operation, * is not important to the encapsulation plugins. They don't need to know about * any of these pieces. It's just a concern for varpd, the userland plugin, and * the general overlay scaffolding. * * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not * maintain a target cache, and instead just keeps track of the destination and * always sends encapsulated packets to that address. When the target type is of * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such * destinations. 
These destinations are kept around in an instance of a * reference hash that is specific to the given overlay device. Entries in the * cache can be invalidated and replaced by varpd and its lookup plugins. * * ---------------------------------- * Kernel Components and Architecture * ---------------------------------- * * There are multiple pieces inside the kernel that work together, there is the * general overlay_dev_t structure, which is the logical GLDv3 device, but it * itself has references to things like an instance of an encapsulation plugin, * a pointer to a mux and a target cache. It can roughly be summarized in the * following image: * * +------------------+ * | global | * | overlay list | * | overlay_dev_list | * +------------------+ * | * | +-----------------------+ +---------------+ * +->| GLDv3 Device |----------->| GLDv3 Device | -> ... * | overlay_dev_t | | overlay_dev_t | * | | +---------------+ * | | * | mac_handle_t -----+---> GLDv3 handle to MAC * | datalink_id_t -----+---> Datalink ID used by DLS * | overlay_dev_flag_t ---+---> Device state * | uint_t -----+---> Current device MTU * | uint_t -----+---> In-progress RX operations * | uint_t -----+---> In-progress TX operations * | char[] -----+---> FMA degraded message * | void * -----+---> plugin private data * | overlay_target_t * ---+---------------------+ * | overlay_plugin_t * ---+---------+ | * +-----------------------+ | | * ^ | | * +--------------------+ | | | * | Kernel Socket | | | | * | Multiplexor | | | | * | overlay_mux_t | | | | * | | | | | * | avl_tree_t -+--+ | | * | uint_t -+--> socket family | | * | uint_t -+--> socket type | | * | uint_t -+--> socket protocol | | * | ksocket_t -+--> I/O socket | | * | struct sockaddr * -+--> ksocket address | | * | overlay_plugin_t --+--------+ | | * +--------------------+ | | | * | | | * +-------------------------+ | | | * | Encap Plugin |<--+-----------+ | * | overlay_plugin_t | | * | | | * | char * ---+--> plugin name | * | 
overlay_plugin_ops_t * -+--> plugin downcalls | * | char ** (props) ---+--> property list | * | uint_t ---+--> id length | * | overlay_plugin_flags_t -+--> plugin flags | * | overlay_plugin_dest_t --+--> destination type v * +-------------------------+ +-------------------------+ * | Target Cache | * | overlay_target_t | * | | * cache mode <--+- overlay_target_mode_t | * dest type <--+- overlay_plugin_dest_t | * cache flags <--+- overlay_target_flag_t | * varpd id <--+- uint64_t | * outstanding varpd reqs. <--+- uint_t | * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t | * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t | * | +-------------------------+ * +-----------------------+ * | * v * +-------------------------------+ +------------------------+ * | Target Entry |-->| Target Entry |--> ... * | overlay_target_entry_t | | overlay_target_entry_t | * | | +------------------------+ * | | * | overlay_target_entry_flags_t -+--> Entry flags * | uint8_t[ETHERADDRL] ---+--> Target MAC address * | overlay_target_point_t ---+--> Target underlay address * | mblk_t * ---+--> outstanding mblk head * | mblk_t * ---+--> outstanding mblk tail * | size_t ---+--> outstanding mblk size * +-------------------------------+ * * The primary entries that we care about are the overlay_dev_t, which * correspond to each overlay device that is created with dladm(8). Globally, * these devices are maintained in a simple list_t which is protected with a * lock. Hence, these include important information such as the mac_handle_t * and a datalink_id_t which is used to interact with the broader MAC and DLS * ecosystem. We also maintain additional information such as the current state, * outstanding operations, the mtu, and importantly, the plugin's private data. * This is the instance of an encapsulation plugin that gets created as part of * creating an overlay device. Another aspect of this is that the overlay_dev_t * also includes information with respect to FMA. 
For more information, see the * FMA section. * * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin * is the encapsulation plugin. This allows the device to make downcalls into it * based on doing things like getting and setting properties. Otherwise, the * plugin itself is a fairly straightforward entity. Plugins are maintained in a * (not pictured above) list. The plugins themselves mostly maintain things like * the static list of properties, what kind of destination they require, and the * operations vector. A given module may contain more if necessary. * * The next piece of the puzzle is the mux, or a multiplexor. The mux itself * maintains a ksocket and it is through the mux that we send and receive * message blocks. The mux represents a socket type and address, as well as a * plugin. Multiple overlay_dev_t devices may then share the same mux. For * example, consider the case where you have different instances of vxlan all on * the same underlay network. These would all logically share the same IP * address and port that packets are sent and received on; however, what differs * is the decapsulation ID. * * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike * a socket, we enable a direct callback on the ksocket. This means that * whenever a message block chain is received, rather than sitting there and * getting a callback in a context and kicking that back out to a taskq, the * data instead comes directly into the callback function overlay_mux_recv(). * * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx * function) to transmit. It receives encapsulated packets, decapsulates them to * determine the overlay identifier, looks up the given device that matches that * identifier, and then causes the broader MAC world to receive the packet with * a call to mac_rx(). 
* * Today, we don't do too much that's special with the ksocket; however, as * hardware is gaining understanding for these encapsulation protocols, we'll * probably want to think of better ways to get those capabilities passed down * and potentially better ways to program receive filters so they get directly * to us. Though, that's all fantasy future land. * * The next part of the puzzle is the target cache. The purpose of the target * cache is to cache where we should send a packet on the underlay network, * given its mac address. The target cache operates in two modes depending on * whether the lookup module was declared to be OVERLAY_TARGET_POINT or * OVERLAY_TARGET_DYNAMIC. * * In the case where the target cache has been programmed to be * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t * which has the destination that we send everything to, no matter the * destination mac address. * * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things * are much more interesting and as a result, more complicated. We primarily * store lists of overlay_target_entry_t's which are stored in both an avl tree * and a refhash_t. The primary look up path uses the refhash_t and the avl tree * is only used for a few of the target ioctls used to dump data such that we * can get a consistent iteration order for things like dladm show-overlay -t. * The key that we use for the reference hashtable is based on the mac address * in the cache and currently we just do a simple CRC32 to transform it into a * hash. * * Each entry maintains a set of flags to indicate the current status of the * request. The flags may indicate one of three states: that the current cache * entry is valid, that the current cache entry has been directed to drop all * output, and that the current cache entry is invalid and may be being looked * up. In the case where it's valid, we just take the destination address and * run with it. 
* * If it's invalid and a lookup has not been made, then we start the process * that prepares a query that will make its way up to varpd. The cache entry * maintains a message block chain of outstanding message blocks and a * size. These lists are populated only when we don't know the answer as to * where these should be sent. The size entry is used to cap the amount of * outstanding data that we don't know the answer to. If we exceed a cap on the * amount of outstanding data (currently 1 Mb), then we'll drop any additional * packets. Once we get an answer indicating a valid destination, we transmit * any outstanding data to that place. The full story on how we look that up * is discussed in the section on the Target Cache Life Cycle. * * ------------------------ * FMA and Degraded Devices * ------------------------ * * Every kernel overlay device keeps track of its FMA state. Today in FMA we * cannot represent partitions between resources nor can we represent that a * given minor node of a pseudo device has failed -- if we degrade the overlay * device, then the entire dev_info_t is degraded. However, we still want to be * able to indicate to administrators that things may go wrong. * * To this end, we've added a notion of a degraded state to every overlay * device. This state is primarily dictated by userland and it can happen for * various reasons. Generally, when a userland lookup plugin has been * partitioned, or something has gone wrong such that there is no longer any * userland lookup module for a device, we'll mark it degraded. * * As long as any of our minor instances is degraded, then we'll fire off the * FMA event to note that. Once the last degraded instance is no longer * degraded, then we'll end up telling FMA that we're all clean. * * To help administrators get a better sense of which of the various minor * devices is wrong, we store the odd_fmamsg[] character array. 
This character * array can be fetched with doing a dladm show-overlay -f. * * Note, that it's important that we do not update the link status of the * devices. We want to remain up as much as possible. By changing the link in a * degraded state, this may end up making things worse. We may still actually * have information in the target cache and if we mark the link down, that'll * result in not being able to use it. The reason being that this'll mark all * the downstream VNICs down which will go to IP and from there we end up * dealing with sadness. * * ----------------------- * Target Cache Life Cycle * ----------------------- * * This section only applies when we have a lookup plugin of * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type * OVERLAY_TARGET_POINT. * * While we got into the target cache in the general architecture section, it's * worth going into more details as to how this actually works and showing some * examples and state machines. Recall that a target cache entry basically has * the following state transition diagram: * * Initial state * . . . . . . first access . . . varpd lookup enqueued * . . . * . . . * +-------+ . +----------+ . * | No |------*---->| Invalid |-------*----+ * | Entry | | Entry | | * +-------+ +----------+ | * varpd ^ ^ varpd | * invalidate | | drop | * . . . * * . . v * +-------+ | | +---------+ * | Entry |--->-----+ +----<----| Entry | * | Valid |<----------*---------<----| Pending |->-+ varpd * +-------+ . +---------+ * . . drop, but * . varpd ^ | other queued * . success | | entries * +-----+ * * When the table is first created, it is empty. As we attempt to lookup entries * and we find there is no entry at all, we'll create a new table entry for it. * At that point the entry is technically in an invalid state, that means that * we have no valid data from varpd. 
In that case, we'll go ahead and queue the * packet into the entry's pending chain, and queue a varpd lookup, setting the * OVERLAY_ENTRY_F_PENDING flag in the process. * * If additional mblk_t's come in for this entry, we end up appending them to * the tail of the chain, if and only if we don't exceed the threshold for the * amount of space they can take up. An entry remains pending until we get a * varpd reply. If varpd replies with a valid result, we move to the valid * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and replace it with * one of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate. * * Once an entry is valid, it stays valid until user land tells us to invalidate * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and * OVERLAY_TARG_CACHE_SET respectively. * * If the lookup fails with a call to drop the packet, then the next state is * determined by the state of the queue. If the set of outstanding entries is * empty, then we just transition back to the invalid state. If instead, the * set of outstanding entries is not empty, then we'll queue another entry and * stay in the same state, repeating this until the number of requests is * drained. * * The following images describe the flow of a given lookup and where the * overlay_target_entry_t is at any given time. * * +-------------------+ * | Invalid Entry | An entry starts off as an invalid entry * | de:ad:be:ef:00:00 | and only exists in the target cache. * +-------------------+ * * ~~~~ * * +---------------------+ * | Global list_t | A mblk_t comes in for an entry. We * | overlay_target_list | append it to the overlay_target_list. * +---------------------+ * | * v * +-------------------+ +-------------------+ * | Pending Entry |----->| Pending Entry |--->... 
* | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 | * +-------------------+ +-------------------+ * * ~~~~ * * +--------------------------+ * | /dev/overlay minor state | User land said that it would look up an * | overlay_target_hdl_t | entry for us. We remove it from the * +--------------------------+ global list and add it to the handle's * | outstanding list. * | * v * +-------------------+ +-------------------+ * | Pending Entry |----->| Pending Entry | * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 | * +-------------------+ +-------------------+ * * ~~~~ * * +-------------------+ * | Valid Entry | varpd returned an answer with * | de:ad:be:ef:00:00 | OVERLAY_IOC_RESPOND and the target cache * | 10.169.23.42:4789 | entry is now populated with a * +-------------------+ destination and marked as valid * * * The lookup mechanism is performed via a series of operations on the character * pseudo-device /dev/overlay. The only thing that uses this device is the * userland daemon varpd. /dev/overlay is a cloneable device, each open of it * granting a new minor number which maintains its own state. We maintain this * state so that way if an outstanding lookup was queued to something that * crashed or closed its handle without responding, we can know about this and * thus handle it appropriately. * * When a lookup is first created it's added to our global list of outstanding * lookups. To service requests, userland is required to perform an ioctl to ask * for a request. We will block it in the kernel a set amount of time waiting * for a request. When we give a request to a given minor instance of the * device, we remove it from the global list and append the request to the * device's list of outstanding entries, for the reasons we discussed above. * When a lookup comes in, we give user land a smaller amount of information * specific to that packet, the overlay_targ_lookup_t. 
It includes a request id * to identify this, and then the overlay id, the varpd id, the header and * packet size, the source and destination mac address, the SAP, and any * potential VLAN header. * * At that point, it stays in that outstanding list until one of two ioctls are * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time, * userland may also perform other operations. For example, it may use * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth * analysis of what to do beyond what we gave it initially. This is useful for * providing proxy arp and the like. Finally, there are two other ioctls that * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which * causes us to encapsulate and send out the packet they've given us. * * * Finally, through the target cache, several ioctls are provided to allow for * interrogation and management of the cache. They allow for individual entries * to be retrieved, set, or have the entire table flushed. For the full set of * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h. * * ------------------ * Sample Packet Flow * ------------------ * * There's a lot of pieces here, hopefully an example of how this all fits * together will help clarify and elucidate what's going on. We're going to * first track an outgoing packet, eg. one that is sent from an IP interface on * a VNIC on top of an overlay device, and then we'll look at what it means to * respond to that. * * * +----------------+ +--------------+ +------------------+ * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches | * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx | * +----------------+ | VNIC device | | overlay_m_tx() | * +--------------+ +------------------+ * | * . lookup . cache | * . drop . miss v * +---------+ . +--------+ . 
+------------------+ * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk | * | mblk_t | | lookup | | in the target | * +---------+ | queued | | cache | * ^ +--------+ +------------------+ * on send | | | cache * error . . * *. . lookup * . . hit * | | success v * | | +------------------+ * +-----------------+ +--------------->| call plugin | * | Send out | | ovpo_encap() to | * | overlay_mux_t's |<----------------------------------| get encap mblk_t | * | ksocket | +------------------+ * +-----------------+ * * The receive end point looks a little different and looks more like: * * +------------------+ +----------------+ +-----------+ * | mblk_t comes off |---->| enter netstack |--->| delivered |---+ * | the physical | | IP stack | | to | * . . direct * | device | +----------------+ | ksocket | | callback * +------------------+ +-----------+ | * . overlay id | * . not found v * +-----------+ . +-----------------+ +--------------------+ * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() | * | mblk_t | | ovpo_decap() to | +--------------------+ * +-----------+ | decap mblk_t | * +-----------------+ * | * * . . overlay id * v found * +--------+ +----------------+ * | adjust |----->| call mac_rx | * | mblk_t | | on original | * +--------+ | decaped packet | * +----------------+ * * ------------------ * Netstack Awareness * ------------------ * * In the above image we note that this enters a netstack. Today the only * netstack that can be is the global zone as the overlay driver itself is not * exactly netstack aware. What this really means is that varpd cannot run in a * non-global zone and an overlay device cannot belong to a non-global zone. * Non-global zones can still have a VNIC assigned to them that's been created * over the overlay device the same way they would if it had been created over * an etherstub or a physical device. 
* * The majority of the work to make it netstack aware is straightforward and the * biggest thing is to create a netstack module that allows us to hook into * netstack (and thus zone) creation and destruction. From there, we need to * amend the target cache lookup routines that we discussed earlier to not have * a global outstanding list and a global list of handles, but rather, one per * netstack. * * For the mux, we'll need to open the ksocket in the context of the zone, we * can likely do this with a properly composed credential, but we'll need to do * some more work on that path. Finally, we'll want to make sure the dld ioctls * are aware of the zoneid of the caller and we use that appropriately and store * it in the overlay_dev_t. * * ----------- * GLDv3 Notes * ----------- * * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more * relevant and other parts are much less relevant for us. For example, the * GLDv3 is used to toggle the device being put into and out of promiscuous * mode, to program MAC addresses for unicast and multicast hardware filters. * Today, an overlay device doesn't have a notion of promiscuous mode nor does * it have a notion of unicast and multicast addresses programmed into the * device. Instead, for the purposes of the hardware filter, we don't do * anything and just always accept new addresses being added and removed. * * If the GLDv3 start function has not been called, then we will not use this * device for I/O purposes. Any calls to transmit or receive should be dropped, * though the GLDv3 guarantees us that transmit will not be called without * calling start. Similarly, once stop is called, then no packets can be dealt * with. * * Today we don't support the stat interfaces, though there's no good reason * that we shouldn't assemble some of the stats based on what we have in the * future. * * When it comes to link properties, many of the traditional link properties do * not apply and many others MAC handles for us. 
For example, we don't need to
 * implement anything for overlay_m_getprop() to deal with returning the MTU, as
 * MAC never calls into us for that. As such, there isn't much of anything to
 * support in terms of properties.
 *
 * Today, we don't support any notion of hardware capabilities. However, if
 * future NIC hardware or other changes to the system cause it to make sense for
 * us to emulate logical groups, then we should do that. However, we still do
 * implement a capab function so that we can identify ourselves as an overlay
 * device to the broader MAC framework. This is done mostly so that a device
 * created on top of us can have fanout rings as we don't try to lie about a
 * speed for our device.
 *
 * The other question is what should be done for a device's MTU and margin. We
 * set our minimum supported MTU to be the minimum value that an IP network may
 * be set to, 576 -- which mimics what an etherstub does. On the flip side, we
 * have our upper bound set to 8900. This value comes from the fact that a lot
 * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
 * bytes, which isn't exactly the most accurate number, but it'll be good enough
 * for now. Because of that, our default MTU off of these devices is 1400, as
 * the default MTU for everything is usually 1500 or whatever the underlying
 * device is at; however, this is a bit simpler than asking the netstack what
 * all of the IP interfaces are at. It also calls into question how PMTU and
 * PMTU discovery should work here. The challenge, especially for
 * OVERLAY_TARG_DYNAMIC, is that the MTU to any of the places will vary and it's
 * not clear that if you have a single bad entry that the overall MTU should be
 * lowered. Instead, we should figure out a better way of determining these
 * kinds of PMTU errors and appropriately alerting the administrator via FMA.
* * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether * or not the underlying encapsulation device supports VLAN tags. If it does, * then we'll set the margin to allow for it, otherwise, we will not. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include dev_info_t *overlay_dip; static kmutex_t overlay_dev_lock; static list_t overlay_dev_list; static uint8_t overlay_macaddr[ETHERADDRL] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; typedef enum overlay_dev_prop { OVERLAY_DEV_P_MTU = 0, OVERLAY_DEV_P_VNETID, OVERLAY_DEV_P_ENCAP, OVERLAY_DEV_P_VARPDID } overlay_dev_prop_t; #define OVERLAY_DEV_NPROPS 4 static const char *overlay_dev_props[] = { "mtu", "vnetid", "encap", "varpd/id" }; #define OVERLAY_MTU_MIN 576 #define OVERLAY_MTU_DEF 1400 #define OVERLAY_MTU_MAX 8900 overlay_dev_t * overlay_hold_by_dlid(datalink_id_t id) { overlay_dev_t *o; mutex_enter(&overlay_dev_lock); for (o = list_head(&overlay_dev_list); o != NULL; o = list_next(&overlay_dev_list, o)) { if (id == o->odd_linkid) { mutex_enter(&o->odd_lock); o->odd_ref++; mutex_exit(&o->odd_lock); mutex_exit(&overlay_dev_lock); return (o); } } mutex_exit(&overlay_dev_lock); return (NULL); } void overlay_hold_rele(overlay_dev_t *odd) { mutex_enter(&odd->odd_lock); ASSERT(odd->odd_ref > 0); odd->odd_ref--; mutex_exit(&odd->odd_lock); } void overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag) { ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); ASSERT(MUTEX_HELD(&odd->odd_lock)); if (flag & OVERLAY_F_IN_RX) odd->odd_rxcount++; if (flag & OVERLAY_F_IN_TX) odd->odd_txcount++; odd->odd_flags |= flag; } void overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag) { boolean_t signal = B_FALSE; ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); ASSERT(MUTEX_HELD(&odd->odd_lock)); if (flag & OVERLAY_F_IN_RX) { 
ASSERT(odd->odd_rxcount > 0); odd->odd_rxcount--; if (odd->odd_rxcount == 0) { signal = B_TRUE; odd->odd_flags &= ~OVERLAY_F_IN_RX; } } if (flag & OVERLAY_F_IN_TX) { ASSERT(odd->odd_txcount > 0); odd->odd_txcount--; if (odd->odd_txcount == 0) { signal = B_TRUE; odd->odd_flags &= ~OVERLAY_F_IN_TX; } } if (signal == B_TRUE) cv_broadcast(&odd->odd_iowait); } static void overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag) { ASSERT((flag & ~OVERLAY_F_IOMASK) == 0); ASSERT(MUTEX_HELD(&odd->odd_lock)); while (odd->odd_flags & flag) { cv_wait(&odd->odd_iowait, &odd->odd_lock); } } void overlay_dev_iter(overlay_dev_iter_f func, void *arg) { overlay_dev_t *odd; mutex_enter(&overlay_dev_lock); for (odd = list_head(&overlay_dev_list); odd != NULL; odd = list_next(&overlay_dev_list, odd)) { if (func(odd, arg) != 0) { mutex_exit(&overlay_dev_lock); return; } } mutex_exit(&overlay_dev_lock); } /* ARGSUSED */ static int overlay_m_stat(void *arg, uint_t stat, uint64_t *val) { return (ENOTSUP); } static int overlay_m_start(void *arg) { overlay_dev_t *odd = arg; overlay_mux_t *mux; int ret, domain, family, prot; struct sockaddr_storage storage; socklen_t slen; mutex_enter(&odd->odd_lock); if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) { mutex_exit(&odd->odd_lock); return (EAGAIN); } mutex_exit(&odd->odd_lock); ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain, &family, &prot, (struct sockaddr *)&storage, &slen); if (ret != 0) return (ret); mux = overlay_mux_open(odd->odd_plugin, domain, family, prot, (struct sockaddr *)&storage, slen, &ret); if (mux == NULL) return (ret); overlay_mux_add_dev(mux, odd); odd->odd_mux = mux; mutex_enter(&odd->odd_lock); ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX)); odd->odd_flags |= OVERLAY_F_IN_MUX; mutex_exit(&odd->odd_lock); return (0); } static void overlay_m_stop(void *arg) { overlay_dev_t *odd = arg; /* * The MAC Perimeter is held here, so we don't have to worry about * synchronizing this with respect to metadata 
operations. */ mutex_enter(&odd->odd_lock); VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX); VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP)); odd->odd_flags |= OVERLAY_F_MDDROP; overlay_io_wait(odd, OVERLAY_F_IOMASK); mutex_exit(&odd->odd_lock); overlay_mux_remove_dev(odd->odd_mux, odd); overlay_mux_close(odd->odd_mux); odd->odd_mux = NULL; mutex_enter(&odd->odd_lock); odd->odd_flags &= ~OVERLAY_F_IN_MUX; odd->odd_flags &= ~OVERLAY_F_MDDROP; VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0); mutex_exit(&odd->odd_lock); } /* * For more info on this, see the big theory statement. */ /* ARGSUSED */ static int overlay_m_promisc(void *arg, boolean_t on) { return (0); } /* * For more info on this, see the big theory statement. */ /* ARGSUSED */ static int overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp) { return (0); } /* * For more info on this, see the big theory statement. */ /* ARGSUSED */ static int overlay_m_unicast(void *arg, const uint8_t *macaddr) { return (0); } mblk_t * overlay_m_tx(void *arg, mblk_t *mp_chain) { overlay_dev_t *odd = arg; mblk_t *mp, *ep; int ret; ovep_encap_info_t einfo; struct msghdr hdr; mutex_enter(&odd->odd_lock); if ((odd->odd_flags & OVERLAY_F_MDDROP) || !(odd->odd_flags & OVERLAY_F_IN_MUX)) { mutex_exit(&odd->odd_lock); freemsgchain(mp_chain); return (NULL); } overlay_io_start(odd, OVERLAY_F_IN_TX); mutex_exit(&odd->odd_lock); bzero(&hdr, sizeof (struct msghdr)); bzero(&einfo, sizeof (ovep_encap_info_t)); einfo.ovdi_id = odd->odd_vid; mp = mp_chain; while (mp != NULL) { socklen_t slen; struct sockaddr_storage storage; mp_chain = mp->b_next; mp->b_next = NULL; ep = NULL; ret = overlay_target_lookup(odd, mp, (struct sockaddr *)&storage, &slen); if (ret != OVERLAY_TARGET_OK) { if (ret == OVERLAY_TARGET_DROP) freemsg(mp); mp = mp_chain; continue; } hdr.msg_name = &storage; hdr.msg_namelen = slen; ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, &einfo, &ep); if (ret != 0 || ep == NULL) { freemsg(mp); goto out; } 
ASSERT(ep->b_cont == mp || ep == mp); ret = overlay_mux_tx(odd->odd_mux, &hdr, ep); if (ret != 0) goto out; mp = mp_chain; } out: mutex_enter(&odd->odd_lock); overlay_io_done(odd, OVERLAY_F_IN_TX); mutex_exit(&odd->odd_lock); return (mp_chain); } /* ARGSUSED */ static void overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp) { miocnak(q, mp, 0, ENOTSUP); } /* ARGSUSED */ static boolean_t overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { /* * Tell MAC we're an overlay. */ if (cap == MAC_CAPAB_OVERLAY) return (B_TRUE); return (B_FALSE); } /* ARGSUSED */ static int overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, uint_t pr_valsize, const void *pr_val) { uint32_t mtu, old; int err; overlay_dev_t *odd = arg; if (pr_num != MAC_PROP_MTU) return (ENOTSUP); bcopy(pr_val, &mtu, sizeof (mtu)); if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX) return (EINVAL); mutex_enter(&odd->odd_lock); old = odd->odd_mtu; odd->odd_mtu = mtu; err = mac_maxsdu_update(odd->odd_mh, mtu); if (err != 0) odd->odd_mtu = old; mutex_exit(&odd->odd_lock); return (err); } /* ARGSUSED */ static int overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, uint_t pr_valsize, void *pr_val) { return (ENOTSUP); } /* ARGSUSED */ static void overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, mac_prop_info_handle_t prh) { if (pr_num != MAC_PROP_MTU) return; mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF); mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX); } static mac_callbacks_t overlay_m_callbacks = { .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO), .mc_getstat = overlay_m_stat, .mc_start = overlay_m_start, .mc_stop = overlay_m_stop, .mc_setpromisc = overlay_m_promisc, .mc_multicst = overlay_m_multicast, .mc_unicst = overlay_m_unicast, .mc_tx = overlay_m_tx, .mc_ioctl = overlay_m_ioctl, .mc_getcapab = overlay_m_getcapab, .mc_getprop = overlay_m_getprop, .mc_setprop = 
overlay_m_setprop, .mc_propinfo = overlay_m_propinfo }; static boolean_t overlay_valid_name(const char *name, size_t buflen) { size_t actlen; int err, i; for (i = 0; i < buflen; i++) { if (name[i] == '\0') break; } if (i == 0 || i == buflen) return (B_FALSE); actlen = i; if (strchr(name, '/') != NULL) return (B_FALSE); if (u8_validate((char *)name, actlen, NULL, U8_VALIDATE_ENTIRE, &err) < 0) return (B_FALSE); return (B_TRUE); } /* ARGSUSED */ static int overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { int err; uint64_t maxid; overlay_dev_t *odd, *o; mac_register_t *mac; overlay_ioc_create_t *oicp = karg; if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE) return (EINVAL); odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP); odd->odd_linkid = oicp->oic_linkid; odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap); if (odd->odd_plugin == NULL) { kmem_free(odd, sizeof (overlay_dev_t)); return (ENOENT); } err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd, &odd->odd_pvoid); if (err != 0) { odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); overlay_plugin_rele(odd->odd_plugin); kmem_free(odd, sizeof (overlay_dev_t)); return (EINVAL); } /* * Make sure that our virtual network id is valid for the given plugin * that we're working with. 
*/ ASSERT(odd->odd_plugin->ovp_id_size <= 8); maxid = UINT64_MAX; if (odd->odd_plugin->ovp_id_size != 8) maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL; if (oicp->oic_vnetid > maxid) { odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); overlay_plugin_rele(odd->odd_plugin); kmem_free(odd, sizeof (overlay_dev_t)); return (EINVAL); } odd->odd_vid = oicp->oic_vnetid; mac = mac_alloc(MAC_VERSION); if (mac == NULL) { mutex_exit(&overlay_dev_lock); odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); overlay_plugin_rele(odd->odd_plugin); kmem_free(odd, sizeof (overlay_dev_t)); return (EINVAL); } mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; mac->m_driver = odd; mac->m_dip = overlay_dip; mac->m_dst_addr = NULL; mac->m_callbacks = &overlay_m_callbacks; mac->m_pdata = NULL; mac->m_pdata_size = 0; mac->m_priv_props = NULL; /* Let mac handle this itself. */ mac->m_instance = (uint_t)-1; /* * There is no real source address that should be used here, but saying * that we're not ethernet is going to cause its own problems. At the * end of the say, this is fine. */ mac->m_src_addr = overlay_macaddr; /* * Start with the default MTU as the max SDU. If the MTU is changed, the * SDU will be changed to reflect that. */ mac->m_min_sdu = 1; mac->m_max_sdu = OVERLAY_MTU_DEF; mac->m_multicast_sdu = 0; /* * The underlying device doesn't matter, instead this comes from the * encapsulation protocol and whether or not they allow VLAN tags. */ if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) { mac->m_margin = VLAN_TAGSZ; } else { mac->m_margin = 0; } /* * Today, we have no MAC virtualization, it may make sense in the future * to go ahead and emulate some subset of this, but it doesn't today. 
*/ mac->m_v12n = MAC_VIRT_NONE; mutex_enter(&overlay_dev_lock); for (o = list_head(&overlay_dev_list); o != NULL; o = list_next(&overlay_dev_list, o)) { if (o->odd_linkid == oicp->oic_linkid) { mutex_exit(&overlay_dev_lock); odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); overlay_plugin_rele(odd->odd_plugin); kmem_free(odd, sizeof (overlay_dev_t)); return (EEXIST); } if (o->odd_vid == oicp->oic_vnetid && o->odd_plugin == odd->odd_plugin) { mutex_exit(&overlay_dev_lock); odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); overlay_plugin_rele(odd->odd_plugin); kmem_free(odd, sizeof (overlay_dev_t)); return (EEXIST); } } err = mac_register(mac, &odd->odd_mh); mac_free(mac); if (err != 0) { mutex_exit(&overlay_dev_lock); odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); overlay_plugin_rele(odd->odd_plugin); kmem_free(odd, sizeof (overlay_dev_t)); return (err); } err = dls_devnet_create(odd->odd_mh, odd->odd_linkid, crgetzoneid(cred)); if (err != 0) { mutex_exit(&overlay_dev_lock); (void) mac_unregister(odd->odd_mh); odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); overlay_plugin_rele(odd->odd_plugin); kmem_free(odd, sizeof (overlay_dev_t)); return (err); } mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL); odd->odd_ref = 0; odd->odd_flags = 0; list_insert_tail(&overlay_dev_list, odd); mutex_exit(&overlay_dev_lock); return (0); } /* ARGSUSED */ static int overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { int i, ret; overlay_dev_t *odd; mac_perim_handle_t mph; overlay_ioc_activate_t *oiap = karg; overlay_ioc_propinfo_t *infop; overlay_ioc_prop_t *oip; overlay_prop_handle_t phdl; odd = overlay_hold_by_dlid(oiap->oia_linkid); if (odd == NULL) return (ENOENT); infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP); oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP); phdl = (overlay_prop_handle_t)infop; mac_perim_enter_by_mh(odd->odd_mh, &mph); 
mutex_enter(&odd->odd_lock); if (odd->odd_flags & OVERLAY_F_ACTIVATED) { mutex_exit(&odd->odd_lock); mac_perim_exit(mph); overlay_hold_rele(odd); kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); kmem_free(oip, sizeof (overlay_ioc_prop_t)); return (EEXIST); } mutex_exit(&odd->odd_lock); for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { const char *pname = odd->odd_plugin->ovp_props[i]; bzero(infop, sizeof (overlay_ioc_propinfo_t)); overlay_prop_init(phdl); ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl); if (ret != 0) { mac_perim_exit(mph); overlay_hold_rele(odd); kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); kmem_free(oip, sizeof (overlay_ioc_prop_t)); return (ret); } if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0) continue; bzero(oip, sizeof (overlay_ioc_prop_t)); oip->oip_size = sizeof (oip->oip_value); ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, pname, oip->oip_value, &oip->oip_size); if (ret != 0) { mac_perim_exit(mph); overlay_hold_rele(odd); kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); kmem_free(oip, sizeof (overlay_ioc_prop_t)); return (ret); } if (oip->oip_size == 0) { mac_perim_exit(mph); overlay_hold_rele(odd); kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); kmem_free(oip, sizeof (overlay_ioc_prop_t)); return (EINVAL); } } mutex_enter(&odd->odd_lock); if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) { mutex_exit(&odd->odd_lock); mac_perim_exit(mph); overlay_hold_rele(odd); kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); kmem_free(oip, sizeof (overlay_ioc_prop_t)); return (ENXIO); } ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0); odd->odd_flags |= OVERLAY_F_ACTIVATED; /* * Now that we've activated ourselves, we should indicate to the world * that we're up. Note that we may not be able to perform lookups at * this time, but our notion of being 'up' isn't dependent on that * ability. 
*/ mac_link_update(odd->odd_mh, LINK_STATE_UP); mutex_exit(&odd->odd_lock); mac_perim_exit(mph); overlay_hold_rele(odd); kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); kmem_free(oip, sizeof (overlay_ioc_prop_t)); return (0); } /* ARGSUSED */ static int overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { overlay_ioc_delete_t *oidp = karg; overlay_dev_t *odd; datalink_id_t tid; int ret; odd = overlay_hold_by_dlid(oidp->oid_linkid); if (odd == NULL) { return (ENOENT); } mutex_enter(&odd->odd_lock); /* If we're not the only hold, we're busy */ if (odd->odd_ref != 1) { mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); return (EBUSY); } if (odd->odd_flags & OVERLAY_F_IN_MUX) { mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); return (EBUSY); } /* * To remove this, we need to first remove it from dls and then remove * it from mac. The act of removing it from mac will check if there are * devices on top of this, eg. vnics. If there are, then that will fail * and we'll have to go through and recreate the dls entry. Only after * mac_unregister has succeeded, then we'll go through and actually free * everything and drop the dev lock. 
*/ ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE); if (ret != 0) { overlay_hold_rele(odd); return (ret); } ASSERT(oidp->oid_linkid == tid); ret = mac_disable(odd->odd_mh); if (ret != 0) { (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid, crgetzoneid(cred)); overlay_hold_rele(odd); return (ret); } overlay_target_quiesce(odd->odd_target); mutex_enter(&overlay_dev_lock); list_remove(&overlay_dev_list, odd); mutex_exit(&overlay_dev_lock); cv_destroy(&odd->odd_iowait); mutex_destroy(&odd->odd_lock); overlay_target_free(odd); odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); overlay_plugin_rele(odd->odd_plugin); kmem_free(odd, sizeof (overlay_dev_t)); return (0); } /* ARGSUSED */ static int overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { overlay_dev_t *odd; overlay_ioc_nprops_t *on = karg; odd = overlay_hold_by_dlid(on->oipn_linkid); if (odd == NULL) return (ENOENT); on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS; overlay_hold_rele(odd); return (0); } static int overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg) { overlay_prop_handle_t phdl = arg; overlay_prop_set_range_str(phdl, opp->ovp_name); return (0); } static int overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id) { int i; for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { if (strcmp(overlay_dev_props[i], name) == 0) { *id = i; return (0); } } for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) { *id = i + OVERLAY_DEV_NPROPS; return (0); } } return (ENOENT); } static void overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl) { uint32_t def; mac_propval_range_t range; uint_t perm; ASSERT(MAC_PERIM_HELD(odd->odd_mh)); bzero(&range, sizeof (mac_propval_range_t)); range.mpr_count = 1; if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def, sizeof (def), &range, &perm) != 0) return; if (perm == MAC_PROP_PERM_READ) overlay_prop_set_prot(phdl, 
OVERLAY_PROP_PERM_READ); else if (perm == MAC_PROP_PERM_WRITE) overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE); else if (perm == MAC_PROP_PERM_RW) overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); overlay_prop_set_default(phdl, &def, sizeof (def)); overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min, range.mpr_range_uint32[0].mpur_max); } /* ARGSUSED */ static int overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { overlay_dev_t *odd; int ret; mac_perim_handle_t mph; uint_t propid = UINT_MAX; overlay_ioc_propinfo_t *oip = karg; overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; odd = overlay_hold_by_dlid(oip->oipi_linkid); if (odd == NULL) return (ENOENT); overlay_prop_init(phdl); mac_perim_enter_by_mh(odd->odd_mh, &mph); /* * If the id is -1, then the property that we're looking for is named in * oipi_name and we should fill in its id. Otherwise, we've been given * an id and we need to turn that into a name for our plugin's sake. The * id is our own fabrication for property discovery. */ if (oip->oipi_id == -1) { /* * Determine if it's a known generic property or it belongs to a * module by checking against the list of known names. 
*/ oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0'; if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name, &propid)) != 0) { overlay_hold_rele(odd); mac_perim_exit(mph); return (ret); } oip->oipi_id = propid; if (propid >= OVERLAY_DEV_NPROPS) { ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( oip->oipi_name, phdl); overlay_hold_rele(odd); mac_perim_exit(mph); return (ret); } } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) { uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS; if (id >= odd->odd_plugin->ovp_nprops) { overlay_hold_rele(odd); mac_perim_exit(mph); return (EINVAL); } ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( odd->odd_plugin->ovp_props[id], phdl); overlay_hold_rele(odd); mac_perim_exit(mph); return (ret); } else if (oip->oipi_id < -1) { overlay_hold_rele(odd); mac_perim_exit(mph); return (EINVAL); } else { ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS); ASSERT(oip->oipi_id >= 0); propid = oip->oipi_id; (void) strlcpy(oip->oipi_name, overlay_dev_props[propid], sizeof (oip->oipi_name)); } switch (propid) { case OVERLAY_DEV_P_MTU: overlay_i_propinfo_mtu(odd, phdl); break; case OVERLAY_DEV_P_VNETID: overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); overlay_prop_set_nodefault(phdl); break; case OVERLAY_DEV_P_ENCAP: overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING); overlay_prop_set_nodefault(phdl); overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl); break; case OVERLAY_DEV_P_VARPDID: overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); overlay_prop_set_nodefault(phdl); break; default: overlay_hold_rele(odd); mac_perim_exit(mph); return (ENOENT); } overlay_hold_rele(odd); mac_perim_exit(mph); return (0); } /* ARGSUSED */ static int overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { int ret; overlay_dev_t *odd; mac_perim_handle_t mph; overlay_ioc_prop_t *oip = karg; uint_t propid, 
mtu; odd = overlay_hold_by_dlid(oip->oip_linkid); if (odd == NULL) return (ENOENT); mac_perim_enter_by_mh(odd->odd_mh, &mph); oip->oip_size = OVERLAY_PROP_SIZEMAX; oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; if (oip->oip_id == -1) { int i; for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) break; if (i == OVERLAY_DEV_NPROPS) { ret = odd->odd_plugin->ovp_ops->ovpo_getprop( odd->odd_pvoid, oip->oip_name, oip->oip_value, &oip->oip_size); overlay_hold_rele(odd); mac_perim_exit(mph); return (ret); } } propid = i; } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; if (id > odd->odd_plugin->ovp_nprops) { overlay_hold_rele(odd); mac_perim_exit(mph); return (EINVAL); } ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, odd->odd_plugin->ovp_props[id], oip->oip_value, &oip->oip_size); overlay_hold_rele(odd); mac_perim_exit(mph); return (ret); } else if (oip->oip_id < -1) { overlay_hold_rele(odd); mac_perim_exit(mph); return (EINVAL); } else { ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); ASSERT(oip->oip_id >= 0); propid = oip->oip_id; } ret = 0; switch (propid) { case OVERLAY_DEV_P_MTU: /* * The MTU is always set and retrieved through MAC, to allow for * MAC to do whatever it wants, as really that property belongs * to MAC. This is important for things where vnics have hold on * the MTU. */ mac_sdu_get(odd->odd_mh, NULL, &mtu); bcopy(&mtu, oip->oip_value, sizeof (uint_t)); oip->oip_size = sizeof (uint_t); break; case OVERLAY_DEV_P_VNETID: /* * While it's read-only while inside of a mux, we're not in a * context that can guarantee that. Therefore we always grab the * overlay_dev_t's odd_lock. 
*/ mutex_enter(&odd->odd_lock); bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t)); mutex_exit(&odd->odd_lock); oip->oip_size = sizeof (uint64_t); break; case OVERLAY_DEV_P_ENCAP: oip->oip_size = strlcpy((char *)oip->oip_value, odd->odd_plugin->ovp_name, oip->oip_size); break; case OVERLAY_DEV_P_VARPDID: mutex_enter(&odd->odd_lock); if (odd->odd_flags & OVERLAY_F_VARPD) { const uint64_t val = odd->odd_target->ott_id; bcopy(&val, oip->oip_value, sizeof (uint64_t)); oip->oip_size = sizeof (uint64_t); } else { oip->oip_size = 0; } mutex_exit(&odd->odd_lock); break; default: ret = ENOENT; } overlay_hold_rele(odd); mac_perim_exit(mph); return (ret); } static void overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) { mutex_enter(&odd->odd_lock); /* Simple case, not active */ if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { odd->odd_vid = vnetid; mutex_exit(&odd->odd_lock); return; } /* * In the hard case, we need to set the drop flag, quiesce I/O and then * we can go ahead and do everything. 
*/ odd->odd_flags |= OVERLAY_F_MDDROP; overlay_io_wait(odd, OVERLAY_F_IOMASK); mutex_exit(&odd->odd_lock); overlay_mux_remove_dev(odd->odd_mux, odd); mutex_enter(&odd->odd_lock); odd->odd_vid = vnetid; mutex_exit(&odd->odd_lock); overlay_mux_add_dev(odd->odd_mux, odd); mutex_enter(&odd->odd_lock); ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX); odd->odd_flags &= ~OVERLAY_F_MDDROP; mutex_exit(&odd->odd_lock); } /* ARGSUSED */ static int overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { int ret; overlay_dev_t *odd; overlay_ioc_prop_t *oip = karg; uint_t propid = UINT_MAX; mac_perim_handle_t mph; uint64_t maxid, *vidp; if (oip->oip_size > OVERLAY_PROP_SIZEMAX) return (EINVAL); odd = overlay_hold_by_dlid(oip->oip_linkid); if (odd == NULL) return (ENOENT); oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; mac_perim_enter_by_mh(odd->odd_mh, &mph); mutex_enter(&odd->odd_lock); if (odd->odd_flags & OVERLAY_F_ACTIVATED) { mac_perim_exit(mph); mutex_exit(&odd->odd_lock); return (ENOTSUP); } mutex_exit(&odd->odd_lock); if (oip->oip_id == -1) { int i; for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) break; if (i == OVERLAY_DEV_NPROPS) { ret = odd->odd_plugin->ovp_ops->ovpo_setprop( odd->odd_pvoid, oip->oip_name, oip->oip_value, oip->oip_size); overlay_hold_rele(odd); mac_perim_exit(mph); return (ret); } } propid = i; } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; if (id > odd->odd_plugin->ovp_nprops) { mac_perim_exit(mph); overlay_hold_rele(odd); return (EINVAL); } ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid, odd->odd_plugin->ovp_props[id], oip->oip_value, oip->oip_size); mac_perim_exit(mph); overlay_hold_rele(odd); return (ret); } else if (oip->oip_id < -1) { mac_perim_exit(mph); overlay_hold_rele(odd); return (EINVAL); } else { ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); ASSERT(oip->oip_id >= 0); propid = oip->oip_id; } ret = 0; switch (propid) 
{ case OVERLAY_DEV_P_MTU: ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu", oip->oip_value, oip->oip_size); break; case OVERLAY_DEV_P_VNETID: if (oip->oip_size != sizeof (uint64_t)) { ret = EINVAL; break; } vidp = (uint64_t *)oip->oip_value; ASSERT(odd->odd_plugin->ovp_id_size <= 8); maxid = UINT64_MAX; if (odd->odd_plugin->ovp_id_size != 8) maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL; if (*vidp >= maxid) { ret = EINVAL; break; } overlay_setprop_vnetid(odd, *vidp); break; case OVERLAY_DEV_P_ENCAP: case OVERLAY_DEV_P_VARPDID: ret = EPERM; break; default: ret = ENOENT; } mac_perim_exit(mph); overlay_hold_rele(odd); return (ret); } /* ARGSUSED */ static int overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { overlay_dev_t *odd; overlay_ioc_status_t *os = karg; odd = overlay_hold_by_dlid(os->ois_linkid); if (odd == NULL) return (ENOENT); mutex_enter(&odd->odd_lock); if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) { os->ois_status = OVERLAY_I_DEGRADED; if (odd->odd_fmamsg != NULL) { (void) strlcpy(os->ois_message, odd->odd_fmamsg, OVERLAY_STATUS_BUFLEN); } else { os->ois_message[0] = '\0'; } } else { os->ois_status = OVERLAY_I_OK; os->ois_message[0] = '\0'; } mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); return (0); } static dld_ioc_info_t overlay_ioc_list[] = { { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t), overlay_i_create, secpolicy_dl_config }, { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t), overlay_i_activate, secpolicy_dl_config }, { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t), overlay_i_delete, secpolicy_dl_config }, { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT, sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo, secpolicy_dl_config }, { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT, sizeof (overlay_ioc_prop_t), overlay_i_getprop, secpolicy_dl_config }, { OVERLAY_IOC_SETPROP, DLDCOPYIN, sizeof (overlay_ioc_prop_t), overlay_i_setprop, secpolicy_dl_config }, 
{ OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT, sizeof (overlay_ioc_nprops_t), overlay_i_nprops, secpolicy_dl_config }, { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT, sizeof (overlay_ioc_status_t), overlay_i_status, NULL } }; static int overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { int fmcap = DDI_FM_EREPORT_CAPABLE; if (cmd != DDI_ATTACH) return (DDI_FAILURE); if (overlay_dip != NULL || ddi_get_instance(dip) != 0) return (DDI_FAILURE); ddi_fm_init(dip, &fmcap, NULL); if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR, ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) return (DDI_FAILURE); if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list, DLDIOCCNT(overlay_ioc_list)) != 0) { ddi_remove_minor_node(dip, OVERLAY_CTL); return (DDI_FAILURE); } overlay_dip = dip; return (DDI_SUCCESS); } /* ARGSUSED */ static int overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) { int error; switch (cmd) { case DDI_INFO_DEVT2DEVINFO: *resp = (void *)overlay_dip; error = DDI_SUCCESS; break; case DDI_INFO_DEVT2INSTANCE: *resp = (void *)0; error = DDI_SUCCESS; break; default: error = DDI_FAILURE; break; } return (error); } static int overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { if (cmd != DDI_DETACH) return (DDI_FAILURE); mutex_enter(&overlay_dev_lock); if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) { mutex_exit(&overlay_dev_lock); return (EBUSY); } mutex_exit(&overlay_dev_lock); dld_ioc_unregister(OVERLAY_IOC); ddi_remove_minor_node(dip, OVERLAY_CTL); ddi_fm_fini(dip); overlay_dip = NULL; return (DDI_SUCCESS); } static struct cb_ops overlay_cbops = { overlay_target_open, /* cb_open */ overlay_target_close, /* cb_close */ nodev, /* cb_strategy */ nodev, /* cb_print */ nodev, /* cb_dump */ nodev, /* cb_read */ nodev, /* cb_write */ overlay_target_ioctl, /* cb_ioctl */ nodev, /* cb_devmap */ nodev, /* cb_mmap */ nodev, /* cb_segmap */ nochpoll, /* cb_chpoll */ ddi_prop_op, /* cb_prop_op */ NULL, /* cb_stream */ D_MP, /* 
cb_flag */ CB_REV, /* cb_rev */ nodev, /* cb_aread */ nodev, /* cb_awrite */ }; static struct dev_ops overlay_dev_ops = { DEVO_REV, /* devo_rev */ 0, /* devo_refcnt */ overlay_getinfo, /* devo_getinfo */ nulldev, /* devo_identify */ nulldev, /* devo_probe */ overlay_attach, /* devo_attach */ overlay_detach, /* devo_detach */ nulldev, /* devo_reset */ &overlay_cbops, /* devo_cb_ops */ NULL, /* devo_bus_ops */ NULL, /* devo_power */ ddi_quiesce_not_supported /* devo_quiesce */ }; static struct modldrv overlay_modldrv = { &mod_driverops, "Overlay Network Driver", &overlay_dev_ops }; static struct modlinkage overlay_linkage = { MODREV_1, &overlay_modldrv }; static int overlay_init(void) { mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL); list_create(&overlay_dev_list, sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_link)); overlay_mux_init(); overlay_plugin_init(); overlay_target_init(); return (DDI_SUCCESS); } static void overlay_fini(void) { overlay_target_fini(); overlay_plugin_fini(); overlay_mux_fini(); mutex_destroy(&overlay_dev_lock); list_destroy(&overlay_dev_list); } int _init(void) { int err; if ((err = overlay_init()) != DDI_SUCCESS) return (err); mac_init_ops(NULL, "overlay"); err = mod_install(&overlay_linkage); if (err != DDI_SUCCESS) { overlay_fini(); return (err); } return (0); } int _info(struct modinfo *modinfop) { return (mod_info(&overlay_linkage, modinfop)); } int _fini(void) { int err; err = mod_remove(&overlay_linkage); if (err != 0) return (err); overlay_fini(); return (0); }