xref: /illumos-gate/usr/src/uts/common/io/overlay/overlay.c (revision 20a7641f9918de8574b8b3b47dbe35c4bfc78df1)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2016 Joyent, Inc.
14  * Copyright 2022 MNX Cloud, Inc.
15  */
16 
17 /*
18  * Overlay Devices
19  *
20  * Overlay devices provide a means for creating overlay networks, a means of
21  * multiplexing multiple logical, isolated, and discrete layer two and layer
22  * three networks on top of one physical network.
23  *
24  * In general, these overlay devices encapsulate the logic to answer two
25  * different questions:
26  *
27  *   1) How should I transform a packet to put it on the wire?
28  *   2) Where should I send a transformed packet?
29  *
30  * Each overlay device is presented to the user as a GLDv3 device. While the
31  * link itself cannot have an IP interface created on top of it, it allows for
32  * additional GLDv3 devices, such as a VNIC, to be created on top of it which
33  * can be plumbed up with IP interfaces.
34  *
35  *
36  * --------------------
37  * General Architecture
38  * --------------------
39  *
40  * The logical overlay device that a user sees in dladm(8) is a combination of
41  * two different components that work together. The first component is this
42  * kernel module, which is responsible for answering question one -- how should
43  * I transform a packet to put it on the wire.
44  *
45  * The second component is what we call the virtual ARP daemon, or varpd. It is
46  * a userland component that is responsible for answering the second question --
47  * Where should I send a transformed packet. Instances of the kernel overlay
48  * GLDv3 device ask varpd the question of where should a packet go.
49  *
50  * The split was done for a few reasons. Importantly, we wanted to keep the act
51  * of generating encapsulated packets in the kernel so as to ensure that the
52  * general data path was fast and also kept simple. On the flip side, while the
53  * question of where should something go may be simple, it may often be
54  * complicated and need to interface with several different external or
55  * distributed systems. In those cases, it's simpler to allow for the full
56  * flexibility of userland to be brought to bear to solve that problem and in
57  * general, the path isn't very common.
58  *
59  * The following is what makes up the logical overlay device that a user would
60  * create with dladm(8).
61  *
62  *       Kernel                                     Userland
63  *   . . . . . . . . . . . . . . . . . . . . .   . . . . . . . . . . . . .
64  *   . +--------+   +--------+  +--------+   .   .                       .
65  *   . | VNIC 0 |   | VNIC 1 |  | VNIC 2 |   .   .                       .
66  *   . +--------+   +--------+  +--------+   .   .                       .
67  *   .     |            |           |        .   .                       .
68  *   .     |            |           |        .   .                       .
69  *   .     +------------+-----------+        .   .                       .
70  *   .                  |              . . /dev/overlay                  .
71  *   .           +--------------+      .     .   .       +------------+  .
72  *   .           |              |      .     .   .       |            |  .
73  *   .           |    Overlay   |======*=================|   Virtual  |  .
74  *   .           | GLDv3 Device |========================| ARP Daemon |  .
75  *   .           |              |            .   .       |            |  .
76  *   .           +--------------+            .   .       +------------+  .
77  *   .                  |                    .   .              |        .
78  *   .                  |                    .   .              |        .
79  *   .           +----------------+          .   .         +--------+    .
80  *   .           |  Overlay       |          .   .         | varpd  |    .
81  *   .           |  Encapsulation |          .   .         | Lookup |    .
82  *   .           |  Plugin        |          .   .         | Plugin |    .
83  *   .           +----------------+          .   .         +--------+    .
84  *   . . . . . . . . . . . . . . . . . . . . .   . . . . . . . . . . . . .
85  *
86  *
87  * This image shows the two different components and where they live.
88  * Importantly, it also shows that both the kernel overlay device and the
89  * userland varpd both support plugins. The plugins actually implement the
90  * things that users care about and the APIs have been designed to try to
91  * minimize the amount of things that a module writer needs to worry about.
92  *
93  * IDENTIFIERS
94  *
95  * Every overlay device is defined by a unique identifier which is the overlay
96  * identifier. Its purpose is similar to that of a VLAN identifier, it's a
97  * unique number that is used to differentiate between different entries on the
98  * wire.
99  *
100  * ENCAPSULATION
101  *
102  * An overlay encapsulation plugin is a kernel miscellaneous module whose
103  * purpose is to contain knowledge about how to transform packets to put them
104  * onto the wire and to take them off. An example of an encapsulation plugin is
105  * vxlan. It's also how support for things like nvgre or geneve would be brought
106  * into the system.
107  *
108  * Each encapsulation plugin defines a series of operation vectors and
109  * properties. For the full details on everything they should provide, please
110  * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
111  * for telling the system what information is required to send a packet. For
112  * example, vxlan is defined to send everything over a UDP packet and therefore
113  * requires a port and an IP address, while nvgre on the other hand is its own
114  * IP type and therefore just requires an IP address. In addition, it also
115  * provides information about the kind of socket that should be created. This is
116  * used by the kernel multiplexor, more of that in the Kernel Components
117  * section.
118  *
119  * LOOKUPS
120  *
121  * The kernel communicates requests for lookups over the character device
122  * /dev/overlay. varpd is responsible for listening for requests on that device
123  * and answering them. The character device is specific to the target path and
124  * varpd.
125  *
126  * Much as the kernel overlay module handles the bulk of the scaffolding but
127  * leaves the important work to the encapsulation plugin, varpd provides a
128  * similar role and leaves the full brunt of lookups to a userland dynamic
129  * shared object which implements the logic of lookups.
130  *
131  * Each lookup plugin defines a series of operation vectors and properties. For
132  * the full details on everything that they should provide, please read
133  * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
134  * address and asked to give an address on the physical network that it should
135  * be sent to. In addition, they handle questions related to how to handle
136  * things like broadcast and multicast traffic, etc.
137  *
138  * ----------
139  * Properties
140  * ----------
141  *
142  * A device from a dladm perspective has a unique set of properties that are
143  * combined from three different sources:
144  *
145  *   1) Generic properties that every overlay device has
146  *   2) Properties that are specific to the encapsulation plugin
147  *   3) Properties that are specific to the lookup plugin
148  *
149  * All of these are exposed in a single set of properties in dladm. Note that
150  * these are not necessarily traditional link properties. However, if something
151  * is both a traditional GLDv3 link property, say the MTU of a device, and a
152  * specific property here, then the driver ensures that all existing GLDv3
153  * specific means of manipulating it are used and wraps up its private property
154  * interfaces to ensure that works.
155  *
156  * Properties in the second and third category are prefixed with the name of
157  * their module. For example, the vxlan encapsulation module has a property
158  * called the 'listen_ip'. This property would show up in dladm as
159  * 'vxlan/listen_ip'. This allows different plugins to both use similar names
160  * for similar properties and to also have independent name spaces so that
161  * overlapping names do not conflict with anything else.
162  *
163  * While the kernel combines both sets one and two into a single coherent view,
164  * it does not do anything with respect to the properties that are owned by the
165  * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
166  * charge of bridging these two worlds into one magical experience for the user.
167  * It carries the burden of knowing about both overlay specific and varpd
168  * specific properties. Importantly, we want to maintain this distinction. We
169  * don't want to treat the kernel as an arbitrary key/value store for varpd and
170  * we want the kernel to own its own data and not have to ask userland for
171  * information that it owns.
172  *
173  * Every property in the system has the following attributes:
174  *
175  *   o A name
176  *   o A type
177  *   o A size
178  *   o Permissions
179  *   o Default value
180  *   o Valid value ranges
181  *   o A value
182  *
183  * Everything except for the value is obtained by callers through the propinfo
184  * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
185  * currently 256 bytes.
186  *
187  * The following are the supported types of properties:
188  *
189  *	OVERLAY_PROP_T_INT
190  *
191  *		A signed integer, its length is 8 bytes, corresponding to a
192  *		int64_t.
193  *
194  *	OVERLAY_PROP_T_UINT
195  *
196  *		An unsigned integer, its length is 8 bytes, corresponding to a
197  *		uint64_t.
198  *
199  *	OVERLAY_PROP_T_IP
200  *
201  *		A struct in6_addr, it has a fixed size.
202  *
203  *	OVERLAY_PROP_T_STRING
204  *
205  *		A null-terminated character string encoded in either ASCII or
206  *		UTF-8. Note that the size of the string includes the null
207  *		terminator.
208  *
209  * The next thing that we apply to a property is its permission. The permissions
210  * are put together by the bitwise or of the following flags and values.
211  *
212  *	OVERLAY_PROP_PERM_REQ
213  *
214  *		This indicates a required property. A property that is required
215  *		must be set by a consumer before the device can be created. If a
216  *		required property has a default value, this constraint is
217  *		loosened because the default supplies the value.
218  *
219  *	OVERLAY_PROP_PERM_READ
220  *
221  *		This indicates that a property can be read. All properties will
222  *		have this value set.
223  *
224  *	OVERLAY_PROP_PERM_WRITE
225  *
226  *		This indicates that a property can be written to and thus
227  *		updated by userland. Properties that are only intended to
228  *		display information, will not have OVERLAY_PROP_PERM_WRITE set.
229  *
230  * In addition, a few additional values are defined as a convenience to
231  * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
232  * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second,
233  * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
234  * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
235  * property should generally be a constant across its lifetime.
236  *
237  * A property may optionally have a default value. If it does have a default
238  * value, and that property is not set to be a different value, then the default
239  * value is inherited automatically. It also means that if the default value is
240  * acceptable, there is no need to set the value for a required property. For
241  * example, the vxlan module has the vxlan/listen_port property which is
242  * required, but has a default value of 4789 (the IANA assigned port). Because
243  * of that default value, there is no need for it to be set.
244  *
245  * Finally, a property may declare a list of valid values. These valid values
246  * are used for display purposes, they are not enforced by the broader system,
247  * but merely allow a means for the information to be communicated to the user
248  * through dladm(8). Like a default value, this is optional.
249  *
250  * The general scaffolding does not do very much with respect to the getting and
251  * setting of properties. That is really owned by the individual plugins
252  * themselves.
253  *
254  * -----------------------------
255  * Destinations and Plugin Types
256  * -----------------------------
257  *
258  * Both encapsulation and lookup plugins define the kinds of destinations that
259  * they know how to support. There are three different pieces of information
260  * that can be used to address a destination currently, all of which are
261  * summarized in the type overlay_point_t. Any combination of these is
262  * supported.
263  *
264  *	OVERLAY_PLUGIN_D_ETHERNET
265  *
266  *		An Ethernet MAC address is required.
267  *
268  *	OVERLAY_PLUGIN_D_IP
269  *
270  *		An IP address is required. All IP addresses used by the overlay
271  *		system are transmitted as IPv6 addresses. IPv4 addresses can be
272  *		represented by using IPv4-mapped IPv6 addresses.
273  *
274  *	OVERLAY_PLUGIN_D_PORT
275  *
276  *		A TCP/UDP port is required.
277  *
278  * A kernel encapsulation plugin declares which of these that it requires, it's
279  * a static set. On the other hand, a userland lookup plugin can be built to
280  * support all of these or any combination thereof. It gets passed the required
281  * destination type, based on the kernel encapsulation method, and then it makes
282  * the determination as to whether or not it supports it. For example, the
283  * direct plugin can support either an IP or both an IP and a port, it simply
284  * doesn't display the direct/dest_port property in the cases where a port is
285  * not required to support this.
286  *
287  * The user lookup plugins have two different modes of operation which
288  * determines how they interact with the broader system and how look ups are
289  * performed. These types are:
290  *
291  *	OVERLAY_TARGET_POINT
292  *
293  *		A point to point plugin has a single static definition for where
294  *		to send all traffic. Every packet in the system always gets sent
295  *		to the exact same destination which is programmed into the
296  *		kernel when the general device is activated.
297  *
298  *	OVERLAY_TARGET_DYNAMIC
299  *
300  *		A dynamic plugin does not have a single static definition.
301  *		Instead, for each destination, the kernel makes an asynchronous
302  *		request to varpd to determine where the packet should be routed,
303  *		and if a specific destination is found, then that destination is
304  *		cached in the overlay device's target cache.
305  *
306  * This distinction, while important for the general overlay device's operation,
307  * is not important to the encapsulation plugins. They don't need to know about
308  * any of these pieces. It's just a concern for varpd, the userland plugin, and
309  * the general overlay scaffolding.
310  *
311  * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
312  * maintain a target cache, and instead just keeps track of the destination and
313  * always sends encapsulated packets to that address. When the target type is of
314  * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
315  * destinations. These destinations are kept around in an instance of a
316  * reference hash that is specific to the given overlay device. Entries in the
317  * cache can be invalidated and replaced by varpd and its lookup plugins.
318  *
319  * ----------------------------------
320  * Kernel Components and Architecture
321  * ----------------------------------
322  *
323  * There are multiple pieces inside the kernel that work together, there is the
324  * general overlay_dev_t structure, which is the logical GLDv3 device, but it
325  * itself has references to things like an instance of an encapsulation plugin,
326  * a pointer to a mux and a target cache. It can roughly be summarized in the
327  * following image:
328  *
329  *     +------------------+
330  *     | global           |
331  *     | overlay list     |
332  *     | overlay_dev_list |
333  *     +------------------+
334  *        |
335  *        |  +-----------------------+            +---------------+
336  *        +->| GLDv3 Device          |----------->| GLDv3 Device  | -> ...
337  *           | overlay_dev_t         |            | overlay_dev_t |
338  *           |                       |            +---------------+
339  *           |                       |
340  *           | mac_handle_t     -----+---> GLDv3 handle to MAC
341  *           | datalink_id_t    -----+---> Datalink ID used by DLS
342  *           | overlay_dev_flag_t ---+---> Device state
343  *           | uint_t           -----+---> Current device MTU
344  *           | uint_t           -----+---> In-progress RX operations
345  *           | uint_t           -----+---> In-progress TX operations
346  *           | char[]           -----+---> FMA degraded message
347  *           | void *           -----+---> plugin private data
348  *           | overlay_target_t * ---+---------------------+
349  *           | overlay_plugin_t * ---+---------+           |
350  *           +-----------------------+         |           |
351  *                           ^                 |           |
352  *   +--------------------+  |                 |           |
353  *   | Kernel Socket      |  |                 |           |
354  *   | Multiplexor        |  |                 |           |
355  *   | overlay_mux_t      |  |                 |           |
356  *   |                    |  |                 |           |
357  *   | avl_tree_t        -+--+                 |           |
358  *   | uint_t            -+--> socket family   |           |
359  *   | uint_t            -+--> socket type     |           |
360  *   | uint_t            -+--> socket protocol |           |
361  *   | ksocket_t         -+--> I/O socket      |           |
362  *   | struct sockaddr * -+--> ksocket address |           |
363  *   | overlay_plugin_t --+--------+           |           |
364  *   +--------------------+        |           |           |
365  *                                 |           |           |
366  *   +-------------------------+   |           |           |
367  *   | Encap Plugin            |<--+-----------+           |
368  *   | overlay_plugin_t        |                           |
369  *   |                         |                           |
370  *   | char *               ---+--> plugin name            |
371  *   | overlay_plugin_ops_t * -+--> plugin downcalls       |
372  *   | char ** (props)      ---+--> property list          |
373  *   | uint_t               ---+--> id length              |
374  *   | overlay_plugin_flags_t -+--> plugin flags           |
375  *   | overlay_plugin_dest_t --+--> destination type       v
376  *   +-------------------------+                    +-------------------------+
377  *                                                  |   Target Cache          |
378  *                                                  |   overlay_target_t      |
379  *                                                  |                         |
380  *                                    cache mode <--+- overlay_target_mode_t  |
381  *                                     dest type <--+- overlay_plugin_dest_t  |
382  *                                   cache flags <--+- overlay_target_flag_t  |
383  *                                     varpd id  <--+- uint64_t               |
384  *                       outstanding varpd reqs. <--+- uint_t                 |
385  *                   OVERLAY_TARGET_POINT state  <--+- overlay_target_point_t |
386  *               OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t   |
387  *                                              |   +-------------------------+
388  *                      +-----------------------+
389  *                      |
390  *                      v
391  *   +-------------------------------+   +------------------------+
392  *   | Target Entry                  |-->| Target Entry           |--> ...
393  *   | overlay_target_entry_t        |   | overlay_target_entry_t |
394  *   |                               |   +------------------------+
395  *   |                               |
396  *   | overlay_target_entry_flags_t -+--> Entry flags
397  *   | uint8_t[ETHERADDRL]        ---+--> Target MAC address
398  *   | overlay_target_point_t     ---+--> Target underlay address
399  *   | mblk_t *                   ---+--> outstanding mblk head
400  *   | mblk_t *                   ---+--> outstanding mblk tail
401  *   | size_t                     ---+--> outstanding mblk size
402  *   +-------------------------------+
403  *
404  * The primary entries that we care about are the overlay_dev_t, which
405  * correspond to each overlay device that is created with dladm(8). Globally,
406  * these devices are maintained in a simple list_t which is protected with a
407  * lock.  Hence, these include important information such as the mac_handle_t
408  * and a datalink_id_t which is used to interact with the broader MAC and DLS
409  * ecosystem. We also maintain additional information such as the current state,
410  * outstanding operations, the mtu, and importantly, the plugin's private data.
411  * This is the instance of an encapsulation plugin that gets created as part of
412  * creating an overlay device. Another aspect of this is that the overlay_dev_t
413  * also includes information with respect to FMA. For more information, see the
414  * FMA section.
415  *
416  * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
417  * is the encapsulation plugin. This allows the device to make downcalls into it
418  * based on doing things like getting and setting properties. Otherwise, the
419  * plugin itself is a fairly straightforward entity. They are maintained in an
420  * (not pictured above) list. The plugins themselves mostly maintain things like
421  * the static list of properties, what kind of destination they require, and the
422  * operations vector. A given module may contain more if necessary.
423  *
424  * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
425  * maintains a ksocket and it is through the mux that we send and receive
426  * message blocks. The mux represents a socket type and address, as well as a
427  * plugin. Multiple overlay_dev_t devices may then share the same mux. For
428  * example, consider the case where you have different instances of vxlan all on
429  * the same underlay network. These would all logically share the same IP
430  * address and port that packets are sent and received on; however, what differs
431  * is the decapsulation ID.
432  *
433  * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
434  * a socket, we enable a direct callback on the ksocket. This means that
435  * whenever a message block chain is received, rather than sitting there and
436  * getting a callback in a context and kicking that back out to a taskq, the
437  * data instead comes into the callback function overlay_mux_recv().
438  *
439  * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
440  * function) to transmit. It receives encapsulated packets, decapsulates them to
441  * determine the overlay identifier, looks up the given device that matches that
442  * identifier, and then causes the broader MAC world to receive the packet with
443  * a call to mac_rx().
444  *
445  * Today, we don't do too much that's special with the ksocket; however, as
446  * hardware is gaining understanding for these encapsulation protocols, we'll
447  * probably want to think of better ways to get those capabilities passed down
448  * and potentially better ways to program receive filters so they get directly
449  * to us. Though, that's all fantasy future land.
450  *
451  * The next part of the puzzle is the target cache. The purpose of the target
452  * cache is to cache where we should send a packet on the underlay network,
453  * given its mac address. The target cache operates in two modes depending on
454  * whether the lookup module was declared to be OVERLAY_TARGET_POINT or
455  * OVERLAY_TARGET_DYNAMIC.
456  *
457  * In the case where the target cache has been programmed to be
458  * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
459  * which has the destination that we send everything, no matter the destination
460  * mac address.
461  *
462  * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
463  * are much more interesting and as a result, more complicated. We primarily
464  * store lists of overlay_target_entry_t's which are stored in both an avl tree
465  * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
466  * is only used for a few of the target ioctls used to dump data such that we
467  * can get a consistent iteration order for things like dladm show-overlay -t.
468  * The key that we use for the reference hashtable is based on the mac address
469  * in the cache and currently we just do a simple CRC32 to transform it into a
470  * hash.
471  *
472  * Each entry maintains a set of flags to indicate the current status of the
473  * request. The flags may indicate one of three states: that current cache entry
474  * is valid, that the current cache entry has been directed to drop all output,
475  * and that the current cache entry is invalid and may be being looked up. In
476  * the case where it's valid, we just take the destination address and run with
477  * it.
478  *
479  * If it's invalid and a lookup has not been made, then we start the process
480  * that prepares a query that will make its way up to varpd. The cache entry
481  * maintains a message block chain of outstanding message blocks and a
482  * size. These lists are populated only when we don't know the answer as to
483  * where should these be sent. The size entry is used to cap the amount of
484  * outstanding data that we don't know the answer to. If we exceed a cap on the
485  * amount of outstanding data (currently 1 Mb), then we'll drop any additional
486  * packets. Once we get an answer indicating a valid destination, we transmit
487  * any outstanding data to that place. The full story on how we look that up
488  * is discussed in the section on the Target Cache Life Cycle.
489  *
490  * ------------------------
491  * FMA and Degraded Devices
492  * ------------------------
493  *
494  * Every kernel overlay device keeps track of its FMA state. Today in FMA we
495  * cannot represent partitions between resources nor can we represent that a
496  * given minor node of a pseudo device has failed -- if we degrade the overlay
497  * device, then the entire dev_info_t is degraded. However, we still want to be
498  * able to indicate to administrators that things may go wrong.
499  *
500  * To this end, we've added a notion of a degraded state to every overlay
501  * device. This state is primarily dictated by userland and it can happen for
502  * various reasons. Generally, because a userland lookup plugin has been
503  * partitioned, or something has gone wrong such that there is no longer any
504  * userland lookup module for a device, then we'll mark it degraded.
505  *
506  * As long as any of our minor instances is degraded, then we'll fire off the
507  * FMA event to note that. Once the last degraded instance is no longer
508  * degraded, then we'll end up telling FMA that we're all clean.
509  *
510  * To help administrators get a better sense of which of the various minor
511  * devices is wrong, we store the odd_fmamsg[] character array. This character
512  * array can be fetched by running dladm show-overlay -f.
513  *
514  * Note, that it's important that we do not update the link status of the
515  * devices. We want to remain up as much as possible. By changing the link in a
516  * degraded state, this may end up making things worse. We may still actually
517  * have information in the target cache and if we mark the link down, that'll
518  * result in not being able to use it. The reason being that this'll mark all
519  * the downstream VNICs down which will go to IP and from there we end up
520  * dealing with sadness.
521  *
522  * -----------------------
523  * Target Cache Life Cycle
524  * -----------------------
525  *
526  * This section only applies when we have a lookup plugin of
527  * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
528  * OVERLAY_TARGET_POINT.
529  *
530  * While we got into the target cache in the general architecture section, it's
531  * worth going into more details as to how this actually works and showing some
532  * examples and state machines. Recall that a target cache entry basically has
533  * the following state transition diagram:
534  *
535  * Initial state
536  *    . . .           . . . first access       . . . varpd lookup enqueued
537  *        .           .                        .
538  *        .           .                        .
539  *     +-------+      .     +----------+       .
540  *     |  No   |------*---->| Invalid  |-------*----+
541  *     | Entry |            |  Entry   |            |
542  *     +-------+            +----------+            |
543  *                 varpd      ^      ^   varpd      |
544  *                 invalidate |      |   drop       |
545  *                      . . . *      * . .          v
546  *          +-------+         |      |         +---------+
547  *          | Entry |--->-----+      +----<----| Entry   |
548  *          | Valid |<----------*---------<----| Pending |->-+     varpd
549  *          +-------+           .              +---------+   * . . drop, but
550  *                              . varpd                ^     |     other queued
551  *                              . success              |     |     entries
552  *                                                     +-----+
553  *
554  * When the table is first created, it is empty. As we attempt to lookup entries
555  * and we find there is no entry at all, we'll create a new table entry for it.
556  * At that point the entry is technically in an invalid state, that means that
557  * we have no valid data from varpd. In that case, we'll go ahead and queue the
558  * packet into the entry's pending chain, and queue a varpd lookup, setting the
559  * OVERLAY_ENTRY_F_PENDING flag in the process.
560  *
561  * If additional mblk_t's come in for this entry, we end up appending them to
562  * the tail of the chain, if and only if, we don't exceed the threshold for the
563  * amount of space they can take up. An entry remains pending until we get a
564  * varpd reply. If varpd replies with a valid result, we move to the valid
565  * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one
566  * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
567  *
568  * Once an entry is valid, it stays valid until user land tells us to invalidate
569  * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and
570  * OVERLAY_TARG_CACHE_SET respectively.
571  *
572  * If the lookup fails with a call to drop the packet, then the next state is
573  * determined by the state of the queue. If the set of outstanding entries is
574  * empty, then we just transition back to the invalid state. If instead, the
575  * set of outstanding entries is not empty, then we'll queue another entry and
576  * stay in the same state, repeating this until the number of requests is
577  * drained.
578  *
579  * The following images describe the flow of a given lookup and where the
580  * overlay_target_entry_t is at any given time.
581  *
582  *     +-------------------+
583  *     | Invalid Entry     |		An entry starts off as an invalid entry
584  *     | de:ad:be:ef:00:00 |		and only exists in the target cache.
585  *     +-------------------+
586  *
587  *	~~~~
588  *
589  *     +---------------------+
590  *     | Global list_t       |		A mblk_t comes in for an entry. We
591  *     | overlay_target_list |		append it to the overlay_target_list.
592  *     +---------------------+
593  *                   |
594  *                   v
595  *             +-------------------+      +-------------------+
596  *             | Pending Entry     |----->| Pending Entry     |--->...
597  *             | 42:5e:1a:10:d6:2d |      | de:ad:be:ef:00:00 |
598  *             +-------------------+      +-------------------+
599  *
600  *	~~~~
601  *
602  *     +--------------------------+
603  *     | /dev/overlay minor state |	User land said that it would look up an
604  *     | overlay_target_hdl_t     |	entry for us. We remove it from the
605  *     +--------------------------+	global list and add it to the handle's
606  *                  |			outstanding list.
607  *                  |
608  *                  v
609  *            +-------------------+      +-------------------+
610  *            | Pending Entry     |----->| Pending Entry     |
611  *            | 90:b8:d0:79:02:dd |      | de:ad:be:ef:00:00 |
612  *            +-------------------+      +-------------------+
613  *
614  *	~~~~
615  *
616  *     +-------------------+
617  *     | Valid Entry       |		varpd returned an answer with
618  *     | de:ad:be:ef:00:00 |		OVERLAY_TARG_RESPOND and the target
619  *     | 10.169.23.42:4789 |		cache entry is now populated with a
620  *     +-------------------+		destination and marked as valid
621  *
622  *
623  * The lookup mechanism is performed via a series of operations on the character
624  * pseudo-device /dev/overlay. The only thing that uses this device is the
625  * userland daemon varpd. /dev/overlay is a cloneable device, each open of it
626  * granting a new minor number which maintains its own state. We maintain this
627  * state so that way if an outstanding lookup was queued to something that
628  * crashed or closed its handle without responding, we can know about this and
629  * thus handle it appropriately.
630  *
631  * When a lookup is first created it's added to our global list of outstanding
632  * lookups. To service requests, userland is required to perform an ioctl to ask
633  * for a request. We will block it in the kernel a set amount of time waiting
634  * for a request. When we give a request to a given minor instance of the
635  * device, we remove it from the global list and append the request to the
636  * device's list of outstanding entries, for the reasons we discussed above.
637  * When a lookup comes in, we give user land a smaller amount of information
638  * specific to that packet, the overlay_targ_lookup_t. It includes a request id
639  * to identify this, and then the overlay id, the varpd id, the header and
640  * packet size, the source and destination mac address, the SAP, and any
641  * potential VLAN header.
642  *
643  * At that point, it stays in that outstanding list until one of two ioctls are
644  * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time,
645  * userland may also perform other operations. For example, it may use
646  * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth
647  * analysis of what to do beyond what we gave it initially. This is useful for
648  * providing proxy arp and the like. Finally, there are two other ioctls that
649  * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the
650  * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which
651  * causes us to encapsulate and send out the packet they've given us.
652  *
653  *
654  * Finally, through the target cache, several ioctls are provided to allow for
655  * interrogation and management of the cache. They allow for individual entries
656  * to be retrieved, set, or have the entire table flushed. For the full set of
657  * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h.
658  *
659  * ------------------
660  * Sample Packet Flow
661  * ------------------
662  *
663  * There's a lot of pieces here, hopefully an example of how this all fits
664  * together will help clarify and elucidate what's going on. We're going to
665  * first track an outgoing packet, eg. one that is sent from an IP interface on
666  * a VNIC on top of an overlay device, and then we'll look at what it means to
667  * respond to that.
668  *
669  *
670  *    +----------------+        +--------------+            +------------------+
671  *    | IP/DLS send    |------->| MAC sends it |----------->| mblk_t reaches   |
672  *    | packet to MAC  |        | to the GLDv3 |            | overlay GLDv3 tx |
673  *    +----------------+        | VNIC device  |            | overlay_m_tx()   |
674  *                              +--------------+            +------------------+
675  *                                                                   |
676  *                             . lookup              . cache         |
677  *                             . drop                . miss          v
678  *            +---------+      .       +--------+    .      +------------------+
679  *            | freemsg |<-----*-------| varpd  |<---*------| Lookup each mblk |
680  *            | mblk_t  |              | lookup |           | in the target    |
681  *            +---------+              | queued |           | cache            |
682  *                ^                    +--------+           +------------------+
683  *      on send   |                        |                         |     cache
684  *      error . . *                        *. . lookup               * . . hit
685  *                |                        |    success              v
686  *                |                        |                +------------------+
687  *    +-----------------+                  +--------------->| call plugin      |
688  *    | Send out        |                                   | ovpo_encap() to  |
689  *    | overlay_mux_t's |<----------------------------------| get encap mblk_t |
690  *    | ksocket         |                                   +------------------+
691  *    +-----------------+
692  *
693  * The receive end point looks a little different and looks more like:
694  *
695  *  +------------------+     +----------------+    +-----------+
696  *  | mblk_t comes off |---->| enter netstack |--->| delivered |---+
697  *  | the physical     |     | IP stack       |    |     to    |   * . . direct
698  *  | device           |     +----------------+    |  ksocket  |   |   callback
699  *  +------------------+                           +-----------+   |
700  *                       . overlay id                              |
701  *                       . not found                               v
702  *       +-----------+   .      +-----------------+       +--------------------+
703  *       | freemsg   |<--*------| call plugin     |<------| overlay_mux_recv() |
704  *       | mblk_t    |          | ovpo_decap() to |       +--------------------+
705  *       +-----------+          | decap mblk_t    |
706  *                              +-----------------+
707  *                                     |
708  *                                     * . . overlay id
709  *                                     v     found
710  *                                 +--------+      +----------------+
711  *                                 | adjust |----->| call mac_rx    |
712  *                                 | mblk_t |      | on original    |
713  *                                 +--------+      | decaped packet |
714  *                                                 +----------------+
715  *
716  * ------------------
717  * Netstack Awareness
718  * ------------------
719  *
720  * In the above image we note that this enters a netstack. Today the only
721  * netstack that can be is the global zone as the overlay driver itself is not
722  * exactly netstack aware. What this really means is that varpd cannot run in a
723  * non-global zone and an overlay device cannot belong to a non-global zone.
724  * Non-global zones can still have a VNIC assigned to them that's been created
725  * over the overlay device the same way they would if it had been created over
726  * an etherstub or a physical device.
727  *
728  * The majority of the work to make it netstack aware is straightforward and the
729  * biggest thing is to create a netstack module that allows us to hook into
730  * netstack (and thus zone) creation and destruction.  From there, we need to
731  * amend the target cache lookup routines that we discussed earlier to not have
732  * a global outstanding list and a global list of handles, but rather, one per
733  * netstack.
734  *
735  * For the mux, we'll need to open the ksocket in the context of the zone, we
736  * can likely do this with a properly composed credential, but we'll need to do
737  * some more work on that path. Finally, we'll want to make sure the dld ioctls
738  * are aware of the zoneid of the caller and we use that appropriately and store
739  * it in the overlay_dev_t.
740  *
741  * -----------
742  * GLDv3 Notes
743  * -----------
744  *
745  * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
746  * relevant and other parts are much less relevant for us. For example, the
747  * GLDv3 is used to toggle the device being put into and out of promiscuous
748  * mode, to program MAC addresses for unicast and multicast hardware filters.
749  * Today, an overlay device doesn't have a notion of promiscuous mode nor does
750  * it have a notion of unicast and multicast addresses programmed into the
751  * device. Instead, for the purposes of the hardware filter, we don't do
752  * anything and just always accept new addresses being added and removed.
753  *
754  * If the GLDv3 start function has not been called, then we will not use this
755  * device for I/O purposes. Any calls to transmit or receive should be dropped,
756  * though the GLDv3 guarantees us that transmit will not be called without
757  * calling start. Similarly, once stop is called, then no packets can be dealt
758  * with.
759  *
760  * Today we don't support the stat interfaces, though there's no good reason
761  * that we shouldn't assemble some of the stats based on what we have in the
762  * future.
763  *
764  * When it comes to link properties, many of the traditional link properties do
765  * not apply and many others MAC handles for us. For example, we don't need to
766  * implement anything for overlay_m_getprop() to deal with returning the MTU, as
767  * MAC never calls into us for that. As such, there isn't much of anything to
768  * support in terms of properties.
769  *
770  * Today, we don't support any notion of hardware capabilities. However, if
771  * future NIC hardware or other changes to the system cause it to make sense for
772  * us to emulate logical groups, then we should do that. However, we still do
773  * implement a capab function so that we can identify ourselves as an overlay
774  * device to the broader MAC framework. This is done mostly so that a device
775  * created on top of us can have fanout rings as we don't try to lie about a
776  * speed for our device.
777  *
778  * The other question is what should be done for a device's MTU and margin. We
779  * set our minimum supported MTU to be the minimum value that an IP network may
780  * be set to, 576 -- which mimics what an etherstub does. On the flip side, we
781  * have our upper bound set to 8900. This value comes from the fact that a lot
782  * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
783  * bytes, which isn't exactly the most accurate number, but it'll be good enough
784  * for now. Because of that, our default MTU off of these devices is 1400, as
785  * the default MTU for everything is usually 1500 or whatever the underlying
786  * device is at; however, this is a bit simpler than asking the netstack what
787  * are all the IP interfaces at. It also calls into question how PMTU and PMTU
788  * discovery should work here. The challenge, especially for
789  * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's
790  * not clear that if you have a single bad entry that the overall MTU should be
791  * lowered. Instead, we should figure out a better way of determining these
792  * kinds of PMTU errors and appropriately alerting the administrator via FMA.
793  *
794  * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
795  * or not the underlying encapsulation device supports VLAN tags. If it does,
796  * then we'll set the margin to allow for it, otherwise, we will not.
797  */
798 
799 #include <sys/conf.h>
800 #include <sys/errno.h>
801 #include <sys/stat.h>
802 #include <sys/ddi.h>
803 #include <sys/sunddi.h>
804 #include <sys/modctl.h>
805 #include <sys/policy.h>
806 #include <sys/stream.h>
807 #include <sys/strsubr.h>
808 #include <sys/strsun.h>
809 #include <sys/types.h>
810 #include <sys/kmem.h>
811 #include <sys/param.h>
812 #include <sys/sysmacros.h>
813 #include <sys/ddifm.h>
814 
815 #include <sys/dls.h>
816 #include <sys/dld_ioc.h>
817 #include <sys/mac_provider.h>
818 #include <sys/mac_client_priv.h>
819 #include <sys/mac_ether.h>
820 #include <sys/vlan.h>
821 
822 #include <sys/overlay_impl.h>
823 
824 dev_info_t *overlay_dip;
825 static kmutex_t overlay_dev_lock;
826 static list_t overlay_dev_list;
827 static uint8_t overlay_macaddr[ETHERADDRL] =
828 	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
829 
/*
 * Indices of the device-level properties. Each value is the position of the
 * property's name in overlay_dev_props[].
 */
typedef enum overlay_dev_prop {
	OVERLAY_DEV_P_MTU = 0,
	OVERLAY_DEV_P_VNETID = 1,
	OVERLAY_DEV_P_ENCAP = 2,
	OVERLAY_DEV_P_VARPDID = 3
} overlay_dev_prop_t;

/* Number of entries in overlay_dev_props[]. */
#define	OVERLAY_DEV_NPROPS	4

/* Names of the device-level properties, indexed by overlay_dev_prop_t. */
static const char *overlay_dev_props[] = {
	[OVERLAY_DEV_P_MTU] = "mtu",
	[OVERLAY_DEV_P_VNETID] = "vnetid",
	[OVERLAY_DEV_P_ENCAP] = "encap",
	[OVERLAY_DEV_P_VARPDID] = "varpd/id"
};

/* Smallest, default, and largest MTU that an overlay device accepts. */
#define	OVERLAY_MTU_MIN	576
#define	OVERLAY_MTU_DEF	1400
#define	OVERLAY_MTU_MAX	8900
848 
849 overlay_dev_t *
850 overlay_hold_by_dlid(datalink_id_t id)
851 {
852 	overlay_dev_t *o;
853 
854 	mutex_enter(&overlay_dev_lock);
855 	for (o = list_head(&overlay_dev_list); o != NULL;
856 	    o = list_next(&overlay_dev_list, o)) {
857 		if (id == o->odd_linkid) {
858 			mutex_enter(&o->odd_lock);
859 			o->odd_ref++;
860 			mutex_exit(&o->odd_lock);
861 			mutex_exit(&overlay_dev_lock);
862 			return (o);
863 		}
864 	}
865 
866 	mutex_exit(&overlay_dev_lock);
867 	return (NULL);
868 }
869 
870 void
871 overlay_hold_rele(overlay_dev_t *odd)
872 {
873 	mutex_enter(&odd->odd_lock);
874 	ASSERT(odd->odd_ref > 0);
875 	odd->odd_ref--;
876 	mutex_exit(&odd->odd_lock);
877 }
878 
879 void
880 overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
881 {
882 	ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
883 	ASSERT(MUTEX_HELD(&odd->odd_lock));
884 
885 	if (flag & OVERLAY_F_IN_RX)
886 		odd->odd_rxcount++;
887 	if (flag & OVERLAY_F_IN_TX)
888 		odd->odd_txcount++;
889 	odd->odd_flags |= flag;
890 }
891 
892 void
893 overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
894 {
895 	boolean_t signal = B_FALSE;
896 
897 	ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
898 	ASSERT(MUTEX_HELD(&odd->odd_lock));
899 
900 	if (flag & OVERLAY_F_IN_RX) {
901 		ASSERT(odd->odd_rxcount > 0);
902 		odd->odd_rxcount--;
903 		if (odd->odd_rxcount == 0) {
904 			signal = B_TRUE;
905 			odd->odd_flags &= ~OVERLAY_F_IN_RX;
906 		}
907 	}
908 	if (flag & OVERLAY_F_IN_TX) {
909 		ASSERT(odd->odd_txcount > 0);
910 		odd->odd_txcount--;
911 		if (odd->odd_txcount == 0) {
912 			signal = B_TRUE;
913 			odd->odd_flags &= ~OVERLAY_F_IN_TX;
914 		}
915 	}
916 
917 	if (signal == B_TRUE)
918 		cv_broadcast(&odd->odd_iowait);
919 }
920 
921 static void
922 overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
923 {
924 	ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
925 	ASSERT(MUTEX_HELD(&odd->odd_lock));
926 
927 	while (odd->odd_flags & flag) {
928 		cv_wait(&odd->odd_iowait, &odd->odd_lock);
929 	}
930 }
931 
932 void
933 overlay_dev_iter(overlay_dev_iter_f func, void *arg)
934 {
935 	overlay_dev_t *odd;
936 
937 	mutex_enter(&overlay_dev_lock);
938 	for (odd = list_head(&overlay_dev_list); odd != NULL;
939 	    odd = list_next(&overlay_dev_list, odd)) {
940 		if (func(odd, arg) != 0) {
941 			mutex_exit(&overlay_dev_lock);
942 			return;
943 		}
944 	}
945 	mutex_exit(&overlay_dev_lock);
946 }
947 
948 /* ARGSUSED */
949 static int
950 overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
951 {
952 	return (ENOTSUP);
953 }
954 
955 static int
956 overlay_m_start(void *arg)
957 {
958 	overlay_dev_t *odd = arg;
959 	overlay_mux_t *mux;
960 	int ret, domain, family, prot;
961 	struct sockaddr_storage storage;
962 	socklen_t slen;
963 
964 	mutex_enter(&odd->odd_lock);
965 	if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) {
966 		mutex_exit(&odd->odd_lock);
967 		return (EAGAIN);
968 	}
969 	mutex_exit(&odd->odd_lock);
970 
971 	ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
972 	    &family, &prot, (struct sockaddr *)&storage, &slen);
973 	if (ret != 0)
974 		return (ret);
975 
976 	mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
977 	    (struct sockaddr *)&storage, slen, &ret);
978 	if (mux == NULL)
979 		return (ret);
980 
981 	overlay_mux_add_dev(mux, odd);
982 	odd->odd_mux = mux;
983 	mutex_enter(&odd->odd_lock);
984 	ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
985 	odd->odd_flags |= OVERLAY_F_IN_MUX;
986 	mutex_exit(&odd->odd_lock);
987 
988 	return (0);
989 }
990 
991 static void
992 overlay_m_stop(void *arg)
993 {
994 	overlay_dev_t *odd = arg;
995 
996 	/*
997 	 * The MAC Perimeter is held here, so we don't have to worry about
998 	 * synchronizing this with respect to metadata operations.
999 	 */
1000 	mutex_enter(&odd->odd_lock);
1001 	VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
1002 	VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
1003 	odd->odd_flags |= OVERLAY_F_MDDROP;
1004 	overlay_io_wait(odd, OVERLAY_F_IOMASK);
1005 	mutex_exit(&odd->odd_lock);
1006 
1007 	overlay_mux_remove_dev(odd->odd_mux, odd);
1008 	overlay_mux_close(odd->odd_mux);
1009 	odd->odd_mux = NULL;
1010 
1011 	mutex_enter(&odd->odd_lock);
1012 	odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1013 	odd->odd_flags &= ~OVERLAY_F_MDDROP;
1014 	VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
1015 	mutex_exit(&odd->odd_lock);
1016 }
1017 
1018 /*
1019  * For more info on this, see the big theory statement.
1020  */
1021 /* ARGSUSED */
1022 static int
1023 overlay_m_promisc(void *arg, boolean_t on)
1024 {
1025 	return (0);
1026 }
1027 
1028 /*
1029  * For more info on this, see the big theory statement.
1030  */
1031 /* ARGSUSED */
1032 static int
1033 overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
1034 {
1035 	return (0);
1036 }
1037 
1038 /*
1039  * For more info on this, see the big theory statement.
1040  */
1041 /* ARGSUSED */
1042 static int
1043 overlay_m_unicast(void *arg, const uint8_t *macaddr)
1044 {
1045 	return (0);
1046 }
1047 
1048 mblk_t *
1049 overlay_m_tx(void *arg, mblk_t *mp_chain)
1050 {
1051 	overlay_dev_t *odd = arg;
1052 	mblk_t *mp, *ep;
1053 	int ret;
1054 	ovep_encap_info_t einfo;
1055 	struct msghdr hdr;
1056 
1057 	mutex_enter(&odd->odd_lock);
1058 	if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
1059 	    !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1060 		mutex_exit(&odd->odd_lock);
1061 		freemsgchain(mp_chain);
1062 		return (NULL);
1063 	}
1064 	overlay_io_start(odd, OVERLAY_F_IN_TX);
1065 	mutex_exit(&odd->odd_lock);
1066 
1067 	bzero(&hdr, sizeof (struct msghdr));
1068 
1069 	bzero(&einfo, sizeof (ovep_encap_info_t));
1070 	einfo.ovdi_id = odd->odd_vid;
1071 	mp = mp_chain;
1072 	while (mp != NULL) {
1073 		socklen_t slen;
1074 		struct sockaddr_storage storage;
1075 
1076 		mp_chain = mp->b_next;
1077 		mp->b_next = NULL;
1078 		ep = NULL;
1079 
1080 		ret = overlay_target_lookup(odd, mp,
1081 		    (struct sockaddr *)&storage, &slen);
1082 		if (ret != OVERLAY_TARGET_OK) {
1083 			if (ret == OVERLAY_TARGET_DROP)
1084 				freemsg(mp);
1085 			mp = mp_chain;
1086 			continue;
1087 		}
1088 
1089 		hdr.msg_name = &storage;
1090 		hdr.msg_namelen = slen;
1091 
1092 		ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
1093 		    &einfo, &ep);
1094 		if (ret != 0 || ep == NULL) {
1095 			freemsg(mp);
1096 			goto out;
1097 		}
1098 
1099 		ASSERT(ep->b_cont == mp || ep == mp);
1100 		ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
1101 		if (ret != 0)
1102 			goto out;
1103 
1104 		mp = mp_chain;
1105 	}
1106 
1107 out:
1108 	mutex_enter(&odd->odd_lock);
1109 	overlay_io_done(odd, OVERLAY_F_IN_TX);
1110 	mutex_exit(&odd->odd_lock);
1111 	return (mp_chain);
1112 }
1113 
1114 /* ARGSUSED */
1115 static void
1116 overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1117 {
1118 	miocnak(q, mp, 0, ENOTSUP);
1119 }
1120 
1121 /* ARGSUSED */
1122 static boolean_t
1123 overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1124 {
1125 	/*
1126 	 * Tell MAC we're an overlay.
1127 	 */
1128 	if (cap == MAC_CAPAB_OVERLAY)
1129 		return (B_TRUE);
1130 	return (B_FALSE);
1131 }
1132 
1133 /* ARGSUSED */
1134 static int
1135 overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1136     uint_t pr_valsize, const void *pr_val)
1137 {
1138 	uint32_t mtu, old;
1139 	int err;
1140 	overlay_dev_t *odd = arg;
1141 
1142 	if (pr_num != MAC_PROP_MTU)
1143 		return (ENOTSUP);
1144 
1145 	bcopy(pr_val, &mtu, sizeof (mtu));
1146 	if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX)
1147 		return (EINVAL);
1148 
1149 	mutex_enter(&odd->odd_lock);
1150 	old = odd->odd_mtu;
1151 	odd->odd_mtu = mtu;
1152 	err = mac_maxsdu_update(odd->odd_mh, mtu);
1153 	if (err != 0)
1154 		odd->odd_mtu = old;
1155 	mutex_exit(&odd->odd_lock);
1156 
1157 	return (err);
1158 }
1159 
1160 /* ARGSUSED */
1161 static int
1162 overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1163     uint_t pr_valsize, void *pr_val)
1164 {
1165 	return (ENOTSUP);
1166 }
1167 
1168 /* ARGSUSED */
1169 static void
1170 overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1171     mac_prop_info_handle_t prh)
1172 {
1173 	if (pr_num != MAC_PROP_MTU)
1174 		return;
1175 
1176 	mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
1177 	mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX);
1178 }
1179 
1180 static mac_callbacks_t overlay_m_callbacks = {
1181 	.mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
1182 	    MC_PROPINFO),
1183 	.mc_getstat = overlay_m_stat,
1184 	.mc_start = overlay_m_start,
1185 	.mc_stop = overlay_m_stop,
1186 	.mc_setpromisc = overlay_m_promisc,
1187 	.mc_multicst = overlay_m_multicast,
1188 	.mc_unicst = overlay_m_unicast,
1189 	.mc_tx = overlay_m_tx,
1190 	.mc_ioctl = overlay_m_ioctl,
1191 	.mc_getcapab = overlay_m_getcapab,
1192 	.mc_getprop = overlay_m_getprop,
1193 	.mc_setprop = overlay_m_setprop,
1194 	.mc_propinfo = overlay_m_propinfo
1195 };
1196 
1197 static boolean_t
1198 overlay_valid_name(const char *name, size_t buflen)
1199 {
1200 	size_t actlen;
1201 	int err, i;
1202 
1203 	for (i = 0; i < buflen; i++) {
1204 		if (name[i] == '\0')
1205 			break;
1206 	}
1207 
1208 	if (i == 0 || i == buflen)
1209 		return (B_FALSE);
1210 	actlen = i;
1211 	if (strchr(name, '/') != NULL)
1212 		return (B_FALSE);
1213 	if (u8_validate((char *)name, actlen, NULL,
1214 	    U8_VALIDATE_ENTIRE, &err) < 0)
1215 		return (B_FALSE);
1216 	return (B_TRUE);
1217 }
1218 
1219 /* ARGSUSED */
1220 static int
1221 overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1222 {
1223 	int err;
1224 	uint64_t maxid;
1225 	overlay_dev_t *odd, *o;
1226 	mac_register_t *mac;
1227 	overlay_ioc_create_t *oicp = karg;
1228 
1229 	if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
1230 		return (EINVAL);
1231 
1232 	odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
1233 	odd->odd_linkid = oicp->oic_linkid;
1234 	odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
1235 	if (odd->odd_plugin == NULL) {
1236 		kmem_free(odd, sizeof (overlay_dev_t));
1237 		return (ENOENT);
1238 	}
1239 	err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
1240 	    &odd->odd_pvoid);
1241 	if (err != 0) {
1242 		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1243 		overlay_plugin_rele(odd->odd_plugin);
1244 		kmem_free(odd, sizeof (overlay_dev_t));
1245 		return (EINVAL);
1246 	}
1247 
1248 	/*
1249 	 * Make sure that our virtual network id is valid for the given plugin
1250 	 * that we're working with.
1251 	 */
1252 	ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1253 	maxid = UINT64_MAX;
1254 	if (odd->odd_plugin->ovp_id_size != 8)
1255 		maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
1256 	if (oicp->oic_vnetid > maxid) {
1257 		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1258 		overlay_plugin_rele(odd->odd_plugin);
1259 		kmem_free(odd, sizeof (overlay_dev_t));
1260 		return (EINVAL);
1261 	}
1262 	odd->odd_vid = oicp->oic_vnetid;
1263 
1264 	mac = mac_alloc(MAC_VERSION);
1265 	if (mac == NULL) {
1266 		mutex_exit(&overlay_dev_lock);
1267 		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1268 		overlay_plugin_rele(odd->odd_plugin);
1269 		kmem_free(odd, sizeof (overlay_dev_t));
1270 		return (EINVAL);
1271 	}
1272 
1273 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1274 	mac->m_driver = odd;
1275 	mac->m_dip = overlay_dip;
1276 	mac->m_dst_addr = NULL;
1277 	mac->m_callbacks = &overlay_m_callbacks;
1278 	mac->m_pdata = NULL;
1279 	mac->m_pdata_size = 0;
1280 
1281 	mac->m_priv_props = NULL;
1282 
1283 	/* Let mac handle this itself. */
1284 	mac->m_instance = (uint_t)-1;
1285 
1286 	/*
1287 	 * There is no real source address that should be used here, but saying
1288 	 * that we're not ethernet is going to cause its own problems. At the
1289 	 * end of the say, this is fine.
1290 	 */
1291 	mac->m_src_addr = overlay_macaddr;
1292 
1293 	/*
1294 	 * Start with the default MTU as the max SDU. If the MTU is changed, the
1295 	 * SDU will be changed to reflect that.
1296 	 */
1297 	mac->m_min_sdu = 1;
1298 	mac->m_max_sdu = OVERLAY_MTU_DEF;
1299 	mac->m_multicast_sdu = 0;
1300 
1301 	/*
1302 	 * The underlying device doesn't matter, instead this comes from the
1303 	 * encapsulation protocol and whether or not they allow VLAN tags.
1304 	 */
1305 	if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
1306 		mac->m_margin = VLAN_TAGSZ;
1307 	} else {
1308 		mac->m_margin = 0;
1309 	}
1310 
1311 	/*
1312 	 * Today, we have no MAC virtualization, it may make sense in the future
1313 	 * to go ahead and emulate some subset of this, but it doesn't today.
1314 	 */
1315 	mac->m_v12n = MAC_VIRT_NONE;
1316 
1317 	mutex_enter(&overlay_dev_lock);
1318 	for (o = list_head(&overlay_dev_list); o != NULL;
1319 	    o = list_next(&overlay_dev_list, o)) {
1320 		if (o->odd_linkid == oicp->oic_linkid) {
1321 			mutex_exit(&overlay_dev_lock);
1322 			odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1323 			overlay_plugin_rele(odd->odd_plugin);
1324 			kmem_free(odd, sizeof (overlay_dev_t));
1325 			return (EEXIST);
1326 		}
1327 
1328 		if (o->odd_vid == oicp->oic_vnetid &&
1329 		    o->odd_plugin == odd->odd_plugin) {
1330 			mutex_exit(&overlay_dev_lock);
1331 			odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1332 			overlay_plugin_rele(odd->odd_plugin);
1333 			kmem_free(odd, sizeof (overlay_dev_t));
1334 			return (EEXIST);
1335 		}
1336 	}
1337 
1338 	err = mac_register(mac, &odd->odd_mh);
1339 	mac_free(mac);
1340 	if (err != 0) {
1341 		mutex_exit(&overlay_dev_lock);
1342 		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1343 		overlay_plugin_rele(odd->odd_plugin);
1344 		kmem_free(odd, sizeof (overlay_dev_t));
1345 		return (err);
1346 	}
1347 
1348 	err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1349 	    crgetzoneid(cred));
1350 	if (err != 0) {
1351 		mutex_exit(&overlay_dev_lock);
1352 		(void) mac_unregister(odd->odd_mh);
1353 		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1354 		overlay_plugin_rele(odd->odd_plugin);
1355 		kmem_free(odd, sizeof (overlay_dev_t));
1356 		return (err);
1357 	}
1358 
1359 	mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
1360 	cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
1361 	odd->odd_ref = 0;
1362 	odd->odd_flags = 0;
1363 	list_insert_tail(&overlay_dev_list, odd);
1364 	mutex_exit(&overlay_dev_lock);
1365 
1366 	return (0);
1367 }
1368 
1369 /* ARGSUSED */
1370 static int
1371 overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1372 {
1373 	int i, ret;
1374 	overlay_dev_t *odd;
1375 	mac_perim_handle_t mph;
1376 	overlay_ioc_activate_t *oiap = karg;
1377 	overlay_ioc_propinfo_t *infop;
1378 	overlay_ioc_prop_t *oip;
1379 	overlay_prop_handle_t phdl;
1380 
1381 	odd = overlay_hold_by_dlid(oiap->oia_linkid);
1382 	if (odd == NULL)
1383 		return (ENOENT);
1384 
1385 	infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
1386 	oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
1387 	phdl = (overlay_prop_handle_t)infop;
1388 
1389 	mac_perim_enter_by_mh(odd->odd_mh, &mph);
1390 	mutex_enter(&odd->odd_lock);
1391 	if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1392 		mutex_exit(&odd->odd_lock);
1393 		mac_perim_exit(mph);
1394 		overlay_hold_rele(odd);
1395 		kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1396 		kmem_free(oip, sizeof (overlay_ioc_prop_t));
1397 		return (EEXIST);
1398 	}
1399 	mutex_exit(&odd->odd_lock);
1400 
1401 	for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1402 		const char *pname = odd->odd_plugin->ovp_props[i];
1403 		bzero(infop, sizeof (overlay_ioc_propinfo_t));
1404 		overlay_prop_init(phdl);
1405 		ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
1406 		if (ret != 0) {
1407 			mac_perim_exit(mph);
1408 			overlay_hold_rele(odd);
1409 			kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1410 			kmem_free(oip, sizeof (overlay_ioc_prop_t));
1411 			return (ret);
1412 		}
1413 
1414 		if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
1415 			continue;
1416 		bzero(oip, sizeof (overlay_ioc_prop_t));
1417 		oip->oip_size = sizeof (oip->oip_value);
1418 		ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1419 		    pname, oip->oip_value, &oip->oip_size);
1420 		if (ret != 0) {
1421 			mac_perim_exit(mph);
1422 			overlay_hold_rele(odd);
1423 			kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1424 			kmem_free(oip, sizeof (overlay_ioc_prop_t));
1425 			return (ret);
1426 		}
1427 		if (oip->oip_size == 0) {
1428 			mac_perim_exit(mph);
1429 			overlay_hold_rele(odd);
1430 			kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1431 			kmem_free(oip, sizeof (overlay_ioc_prop_t));
1432 			return (EINVAL);
1433 		}
1434 	}
1435 
1436 	mutex_enter(&odd->odd_lock);
1437 	if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
1438 		mutex_exit(&odd->odd_lock);
1439 		mac_perim_exit(mph);
1440 		overlay_hold_rele(odd);
1441 		kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1442 		kmem_free(oip, sizeof (overlay_ioc_prop_t));
1443 		return (ENXIO);
1444 	}
1445 
1446 	ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
1447 	odd->odd_flags |= OVERLAY_F_ACTIVATED;
1448 
1449 	/*
1450 	 * Now that we've activated ourselves, we should indicate to the world
1451 	 * that we're up. Note that we may not be able to perform lookups at
1452 	 * this time, but our notion of being 'up' isn't dependent on that
1453 	 * ability.
1454 	 */
1455 	mac_link_update(odd->odd_mh, LINK_STATE_UP);
1456 	mutex_exit(&odd->odd_lock);
1457 
1458 	mac_perim_exit(mph);
1459 	overlay_hold_rele(odd);
1460 	kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1461 	kmem_free(oip, sizeof (overlay_ioc_prop_t));
1462 
1463 	return (0);
1464 }
1465 
1466 /* ARGSUSED */
1467 static int
1468 overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1469 {
1470 	overlay_ioc_delete_t *oidp = karg;
1471 	overlay_dev_t *odd;
1472 	datalink_id_t tid;
1473 	int ret;
1474 
1475 	odd = overlay_hold_by_dlid(oidp->oid_linkid);
1476 	if (odd == NULL) {
1477 		return (ENOENT);
1478 	}
1479 
1480 	mutex_enter(&odd->odd_lock);
1481 	/* If we're not the only hold, we're busy */
1482 	if (odd->odd_ref != 1) {
1483 		mutex_exit(&odd->odd_lock);
1484 		overlay_hold_rele(odd);
1485 		return (EBUSY);
1486 	}
1487 
1488 	if (odd->odd_flags & OVERLAY_F_IN_MUX) {
1489 		mutex_exit(&odd->odd_lock);
1490 		overlay_hold_rele(odd);
1491 		return (EBUSY);
1492 	}
1493 
1494 	/*
1495 	 * To remove this, we need to first remove it from dls and then remove
1496 	 * it from mac. The act of removing it from mac will check if there are
1497 	 * devices on top of this, eg. vnics. If there are, then that will fail
1498 	 * and we'll have to go through and recreate the dls entry. Only after
1499 	 * mac_unregister has succeeded, then we'll go through and actually free
1500 	 * everything and drop the dev lock.
1501 	 */
1502 	ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
1503 	if (ret != 0) {
1504 		overlay_hold_rele(odd);
1505 		return (ret);
1506 	}
1507 
1508 	ASSERT(oidp->oid_linkid == tid);
1509 	ret = mac_disable(odd->odd_mh);
1510 	if (ret != 0) {
1511 		(void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1512 		    crgetzoneid(cred));
1513 		overlay_hold_rele(odd);
1514 		return (ret);
1515 	}
1516 
1517 	overlay_target_quiesce(odd->odd_target);
1518 
1519 	mutex_enter(&overlay_dev_lock);
1520 	list_remove(&overlay_dev_list, odd);
1521 	mutex_exit(&overlay_dev_lock);
1522 
1523 	cv_destroy(&odd->odd_iowait);
1524 	mutex_destroy(&odd->odd_lock);
1525 	overlay_target_free(odd);
1526 	odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1527 	overlay_plugin_rele(odd->odd_plugin);
1528 	kmem_free(odd, sizeof (overlay_dev_t));
1529 
1530 	return (0);
1531 }
1532 
1533 /* ARGSUSED */
1534 static int
1535 overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
1536     int *rvalp)
1537 {
1538 	overlay_dev_t *odd;
1539 	overlay_ioc_nprops_t *on = karg;
1540 
1541 	odd = overlay_hold_by_dlid(on->oipn_linkid);
1542 	if (odd == NULL)
1543 		return (ENOENT);
1544 	on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS;
1545 	overlay_hold_rele(odd);
1546 
1547 	return (0);
1548 }
1549 
1550 static int
1551 overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
1552 {
1553 	overlay_prop_handle_t phdl = arg;
1554 	overlay_prop_set_range_str(phdl, opp->ovp_name);
1555 	return (0);
1556 }
1557 
1558 static int
1559 overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
1560 {
1561 	int i;
1562 
1563 	for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1564 		if (strcmp(overlay_dev_props[i], name) == 0) {
1565 			*id = i;
1566 			return (0);
1567 		}
1568 	}
1569 
1570 	for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1571 		if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) {
1572 			*id = i + OVERLAY_DEV_NPROPS;
1573 			return (0);
1574 		}
1575 	}
1576 
1577 	return (ENOENT);
1578 }
1579 
1580 static void
1581 overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
1582 {
1583 	uint32_t def;
1584 	mac_propval_range_t range;
1585 	uint_t perm;
1586 
1587 	ASSERT(MAC_PERIM_HELD(odd->odd_mh));
1588 
1589 	bzero(&range, sizeof (mac_propval_range_t));
1590 	range.mpr_count = 1;
1591 	if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
1592 	    sizeof (def), &range, &perm) != 0)
1593 		return;
1594 
1595 	if (perm == MAC_PROP_PERM_READ)
1596 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1597 	else if (perm == MAC_PROP_PERM_WRITE)
1598 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
1599 	else if (perm == MAC_PROP_PERM_RW)
1600 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1601 
1602 	overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1603 	overlay_prop_set_default(phdl, &def, sizeof (def));
1604 	overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
1605 	    range.mpr_range_uint32[0].mpur_max);
1606 }
1607 
1608 /* ARGSUSED */
1609 static int
1610 overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
1611     int *rvalp)
1612 {
1613 	overlay_dev_t *odd;
1614 	int ret;
1615 	mac_perim_handle_t mph;
1616 	uint_t propid = UINT_MAX;
1617 	overlay_ioc_propinfo_t *oip = karg;
1618 	overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
1619 
1620 	odd = overlay_hold_by_dlid(oip->oipi_linkid);
1621 	if (odd == NULL)
1622 		return (ENOENT);
1623 
1624 	overlay_prop_init(phdl);
1625 	mac_perim_enter_by_mh(odd->odd_mh, &mph);
1626 
1627 	/*
1628 	 * If the id is -1, then the property that we're looking for is named in
1629 	 * oipi_name and we should fill in its id. Otherwise, we've been given
1630 	 * an id and we need to turn that into a name for our plugin's sake. The
1631 	 * id is our own fabrication for property discovery.
1632 	 */
1633 	if (oip->oipi_id == -1) {
1634 		/*
1635 		 * Determine if it's a known generic property or it belongs to a
1636 		 * module by checking against the list of known names.
1637 		 */
1638 		oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1639 		if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name,
1640 		    &propid)) != 0) {
1641 			overlay_hold_rele(odd);
1642 			mac_perim_exit(mph);
1643 			return (ret);
1644 		}
1645 		oip->oipi_id = propid;
1646 		if (propid >= OVERLAY_DEV_NPROPS) {
1647 			ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1648 			    oip->oipi_name, phdl);
1649 			overlay_hold_rele(odd);
1650 			mac_perim_exit(mph);
1651 			return (ret);
1652 
1653 		}
1654 	} else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
1655 		uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
1656 
1657 		if (id >= odd->odd_plugin->ovp_nprops) {
1658 			overlay_hold_rele(odd);
1659 			mac_perim_exit(mph);
1660 			return (EINVAL);
1661 		}
1662 		ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1663 		    odd->odd_plugin->ovp_props[id], phdl);
1664 		overlay_hold_rele(odd);
1665 		mac_perim_exit(mph);
1666 		return (ret);
1667 	} else if (oip->oipi_id < -1) {
1668 		overlay_hold_rele(odd);
1669 		mac_perim_exit(mph);
1670 		return (EINVAL);
1671 	} else {
1672 		ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
1673 		ASSERT(oip->oipi_id >= 0);
1674 		propid = oip->oipi_id;
1675 		(void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
1676 		    sizeof (oip->oipi_name));
1677 	}
1678 
1679 	switch (propid) {
1680 	case OVERLAY_DEV_P_MTU:
1681 		overlay_i_propinfo_mtu(odd, phdl);
1682 		break;
1683 	case OVERLAY_DEV_P_VNETID:
1684 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1685 		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1686 		overlay_prop_set_nodefault(phdl);
1687 		break;
1688 	case OVERLAY_DEV_P_ENCAP:
1689 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1690 		overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
1691 		overlay_prop_set_nodefault(phdl);
1692 		overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
1693 		break;
1694 	case OVERLAY_DEV_P_VARPDID:
1695 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1696 		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1697 		overlay_prop_set_nodefault(phdl);
1698 		break;
1699 	default:
1700 		overlay_hold_rele(odd);
1701 		mac_perim_exit(mph);
1702 		return (ENOENT);
1703 	}
1704 
1705 	overlay_hold_rele(odd);
1706 	mac_perim_exit(mph);
1707 	return (0);
1708 }
1709 
1710 /* ARGSUSED */
1711 static int
1712 overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1713     int *rvalp)
1714 {
1715 	int ret;
1716 	overlay_dev_t *odd;
1717 	mac_perim_handle_t mph;
1718 	overlay_ioc_prop_t *oip = karg;
1719 	uint_t propid, mtu;
1720 
1721 	odd = overlay_hold_by_dlid(oip->oip_linkid);
1722 	if (odd == NULL)
1723 		return (ENOENT);
1724 
1725 	mac_perim_enter_by_mh(odd->odd_mh, &mph);
1726 	oip->oip_size = OVERLAY_PROP_SIZEMAX;
1727 	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1728 	if (oip->oip_id == -1) {
1729 		int i;
1730 
1731 		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1732 			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1733 				break;
1734 			if (i == OVERLAY_DEV_NPROPS) {
1735 				ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
1736 				    odd->odd_pvoid, oip->oip_name,
1737 				    oip->oip_value, &oip->oip_size);
1738 				overlay_hold_rele(odd);
1739 				mac_perim_exit(mph);
1740 				return (ret);
1741 			}
1742 		}
1743 
1744 		propid = i;
1745 	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1746 		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1747 
1748 		if (id > odd->odd_plugin->ovp_nprops) {
1749 			overlay_hold_rele(odd);
1750 			mac_perim_exit(mph);
1751 			return (EINVAL);
1752 		}
1753 		ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1754 		    odd->odd_plugin->ovp_props[id], oip->oip_value,
1755 		    &oip->oip_size);
1756 		overlay_hold_rele(odd);
1757 		mac_perim_exit(mph);
1758 		return (ret);
1759 	} else if (oip->oip_id < -1) {
1760 		overlay_hold_rele(odd);
1761 		mac_perim_exit(mph);
1762 		return (EINVAL);
1763 	} else {
1764 		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1765 		ASSERT(oip->oip_id >= 0);
1766 		propid = oip->oip_id;
1767 	}
1768 
1769 	ret = 0;
1770 	switch (propid) {
1771 	case OVERLAY_DEV_P_MTU:
1772 		/*
1773 		 * The MTU is always set and retrieved through MAC, to allow for
1774 		 * MAC to do whatever it wants, as really that property belongs
1775 		 * to MAC. This is important for things where vnics have hold on
1776 		 * the MTU.
1777 		 */
1778 		mac_sdu_get(odd->odd_mh, NULL, &mtu);
1779 		bcopy(&mtu, oip->oip_value, sizeof (uint_t));
1780 		oip->oip_size = sizeof (uint_t);
1781 		break;
1782 	case OVERLAY_DEV_P_VNETID:
1783 		/*
1784 		 * While it's read-only while inside of a mux, we're not in a
1785 		 * context that can guarantee that. Therefore we always grab the
1786 		 * overlay_dev_t's odd_lock.
1787 		 */
1788 		mutex_enter(&odd->odd_lock);
1789 		bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
1790 		mutex_exit(&odd->odd_lock);
1791 		oip->oip_size = sizeof (uint64_t);
1792 		break;
1793 	case OVERLAY_DEV_P_ENCAP:
1794 		oip->oip_size = strlcpy((char *)oip->oip_value,
1795 		    odd->odd_plugin->ovp_name, oip->oip_size);
1796 		break;
1797 	case OVERLAY_DEV_P_VARPDID:
1798 		mutex_enter(&odd->odd_lock);
1799 		if (odd->odd_flags & OVERLAY_F_VARPD) {
1800 			const uint64_t val = odd->odd_target->ott_id;
1801 			bcopy(&val, oip->oip_value, sizeof (uint64_t));
1802 			oip->oip_size = sizeof (uint64_t);
1803 		} else {
1804 			oip->oip_size = 0;
1805 		}
1806 		mutex_exit(&odd->odd_lock);
1807 		break;
1808 	default:
1809 		ret = ENOENT;
1810 	}
1811 
1812 	overlay_hold_rele(odd);
1813 	mac_perim_exit(mph);
1814 	return (ret);
1815 }
1816 
1817 static void
1818 overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
1819 {
1820 	mutex_enter(&odd->odd_lock);
1821 
1822 	/* Simple case, not active */
1823 	if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1824 		odd->odd_vid = vnetid;
1825 		mutex_exit(&odd->odd_lock);
1826 		return;
1827 	}
1828 
1829 	/*
1830 	 * In the hard case, we need to set the drop flag, quiesce I/O and then
1831 	 * we can go ahead and do everything.
1832 	 */
1833 	odd->odd_flags |= OVERLAY_F_MDDROP;
1834 	overlay_io_wait(odd, OVERLAY_F_IOMASK);
1835 	mutex_exit(&odd->odd_lock);
1836 
1837 	overlay_mux_remove_dev(odd->odd_mux, odd);
1838 
1839 	mutex_enter(&odd->odd_lock);
1840 	odd->odd_vid = vnetid;
1841 	mutex_exit(&odd->odd_lock);
1842 
1843 	overlay_mux_add_dev(odd->odd_mux, odd);
1844 
1845 	mutex_enter(&odd->odd_lock);
1846 	ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
1847 	odd->odd_flags &= ~OVERLAY_F_MDDROP;
1848 	mutex_exit(&odd->odd_lock);
1849 }
1850 
1851 /* ARGSUSED */
1852 static int
1853 overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1854     int *rvalp)
1855 {
1856 	int ret;
1857 	overlay_dev_t *odd;
1858 	overlay_ioc_prop_t *oip = karg;
1859 	uint_t propid = UINT_MAX;
1860 	mac_perim_handle_t mph;
1861 	uint64_t maxid, *vidp;
1862 
1863 	if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
1864 		return (EINVAL);
1865 
1866 	odd = overlay_hold_by_dlid(oip->oip_linkid);
1867 	if (odd == NULL)
1868 		return (ENOENT);
1869 
1870 	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1871 	mac_perim_enter_by_mh(odd->odd_mh, &mph);
1872 	mutex_enter(&odd->odd_lock);
1873 	if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1874 		mac_perim_exit(mph);
1875 		mutex_exit(&odd->odd_lock);
1876 		return (ENOTSUP);
1877 	}
1878 	mutex_exit(&odd->odd_lock);
1879 	if (oip->oip_id == -1) {
1880 		int i;
1881 
1882 		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1883 			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1884 				break;
1885 			if (i == OVERLAY_DEV_NPROPS) {
1886 				ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
1887 				    odd->odd_pvoid, oip->oip_name,
1888 				    oip->oip_value, oip->oip_size);
1889 				overlay_hold_rele(odd);
1890 				mac_perim_exit(mph);
1891 				return (ret);
1892 			}
1893 		}
1894 
1895 		propid = i;
1896 	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1897 		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1898 
1899 		if (id > odd->odd_plugin->ovp_nprops) {
1900 			mac_perim_exit(mph);
1901 			overlay_hold_rele(odd);
1902 			return (EINVAL);
1903 		}
1904 		ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
1905 		    odd->odd_plugin->ovp_props[id], oip->oip_value,
1906 		    oip->oip_size);
1907 		mac_perim_exit(mph);
1908 		overlay_hold_rele(odd);
1909 		return (ret);
1910 	} else if (oip->oip_id < -1) {
1911 		mac_perim_exit(mph);
1912 		overlay_hold_rele(odd);
1913 		return (EINVAL);
1914 	} else {
1915 		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1916 		ASSERT(oip->oip_id >= 0);
1917 		propid = oip->oip_id;
1918 	}
1919 
1920 	ret = 0;
1921 	switch (propid) {
1922 	case OVERLAY_DEV_P_MTU:
1923 		ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
1924 		    oip->oip_value, oip->oip_size);
1925 		break;
1926 	case OVERLAY_DEV_P_VNETID:
1927 		if (oip->oip_size != sizeof (uint64_t)) {
1928 			ret = EINVAL;
1929 			break;
1930 		}
1931 		vidp = (uint64_t *)oip->oip_value;
1932 		ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1933 		maxid = UINT64_MAX;
1934 		if (odd->odd_plugin->ovp_id_size != 8)
1935 			maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
1936 			    1ULL;
1937 		if (*vidp >= maxid) {
1938 			ret = EINVAL;
1939 			break;
1940 		}
1941 		overlay_setprop_vnetid(odd, *vidp);
1942 		break;
1943 	case OVERLAY_DEV_P_ENCAP:
1944 	case OVERLAY_DEV_P_VARPDID:
1945 		ret = EPERM;
1946 		break;
1947 	default:
1948 		ret = ENOENT;
1949 	}
1950 
1951 	mac_perim_exit(mph);
1952 	overlay_hold_rele(odd);
1953 	return (ret);
1954 }
1955 
1956 /* ARGSUSED */
1957 static int
1958 overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
1959     int *rvalp)
1960 {
1961 	overlay_dev_t *odd;
1962 	overlay_ioc_status_t *os = karg;
1963 
1964 	odd = overlay_hold_by_dlid(os->ois_linkid);
1965 	if (odd == NULL)
1966 		return (ENOENT);
1967 
1968 	mutex_enter(&odd->odd_lock);
1969 	if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
1970 		os->ois_status = OVERLAY_I_DEGRADED;
1971 		if (odd->odd_fmamsg != NULL) {
1972 			(void) strlcpy(os->ois_message, odd->odd_fmamsg,
1973 			    OVERLAY_STATUS_BUFLEN);
1974 		} else {
1975 			os->ois_message[0] = '\0';
1976 		}
1977 
1978 	} else {
1979 		os->ois_status = OVERLAY_I_OK;
1980 		os->ois_message[0] = '\0';
1981 	}
1982 	mutex_exit(&odd->odd_lock);
1983 	overlay_hold_rele(odd);
1984 
1985 	return (0);
1986 }
1987 
/*
 * Table of overlay ioctls, registered with dld_ioc_register() from
 * overlay_attach(). Each entry names the ioctl command, its copyin/copyout
 * flags, the size of its argument structure and the handler that services
 * it. The final member is secpolicy_dl_config for every command except
 * OVERLAY_IOC_STATUS, which passes NULL.
 * NOTE(review): NULL presumably means no privilege check is made for the
 * status query -- confirm against the dld_ioc_info_t definition.
 */
static dld_ioc_info_t overlay_ioc_list[] = {
	{ OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
		overlay_i_create, secpolicy_dl_config },
	{ OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
		overlay_i_activate, secpolicy_dl_config },
	{ OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
		overlay_i_delete, secpolicy_dl_config },
	{ OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
		sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
		secpolicy_dl_config },
	{ OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
		sizeof (overlay_ioc_prop_t), overlay_i_getprop,
		secpolicy_dl_config },
	{ OVERLAY_IOC_SETPROP, DLDCOPYIN,
		sizeof (overlay_ioc_prop_t), overlay_i_setprop,
		secpolicy_dl_config },
	{ OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
		sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
		secpolicy_dl_config },
	{ OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
		sizeof (overlay_ioc_status_t), overlay_i_status,
		NULL }
};
2011 
2012 static int
2013 overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2014 {
2015 	int fmcap = DDI_FM_EREPORT_CAPABLE;
2016 	if (cmd != DDI_ATTACH)
2017 		return (DDI_FAILURE);
2018 
2019 	if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
2020 		return (DDI_FAILURE);
2021 
2022 	ddi_fm_init(dip, &fmcap, NULL);
2023 
2024 	if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
2025 	    ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE)
2026 		return (DDI_FAILURE);
2027 
2028 	if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
2029 	    DLDIOCCNT(overlay_ioc_list)) != 0) {
2030 		ddi_remove_minor_node(dip, OVERLAY_CTL);
2031 		return (DDI_FAILURE);
2032 	}
2033 
2034 	overlay_dip = dip;
2035 	return (DDI_SUCCESS);
2036 }
2037 
2038 /* ARGSUSED */
2039 static int
2040 overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
2041 {
2042 	int error;
2043 
2044 	switch (cmd) {
2045 	case DDI_INFO_DEVT2DEVINFO:
2046 		*resp = (void *)overlay_dip;
2047 		error = DDI_SUCCESS;
2048 		break;
2049 	case DDI_INFO_DEVT2INSTANCE:
2050 		*resp = (void *)0;
2051 		error = DDI_SUCCESS;
2052 		break;
2053 	default:
2054 		error = DDI_FAILURE;
2055 		break;
2056 	}
2057 
2058 	return (error);
2059 }
2060 
2061 static int
2062 overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2063 {
2064 	if (cmd != DDI_DETACH)
2065 		return (DDI_FAILURE);
2066 
2067 	mutex_enter(&overlay_dev_lock);
2068 	if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
2069 		mutex_exit(&overlay_dev_lock);
2070 		return (EBUSY);
2071 	}
2072 	mutex_exit(&overlay_dev_lock);
2073 
2074 
2075 	dld_ioc_unregister(OVERLAY_IOC);
2076 	ddi_remove_minor_node(dip, OVERLAY_CTL);
2077 	ddi_fm_fini(dip);
2078 	overlay_dip = NULL;
2079 	return (DDI_SUCCESS);
2080 }
2081 
/*
 * Character/block entry points for the overlay driver. Only open, close and
 * ioctl are implemented, each by the matching overlay_target_* routine; the
 * remaining entry points are stubbed out with nodev/nochpoll.
 */
static struct cb_ops overlay_cbops = {
	overlay_target_open,	/* cb_open */
	overlay_target_close,	/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	overlay_target_ioctl,	/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* cb_stream */
	D_MP,			/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* cb_aread */
	nodev,			/* cb_awrite */
};
2102 
/*
 * Device operations for the overlay driver: getinfo, attach and detach are
 * implemented in this file, overlay_cbops supplies the cb_ops, and quiesce
 * is explicitly not supported.
 */
static struct dev_ops overlay_dev_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	overlay_getinfo,	/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	overlay_attach,		/* devo_attach */
	overlay_detach,		/* devo_detach */
	nulldev,		/* devo_reset */
	&overlay_cbops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* devo_power */
	ddi_quiesce_not_supported	/* devo_quiesce */
};
2117 
/*
 * Loadable driver linkage: ties the module's description string to
 * overlay_dev_ops using the mod_driverops module type.
 */
static struct modldrv overlay_modldrv = {
	&mod_driverops,
	"Overlay Network Driver",
	&overlay_dev_ops
};
2123 
/*
 * Module linkage handed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini().
 */
static struct modlinkage overlay_linkage = {
	MODREV_1,
	&overlay_modldrv
};
2128 
2129 static int
2130 overlay_init(void)
2131 {
2132 	mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
2133 	list_create(&overlay_dev_list, sizeof (overlay_dev_t),
2134 	    offsetof(overlay_dev_t, odd_link));
2135 	overlay_mux_init();
2136 	overlay_plugin_init();
2137 	overlay_target_init();
2138 
2139 	return (DDI_SUCCESS);
2140 }
2141 
2142 static void
2143 overlay_fini(void)
2144 {
2145 	overlay_target_fini();
2146 	overlay_plugin_fini();
2147 	overlay_mux_fini();
2148 	mutex_destroy(&overlay_dev_lock);
2149 	list_destroy(&overlay_dev_list);
2150 }
2151 
2152 int
2153 _init(void)
2154 {
2155 	int err;
2156 
2157 	if ((err = overlay_init()) != DDI_SUCCESS)
2158 		return (err);
2159 
2160 	mac_init_ops(NULL, "overlay");
2161 	err = mod_install(&overlay_linkage);
2162 	if (err != DDI_SUCCESS) {
2163 		overlay_fini();
2164 		return (err);
2165 	}
2166 
2167 	return (0);
2168 }
2169 
2170 int
2171 _info(struct modinfo *modinfop)
2172 {
2173 	return (mod_info(&overlay_linkage, modinfop));
2174 }
2175 
2176 int
2177 _fini(void)
2178 {
2179 	int err;
2180 
2181 	err = mod_remove(&overlay_linkage);
2182 	if (err != 0)
2183 		return (err);
2184 
2185 	overlay_fini();
2186 	return (0);
2187 }
2188