xref: /illumos-gate/usr/src/uts/common/io/overlay/overlay.c (revision 36589d6bb0cdae89e166b57b0d64ae56d53247d9)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2016 Joyent, Inc.
14  */
15 
16 /*
17  * Overlay Devices
18  *
19  * Overlay devices provide a means for creating overlay networks, a means of
20  * multiplexing multiple logical, isolated, and discrete layer two and layer
21  * three networks on top of one physical network.
22  *
23  * In general, these overlay devices encapsulate the logic to answer two
24  * different questions:
25  *
26  *   1) How should I transform a packet to put it on the wire?
27  *   2) Where should I send a transformed packet?
28  *
29  * Each overlay device is presented to the user as a GLDv3 device. While the
30  * link itself cannot have an IP interface created on top of it, it allows for
31  * additional GLDv3 devices, such as a VNIC, to be created on top of it which
32  * can be plumbed up with IP interfaces.
33  *
34  *
35  * --------------------
36  * General Architecture
37  * --------------------
38  *
39  * The logical overlay device that a user sees in dladm(1M) is a combination of
40  * two different components that work together. The first component is this
41  * kernel module, which is responsible for answering question one -- how should
42  * I transform a packet to put it on the wire.
43  *
44  * The second component is what we call the virtual ARP daemon, or varpd. It is
45  * a userland component that is responsible for answering the second question --
46  * Where should I send a transformed packet. Instances of the kernel overlay
47  * GLDv3 device ask varpd the question of where should a packet go.
48  *
49  * The split was done for a few reasons. Importantly, we wanted to keep the act
50  * of generating encapsulated packets in the kernel so as to ensure that the
51  * general data path was fast and also kept simple. On the flip side, while the
52  * question of where should something go may be simple, it may often be
53  * complicated and need to interface with several different external or
54  * distributed systems. In those cases, it's simpler to allow for the full
55  * flexibility of userland to be brought to bear to solve that problem and in
56  * general, the path isn't very common.
57  *
58  * The following is what makes up the logical overlay device that a user would
59  * create with dladm(1M).
60  *
61  *       Kernel                                     Userland
62  *   . . . . . . . . . . . . . . . . . . . . .   . . . . . . . . . . . . .
63  *   . +--------+   +--------+  +--------+   .   .                       .
64  *   . | VNIC 0 |   | VNIC 1 |  | VNIC 2 |   .   .                       .
65  *   . +--------+   +--------+  +--------+   .   .                       .
66  *   .     |            |           |        .   .                       .
67  *   .     |            |           |        .   .                       .
68  *   .     +------------+-----------+        .   .                       .
69  *   .                  |              . . /dev/overlay                  .
70  *   .           +--------------+      .     .   .       +------------+  .
71  *   .           |              |      .     .   .       |            |  .
72  *   .           |    Overlay   |======*=================|   Virtual  |  .
73  *   .           | GLDv3 Device |========================| ARP Daemon |  .
74  *   .           |              |            .   .       |            |  .
75  *   .           +--------------+            .   .       +------------+  .
76  *   .                  |                    .   .              |        .
77  *   .                  |                    .   .              |        .
78  *   .           +----------------+          .   .         +--------+    .
79  *   .           |  Overlay       |          .   .         | varpd  |    .
80  *   .           |  Encapsulation |          .   .         | Lookup |    .
81  *   .           |  Plugin        |          .   .         | Plugin |    .
82  *   .           +----------------+          .   .         +--------+    .
83  *   . . . . . . . . . . . . . . . . . . . . .   . . . . . . . . . . . . .
84  *
85  *
86  * This image shows the two different components and where they live.
87  * Importantly, it also shows that both the kernel overlay device and the
88  * userland varpd both support plugins. The plugins actually implement the
89  * things that users care about and the APIs have been designed to try to
90  * minimize the amount of things that a module writer needs to worry about.
91  *
92  * IDENTIFIERS
93  *
94  * Every overlay device is defined by a unique identifier which is the overlay
95  * identifier. Its purpose is similar to that of a VLAN identifier, it's a
96  * unique number that is used to differentiate between different entries on the
97  * wire.
98  *
99  * ENCAPSULATION
100  *
101  * An overlay encapsulation plugin is a kernel miscellaneous module whose
102  * purpose is to contain knowledge about how to transform packets to put them
103  * onto the wire and to take them off. An example of an encapsulation plugin is
104  * vxlan. It's also how support for things like nvgre or geneve would be brought
105  * into the system.
106  *
107  * Each encapsulation plugin defines a series of operation vectors and
108  * properties. For the full details on everything they should provide, please
109  * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
110  * for telling the system what information is required to send a packet. For
111  * example, vxlan is defined to send everything over a UDP packet and therefore
112  * requires a port and an IP address, while nvgre on the other hand is its own
113  * IP type and therefore just requires an IP address. In addition, it also
114  * provides information about the kind of socket that should be created. This is
115  * used by the kernel multiplexor, more of that in the Kernel Components
116  * section.
117  *
118  * LOOKUPS
119  *
120  * The kernel communicates requests for lookups over the character device
121  * /dev/overlay. varpd is responsible for listening for requests on that device
122  * and answering them. The character device is specific to the target path and
123  * varpd.
124  *
125  * Much as the kernel overlay module handles the bulk of the scaffolding but
126  * leaves the important work to the encapsulation plugin, varpd provides a
127  * similar role and leaves the full brunt of lookups to a userland dynamic
128  * shared object which implements the logic of lookups.
129  *
130  * Each lookup plugin defines a series of operation vectors and properties. For
131  * the full details on everything that they should provide, please read
132  * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
133  * address and asked to give an address on the physical network that it should
134  * be sent to. In addition, they handle questions related to how to handle
135  * things like broadcast and multicast traffic, etc.
136  *
137  * ----------
138  * Properties
139  * ----------
140  *
141  * A device from a dladm perspective has a unique set of properties that are
142  * combined from three different sources:
143  *
144  *   1) Generic properties that every overlay device has
145  *   2) Properties that are specific to the encapsulation plugin
146  *   3) Properties that are specific to the lookup plugin
147  *
148  * All of these are exposed in a single set of properties in dladm. Note that
149  * these are not necessarily traditional link properties. However, if something
150  * is both a traditional GLDv3 link property, say the MTU of a device, and a
151  * specific property here, then the driver ensures that all existing GLDv3
152  * specific means of manipulating it are used and wraps up its private property
153  * interfaces to ensure that works.
154  *
155  * Properties in the second and third category are prefixed with the name of
156  * their module. For example, the vxlan encapsulation module has a property
157  * called the 'listen_ip'. This property would show up in dladm as
158  * 'vxlan/listen_ip'. This allows different plugins to both use similar names
159  * for similar properties and to also have independent name spaces so that
160  * overlapping names do not conflict with anything else.
161  *
162  * While the kernel combines both sets one and two into a single coherent view,
163  * it does not do anything with respect to the properties that are owned by the
164  * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
165  * charge of bridging these two worlds into one magical experience for the user.
166  * It carries the burden of knowing about both overlay specific and varpd
167  * specific properties. Importantly, we want to maintain this distinction. We
168  * don't want to treat the kernel as an arbitrary key/value store for varpd and
169  * we want the kernel to own its own data and not have to ask userland for
170  * information that it owns.
171  *
172  * Every property in the system has the following attributes:
173  *
174  *   o A name
175  *   o A type
176  *   o A size
177  *   o Permissions
178  *   o Default value
179  *   o Valid value ranges
180  *   o A value
181  *
182  * Everything except for the value is obtained by callers through the propinfo
183  * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
184  * currently 256 bytes.
185  *
186  * The following are the supported types of properties:
187  *
188  *	OVERLAY_PROP_T_INT
189  *
190  *		A signed integer, its length is 8 bytes, corresponding to a
191  *		int64_t.
192  *
193  *	OVERLAY_PROP_T_UINT
194  *
195  *		An unsigned integer, its length is 8 bytes, corresponding to a
196  *		uint64_t.
197  *
198  *	OVERLAY_PROP_T_IP
199  *
200  *		A struct in6_addr, it has a fixed size.
201  *
202  *	OVERLAY_PROP_T_STRING
203  *
204  *		A null-terminated character string encoded in either ASCII or
205  *		UTF-8. Note that the size of the string includes the null
206  *		terminator.
207  *
208  * The next thing that we apply to a property is its permission. The permissions
209  * are put together by the bitwise or of the following flags and values.
210  *
211  *	OVERLAY_PROP_PERM_REQ
212  *
213  *		This indicates a required property. A property that is required
214  *		must be set by a consumer before the device can be created. If a
215  *		required property has a default property, this constraint is
216  *		loosened because the default property defines the value.
217  *
218  *	OVERLAY_PROP_PERM_READ
219  *
220  *		This indicates that a property can be read. All properties will
221  *		have this value set.
222  *
223  *	OVERLAY_PROP_PERM_WRITE
224  *
225  *		This indicates that a property can be written to and thus
226  *		updated by userland. Properties that are only intended to
227  *		display information, will not have OVERLAY_PROP_PERM_WRITE set.
228  *
229  * In addition, a few additional values are defined as a convenience to
230  * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
231  * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second,
232  * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
233  * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
234  * property should generally be a constant across its lifetime.
235  *
236  * A property may optionally have a default value. If it does have a default
237  * value, and that property is not set to be a different value, then the default
238  * value is inherited automatically. It also means that if the default value is
239  * acceptable, there is no need to set the value for a required property. For
240  * example, the vxlan module has the vxlan/listen_port property which is
241  * required, but has a default value of 4789 (the IANA assigned port). Because
242  * of that default value, there is no need for it to be set.
243  *
244  * Finally, a property may declare a list of valid values. These valid values
245  * are used for display purposes, they are not enforced by the broader system,
246  * but merely allow a means for the information to be communicated to the user
247  * through dladm(1M). Like a default value, this is optional.
248  *
249  * The general scaffolding does not do very much with respect to the getting and
250  * setting of properties. That is really owned by the individual plugins
251  * themselves.
252  *
253  * -----------------------------
254  * Destinations and Plugin Types
255  * -----------------------------
256  *
257  * Both encapsulation and lookup plugins define the kinds of destinations that
258  * they know how to support. There are three different pieces of information
259  * that can be used to address a destination currently, all of which are
260  * summarized in the type overlay_point_t. Any combination of these is
261  * supported.
262  *
263  *	OVERLAY_PLUGIN_D_ETHERNET
264  *
265  *		An Ethernet MAC address is required.
266  *
267  *	OVERLAY_PLUGIN_D_IP
268  *
269  *		An IP address is required. All IP addresses used by the overlay
270  *		system are transmitted as IPv6 addresses. IPv4 addresses can be
271  *		represented by using IPv4-mapped IPv6 addresses.
272  *
273  *	OVERLAY_PLUGIN_D_PORT
274  *
275  *		A TCP/UDP port is required.
276  *
277  * A kernel encapsulation plugin declares which of these that it requires, it's
278  * a static set. On the other hand, a userland lookup plugin can be built to
279  * support all of these or any combination thereof. It gets passed the required
280  * destination type, based on the kernel encapsulation method, and then it makes
281  * the determination as to whether or not it supports it. For example, the
282  * direct plugin can support either an IP or both an IP and a port, it simply
283  * doesn't display the direct/dest_port property in the cases where a port is
284  * not required to support this.
285  *
286  * The user lookup plugins have two different modes of operation which
287  * determines how they interact with the broader system and how look ups are
288  * performed. These types are:
289  *
290  *	OVERLAY_TARGET_POINT
291  *
292  *		A point to point plugin has a single static definition for where
293  *		to send all traffic. Every packet in the system always gets sent
294  *		to the exact same destination which is programmed into the
295  *		kernel when the general device is activated.
296  *
297  *	OVERLAY_TARGET_DYNAMIC
298  *
299  *		A dynamic plugin does not have a single static definition.
300  *		Instead, for each destination, the kernel makes an asynchronous
301  *		request to varpd to determine where the packet should be routed,
302  *		and if a specific destination is found, then that destination is
303  *		cached in the overlay device's target cache.
304  *
305  * This distinction, while important for the general overlay device's operation,
306  * is not important to the encapsulation plugins. They don't need to know about
307  * any of these pieces. It's just a concern for varpd, the userland plugin, and
308  * the general overlay scaffolding.
309  *
310  * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
311  * maintain a target cache, and instead just keeps track of the destination and
312  * always sends encapsulated packets to that address. When the target type is of
313  * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
314  * destinations. These destinations are kept around in an instance of a
315  * reference hash that is specific to the given overlay device. Entries in the
316  * cache can be invalidated and replaced by varpd and its lookup plugins.
317  *
318  * ----------------------------------
319  * Kernel Components and Architecture
320  * ----------------------------------
321  *
322  * There are multiple pieces inside the kernel that work together, there is the
323  * general overlay_dev_t structure, which is the logical GLDv3 device, but it
324  * itself has references to things like an instance of an encapsulation plugin,
325  * a pointer to a mux and a target cache. It can roughly be summarized in the
326  * following image:
327  *
328  *     +------------------+
329  *     | global           |
330  *     | overlay list     |
331  *     | overlay_dev_list |
332  *     +------------------+
333  *        |
334  *        |  +-----------------------+            +---------------+
335  *        +->| GLDv3 Device          |----------->| GLDv3 Device  | -> ...
336  *           | overlay_dev_t         |            | overlay_dev_t |
337  *           |                       |            +---------------+
338  *           |                       |
339  *           | mac_handle_t     -----+---> GLDv3 handle to MAC
340  *           | datalink_id_t    -----+---> Datalink ID used by DLS
341  *           | overlay_dev_flag_t ---+---> Device state
342  *           | uint_t           -----+---> Current device MTU
343  *           | uint_t           -----+---> In-progress RX operations
344  *           | uint_t           -----+---> In-progress TX operations
345  *           | char[]           -----+---> FMA degraded message
346  *           | void *           -----+---> plugin private data
347  *           | overlay_target_t * ---+---------------------+
348  *           | overlay_plugin_t * ---+---------+           |
349  *           +-----------------------+         |           |
350  *                           ^                 |           |
351  *   +--------------------+  |                 |           |
352  *   | Kernel Socket      |  |                 |           |
353  *   | Multiplexor        |  |                 |           |
354  *   | overlay_mux_t      |  |                 |           |
355  *   |                    |  |                 |           |
356  *   | avl_tree_t        -+--+                 |           |
357  *   | uint_t            -+--> socket family   |           |
358  *   | uint_t            -+--> socket type     |           |
359  *   | uint_t            -+--> socket protocol |           |
360  *   | ksocket_t         -+--> I/O socket      |           |
361  *   | struct sockaddr * -+--> ksocket address |           |
362  *   | overlay_plugin_t --+--------+           |           |
363  *   +--------------------+        |           |           |
364  *                                 |           |           |
365  *   +-------------------------+   |           |           |
366  *   | Encap Plugin            |<--+-----------+           |
367  *   | overlay_plugin_t        |                           |
368  *   |                         |                           |
369  *   | char *               ---+--> plugin name            |
370  *   | overlay_plugin_ops_t * -+--> plugin downcalls       |
371  *   | char ** (props)      ---+--> property list          |
372  *   | uint_t               ---+--> id length              |
373  *   | overlay_plugin_flags_t -+--> plugin flags           |
374  *   | overlay_plugin_dest_t --+--> destination type       v
375  *   +-------------------------+                    +-------------------------+
376  *                                                  |   Target Cache          |
377  *                                                  |   overlay_target_t      |
378  *                                                  |                         |
379  *                                    cache mode <--+- overlay_target_mode_t  |
380  *                                     dest type <--+- overlay_plugin_dest_t  |
381  *                                   cache flags <--+- overlay_target_flag_t  |
382  *                                     varpd id  <--+- uint64_t               |
383  *                       outstanding varpd reqs. <--+- uint_t                 |
384  *                   OVERLAY_TARGET_POINT state  <--+- overlay_target_point_t |
385  *               OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t   |
386  *                                              |   +-------------------------+
387  *                      +-----------------------+
388  *                      |
389  *                      v
390  *   +-------------------------------+   +------------------------+
391  *   | Target Entry                  |-->| Target Entry           |--> ...
392  *   | overlay_target_entry_t        |   | overlay_target_entry_t |
393  *   |                               |   +------------------------+
394  *   |                               |
395  *   | overlay_target_entry_flags_t -+--> Entry flags
396  *   | uint8_t[ETHERADDRL]        ---+--> Target MAC address
397  *   | overlay_target_point_t     ---+--> Target underlay address
398  *   | mblk_t *                   ---+--> outstanding mblk head
399  *   | mblk_t *                   ---+--> outstanding mblk tail
400  *   | size_t                     ---+--> outstanding mblk size
401  *   +-------------------------------+
402  *
403  * The primary entries that we care about are the overlay_dev_t, which
404  * correspond to each overlay device that is created with dladm(1M). Globally,
405  * these devices are maintained in a simple list_t which is protected with a
406  * lock.  Hence, these include important information such as the mac_handle_t
407  * and a datalink_id_t which is used to interact with the broader MAC and DLS
408  * ecosystem. We also maintain additional information such as the current state,
409  * outstanding operations, the mtu, and importantly, the plugin's private data.
410  * This is the instance of an encapsulation plugin that gets created as part of
411  * creating an overlay device. Another aspect of this is that the overlay_dev_t
412  * also includes information with respect to FMA. For more information, see the
413  * FMA section.
414  *
415  * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
416  * is the encapsulation plugin. This allows the device to make downcalls into it
417  * based on doing things like getting and setting properties. Otherwise, the
418  * plugin itself is a fairly straightforward entity. They are maintained in an
419  * (not pictured above) list. The plugins themselves mostly maintain things like
420  * the static list of properties, what kind of destination they require, and the
421  * operations vector. A given module may contain more if necessary.
422  *
423  * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
424  * maintains a ksocket and it is through the mux that we send and receive
425  * message blocks. The mux represents a socket type and address, as well as a
426  * plugin. Multiple overlay_dev_t devices may then share the same mux. For
427  * example, consider the case where you have different instances of vxlan all on
428  * the same underlay network. These would all logically share the same IP
429  * address and port that packets are sent and received on; however, what differs
430  * is the decapsulation ID.
431  *
432  * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
433  * a socket, we enable a direct callback on the ksocket. This means that
434  * whenever a message block chain is received, rather than sitting there and
435  * getting a callback in a context and kicking that back out to a taskq, the
436  * data instead comes into the callback function overlay_mux_recv().
437  *
438  * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
439  * function) to transmit. It receives encapsulated packets, decapsulates them to
440  * determine the overlay identifier, looks up the given device that matches that
441  * identifier, and then causes the broader MAC world to receive the packet with
442  * a call to mac_rx().
443  *
444  * Today, we don't do too much that's special with the ksocket; however, as
445  * hardware is gaining understanding for these encapsulation protocols, we'll
446  * probably want to think of better ways to get those capabilities passed down
447  * and potentially better ways to program receive filters so they get directly
448  * to us. Though, that's all fantasy future land.
449  *
450  * The next part of the puzzle is the target cache. The purpose of the target
451  * cache is to cache where we should send a packet on the underlay network,
452  * given its mac address. The target cache operates in two modes depending on
453  * whether the lookup module was declared to be OVERLAY_TARGET_POINT or
454  * OVERLAY_TARGET_DYNAMIC.
455  *
456  * In the case where the target cache has been programmed to be
457  * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
458  * which has the destination that we send everything, no matter the destination
459  * mac address.
460  *
461  * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
462  * are much more interesting and as a result, more complicated. We primarily
463  * store lists of overlay_target_entry_t's which are stored in both an avl tree
464  * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
465  * is only used for a few of the target ioctls used to dump data such that we
466  * can get a consistent iteration order for things like dladm show-overlay -t.
467  * The key that we use for the reference hashtable is based on the mac address
468  * in the cache and currently we just do a simple CRC32 to transform it into a
469  * hash.
470  *
471  * Each entry maintains a set of flags to indicate the current status of the
472  * request. The flags may indicate one of three states: that current cache entry
473  * is valid, that the current cache entry has been directed to drop all output,
474  * and that the current cache entry is invalid and may be being looked up. In
475  * the case where it's valid, we just take the destination address and run with
476  * it.
477  *
478  * If it's invalid and a lookup has not been made, then we start the process
479  * that prepares a query that will make its way up to varpd. The cache entry
480  * maintains a message block chain of outstanding message blocks and a
481  * size. These lists are populated only when we don't know the answer as to
482  * where should these be sent. The size entry is used to cap the amount of
483  * outstanding data that we don't know the answer to. If we exceed a cap on the
484  * amount of outstanding data (currently 1 Mb), then we'll drop any additional
485  * packets. Once we get an answer indicating a valid destination, we transmit
486  * any outstanding data to that place. The full story on how we look that up
487  * is discussed in the section on the Target Cache Life Cycle.
488  *
489  * ------------------------
490  * FMA and Degraded Devices
491  * ------------------------
492  *
493  * Every kernel overlay device keeps track of its FMA state. Today in FMA we
494  * cannot represent partitions between resources nor can we represent that a
495  * given minor node of a pseudo device has failed -- if we degrade the overlay
496  * device, then the entire dev_info_t is degraded. However, we still want to be
497  * able to indicate to administrators that things may go wrong.
498  *
499  * To this end, we've added a notion of a degraded state to every overlay
500  * device. This state is primarily dictated by userland and it can happen for
501  * various reasons. Generally, because a userland lookup plugin has been
502  * partitioned, or something has gone wrong such that there is no longer any
503  * userland lookup module for a device, then we'll mark it degraded.
504  *
505  * As long as any of our minor instances is degraded, then we'll fire off the
506  * FMA event to note that. Once the last degraded instance is no longer
507  * degraded, then we'll end up telling FMA that we're all clean.
508  *
509  * To help administrators get a better sense of which of the various minor
510  * devices is wrong, we store the odd_fmamsg[] character array. This character
511  * array can be fetched with doing a dladm show-overlay -f.
512  *
513  * Note, that it's important that we do not update the link status of the
514  * devices. We want to remain up as much as possible. By changing the link in a
515  * degraded state, this may end up making things worse. We may still actually
516  * have information in the target cache and if we mark the link down, that'll
517  * result in not being able to use it. The reason being that this'll mark all
518  * the downstream VNICs down which will go to IP and from there we end up
519  * dealing with sadness.
520  *
521  * -----------------------
522  * Target Cache Life Cycle
523  * -----------------------
524  *
525  * This section only applies when we have a lookup plugin of
526  * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
527  * OVERLAY_TARGET_POINT.
528  *
529  * While we got into the target cache in the general architecture section, it's
530  * worth going into more details as to how this actually works and showing some
531  * examples and state machines. Recall that a target cache entry basically has
532  * the following state transition diagram:
533  *
534  * Initial state
535  *    . . .           . . . first access       . . . varpd lookup enqueued
536  *        .           .                        .
537  *        .           .                        .
538  *     +-------+      .     +----------+       .
539  *     |  No   |------*---->| Invalid  |-------*----+
540  *     | Entry |            |  Entry   |            |
541  *     +-------+            +----------+            |
542  *                 varpd      ^      ^   varpd      |
543  *                 invalidate |      |   drop       |
544  *                      . . . *      * . .          v
545  *          +-------+         |      |         +---------+
546  *          | Entry |--->-----+      +----<----| Entry   |
547  *          | Valid |<----------*---------<----| Pending |->-+     varpd
548  *          +-------+           .              +---------+   * . . drop, but
549  *                              . varpd                ^     |     other queued
550  *                              . success              |     |     entries
551  *                                                     +-----+
552  *
553  * When the table is first created, it is empty. As we attempt to lookup entries
554  * and we find there is no entry at all, we'll create a new table entry for it.
555  * At that point the entry is technically in an invalid state, that means that
556  * we have no valid data from varpd. In that case, we'll go ahead and queue the
557  * packet into the entry's pending chain, and queue a varpd lookup, setting the
558  * OVERLAY_ENTRY_F_PENDING flag in the process.
559  *
560  * If additional mblk_t's come in for this entry, we end up appending them to
561  * the tail of the chain, if and only if, we don't exceed the threshold for the
562  * amount of space they can take up. An entry remains pending until we get a
563  * varpd reply. If varpd replies with a valid result, we move to the valid
564  * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one
565  * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
566  *
567  * Once an entry is valid, it stays valid until user land tells us to invalidate
568  * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and
569  * OVERLAY_TARG_CACHE_SET respectively.
570  *
571  * If the lookup fails with a call to drop the packet, then the next state is
572  * determined by the state of the queue. If the set of outstanding entries is
573  * empty, then we just transition back to the invalid state. If instead, the
574  * set of outstanding entries is not empty, then we'll queue another entry and
575  * stay in the same state, repeating this until the number of requests is
576  * drained.
577  *
578  * The following images describe the flow of a given lookup and where the
579  * overlay_target_entry_t is at any given time.
580  *
581  *     +-------------------+
582  *     | Invalid Entry     |		An entry starts off as an invalid entry
583  *     | de:ad:be:ef:00:00 |		and only exists in the target cache.
584  *     +-------------------+
585  *
586  *	~~~~
587  *
588  *     +---------------------+
589  *     | Global list_t       |		A mblk_t comes in for an entry. We
590  *     | overlay_target_list |		append it to the overlay_target_list.
591  *     +---------------------+
592  *                   |
593  *                   v
594  *             +-------------------+      +-------------------+
595  *             | Pending Entry     |----->| Pending Entry     |--->...
596  *             | 42:5e:1a:10:d6:2d |      | de:ad:be:ef:00:00 |
597  *             +-------------------+      +-------------------+
598  *
599  *	~~~~
600  *
601  *     +--------------------------+
602  *     | /dev/overlay minor state |	User land said that it would look up an
603  *     | overlay_target_hdl_t     |	entry for us. We remove it from the
604  *     +--------------------------+	global list and add it to the handle's
605  *                  |			outstanding list.
606  *                  |
607  *                  v
608  *            +-------------------+      +-------------------+
609  *            | Pending Entry     |----->| Pending Entry     |
610  *            | 90:b8:d0:79:02:dd |      | de:ad:be:ef:00:00 |
611  *            +-------------------+      +-------------------+
612  *
613  *	~~~~
614  *
615  *     +-------------------+
616  *     | Valid Entry       |		varpd returned an answer with
617  *     | de:ad:be:ef:00:00 |		OVERLAY_TARG_RESPOND and the target cache
618  *     | 10.169.23.42:4789 |		entry is now populated with a
619  *     +-------------------+		destination and marked as valid
620  *
621  *
622  * The lookup mechanism is performed via a series of operations on the character
623  * pseudo-device /dev/overlay. The only thing that uses this device is the
624  * userland daemon varpd. /dev/overlay is a cloneable device, each open of it
625  * granting a new minor number which maintains its own state. We maintain this
626  * state so that way if an outstanding lookup was queued to something that
627  * crashed or closed its handle without responding, we can know about this and
628  * thus handle it appropriately.
629  *
630  * When a lookup is first created it's added to our global list of outstanding
631  * lookups. To service requests, userland is required to perform an ioctl to ask
632  * for a request. We will block it in the kernel a set amount of time waiting
633  * for a request. When we give a request to a given minor instance of the
634  * device, we remove it from the global list and append the request to the
635  * device's list of outstanding entries, for the reasons we discussed above.
636  * When a lookup comes in, we give user land a smaller amount of information
637  * specific to that packet, the overlay_targ_lookup_t. It includes a request id
638  * to identify this, and then the overlay id, the varpd id, the header and
639  * packet size, the source and destination mac address, the SAP, and any
640  * potential VLAN header.
641  *
642  * At that point, it stays in that outstanding list until one of two ioctls are
643  * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time,
644  * userland may also perform other operations. For example, it may use
645  * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth
646  * analysis of what to do beyond what we gave it initially. This is useful for
647  * providing proxy arp and the like. Finally, there are two other ioctls that
648  * varpd can issue: OVERLAY_TARG_INJECT, which injects the non-jumbo frame
649  * packet up into that mac device, and OVERLAY_TARG_RESEND, which causes us to
650  * encapsulate and send out the packet they've given us.
651  *
652  *
653  * Finally, through the target cache, several ioctls are provided to allow for
654  * interrogation and management of the cache. They allow for individual entries
655  * to be retrieved, set, or have the entire table flushed. For the full set of
656  * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h.
657  *
658  * ------------------
659  * Sample Packet Flow
660  * ------------------
661  *
662  * There are a lot of pieces here; hopefully an example of how this all fits
663  * together will help clarify and elucidate what's going on. We're going to
664  * first track an outgoing packet, eg. one that is sent from an IP interface on
665  * a VNIC on top of an overlay device, and then we'll look at what it means to
666  * respond to that.
667  *
668  *
669  *    +----------------+        +--------------+            +------------------+
670  *    | IP/DLS send    |------->| MAC sends it |----------->| mblk_t reaches   |
671  *    | packet to MAC  |        | to the GLDv3 |            | overlay GLDv3 tx |
672  *    +----------------+        | VNIC device  |            | overlay_m_tx()   |
673  *                              +--------------+            +------------------+
674  *                                                                   |
675  *                             . lookup              . cache         |
676  *                             . drop                . miss          v
677  *            +---------+      .       +--------+    .      +------------------+
678  *            | freemsg |<-----*-------| varpd  |<---*------| Lookup each mblk |
679  *            | mblk_t  |              | lookup |           | in the target    |
680  *            +---------+              | queued |           | cache            |
681  *                ^                    +--------+           +------------------+
682  *      on send   |                        |                         |     cache
683  *      error . . *                        *. . lookup               * . . hit
684  *                |                        |    success              v
685  *                |                        |                +------------------+
686  *    +-----------------+                  +--------------->| call plugin      |
687  *    | Send out        |                                   | ovpo_encap() to  |
688  *    | overlay_mux_t's |<----------------------------------| get encap mblk_t |
689  *    | ksocket         |                                   +------------------+
690  *    +-----------------+
691  *
692  * The receive end point looks a little different and looks more like:
693  *
694  *  +------------------+     +----------------+    +-----------+
695  *  | mblk_t comes off |---->| enter netstack |--->| delivered |---+
696  *  | the physical     |     | IP stack       |    |     to    |   * . . direct
697  *  | device           |     +----------------+    |  ksocket  |   |   callback
698  *  +------------------+                           +-----------+   |
699  *                       . overlay id                              |
700  *                       . not found                               v
701  *       +-----------+   .      +-----------------+       +--------------------+
702  *       | freemsg   |<--*------| call plugin     |<------| overlay_mux_recv() |
703  *       | mblk_t    |          | ovpo_decap() to |       +--------------------+
704  *       +-----------+          | decap mblk_t    |
705  *                              +-----------------+
706  *                                     |
707  *                                     * . . overlay id
708  *                                     v     found
709  *                                 +--------+      +----------------+
710  *                                 | adjust |----->| call mac_rx    |
711  *                                 | mblk_t |      | on original    |
712  *                                 +--------+      | decaped packet |
713  *                                                 +----------------+
714  *
715  * ------------------
716  * Netstack Awareness
717  * ------------------
718  *
719  * In the above image we note that this enters a netstack. Today the only
720  * netstack it can be is the global zone's as the overlay driver itself is not
721  * exactly netstack aware. What this really means is that varpd cannot run in a
722  * non-global zone and an overlay device cannot belong to a non-global zone.
723  * Non-global zones can still have a VNIC assigned to them that's been created
724  * over the overlay device the same way they would if it had been created over
725  * an etherstub or a physical device.
726  *
727  * The majority of the work to make it netstack aware is straightforward and the
728  * biggest thing is to create a netstack module that allows us to hook into
729  * netstack (and thus zone) creation and destruction.  From there, we need to
730  * amend the target cache lookup routines that we discussed earlier to not have
731  * a global outstanding list and a global list of handles, but rather, one per
732  * netstack.
733  *
734  * For the mux, we'll need to open the ksocket in the context of the zone, we
735  * can likely do this with a properly composed credential, but we'll need to do
736  * some more work on that path. Finally, we'll want to make sure the dld ioctls
737  * are aware of the zoneid of the caller and we use that appropriately and store
738  * it in the overlay_dev_t.
739  *
740  * -----------
741  * GLDv3 Notes
742  * -----------
743  *
744  * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
745  * relevant and other parts are much less relevant for us. For example, the
746  * GLDv3 is used to toggle the device being put into and out of promiscuous
747  * mode, to program MAC addresses for unicast and multicast hardware filters.
748  * Today, an overlay device doesn't have a notion of promiscuous mode nor does
749  * it have a notion of unicast and multicast addresses programmed into the
750  * device. Instead, for the purposes of the hardware filter, we don't do
751  * anything and just always accept new addresses being added and removed.
752  *
753  * If the GLDv3 start function has not been called, then we will not use this
754  * device for I/O purposes. Any calls to transmit or receive should be dropped,
755  * though the GLDv3 guarantees us that transmit will not be called without
756  * calling start. Similarly, once stop is called, then no packets can be dealt
757  * with.
758  *
759  * Today we don't support the stat interfaces, though there's no good reason
760  * that we shouldn't assemble some of the stats based on what we have in the
761  * future.
762  *
763  * When it comes to link properties, many of the traditional link properties do
764  * not apply and many others MAC handles for us. For example, we don't need to
765  * implement anything for overlay_m_getprop() to deal with returning the MTU, as
766  * MAC never calls into us for that. As such, there isn't much of anything to
767  * support in terms of properties.
768  *
769  * Today, we don't support any notion of hardware capabilities. However, if
770  * future NIC hardware or other changes to the system cause it to make sense for
771  * us to emulate logical groups, then we should do that. However, we still do
772  * implement a capab function so that we can identify ourselves as an overlay
773  * device to the broader MAC framework. This is done mostly so that a device
774  * created on top of us can have fanout rings as we don't try to lie about a
775  * speed for our device.
776  *
777  * The other question is what should be done for a device's MTU and margin. We
778  * set our minimum supported MTU to be the minimum value that an IP network may
779  * be set to, 576 -- which mimics what an etherstub does. On the flip side, we
780  * have our upper bound set to 8900. This value comes from the fact that a lot
781  * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
782  * bytes, which isn't exactly the most accurate number, but it'll be good enough
783  * for now. Because of that, our default MTU off of these devices is 1400, as
784  * the default MTU for everything is usually 1500 or whatever the underlying
785  * device is at; however, this is a bit simpler than asking the netstack what
786  * are all the IP interfaces at. It also calls into question how PMTU and PMTU
787  * discovery should work here. The challenge, especially for
788  * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's
789  * not clear that if you have a single bad entry that the overall MTU should be
790  * lowered. Instead, we should figure out a better way of determining these
791  * kinds of PMTU errors and appropriately alerting the administrator via FMA.
792  *
793  * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
794  * or not the underlying encapsulation device supports VLAN tags. If it does,
795  * then we'll set the margin to allow for it, otherwise, we will not.
796  */
797 
798 #include <sys/conf.h>
799 #include <sys/errno.h>
800 #include <sys/stat.h>
801 #include <sys/ddi.h>
802 #include <sys/sunddi.h>
803 #include <sys/modctl.h>
804 #include <sys/policy.h>
805 #include <sys/stream.h>
806 #include <sys/strsubr.h>
807 #include <sys/strsun.h>
808 #include <sys/types.h>
809 #include <sys/kmem.h>
810 #include <sys/param.h>
811 #include <sys/sysmacros.h>
812 #include <sys/ddifm.h>
813 
814 #include <sys/dls.h>
815 #include <sys/dld_ioc.h>
816 #include <sys/mac_provider.h>
817 #include <sys/mac_client_priv.h>
818 #include <sys/mac_ether.h>
819 #include <sys/vlan.h>
820 
821 #include <sys/overlay_impl.h>
822 
823 dev_info_t *overlay_dip;
824 static kmutex_t overlay_dev_lock;
825 static list_t overlay_dev_list;
826 static uint8_t overlay_macaddr[ETHERADDRL] =
827 	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
828 
829 typedef enum overlay_dev_prop {
830 	OVERLAY_DEV_P_MTU = 0,
831 	OVERLAY_DEV_P_VNETID,
832 	OVERLAY_DEV_P_ENCAP,
833 	OVERLAY_DEV_P_VARPDID
834 } overlay_dev_prop_t;
835 
836 #define	OVERLAY_DEV_NPROPS	4
837 static const char *overlay_dev_props[] = {
838 	"mtu",
839 	"vnetid",
840 	"encap",
841 	"varpd/id"
842 };
843 
844 #define	OVERLAY_MTU_MIN	576
845 #define	OVERLAY_MTU_DEF	1400
846 #define	OVERLAY_MTU_MAX	8900
847 
848 overlay_dev_t *
849 overlay_hold_by_dlid(datalink_id_t id)
850 {
851 	overlay_dev_t *o;
852 
853 	mutex_enter(&overlay_dev_lock);
854 	for (o = list_head(&overlay_dev_list); o != NULL;
855 	    o = list_next(&overlay_dev_list, o)) {
856 		if (id == o->odd_linkid) {
857 			mutex_enter(&o->odd_lock);
858 			o->odd_ref++;
859 			mutex_exit(&o->odd_lock);
860 			mutex_exit(&overlay_dev_lock);
861 			return (o);
862 		}
863 	}
864 
865 	mutex_exit(&overlay_dev_lock);
866 	return (NULL);
867 }
868 
869 void
870 overlay_hold_rele(overlay_dev_t *odd)
871 {
872 	mutex_enter(&odd->odd_lock);
873 	ASSERT(odd->odd_ref > 0);
874 	odd->odd_ref--;
875 	mutex_exit(&odd->odd_lock);
876 }
877 
878 void
879 overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
880 {
881 	ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
882 	ASSERT(MUTEX_HELD(&odd->odd_lock));
883 
884 	if (flag & OVERLAY_F_IN_RX)
885 		odd->odd_rxcount++;
886 	if (flag & OVERLAY_F_IN_TX)
887 		odd->odd_txcount++;
888 	odd->odd_flags |= flag;
889 }
890 
891 void
892 overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
893 {
894 	boolean_t signal = B_FALSE;
895 
896 	ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
897 	ASSERT(MUTEX_HELD(&odd->odd_lock));
898 
899 	if (flag & OVERLAY_F_IN_RX) {
900 		ASSERT(odd->odd_rxcount > 0);
901 		odd->odd_rxcount--;
902 		if (odd->odd_rxcount == 0) {
903 			signal = B_TRUE;
904 			odd->odd_flags &= ~OVERLAY_F_IN_RX;
905 		}
906 	}
907 	if (flag & OVERLAY_F_IN_TX) {
908 		ASSERT(odd->odd_txcount > 0);
909 		odd->odd_txcount--;
910 		if (odd->odd_txcount == 0) {
911 			signal = B_TRUE;
912 			odd->odd_flags &= ~OVERLAY_F_IN_TX;
913 		}
914 	}
915 
916 	if (signal == B_TRUE)
917 		cv_broadcast(&odd->odd_iowait);
918 }
919 
920 static void
921 overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
922 {
923 	ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
924 	ASSERT(MUTEX_HELD(&odd->odd_lock));
925 
926 	while (odd->odd_flags & flag) {
927 		cv_wait(&odd->odd_iowait, &odd->odd_lock);
928 	}
929 }
930 
931 void
932 overlay_dev_iter(overlay_dev_iter_f func, void *arg)
933 {
934 	overlay_dev_t *odd;
935 
936 	mutex_enter(&overlay_dev_lock);
937 	for (odd = list_head(&overlay_dev_list); odd != NULL;
938 	    odd = list_next(&overlay_dev_list, odd)) {
939 		if (func(odd, arg) != 0) {
940 			mutex_exit(&overlay_dev_lock);
941 			return;
942 		}
943 	}
944 	mutex_exit(&overlay_dev_lock);
945 }
946 
947 /* ARGSUSED */
948 static int
949 overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
950 {
951 	return (ENOTSUP);
952 }
953 
954 static int
955 overlay_m_start(void *arg)
956 {
957 	overlay_dev_t *odd = arg;
958 	overlay_mux_t *mux;
959 	int ret, domain, family, prot;
960 	struct sockaddr_storage storage;
961 	socklen_t slen;
962 
963 	mutex_enter(&odd->odd_lock);
964 	if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) {
965 		mutex_exit(&odd->odd_lock);
966 		return (EAGAIN);
967 	}
968 	mutex_exit(&odd->odd_lock);
969 
970 	ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
971 	    &family, &prot, (struct sockaddr *)&storage, &slen);
972 	if (ret != 0)
973 		return (ret);
974 
975 	mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
976 	    (struct sockaddr *)&storage, slen, &ret);
977 	if (mux == NULL)
978 		return (ret);
979 
980 	overlay_mux_add_dev(mux, odd);
981 	odd->odd_mux = mux;
982 	mutex_enter(&odd->odd_lock);
983 	ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
984 	odd->odd_flags |= OVERLAY_F_IN_MUX;
985 	mutex_exit(&odd->odd_lock);
986 
987 	return (0);
988 }
989 
990 static void
991 overlay_m_stop(void *arg)
992 {
993 	overlay_dev_t *odd = arg;
994 
995 	/*
996 	 * The MAC Perimeter is held here, so we don't have to worry about
997 	 * synchronizing this with respect to metadata operations.
998 	 */
999 	mutex_enter(&odd->odd_lock);
1000 	VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
1001 	VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
1002 	odd->odd_flags |= OVERLAY_F_MDDROP;
1003 	overlay_io_wait(odd, OVERLAY_F_IOMASK);
1004 	mutex_exit(&odd->odd_lock);
1005 
1006 	overlay_mux_remove_dev(odd->odd_mux, odd);
1007 	overlay_mux_close(odd->odd_mux);
1008 	odd->odd_mux = NULL;
1009 
1010 	mutex_enter(&odd->odd_lock);
1011 	odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1012 	odd->odd_flags &= ~OVERLAY_F_MDDROP;
1013 	VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
1014 	mutex_exit(&odd->odd_lock);
1015 }
1016 
1017 /*
1018  * For more info on this, see the big theory statement.
1019  */
1020 /* ARGSUSED */
1021 static int
1022 overlay_m_promisc(void *arg, boolean_t on)
1023 {
1024 	return (0);
1025 }
1026 
1027 /*
1028  * For more info on this, see the big theory statement.
1029  */
1030 /* ARGSUSED */
1031 static int
1032 overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
1033 {
1034 	return (0);
1035 }
1036 
/*
 * GLDv3 unicast address entry point. No unicast address is programmed into an
 * overlay device (see the GLDv3 notes in the big theory statement), so the
 * new address is accepted and nothing is done.
 */
/* ARGSUSED */
static int
overlay_m_unicast(void *arg, const uint8_t *macaddr)
{
	return (0);
}
1046 
1047 mblk_t *
1048 overlay_m_tx(void *arg, mblk_t *mp_chain)
1049 {
1050 	overlay_dev_t *odd = arg;
1051 	mblk_t *mp, *ep;
1052 	int ret;
1053 	ovep_encap_info_t einfo;
1054 	struct msghdr hdr;
1055 
1056 	mutex_enter(&odd->odd_lock);
1057 	if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
1058 	    !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1059 		mutex_exit(&odd->odd_lock);
1060 		freemsgchain(mp_chain);
1061 		return (NULL);
1062 	}
1063 	overlay_io_start(odd, OVERLAY_F_IN_TX);
1064 	mutex_exit(&odd->odd_lock);
1065 
1066 	bzero(&hdr, sizeof (struct msghdr));
1067 
1068 	bzero(&einfo, sizeof (ovep_encap_info_t));
1069 	einfo.ovdi_id = odd->odd_vid;
1070 	mp = mp_chain;
1071 	while (mp != NULL) {
1072 		socklen_t slen;
1073 		struct sockaddr_storage storage;
1074 
1075 		mp_chain = mp->b_next;
1076 		mp->b_next = NULL;
1077 		ep = NULL;
1078 
1079 		ret = overlay_target_lookup(odd, mp,
1080 		    (struct sockaddr *)&storage, &slen);
1081 		if (ret != OVERLAY_TARGET_OK) {
1082 			if (ret == OVERLAY_TARGET_DROP)
1083 				freemsg(mp);
1084 			mp = mp_chain;
1085 			continue;
1086 		}
1087 
1088 		hdr.msg_name = &storage;
1089 		hdr.msg_namelen = slen;
1090 
1091 		ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
1092 		    &einfo, &ep);
1093 		if (ret != 0 || ep == NULL) {
1094 			freemsg(mp);
1095 			goto out;
1096 		}
1097 
1098 		ASSERT(ep->b_cont == mp || ep == mp);
1099 		ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
1100 		if (ret != 0)
1101 			goto out;
1102 
1103 		mp = mp_chain;
1104 	}
1105 
1106 out:
1107 	mutex_enter(&odd->odd_lock);
1108 	overlay_io_done(odd, OVERLAY_F_IN_TX);
1109 	mutex_exit(&odd->odd_lock);
1110 	return (mp_chain);
1111 }
1112 
1113 /* ARGSUSED */
1114 static void
1115 overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1116 {
1117 	miocnak(q, mp, 0, ENOTSUP);
1118 }
1119 
1120 /* ARGSUSED */
1121 static boolean_t
1122 overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1123 {
1124 	/*
1125 	 * Tell MAC we're an overlay.
1126 	 */
1127 	if (cap == MAC_CAPAB_OVERLAY)
1128 		return (B_TRUE);
1129 	return (B_FALSE);
1130 }
1131 
1132 /* ARGSUSED */
1133 static int
1134 overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1135     uint_t pr_valsize, const void *pr_val)
1136 {
1137 	uint32_t mtu, old;
1138 	int err;
1139 	overlay_dev_t *odd = arg;
1140 
1141 	if (pr_num != MAC_PROP_MTU)
1142 		return (ENOTSUP);
1143 
1144 	bcopy(pr_val, &mtu, sizeof (mtu));
1145 	if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX)
1146 		return (EINVAL);
1147 
1148 	mutex_enter(&odd->odd_lock);
1149 	old = odd->odd_mtu;
1150 	odd->odd_mtu = mtu;
1151 	err = mac_maxsdu_update(odd->odd_mh, mtu);
1152 	if (err != 0)
1153 		odd->odd_mtu = old;
1154 	mutex_exit(&odd->odd_lock);
1155 
1156 	return (err);
1157 }
1158 
1159 /* ARGSUSED */
1160 static int
1161 overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1162     uint_t pr_valsize, void *pr_val)
1163 {
1164 	return (ENOTSUP);
1165 }
1166 
1167 /* ARGSUSED */
1168 static void
1169 overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1170     mac_prop_info_handle_t prh)
1171 {
1172 	if (pr_num != MAC_PROP_MTU)
1173 		return;
1174 
1175 	mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
1176 	mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX);
1177 }
1178 
1179 static mac_callbacks_t overlay_m_callbacks = {
1180 	.mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
1181 	    MC_PROPINFO),
1182 	.mc_getstat = overlay_m_stat,
1183 	.mc_start = overlay_m_start,
1184 	.mc_stop = overlay_m_stop,
1185 	.mc_setpromisc = overlay_m_promisc,
1186 	.mc_multicst = overlay_m_multicast,
1187 	.mc_unicst = overlay_m_unicast,
1188 	.mc_tx = overlay_m_tx,
1189 	.mc_ioctl = overlay_m_ioctl,
1190 	.mc_getcapab = overlay_m_getcapab,
1191 	.mc_getprop = overlay_m_getprop,
1192 	.mc_setprop = overlay_m_setprop,
1193 	.mc_propinfo = overlay_m_propinfo
1194 };
1195 
1196 static boolean_t
1197 overlay_valid_name(const char *name, size_t buflen)
1198 {
1199 	size_t actlen;
1200 	int err, i;
1201 
1202 	for (i = 0; i < buflen; i++) {
1203 		if (name[i] == '\0')
1204 			break;
1205 	}
1206 
1207 	if (i == 0 || i == buflen)
1208 		return (B_FALSE);
1209 	actlen = i;
1210 	if (strchr(name, '/') != NULL)
1211 		return (B_FALSE);
1212 	if (u8_validate((char *)name, actlen, NULL,
1213 	    U8_VALIDATE_ENTIRE, &err) < 0)
1214 		return (B_FALSE);
1215 	return (B_TRUE);
1216 }
1217 
1218 /* ARGSUSED */
1219 static int
1220 overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1221 {
1222 	int err;
1223 	uint64_t maxid;
1224 	overlay_dev_t *odd, *o;
1225 	mac_register_t *mac;
1226 	overlay_ioc_create_t *oicp = karg;
1227 
1228 	if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
1229 		return (EINVAL);
1230 
1231 	odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
1232 	odd->odd_linkid = oicp->oic_linkid;
1233 	odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
1234 	if (odd->odd_plugin == NULL) {
1235 		kmem_free(odd, sizeof (overlay_dev_t));
1236 		return (ENOENT);
1237 	}
1238 	err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
1239 	    &odd->odd_pvoid);
1240 	if (err != 0) {
1241 		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1242 		overlay_plugin_rele(odd->odd_plugin);
1243 		kmem_free(odd, sizeof (overlay_dev_t));
1244 		return (EINVAL);
1245 	}
1246 
1247 	/*
1248 	 * Make sure that our virtual network id is valid for the given plugin
1249 	 * that we're working with.
1250 	 */
1251 	ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1252 	maxid = UINT64_MAX;
1253 	if (odd->odd_plugin->ovp_id_size != 8)
1254 		maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
1255 	if (oicp->oic_vnetid > maxid) {
1256 		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1257 		overlay_plugin_rele(odd->odd_plugin);
1258 		kmem_free(odd, sizeof (overlay_dev_t));
1259 		return (EINVAL);
1260 	}
1261 	odd->odd_vid = oicp->oic_vnetid;
1262 
1263 	mac = mac_alloc(MAC_VERSION);
1264 	if (mac == NULL) {
1265 		mutex_exit(&overlay_dev_lock);
1266 		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1267 		overlay_plugin_rele(odd->odd_plugin);
1268 		kmem_free(odd, sizeof (overlay_dev_t));
1269 		return (EINVAL);
1270 	}
1271 
1272 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1273 	mac->m_driver = odd;
1274 	mac->m_dip = overlay_dip;
1275 	mac->m_dst_addr = NULL;
1276 	mac->m_callbacks = &overlay_m_callbacks;
1277 	mac->m_pdata = NULL;
1278 	mac->m_pdata_size = 0;
1279 
1280 	mac->m_priv_props = NULL;
1281 
1282 	/* Let mac handle this itself. */
1283 	mac->m_instance = (uint_t)-1;
1284 
1285 	/*
1286 	 * There is no real source address that should be used here, but saying
1287 	 * that we're not ethernet is going to cause its own problems. At the
1288 	 * end of the say, this is fine.
1289 	 */
1290 	mac->m_src_addr = overlay_macaddr;
1291 
1292 	/*
1293 	 * Start with the default MTU as the max SDU. If the MTU is changed, the
1294 	 * SDU will be changed to reflect that.
1295 	 */
1296 	mac->m_min_sdu = 1;
1297 	mac->m_max_sdu = OVERLAY_MTU_DEF;
1298 	mac->m_multicast_sdu = 0;
1299 
1300 	/*
1301 	 * The underlying device doesn't matter, instead this comes from the
1302 	 * encapsulation protocol and whether or not they allow VLAN tags.
1303 	 */
1304 	if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
1305 		mac->m_margin = VLAN_TAGSZ;
1306 	} else {
1307 		mac->m_margin = 0;
1308 	}
1309 
1310 	/*
1311 	 * Today, we have no MAC virtualization, it may make sense in the future
1312 	 * to go ahead and emulate some subset of this, but it doesn't today.
1313 	 */
1314 	mac->m_v12n = MAC_VIRT_NONE;
1315 
1316 	mutex_enter(&overlay_dev_lock);
1317 	for (o = list_head(&overlay_dev_list); o != NULL;
1318 	    o = list_next(&overlay_dev_list, o)) {
1319 		if (o->odd_linkid == oicp->oic_linkid) {
1320 			mutex_exit(&overlay_dev_lock);
1321 			odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1322 			overlay_plugin_rele(odd->odd_plugin);
1323 			kmem_free(odd, sizeof (overlay_dev_t));
1324 			return (EEXIST);
1325 		}
1326 
1327 		if (o->odd_vid == oicp->oic_vnetid &&
1328 		    o->odd_plugin == odd->odd_plugin) {
1329 			mutex_exit(&overlay_dev_lock);
1330 			odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1331 			overlay_plugin_rele(odd->odd_plugin);
1332 			kmem_free(odd, sizeof (overlay_dev_t));
1333 			return (EEXIST);
1334 		}
1335 	}
1336 
1337 	err = mac_register(mac, &odd->odd_mh);
1338 	mac_free(mac);
1339 	if (err != 0) {
1340 		mutex_exit(&overlay_dev_lock);
1341 		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1342 		overlay_plugin_rele(odd->odd_plugin);
1343 		kmem_free(odd, sizeof (overlay_dev_t));
1344 		return (err);
1345 	}
1346 
1347 	err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1348 	    crgetzoneid(cred));
1349 	if (err != 0) {
1350 		mutex_exit(&overlay_dev_lock);
1351 		(void) mac_unregister(odd->odd_mh);
1352 		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1353 		overlay_plugin_rele(odd->odd_plugin);
1354 		kmem_free(odd, sizeof (overlay_dev_t));
1355 		return (err);
1356 	}
1357 
1358 	mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
1359 	cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
1360 	odd->odd_ref = 0;
1361 	odd->odd_flags = 0;
1362 	list_insert_tail(&overlay_dev_list, odd);
1363 	mutex_exit(&overlay_dev_lock);
1364 
1365 	return (0);
1366 }
1367 
1368 /* ARGSUSED */
1369 static int
1370 overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1371 {
1372 	int i, ret;
1373 	overlay_dev_t *odd;
1374 	mac_perim_handle_t mph;
1375 	overlay_ioc_activate_t *oiap = karg;
1376 	overlay_ioc_propinfo_t *infop;
1377 	overlay_ioc_prop_t *oip;
1378 	overlay_prop_handle_t phdl;
1379 
1380 	odd = overlay_hold_by_dlid(oiap->oia_linkid);
1381 	if (odd == NULL)
1382 		return (ENOENT);
1383 
1384 	infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
1385 	oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
1386 	phdl = (overlay_prop_handle_t)infop;
1387 
1388 	mac_perim_enter_by_mh(odd->odd_mh, &mph);
1389 	mutex_enter(&odd->odd_lock);
1390 	if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1391 		mutex_exit(&odd->odd_lock);
1392 		mac_perim_exit(mph);
1393 		overlay_hold_rele(odd);
1394 		kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1395 		kmem_free(oip, sizeof (overlay_ioc_prop_t));
1396 		return (EEXIST);
1397 	}
1398 	mutex_exit(&odd->odd_lock);
1399 
1400 	for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1401 		const char *pname = odd->odd_plugin->ovp_props[i];
1402 		bzero(infop, sizeof (overlay_ioc_propinfo_t));
1403 		overlay_prop_init(phdl);
1404 		ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
1405 		if (ret != 0) {
1406 			mac_perim_exit(mph);
1407 			overlay_hold_rele(odd);
1408 			kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1409 			kmem_free(oip, sizeof (overlay_ioc_prop_t));
1410 			return (ret);
1411 		}
1412 
1413 		if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
1414 			continue;
1415 		bzero(oip, sizeof (overlay_ioc_prop_t));
1416 		oip->oip_size = sizeof (oip->oip_value);
1417 		ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1418 		    pname, oip->oip_value, &oip->oip_size);
1419 		if (ret != 0) {
1420 			mac_perim_exit(mph);
1421 			overlay_hold_rele(odd);
1422 			kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1423 			kmem_free(oip, sizeof (overlay_ioc_prop_t));
1424 			return (ret);
1425 		}
1426 		if (oip->oip_size == 0) {
1427 			mac_perim_exit(mph);
1428 			overlay_hold_rele(odd);
1429 			kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1430 			kmem_free(oip, sizeof (overlay_ioc_prop_t));
1431 			return (EINVAL);
1432 		}
1433 	}
1434 
1435 	mutex_enter(&odd->odd_lock);
1436 	if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
1437 		mutex_exit(&odd->odd_lock);
1438 		mac_perim_exit(mph);
1439 		overlay_hold_rele(odd);
1440 		kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1441 		kmem_free(oip, sizeof (overlay_ioc_prop_t));
1442 		return (ENXIO);
1443 	}
1444 
1445 	ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
1446 	odd->odd_flags |= OVERLAY_F_ACTIVATED;
1447 
1448 	/*
1449 	 * Now that we've activated ourselves, we should indicate to the world
1450 	 * that we're up. Note that we may not be able to perform lookups at
1451 	 * this time, but our notion of being 'up' isn't dependent on that
1452 	 * ability.
1453 	 */
1454 	mac_link_update(odd->odd_mh, LINK_STATE_UP);
1455 	mutex_exit(&odd->odd_lock);
1456 
1457 	mac_perim_exit(mph);
1458 	overlay_hold_rele(odd);
1459 	kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1460 	kmem_free(oip, sizeof (overlay_ioc_prop_t));
1461 
1462 	return (0);
1463 }
1464 
1465 /* ARGSUSED */
1466 static int
1467 overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1468 {
1469 	overlay_ioc_delete_t *oidp = karg;
1470 	overlay_dev_t *odd;
1471 	datalink_id_t tid;
1472 	int ret;
1473 
1474 	odd = overlay_hold_by_dlid(oidp->oid_linkid);
1475 	if (odd == NULL) {
1476 		return (ENOENT);
1477 	}
1478 
1479 	mutex_enter(&odd->odd_lock);
1480 	/* If we're not the only hold, we're busy */
1481 	if (odd->odd_ref != 1) {
1482 		mutex_exit(&odd->odd_lock);
1483 		overlay_hold_rele(odd);
1484 		return (EBUSY);
1485 	}
1486 
1487 	if (odd->odd_flags & OVERLAY_F_IN_MUX) {
1488 		mutex_exit(&odd->odd_lock);
1489 		overlay_hold_rele(odd);
1490 		return (EBUSY);
1491 	}
1492 
1493 	/*
1494 	 * To remove this, we need to first remove it from dls and then remove
1495 	 * it from mac. The act of removing it from mac will check if there are
1496 	 * devices on top of this, eg. vnics. If there are, then that will fail
1497 	 * and we'll have to go through and recreate the dls entry. Only after
1498 	 * mac_unregister has succeeded, then we'll go through and actually free
1499 	 * everything and drop the dev lock.
1500 	 */
1501 	ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
1502 	if (ret != 0) {
1503 		overlay_hold_rele(odd);
1504 		return (ret);
1505 	}
1506 
1507 	ASSERT(oidp->oid_linkid == tid);
1508 	ret = mac_disable(odd->odd_mh);
1509 	if (ret != 0) {
1510 		(void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1511 		    crgetzoneid(cred));
1512 		overlay_hold_rele(odd);
1513 		return (ret);
1514 	}
1515 
1516 	overlay_target_quiesce(odd->odd_target);
1517 
1518 	mutex_enter(&overlay_dev_lock);
1519 	list_remove(&overlay_dev_list, odd);
1520 	mutex_exit(&overlay_dev_lock);
1521 
1522 	cv_destroy(&odd->odd_iowait);
1523 	mutex_destroy(&odd->odd_lock);
1524 	overlay_target_free(odd);
1525 	odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1526 	overlay_plugin_rele(odd->odd_plugin);
1527 	kmem_free(odd, sizeof (overlay_dev_t));
1528 
1529 	return (0);
1530 }
1531 
1532 /* ARGSUSED */
1533 static int
1534 overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
1535     int *rvalp)
1536 {
1537 	overlay_dev_t *odd;
1538 	overlay_ioc_nprops_t *on = karg;
1539 
1540 	odd = overlay_hold_by_dlid(on->oipn_linkid);
1541 	if (odd == NULL)
1542 		return (ENOENT);
1543 	on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS;
1544 	overlay_hold_rele(odd);
1545 
1546 	return (0);
1547 }
1548 
1549 static int
1550 overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
1551 {
1552 	overlay_prop_handle_t phdl = arg;
1553 	overlay_prop_set_range_str(phdl, opp->ovp_name);
1554 	return (0);
1555 }
1556 
1557 static int
1558 overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
1559 {
1560 	int i;
1561 
1562 	for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1563 		if (strcmp(overlay_dev_props[i], name) == 0) {
1564 			*id = i;
1565 			return (0);
1566 		}
1567 	}
1568 
1569 	for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1570 		if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) {
1571 			*id = i + OVERLAY_DEV_NPROPS;
1572 			return (0);
1573 		}
1574 	}
1575 
1576 	return (ENOENT);
1577 }
1578 
1579 static void
1580 overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
1581 {
1582 	uint32_t def;
1583 	mac_propval_range_t range;
1584 	uint_t perm;
1585 
1586 	ASSERT(MAC_PERIM_HELD(odd->odd_mh));
1587 
1588 	bzero(&range, sizeof (mac_propval_range_t));
1589 	range.mpr_count = 1;
1590 	if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
1591 	    sizeof (def), &range, &perm) != 0)
1592 		return;
1593 
1594 	if (perm == MAC_PROP_PERM_READ)
1595 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1596 	else if (perm == MAC_PROP_PERM_WRITE)
1597 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
1598 	else if (perm == MAC_PROP_PERM_RW)
1599 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1600 
1601 	overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1602 	overlay_prop_set_default(phdl, &def, sizeof (def));
1603 	overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
1604 	    range.mpr_range_uint32[0].mpur_max);
1605 }
1606 
1607 /* ARGSUSED */
1608 static int
1609 overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
1610     int *rvalp)
1611 {
1612 	overlay_dev_t *odd;
1613 	int ret;
1614 	mac_perim_handle_t mph;
1615 	uint_t propid = UINT_MAX;
1616 	overlay_ioc_propinfo_t *oip = karg;
1617 	overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
1618 
1619 	odd = overlay_hold_by_dlid(oip->oipi_linkid);
1620 	if (odd == NULL)
1621 		return (ENOENT);
1622 
1623 	overlay_prop_init(phdl);
1624 	mac_perim_enter_by_mh(odd->odd_mh, &mph);
1625 
1626 	/*
1627 	 * If the id is -1, then the property that we're looking for is named in
1628 	 * oipi_name and we should fill in its id. Otherwise, we've been given
1629 	 * an id and we need to turn that into a name for our plugin's sake. The
1630 	 * id is our own fabrication for property discovery.
1631 	 */
1632 	if (oip->oipi_id == -1) {
1633 		/*
1634 		 * Determine if it's a known generic property or it belongs to a
1635 		 * module by checking against the list of known names.
1636 		 */
1637 		oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1638 		if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name,
1639 		    &propid)) != 0) {
1640 			overlay_hold_rele(odd);
1641 			mac_perim_exit(mph);
1642 			return (ret);
1643 		}
1644 		oip->oipi_id = propid;
1645 		if (propid >= OVERLAY_DEV_NPROPS) {
1646 			ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1647 			    oip->oipi_name, phdl);
1648 			overlay_hold_rele(odd);
1649 			mac_perim_exit(mph);
1650 			return (ret);
1651 
1652 		}
1653 	} else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
1654 		uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
1655 
1656 		if (id >= odd->odd_plugin->ovp_nprops) {
1657 			overlay_hold_rele(odd);
1658 			mac_perim_exit(mph);
1659 			return (EINVAL);
1660 		}
1661 		ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1662 		    odd->odd_plugin->ovp_props[id], phdl);
1663 		overlay_hold_rele(odd);
1664 		mac_perim_exit(mph);
1665 		return (ret);
1666 	} else if (oip->oipi_id < -1) {
1667 		overlay_hold_rele(odd);
1668 		mac_perim_exit(mph);
1669 		return (EINVAL);
1670 	} else {
1671 		ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
1672 		ASSERT(oip->oipi_id >= 0);
1673 		propid = oip->oipi_id;
1674 		(void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
1675 		    sizeof (oip->oipi_name));
1676 	}
1677 
1678 	switch (propid) {
1679 	case OVERLAY_DEV_P_MTU:
1680 		overlay_i_propinfo_mtu(odd, phdl);
1681 		break;
1682 	case OVERLAY_DEV_P_VNETID:
1683 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1684 		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1685 		overlay_prop_set_nodefault(phdl);
1686 		break;
1687 	case OVERLAY_DEV_P_ENCAP:
1688 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1689 		overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
1690 		overlay_prop_set_nodefault(phdl);
1691 		overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
1692 		break;
1693 	case OVERLAY_DEV_P_VARPDID:
1694 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1695 		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1696 		overlay_prop_set_nodefault(phdl);
1697 		break;
1698 	default:
1699 		overlay_hold_rele(odd);
1700 		mac_perim_exit(mph);
1701 		return (ENOENT);
1702 	}
1703 
1704 	overlay_hold_rele(odd);
1705 	mac_perim_exit(mph);
1706 	return (0);
1707 }
1708 
1709 /* ARGSUSED */
1710 static int
1711 overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1712     int *rvalp)
1713 {
1714 	int ret;
1715 	overlay_dev_t *odd;
1716 	mac_perim_handle_t mph;
1717 	overlay_ioc_prop_t *oip = karg;
1718 	uint_t propid, mtu;
1719 
1720 	odd = overlay_hold_by_dlid(oip->oip_linkid);
1721 	if (odd == NULL)
1722 		return (ENOENT);
1723 
1724 	mac_perim_enter_by_mh(odd->odd_mh, &mph);
1725 	oip->oip_size = OVERLAY_PROP_SIZEMAX;
1726 	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1727 	if (oip->oip_id == -1) {
1728 		int i;
1729 
1730 		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1731 			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1732 				break;
1733 			if (i == OVERLAY_DEV_NPROPS) {
1734 				ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
1735 				    odd->odd_pvoid, oip->oip_name,
1736 				    oip->oip_value, &oip->oip_size);
1737 				overlay_hold_rele(odd);
1738 				mac_perim_exit(mph);
1739 				return (ret);
1740 			}
1741 		}
1742 
1743 		propid = i;
1744 	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1745 		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1746 
1747 		if (id > odd->odd_plugin->ovp_nprops) {
1748 			overlay_hold_rele(odd);
1749 			mac_perim_exit(mph);
1750 			return (EINVAL);
1751 		}
1752 		ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1753 		    odd->odd_plugin->ovp_props[id], oip->oip_value,
1754 		    &oip->oip_size);
1755 		overlay_hold_rele(odd);
1756 		mac_perim_exit(mph);
1757 		return (ret);
1758 	} else if (oip->oip_id < -1) {
1759 		overlay_hold_rele(odd);
1760 		mac_perim_exit(mph);
1761 		return (EINVAL);
1762 	} else {
1763 		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1764 		ASSERT(oip->oip_id >= 0);
1765 		propid = oip->oip_id;
1766 	}
1767 
1768 	ret = 0;
1769 	switch (propid) {
1770 	case OVERLAY_DEV_P_MTU:
1771 		/*
1772 		 * The MTU is always set and retrieved through MAC, to allow for
1773 		 * MAC to do whatever it wants, as really that property belongs
1774 		 * to MAC. This is important for things where vnics have hold on
1775 		 * the MTU.
1776 		 */
1777 		mac_sdu_get(odd->odd_mh, NULL, &mtu);
1778 		bcopy(&mtu, oip->oip_value, sizeof (uint_t));
1779 		oip->oip_size = sizeof (uint_t);
1780 		break;
1781 	case OVERLAY_DEV_P_VNETID:
1782 		/*
1783 		 * While it's read-only while inside of a mux, we're not in a
1784 		 * context that can guarantee that. Therefore we always grab the
1785 		 * overlay_dev_t's odd_lock.
1786 		 */
1787 		mutex_enter(&odd->odd_lock);
1788 		bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
1789 		mutex_exit(&odd->odd_lock);
1790 		oip->oip_size = sizeof (uint64_t);
1791 		break;
1792 	case OVERLAY_DEV_P_ENCAP:
1793 		oip->oip_size = strlcpy((char *)oip->oip_value,
1794 		    odd->odd_plugin->ovp_name, oip->oip_size);
1795 		break;
1796 	case OVERLAY_DEV_P_VARPDID:
1797 		mutex_enter(&odd->odd_lock);
1798 		if (odd->odd_flags & OVERLAY_F_VARPD) {
1799 			const uint64_t val = odd->odd_target->ott_id;
1800 			bcopy(&val, oip->oip_value, sizeof (uint64_t));
1801 			oip->oip_size = sizeof (uint64_t);
1802 		} else {
1803 			oip->oip_size = 0;
1804 		}
1805 		mutex_exit(&odd->odd_lock);
1806 		break;
1807 	default:
1808 		ret = ENOENT;
1809 	}
1810 
1811 	overlay_hold_rele(odd);
1812 	mac_perim_exit(mph);
1813 	return (ret);
1814 }
1815 
1816 static void
1817 overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
1818 {
1819 	mutex_enter(&odd->odd_lock);
1820 
1821 	/* Simple case, not active */
1822 	if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1823 		odd->odd_vid = vnetid;
1824 		mutex_exit(&odd->odd_lock);
1825 		return;
1826 	}
1827 
1828 	/*
1829 	 * In the hard case, we need to set the drop flag, quiesce I/O and then
1830 	 * we can go ahead and do everything.
1831 	 */
1832 	odd->odd_flags |= OVERLAY_F_MDDROP;
1833 	overlay_io_wait(odd, OVERLAY_F_IOMASK);
1834 	mutex_exit(&odd->odd_lock);
1835 
1836 	overlay_mux_remove_dev(odd->odd_mux, odd);
1837 	mutex_enter(&odd->odd_lock);
1838 	odd->odd_vid = vnetid;
1839 	mutex_exit(&odd->odd_lock);
1840 	overlay_mux_add_dev(odd->odd_mux, odd);
1841 
1842 	mutex_enter(&odd->odd_lock);
1843 	ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
1844 	odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1845 	mutex_exit(&odd->odd_lock);
1846 }
1847 
1848 /* ARGSUSED */
1849 static int
1850 overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1851     int *rvalp)
1852 {
1853 	int ret;
1854 	overlay_dev_t *odd;
1855 	overlay_ioc_prop_t *oip = karg;
1856 	uint_t propid = UINT_MAX;
1857 	mac_perim_handle_t mph;
1858 	uint64_t maxid, *vidp;
1859 
1860 	if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
1861 		return (EINVAL);
1862 
1863 	odd = overlay_hold_by_dlid(oip->oip_linkid);
1864 	if (odd == NULL)
1865 		return (ENOENT);
1866 
1867 	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1868 	mac_perim_enter_by_mh(odd->odd_mh, &mph);
1869 	mutex_enter(&odd->odd_lock);
1870 	if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1871 		mac_perim_exit(mph);
1872 		mutex_exit(&odd->odd_lock);
1873 		return (ENOTSUP);
1874 	}
1875 	mutex_exit(&odd->odd_lock);
1876 	if (oip->oip_id == -1) {
1877 		int i;
1878 
1879 		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1880 			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1881 				break;
1882 			if (i == OVERLAY_DEV_NPROPS) {
1883 				ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
1884 				    odd->odd_pvoid, oip->oip_name,
1885 				    oip->oip_value, oip->oip_size);
1886 				overlay_hold_rele(odd);
1887 				mac_perim_exit(mph);
1888 				return (ret);
1889 			}
1890 		}
1891 
1892 		propid = i;
1893 	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1894 		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1895 
1896 		if (id > odd->odd_plugin->ovp_nprops) {
1897 			mac_perim_exit(mph);
1898 			overlay_hold_rele(odd);
1899 			return (EINVAL);
1900 		}
1901 		ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
1902 		    odd->odd_plugin->ovp_props[id], oip->oip_value,
1903 		    oip->oip_size);
1904 		mac_perim_exit(mph);
1905 		overlay_hold_rele(odd);
1906 		return (ret);
1907 	} else if (oip->oip_id < -1) {
1908 		mac_perim_exit(mph);
1909 		overlay_hold_rele(odd);
1910 		return (EINVAL);
1911 	} else {
1912 		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1913 		ASSERT(oip->oip_id >= 0);
1914 		propid = oip->oip_id;
1915 	}
1916 
1917 	ret = 0;
1918 	switch (propid) {
1919 	case OVERLAY_DEV_P_MTU:
1920 		ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
1921 		    oip->oip_value, oip->oip_size);
1922 		break;
1923 	case OVERLAY_DEV_P_VNETID:
1924 		if (oip->oip_size != sizeof (uint64_t)) {
1925 			ret = EINVAL;
1926 			break;
1927 		}
1928 		vidp = (uint64_t *)oip->oip_value;
1929 		ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1930 		maxid = UINT64_MAX;
1931 		if (odd->odd_plugin->ovp_id_size != 8)
1932 			maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
1933 			    1ULL;
1934 		if (*vidp >= maxid) {
1935 			ret = EINVAL;
1936 			break;
1937 		}
1938 		overlay_setprop_vnetid(odd, *vidp);
1939 		break;
1940 	case OVERLAY_DEV_P_ENCAP:
1941 	case OVERLAY_DEV_P_VARPDID:
1942 		ret = EPERM;
1943 		break;
1944 	default:
1945 		ret = ENOENT;
1946 	}
1947 
1948 	mac_perim_exit(mph);
1949 	overlay_hold_rele(odd);
1950 	return (ret);
1951 }
1952 
1953 /* ARGSUSED */
1954 static int
1955 overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
1956     int *rvalp)
1957 {
1958 	overlay_dev_t *odd;
1959 	overlay_ioc_status_t *os = karg;
1960 
1961 	odd = overlay_hold_by_dlid(os->ois_linkid);
1962 	if (odd == NULL)
1963 		return (ENOENT);
1964 
1965 	mutex_enter(&odd->odd_lock);
1966 	if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
1967 		os->ois_status = OVERLAY_I_DEGRADED;
1968 		if (odd->odd_fmamsg != NULL) {
1969 			(void) strlcpy(os->ois_message, odd->odd_fmamsg,
1970 			    OVERLAY_STATUS_BUFLEN);
1971 		} else {
1972 			os->ois_message[0] = '\0';
1973 		}
1974 
1975 	} else {
1976 		os->ois_status = OVERLAY_I_OK;
1977 		os->ois_message[0] = '\0';
1978 	}
1979 	mutex_exit(&odd->odd_lock);
1980 	overlay_hold_rele(odd);
1981 
1982 	return (0);
1983 }
1984 
1985 static dld_ioc_info_t overlay_ioc_list[] = {
1986 	{ OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
1987 		overlay_i_create, secpolicy_dl_config },
1988 	{ OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
1989 		overlay_i_activate, secpolicy_dl_config },
1990 	{ OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
1991 		overlay_i_delete, secpolicy_dl_config },
1992 	{ OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
1993 		sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
1994 		secpolicy_dl_config },
1995 	{ OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
1996 		sizeof (overlay_ioc_prop_t), overlay_i_getprop,
1997 		secpolicy_dl_config },
1998 	{ OVERLAY_IOC_SETPROP, DLDCOPYIN,
1999 		sizeof (overlay_ioc_prop_t), overlay_i_setprop,
2000 		secpolicy_dl_config },
2001 	{ OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
2002 		sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
2003 		secpolicy_dl_config },
2004 	{ OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
2005 		sizeof (overlay_ioc_status_t), overlay_i_status,
2006 		NULL }
2007 };
2008 
2009 static int
2010 overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2011 {
2012 	int fmcap = DDI_FM_EREPORT_CAPABLE;
2013 	if (cmd != DDI_ATTACH)
2014 		return (DDI_FAILURE);
2015 
2016 	if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
2017 		return (DDI_FAILURE);
2018 
2019 	ddi_fm_init(dip, &fmcap, NULL);
2020 
2021 	if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
2022 	    ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE)
2023 		return (DDI_FAILURE);
2024 
2025 	if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
2026 	    DLDIOCCNT(overlay_ioc_list)) != 0) {
2027 		ddi_remove_minor_node(dip, OVERLAY_CTL);
2028 		return (DDI_FAILURE);
2029 	}
2030 
2031 	overlay_dip = dip;
2032 	return (DDI_SUCCESS);
2033 }
2034 
2035 /* ARGSUSED */
2036 static int
2037 overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
2038 {
2039 	int error;
2040 
2041 	switch (cmd) {
2042 	case DDI_INFO_DEVT2DEVINFO:
2043 		*resp = (void *)overlay_dip;
2044 		error = DDI_SUCCESS;
2045 		break;
2046 	case DDI_INFO_DEVT2INSTANCE:
2047 		*resp = (void *)0;
2048 		error = DDI_SUCCESS;
2049 		break;
2050 	default:
2051 		error = DDI_FAILURE;
2052 		break;
2053 	}
2054 
2055 	return (error);
2056 }
2057 
2058 static int
2059 overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2060 {
2061 	if (cmd != DDI_DETACH)
2062 		return (DDI_FAILURE);
2063 
2064 	mutex_enter(&overlay_dev_lock);
2065 	if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
2066 		mutex_exit(&overlay_dev_lock);
2067 		return (EBUSY);
2068 	}
2069 	mutex_exit(&overlay_dev_lock);
2070 
2071 
2072 	dld_ioc_unregister(OVERLAY_IOC);
2073 	ddi_remove_minor_node(dip, OVERLAY_CTL);
2074 	ddi_fm_fini(dip);
2075 	overlay_dip = NULL;
2076 	return (DDI_SUCCESS);
2077 }
2078 
/*
 * Character device entry points for the overlay control node. Only open,
 * close and ioctl are implemented, all by the overlay_target_*() routines;
 * the remaining slots use the stock nodev/nochpoll/ddi_prop_op values.
 */
static struct cb_ops overlay_cbops = {
	overlay_target_open,	/* cb_open */
	overlay_target_close,	/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	overlay_target_ioctl,	/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* cb_str */
	D_MP,			/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* cb_aread */
	nodev,			/* cb_awrite */
};
2099 
/*
 * Driver operations for the overlay pseudo device: getinfo, attach and
 * detach are implemented in this file, overlay_cbops supplies the character
 * device entry points, and quiesce is explicitly not supported.
 */
static struct dev_ops overlay_dev_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	overlay_getinfo,	/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	overlay_attach,		/* devo_attach */
	overlay_detach,		/* devo_detach */
	nulldev,		/* devo_reset */
	&overlay_cbops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* devo_power */
	ddi_quiesce_not_supported	/* devo_quiesce */
};
2114 
/*
 * Module linkage for a device driver: ties the description string shown for
 * this module to overlay_dev_ops.
 */
static struct modldrv overlay_modldrv = {
	&mod_driverops,
	"Overlay Network Driver",
	&overlay_dev_ops
};
2120 
/*
 * Top-level linkage handed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini() below.
 */
static struct modlinkage overlay_linkage = {
	MODREV_1,
	&overlay_modldrv
};
2125 
2126 static int
2127 overlay_init(void)
2128 {
2129 	mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
2130 	list_create(&overlay_dev_list, sizeof (overlay_dev_t),
2131 	    offsetof(overlay_dev_t, odd_link));
2132 	overlay_mux_init();
2133 	overlay_plugin_init();
2134 	overlay_target_init();
2135 
2136 	return (DDI_SUCCESS);
2137 }
2138 
2139 static void
2140 overlay_fini(void)
2141 {
2142 	overlay_target_fini();
2143 	overlay_plugin_fini();
2144 	overlay_mux_fini();
2145 	mutex_destroy(&overlay_dev_lock);
2146 	list_destroy(&overlay_dev_list);
2147 }
2148 
2149 int
2150 _init(void)
2151 {
2152 	int err;
2153 
2154 	if ((err = overlay_init()) != DDI_SUCCESS)
2155 		return (err);
2156 
2157 	mac_init_ops(NULL, "overlay");
2158 	err = mod_install(&overlay_linkage);
2159 	if (err != DDI_SUCCESS) {
2160 		overlay_fini();
2161 		return (err);
2162 	}
2163 
2164 	return (0);
2165 }
2166 
2167 int
2168 _info(struct modinfo *modinfop)
2169 {
2170 	return (mod_info(&overlay_linkage, modinfop));
2171 }
2172 
2173 int
2174 _fini(void)
2175 {
2176 	int err;
2177 
2178 	err = mod_remove(&overlay_linkage);
2179 	if (err != 0)
2180 		return (err);
2181 
2182 	overlay_fini();
2183 	return (0);
2184 }
2185