1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2016 Joyent, Inc.
14 * Copyright 2022 MNX Cloud, Inc.
15 */
16
17 /*
18 * Overlay Devices
19 *
20 * Overlay devices provide a means for creating overlay networks, a means of
21 * multiplexing multiple logical, isolated, and discrete layer two and layer
22 * three networks on top of one physical network.
23 *
24 * In general, these overlay devices encapsulate the logic to answer two
25 * different questions:
26 *
27 * 1) How should I transform a packet to put it on the wire?
28 * 2) Where should I send a transformed packet?
29 *
30 * Each overlay device is presented to the user as a GLDv3 device. While the
31 * link itself cannot have an IP interface created on top of it, it allows for
32 * additional GLDv3 devices, such as a VNIC, to be created on top of it which
33 * can be plumbed up with IP interfaces.
34 *
35 *
36 * --------------------
37 * General Architecture
38 * --------------------
39 *
40 * The logical overlay device that a user sees in dladm(8) is a combination of
41 * two different components that work together. The first component is this
42 * kernel module, which is responsible for answering question one -- how should
43 * I transform a packet to put it on the wire.
44 *
45 * The second component is what we call the virtual ARP daemon, or varpd. It is
46 * a userland component that is responsible for answering the second question --
47 * Where should I send a transformed packet. Instances of the kernel overlay
48 * GLDv3 device ask varpd the question of where should a packet go.
49 *
50 * The split was done for a few reasons. Importantly, we wanted to keep the act
51 * of generating encapsulated packets in the kernel so as to ensure that the
52 * general data path was fast and also kept simple. On the flip side, while the
53 * question of where should something go may be simple, it may often be
54 * complicated and need to interface with several different external or
55 * distributed systems. In those cases, it's simpler to allow for the full
56 * flexibility of userland to be brought to bear to solve that problem and in
57 * general, the path isn't very common.
58 *
59 * The following is what makes up the logical overlay device that a user would
60 * create with dladm(8).
61 *
62 * Kernel Userland
63 * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
64 * . +--------+ +--------+ +--------+ . . .
65 * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . .
66 * . +--------+ +--------+ +--------+ . . .
67 * . | | | . . .
68 * . | | | . . .
69 * . +------------+-----------+ . . .
70 * . | . . /dev/overlay .
71 * . +--------------+ . . . +------------+ .
72 * . | | . . . | | .
73 * . | Overlay |======*=================| Virtual | .
74 * . | GLDv3 Device |========================| ARP Daemon | .
75 * . | | . . | | .
76 * . +--------------+ . . +------------+ .
77 * . | . . | .
78 * . | . . | .
79 * . +----------------+ . . +--------+ .
80 * . | Overlay | . . | varpd | .
81 * . | Encapsulation | . . | Lookup | .
82 * . | Plugin | . . | Plugin | .
83 * . +----------------+ . . +--------+ .
84 * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
85 *
86 *
87 * This image shows the two different components and where they live.
88 * Importantly, it also shows that both the kernel overlay device and the
89 * userland varpd both support plugins. The plugins actually implement the
90 * things that users care about and the APIs have been designed to try to
91 * minimize the amount of things that a module writer needs to worry about.
92 *
93 * IDENTIFIERS
94 *
95 * Every overlay device is defined by a unique identifier which is the overlay
96 * identifier. Its purpose is similar to that of a VLAN identifier, it's a
97 * unique number that is used to differentiate between different entries on the
98 * wire.
99 *
100 * ENCAPSULATION
101 *
102 * An overlay encapsulation plugin is a kernel miscellaneous module whose
103 * purpose is to contain knowledge about how to transform packets to put them
104 * onto the wire and to take them off. An example of an encapsulation plugin is
105 * vxlan. It's also how support for things like nvgre or geneve would be brought
106 * into the system.
107 *
108 * Each encapsulation plugin defines a series of operation vectors and
109 * properties. For the full details on everything they should provide, please
110 * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
111 * for telling the system what information is required to send a packet. For
112 * example, vxlan is defined to send everything over a UDP packet and therefore
113 * requires a port and an IP address, while nvgre on the other hand is its own
114 * IP type and therefore just requires an IP address. In addition, it also
115 * provides information about the kind of socket that should be created. This is
116 * used by the kernel multiplexor, more on that in the Kernel Components
117 * section.
118 *
119 * LOOKUPS
120 *
121 * The kernel communicates requests for lookups over the character device
122 * /dev/overlay. varpd is responsible for listening for requests on that device
123 * and answering them. The character device is specific to the target path and
124 * varpd.
125 *
126 * Much as the kernel overlay module handles the bulk of the scaffolding but
127 * leaves the important work to the encapsulation plugin, varpd provides a
128 * similar role and leaves the full brunt of lookups to a userland dynamic
129 * shared object which implements the logic of lookups.
130 *
131 * Each lookup plugin defines a series of operation vectors and properties. For
132 * the full details on everything that they should provide, please read
133 * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
134 * address and asked to give an address on the physical network that it should
135 * be sent to. In addition, they handle questions related to how to handle
136 * things like broadcast and multicast traffic, etc.
137 *
138 * ----------
139 * Properties
140 * ----------
141 *
142 * A device from a dladm perspective has a unique set of properties that are
143 * combined from three different sources:
144 *
145 * 1) Generic properties that every overlay device has
146 * 2) Properties that are specific to the encapsulation plugin
147 * 3) Properties that are specific to the lookup plugin
148 *
149 * All of these are exposed in a single set of properties in dladm. Note that
150 * these are not necessarily traditional link properties. However, if something
151 * is both a traditional GLDv3 link property, say the MTU of a device, and a
152 * specific property here, then the driver ensures that all existing GLDv3
153 * specific means of manipulating it are used and wraps up its private property
154 * interfaces to ensure that works.
155 *
156 * Properties in the second and third category are prefixed with the name of
157 * their module. For example, the vxlan encapsulation module has a property
158 * called the 'listen_ip'. This property would show up in dladm as
159 * 'vxlan/listen_ip'. This allows different plugins to both use similar names
160 * for similar properties and to also have independent name spaces so that
161 * overlapping names do not conflict with anything else.
162 *
163 * While the kernel combines both sets one and two into a single coherent view,
164 * it does not do anything with respect to the properties that are owned by the
165 * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
166 * charge of bridging these two worlds into one magical experience for the user.
167 * It carries the burden of knowing about both overlay specific and varpd
168 * specific properties. Importantly, we want to maintain this distinction. We
169 * don't want to treat the kernel as an arbitrary key/value store for varpd and
170 * we want the kernel to own its own data and not have to ask userland for
171 * information that it owns.
172 *
173 * Every property in the system has the following attributes:
174 *
175 * o A name
176 * o A type
177 * o A size
178 * o Permissions
179 * o Default value
180 * o Valid value ranges
181 * o A value
182 *
183 * Everything except for the value is obtained by callers through the propinfo
184 * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
185 * currently 256 bytes.
186 *
187 * The following are the supported types of properties:
188 *
189 * OVERLAY_PROP_T_INT
190 *
191 * A signed integer, its length is 8 bytes, corresponding to a
192 * int64_t.
193 *
194 * OVERLAY_PROP_T_UINT
195 *
196 * An unsigned integer, its length is 8 bytes, corresponding to a
197 * uint64_t.
198 *
199 * OVERLAY_PROP_T_IP
200 *
201 * A struct in6_addr, it has a fixed size.
202 *
203 * OVERLAY_PROP_T_STRING
204 *
205 * A null-terminated character string encoded in either ASCII or
206 * UTF-8. Note that the size of the string includes the null
207 * terminator.
208 *
209 * The next thing that we apply to a property is its permission. The permissions
210 * are put together by the bitwise or of the following flags and values.
211 *
212 * OVERLAY_PROP_PERM_REQ
213 *
214 * This indicates a required property. A property that is required
215 * must be set by a consumer before the device can be created. If a
216 * required property has a default property, this constraint is
217 * loosened because the default property defines the value.
218 *
219 * OVERLAY_PROP_PERM_READ
220 *
221 * This indicates that a property can be read. All properties will
222 * have this value set.
223 *
224 * OVERLAY_PROP_PERM_WRITE
225 *
226 * This indicates that a property can be written to and thus
227 * updated by userland. Properties that are only intended to
228 * display information, will not have OVERLAY_PROP_PERM_WRITE set.
229 *
230 * In addition, a few additional values are defined as a convenience to
231 * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
232 * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second,
233 * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
234 * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
235 * property should generally be a constant across its lifetime.
236 *
237 * A property may optionally have a default value. If it does have a default
238 * value, and that property is not set to be a different value, then the default
239 * value is inherited automatically. It also means that if the default value is
240 * acceptable, there is no need to set the value for a required property. For
241 * example, the vxlan module has the vxlan/listen_port property which is
242 * required, but has a default value of 4789 (the IANA assigned port). Because
243 * of that default value, there is no need for it to be set.
244 *
245 * Finally, a property may declare a list of valid values. These valid values
246 * are used for display purposes, they are not enforced by the broader system,
247 * but merely allow a means for the information to be communicated to the user
248 * through dladm(8). Like a default value, this is optional.
249 *
250 * The general scaffolding does not do very much with respect to the getting and
251 * setting of properties. That is really owned by the individual plugins
252 * themselves.
253 *
254 * -----------------------------
255 * Destinations and Plugin Types
256 * -----------------------------
257 *
258 * Both encapsulation and lookup plugins define the kinds of destinations that
259 * they know how to support. There are three different pieces of information
260 * that can be used to address a destination currently, all of which are
261 * summarized in the type overlay_point_t. Any combination of these is
262 * supported.
263 *
264 * OVERLAY_PLUGIN_D_ETHERNET
265 *
266 * An Ethernet MAC address is required.
267 *
268 * OVERLAY_PLUGIN_D_IP
269 *
270 * An IP address is required. All IP addresses used by the overlay
271 * system are transmitted as IPv6 addresses. IPv4 addresses can be
272 * represented by using IPv4-mapped IPv6 addresses.
273 *
274 * OVERLAY_PLUGIN_D_PORT
275 *
276 * A TCP/UDP port is required.
277 *
278 * A kernel encapsulation plugin declares which of these that it requires, it's
279 * a static set. On the other hand, a userland lookup plugin can be built to
280 * support all of these or any combination thereof. It gets passed the required
281 * destination type, based on the kernel encapsulation method, and then it makes
282 * the determination as to whether or not it supports it. For example, the
283 * direct plugin can support either an IP or both an IP and a port, it simply
284 * doesn't display the direct/dest_port property in the cases where a port is
285 * not required to support this.
286 *
287 * The user lookup plugins have two different modes of operation which
288 * determines how they interact with the broader system and how look ups are
289 * performed. These types are:
290 *
291 * OVERLAY_TARGET_POINT
292 *
293 * A point to point plugin has a single static definition for where
294 * to send all traffic. Every packet in the system always gets sent
295 * to the exact same destination which is programmed into the
296 * kernel when the general device is activated.
297 *
298 * OVERLAY_TARGET_DYNAMIC
299 *
300 * A dynamic plugin does not have a single static definition.
301 * Instead, for each destination, the kernel makes an asynchronous
302 * request to varpd to determine where the packet should be routed,
303 * and if a specific destination is found, then that destination is
304 * cached in the overlay device's target cache.
305 *
306 * This distinction, while important for the general overlay device's operation,
307 * is not important to the encapsulation plugins. They don't need to know about
308 * any of these pieces. It's just a concern for varpd, the userland plugin, and
309 * the general overlay scaffolding.
310 *
311 * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
312 * maintain a target cache, and instead just keeps track of the destination and
313 * always sends encapsulated packets to that address. When the target type is of
314 * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
315 * destinations. These destinations are kept around in an instance of a
316 * reference hash that is specific to the given overlay device. Entries in the
317 * cache can be invalidated and replaced by varpd and its lookup plugins.
318 *
319 * ----------------------------------
320 * Kernel Components and Architecture
321 * ----------------------------------
322 *
323 * There are multiple pieces inside the kernel that work together, there is the
324 * general overlay_dev_t structure, which is the logical GLDv3 device, but it
325 * itself has references to things like an instance of an encapsulation plugin,
326 * a pointer to a mux and a target cache. It can roughly be summarized in the
327 * following image:
328 *
329 * +------------------+
330 * | global |
331 * | overlay list |
332 * | overlay_dev_list |
333 * +------------------+
334 * |
335 * | +-----------------------+ +---------------+
336 * +->| GLDv3 Device |----------->| GLDv3 Device | -> ...
337 * | overlay_dev_t | | overlay_dev_t |
338 * | | +---------------+
339 * | |
340 * | mac_handle_t -----+---> GLDv3 handle to MAC
341 * | datalink_id_t -----+---> Datalink ID used by DLS
342 * | overlay_dev_flag_t ---+---> Device state
343 * | uint_t -----+---> Current device MTU
344 * | uint_t -----+---> In-progress RX operations
345 * | uint_t -----+---> In-progress TX operations
346 * | char[] -----+---> FMA degraded message
347 * | void * -----+---> plugin private data
348 * | overlay_target_t * ---+---------------------+
349 * | overlay_plugin_t * ---+---------+ |
350 * +-----------------------+ | |
351 * ^ | |
352 * +--------------------+ | | |
353 * | Kernel Socket | | | |
354 * | Multiplexor | | | |
355 * | overlay_mux_t | | | |
356 * | | | | |
357 * | avl_tree_t -+--+ | |
358 * | uint_t -+--> socket family | |
359 * | uint_t -+--> socket type | |
360 * | uint_t -+--> socket protocol | |
361 * | ksocket_t -+--> I/O socket | |
362 * | struct sockaddr * -+--> ksocket address | |
363 * | overlay_plugin_t --+--------+ | |
364 * +--------------------+ | | |
365 * | | |
366 * +-------------------------+ | | |
367 * | Encap Plugin |<--+-----------+ |
368 * | overlay_plugin_t | |
369 * | | |
370 * | char * ---+--> plugin name |
371 * | overlay_plugin_ops_t * -+--> plugin downcalls |
372 * | char ** (props) ---+--> property list |
373 * | uint_t ---+--> id length |
374 * | overlay_plugin_flags_t -+--> plugin flags |
375 * | overlay_plugin_dest_t --+--> destination type v
376 * +-------------------------+ +-------------------------+
377 * | Target Cache |
378 * | overlay_target_t |
379 * | |
380 * cache mode <--+- overlay_target_mode_t |
381 * dest type <--+- overlay_plugin_dest_t |
382 * cache flags <--+- overlay_target_flag_t |
383 * varpd id <--+- uint64_t |
384 * outstanding varpd reqs. <--+- uint_t |
385 * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t |
386 * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t |
387 * | +-------------------------+
388 * +-----------------------+
389 * |
390 * v
391 * +-------------------------------+ +------------------------+
392 * | Target Entry |-->| Target Entry |--> ...
393 * | overlay_target_entry_t | | overlay_target_entry_t |
394 * | | +------------------------+
395 * | |
396 * | overlay_target_entry_flags_t -+--> Entry flags
397 * | uint8_t[ETHERADDRL] ---+--> Target MAC address
398 * | overlay_target_point_t ---+--> Target underlay address
399 * | mblk_t * ---+--> outstanding mblk head
400 * | mblk_t * ---+--> outstanding mblk tail
401 * | size_t ---+--> outstanding mblk size
402 * +-------------------------------+
403 *
404 * The primary entries that we care about are the overlay_dev_t, which
405 * correspond to each overlay device that is created with dladm(8). Globally,
406 * these devices are maintained in a simple list_t which is protected with a
407 * lock. Hence, these include important information such as the mac_handle_t
408 * and a datalink_id_t which is used to interact with the broader MAC and DLS
409 * ecosystem. We also maintain additional information such as the current state,
410 * outstanding operations, the mtu, and importantly, the plugin's private data.
411 * This is the instance of an encapsulation plugin that gets created as part of
412 * creating an overlay device. Another aspect of this is that the overlay_dev_t
413 * also includes information with respect to FMA. For more information, see the
414 * FMA section.
415 *
416 * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
417 * is the encapsulation plugin. This allows the device to make downcalls into it
418 * based on doing things like getting and setting properties. Otherwise, the
419 * plugin itself is a fairly straightforward entity. They are maintained in a
420 * list (not pictured above). The plugins themselves mostly maintain things like
421 * the static list of properties, what kind of destination they require, and the
422 * operations vector. A given module may contain more if necessary.
423 *
424 * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
425 * maintains a ksocket and it is through the mux that we send and receive
426 * message blocks. The mux represents a socket type and address, as well as a
427 * plugin. Multiple overlay_dev_t devices may then share the same mux. For
428 * example, consider the case where you have different instances of vxlan all on
429 * the same underlay network. These would all logically share the same IP
430 * address and port that packets are sent and received on; however, what differs
431 * is the decapsulation ID.
432 *
433 * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
434 * a socket, we enable a direct callback on the ksocket. This means that
435 * whenever a message block chain is received, rather than sitting there and
436 * getting a callback in a context and kicking that back out to a taskq, the
437 * data instead comes into the callback function overlay_mux_recv().
438 *
439 * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
440 * function) to transmit. It receives encapsulated packets, decapsulates them to
441 * determine the overlay identifier, looks up the given device that matches that
442 * identifier, and then causes the broader MAC world to receive the packet with
443 * a call to mac_rx().
444 *
445 * Today, we don't do too much that's special with the ksocket; however, as
446 * hardware is gaining understanding for these encapsulation protocols, we'll
447 * probably want to think of better ways to get those capabilities passed down
448 * and potentially better ways to program receive filters so they get directly
449 * to us. Though, that's all fantasy future land.
450 *
451 * The next part of the puzzle is the target cache. The purpose of the target
452 * cache is to cache where we should send a packet on the underlay network,
453 * given its mac address. The target cache operates in two modes depending on
454 * whether the lookup module was declared to be OVERLAY_TARGET_POINT or
455 * OVERLAY_TARGET_DYNAMIC.
456 *
457 * In the case where the target cache has been programmed to be
458 * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
459 * which has the destination that we send everything, no matter the destination
460 * mac address.
461 *
462 * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
463 * are much more interesting and as a result, more complicated. We primarily
464 * store lists of overlay_target_entry_t's which are stored in both an avl tree
465 * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
466 * is only used for a few of the target ioctls used to dump data such that we
467 * can get a consistent iteration order for things like dladm show-overlay -t.
468 * The key that we use for the reference hashtable is based on the mac address
469 * in the cache and currently we just do a simple CRC32 to transform it into a
470 * hash.
471 *
472 * Each entry maintains a set of flags to indicate the current status of the
473 * request. The flags may indicate one of three states: that the current cache
474 * entry is valid, that the current cache entry has been directed to drop all
475 * output, and that the current cache entry is invalid and may be being looked
476 * up. In the case where it's valid, we just take the destination address and
477 * run with it.
478 *
479 * If it's invalid and a lookup has not been made, then we start the process
480 * that prepares a query that will make its way up to varpd. The cache entry
481 * maintains a message block chain of outstanding message blocks and a
482 * size. These lists are populated only when we don't know the answer as to
483 * where should these be sent. The size entry is used to cap the amount of
484 * outstanding data that we don't know the answer to. If we exceed a cap on the
485 * amount of outstanding data (currently 1 Mb), then we'll drop any additional
486 * packets. Once we get an answer indicating a valid destination, we transmit
487 * any outstanding data to that place. The full story on how we look that up
488 * is discussed in the section on the Target Cache Life Cycle.
489 *
490 * ------------------------
491 * FMA and Degraded Devices
492 * ------------------------
493 *
494 * Every kernel overlay device keeps track of its FMA state. Today in FMA we
495 * cannot represent partitions between resources nor can we represent that a
496 * given minor node of a pseudo device has failed -- if we degrade the overlay
497 * device, then the entire dev_info_t is degraded. However, we still want to be
498 * able to indicate to administrators that things may go wrong.
499 *
500 * To this end, we've added a notion of a degraded state to every overlay
501 * device. This state is primarily dictated by userland and it can happen for
502 * various reasons. Generally, because a userland lookup plugin has been
503 * partitioned, or something has gone wrong such that there is no longer any
504 * userland lookup module for a device, then we'll mark it degraded.
505 *
506 * As long as any of our minor instances is degraded, then we'll fire off the
507 * FMA event to note that. Once the last degraded instance is no longer
508 * degraded, then we'll end up telling FMA that we're all clean.
509 *
510 * To help administrators get a better sense of which of the various minor
511 * devices is wrong, we store the odd_fmamsg[] character array. This character
512 * array can be fetched by running dladm show-overlay -f.
513 *
514 * Note, that it's important that we do not update the link status of the
515 * devices. We want to remain up as much as possible. By changing the link in a
516 * degraded state, this may end up making things worse. We may still actually
517 * have information in the target cache and if we mark the link down, that'll
518 * result in not being able to use it. The reason being that this'll mark all
519 * the downstream VNICs down which will go to IP and from there we end up
520 * dealing with sadness.
521 *
522 * -----------------------
523 * Target Cache Life Cycle
524 * -----------------------
525 *
526 * This section only applies when we have a lookup plugin of
527 * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
528 * OVERLAY_TARGET_POINT.
529 *
530 * While we got into the target cache in the general architecture section, it's
531 * worth going into more details as to how this actually works and showing some
532 * examples and state machines. Recall that a target cache entry basically has
533 * the following state transition diagram:
534 *
535 * Initial state
536 * . . . . . . first access . . . varpd lookup enqueued
537 * . . .
538 * . . .
539 * +-------+ . +----------+ .
540 * | No |------*---->| Invalid |-------*----+
541 * | Entry | | Entry | |
542 * +-------+ +----------+ |
543 * varpd ^ ^ varpd |
544 * invalidate | | drop |
545 * . . . * * . . v
546 * +-------+ | | +---------+
547 * | Entry |--->-----+ +----<----| Entry |
548 * | Valid |<----------*---------<----| Pending |->-+ varpd
549 * +-------+ . +---------+ * . . drop, but
550 * . varpd ^ | other queued
551 * . success | | entries
552 * +-----+
553 *
554 * When the table is first created, it is empty. As we attempt to lookup entries
555 * and we find there is no entry at all, we'll create a new table entry for it.
556 * At that point the entry is technically in an invalid state, that means that
557 * we have no valid data from varpd. In that case, we'll go ahead and queue the
558 * packet into the entry's pending chain, and queue a varpd lookup, setting the
559 * OVERLAY_ENTRY_F_PENDING flag in the process.
560 *
561 * If additional mblk_t's come in for this entry, we end up appending them to
562 * the tail of the chain, if and only if, we don't exceed the threshold for the
563 * amount of space they can take up. An entry remains pending until we get a
564 * varpd reply. If varpd replies with a valid result, we move to the valid
565 * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one
566 * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
567 *
568 * Once an entry is valid, it stays valid until user land tells us to invalidate
569 * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and
570 * OVERLAY_TARG_CACHE_SET respectively.
571 *
572 * If the lookup fails with a call to drop the packet, then the next state is
573 * determined by the state of the queue. If the set of outstanding entries is
574 * empty, then we just transition back to the invalid state. If instead, the
575 * set of outstanding entries is not empty, then we'll queue another entry and
576 * stay in the same state, repeating this until the number of requests is
577 * drained.
578 *
579 * The following images describe the flow of a given lookup and where the
580 * overlay_target_entry_t is at any given time.
581 *
582 * +-------------------+
583 * | Invalid Entry | An entry starts off as an invalid entry
584 * | de:ad:be:ef:00:00 | and only exists in the target cache.
585 * +-------------------+
586 *
587 * ~~~~
588 *
589 * +---------------------+
590 * | Global list_t | A mblk_t comes in for an entry. We
591 * | overlay_target_list | append it to the overlay_target_list.
592 * +---------------------+
593 * |
594 * v
595 * +-------------------+ +-------------------+
596 * | Pending Entry |----->| Pending Entry |--->...
597 * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 |
598 * +-------------------+ +-------------------+
599 *
600 * ~~~~
601 *
602 * +--------------------------+
603 * | /dev/overlay minor state | User land said that it would look up an
604 * | overlay_target_hdl_t | entry for us. We remove it from the
605 * +--------------------------+ global list and add it to the handle's
606 * | outstanding list.
607 * |
608 * v
609 * +-------------------+ +-------------------+
610 * | Pending Entry |----->| Pending Entry |
611 * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 |
612 * +-------------------+ +-------------------+
613 *
614 * ~~~~
615 *
616 * +-------------------+
617 * | Valid Entry | varpd returned an answer with
618 * | de:ad:be:ef:00:00 | OVERLAY_TARG_RESPOND and the target cache
619 * | 10.169.23.42:4789 | entry is now populated with a
620 * +-------------------+ destination and marked as valid
621 *
622 *
623 * The lookup mechanism is performed via a series of operations on the character
624 * pseudo-device /dev/overlay. The only thing that uses this device is the
625 * userland daemon varpd. /dev/overlay is a cloneable device, each open of it
626 * granting a new minor number which maintains its own state. We maintain this
627 * state so that way if an outstanding lookup was queued to something that
628 * crashed or closed its handle without responding, we can know about this and
629 * thus handle it appropriately.
630 *
631 * When a lookup is first created it's added to our global list of outstanding
632 * lookups. To service requests, userland is required to perform an ioctl to ask
633 * for a request. We will block it in the kernel a set amount of time waiting
634 * for a request. When we give a request to a given minor instance of the
635 * device, we remove it from the global list and append the request to the
636 * device's list of outstanding entries, for the reasons we discussed above.
637 * When a lookup comes in, we give user land a smaller amount of information
638 * specific to that packet, the overlay_targ_lookup_t. It includes a request id
639 * to identify this, and then the overlay id, the varpd id, the header and
640 * packet size, the source and destination mac address, the SAP, and any
641 * potential VLAN header.
642 *
643 * At that point, it stays in that outstanding list until one of two ioctls are
644 * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time,
645 * userland may also perform other operations. For example, it may use
646 * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth
647 * analysis of what to do beyond what we gave it initially. This is useful for
 * providing proxy ARP and the like. Finally, there are two other ioctls that
 * varpd can then issue. The first is OVERLAY_TARG_INJECT, which injects the
 * non-jumbo frame packet up into that mac device. The second is
 * OVERLAY_TARG_RESEND, which causes us to encapsulate and send out the packet
 * it has given us.
652 *
653 *
654 * Finally, through the target cache, several ioctls are provided to allow for
655 * interrogation and management of the cache. They allow for individual entries
656 * to be retrieved, set, or have the entire table flushed. For the full set of
657 * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h.
658 *
659 * ------------------
660 * Sample Packet Flow
661 * ------------------
662 *
 * There are a lot of pieces here; hopefully an example of how this all fits
 * together will help clarify and elucidate what's going on. We're going to
 * first track an outgoing packet, e.g. one that is sent from an IP interface
 * on a VNIC on top of an overlay device, and then we'll look at what it means
 * to respond to that.
668 *
669 *
670 * +----------------+ +--------------+ +------------------+
671 * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches |
672 * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx |
673 * +----------------+ | VNIC device | | overlay_m_tx() |
674 * +--------------+ +------------------+
675 * |
676 * . lookup . cache |
677 * . drop . miss v
678 * +---------+ . +--------+ . +------------------+
679 * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk |
680 * | mblk_t | | lookup | | in the target |
681 * +---------+ | queued | | cache |
682 * ^ +--------+ +------------------+
683 * on send | | | cache
684 * error . . * *. . lookup * . . hit
685 * | | success v
686 * | | +------------------+
687 * +-----------------+ +--------------->| call plugin |
688 * | Send out | | ovpo_encap() to |
689 * | overlay_mux_t's |<----------------------------------| get encap mblk_t |
690 * | ksocket | +------------------+
691 * +-----------------+
692 *
693 * The receive end point looks a little different and looks more like:
694 *
695 * +------------------+ +----------------+ +-----------+
696 * | mblk_t comes off |---->| enter netstack |--->| delivered |---+
697 * | the physical | | IP stack | | to | * . . direct
698 * | device | +----------------+ | ksocket | | callback
699 * +------------------+ +-----------+ |
700 * . overlay id |
701 * . not found v
702 * +-----------+ . +-----------------+ +--------------------+
703 * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() |
704 * | mblk_t | | ovpo_decap() to | +--------------------+
705 * +-----------+ | decap mblk_t |
706 * +-----------------+
707 * |
708 * * . . overlay id
709 * v found
710 * +--------+ +----------------+
711 * | adjust |----->| call mac_rx |
712 * | mblk_t | | on original |
713 * +--------+ | decaped packet |
714 * +----------------+
715 *
716 * ------------------
717 * Netstack Awareness
718 * ------------------
719 *
 * In the above image we note that this enters a netstack. Today the only
 * netstack that it can be is that of the global zone, as the overlay driver
 * itself is not exactly netstack aware. What this really means is that varpd
 * cannot run in a non-global zone and an overlay device cannot belong to a
 * non-global zone.
724 * Non-global zones can still have a VNIC assigned to them that's been created
725 * over the overlay device the same way they would if it had been created over
726 * an etherstub or a physical device.
727 *
728 * The majority of the work to make it netstack aware is straightforward and the
729 * biggest thing is to create a netstack module that allows us to hook into
730 * netstack (and thus zone) creation and destruction. From there, we need to
731 * amend the target cache lookup routines that we discussed earlier to not have
732 * a global outstanding list and a global list of handles, but rather, one per
733 * netstack.
734 *
735 * For the mux, we'll need to open the ksocket in the context of the zone, we
736 * can likely do this with a properly composed credential, but we'll need to do
737 * some more work on that path. Finally, we'll want to make sure the dld ioctls
738 * are aware of the zoneid of the caller and we use that appropriately and store
739 * it in the overlay_dev_t.
740 *
741 * -----------
742 * GLDv3 Notes
743 * -----------
744 *
745 * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
746 * relevant and other parts are much less relevant for us. For example, the
747 * GLDv3 is used to toggle the device being put into and out of promiscuous
748 * mode, to program MAC addresses for unicast and multicast hardware filters.
749 * Today, an overlay device doesn't have a notion of promiscuous mode nor does
750 * it have a notion of unicast and multicast addresses programmed into the
751 * device. Instead, for the purposes of the hardware filter, we don't do
752 * anything and just always accept new addresses being added and removed.
753 *
754 * If the GLDv3 start function has not been called, then we will not use this
755 * device for I/O purposes. Any calls to transmit or receive should be dropped,
756 * though the GLDv3 guarantees us that transmit will not be called without
757 * calling start. Similarly, once stop is called, then no packets can be dealt
758 * with.
759 *
760 * Today we don't support the stat interfaces, though there's no good reason
761 * that we shouldn't assemble some of the stats based on what we have in the
762 * future.
763 *
764 * When it comes to link properties, many of the traditional link properties do
765 * not apply and many others MAC handles for us. For example, we don't need to
766 * implement anything for overlay_m_getprop() to deal with returning the MTU, as
767 * MAC never calls into us for that. As such, there isn't much of anything to
768 * support in terms of properties.
769 *
770 * Today, we don't support any notion of hardware capabilities. However, if
771 * future NIC hardware or other changes to the system cause it to make sense for
772 * us to emulate logical groups, then we should do that. However, we still do
773 * implement a capab function so that we can identify ourselves as an overlay
774 * device to the broader MAC framework. This is done mostly so that a device
775 * created on top of us can have fanout rings as we don't try to lie about a
776 * speed for our device.
777 *
778 * The other question is what should be done for a device's MTU and margin. We
779 * set our minimum supported MTU to be the minimum value that an IP network may
 * be set to, 576, which mimics what an etherstub does. On the flip side, we
781 * have our upper bound set to 8900. This value comes from the fact that a lot
782 * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
783 * bytes, which isn't exactly the most accurate number, but it'll be good enough
784 * for now. Because of that, our default MTU off of these devices is 1400, as
785 * the default MTU for everything is usually 1500 or whatever the underlying
786 * device is at; however, this is a bit simpler than asking the netstack what
787 * are all the IP interfaces at. It also calls into question how PMTU and PMTU
788 * discovery should work here. The challenge, especially for
789 * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's
790 * not clear that if you have a single bad entry that the overall MTU should be
791 * lowered. Instead, we should figure out a better way of determining these
792 * kinds of PMTU errors and appropriately alerting the administrator via FMA.
793 *
794 * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
795 * or not the underlying encapsulation device supports VLAN tags. If it does,
796 * then we'll set the margin to allow for it, otherwise, we will not.
797 */
798
799 #include <sys/conf.h>
800 #include <sys/errno.h>
801 #include <sys/stat.h>
802 #include <sys/ddi.h>
803 #include <sys/sunddi.h>
804 #include <sys/modctl.h>
805 #include <sys/policy.h>
806 #include <sys/stream.h>
807 #include <sys/strsubr.h>
808 #include <sys/strsun.h>
809 #include <sys/types.h>
810 #include <sys/kmem.h>
811 #include <sys/param.h>
812 #include <sys/sysmacros.h>
813 #include <sys/ddifm.h>
814
815 #include <sys/dls.h>
816 #include <sys/dld_ioc.h>
817 #include <sys/mac_provider.h>
818 #include <sys/mac_client_priv.h>
819 #include <sys/mac_ether.h>
820 #include <sys/vlan.h>
821
822 #include <sys/overlay_impl.h>
823
/* dev_info_t handed to MAC as m_dip when an overlay device is registered. */
dev_info_t *overlay_dip;

/*
 * overlay_dev_list is the global list of overlay devices. It is walked,
 * appended to, and removed from only while overlay_dev_lock is held.
 */
static kmutex_t overlay_dev_lock;
static list_t overlay_dev_list;

/*
 * All-zero MAC address that every overlay device registers as its source
 * address (m_src_addr); see overlay_i_create().
 */
static uint8_t overlay_macaddr[ETHERADDRL] =
	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

/*
 * Identifiers for the generic (plugin-independent) device properties. The
 * values double as indices into overlay_dev_props[], so the enum and the
 * table must be kept in the same order.
 */
typedef enum overlay_dev_prop {
	OVERLAY_DEV_P_MTU = 0,
	OVERLAY_DEV_P_VNETID,
	OVERLAY_DEV_P_ENCAP,
	OVERLAY_DEV_P_VARPDID
} overlay_dev_prop_t;

/*
 * Number of entries in overlay_dev_props[]. Property ids at or above this
 * value refer to plugin properties (plugin index + OVERLAY_DEV_NPROPS).
 */
#define	OVERLAY_DEV_NPROPS	4
static const char *overlay_dev_props[] = {
	"mtu",
	"vnetid",
	"encap",
	"varpd/id"
};

/*
 * MTU bounds and default for an overlay device. The rationale for these
 * values is in the "GLDv3 Notes" section of the big theory statement.
 */
#define	OVERLAY_MTU_MIN	576
#define	OVERLAY_MTU_DEF	1400
#define	OVERLAY_MTU_MAX	8900
848
849 overlay_dev_t *
overlay_hold_by_dlid(datalink_id_t id)850 overlay_hold_by_dlid(datalink_id_t id)
851 {
852 overlay_dev_t *o;
853
854 mutex_enter(&overlay_dev_lock);
855 for (o = list_head(&overlay_dev_list); o != NULL;
856 o = list_next(&overlay_dev_list, o)) {
857 if (id == o->odd_linkid) {
858 mutex_enter(&o->odd_lock);
859 o->odd_ref++;
860 mutex_exit(&o->odd_lock);
861 mutex_exit(&overlay_dev_lock);
862 return (o);
863 }
864 }
865
866 mutex_exit(&overlay_dev_lock);
867 return (NULL);
868 }
869
/*
 * Drop a hold on an overlay device, i.e. undo the odd_ref increment performed
 * by overlay_hold_by_dlid(). This takes odd_lock itself, so the caller must
 * not already hold it.
 */
void
overlay_hold_rele(overlay_dev_t *odd)
{
	mutex_enter(&odd->odd_lock);
	ASSERT(odd->odd_ref > 0);
	odd->odd_ref--;
	mutex_exit(&odd->odd_lock);
}
878
879 void
overlay_io_start(overlay_dev_t * odd,overlay_dev_flag_t flag)880 overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
881 {
882 ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
883 ASSERT(MUTEX_HELD(&odd->odd_lock));
884
885 if (flag & OVERLAY_F_IN_RX)
886 odd->odd_rxcount++;
887 if (flag & OVERLAY_F_IN_TX)
888 odd->odd_txcount++;
889 odd->odd_flags |= flag;
890 }
891
892 void
overlay_io_done(overlay_dev_t * odd,overlay_dev_flag_t flag)893 overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
894 {
895 boolean_t signal = B_FALSE;
896
897 ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
898 ASSERT(MUTEX_HELD(&odd->odd_lock));
899
900 if (flag & OVERLAY_F_IN_RX) {
901 ASSERT(odd->odd_rxcount > 0);
902 odd->odd_rxcount--;
903 if (odd->odd_rxcount == 0) {
904 signal = B_TRUE;
905 odd->odd_flags &= ~OVERLAY_F_IN_RX;
906 }
907 }
908 if (flag & OVERLAY_F_IN_TX) {
909 ASSERT(odd->odd_txcount > 0);
910 odd->odd_txcount--;
911 if (odd->odd_txcount == 0) {
912 signal = B_TRUE;
913 odd->odd_flags &= ~OVERLAY_F_IN_TX;
914 }
915 }
916
917 if (signal == B_TRUE)
918 cv_broadcast(&odd->odd_iowait);
919 }
920
921 static void
overlay_io_wait(overlay_dev_t * odd,overlay_dev_flag_t flag)922 overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
923 {
924 ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
925 ASSERT(MUTEX_HELD(&odd->odd_lock));
926
927 while (odd->odd_flags & flag) {
928 cv_wait(&odd->odd_iowait, &odd->odd_lock);
929 }
930 }
931
932 void
overlay_dev_iter(overlay_dev_iter_f func,void * arg)933 overlay_dev_iter(overlay_dev_iter_f func, void *arg)
934 {
935 overlay_dev_t *odd;
936
937 mutex_enter(&overlay_dev_lock);
938 for (odd = list_head(&overlay_dev_list); odd != NULL;
939 odd = list_next(&overlay_dev_list, odd)) {
940 if (func(odd, arg) != 0) {
941 mutex_exit(&overlay_dev_lock);
942 return;
943 }
944 }
945 mutex_exit(&overlay_dev_lock);
946 }
947
/*
 * GLDv3 mc_getstat entry point. We don't support the stat interfaces today
 * (see the "GLDv3 Notes" section of the big theory statement), so every
 * request fails with ENOTSUP and *val is left untouched.
 */
/* ARGSUSED */
static int
overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
{
	return (ENOTSUP);
}
954
955 static int
overlay_m_start(void * arg)956 overlay_m_start(void *arg)
957 {
958 overlay_dev_t *odd = arg;
959 overlay_mux_t *mux;
960 int ret, domain, family, prot;
961 struct sockaddr_storage storage;
962 socklen_t slen;
963
964 mutex_enter(&odd->odd_lock);
965 if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) {
966 mutex_exit(&odd->odd_lock);
967 return (EAGAIN);
968 }
969 mutex_exit(&odd->odd_lock);
970
971 ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
972 &family, &prot, (struct sockaddr *)&storage, &slen);
973 if (ret != 0)
974 return (ret);
975
976 mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
977 (struct sockaddr *)&storage, slen, &ret);
978 if (mux == NULL)
979 return (ret);
980
981 overlay_mux_add_dev(mux, odd);
982 odd->odd_mux = mux;
983 mutex_enter(&odd->odd_lock);
984 ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
985 odd->odd_flags |= OVERLAY_F_IN_MUX;
986 mutex_exit(&odd->odd_lock);
987
988 return (0);
989 }
990
991 static void
overlay_m_stop(void * arg)992 overlay_m_stop(void *arg)
993 {
994 overlay_dev_t *odd = arg;
995
996 /*
997 * The MAC Perimeter is held here, so we don't have to worry about
998 * synchronizing this with respect to metadata operations.
999 */
1000 mutex_enter(&odd->odd_lock);
1001 VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
1002 VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
1003 odd->odd_flags |= OVERLAY_F_MDDROP;
1004 overlay_io_wait(odd, OVERLAY_F_IOMASK);
1005 mutex_exit(&odd->odd_lock);
1006
1007 overlay_mux_remove_dev(odd->odd_mux, odd);
1008 overlay_mux_close(odd->odd_mux);
1009 odd->odd_mux = NULL;
1010
1011 mutex_enter(&odd->odd_lock);
1012 odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1013 odd->odd_flags &= ~OVERLAY_F_MDDROP;
1014 VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
1015 mutex_exit(&odd->odd_lock);
1016 }
1017
/*
 * GLDv3 mc_setpromisc entry point. An overlay device has no notion of
 * promiscuous mode, so the request is accepted and nothing is done. For more
 * info on this, see the "GLDv3 Notes" section of the big theory statement.
 */
/* ARGSUSED */
static int
overlay_m_promisc(void *arg, boolean_t on)
{
	return (0);
}
1027
/*
 * GLDv3 mc_multicst entry point. No multicast addresses are programmed into
 * an overlay device, so adds and removes are always accepted and nothing is
 * done. For more info on this, see the "GLDv3 Notes" section of the big
 * theory statement.
 */
/* ARGSUSED */
static int
overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
{
	return (0);
}
1037
/*
 * GLDv3 mc_unicst entry point. No unicast address is programmed into an
 * overlay device, so any address is accepted and nothing is done. For more
 * info on this, see the "GLDv3 Notes" section of the big theory statement.
 */
/* ARGSUSED */
static int
overlay_m_unicast(void *arg, const uint8_t *macaddr)
{
	return (0);
}
1047
/*
 * GLDv3 mc_tx entry point: transmit a b_next-linked chain of messages.
 *
 * Each message is looked up in the target cache, encapsulated by the plugin
 * and sent out through the device's mux. The return value is whatever part of
 * the chain had not yet been looked at when processing stopped, which is NULL
 * when the whole chain was consumed. If the device is being stopped or is not
 * attached to a mux, the entire chain is freed and NULL is returned.
 */
mblk_t *
overlay_m_tx(void *arg, mblk_t *mp_chain)
{
	overlay_dev_t *odd = arg;
	mblk_t *mp, *ep;
	int ret;
	ovep_encap_info_t einfo;
	struct msghdr hdr;

	/*
	 * Refuse to transmit while overlay_m_stop() is draining I/O (MDDROP)
	 * or when we have no mux; otherwise account for ourselves as in-flight
	 * TX so that overlay_m_stop() will wait for us.
	 */
	mutex_enter(&odd->odd_lock);
	if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
	    !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
		mutex_exit(&odd->odd_lock);
		freemsgchain(mp_chain);
		return (NULL);
	}
	overlay_io_start(odd, OVERLAY_F_IN_TX);
	mutex_exit(&odd->odd_lock);

	bzero(&hdr, sizeof (struct msghdr));

	bzero(&einfo, sizeof (ovep_encap_info_t));
	einfo.ovdi_id = odd->odd_vid;
	mp = mp_chain;
	while (mp != NULL) {
		socklen_t slen;
		struct sockaddr_storage storage;

		/*
		 * Unlink mp from the chain. From here on mp_chain refers to
		 * the rest of the chain, which is what we return if we stop
		 * early.
		 */
		mp_chain = mp->b_next;
		mp->b_next = NULL;
		ep = NULL;

		ret = overlay_target_lookup(odd, mp,
		    (struct sockaddr *)&storage, &slen);
		if (ret != OVERLAY_TARGET_OK) {
			/*
			 * Only an explicit drop is freed here. For any other
			 * result mp is left alone -- presumably the target
			 * code has kept it (e.g. queued it for a varpd
			 * lookup); confirm against overlay_target_lookup().
			 */
			if (ret == OVERLAY_TARGET_DROP)
				freemsg(mp);
			mp = mp_chain;
			continue;
		}

		hdr.msg_name = &storage;
		hdr.msg_namelen = slen;

		ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
		    &einfo, &ep);
		if (ret != 0 || ep == NULL) {
			/* Encapsulation failed: drop this message and stop. */
			freemsg(mp);
			goto out;
		}

		ASSERT(ep->b_cont == mp || ep == mp);
		/*
		 * NOTE(review): ep is not freed here when the send fails;
		 * presumably overlay_mux_tx() consumes it either way --
		 * confirm.
		 */
		ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
		if (ret != 0)
			goto out;

		mp = mp_chain;
	}

out:
	mutex_enter(&odd->odd_lock);
	overlay_io_done(odd, OVERLAY_F_IN_TX);
	mutex_exit(&odd->odd_lock);
	return (mp_chain);
}
1113
/*
 * GLDv3 mc_ioctl entry point. No ioctls are supported through this path;
 * every request is nak'd with ENOTSUP.
 */
/* ARGSUSED */
static void
overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
	miocnak(q, mp, 0, ENOTSUP);
}
1120
1121 /* ARGSUSED */
1122 static boolean_t
overlay_m_getcapab(void * arg,mac_capab_t cap,void * cap_data)1123 overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1124 {
1125 /*
1126 * Tell MAC we're an overlay.
1127 */
1128 if (cap == MAC_CAPAB_OVERLAY)
1129 return (B_TRUE);
1130 return (B_FALSE);
1131 }
1132
1133 /* ARGSUSED */
1134 static int
overlay_m_setprop(void * arg,const char * pr_name,mac_prop_id_t pr_num,uint_t pr_valsize,const void * pr_val)1135 overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1136 uint_t pr_valsize, const void *pr_val)
1137 {
1138 uint32_t mtu, old;
1139 int err;
1140 overlay_dev_t *odd = arg;
1141
1142 if (pr_num != MAC_PROP_MTU)
1143 return (ENOTSUP);
1144
1145 bcopy(pr_val, &mtu, sizeof (mtu));
1146 if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX)
1147 return (EINVAL);
1148
1149 mutex_enter(&odd->odd_lock);
1150 old = odd->odd_mtu;
1151 odd->odd_mtu = mtu;
1152 err = mac_maxsdu_update(odd->odd_mh, mtu);
1153 if (err != 0)
1154 odd->odd_mtu = old;
1155 mutex_exit(&odd->odd_lock);
1156
1157 return (err);
1158 }
1159
/*
 * GLDv3 mc_getprop entry point. There is nothing for us to return here: as
 * the big theory statement notes, MAC never calls into us for the MTU and no
 * other link properties apply. Every request fails with ENOTSUP.
 */
/* ARGSUSED */
static int
overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
    uint_t pr_valsize, void *pr_val)
{
	return (ENOTSUP);
}
1167
1168 /* ARGSUSED */
1169 static void
overlay_m_propinfo(void * arg,const char * pr_name,mac_prop_id_t pr_num,mac_prop_info_handle_t prh)1170 overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1171 mac_prop_info_handle_t prh)
1172 {
1173 if (pr_num != MAC_PROP_MTU)
1174 return;
1175
1176 mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
1177 mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX);
1178 }
1179
/*
 * GLDv3 callback vector for overlay devices; overlay_i_create() hands it to
 * MAC through m_callbacks. mc_callbacks carries the MC_* flags for the ioctl,
 * getcapab, setprop, getprop and propinfo entry points, all of which are
 * filled in below.
 */
static mac_callbacks_t overlay_m_callbacks = {
	.mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
	    MC_PROPINFO),
	.mc_getstat = overlay_m_stat,
	.mc_start = overlay_m_start,
	.mc_stop = overlay_m_stop,
	.mc_setpromisc = overlay_m_promisc,
	.mc_multicst = overlay_m_multicast,
	.mc_unicst = overlay_m_unicast,
	.mc_tx = overlay_m_tx,
	.mc_ioctl = overlay_m_ioctl,
	.mc_getcapab = overlay_m_getcapab,
	.mc_getprop = overlay_m_getprop,
	.mc_setprop = overlay_m_setprop,
	.mc_propinfo = overlay_m_propinfo
};
1196
1197 static boolean_t
overlay_valid_name(const char * name,size_t buflen)1198 overlay_valid_name(const char *name, size_t buflen)
1199 {
1200 size_t actlen;
1201 int err, i;
1202
1203 for (i = 0; i < buflen; i++) {
1204 if (name[i] == '\0')
1205 break;
1206 }
1207
1208 if (i == 0 || i == buflen)
1209 return (B_FALSE);
1210 actlen = i;
1211 if (strchr(name, '/') != NULL)
1212 return (B_FALSE);
1213 if (u8_validate((char *)name, actlen, NULL,
1214 U8_VALIDATE_ENTIRE, &err) < 0)
1215 return (B_FALSE);
1216 return (B_TRUE);
1217 }
1218
1219 /* ARGSUSED */
1220 static int
overlay_i_create(void * karg,intptr_t arg,int mode,cred_t * cred,int * rvalp)1221 overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1222 {
1223 int err;
1224 uint64_t maxid;
1225 overlay_dev_t *odd, *o;
1226 mac_register_t *mac;
1227 overlay_ioc_create_t *oicp = karg;
1228
1229 if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
1230 return (EINVAL);
1231
1232 odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
1233 odd->odd_linkid = oicp->oic_linkid;
1234 odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
1235 if (odd->odd_plugin == NULL) {
1236 kmem_free(odd, sizeof (overlay_dev_t));
1237 return (ENOENT);
1238 }
1239 err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
1240 &odd->odd_pvoid);
1241 if (err != 0) {
1242 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1243 overlay_plugin_rele(odd->odd_plugin);
1244 kmem_free(odd, sizeof (overlay_dev_t));
1245 return (EINVAL);
1246 }
1247
1248 /*
1249 * Make sure that our virtual network id is valid for the given plugin
1250 * that we're working with.
1251 */
1252 ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1253 maxid = UINT64_MAX;
1254 if (odd->odd_plugin->ovp_id_size != 8)
1255 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
1256 if (oicp->oic_vnetid > maxid) {
1257 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1258 overlay_plugin_rele(odd->odd_plugin);
1259 kmem_free(odd, sizeof (overlay_dev_t));
1260 return (EINVAL);
1261 }
1262 odd->odd_vid = oicp->oic_vnetid;
1263
1264 mac = mac_alloc(MAC_VERSION);
1265 if (mac == NULL) {
1266 mutex_exit(&overlay_dev_lock);
1267 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1268 overlay_plugin_rele(odd->odd_plugin);
1269 kmem_free(odd, sizeof (overlay_dev_t));
1270 return (EINVAL);
1271 }
1272
1273 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1274 mac->m_driver = odd;
1275 mac->m_dip = overlay_dip;
1276 mac->m_dst_addr = NULL;
1277 mac->m_callbacks = &overlay_m_callbacks;
1278 mac->m_pdata = NULL;
1279 mac->m_pdata_size = 0;
1280
1281 mac->m_priv_props = NULL;
1282
1283 /* Let mac handle this itself. */
1284 mac->m_instance = (uint_t)-1;
1285
1286 /*
1287 * There is no real source address that should be used here, but saying
1288 * that we're not ethernet is going to cause its own problems. At the
1289 * end of the say, this is fine.
1290 */
1291 mac->m_src_addr = overlay_macaddr;
1292
1293 /*
1294 * Start with the default MTU as the max SDU. If the MTU is changed, the
1295 * SDU will be changed to reflect that.
1296 */
1297 mac->m_min_sdu = 1;
1298 mac->m_max_sdu = OVERLAY_MTU_DEF;
1299 mac->m_multicast_sdu = 0;
1300
1301 /*
1302 * The underlying device doesn't matter, instead this comes from the
1303 * encapsulation protocol and whether or not they allow VLAN tags.
1304 */
1305 if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
1306 mac->m_margin = VLAN_TAGSZ;
1307 } else {
1308 mac->m_margin = 0;
1309 }
1310
1311 /*
1312 * Today, we have no MAC virtualization, it may make sense in the future
1313 * to go ahead and emulate some subset of this, but it doesn't today.
1314 */
1315 mac->m_v12n = MAC_VIRT_NONE;
1316
1317 mutex_enter(&overlay_dev_lock);
1318 for (o = list_head(&overlay_dev_list); o != NULL;
1319 o = list_next(&overlay_dev_list, o)) {
1320 if (o->odd_linkid == oicp->oic_linkid) {
1321 mutex_exit(&overlay_dev_lock);
1322 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1323 overlay_plugin_rele(odd->odd_plugin);
1324 kmem_free(odd, sizeof (overlay_dev_t));
1325 return (EEXIST);
1326 }
1327
1328 if (o->odd_vid == oicp->oic_vnetid &&
1329 o->odd_plugin == odd->odd_plugin) {
1330 mutex_exit(&overlay_dev_lock);
1331 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1332 overlay_plugin_rele(odd->odd_plugin);
1333 kmem_free(odd, sizeof (overlay_dev_t));
1334 return (EEXIST);
1335 }
1336 }
1337
1338 err = mac_register(mac, &odd->odd_mh);
1339 mac_free(mac);
1340 if (err != 0) {
1341 mutex_exit(&overlay_dev_lock);
1342 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1343 overlay_plugin_rele(odd->odd_plugin);
1344 kmem_free(odd, sizeof (overlay_dev_t));
1345 return (err);
1346 }
1347
1348 err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1349 crgetzoneid(cred));
1350 if (err != 0) {
1351 mutex_exit(&overlay_dev_lock);
1352 (void) mac_unregister(odd->odd_mh);
1353 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1354 overlay_plugin_rele(odd->odd_plugin);
1355 kmem_free(odd, sizeof (overlay_dev_t));
1356 return (err);
1357 }
1358
1359 mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
1360 cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
1361 odd->odd_ref = 0;
1362 odd->odd_flags = 0;
1363 list_insert_tail(&overlay_dev_list, odd);
1364 mutex_exit(&overlay_dev_lock);
1365
1366 return (0);
1367 }
1368
1369 /* ARGSUSED */
1370 static int
overlay_i_activate(void * karg,intptr_t arg,int mode,cred_t * cred,int * rvalp)1371 overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1372 {
1373 int i, ret;
1374 overlay_dev_t *odd;
1375 mac_perim_handle_t mph;
1376 overlay_ioc_activate_t *oiap = karg;
1377 overlay_ioc_propinfo_t *infop;
1378 overlay_ioc_prop_t *oip;
1379 overlay_prop_handle_t phdl;
1380
1381 odd = overlay_hold_by_dlid(oiap->oia_linkid);
1382 if (odd == NULL)
1383 return (ENOENT);
1384
1385 infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
1386 oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
1387 phdl = (overlay_prop_handle_t)infop;
1388
1389 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1390 mutex_enter(&odd->odd_lock);
1391 if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1392 mutex_exit(&odd->odd_lock);
1393 mac_perim_exit(mph);
1394 overlay_hold_rele(odd);
1395 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1396 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1397 return (EEXIST);
1398 }
1399 mutex_exit(&odd->odd_lock);
1400
1401 for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1402 const char *pname = odd->odd_plugin->ovp_props[i];
1403 bzero(infop, sizeof (overlay_ioc_propinfo_t));
1404 overlay_prop_init(phdl);
1405 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
1406 if (ret != 0) {
1407 mac_perim_exit(mph);
1408 overlay_hold_rele(odd);
1409 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1410 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1411 return (ret);
1412 }
1413
1414 if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
1415 continue;
1416 bzero(oip, sizeof (overlay_ioc_prop_t));
1417 oip->oip_size = sizeof (oip->oip_value);
1418 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1419 pname, oip->oip_value, &oip->oip_size);
1420 if (ret != 0) {
1421 mac_perim_exit(mph);
1422 overlay_hold_rele(odd);
1423 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1424 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1425 return (ret);
1426 }
1427 if (oip->oip_size == 0) {
1428 mac_perim_exit(mph);
1429 overlay_hold_rele(odd);
1430 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1431 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1432 return (EINVAL);
1433 }
1434 }
1435
1436 mutex_enter(&odd->odd_lock);
1437 if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
1438 mutex_exit(&odd->odd_lock);
1439 mac_perim_exit(mph);
1440 overlay_hold_rele(odd);
1441 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1442 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1443 return (ENXIO);
1444 }
1445
1446 ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
1447 odd->odd_flags |= OVERLAY_F_ACTIVATED;
1448
1449 /*
1450 * Now that we've activated ourselves, we should indicate to the world
1451 * that we're up. Note that we may not be able to perform lookups at
1452 * this time, but our notion of being 'up' isn't dependent on that
1453 * ability.
1454 */
1455 mac_link_update(odd->odd_mh, LINK_STATE_UP);
1456 mutex_exit(&odd->odd_lock);
1457
1458 mac_perim_exit(mph);
1459 overlay_hold_rele(odd);
1460 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1461 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1462
1463 return (0);
1464 }
1465
1466 /* ARGSUSED */
1467 static int
overlay_i_delete(void * karg,intptr_t arg,int mode,cred_t * cred,int * rvalp)1468 overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1469 {
1470 overlay_ioc_delete_t *oidp = karg;
1471 overlay_dev_t *odd;
1472 datalink_id_t tid;
1473 int ret;
1474
1475 odd = overlay_hold_by_dlid(oidp->oid_linkid);
1476 if (odd == NULL) {
1477 return (ENOENT);
1478 }
1479
1480 mutex_enter(&odd->odd_lock);
1481 /* If we're not the only hold, we're busy */
1482 if (odd->odd_ref != 1) {
1483 mutex_exit(&odd->odd_lock);
1484 overlay_hold_rele(odd);
1485 return (EBUSY);
1486 }
1487
1488 if (odd->odd_flags & OVERLAY_F_IN_MUX) {
1489 mutex_exit(&odd->odd_lock);
1490 overlay_hold_rele(odd);
1491 return (EBUSY);
1492 }
1493
1494 /*
1495 * To remove this, we need to first remove it from dls and then remove
1496 * it from mac. The act of removing it from mac will check if there are
1497 * devices on top of this, eg. vnics. If there are, then that will fail
1498 * and we'll have to go through and recreate the dls entry. Only after
1499 * mac_unregister has succeeded, then we'll go through and actually free
1500 * everything and drop the dev lock.
1501 */
1502 ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
1503 if (ret != 0) {
1504 overlay_hold_rele(odd);
1505 return (ret);
1506 }
1507
1508 ASSERT(oidp->oid_linkid == tid);
1509 ret = mac_disable(odd->odd_mh);
1510 if (ret != 0) {
1511 (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1512 crgetzoneid(cred));
1513 overlay_hold_rele(odd);
1514 return (ret);
1515 }
1516
1517 overlay_target_quiesce(odd->odd_target);
1518
1519 mutex_enter(&overlay_dev_lock);
1520 list_remove(&overlay_dev_list, odd);
1521 mutex_exit(&overlay_dev_lock);
1522
1523 cv_destroy(&odd->odd_iowait);
1524 mutex_destroy(&odd->odd_lock);
1525 overlay_target_free(odd);
1526 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1527 overlay_plugin_rele(odd->odd_plugin);
1528 kmem_free(odd, sizeof (overlay_dev_t));
1529
1530 return (0);
1531 }
1532
1533 /* ARGSUSED */
1534 static int
overlay_i_nprops(void * karg,intptr_t arg,int mode,cred_t * cred,int * rvalp)1535 overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
1536 int *rvalp)
1537 {
1538 overlay_dev_t *odd;
1539 overlay_ioc_nprops_t *on = karg;
1540
1541 odd = overlay_hold_by_dlid(on->oipn_linkid);
1542 if (odd == NULL)
1543 return (ENOENT);
1544 on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS;
1545 overlay_hold_rele(odd);
1546
1547 return (0);
1548 }
1549
1550 static int
overlay_propinfo_plugin_cb(overlay_plugin_t * opp,void * arg)1551 overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
1552 {
1553 overlay_prop_handle_t phdl = arg;
1554 overlay_prop_set_range_str(phdl, opp->ovp_name);
1555 return (0);
1556 }
1557
1558 static int
overlay_i_name_to_propid(overlay_dev_t * odd,const char * name,uint_t * id)1559 overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
1560 {
1561 int i;
1562
1563 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1564 if (strcmp(overlay_dev_props[i], name) == 0) {
1565 *id = i;
1566 return (0);
1567 }
1568 }
1569
1570 for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1571 if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) {
1572 *id = i + OVERLAY_DEV_NPROPS;
1573 return (0);
1574 }
1575 }
1576
1577 return (ENOENT);
1578 }
1579
1580 static void
overlay_i_propinfo_mtu(overlay_dev_t * odd,overlay_prop_handle_t phdl)1581 overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
1582 {
1583 uint32_t def;
1584 mac_propval_range_t range;
1585 uint_t perm;
1586
1587 ASSERT(MAC_PERIM_HELD(odd->odd_mh));
1588
1589 bzero(&range, sizeof (mac_propval_range_t));
1590 range.mpr_count = 1;
1591 if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
1592 sizeof (def), &range, &perm) != 0)
1593 return;
1594
1595 if (perm == MAC_PROP_PERM_READ)
1596 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1597 else if (perm == MAC_PROP_PERM_WRITE)
1598 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
1599 else if (perm == MAC_PROP_PERM_RW)
1600 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1601
1602 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1603 overlay_prop_set_default(phdl, &def, sizeof (def));
1604 overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
1605 range.mpr_range_uint32[0].mpur_max);
1606 }
1607
1608 /* ARGSUSED */
1609 static int
overlay_i_propinfo(void * karg,intptr_t arg,int mode,cred_t * cred,int * rvalp)1610 overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
1611 int *rvalp)
1612 {
1613 overlay_dev_t *odd;
1614 int ret;
1615 mac_perim_handle_t mph;
1616 uint_t propid = UINT_MAX;
1617 overlay_ioc_propinfo_t *oip = karg;
1618 overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
1619
1620 odd = overlay_hold_by_dlid(oip->oipi_linkid);
1621 if (odd == NULL)
1622 return (ENOENT);
1623
1624 overlay_prop_init(phdl);
1625 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1626
1627 /*
1628 * If the id is -1, then the property that we're looking for is named in
1629 * oipi_name and we should fill in its id. Otherwise, we've been given
1630 * an id and we need to turn that into a name for our plugin's sake. The
1631 * id is our own fabrication for property discovery.
1632 */
1633 if (oip->oipi_id == -1) {
1634 /*
1635 * Determine if it's a known generic property or it belongs to a
1636 * module by checking against the list of known names.
1637 */
1638 oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1639 if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name,
1640 &propid)) != 0) {
1641 overlay_hold_rele(odd);
1642 mac_perim_exit(mph);
1643 return (ret);
1644 }
1645 oip->oipi_id = propid;
1646 if (propid >= OVERLAY_DEV_NPROPS) {
1647 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1648 oip->oipi_name, phdl);
1649 overlay_hold_rele(odd);
1650 mac_perim_exit(mph);
1651 return (ret);
1652
1653 }
1654 } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
1655 uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
1656
1657 if (id >= odd->odd_plugin->ovp_nprops) {
1658 overlay_hold_rele(odd);
1659 mac_perim_exit(mph);
1660 return (EINVAL);
1661 }
1662 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1663 odd->odd_plugin->ovp_props[id], phdl);
1664 overlay_hold_rele(odd);
1665 mac_perim_exit(mph);
1666 return (ret);
1667 } else if (oip->oipi_id < -1) {
1668 overlay_hold_rele(odd);
1669 mac_perim_exit(mph);
1670 return (EINVAL);
1671 } else {
1672 ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
1673 ASSERT(oip->oipi_id >= 0);
1674 propid = oip->oipi_id;
1675 (void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
1676 sizeof (oip->oipi_name));
1677 }
1678
1679 switch (propid) {
1680 case OVERLAY_DEV_P_MTU:
1681 overlay_i_propinfo_mtu(odd, phdl);
1682 break;
1683 case OVERLAY_DEV_P_VNETID:
1684 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1685 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1686 overlay_prop_set_nodefault(phdl);
1687 break;
1688 case OVERLAY_DEV_P_ENCAP:
1689 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1690 overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
1691 overlay_prop_set_nodefault(phdl);
1692 overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
1693 break;
1694 case OVERLAY_DEV_P_VARPDID:
1695 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1696 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1697 overlay_prop_set_nodefault(phdl);
1698 break;
1699 default:
1700 overlay_hold_rele(odd);
1701 mac_perim_exit(mph);
1702 return (ENOENT);
1703 }
1704
1705 overlay_hold_rele(odd);
1706 mac_perim_exit(mph);
1707 return (0);
1708 }
1709
1710 /* ARGSUSED */
1711 static int
overlay_i_getprop(void * karg,intptr_t arg,int mode,cred_t * cred,int * rvalp)1712 overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1713 int *rvalp)
1714 {
1715 int ret;
1716 overlay_dev_t *odd;
1717 mac_perim_handle_t mph;
1718 overlay_ioc_prop_t *oip = karg;
1719 uint_t propid, mtu;
1720
1721 odd = overlay_hold_by_dlid(oip->oip_linkid);
1722 if (odd == NULL)
1723 return (ENOENT);
1724
1725 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1726 oip->oip_size = OVERLAY_PROP_SIZEMAX;
1727 oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1728 if (oip->oip_id == -1) {
1729 int i;
1730
1731 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1732 if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1733 break;
1734 if (i == OVERLAY_DEV_NPROPS) {
1735 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
1736 odd->odd_pvoid, oip->oip_name,
1737 oip->oip_value, &oip->oip_size);
1738 overlay_hold_rele(odd);
1739 mac_perim_exit(mph);
1740 return (ret);
1741 }
1742 }
1743
1744 propid = i;
1745 } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1746 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1747
1748 if (id > odd->odd_plugin->ovp_nprops) {
1749 overlay_hold_rele(odd);
1750 mac_perim_exit(mph);
1751 return (EINVAL);
1752 }
1753 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1754 odd->odd_plugin->ovp_props[id], oip->oip_value,
1755 &oip->oip_size);
1756 overlay_hold_rele(odd);
1757 mac_perim_exit(mph);
1758 return (ret);
1759 } else if (oip->oip_id < -1) {
1760 overlay_hold_rele(odd);
1761 mac_perim_exit(mph);
1762 return (EINVAL);
1763 } else {
1764 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1765 ASSERT(oip->oip_id >= 0);
1766 propid = oip->oip_id;
1767 }
1768
1769 ret = 0;
1770 switch (propid) {
1771 case OVERLAY_DEV_P_MTU:
1772 /*
1773 * The MTU is always set and retrieved through MAC, to allow for
1774 * MAC to do whatever it wants, as really that property belongs
1775 * to MAC. This is important for things where vnics have hold on
1776 * the MTU.
1777 */
1778 mac_sdu_get(odd->odd_mh, NULL, &mtu);
1779 bcopy(&mtu, oip->oip_value, sizeof (uint_t));
1780 oip->oip_size = sizeof (uint_t);
1781 break;
1782 case OVERLAY_DEV_P_VNETID:
1783 /*
1784 * While it's read-only while inside of a mux, we're not in a
1785 * context that can guarantee that. Therefore we always grab the
1786 * overlay_dev_t's odd_lock.
1787 */
1788 mutex_enter(&odd->odd_lock);
1789 bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
1790 mutex_exit(&odd->odd_lock);
1791 oip->oip_size = sizeof (uint64_t);
1792 break;
1793 case OVERLAY_DEV_P_ENCAP:
1794 oip->oip_size = strlcpy((char *)oip->oip_value,
1795 odd->odd_plugin->ovp_name, oip->oip_size);
1796 break;
1797 case OVERLAY_DEV_P_VARPDID:
1798 mutex_enter(&odd->odd_lock);
1799 if (odd->odd_flags & OVERLAY_F_VARPD) {
1800 const uint64_t val = odd->odd_target->ott_id;
1801 bcopy(&val, oip->oip_value, sizeof (uint64_t));
1802 oip->oip_size = sizeof (uint64_t);
1803 } else {
1804 oip->oip_size = 0;
1805 }
1806 mutex_exit(&odd->odd_lock);
1807 break;
1808 default:
1809 ret = ENOENT;
1810 }
1811
1812 overlay_hold_rele(odd);
1813 mac_perim_exit(mph);
1814 return (ret);
1815 }
1816
1817 static void
overlay_setprop_vnetid(overlay_dev_t * odd,uint64_t vnetid)1818 overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
1819 {
1820 mutex_enter(&odd->odd_lock);
1821
1822 /* Simple case, not active */
1823 if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1824 odd->odd_vid = vnetid;
1825 mutex_exit(&odd->odd_lock);
1826 return;
1827 }
1828
1829 /*
1830 * In the hard case, we need to set the drop flag, quiesce I/O and then
1831 * we can go ahead and do everything.
1832 */
1833 odd->odd_flags |= OVERLAY_F_MDDROP;
1834 overlay_io_wait(odd, OVERLAY_F_IOMASK);
1835 mutex_exit(&odd->odd_lock);
1836
1837 overlay_mux_remove_dev(odd->odd_mux, odd);
1838
1839 mutex_enter(&odd->odd_lock);
1840 odd->odd_vid = vnetid;
1841 mutex_exit(&odd->odd_lock);
1842
1843 overlay_mux_add_dev(odd->odd_mux, odd);
1844
1845 mutex_enter(&odd->odd_lock);
1846 ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
1847 odd->odd_flags &= ~OVERLAY_F_MDDROP;
1848 mutex_exit(&odd->odd_lock);
1849 }
1850
1851 /* ARGSUSED */
1852 static int
overlay_i_setprop(void * karg,intptr_t arg,int mode,cred_t * cred,int * rvalp)1853 overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1854 int *rvalp)
1855 {
1856 int ret;
1857 overlay_dev_t *odd;
1858 overlay_ioc_prop_t *oip = karg;
1859 uint_t propid = UINT_MAX;
1860 mac_perim_handle_t mph;
1861 uint64_t maxid, *vidp;
1862
1863 if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
1864 return (EINVAL);
1865
1866 odd = overlay_hold_by_dlid(oip->oip_linkid);
1867 if (odd == NULL)
1868 return (ENOENT);
1869
1870 oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1871 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1872 mutex_enter(&odd->odd_lock);
1873 if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1874 mac_perim_exit(mph);
1875 mutex_exit(&odd->odd_lock);
1876 return (ENOTSUP);
1877 }
1878 mutex_exit(&odd->odd_lock);
1879 if (oip->oip_id == -1) {
1880 int i;
1881
1882 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1883 if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1884 break;
1885 if (i == OVERLAY_DEV_NPROPS) {
1886 ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
1887 odd->odd_pvoid, oip->oip_name,
1888 oip->oip_value, oip->oip_size);
1889 overlay_hold_rele(odd);
1890 mac_perim_exit(mph);
1891 return (ret);
1892 }
1893 }
1894
1895 propid = i;
1896 } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1897 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1898
1899 if (id > odd->odd_plugin->ovp_nprops) {
1900 mac_perim_exit(mph);
1901 overlay_hold_rele(odd);
1902 return (EINVAL);
1903 }
1904 ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
1905 odd->odd_plugin->ovp_props[id], oip->oip_value,
1906 oip->oip_size);
1907 mac_perim_exit(mph);
1908 overlay_hold_rele(odd);
1909 return (ret);
1910 } else if (oip->oip_id < -1) {
1911 mac_perim_exit(mph);
1912 overlay_hold_rele(odd);
1913 return (EINVAL);
1914 } else {
1915 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1916 ASSERT(oip->oip_id >= 0);
1917 propid = oip->oip_id;
1918 }
1919
1920 ret = 0;
1921 switch (propid) {
1922 case OVERLAY_DEV_P_MTU:
1923 ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
1924 oip->oip_value, oip->oip_size);
1925 break;
1926 case OVERLAY_DEV_P_VNETID:
1927 if (oip->oip_size != sizeof (uint64_t)) {
1928 ret = EINVAL;
1929 break;
1930 }
1931 vidp = (uint64_t *)oip->oip_value;
1932 ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1933 maxid = UINT64_MAX;
1934 if (odd->odd_plugin->ovp_id_size != 8)
1935 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
1936 1ULL;
1937 if (*vidp >= maxid) {
1938 ret = EINVAL;
1939 break;
1940 }
1941 overlay_setprop_vnetid(odd, *vidp);
1942 break;
1943 case OVERLAY_DEV_P_ENCAP:
1944 case OVERLAY_DEV_P_VARPDID:
1945 ret = EPERM;
1946 break;
1947 default:
1948 ret = ENOENT;
1949 }
1950
1951 mac_perim_exit(mph);
1952 overlay_hold_rele(odd);
1953 return (ret);
1954 }
1955
1956 /* ARGSUSED */
1957 static int
overlay_i_status(void * karg,intptr_t arg,int mode,cred_t * cred,int * rvalp)1958 overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
1959 int *rvalp)
1960 {
1961 overlay_dev_t *odd;
1962 overlay_ioc_status_t *os = karg;
1963
1964 odd = overlay_hold_by_dlid(os->ois_linkid);
1965 if (odd == NULL)
1966 return (ENOENT);
1967
1968 mutex_enter(&odd->odd_lock);
1969 if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
1970 os->ois_status = OVERLAY_I_DEGRADED;
1971 (void) strlcpy(os->ois_message, odd->odd_fmamsg,
1972 OVERLAY_STATUS_BUFLEN);
1973 } else {
1974 os->ois_status = OVERLAY_I_OK;
1975 os->ois_message[0] = '\0';
1976 }
1977 mutex_exit(&odd->odd_lock);
1978 overlay_hold_rele(odd);
1979
1980 return (0);
1981 }
1982
1983 static dld_ioc_info_t overlay_ioc_list[] = {
1984 { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
1985 overlay_i_create, secpolicy_dl_config },
1986 { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
1987 overlay_i_activate, secpolicy_dl_config },
1988 { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
1989 overlay_i_delete, secpolicy_dl_config },
1990 { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
1991 sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
1992 secpolicy_dl_config },
1993 { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
1994 sizeof (overlay_ioc_prop_t), overlay_i_getprop,
1995 secpolicy_dl_config },
1996 { OVERLAY_IOC_SETPROP, DLDCOPYIN,
1997 sizeof (overlay_ioc_prop_t), overlay_i_setprop,
1998 secpolicy_dl_config },
1999 { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
2000 sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
2001 secpolicy_dl_config },
2002 { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
2003 sizeof (overlay_ioc_status_t), overlay_i_status,
2004 NULL }
2005 };
2006
2007 static int
overlay_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)2008 overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2009 {
2010 int fmcap = DDI_FM_EREPORT_CAPABLE;
2011 if (cmd != DDI_ATTACH)
2012 return (DDI_FAILURE);
2013
2014 if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
2015 return (DDI_FAILURE);
2016
2017 ddi_fm_init(dip, &fmcap, NULL);
2018
2019 if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
2020 ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE)
2021 return (DDI_FAILURE);
2022
2023 if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
2024 DLDIOCCNT(overlay_ioc_list)) != 0) {
2025 ddi_remove_minor_node(dip, OVERLAY_CTL);
2026 return (DDI_FAILURE);
2027 }
2028
2029 overlay_dip = dip;
2030 return (DDI_SUCCESS);
2031 }
2032
2033 /* ARGSUSED */
2034 static int
overlay_getinfo(dev_info_t * dip,ddi_info_cmd_t cmd,void * arg,void ** resp)2035 overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
2036 {
2037 int error;
2038
2039 switch (cmd) {
2040 case DDI_INFO_DEVT2DEVINFO:
2041 *resp = (void *)overlay_dip;
2042 error = DDI_SUCCESS;
2043 break;
2044 case DDI_INFO_DEVT2INSTANCE:
2045 *resp = (void *)0;
2046 error = DDI_SUCCESS;
2047 break;
2048 default:
2049 error = DDI_FAILURE;
2050 break;
2051 }
2052
2053 return (error);
2054 }
2055
2056 static int
overlay_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)2057 overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2058 {
2059 if (cmd != DDI_DETACH)
2060 return (DDI_FAILURE);
2061
2062 mutex_enter(&overlay_dev_lock);
2063 if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
2064 mutex_exit(&overlay_dev_lock);
2065 return (EBUSY);
2066 }
2067 mutex_exit(&overlay_dev_lock);
2068
2069
2070 dld_ioc_unregister(OVERLAY_IOC);
2071 ddi_remove_minor_node(dip, OVERLAY_CTL);
2072 ddi_fm_fini(dip);
2073 overlay_dip = NULL;
2074 return (DDI_SUCCESS);
2075 }
2076
2077 static struct cb_ops overlay_cbops = {
2078 overlay_target_open, /* cb_open */
2079 overlay_target_close, /* cb_close */
2080 nodev, /* cb_strategy */
2081 nodev, /* cb_print */
2082 nodev, /* cb_dump */
2083 nodev, /* cb_read */
2084 nodev, /* cb_write */
2085 overlay_target_ioctl, /* cb_ioctl */
2086 nodev, /* cb_devmap */
2087 nodev, /* cb_mmap */
2088 nodev, /* cb_segmap */
2089 nochpoll, /* cb_chpoll */
2090 ddi_prop_op, /* cb_prop_op */
2091 NULL, /* cb_stream */
2092 D_MP, /* cb_flag */
2093 CB_REV, /* cb_rev */
2094 nodev, /* cb_aread */
2095 nodev, /* cb_awrite */
2096 };
2097
2098 static struct dev_ops overlay_dev_ops = {
2099 DEVO_REV, /* devo_rev */
2100 0, /* devo_refcnt */
2101 overlay_getinfo, /* devo_getinfo */
2102 nulldev, /* devo_identify */
2103 nulldev, /* devo_probe */
2104 overlay_attach, /* devo_attach */
2105 overlay_detach, /* devo_detach */
2106 nulldev, /* devo_reset */
2107 &overlay_cbops, /* devo_cb_ops */
2108 NULL, /* devo_bus_ops */
2109 NULL, /* devo_power */
2110 ddi_quiesce_not_supported /* devo_quiesce */
2111 };
2112
2113 static struct modldrv overlay_modldrv = {
2114 &mod_driverops,
2115 "Overlay Network Driver",
2116 &overlay_dev_ops
2117 };
2118
2119 static struct modlinkage overlay_linkage = {
2120 MODREV_1,
2121 &overlay_modldrv
2122 };
2123
2124 static int
overlay_init(void)2125 overlay_init(void)
2126 {
2127 mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
2128 list_create(&overlay_dev_list, sizeof (overlay_dev_t),
2129 offsetof(overlay_dev_t, odd_link));
2130 overlay_mux_init();
2131 overlay_plugin_init();
2132 overlay_target_init();
2133
2134 return (DDI_SUCCESS);
2135 }
2136
2137 static void
overlay_fini(void)2138 overlay_fini(void)
2139 {
2140 overlay_target_fini();
2141 overlay_plugin_fini();
2142 overlay_mux_fini();
2143 mutex_destroy(&overlay_dev_lock);
2144 list_destroy(&overlay_dev_list);
2145 }
2146
2147 int
_init(void)2148 _init(void)
2149 {
2150 int err;
2151
2152 if ((err = overlay_init()) != DDI_SUCCESS)
2153 return (err);
2154
2155 mac_init_ops(NULL, "overlay");
2156 err = mod_install(&overlay_linkage);
2157 if (err != DDI_SUCCESS) {
2158 overlay_fini();
2159 return (err);
2160 }
2161
2162 return (0);
2163 }
2164
2165 int
_info(struct modinfo * modinfop)2166 _info(struct modinfo *modinfop)
2167 {
2168 return (mod_info(&overlay_linkage, modinfop));
2169 }
2170
2171 int
_fini(void)2172 _fini(void)
2173 {
2174 int err;
2175
2176 err = mod_remove(&overlay_linkage);
2177 if (err != 0)
2178 return (err);
2179
2180 overlay_fini();
2181 return (0);
2182 }
2183