1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2016 Joyent, Inc. 14 */ 15 16 /* 17 * Overlay Devices 18 * 19 * Overlay devices provide a means for creating overlay networks, a means of 20 * multiplexing multiple logical, isolated, and discrete layer two and layer 21 * three networks on top of one physical network. 22 * 23 * In general, these overlay devices encapsulate the logic to answer two 24 * different questions: 25 * 26 * 1) How should I transform a packet to put it on the wire? 27 * 2) Where should I send a transformed packet? 28 * 29 * Each overlay device is presented to the user as a GLDv3 device. While the 30 * link itself cannot have an IP interface created on top of it, it allows for 31 * additional GLDv3 devices, such as a VNIC, to be created on top of it which 32 * can be plumbed up with IP interfaces. 33 * 34 * 35 * -------------------- 36 * General Architecture 37 * -------------------- 38 * 39 * The logical overlay device that a user sees in dladm(1M) is a combination of 40 * two different components that work together. The first component is this 41 * kernel module, which is responsible for answering question one -- how should 42 * I transform a packet to put it on the wire. 43 * 44 * The second component is what we call the virtual ARP daemon, or varpd. It is 45 * a userland component that is responsible for answering the second question -- 46 * Where should I send a transformed packet. Instances of the kernel overlay 47 * GLDv3 device ask varpd the question of where should a packet go. 48 * 49 * The split was done for a few reasons. 
Importantly, we wanted to keep the act 50 * of generating encapsulated packets in the kernel so as to ensure that the 51 * general data path was fast and also kept simple. On the flip side, while the 52 * question of where should something go may be simple, it may often be 53 * complicated and need to interface with several different external or 54 * distributed systems. In those cases, it's simpler to allow for the full 55 * flexibility of userland to be brought to bear to solve that problem and in 56 * general, the path isn't very common. 57 * 58 * The following is what makes up the logical overlay device that a user would 59 * create with dladm(1M). 60 * 61 * Kernel Userland 62 * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 63 * . +--------+ +--------+ +--------+ . . . 64 * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . . 65 * . +--------+ +--------+ +--------+ . . . 66 * . | | | . . . 67 * . | | | . . . 68 * . +------------+-----------+ . . . 69 * . | . . /dev/overlay . 70 * . +--------------+ . . . +------------+ . 71 * . | | . . . | | . 72 * . | Overlay |======*=================| Virtual | . 73 * . | GLDv3 Device |========================| ARP Daemon | . 74 * . | | . . | | . 75 * . +--------------+ . . +------------+ . 76 * . | . . | . 77 * . | . . | . 78 * . +----------------+ . . +--------+ . 79 * . | Overlay | . . | varpd | . 80 * . | Encapsulation | . . | Lookup | . 81 * . | Plugin | . . | Plugin | . 82 * . +----------------+ . . +--------+ . 83 * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 84 * 85 * 86 * This image shows the two different components and where they live. 87 * Importantly, it also shows that both the kernel overlay device and the 88 * userland varpd both support plugins. The plugins actually implement the 89 * things that users care about and the APIs have been designed to try to 90 * minimize the amount of things that a module writer needs to worry about it. 
91 * 92 * IDENTIFIERS 93 * 94 * Every overlay device is defined by a unique identifier which is the overlay 95 * identifier. Its purpose is similar to that of a VLAN identifier, it's a 96 * unique number that is used to differentiate between different entries on the 97 * wire. 98 * 99 * ENCAPSULATION 100 * 101 * An overlay encapsulation plugin is a kernel miscellaneous module whose 102 * purpose is to contain knowledge about how to transform packets to put them 103 * onto the wire and to take them off. An example of an encapsulation plugin is 104 * vxlan. It's also how support for things like nvgre or geneve would be brought 105 * into the system. 106 * 107 * Each encapsulation plugin defines a series of operation vectors and 108 * properties. For the full details on everything they should provide, please 109 * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible 110 * for telling the system what information is required to send a packet. For 111 * example, vxlan is defined to send everything over a UDP packet and therefore 112 * requires a port and an IP address, while nvgre on the other hand is its own 113 * IP type and therefore just requires an IP address. In addition, it also 114 * provides information about the kind of socket that should be created. This is 115 * used by the kernel multiplexor, more on that in the Kernel Components 116 * section. 117 * 118 * LOOKUPS 119 * 120 * The kernel communicates requests for lookups over the character device 121 * /dev/overlay. varpd is responsible for listening for requests on that device 122 * and answering them. The character device is specific to the target path and 123 * varpd. 124 * 125 * Much as the kernel overlay module handles the bulk of the scaffolding but 126 * leaves the important work to the encapsulation plugin, varpd provides a 127 * similar role and leaves the full brunt of lookups to a userland dynamic 128 * shared object which implements the logic of lookups. 
129 * 130 * Each lookup plugin defines a series of operation vectors and properties. For 131 * the full details on everything that they should provide, please read 132 * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC 133 * address and asked to give an address on the physical network that it should 134 * be sent to. In addition, they handle questions related to how to handle 135 * things like broadcast and multicast traffic, etc. 136 * 137 * ---------- 138 * Properties 139 * ---------- 140 * 141 * A device from a dladm perspective has a unique set of properties that are 142 * combined from three different sources: 143 * 144 * 1) Generic properties that every overlay device has 145 * 2) Properties that are specific to the encapsulation plugin 146 * 3) Properties that are specific to the lookup plugin 147 * 148 * All of these are exposed in a single set of properties in dladm. Note that 149 * these are not necessarily traditional link properties. However, if something 150 * is both a traditional GLDv3 link property, say the MTU of a device, and a 151 * specific property here, then the driver ensures that all existing GLDv3 152 * specific means of manipulating it are used and wraps up its private property 153 * interfaces to ensure that works. 154 * 155 * Properties in the second and third category are prefixed with the name of 156 * their module. For example, the vxlan encapsulation module has a property 157 * called the 'listen_ip'. This property would show up in dladm as 158 * 'vxlan/listen_ip'. This allows different plugins to both use similar names 159 * for similar properties and to also have independent name spaces so that 160 * overlapping names do not conflict with anything else. 161 * 162 * While the kernel combines both sets one and two into a single coherent view, 163 * it does not do anything with respect to the properties that are owned by the 164 * lookup plugin -- those are owned wholly by varpd. 
Instead, libdladm is in 165 * charge of bridging these two worlds into one magical experience for the user. 166 * It carries the burden of knowing about both overlay specific and varpd 167 * specific properties. Importantly, we want to maintain this distinction. We 168 * don't want to treat the kernel as an arbitrary key/value store for varpd and 169 * we want the kernel to own its own data and not have to ask userland for 170 * information that it owns. 171 * 172 * Every property in the system has the following attributes: 173 * 174 * o A name 175 * o A type 176 * o A size 177 * o Permissions 178 * o Default value 179 * o Valid value ranges 180 * o A value 181 * 182 * Everything except for the value is obtained by callers through the propinfo 183 * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX, 184 * currently 256 bytes. 185 * 186 * The following are the supported types of properties: 187 * 188 * OVERLAY_PROP_T_INT 189 * 190 * A signed integer, its length is 8 bytes, corresponding to a 191 * int64_t. 192 * 193 * OVERLAY_PROP_T_UINT 194 * 195 * An unsigned integer, its length is 8 bytes, corresponding to a 196 * uint64_t. 197 * 198 * OVERLAY_PROP_T_IP 199 * 200 * A struct in6_addr, it has a fixed size. 201 * 202 * OVERLAY_PROP_T_STRING 203 * 204 * A null-terminated character string encoded in either ASCII or 205 * UTF-8. Note that the size of the string includes the null 206 * terminator. 207 * 208 * The next thing that we apply to a property is its permission. The permissions 209 * are put together by the bitwise or of the following flags and values. 210 * 211 * OVERLAY_PROP_PERM_REQ 212 * 213 * This indicates a required property. A property that is required 214 * must be set by a consumer before the device can be created. If a 215 * required property has a default property, this constraint is 216 * loosened because the default property defines the value. 
217 * 218 * OVERLAY_PROP_PERM_READ 219 * 220 * This indicates that a property can be read. All properties will 221 * have this value set. 222 * 223 * OVERLAY_PROP_PERM_WRITE 224 * 225 * This indicates that a property can be written to and thus 226 * updated by userland. Properties that are only intended to 227 * display information, will not have OVERLAY_PROP_PERM_WRITE set. 228 * 229 * In addition, a few additional values are defined as a convenience to 230 * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of 231 * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second, 232 * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ, 233 * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a 234 * property should generally be a constant across its lifetime. 235 * 236 * A property may optionally have a default value. If it does have a default 237 * value, and that property is not set to be a different value, then the default 238 * value is inherited automatically. It also means that if the default value is 239 * acceptable, there is no need to set the value for a required property. For 240 * example, the vxlan module has the vxlan/listen_port property which is 241 * required, but has a default value of 4789 (the IANA assigned port). Because 242 * of that default value, there is no need for it to be set. 243 * 244 * Finally, a property may declare a list of valid values. These valid values 245 * are used for display purposes, they are not enforced by the broader system, 246 * but merely allow a means for the information to be communicated to the user 247 * through dladm(1M). Like a default value, this is optional. 248 * 249 * The general scaffolding does not do very much with respect to the getting and 250 * setting of properties. That is really owned by the individual plugins 251 * themselves. 
252 * 253 * ----------------------------- 254 * Destinations and Plugin Types 255 * ----------------------------- 256 * 257 * Both encapsulation and lookup plugins define the kinds of destinations that 258 * they know how to support. There are three different pieces of information 259 * that can be used to address to a destination currently, all of which is 260 * summarized in the type overlay_point_t. Any combination of these is 261 * supported. 262 * 263 * OVERLAY_PLUGIN_D_ETHERNET 264 * 265 * An Ethernet MAC address is required. 266 * 267 * OVERLAY_PLUGIN_D_IP 268 * 269 * An IP address is required. All IP addresses used by the overlay 270 * system are transmitted as IPv6 addresses. IPv4 addresses can be 271 * represented by using IPv4-mapped IPv6 addresses. 272 * 273 * OVERLAY_PLUGIN_D_PORT 274 * 275 * A TCP/UDP port is required. 276 * 277 * A kernel encapsulation plugin declares which of these that it requires, it's 278 * a static set. On the other hand, a userland lookup plugin can be built to 279 * support all of these or any combination thereof. It gets passed the required 280 * destination type, based on the kernel encapsulation method, and then it makes 281 * the determination as to whether or not it supports it. For example, the 282 * direct plugin can support either an IP or both an IP and a port, it simply 283 * doesn't display the direct/dest_port property in the cases where a port is 284 * not required to support this. 285 * 286 * The user lookup plugins have two different modes of operation which 287 * determines how they interact with the broader system and how look ups are 288 * performed. These types are: 289 * 290 * OVERLAY_TARGET_POINT 291 * 292 * A point to point plugin has a single static definition for where 293 * to send all traffic. Every packet in the system always gets sent 294 * to the exact same destination which is programmed into the 295 * kernel when the general device is activated. 
296 * 297 * OVERLAY_TARGET_DYNAMIC 298 * 299 * A dynamic plugin does not have a single static definition. 300 * Instead, for each destination, the kernel makes an asynchronous 301 * request to varpd to determine where the packet should be routed, 302 * and if a specific destination is found, then that destination is 303 * cached in the overlay device's target cache. 304 * 305 * This distinction, while important for the general overlay device's operation, 306 * is not important to the encapsulation plugins. They don't need to know about 307 * any of these pieces. It's just a concern for varpd, the userland plugin, and 308 * the general overlay scaffolding. 309 * 310 * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not 311 * maintain a target cache, and instead just keeps track of the destination and 312 * always sends encapsulated packets to that address. When the target type is of 313 * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such 314 * destinations. These destinations are kept around in an instance of a 315 * reference hash that is specific to the given overlay device. Entries in the 316 * cache can be invalidated and replaced by varpd and its lookup plugins. 317 * 318 * ---------------------------------- 319 * Kernel Components and Architecture 320 * ---------------------------------- 321 * 322 * There are multiple pieces inside the kernel that work together, there is the 323 * general overlay_dev_t structure, which is the logical GLDv3 device, but it 324 * itself has references to things like an instance of an encapsulation plugin, 325 * a pointer to a mux and a target cache. It can roughly be summarized in the 326 * following image: 327 * 328 * +------------------+ 329 * | global | 330 * | overlay list | 331 * | overlay_dev_list | 332 * +------------------+ 333 * | 334 * | +-----------------------+ +---------------+ 335 * +->| GLDv3 Device |----------->| GLDv3 Device | -> ... 
336 * | overlay_dev_t | | overlay_dev_t | 337 * | | +---------------+ 338 * | | 339 * | mac_handle_t -----+---> GLDv3 handle to MAC 340 * | datalink_id_t -----+---> Datalink ID used by DLS 341 * | overlay_dev_flag_t ---+---> Device state 342 * | uint_t -----+---> Current device MTU 343 * | uint_t -----+---> In-progress RX operations 344 * | uint_t -----+---> In-progress TX operations 345 * | char[] -----+---> FMA degraded message 346 * | void * -----+---> plugin private data 347 * | overlay_target_t * ---+---------------------+ 348 * | overlay_plugin_t * ---+---------+ | 349 * +-----------------------+ | | 350 * ^ | | 351 * +--------------------+ | | | 352 * | Kernel Socket | | | | 353 * | Multiplexor | | | | 354 * | overlay_mux_t | | | | 355 * | | | | | 356 * | avl_tree_t -+--+ | | 357 * | uint_t -+--> socket family | | 358 * | uint_t -+--> socket type | | 359 * | uint_t -+--> socket protocol | | 360 * | ksocket_t -+--> I/O socket | | 361 * | struct sockaddr * -+--> ksocket address | | 362 * | overlay_plugin_t --+--------+ | | 363 * +--------------------+ | | | 364 * | | | 365 * +-------------------------+ | | | 366 * | Encap Plugin |<--+-----------+ | 367 * | overlay_plugin_t | | 368 * | | | 369 * | char * ---+--> plugin name | 370 * | overlay_plugin_ops_t * -+--> plugin downcalls | 371 * | char ** (props) ---+--> property list | 372 * | uint_t ---+--> id length | 373 * | overlay_plugin_flags_t -+--> plugin flags | 374 * | overlay_plugin_dest_t --+--> destination type v 375 * +-------------------------+ +-------------------------+ 376 * | Target Cache | 377 * | overlay_target_t | 378 * | | 379 * cache mode <--+- overlay_target_mode_t | 380 * dest type <--+- overlay_plugin_dest_t | 381 * cache flags <--+- overlay_target_flag_t | 382 * varpd id <--+- uint64_t | 383 * outstanding varpd reqs. 
<--+- uint_t | 384 * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t | 385 * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t | 386 * | +-------------------------+ 387 * +-----------------------+ 388 * | 389 * v 390 * +-------------------------------+ +------------------------+ 391 * | Target Entry |-->| Target Entry |--> ... 392 * | overlay_target_entry_t | | overlay_target_entry_t | 393 * | | +------------------------+ 394 * | | 395 * | overlay_target_entry_flags_t -+--> Entry flags 396 * | uint8_t[ETHERADDRL] ---+--> Target MAC address 397 * | overlay_target_point_t ---+--> Target underlay address 398 * | mblk_t * ---+--> outstanding mblk head 399 * | mblk_t * ---+--> outstanding mblk tail 400 * | size_t ---+--> outstanding mblk size 401 * +-------------------------------+ 402 * 403 * The primary entries that we care about are the overlay_dev_t, which 404 * correspond to each overlay device that is created with dladm(1M). Globally, 405 * these devices are maintained in a simple list_t which is protected with a 406 * lock. Hence, these include important information such as the mac_handle_t 407 * and a datalink_id_t which is used to interact with the broader MAC and DLS 408 * ecosystem. We also maintain additional information such as the current state, 409 * outstanding operations, the mtu, and importantly, the plugin's private data. 410 * This is the instance of an encapsulation plugin that gets created as part of 411 * creating an overlay device. Another aspect of this is that the overlay_dev_t 412 * also includes information with respect to FMA. For more information, see the 413 * FMA section. 414 * 415 * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin 416 * is the encapsulation plugin. This allows the device to make downcalls into it 417 * based on doing things like getting and setting properties. Otherwise, the 418 * plugin itself is a fairly straightforward entity. 
They are maintained in a 419 * list (not pictured above). The plugins themselves mostly maintain things like 420 * the static list of properties, what kind of destination they require, and the 421 * operations vector. A given module may contain more if necessary. 422 * 423 * The next piece of the puzzle is the mux, or a multiplexor. The mux itself 424 * maintains a ksocket and it is through the mux that we send and receive 425 * message blocks. The mux represents a socket type and address, as well as a 426 * plugin. Multiple overlay_dev_t devices may then share the same mux. For 427 * example, consider the case where you have different instances of vxlan all on 428 * the same underlay network. These would all logically share the same IP 429 * address and port that packets are sent and received on; however, what differs 430 * is the decapsulation ID. 431 * 432 * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike 433 * a socket, we enable a direct callback on the ksocket. This means that 434 * whenever a message block chain is received, rather than sitting there and 435 * getting a callback in a context and kicking that back out to a taskq, the 436 * data instead comes into the callback function overlay_mux_recv(). 437 * 438 * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx 439 * function) to transmit. It receives encapsulated packets, decapsulates them to 440 * determine the overlay identifier, looks up the given device that matches that 441 * identifier, and then causes the broader MAC world to receive the packet with 442 * a call to mac_rx(). 443 * 444 * Today, we don't do too much that's special with the ksocket; however, as 445 * hardware is gaining understanding for these encapsulation protocols, we'll 446 * probably want to think of better ways to get those capabilities passed down 447 * and potentially better ways to program receive filters so they get directly 448 * to us. Though, that's all fantasy future land. 
449 * 450 * The next part of the puzzle is the target cache. The purpose of the target 451 * cache is to cache where we should send a packet on the underlay network, 452 * given its mac address. The target cache operates in two modes depending on 453 * whether the lookup module was declared to OVERLAY_TARGET_POINT or 454 * OVERLAY_TARGET_DYNAMIC. 455 * 456 * In the case where the target cache has been programmed to be 457 * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t 458 * which has the destination that we send everything, no matter the destination 459 * mac address. 460 * 461 * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things 462 * are much more interesting and as a result, more complicated. We primarily 463 * store lists of overlay_target_entry_t's which are stored in both an avl tree 464 * and a refhash_t. The primary look up path uses the refhash_t and the avl tree 465 * is only used for a few of the target ioctls used to dump data such that we 466 * can get a consistent iteration order for things like dladm show-overlay -t. 467 * The key that we use for the reference hashtable is based on the mac address 468 * in the cache and currently we just do a simple CRC32 to transform it into a 469 * hash. 470 * 471 * Each entry maintains a set of flags to indicate the current status of the 472 * request. The flags may indicate one of three states: that the current cache entry 473 * is valid, that the current cache entry has been directed to drop all output, 474 * and that the current cache entry is invalid and may be being looked up. In 475 * the case where it's valid, we just take the destination address and run with 476 * it. 477 * 478 * If it's invalid and a lookup has not been made, then we start the process 479 * that prepares a query that will make its way up to varpd. The cache entry 480 * maintains a message block chain of outstanding message blocks and a 481 * size. 
These lists are populated only when we don't know the answer as to 482 * where should these be sent. The size entry is used to cap the amount of 483 * outstanding data that we don't know the answer to. If we exceed a cap on the 484 * amount of outstanding data (currently 1 Mb), then we'll drop any additional 485 * packets. Once we get an answer indicating a valid destination, we transmit 486 * any outstanding data to that place. The full story on how we look that up 487 * will be discussed in the section on the Target Cache Lifecycle. 488 * 489 * ------------------------ 490 * FMA and Degraded Devices 491 * ------------------------ 492 * 493 * Every kernel overlay device keeps track of its FMA state. Today in FMA we 494 * cannot represent partitions between resources nor can we represent that a 495 * given minor node of a pseudo device has failed -- if we degrade the overlay 496 * device, then the entire dev_info_t is degraded. However, we still want to be 497 * able to indicate to administrators that things may go wrong. 498 * 499 * To this end, we've added a notion of a degraded state to every overlay 500 * device. This state is primarily dictated by userland and it can happen for 501 * various reasons. Generally, because a userland lookup plugin has been 502 * partitioned, or something has gone wrong such that there is no longer any 503 * userland lookup module for a device, then we'll mark it degraded. 504 * 505 * As long as any of our minor instances is degraded, then we'll fire off the 506 * FMA event to note that. Once the last degraded instance is no longer 507 * degraded, then we'll end up telling FMA that we're all clean. 508 * 509 * To help administrators get a better sense of which of the various minor 510 * devices is wrong, we store the odd_fmamsg[] character array. This character 511 * array can be fetched with doing a dladm show-overlay -f. 512 * 513 * Note, that it's important that we do not update the link status of the 514 * devices. 
We want to remain up as much as possible. By changing the link in a 515 * degraded state, this may end up making things worse. We may still actually 516 * have information in the target cache and if we mark the link down, that'll 517 * result in not being able to use it. The reason being that this'll mark all 518 * the downstream VNICs down which will go to IP and from there we end up 519 * dealing with sadness. 520 * 521 * ----------------------- 522 * Target Cache Life Cycle 523 * ----------------------- 524 * 525 * This section only applies when we have a lookup plugin of 526 * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type 527 * OVERLAY_TARGET_POINT. 528 * 529 * While we got into the target cache in the general architecture section, it's 530 * worth going into more details as to how this actually works and showing some 531 * examples and state machines. Recall that a target cache entry basically has 532 * the following state transition diagram: 533 * 534 * Initial state 535 * . . . . . . first access . . . varpd lookup enqueued 536 * . . . 537 * . . . 538 * +-------+ . +----------+ . 539 * | No |------*---->| Invalid |-------*----+ 540 * | Entry | | Entry | | 541 * +-------+ +----------+ | 542 * varpd ^ ^ varpd | 543 * invalidate | | drop | 544 * . . . * * . . v 545 * +-------+ | | +---------+ 546 * | Entry |--->-----+ +----<----| Entry | 547 * | Valid |<----------*---------<----| Pending |->-+ varpd 548 * +-------+ . +---------+ * . . drop, but 549 * . varpd ^ | other queued 550 * . success | | entries 551 * +-----+ 552 * 553 * When the table is first created, it is empty. As we attempt to lookup entries 554 * and we find there is no entry at all, we'll create a new table entry for it. 555 * At that point the entry is technically in an invalid state, that means that 556 * we have no valid data from varpd. 
In that case, we'll go ahead and queue the 557 * packet into the entry's pending chain, and queue a varpd lookup, setting the 558 * OVERLAY_ENTRY_F_PENDING flag in the process. 559 * 560 * If additional mblk_t's come in for this entry, we end up appending them to 561 * the tail of the chain, if and only if, we don't exceed the threshold for the 562 * amount of space they can take up. An entry remains pending until we get a 563 * varpd reply. If varpd replies with a valid result, we move to the valid 564 * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one 565 * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate. 566 * 567 * Once an entry is valid, it stays valid until user land tells us to invalidate 568 * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and 569 * OVERLAY_TARG_CACHE_SET respectively. 570 * 571 * If the lookup fails with a call to drop the packet, then the next state is 572 * determined by the state of the queue. If the set of outstanding entries is 573 * empty, then we just transition back to the invalid state. If instead, the 574 * set of outstanding entries is not empty, then we'll queue another entry and 575 * stay in the same state, repeating this until the number of requests is 576 * drained. 577 * 578 * The following images describe the flow of a given lookup and where the 579 * overlay_target_entry_t is at any given time. 580 * 581 * +-------------------+ 582 * | Invalid Entry | An entry starts off as an invalid entry 583 * | de:ad:be:ef:00:00 | and only exists in the target cache. 584 * +-------------------+ 585 * 586 * ~~~~ 587 * 588 * +---------------------+ 589 * | Global list_t | A mblk_t comes in for an entry. We 590 * | overlay_target_list | append it to the overlay_target_list. 591 * +---------------------+ 592 * | 593 * v 594 * +-------------------+ +-------------------+ 595 * | Pending Entry |----->| Pending Entry |--->... 
596 * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 | 597 * +-------------------+ +-------------------+ 598 * 599 * ~~~~ 600 * 601 * +--------------------------+ 602 * | /dev/overlay minor state | User land said that it would look up an 603 * | overlay_target_hdl_t | entry for us. We remove it from the 604 * +--------------------------+ global list and add it to the handle's 605 * | outstanding list. 606 * | 607 * v 608 * +-------------------+ +-------------------+ 609 * | Pending Entry |----->| Pending Entry | 610 * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 | 611 * +-------------------+ +-------------------+ 612 * 613 * ~~~~ 614 * 615 * +-------------------+ 616 * | Valid Entry | varpd returned an answer with 617 * | de:ad:be:ef:00:00 | OVERLAY_IOC_RESPOND and the target cache 618 * | 10.169.23.42:4789 | entry is now populated with a 619 * +-------------------+ destination and marked as valid 620 * 621 * 622 * The lookup mechanism is performed via a series of operations on the character 623 * pseudo-device /dev/overlay. The only thing that uses this device is the 624 * userland daemon varpd. /dev/overlay is a cloneable device, each open of it 625 * granting a new minor number which maintains its own state. We maintain this 626 * state so that way if an outstanding lookup was queued to something that 627 * crashed or closed its handle without responding, we can know about this and 628 * thus handle it appropriately. 629 * 630 * When a lookup is first created it's added to our global list of outstanding 631 * lookups. To service requests, userland is required to perform an ioctl to ask 632 * for a request. We will block it in the kernel a set amount of time waiting 633 * for a request. When we give a request to a given minor instance of the 634 * device, we remove it from the global list and append the request to the 635 * device's list of outstanding entries, for the reasons we discussed above. 
636 * When a lookup comes in, we give user land a smaller amount of information 637 * specific to that packet, the overlay_targ_lookup_t. It includes a request id 638 * to identify this, and then the overlay id, the varpd id, the header and 639 * packet size, the source and destination mac address, the SAP, and any 640 * potential VLAN header. 641 * 642 * At that point, it stays in that outstanding list until one of two ioctls are 643 * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time, 644 * userland may also perform other operations. For example, it may use 645 * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth 646 * analysis of what to do beyond what we gave it initially. This is useful for 647 * providing proxy arp and the like. Finally, there are two other ioctls that 648 * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the 649 * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which 650 * causes us to encapsulate and send out the packet they've given us. 651 * 652 * 653 * Finally, through the target cache, several ioctls are provided to allow for 654 * interrogation and management of the cache. They allow for individual entries 655 * to be retrieved, set, or have the entire table flushed. For the full set of 656 * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h. 657 * 658 * ------------------ 659 * Sample Packet Flow 660 * ------------------ 661 * 662 * There's a lot of pieces here, hopefully an example of how this all fits 663 * together will help clarify and elucidate what's going on. We're going to 664 * first track an outgoing packet, eg. one that is sent from an IP interface on 665 * a VNIC on top of an overlay device, and then we'll look at what it means to 666 * respond to that. 
667 * 668 * 669 * +----------------+ +--------------+ +------------------+ 670 * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches | 671 * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx | 672 * +----------------+ | VNIC device | | overlay_m_tx() | 673 * +--------------+ +------------------+ 674 * | 675 * . lookup . cache | 676 * . drop . miss v 677 * +---------+ . +--------+ . +------------------+ 678 * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk | 679 * | mblk_t | | lookup | | in the target | 680 * +---------+ | queued | | cache | 681 * ^ +--------+ +------------------+ 682 * on send | | | cache 683 * error . . * *. . lookup * . . hit 684 * | | success v 685 * | | +------------------+ 686 * +-----------------+ +--------------->| call plugin | 687 * | Send out | | ovpo_encap() to | 688 * | overlay_mux_t's |<----------------------------------| get encap mblk_t | 689 * | ksocket | +------------------+ 690 * +-----------------+ 691 * 692 * The receive end point looks a little different and looks more like: 693 * 694 * +------------------+ +----------------+ +-----------+ 695 * | mblk_t comes off |---->| enter netstack |--->| delivered |---+ 696 * | the physical | | IP stack | | to | * . . direct 697 * | device | +----------------+ | ksocket | | callback 698 * +------------------+ +-----------+ | 699 * . overlay id | 700 * . not found v 701 * +-----------+ . +-----------------+ +--------------------+ 702 * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() | 703 * | mblk_t | | ovpo_decap() to | +--------------------+ 704 * +-----------+ | decap mblk_t | 705 * +-----------------+ 706 * | 707 * * . . 
overlay id 708 * v found 709 * +--------+ +----------------+ 710 * | adjust |----->| call mac_rx | 711 * | mblk_t | | on original | 712 * +--------+ | decaped packet | 713 * +----------------+ 714 * 715 * ------------------ 716 * Netstack Awareness 717 * ------------------ 718 * 719 * In the above image we note that this enters a netstack. Today the only 720 * netstack that can be is the global zone as the overlay driver itself is not 721 * exactly netstack aware. What this really means is that varpd cannot run in a 722 * non-global zone and an overlay device cannot belong to a non-global zone. 723 * Non-global zones can still have a VNIC assigned to them that's been created 724 * over the overlay device the same way they would if it had been created over 725 * an etherstub or a physical device. 726 * 727 * The majority of the work to make it netstack aware is straightforward and the 728 * biggest thing is to create a netstack module that allows us to hook into 729 * netstack (and thus zone) creation and destruction. From there, we need to 730 * amend the target cache lookup routines that we discussed earlier to not have 731 * a global outstanding list and a global list of handles, but rather, one per 732 * netstack. 733 * 734 * For the mux, we'll need to open the ksocket in the context of the zone, we 735 * can likely do this with a properly composed credential, but we'll need to do 736 * some more work on that path. Finally, we'll want to make sure the dld ioctls 737 * are aware of the zoneid of the caller and we use that appropriately and store 738 * it in the overlay_dev_t. 739 * 740 * ----------- 741 * GLDv3 Notes 742 * ----------- 743 * 744 * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more 745 * relevant and other parts are much less relevant for us. For example, the 746 * GLDv3 is used to toggle the device being put into and out of promiscuous 747 * mode, to program MAC addresses for unicast and multicast hardware filters. 
748 * Today, an overlay device doesn't have a notion of promiscuous mode nor does 749 * it have a notion of unicast and multicast addresses programmed into the 750 * device. Instead, for the purposes of the hardware filter, we don't do 751 * anything and just always accept new addresses being added and removed. 752 * 753 * If the GLDv3 start function has not been called, then we will not use this 754 * device for I/O purposes. Any calls to transmit or receive should be dropped, 755 * though the GLDv3 guarantees us that transmit will not be called without 756 * calling start. Similarly, once stop is called, then no packets can be dealt 757 * with. 758 * 759 * Today we don't support the stat interfaces, though there's no good reason 760 * that we shouldn't assemble some of the stats based on what we have in the 761 * future. 762 * 763 * When it comes to link properties, many of the traditional link properties do 764 * not apply and many others MAC handles for us. For example, we don't need to 765 * implement anything for overlay_m_getprop() to deal with returning the MTU, as 766 * MAC never calls into us for that. As such, there isn't much of anything to 767 * support in terms of properties. 768 * 769 * Today, we don't support any notion of hardware capabilities. However, if 770 * future NIC hardware or other changes to the system cause it to make sense for 771 * us to emulate logical groups, then we should do that. However, we still do 772 * implement a capab function so that we can identify ourselves as an overlay 773 * device to the broader MAC framework. This is done mostly so that a device 774 * created on top of us can have fanout rings as we don't try to lie about a 775 * speed for our device. 776 * 777 * The other question is what should be done for a device's MTU and margin. We 778 * set our minimum supported MTU to be the minimum value that an IP network may 779 * be set to 576 -- which mimics what an etherstub does. 
On the flip side, we 780 * have our upper bound set to 8900. This value comes from the fact that a lot 781 * of jumbo networks use their maximum as 9000. As such, we want to reserve 100 782 * bytes, which isn't exactly the most accurate number, but it'll be good enough 783 * for now. Because of that, our default MTU off of these devices is 1400, as 784 * the default MTU for everything is usually 1500 or whatever the underlying 785 * device is at; however, this is a bit simpler than asking the netstack what 786 * are all the IP interfaces at. It also calls into question how PMTU and PMTU 787 * discovery should work here. The challenge, especially for 788 * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's 789 * not clear that if you have a single bad entry that the overall MTU should be 790 * lowered. Instead, we should figure out a better way of determining these 791 * kinds of PMTU errors and appropriately alerting the administrator via FMA. 792 * 793 * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether 794 * or not the underlying encapsulation device supports VLAN tags. If it does, 795 * then we'll set the margin to allow for it, otherwise, we will not. 
796 */ 797 798 #include <sys/conf.h> 799 #include <sys/errno.h> 800 #include <sys/stat.h> 801 #include <sys/ddi.h> 802 #include <sys/sunddi.h> 803 #include <sys/modctl.h> 804 #include <sys/policy.h> 805 #include <sys/stream.h> 806 #include <sys/strsubr.h> 807 #include <sys/strsun.h> 808 #include <sys/types.h> 809 #include <sys/kmem.h> 810 #include <sys/param.h> 811 #include <sys/sysmacros.h> 812 #include <sys/ddifm.h> 813 814 #include <sys/dls.h> 815 #include <sys/dld_ioc.h> 816 #include <sys/mac_provider.h> 817 #include <sys/mac_client_priv.h> 818 #include <sys/mac_ether.h> 819 #include <sys/vlan.h> 820 821 #include <sys/overlay_impl.h> 822 823 dev_info_t *overlay_dip; 824 static kmutex_t overlay_dev_lock; 825 static list_t overlay_dev_list; 826 static uint8_t overlay_macaddr[ETHERADDRL] = 827 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 828 829 typedef enum overlay_dev_prop { 830 OVERLAY_DEV_P_MTU = 0, 831 OVERLAY_DEV_P_VNETID, 832 OVERLAY_DEV_P_ENCAP, 833 OVERLAY_DEV_P_VARPDID 834 } overlay_dev_prop_t; 835 836 #define OVERLAY_DEV_NPROPS 4 837 static const char *overlay_dev_props[] = { 838 "mtu", 839 "vnetid", 840 "encap", 841 "varpd/id" 842 }; 843 844 #define OVERLAY_MTU_MIN 576 845 #define OVERLAY_MTU_DEF 1400 846 #define OVERLAY_MTU_MAX 8900 847 848 overlay_dev_t * 849 overlay_hold_by_dlid(datalink_id_t id) 850 { 851 overlay_dev_t *o; 852 853 mutex_enter(&overlay_dev_lock); 854 for (o = list_head(&overlay_dev_list); o != NULL; 855 o = list_next(&overlay_dev_list, o)) { 856 if (id == o->odd_linkid) { 857 mutex_enter(&o->odd_lock); 858 o->odd_ref++; 859 mutex_exit(&o->odd_lock); 860 mutex_exit(&overlay_dev_lock); 861 return (o); 862 } 863 } 864 865 mutex_exit(&overlay_dev_lock); 866 return (NULL); 867 } 868 869 void 870 overlay_hold_rele(overlay_dev_t *odd) 871 { 872 mutex_enter(&odd->odd_lock); 873 ASSERT(odd->odd_ref > 0); 874 odd->odd_ref--; 875 mutex_exit(&odd->odd_lock); 876 } 877 878 void 879 overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag) 880 { 
881 ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); 882 ASSERT(MUTEX_HELD(&odd->odd_lock)); 883 884 if (flag & OVERLAY_F_IN_RX) 885 odd->odd_rxcount++; 886 if (flag & OVERLAY_F_IN_TX) 887 odd->odd_txcount++; 888 odd->odd_flags |= flag; 889 } 890 891 void 892 overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag) 893 { 894 boolean_t signal = B_FALSE; 895 896 ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); 897 ASSERT(MUTEX_HELD(&odd->odd_lock)); 898 899 if (flag & OVERLAY_F_IN_RX) { 900 ASSERT(odd->odd_rxcount > 0); 901 odd->odd_rxcount--; 902 if (odd->odd_rxcount == 0) { 903 signal = B_TRUE; 904 odd->odd_flags &= ~OVERLAY_F_IN_RX; 905 } 906 } 907 if (flag & OVERLAY_F_IN_TX) { 908 ASSERT(odd->odd_txcount > 0); 909 odd->odd_txcount--; 910 if (odd->odd_txcount == 0) { 911 signal = B_TRUE; 912 odd->odd_flags &= ~OVERLAY_F_IN_TX; 913 } 914 } 915 916 if (signal == B_TRUE) 917 cv_broadcast(&odd->odd_iowait); 918 } 919 920 static void 921 overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag) 922 { 923 ASSERT((flag & ~OVERLAY_F_IOMASK) == 0); 924 ASSERT(MUTEX_HELD(&odd->odd_lock)); 925 926 while (odd->odd_flags & flag) { 927 cv_wait(&odd->odd_iowait, &odd->odd_lock); 928 } 929 } 930 931 void 932 overlay_dev_iter(overlay_dev_iter_f func, void *arg) 933 { 934 overlay_dev_t *odd; 935 936 mutex_enter(&overlay_dev_lock); 937 for (odd = list_head(&overlay_dev_list); odd != NULL; 938 odd = list_next(&overlay_dev_list, odd)) { 939 if (func(odd, arg) != 0) { 940 mutex_exit(&overlay_dev_lock); 941 return; 942 } 943 } 944 mutex_exit(&overlay_dev_lock); 945 } 946 947 /* ARGSUSED */ 948 static int 949 overlay_m_stat(void *arg, uint_t stat, uint64_t *val) 950 { 951 return (ENOTSUP); 952 } 953 954 static int 955 overlay_m_start(void *arg) 956 { 957 overlay_dev_t *odd = arg; 958 overlay_mux_t *mux; 959 int ret, domain, family, prot; 960 struct sockaddr_storage storage; 961 socklen_t slen; 962 963 mutex_enter(&odd->odd_lock); 964 if ((odd->odd_flags & 
OVERLAY_F_ACTIVATED) == 0) { 965 mutex_exit(&odd->odd_lock); 966 return (EAGAIN); 967 } 968 mutex_exit(&odd->odd_lock); 969 970 ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain, 971 &family, &prot, (struct sockaddr *)&storage, &slen); 972 if (ret != 0) 973 return (ret); 974 975 mux = overlay_mux_open(odd->odd_plugin, domain, family, prot, 976 (struct sockaddr *)&storage, slen, &ret); 977 if (mux == NULL) 978 return (ret); 979 980 overlay_mux_add_dev(mux, odd); 981 odd->odd_mux = mux; 982 mutex_enter(&odd->odd_lock); 983 ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX)); 984 odd->odd_flags |= OVERLAY_F_IN_MUX; 985 mutex_exit(&odd->odd_lock); 986 987 return (0); 988 } 989 990 static void 991 overlay_m_stop(void *arg) 992 { 993 overlay_dev_t *odd = arg; 994 995 /* 996 * The MAC Perimeter is held here, so we don't have to worry about 997 * synchronizing this with respect to metadata operations. 998 */ 999 mutex_enter(&odd->odd_lock); 1000 VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX); 1001 VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP)); 1002 odd->odd_flags |= OVERLAY_F_MDDROP; 1003 overlay_io_wait(odd, OVERLAY_F_IOMASK); 1004 mutex_exit(&odd->odd_lock); 1005 1006 overlay_mux_remove_dev(odd->odd_mux, odd); 1007 overlay_mux_close(odd->odd_mux); 1008 odd->odd_mux = NULL; 1009 1010 mutex_enter(&odd->odd_lock); 1011 odd->odd_flags &= ~OVERLAY_F_IN_MUX; 1012 odd->odd_flags &= ~OVERLAY_F_MDDROP; 1013 VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0); 1014 mutex_exit(&odd->odd_lock); 1015 } 1016 1017 /* 1018 * For more info on this, see the big theory statement. 1019 */ 1020 /* ARGSUSED */ 1021 static int 1022 overlay_m_promisc(void *arg, boolean_t on) 1023 { 1024 return (0); 1025 } 1026 1027 /* 1028 * For more info on this, see the big theory statement. 
1029 */ 1030 /* ARGSUSED */ 1031 static int 1032 overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp) 1033 { 1034 return (0); 1035 } 1036 1037 /* 1038 * For more info on this, see the big theory statement. 1039 */ 1040 /* ARGSUSED */ 1041 static int 1042 overlay_m_unicast(void *arg, const uint8_t *macaddr) 1043 { 1044 return (0); 1045 } 1046 1047 mblk_t * 1048 overlay_m_tx(void *arg, mblk_t *mp_chain) 1049 { 1050 overlay_dev_t *odd = arg; 1051 mblk_t *mp, *ep; 1052 int ret; 1053 ovep_encap_info_t einfo; 1054 struct msghdr hdr; 1055 1056 mutex_enter(&odd->odd_lock); 1057 if ((odd->odd_flags & OVERLAY_F_MDDROP) || 1058 !(odd->odd_flags & OVERLAY_F_IN_MUX)) { 1059 mutex_exit(&odd->odd_lock); 1060 freemsgchain(mp_chain); 1061 return (NULL); 1062 } 1063 overlay_io_start(odd, OVERLAY_F_IN_TX); 1064 mutex_exit(&odd->odd_lock); 1065 1066 bzero(&hdr, sizeof (struct msghdr)); 1067 1068 bzero(&einfo, sizeof (ovep_encap_info_t)); 1069 einfo.ovdi_id = odd->odd_vid; 1070 mp = mp_chain; 1071 while (mp != NULL) { 1072 socklen_t slen; 1073 struct sockaddr_storage storage; 1074 1075 mp_chain = mp->b_next; 1076 mp->b_next = NULL; 1077 ep = NULL; 1078 1079 ret = overlay_target_lookup(odd, mp, 1080 (struct sockaddr *)&storage, &slen); 1081 if (ret != OVERLAY_TARGET_OK) { 1082 if (ret == OVERLAY_TARGET_DROP) 1083 freemsg(mp); 1084 mp = mp_chain; 1085 continue; 1086 } 1087 1088 hdr.msg_name = &storage; 1089 hdr.msg_namelen = slen; 1090 1091 ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, 1092 &einfo, &ep); 1093 if (ret != 0 || ep == NULL) { 1094 freemsg(mp); 1095 goto out; 1096 } 1097 1098 ASSERT(ep->b_cont == mp || ep == mp); 1099 ret = overlay_mux_tx(odd->odd_mux, &hdr, ep); 1100 if (ret != 0) 1101 goto out; 1102 1103 mp = mp_chain; 1104 } 1105 1106 out: 1107 mutex_enter(&odd->odd_lock); 1108 overlay_io_done(odd, OVERLAY_F_IN_TX); 1109 mutex_exit(&odd->odd_lock); 1110 return (mp_chain); 1111 } 1112 1113 /* ARGSUSED */ 1114 static void 1115 
overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 1116 { 1117 miocnak(q, mp, 0, ENOTSUP); 1118 } 1119 1120 /* ARGSUSED */ 1121 static boolean_t 1122 overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 1123 { 1124 /* 1125 * Tell MAC we're an overlay. 1126 */ 1127 if (cap == MAC_CAPAB_OVERLAY) 1128 return (B_TRUE); 1129 return (B_FALSE); 1130 } 1131 1132 /* ARGSUSED */ 1133 static int 1134 overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1135 uint_t pr_valsize, const void *pr_val) 1136 { 1137 uint32_t mtu, old; 1138 int err; 1139 overlay_dev_t *odd = arg; 1140 1141 if (pr_num != MAC_PROP_MTU) 1142 return (ENOTSUP); 1143 1144 bcopy(pr_val, &mtu, sizeof (mtu)); 1145 if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX) 1146 return (EINVAL); 1147 1148 mutex_enter(&odd->odd_lock); 1149 old = odd->odd_mtu; 1150 odd->odd_mtu = mtu; 1151 err = mac_maxsdu_update(odd->odd_mh, mtu); 1152 if (err != 0) 1153 odd->odd_mtu = old; 1154 mutex_exit(&odd->odd_lock); 1155 1156 return (err); 1157 } 1158 1159 /* ARGSUSED */ 1160 static int 1161 overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1162 uint_t pr_valsize, void *pr_val) 1163 { 1164 return (ENOTSUP); 1165 } 1166 1167 /* ARGSUSED */ 1168 static void 1169 overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1170 mac_prop_info_handle_t prh) 1171 { 1172 if (pr_num != MAC_PROP_MTU) 1173 return; 1174 1175 mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF); 1176 mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX); 1177 } 1178 1179 static mac_callbacks_t overlay_m_callbacks = { 1180 .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | 1181 MC_PROPINFO), 1182 .mc_getstat = overlay_m_stat, 1183 .mc_start = overlay_m_start, 1184 .mc_stop = overlay_m_stop, 1185 .mc_setpromisc = overlay_m_promisc, 1186 .mc_multicst = overlay_m_multicast, 1187 .mc_unicst = overlay_m_unicast, 1188 .mc_tx = overlay_m_tx, 1189 .mc_ioctl = 
overlay_m_ioctl, 1190 .mc_getcapab = overlay_m_getcapab, 1191 .mc_getprop = overlay_m_getprop, 1192 .mc_setprop = overlay_m_setprop, 1193 .mc_propinfo = overlay_m_propinfo 1194 }; 1195 1196 static boolean_t 1197 overlay_valid_name(const char *name, size_t buflen) 1198 { 1199 size_t actlen; 1200 int err, i; 1201 1202 for (i = 0; i < buflen; i++) { 1203 if (name[i] == '\0') 1204 break; 1205 } 1206 1207 if (i == 0 || i == buflen) 1208 return (B_FALSE); 1209 actlen = i; 1210 if (strchr(name, '/') != NULL) 1211 return (B_FALSE); 1212 if (u8_validate((char *)name, actlen, NULL, 1213 U8_VALIDATE_ENTIRE, &err) < 0) 1214 return (B_FALSE); 1215 return (B_TRUE); 1216 } 1217 1218 /* ARGSUSED */ 1219 static int 1220 overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) 1221 { 1222 int err; 1223 uint64_t maxid; 1224 overlay_dev_t *odd, *o; 1225 mac_register_t *mac; 1226 overlay_ioc_create_t *oicp = karg; 1227 1228 if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE) 1229 return (EINVAL); 1230 1231 odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP); 1232 odd->odd_linkid = oicp->oic_linkid; 1233 odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap); 1234 if (odd->odd_plugin == NULL) { 1235 kmem_free(odd, sizeof (overlay_dev_t)); 1236 return (ENOENT); 1237 } 1238 err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd, 1239 &odd->odd_pvoid); 1240 if (err != 0) { 1241 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1242 overlay_plugin_rele(odd->odd_plugin); 1243 kmem_free(odd, sizeof (overlay_dev_t)); 1244 return (EINVAL); 1245 } 1246 1247 /* 1248 * Make sure that our virtual network id is valid for the given plugin 1249 * that we're working with. 
1250 */ 1251 ASSERT(odd->odd_plugin->ovp_id_size <= 8); 1252 maxid = UINT64_MAX; 1253 if (odd->odd_plugin->ovp_id_size != 8) 1254 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL; 1255 if (oicp->oic_vnetid > maxid) { 1256 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1257 overlay_plugin_rele(odd->odd_plugin); 1258 kmem_free(odd, sizeof (overlay_dev_t)); 1259 return (EINVAL); 1260 } 1261 odd->odd_vid = oicp->oic_vnetid; 1262 1263 mac = mac_alloc(MAC_VERSION); 1264 if (mac == NULL) { 1265 mutex_exit(&overlay_dev_lock); 1266 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1267 overlay_plugin_rele(odd->odd_plugin); 1268 kmem_free(odd, sizeof (overlay_dev_t)); 1269 return (EINVAL); 1270 } 1271 1272 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1273 mac->m_driver = odd; 1274 mac->m_dip = overlay_dip; 1275 mac->m_dst_addr = NULL; 1276 mac->m_callbacks = &overlay_m_callbacks; 1277 mac->m_pdata = NULL; 1278 mac->m_pdata_size = 0; 1279 1280 mac->m_priv_props = NULL; 1281 1282 /* Let mac handle this itself. */ 1283 mac->m_instance = (uint_t)-1; 1284 1285 /* 1286 * There is no real source address that should be used here, but saying 1287 * that we're not ethernet is going to cause its own problems. At the 1288 * end of the say, this is fine. 1289 */ 1290 mac->m_src_addr = overlay_macaddr; 1291 1292 /* 1293 * Start with the default MTU as the max SDU. If the MTU is changed, the 1294 * SDU will be changed to reflect that. 1295 */ 1296 mac->m_min_sdu = 1; 1297 mac->m_max_sdu = OVERLAY_MTU_DEF; 1298 mac->m_multicast_sdu = 0; 1299 1300 /* 1301 * The underlying device doesn't matter, instead this comes from the 1302 * encapsulation protocol and whether or not they allow VLAN tags. 
1303 */ 1304 if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) { 1305 mac->m_margin = VLAN_TAGSZ; 1306 } else { 1307 mac->m_margin = 0; 1308 } 1309 1310 /* 1311 * Today, we have no MAC virtualization, it may make sense in the future 1312 * to go ahead and emulate some subset of this, but it doesn't today. 1313 */ 1314 mac->m_v12n = MAC_VIRT_NONE; 1315 1316 mutex_enter(&overlay_dev_lock); 1317 for (o = list_head(&overlay_dev_list); o != NULL; 1318 o = list_next(&overlay_dev_list, o)) { 1319 if (o->odd_linkid == oicp->oic_linkid) { 1320 mutex_exit(&overlay_dev_lock); 1321 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1322 overlay_plugin_rele(odd->odd_plugin); 1323 kmem_free(odd, sizeof (overlay_dev_t)); 1324 return (EEXIST); 1325 } 1326 1327 if (o->odd_vid == oicp->oic_vnetid && 1328 o->odd_plugin == odd->odd_plugin) { 1329 mutex_exit(&overlay_dev_lock); 1330 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1331 overlay_plugin_rele(odd->odd_plugin); 1332 kmem_free(odd, sizeof (overlay_dev_t)); 1333 return (EEXIST); 1334 } 1335 } 1336 1337 err = mac_register(mac, &odd->odd_mh); 1338 mac_free(mac); 1339 if (err != 0) { 1340 mutex_exit(&overlay_dev_lock); 1341 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1342 overlay_plugin_rele(odd->odd_plugin); 1343 kmem_free(odd, sizeof (overlay_dev_t)); 1344 return (err); 1345 } 1346 1347 err = dls_devnet_create(odd->odd_mh, odd->odd_linkid, 1348 crgetzoneid(cred)); 1349 if (err != 0) { 1350 mutex_exit(&overlay_dev_lock); 1351 (void) mac_unregister(odd->odd_mh); 1352 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1353 overlay_plugin_rele(odd->odd_plugin); 1354 kmem_free(odd, sizeof (overlay_dev_t)); 1355 return (err); 1356 } 1357 1358 mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL); 1359 cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL); 1360 odd->odd_ref = 0; 1361 odd->odd_flags = 0; 1362 list_insert_tail(&overlay_dev_list, odd); 1363 mutex_exit(&overlay_dev_lock); 1364 1365 return (0); 1366 } 1367 1368 
/* ARGSUSED */ 1369 static int 1370 overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) 1371 { 1372 int i, ret; 1373 overlay_dev_t *odd; 1374 mac_perim_handle_t mph; 1375 overlay_ioc_activate_t *oiap = karg; 1376 overlay_ioc_propinfo_t *infop; 1377 overlay_ioc_prop_t *oip; 1378 overlay_prop_handle_t phdl; 1379 1380 odd = overlay_hold_by_dlid(oiap->oia_linkid); 1381 if (odd == NULL) 1382 return (ENOENT); 1383 1384 infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP); 1385 oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP); 1386 phdl = (overlay_prop_handle_t)infop; 1387 1388 mac_perim_enter_by_mh(odd->odd_mh, &mph); 1389 mutex_enter(&odd->odd_lock); 1390 if (odd->odd_flags & OVERLAY_F_ACTIVATED) { 1391 mutex_exit(&odd->odd_lock); 1392 mac_perim_exit(mph); 1393 overlay_hold_rele(odd); 1394 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1395 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1396 return (EEXIST); 1397 } 1398 mutex_exit(&odd->odd_lock); 1399 1400 for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { 1401 const char *pname = odd->odd_plugin->ovp_props[i]; 1402 bzero(infop, sizeof (overlay_ioc_propinfo_t)); 1403 overlay_prop_init(phdl); 1404 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl); 1405 if (ret != 0) { 1406 mac_perim_exit(mph); 1407 overlay_hold_rele(odd); 1408 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1409 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1410 return (ret); 1411 } 1412 1413 if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0) 1414 continue; 1415 bzero(oip, sizeof (overlay_ioc_prop_t)); 1416 oip->oip_size = sizeof (oip->oip_value); 1417 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, 1418 pname, oip->oip_value, &oip->oip_size); 1419 if (ret != 0) { 1420 mac_perim_exit(mph); 1421 overlay_hold_rele(odd); 1422 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1423 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1424 return (ret); 1425 } 1426 if (oip->oip_size == 0) { 
1427 mac_perim_exit(mph); 1428 overlay_hold_rele(odd); 1429 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1430 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1431 return (EINVAL); 1432 } 1433 } 1434 1435 mutex_enter(&odd->odd_lock); 1436 if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) { 1437 mutex_exit(&odd->odd_lock); 1438 mac_perim_exit(mph); 1439 overlay_hold_rele(odd); 1440 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1441 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1442 return (ENXIO); 1443 } 1444 1445 ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0); 1446 odd->odd_flags |= OVERLAY_F_ACTIVATED; 1447 1448 /* 1449 * Now that we've activated ourselves, we should indicate to the world 1450 * that we're up. Note that we may not be able to perform lookups at 1451 * this time, but our notion of being 'up' isn't dependent on that 1452 * ability. 1453 */ 1454 mac_link_update(odd->odd_mh, LINK_STATE_UP); 1455 mutex_exit(&odd->odd_lock); 1456 1457 mac_perim_exit(mph); 1458 overlay_hold_rele(odd); 1459 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1460 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1461 1462 return (0); 1463 } 1464 1465 /* ARGSUSED */ 1466 static int 1467 overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) 1468 { 1469 overlay_ioc_delete_t *oidp = karg; 1470 overlay_dev_t *odd; 1471 datalink_id_t tid; 1472 int ret; 1473 1474 odd = overlay_hold_by_dlid(oidp->oid_linkid); 1475 if (odd == NULL) { 1476 return (ENOENT); 1477 } 1478 1479 mutex_enter(&odd->odd_lock); 1480 /* If we're not the only hold, we're busy */ 1481 if (odd->odd_ref != 1) { 1482 mutex_exit(&odd->odd_lock); 1483 overlay_hold_rele(odd); 1484 return (EBUSY); 1485 } 1486 1487 if (odd->odd_flags & OVERLAY_F_IN_MUX) { 1488 mutex_exit(&odd->odd_lock); 1489 overlay_hold_rele(odd); 1490 return (EBUSY); 1491 } 1492 1493 /* 1494 * To remove this, we need to first remove it from dls and then remove 1495 * it from mac. 
The act of removing it from mac will check if there are 1496 * devices on top of this, eg. vnics. If there are, then that will fail 1497 * and we'll have to go through and recreate the dls entry. Only after 1498 * mac_unregister has succeeded, then we'll go through and actually free 1499 * everything and drop the dev lock. 1500 */ 1501 ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE); 1502 if (ret != 0) { 1503 overlay_hold_rele(odd); 1504 return (ret); 1505 } 1506 1507 ASSERT(oidp->oid_linkid == tid); 1508 ret = mac_disable(odd->odd_mh); 1509 if (ret != 0) { 1510 (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid, 1511 crgetzoneid(cred)); 1512 overlay_hold_rele(odd); 1513 return (ret); 1514 } 1515 1516 overlay_target_quiesce(odd->odd_target); 1517 1518 mutex_enter(&overlay_dev_lock); 1519 list_remove(&overlay_dev_list, odd); 1520 mutex_exit(&overlay_dev_lock); 1521 1522 cv_destroy(&odd->odd_iowait); 1523 mutex_destroy(&odd->odd_lock); 1524 overlay_target_free(odd); 1525 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1526 overlay_plugin_rele(odd->odd_plugin); 1527 kmem_free(odd, sizeof (overlay_dev_t)); 1528 1529 return (0); 1530 } 1531 1532 /* ARGSUSED */ 1533 static int 1534 overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred, 1535 int *rvalp) 1536 { 1537 overlay_dev_t *odd; 1538 overlay_ioc_nprops_t *on = karg; 1539 1540 odd = overlay_hold_by_dlid(on->oipn_linkid); 1541 if (odd == NULL) 1542 return (ENOENT); 1543 on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS; 1544 overlay_hold_rele(odd); 1545 1546 return (0); 1547 } 1548 1549 static int 1550 overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg) 1551 { 1552 overlay_prop_handle_t phdl = arg; 1553 overlay_prop_set_range_str(phdl, opp->ovp_name); 1554 return (0); 1555 } 1556 1557 static int 1558 overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id) 1559 { 1560 int i; 1561 1562 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { 1563 if 
(strcmp(overlay_dev_props[i], name) == 0) { 1564 *id = i; 1565 return (0); 1566 } 1567 } 1568 1569 for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { 1570 if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) { 1571 *id = i + OVERLAY_DEV_NPROPS; 1572 return (0); 1573 } 1574 } 1575 1576 return (ENOENT); 1577 } 1578 1579 static void 1580 overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl) 1581 { 1582 uint32_t def; 1583 mac_propval_range_t range; 1584 uint_t perm; 1585 1586 ASSERT(MAC_PERIM_HELD(odd->odd_mh)); 1587 1588 bzero(&range, sizeof (mac_propval_range_t)); 1589 range.mpr_count = 1; 1590 if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def, 1591 sizeof (def), &range, &perm) != 0) 1592 return; 1593 1594 if (perm == MAC_PROP_PERM_READ) 1595 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); 1596 else if (perm == MAC_PROP_PERM_WRITE) 1597 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE); 1598 else if (perm == MAC_PROP_PERM_RW) 1599 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); 1600 1601 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); 1602 overlay_prop_set_default(phdl, &def, sizeof (def)); 1603 overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min, 1604 range.mpr_range_uint32[0].mpur_max); 1605 } 1606 1607 /* ARGSUSED */ 1608 static int 1609 overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, 1610 int *rvalp) 1611 { 1612 overlay_dev_t *odd; 1613 int ret; 1614 mac_perim_handle_t mph; 1615 uint_t propid = UINT_MAX; 1616 overlay_ioc_propinfo_t *oip = karg; 1617 overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; 1618 1619 odd = overlay_hold_by_dlid(oip->oipi_linkid); 1620 if (odd == NULL) 1621 return (ENOENT); 1622 1623 overlay_prop_init(phdl); 1624 mac_perim_enter_by_mh(odd->odd_mh, &mph); 1625 1626 /* 1627 * If the id is -1, then the property that we're looking for is named in 1628 * oipi_name and we should fill in its id. 
Otherwise, we've been given 1629 * an id and we need to turn that into a name for our plugin's sake. The 1630 * id is our own fabrication for property discovery. 1631 */ 1632 if (oip->oipi_id == -1) { 1633 /* 1634 * Determine if it's a known generic property or it belongs to a 1635 * module by checking against the list of known names. 1636 */ 1637 oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0'; 1638 if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name, 1639 &propid)) != 0) { 1640 overlay_hold_rele(odd); 1641 mac_perim_exit(mph); 1642 return (ret); 1643 } 1644 oip->oipi_id = propid; 1645 if (propid >= OVERLAY_DEV_NPROPS) { 1646 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( 1647 oip->oipi_name, phdl); 1648 overlay_hold_rele(odd); 1649 mac_perim_exit(mph); 1650 return (ret); 1651 1652 } 1653 } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) { 1654 uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS; 1655 1656 if (id >= odd->odd_plugin->ovp_nprops) { 1657 overlay_hold_rele(odd); 1658 mac_perim_exit(mph); 1659 return (EINVAL); 1660 } 1661 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( 1662 odd->odd_plugin->ovp_props[id], phdl); 1663 overlay_hold_rele(odd); 1664 mac_perim_exit(mph); 1665 return (ret); 1666 } else if (oip->oipi_id < -1) { 1667 overlay_hold_rele(odd); 1668 mac_perim_exit(mph); 1669 return (EINVAL); 1670 } else { 1671 ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS); 1672 ASSERT(oip->oipi_id >= 0); 1673 propid = oip->oipi_id; 1674 (void) strlcpy(oip->oipi_name, overlay_dev_props[propid], 1675 sizeof (oip->oipi_name)); 1676 } 1677 1678 switch (propid) { 1679 case OVERLAY_DEV_P_MTU: 1680 overlay_i_propinfo_mtu(odd, phdl); 1681 break; 1682 case OVERLAY_DEV_P_VNETID: 1683 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); 1684 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); 1685 overlay_prop_set_nodefault(phdl); 1686 break; 1687 case OVERLAY_DEV_P_ENCAP: 1688 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); 1689 overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING); 1690 
overlay_prop_set_nodefault(phdl); 1691 overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl); 1692 break; 1693 case OVERLAY_DEV_P_VARPDID: 1694 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); 1695 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); 1696 overlay_prop_set_nodefault(phdl); 1697 break; 1698 default: 1699 overlay_hold_rele(odd); 1700 mac_perim_exit(mph); 1701 return (ENOENT); 1702 } 1703 1704 overlay_hold_rele(odd); 1705 mac_perim_exit(mph); 1706 return (0); 1707 } 1708 1709 /* ARGSUSED */ 1710 static int 1711 overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, 1712 int *rvalp) 1713 { 1714 int ret; 1715 overlay_dev_t *odd; 1716 mac_perim_handle_t mph; 1717 overlay_ioc_prop_t *oip = karg; 1718 uint_t propid, mtu; 1719 1720 odd = overlay_hold_by_dlid(oip->oip_linkid); 1721 if (odd == NULL) 1722 return (ENOENT); 1723 1724 mac_perim_enter_by_mh(odd->odd_mh, &mph); 1725 oip->oip_size = OVERLAY_PROP_SIZEMAX; 1726 oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; 1727 if (oip->oip_id == -1) { 1728 int i; 1729 1730 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { 1731 if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) 1732 break; 1733 if (i == OVERLAY_DEV_NPROPS) { 1734 ret = odd->odd_plugin->ovp_ops->ovpo_getprop( 1735 odd->odd_pvoid, oip->oip_name, 1736 oip->oip_value, &oip->oip_size); 1737 overlay_hold_rele(odd); 1738 mac_perim_exit(mph); 1739 return (ret); 1740 } 1741 } 1742 1743 propid = i; 1744 } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { 1745 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; 1746 1747 if (id > odd->odd_plugin->ovp_nprops) { 1748 overlay_hold_rele(odd); 1749 mac_perim_exit(mph); 1750 return (EINVAL); 1751 } 1752 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, 1753 odd->odd_plugin->ovp_props[id], oip->oip_value, 1754 &oip->oip_size); 1755 overlay_hold_rele(odd); 1756 mac_perim_exit(mph); 1757 return (ret); 1758 } else if (oip->oip_id < -1) { 1759 overlay_hold_rele(odd); 1760 mac_perim_exit(mph); 1761 return 
(EINVAL); 1762 } else { 1763 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); 1764 ASSERT(oip->oip_id >= 0); 1765 propid = oip->oip_id; 1766 } 1767 1768 ret = 0; 1769 switch (propid) { 1770 case OVERLAY_DEV_P_MTU: 1771 /* 1772 * The MTU is always set and retrieved through MAC, to allow for 1773 * MAC to do whatever it wants, as really that property belongs 1774 * to MAC. This is important for things where vnics have hold on 1775 * the MTU. 1776 */ 1777 mac_sdu_get(odd->odd_mh, NULL, &mtu); 1778 bcopy(&mtu, oip->oip_value, sizeof (uint_t)); 1779 oip->oip_size = sizeof (uint_t); 1780 break; 1781 case OVERLAY_DEV_P_VNETID: 1782 /* 1783 * While it's read-only while inside of a mux, we're not in a 1784 * context that can guarantee that. Therefore we always grab the 1785 * overlay_dev_t's odd_lock. 1786 */ 1787 mutex_enter(&odd->odd_lock); 1788 bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t)); 1789 mutex_exit(&odd->odd_lock); 1790 oip->oip_size = sizeof (uint64_t); 1791 break; 1792 case OVERLAY_DEV_P_ENCAP: 1793 oip->oip_size = strlcpy((char *)oip->oip_value, 1794 odd->odd_plugin->ovp_name, oip->oip_size); 1795 break; 1796 case OVERLAY_DEV_P_VARPDID: 1797 mutex_enter(&odd->odd_lock); 1798 if (odd->odd_flags & OVERLAY_F_VARPD) { 1799 const uint64_t val = odd->odd_target->ott_id; 1800 bcopy(&val, oip->oip_value, sizeof (uint64_t)); 1801 oip->oip_size = sizeof (uint64_t); 1802 } else { 1803 oip->oip_size = 0; 1804 } 1805 mutex_exit(&odd->odd_lock); 1806 break; 1807 default: 1808 ret = ENOENT; 1809 } 1810 1811 overlay_hold_rele(odd); 1812 mac_perim_exit(mph); 1813 return (ret); 1814 } 1815 1816 static void 1817 overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) 1818 { 1819 mutex_enter(&odd->odd_lock); 1820 1821 /* Simple case, not active */ 1822 if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { 1823 odd->odd_vid = vnetid; 1824 mutex_exit(&odd->odd_lock); 1825 return; 1826 } 1827 1828 /* 1829 * In the hard case, we need to set the drop flag, quiesce I/O and then 1830 * we 
can go ahead and do everything. 1831 */ 1832 odd->odd_flags |= OVERLAY_F_MDDROP; 1833 overlay_io_wait(odd, OVERLAY_F_IOMASK); 1834 mutex_exit(&odd->odd_lock); 1835 1836 overlay_mux_remove_dev(odd->odd_mux, odd); 1837 mutex_enter(&odd->odd_lock); 1838 odd->odd_vid = vnetid; 1839 mutex_exit(&odd->odd_lock); 1840 overlay_mux_add_dev(odd->odd_mux, odd); 1841 1842 mutex_enter(&odd->odd_lock); 1843 ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX); 1844 odd->odd_flags &= ~OVERLAY_F_IN_MUX; 1845 mutex_exit(&odd->odd_lock); 1846 } 1847 1848 /* ARGSUSED */ 1849 static int 1850 overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, 1851 int *rvalp) 1852 { 1853 int ret; 1854 overlay_dev_t *odd; 1855 overlay_ioc_prop_t *oip = karg; 1856 uint_t propid = UINT_MAX; 1857 mac_perim_handle_t mph; 1858 uint64_t maxid, *vidp; 1859 1860 if (oip->oip_size > OVERLAY_PROP_SIZEMAX) 1861 return (EINVAL); 1862 1863 odd = overlay_hold_by_dlid(oip->oip_linkid); 1864 if (odd == NULL) 1865 return (ENOENT); 1866 1867 oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; 1868 mac_perim_enter_by_mh(odd->odd_mh, &mph); 1869 mutex_enter(&odd->odd_lock); 1870 if (odd->odd_flags & OVERLAY_F_ACTIVATED) { 1871 mac_perim_exit(mph); 1872 mutex_exit(&odd->odd_lock); 1873 return (ENOTSUP); 1874 } 1875 mutex_exit(&odd->odd_lock); 1876 if (oip->oip_id == -1) { 1877 int i; 1878 1879 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { 1880 if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) 1881 break; 1882 if (i == OVERLAY_DEV_NPROPS) { 1883 ret = odd->odd_plugin->ovp_ops->ovpo_setprop( 1884 odd->odd_pvoid, oip->oip_name, 1885 oip->oip_value, oip->oip_size); 1886 overlay_hold_rele(odd); 1887 mac_perim_exit(mph); 1888 return (ret); 1889 } 1890 } 1891 1892 propid = i; 1893 } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { 1894 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; 1895 1896 if (id > odd->odd_plugin->ovp_nprops) { 1897 mac_perim_exit(mph); 1898 overlay_hold_rele(odd); 1899 return (EINVAL); 1900 } 1901 ret = 
odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid, 1902 odd->odd_plugin->ovp_props[id], oip->oip_value, 1903 oip->oip_size); 1904 mac_perim_exit(mph); 1905 overlay_hold_rele(odd); 1906 return (ret); 1907 } else if (oip->oip_id < -1) { 1908 mac_perim_exit(mph); 1909 overlay_hold_rele(odd); 1910 return (EINVAL); 1911 } else { 1912 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); 1913 ASSERT(oip->oip_id >= 0); 1914 propid = oip->oip_id; 1915 } 1916 1917 ret = 0; 1918 switch (propid) { 1919 case OVERLAY_DEV_P_MTU: 1920 ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu", 1921 oip->oip_value, oip->oip_size); 1922 break; 1923 case OVERLAY_DEV_P_VNETID: 1924 if (oip->oip_size != sizeof (uint64_t)) { 1925 ret = EINVAL; 1926 break; 1927 } 1928 vidp = (uint64_t *)oip->oip_value; 1929 ASSERT(odd->odd_plugin->ovp_id_size <= 8); 1930 maxid = UINT64_MAX; 1931 if (odd->odd_plugin->ovp_id_size != 8) 1932 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1933 1ULL; 1934 if (*vidp >= maxid) { 1935 ret = EINVAL; 1936 break; 1937 } 1938 overlay_setprop_vnetid(odd, *vidp); 1939 break; 1940 case OVERLAY_DEV_P_ENCAP: 1941 case OVERLAY_DEV_P_VARPDID: 1942 ret = EPERM; 1943 break; 1944 default: 1945 ret = ENOENT; 1946 } 1947 1948 mac_perim_exit(mph); 1949 overlay_hold_rele(odd); 1950 return (ret); 1951 } 1952 1953 /* ARGSUSED */ 1954 static int 1955 overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred, 1956 int *rvalp) 1957 { 1958 overlay_dev_t *odd; 1959 overlay_ioc_status_t *os = karg; 1960 1961 odd = overlay_hold_by_dlid(os->ois_linkid); 1962 if (odd == NULL) 1963 return (ENOENT); 1964 1965 mutex_enter(&odd->odd_lock); 1966 if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) { 1967 os->ois_status = OVERLAY_I_DEGRADED; 1968 if (odd->odd_fmamsg != NULL) { 1969 (void) strlcpy(os->ois_message, odd->odd_fmamsg, 1970 OVERLAY_STATUS_BUFLEN); 1971 } else { 1972 os->ois_message[0] = '\0'; 1973 } 1974 1975 } else { 1976 os->ois_status = OVERLAY_I_OK; 1977 os->ois_message[0] = '\0'; 
1978 } 1979 mutex_exit(&odd->odd_lock); 1980 overlay_hold_rele(odd); 1981 1982 return (0); 1983 } 1984 1985 static dld_ioc_info_t overlay_ioc_list[] = { 1986 { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t), 1987 overlay_i_create, secpolicy_dl_config }, 1988 { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t), 1989 overlay_i_activate, secpolicy_dl_config }, 1990 { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t), 1991 overlay_i_delete, secpolicy_dl_config }, 1992 { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT, 1993 sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo, 1994 secpolicy_dl_config }, 1995 { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT, 1996 sizeof (overlay_ioc_prop_t), overlay_i_getprop, 1997 secpolicy_dl_config }, 1998 { OVERLAY_IOC_SETPROP, DLDCOPYIN, 1999 sizeof (overlay_ioc_prop_t), overlay_i_setprop, 2000 secpolicy_dl_config }, 2001 { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT, 2002 sizeof (overlay_ioc_nprops_t), overlay_i_nprops, 2003 secpolicy_dl_config }, 2004 { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT, 2005 sizeof (overlay_ioc_status_t), overlay_i_status, 2006 NULL } 2007 }; 2008 2009 static int 2010 overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2011 { 2012 int fmcap = DDI_FM_EREPORT_CAPABLE; 2013 if (cmd != DDI_ATTACH) 2014 return (DDI_FAILURE); 2015 2016 if (overlay_dip != NULL || ddi_get_instance(dip) != 0) 2017 return (DDI_FAILURE); 2018 2019 ddi_fm_init(dip, &fmcap, NULL); 2020 2021 if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR, 2022 ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) 2023 return (DDI_FAILURE); 2024 2025 if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list, 2026 DLDIOCCNT(overlay_ioc_list)) != 0) { 2027 ddi_remove_minor_node(dip, OVERLAY_CTL); 2028 return (DDI_FAILURE); 2029 } 2030 2031 overlay_dip = dip; 2032 return (DDI_SUCCESS); 2033 } 2034 2035 /* ARGSUSED */ 2036 static int 2037 overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) 2038 { 
2039 int error; 2040 2041 switch (cmd) { 2042 case DDI_INFO_DEVT2DEVINFO: 2043 *resp = (void *)overlay_dip; 2044 error = DDI_SUCCESS; 2045 break; 2046 case DDI_INFO_DEVT2INSTANCE: 2047 *resp = (void *)0; 2048 error = DDI_SUCCESS; 2049 break; 2050 default: 2051 error = DDI_FAILURE; 2052 break; 2053 } 2054 2055 return (error); 2056 } 2057 2058 static int 2059 overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2060 { 2061 if (cmd != DDI_DETACH) 2062 return (DDI_FAILURE); 2063 2064 mutex_enter(&overlay_dev_lock); 2065 if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) { 2066 mutex_exit(&overlay_dev_lock); 2067 return (EBUSY); 2068 } 2069 mutex_exit(&overlay_dev_lock); 2070 2071 2072 dld_ioc_unregister(OVERLAY_IOC); 2073 ddi_remove_minor_node(dip, OVERLAY_CTL); 2074 ddi_fm_fini(dip); 2075 overlay_dip = NULL; 2076 return (DDI_SUCCESS); 2077 } 2078 2079 static struct cb_ops overlay_cbops = { 2080 overlay_target_open, /* cb_open */ 2081 overlay_target_close, /* cb_close */ 2082 nodev, /* cb_strategy */ 2083 nodev, /* cb_print */ 2084 nodev, /* cb_dump */ 2085 nodev, /* cb_read */ 2086 nodev, /* cb_write */ 2087 overlay_target_ioctl, /* cb_ioctl */ 2088 nodev, /* cb_devmap */ 2089 nodev, /* cb_mmap */ 2090 nodev, /* cb_segmap */ 2091 nochpoll, /* cb_chpoll */ 2092 ddi_prop_op, /* cb_prop_op */ 2093 NULL, /* cb_stream */ 2094 D_MP, /* cb_flag */ 2095 CB_REV, /* cb_rev */ 2096 nodev, /* cb_aread */ 2097 nodev, /* cb_awrite */ 2098 }; 2099 2100 static struct dev_ops overlay_dev_ops = { 2101 DEVO_REV, /* devo_rev */ 2102 0, /* devo_refcnt */ 2103 overlay_getinfo, /* devo_getinfo */ 2104 nulldev, /* devo_identify */ 2105 nulldev, /* devo_probe */ 2106 overlay_attach, /* devo_attach */ 2107 overlay_detach, /* devo_detach */ 2108 nulldev, /* devo_reset */ 2109 &overlay_cbops, /* devo_cb_ops */ 2110 NULL, /* devo_bus_ops */ 2111 NULL, /* devo_power */ 2112 ddi_quiesce_not_supported /* devo_quiesce */ 2113 }; 2114 2115 static struct modldrv overlay_modldrv = { 2116 
&mod_driverops, 2117 "Overlay Network Driver", 2118 &overlay_dev_ops 2119 }; 2120 2121 static struct modlinkage overlay_linkage = { 2122 MODREV_1, 2123 &overlay_modldrv 2124 }; 2125 2126 static int 2127 overlay_init(void) 2128 { 2129 mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL); 2130 list_create(&overlay_dev_list, sizeof (overlay_dev_t), 2131 offsetof(overlay_dev_t, odd_link)); 2132 overlay_mux_init(); 2133 overlay_plugin_init(); 2134 overlay_target_init(); 2135 2136 return (DDI_SUCCESS); 2137 } 2138 2139 static void 2140 overlay_fini(void) 2141 { 2142 overlay_target_fini(); 2143 overlay_plugin_fini(); 2144 overlay_mux_fini(); 2145 mutex_destroy(&overlay_dev_lock); 2146 list_destroy(&overlay_dev_list); 2147 } 2148 2149 int 2150 _init(void) 2151 { 2152 int err; 2153 2154 if ((err = overlay_init()) != DDI_SUCCESS) 2155 return (err); 2156 2157 mac_init_ops(NULL, "overlay"); 2158 err = mod_install(&overlay_linkage); 2159 if (err != DDI_SUCCESS) { 2160 overlay_fini(); 2161 return (err); 2162 } 2163 2164 return (0); 2165 } 2166 2167 int 2168 _info(struct modinfo *modinfop) 2169 { 2170 return (mod_info(&overlay_linkage, modinfop)); 2171 } 2172 2173 int 2174 _fini(void) 2175 { 2176 int err; 2177 2178 err = mod_remove(&overlay_linkage); 2179 if (err != 0) 2180 return (err); 2181 2182 overlay_fini(); 2183 return (0); 2184 } 2185