1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2016 Joyent, Inc. 14 * Copyright 2022 MNX Cloud, Inc. 15 */ 16 17 /* 18 * Overlay Devices 19 * 20 * Overlay devices provide a means for creating overlay networks, a means of 21 * multiplexing multiple logical, isolated, and discrete layer two and layer 22 * three networks on top of one physical network. 23 * 24 * In general, these overlay devices encapsulate the logic to answer two 25 * different questions: 26 * 27 * 1) How should I transform a packet to put it on the wire? 28 * 2) Where should I send a transformed packet? 29 * 30 * Each overlay device is presented to the user as a GLDv3 device. While the 31 * link itself cannot have an IP interface created on top of it, it allows for 32 * additional GLDv3 devices, such as a VNIC, to be created on top of it which 33 * can be plumbed up with IP interfaces. 34 * 35 * 36 * -------------------- 37 * General Architecture 38 * -------------------- 39 * 40 * The logical overlay device that a user sees in dladm(8) is a combination of 41 * two different components that work together. The first component is this 42 * kernel module, which is responsible for answering question one -- how should 43 * I transform a packet to put it on the wire. 44 * 45 * The second component is what we call the virtual ARP daemon, or varpd. It is 46 * a userland component that is responsible for answering the second question -- 47 * Where should I send a transformed packet. Instances of the kernel overlay 48 * GLDv3 device ask varpd the question of where should a packet go. 
49 * 50 * The split was done for a few reasons. Importantly, we wanted to keep the act 51 * of generating encapsulated packets in the kernel so as to ensure that the 52 * general data path was fast and also kept simple. On the flip side, while the 53 * question of where should something go may be simple, it may often be 54 * complicated and need to interface with several different external or 55 * distributed systems. In those cases, it's simpler to allow for the full 56 * flexibility of userland to be brought to bear to solve that problem and in 57 * general, the path isn't very common. 58 * 59 * The following is what makes up the logical overlay device that a user would 60 * create with dladm(8). 61 * 62 * Kernel Userland 63 * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 64 * . +--------+ +--------+ +--------+ . . . 65 * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . . 66 * . +--------+ +--------+ +--------+ . . . 67 * . | | | . . . 68 * . | | | . . . 69 * . +------------+-----------+ . . . 70 * . | . . /dev/overlay . 71 * . +--------------+ . . . +------------+ . 72 * . | | . . . | | . 73 * . | Overlay |======*=================| Virtual | . 74 * . | GLDv3 Device |========================| ARP Daemon | . 75 * . | | . . | | . 76 * . +--------------+ . . +------------+ . 77 * . | . . | . 78 * . | . . | . 79 * . +----------------+ . . +--------+ . 80 * . | Overlay | . . | varpd | . 81 * . | Encapsulation | . . | Lookup | . 82 * . | Plugin | . . | Plugin | . 83 * . +----------------+ . . +--------+ . 84 * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 85 * 86 * 87 * This image shows the two different components and where they live. 88 * Importantly, it also shows that both the kernel overlay device and the 89 * userland varpd both support plugins. 
The plugins actually implement the 90 * things that users care about and the APIs have been designed to try to 91 * minimize the number of things that a module writer needs to worry about. 92 * 93 * IDENTIFIERS 94 * 95 * Every overlay device is defined by a unique identifier which is the overlay 96 * identifier. Its purpose is similar to that of a VLAN identifier, it's a 97 * unique number that is used to differentiate between different entries on the 98 * wire. 99 * 100 * ENCAPSULATION 101 * 102 * An overlay encapsulation plugin is a kernel miscellaneous module whose 103 * purpose is to contain knowledge about how to transform packets to put them 104 * onto the wire and to take them off. An example of an encapsulation plugin is 105 * vxlan. It's also how support for things like nvgre or geneve would be brought 106 * into the system. 107 * 108 * Each encapsulation plugin defines a series of operation vectors and 109 * properties. For the full details on everything they should provide, please 110 * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible 111 * for telling the system what information is required to send a packet. For 112 * example, vxlan is defined to send everything over a UDP packet and therefore 113 * requires a port and an IP address, while nvgre on the other hand is its own 114 * IP type and therefore just requires an IP address. In addition, it also 115 * provides information about the kind of socket that should be created. This is 116 * used by the kernel multiplexor, more on that in the Kernel Components 117 * section. 118 * 119 * LOOKUPS 120 * 121 * The kernel communicates requests for lookups over the character device 122 * /dev/overlay. varpd is responsible for listening for requests on that device 123 * and answering them. The character device is specific to the target path and 124 * varpd.
125 * 126 * Much as the kernel overlay module handles the bulk of the scaffolding but 127 * leaves the important work to the encapsulation plugin, varpd plays a 128 * similar role and leaves the full brunt of lookups to a userland dynamic 129 * shared object which implements the logic of lookups. 130 * 131 * Each lookup plugin defines a series of operation vectors and properties. For 132 * the full details on everything that they should provide, please read 133 * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC 134 * address and asked to give an address on the physical network that it should 135 * be sent to. In addition, they handle questions related to how to handle 136 * things like broadcast and multicast traffic, etc. 137 * 138 * ---------- 139 * Properties 140 * ---------- 141 * 142 * A device from a dladm perspective has a unique set of properties that are 143 * combined from three different sources: 144 * 145 * 1) Generic properties that every overlay device has 146 * 2) Properties that are specific to the encapsulation plugin 147 * 3) Properties that are specific to the lookup plugin 148 * 149 * All of these are exposed in a single set of properties in dladm. Note that 150 * these are not necessarily traditional link properties. However, if something 151 * is both a traditional GLDv3 link property, say the MTU of a device, and a 152 * specific property here, then the driver ensures that all existing GLDv3 153 * specific means of manipulating it are used and wraps up its private property 154 * interfaces to ensure that works. 155 * 156 * Properties in the second and third category are prefixed with the name of 157 * their module. For example, the vxlan encapsulation module has a property 158 * called the 'listen_ip'. This property would show up in dladm as 159 * 'vxlan/listen_ip'.
This allows different plugins to both use similar names 160 * for similar properties and to also have independent name spaces so that 161 * overlapping names do not conflict with anything else. 162 * 163 * While the kernel combines both sets one and two into a single coherent view, 164 * it does not do anything with respect to the properties that are owned by the 165 * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in 166 * charge of bridging these two worlds into one magical experience for the user. 167 * It carries the burden of knowing about both overlay specific and varpd 168 * specific properties. Importantly, we want to maintain this distinction. We 169 * don't want to treat the kernel as an arbitrary key/value store for varpd and 170 * we want the kernel to own its own data and not have to ask userland for 171 * information that it owns. 172 * 173 * Every property in the system has the following attributes: 174 * 175 * o A name 176 * o A type 177 * o A size 178 * o Permissions 179 * o Default value 180 * o Valid value ranges 181 * o A value 182 * 183 * Everything except for the value is obtained by callers through the propinfo 184 * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX, 185 * currently 256 bytes. 186 * 187 * The following are the supported types of properties: 188 * 189 * OVERLAY_PROP_T_INT 190 * 191 * A signed integer, its length is 8 bytes, corresponding to a 192 * int64_t. 193 * 194 * OVERLAY_PROP_T_UINT 195 * 196 * An unsigned integer, its length is 8 bytes, corresponding to a 197 * uint64_t. 198 * 199 * OVERLAY_PROP_T_IP 200 * 201 * A struct in6_addr, it has a fixed size. 202 * 203 * OVERLAY_PROP_T_STRING 204 * 205 * A null-terminated character string encoded in either ASCII or 206 * UTF-8. Note that the size of the string includes the null 207 * terminator. 208 * 209 * The next thing that we apply to a property is its permission. 
The permissions 210 * are put together by the bitwise or of the following flags and values. 211 * 212 * OVERLAY_PROP_PERM_REQ 213 * 214 * This indicates a required property. A property that is required 215 * must be set by a consumer before the device can be created. If a 216 * required property has a default value, this constraint is 217 * loosened because the default supplies the value. 218 * 219 * OVERLAY_PROP_PERM_READ 220 * 221 * This indicates that a property can be read. All properties will 222 * have this value set. 223 * 224 * OVERLAY_PROP_PERM_WRITE 225 * 226 * This indicates that a property can be written to and thus 227 * updated by userland. Properties that are only intended to 228 * display information will not have OVERLAY_PROP_PERM_WRITE set. 229 * 230 * In addition, a few additional values are defined as a convenience to 231 * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of 232 * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second, 233 * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ, 234 * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a 235 * property should generally be a constant across its lifetime. 236 * 237 * A property may optionally have a default value. If it does have a default 238 * value, and that property is not set to be a different value, then the default 239 * value is inherited automatically. It also means that if the default value is 240 * acceptable, there is no need to set the value for a required property. For 241 * example, the vxlan module has the vxlan/listen_port property which is 242 * required, but has a default value of 4789 (the IANA assigned port). Because 243 * of that default value, there is no need for it to be set. 244 * 245 * Finally, a property may declare a list of valid values.
These valid values 246 * are used for display purposes, they are not enforced by the broader system, 247 * but merely allow a means for the information to be communicated to the user 248 * through dladm(8). Like a default value, this is optional. 249 * 250 * The general scaffolding does not do very much with respect to the getting and 251 * setting of properties. That is really owned by the individual plugins 252 * themselves. 253 * 254 * ----------------------------- 255 * Destinations and Plugin Types 256 * ----------------------------- 257 * 258 * Both encapsulation and lookup plugins define the kinds of destinations that 259 * they know how to support. There are three different pieces of information 260 * that can be used to address to a destination currently, all of which is 261 * summarized in the type overlay_point_t. Any combination of these is 262 * supported. 263 * 264 * OVERLAY_PLUGIN_D_ETHERNET 265 * 266 * An Ethernet MAC address is required. 267 * 268 * OVERLAY_PLUGIN_D_IP 269 * 270 * An IP address is required. All IP addresses used by the overlay 271 * system are transmitted as IPv6 addresses. IPv4 addresses can be 272 * represented by using IPv4-mapped IPv6 addresses. 273 * 274 * OVERLAY_PLUGIN_D_PORT 275 * 276 * A TCP/UDP port is required. 277 * 278 * A kernel encapsulation plugin declares which of these that it requires, it's 279 * a static set. On the other hand, a userland lookup plugin can be built to 280 * support all of these or any combination thereof. It gets passed the required 281 * destination type, based on the kernel encapsulation method, and then it makes 282 * the determination as to whether or not it supports it. For example, the 283 * direct plugin can support either an IP or both an IP and a port, it simply 284 * doesn't display the direct/dest_port property in the cases where a port is 285 * not required to support this. 
286 * 287 * The user lookup plugins have two different modes of operation which 288 * determines how they interact with the broader system and how look ups are 289 * performed. These types are: 290 * 291 * OVERLAY_TARGET_POINT 292 * 293 * A point to point plugin has a single static definition for where 294 * to send all traffic. Every packet in the system always gets sent 295 * to the exact same destination which is programmed into the 296 * kernel when the general device is activated. 297 * 298 * OVERLAY_TARGET_DYNAMIC 299 * 300 * A dynamic plugin does not have a single static definition. 301 * Instead, for each destination, the kernel makes an asynchronous 302 * request to varpd to determine where the packet should be routed, 303 * and if a specific destination is found, then that destination is 304 * cached in the overlay device's target cache. 305 * 306 * This distinction, while important for the general overlay device's operation, 307 * is not important to the encapsulation plugins. They don't need to know about 308 * any of these pieces. It's just a concern for varpd, the userland plugin, and 309 * the general overlay scaffolding. 310 * 311 * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not 312 * maintain a target cache, and instead just keeps track of the destination and 313 * always sends encapsulated packets to that address. When the target type is of 314 * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such 315 * destinations. These destinations are kept around in an instance of a 316 * reference hash that is specific to the given overlay device. Entries in the 317 * cache can be invalidated and replaced by varpd and its lookup plugins. 
318 * 319 * ---------------------------------- 320 * Kernel Components and Architecture 321 * ---------------------------------- 322 * 323 * There are multiple pieces inside the kernel that work together, there is the 324 * general overlay_dev_t structure, which is the logical GLDv3 device, but it 325 * itself has references to things like an instance of an encapsulation plugin, 326 * a pointer to a mux and a target cache. It can roughly be summarized in the 327 * following image: 328 * 329 * +------------------+ 330 * | global | 331 * | overlay list | 332 * | overlay_dev_list | 333 * +------------------+ 334 * | 335 * | +-----------------------+ +---------------+ 336 * +->| GLDv3 Device |----------->| GLDv3 Device | -> ... 337 * | overlay_dev_t | | overlay_dev_t | 338 * | | +---------------+ 339 * | | 340 * | mac_handle_t -----+---> GLDv3 handle to MAC 341 * | datalink_id_t -----+---> Datalink ID used by DLS 342 * | overlay_dev_flag_t ---+---> Device state 343 * | uint_t -----+---> Current device MTU 344 * | uint_t -----+---> In-progress RX operations 345 * | uint_t -----+---> In-progress TX operations 346 * | char[] -----+---> FMA degraded message 347 * | void * -----+---> plugin private data 348 * | overlay_target_t * ---+---------------------+ 349 * | overlay_plugin_t * ---+---------+ | 350 * +-----------------------+ | | 351 * ^ | | 352 * +--------------------+ | | | 353 * | Kernel Socket | | | | 354 * | Multiplexor | | | | 355 * | overlay_mux_t | | | | 356 * | | | | | 357 * | avl_tree_t -+--+ | | 358 * | uint_t -+--> socket family | | 359 * | uint_t -+--> socket type | | 360 * | uint_t -+--> socket protocol | | 361 * | ksocket_t -+--> I/O socket | | 362 * | struct sockaddr * -+--> ksocket address | | 363 * | overlay_plugin_t --+--------+ | | 364 * +--------------------+ | | | 365 * | | | 366 * +-------------------------+ | | | 367 * | Encap Plugin |<--+-----------+ | 368 * | overlay_plugin_t | | 369 * | | | 370 * | char * ---+--> plugin name | 371 * | 
overlay_plugin_ops_t * -+--> plugin downcalls | 372 * | char ** (props) ---+--> property list | 373 * | uint_t ---+--> id length | 374 * | overlay_plugin_flags_t -+--> plugin flags | 375 * | overlay_plugin_dest_t --+--> destination type v 376 * +-------------------------+ +-------------------------+ 377 * | Target Cache | 378 * | overlay_target_t | 379 * | | 380 * cache mode <--+- overlay_target_mode_t | 381 * dest type <--+- overlay_plugin_dest_t | 382 * cache flags <--+- overlay_target_flag_t | 383 * varpd id <--+- uint64_t | 384 * outstanding varpd reqs. <--+- uint_t | 385 * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t | 386 * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t | 387 * | +-------------------------+ 388 * +-----------------------+ 389 * | 390 * v 391 * +-------------------------------+ +------------------------+ 392 * | Target Entry |-->| Target Entry |--> ... 393 * | overlay_target_entry_t | | overlay_target_entry_t | 394 * | | +------------------------+ 395 * | | 396 * | overlay_target_entry_flags_t -+--> Entry flags 397 * | uint8_t[ETHERADDRL] ---+--> Target MAC address 398 * | overlay_target_point_t ---+--> Target underlay address 399 * | mblk_t * ---+--> outstanding mblk head 400 * | mblk_t * ---+--> outstanding mblk tail 401 * | size_t ---+--> outstanding mblk size 402 * +-------------------------------+ 403 * 404 * The primary entries that we care about are the overlay_dev_t, which 405 * correspond to each overlay device that is created with dladm(8). Globally, 406 * these devices are maintained in a simple list_t which is protected with a 407 * lock. Hence, these include important information such as the mac_handle_t 408 * and a datalink_id_t which is used to interact with the broader MAC and DLS 409 * ecosystem. We also maintain additional information such as the current state, 410 * outstanding operations, the mtu, and importantly, the plugin's private data. 
411 * This is the instance of an encapsulation plugin that gets created as part of 412 * creating an overlay device. Another aspect of this is that the overlay_dev_t 413 * also includes information with respect to FMA. For more information, see the 414 * FMA section. 415 * 416 * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin 417 * is the encapsulation plugin. This allows the device to make downcalls into it 418 * based on doing things like getting and setting properties. Otherwise, the 419 * plugin itself is a fairly straightforward entity. Plugins are maintained in a 420 * list (not pictured above). The plugins themselves mostly maintain things like 421 * the static list of properties, what kind of destination they require, and the 422 * operations vector. A given module may contain more if necessary. 423 * 424 * The next piece of the puzzle is the mux, or a multiplexor. The mux itself 425 * maintains a ksocket and it is through the mux that we send and receive 426 * message blocks. The mux represents a socket type and address, as well as a 427 * plugin. Multiple overlay_dev_t devices may then share the same mux. For 428 * example, consider the case where you have different instances of vxlan all on 429 * the same underlay network. These would all logically share the same IP 430 * address and port that packets are sent and received on; however, what differs 431 * is the decapsulation ID. 432 * 433 * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike 434 * a socket, we enable a direct callback on the ksocket. This means that 435 * whenever a message block chain is received, rather than sitting there and 436 * getting a callback in a context and kicking that back out to a taskq, the 437 * data instead comes into the callback function overlay_mux_recv(). 438 * 439 * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx 440 * function) to transmit.
It receives encapsulated packets, decapsulates them to 441 * determine the overlay identifier, looks up the given device that matches that 442 * identifier, and then causes the broader MAC world to receive the packet with 443 * a call to mac_rx(). 444 * 445 * Today, we don't do too much that's special with the ksocket; however, as 446 * hardware is gaining understanding for these encapsulation protocols, we'll 447 * probably want to think of better ways to get those capabilities passed down 448 * and potentially better ways to program receive filters so they get directly 449 * to us. Though, that's all fantasy future land. 450 * 451 * The next part of the puzzle is the target cache. The purpose of the target 452 * cache is to cache where we should send a packet on the underlay network, 453 * given its mac address. The target cache operates in two modes depending on 454 * whether the lookup module was declared to be OVERLAY_TARGET_POINT or 455 * OVERLAY_TARGET_DYNAMIC. 456 * 457 * In the case where the target cache has been programmed to be 458 * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t 459 * which has the destination to which we send everything, no matter the destination 460 * mac address. 461 * 462 * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things 463 * are much more interesting and as a result, more complicated. We primarily 464 * store lists of overlay_target_entry_t's which are stored in both an avl tree 465 * and a refhash_t. The primary look up path uses the refhash_t and the avl tree 466 * is only used for a few of the target ioctls used to dump data such that we 467 * can get a consistent iteration order for things like dladm show-overlay -t. 468 * The key that we use for the reference hashtable is based on the mac address 469 * in the cache and currently we just do a simple CRC32 to transform it into a 470 * hash.
471 * 472 * Each entry maintains a set of flags to indicate the current status of the 473 * request. The flags may indicate one of three states: that the current cache entry 474 * is valid, that the current cache entry has been directed to drop all output, 475 * and that the current cache entry is invalid and may be being looked up. In 476 * the case where it's valid, we just take the destination address and run with 477 * it. 478 * 479 * If it's invalid and a lookup has not been made, then we start the process 480 * that prepares a query that will make its way up to varpd. The cache 481 * entry maintains a message block chain of outstanding message blocks and a 482 * size. These lists are populated only when we don't know the answer as to 483 * where these should be sent. The size entry is used to cap the amount of 484 * outstanding data that we don't know the answer to. If we exceed a cap on the 485 * amount of outstanding data (currently 1 Mb), then we'll drop any additional 486 * packets. Once we get an answer indicating a valid destination, we transmit 487 * any outstanding data to that place. The full story on how we look that up 488 * is discussed in the section on the Target Cache Life Cycle. 489 * 490 * ------------------------ 491 * FMA and Degraded Devices 492 * ------------------------ 493 * 494 * Every kernel overlay device keeps track of its FMA state. Today in FMA we 495 * cannot represent partitions between resources nor can we represent that a 496 * given minor node of a pseudo device has failed -- if we degrade the overlay 497 * device, then the entire dev_info_t is degraded. However, we still want to be 498 * able to indicate to administrators that things may go wrong. 499 * 500 * To this end, we've added a notion of a degraded state to every overlay 501 * device. This state is primarily dictated by userland and it can happen for 502 * various reasons.
Generally, if a userland lookup plugin has been 503 * partitioned, or something has gone wrong such that there is no longer any 504 * userland lookup module for a device, then we'll mark it degraded. 505 * 506 * As long as any of our minor instances is degraded, we'll fire off the 507 * FMA event to note that. Once the last degraded instance is no longer 508 * degraded, then we'll end up telling FMA that we're all clean. 509 * 510 * To help administrators get a better sense of which of the various minor 511 * devices is wrong, we store the odd_fmamsg[] character array. This character 512 * array can be fetched by running dladm show-overlay -f. 513 * 514 * Note that it's important that we do not update the link status of the 515 * devices. We want to remain up as much as possible. Changing the link state of a 516 * degraded device may end up making things worse. We may still actually 517 * have information in the target cache and if we mark the link down, that'll 518 * result in not being able to use it. The reason is that this'll mark all 519 * the downstream VNICs down which will go to IP and from there we end up 520 * dealing with sadness. 521 * 522 * ----------------------- 523 * Target Cache Life Cycle 524 * ----------------------- 525 * 526 * This section only applies when we have a lookup plugin of 527 * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type 528 * OVERLAY_TARGET_POINT. 529 * 530 * While we got into the target cache in the general architecture section, it's 531 * worth going into more details as to how this actually works and showing some 532 * examples and state machines. Recall that a target cache entry basically has 533 * the following state transition diagram: 534 * 535 * Initial state 536 * . . . . . . first access . . . varpd lookup enqueued 537 * . . . 538 * . . . 539 * +-------+ . +----------+ .
540 * | No |------*---->| Invalid |-------*----+ 541 * | Entry | | Entry | | 542 * +-------+ +----------+ | 543 * varpd ^ ^ varpd | 544 * invalidate | | drop | 545 * . . . * * . . v 546 * +-------+ | | +---------+ 547 * | Entry |--->-----+ +----<----| Entry | 548 * | Valid |<----------*---------<----| Pending |->-+ varpd 549 * +-------+ . +---------+ * . . drop, but 550 * . varpd ^ | other queued 551 * . success | | entries 552 * +-----+ 553 * 554 * When the table is first created, it is empty. As we attempt to lookup entries 555 * and we find there is no entry at all, we'll create a new table entry for it. 556 * At that point the entry is technically in an invalid state, that means that 557 * we have no valid data from varpd. In that case, we'll go ahead and queue the 558 * packet into the entry's pending chain, and queue a varpd lookup, setting the 559 * OVERLAY_ENTRY_F_PENDING flag in the process. 560 * 561 * If additional mblk_t's come in for this entry, we end up appending them to 562 * the tail of the chain, if and only if, we don't exceed the threshold for the 563 * amount of space they can take up. An entry remains pending until we get a 564 * varpd reply. If varpd replies with a valid result, we move to the valid 565 * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one 566 * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate. 567 * 568 * Once an entry is valid, it stays valid until user land tells us to invalidate 569 * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and 570 * OVERLAY_TARG_CACHE_SET respectively. 571 * 572 * If the lookup fails with a call to drop the packet, then the next state is 573 * determined by the state of the queue. If the set of outstanding entries is 574 * empty, then we just transition back to the invalid state.
If instead, the 575 * set of outstanding entries is not empty, then we'll queue another entry and 576 * stay in the same state, repeating this until the queue of requests is 577 * drained. 578 * 579 * The following images describe the flow of a given lookup and where the 580 * overlay_target_entry_t is at any given time. 581 * 582 * +-------------------+ 583 * | Invalid Entry | An entry starts off as an invalid entry 584 * | de:ad:be:ef:00:00 | and only exists in the target cache. 585 * +-------------------+ 586 * 587 * ~~~~ 588 * 589 * +---------------------+ 590 * | Global list_t | A mblk_t comes in for an entry. We 591 * | overlay_target_list | append it to the overlay_target_list. 592 * +---------------------+ 593 * | 594 * v 595 * +-------------------+ +-------------------+ 596 * | Pending Entry |----->| Pending Entry |--->... 597 * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 | 598 * +-------------------+ +-------------------+ 599 * 600 * ~~~~ 601 * 602 * +--------------------------+ 603 * | /dev/overlay minor state | User land said that it would look up an 604 * | overlay_target_hdl_t | entry for us. We remove it from the 605 * +--------------------------+ global list and add it to the handle's 606 * | outstanding list. 607 * | 608 * v 609 * +-------------------+ +-------------------+ 610 * | Pending Entry |----->| Pending Entry | 611 * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 | 612 * +-------------------+ +-------------------+ 613 * 614 * ~~~~ 615 * 616 * +-------------------+ 617 * | Valid Entry | varpd returned an answer with 618 * | de:ad:be:ef:00:00 | OVERLAY_TARG_RESPOND and the target cache 619 * | 10.169.23.42:4789 | entry is now populated with a 620 * +-------------------+ destination and marked as valid 621 * 622 * 623 * The lookup mechanism is performed via a series of operations on the character 624 * pseudo-device /dev/overlay. The only thing that uses this device is the 625 * userland daemon varpd.
/dev/overlay is a cloneable device, each open of it 626 * granting a new minor number which maintains its own state. We maintain this 627 * state so that way if an outstanding lookup was queued to something that 628 * crashed or closed its handle without responding, we can know about this and 629 * thus handle it appropriately. 630 * 631 * When a lookup is first created it's added to our global list of outstanding 632 * lookups. To service requests, userland is required to perform an ioctl to ask 633 * for a request. We will block it in the kernel a set amount of time waiting 634 * for a request. When we give a request to a given minor instance of the 635 * device, we remove it from the global list and append the request to the 636 * device's list of outstanding entries, for the reasons we discussed above. 637 * When a lookup comes in, we give user land a smaller amount of information 638 * specific to that packet, the overlay_targ_lookup_t. It includes a request id 639 * to identify this, and then the overlay id, the varpd id, the header and 640 * packet size, the source and destination mac address, the SAP, and any 641 * potential VLAN header. 642 * 643 * At that point, it stays in that outstanding list until one of two ioctls are 644 * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time, 645 * userland may also perform other operations. For example, it may use 646 * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth 647 * analysis of what to do beyond what we gave it initially. This is useful for 648 * providing proxy arp and the like. Finally, there are two other ioctls that 649 * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the 650 * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which 651 * causes us to encapsulate and send out the packet they've given us. 
652 * 653 * 654 * Finally, through the target cache, several ioctls are provided to allow for 655 * interrogation and management of the cache. They allow for individual entries 656 * to be retrieved, set, or have the entire table flushed. For the full set of 657 * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h. 658 * 659 * ------------------ 660 * Sample Packet Flow 661 * ------------------ 662 * 663 * There's a lot of pieces here, hopefully an example of how this all fits 664 * together will help clarify and elucidate what's going on. We're going to 665 * first track an outgoing packet, eg. one that is sent from an IP interface on 666 * a VNIC on top of an overlay device, and then we'll look at what it means to 667 * respond to that. 668 * 669 * 670 * +----------------+ +--------------+ +------------------+ 671 * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches | 672 * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx | 673 * +----------------+ | VNIC device | | overlay_m_tx() | 674 * +--------------+ +------------------+ 675 * | 676 * . lookup . cache | 677 * . drop . miss v 678 * +---------+ . +--------+ . +------------------+ 679 * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk | 680 * | mblk_t | | lookup | | in the target | 681 * +---------+ | queued | | cache | 682 * ^ +--------+ +------------------+ 683 * on send | | | cache 684 * error . . * *. . lookup * . . 
hit 685 * | | success v 686 * | | +------------------+ 687 * +-----------------+ +--------------->| call plugin | 688 * | Send out | | ovpo_encap() to | 689 * | overlay_mux_t's |<----------------------------------| get encap mblk_t | 690 * | ksocket | +------------------+ 691 * +-----------------+ 692 * 693 * The receive end point looks a little different and looks more like: 694 * 695 * +------------------+ +----------------+ +-----------+ 696 * | mblk_t comes off |---->| enter netstack |--->| delivered |---+ 697 * | the physical | | IP stack | | to | * . . direct 698 * | device | +----------------+ | ksocket | | callback 699 * +------------------+ +-----------+ | 700 * . overlay id | 701 * . not found v 702 * +-----------+ . +-----------------+ +--------------------+ 703 * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() | 704 * | mblk_t | | ovpo_decap() to | +--------------------+ 705 * +-----------+ | decap mblk_t | 706 * +-----------------+ 707 * | 708 * * . . overlay id 709 * v found 710 * +--------+ +----------------+ 711 * | adjust |----->| call mac_rx | 712 * | mblk_t | | on original | 713 * +--------+ | decaped packet | 714 * +----------------+ 715 * 716 * ------------------ 717 * Netstack Awareness 718 * ------------------ 719 * 720 * In the above image we note that this enters a netstack. Today the only 721 * netstack that can be is the global zone as the overlay driver itself is not 722 * exactly netstack aware. What this really means is that varpd cannot run in a 723 * non-global zone and an overlay device cannot belong to a non-global zone. 724 * Non-global zones can still have a VNIC assigned to them that's been created 725 * over the overlay device the same way they would if it had been created over 726 * an etherstub or a physical device. 
727 * 728 * The majority of the work to make it netstack aware is straightforward and the 729 * biggest thing is to create a netstack module that allows us to hook into 730 * netstack (and thus zone) creation and destruction. From there, we need to 731 * amend the target cache lookup routines that we discussed earlier to not have 732 * a global outstanding list and a global list of handles, but rather, one per 733 * netstack. 734 * 735 * For the mux, we'll need to open the ksocket in the context of the zone, we 736 * can likely do this with a properly composed credential, but we'll need to do 737 * some more work on that path. Finally, we'll want to make sure the dld ioctls 738 * are aware of the zoneid of the caller and we use that appropriately and store 739 * it in the overlay_dev_t. 740 * 741 * ----------- 742 * GLDv3 Notes 743 * ----------- 744 * 745 * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more 746 * relevant and other parts are much less relevant for us. For example, the 747 * GLDv3 is used to toggle the device being put into and out of promiscuous 748 * mode, to program MAC addresses for unicast and multicast hardware filters. 749 * Today, an overlay device doesn't have a notion of promiscuous mode nor does 750 * it have a notion of unicast and multicast addresses programmed into the 751 * device. Instead, for the purposes of the hardware filter, we don't do 752 * anything and just always accept new addresses being added and removed. 753 * 754 * If the GLDv3 start function has not been called, then we will not use this 755 * device for I/O purposes. Any calls to transmit or receive should be dropped, 756 * though the GLDv3 guarantees us that transmit will not be called without 757 * calling start. Similarly, once stop is called, then no packets can be dealt 758 * with. 
 *
 * Today we don't support the stat interfaces, though there's no good reason
 * that we shouldn't assemble some of the stats based on what we have in the
 * future.
 *
 * When it comes to link properties, many of the traditional link properties do
 * not apply and MAC handles many of the others for us. For example, we don't
 * need to implement anything for overlay_m_getprop() to deal with returning
 * the MTU, as MAC never calls into us for that. As such, there isn't much of
 * anything to support in terms of properties.
 *
 * Today, we don't support any notion of hardware capabilities. However, if
 * future NIC hardware or other changes to the system cause it to make sense for
 * us to emulate logical groups, then we should do that. However, we still do
 * implement a capab function so that we can identify ourselves as an overlay
 * device to the broader MAC framework. This is done mostly so that a device
 * created on top of us can have fanout rings as we don't try to lie about a
 * speed for our device.
 *
 * The other question is what should be done for a device's MTU and margin. We
 * set our minimum supported MTU to be the minimum value that an IP network may
 * be set to -- 576 -- which mimics what an etherstub does. On the flip side, we
 * have our upper bound set to 8900. This value comes from the fact that a lot
 * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
 * bytes, which isn't exactly the most accurate number, but it'll be good enough
 * for now. Because of that, our default MTU off of these devices is 1400, as
 * the default MTU for everything is usually 1500 or whatever the underlying
 * device is at; however, this is a bit simpler than asking the netstack what
 * are all the IP interfaces at. It also calls into question how PMTU and PMTU
 * discovery should work here.
The challenge, especially for 789 * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's 790 * not clear that if you have a single bad entry that the overall MTU should be 791 * lowered. Instead, we should figure out a better way of determining these 792 * kinds of PMTU errors and appropriately alerting the administrator via FMA. 793 * 794 * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether 795 * or not the underlying encapsulation device supports VLAN tags. If it does, 796 * then we'll set the margin to allow for it, otherwise, we will not. 797 */ 798 799 #include <sys/conf.h> 800 #include <sys/errno.h> 801 #include <sys/stat.h> 802 #include <sys/ddi.h> 803 #include <sys/sunddi.h> 804 #include <sys/modctl.h> 805 #include <sys/policy.h> 806 #include <sys/stream.h> 807 #include <sys/strsubr.h> 808 #include <sys/strsun.h> 809 #include <sys/types.h> 810 #include <sys/kmem.h> 811 #include <sys/param.h> 812 #include <sys/sysmacros.h> 813 #include <sys/ddifm.h> 814 815 #include <sys/dls.h> 816 #include <sys/dld_ioc.h> 817 #include <sys/mac_provider.h> 818 #include <sys/mac_client_priv.h> 819 #include <sys/mac_ether.h> 820 #include <sys/vlan.h> 821 822 #include <sys/overlay_impl.h> 823 824 dev_info_t *overlay_dip; 825 static kmutex_t overlay_dev_lock; 826 static list_t overlay_dev_list; 827 static uint8_t overlay_macaddr[ETHERADDRL] = 828 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 829 830 typedef enum overlay_dev_prop { 831 OVERLAY_DEV_P_MTU = 0, 832 OVERLAY_DEV_P_VNETID, 833 OVERLAY_DEV_P_ENCAP, 834 OVERLAY_DEV_P_VARPDID 835 } overlay_dev_prop_t; 836 837 #define OVERLAY_DEV_NPROPS 4 838 static const char *overlay_dev_props[] = { 839 "mtu", 840 "vnetid", 841 "encap", 842 "varpd/id" 843 }; 844 845 #define OVERLAY_MTU_MIN 576 846 #define OVERLAY_MTU_DEF 1400 847 #define OVERLAY_MTU_MAX 8900 848 849 overlay_dev_t * 850 overlay_hold_by_dlid(datalink_id_t id) 851 { 852 overlay_dev_t *o; 853 854 mutex_enter(&overlay_dev_lock); 855 
for (o = list_head(&overlay_dev_list); o != NULL; 856 o = list_next(&overlay_dev_list, o)) { 857 if (id == o->odd_linkid) { 858 mutex_enter(&o->odd_lock); 859 o->odd_ref++; 860 mutex_exit(&o->odd_lock); 861 mutex_exit(&overlay_dev_lock); 862 return (o); 863 } 864 } 865 866 mutex_exit(&overlay_dev_lock); 867 return (NULL); 868 } 869 870 void 871 overlay_hold_rele(overlay_dev_t *odd) 872 { 873 mutex_enter(&odd->odd_lock); 874 ASSERT(odd->odd_ref > 0); 875 odd->odd_ref--; 876 mutex_exit(&odd->odd_lock); 877 } 878 879 void 880 overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag) 881 { 882 ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); 883 ASSERT(MUTEX_HELD(&odd->odd_lock)); 884 885 if (flag & OVERLAY_F_IN_RX) 886 odd->odd_rxcount++; 887 if (flag & OVERLAY_F_IN_TX) 888 odd->odd_txcount++; 889 odd->odd_flags |= flag; 890 } 891 892 void 893 overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag) 894 { 895 boolean_t signal = B_FALSE; 896 897 ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); 898 ASSERT(MUTEX_HELD(&odd->odd_lock)); 899 900 if (flag & OVERLAY_F_IN_RX) { 901 ASSERT(odd->odd_rxcount > 0); 902 odd->odd_rxcount--; 903 if (odd->odd_rxcount == 0) { 904 signal = B_TRUE; 905 odd->odd_flags &= ~OVERLAY_F_IN_RX; 906 } 907 } 908 if (flag & OVERLAY_F_IN_TX) { 909 ASSERT(odd->odd_txcount > 0); 910 odd->odd_txcount--; 911 if (odd->odd_txcount == 0) { 912 signal = B_TRUE; 913 odd->odd_flags &= ~OVERLAY_F_IN_TX; 914 } 915 } 916 917 if (signal == B_TRUE) 918 cv_broadcast(&odd->odd_iowait); 919 } 920 921 static void 922 overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag) 923 { 924 ASSERT((flag & ~OVERLAY_F_IOMASK) == 0); 925 ASSERT(MUTEX_HELD(&odd->odd_lock)); 926 927 while (odd->odd_flags & flag) { 928 cv_wait(&odd->odd_iowait, &odd->odd_lock); 929 } 930 } 931 932 void 933 overlay_dev_iter(overlay_dev_iter_f func, void *arg) 934 { 935 overlay_dev_t *odd; 936 937 mutex_enter(&overlay_dev_lock); 938 for (odd = 
list_head(&overlay_dev_list); odd != NULL; 939 odd = list_next(&overlay_dev_list, odd)) { 940 if (func(odd, arg) != 0) { 941 mutex_exit(&overlay_dev_lock); 942 return; 943 } 944 } 945 mutex_exit(&overlay_dev_lock); 946 } 947 948 /* ARGSUSED */ 949 static int 950 overlay_m_stat(void *arg, uint_t stat, uint64_t *val) 951 { 952 return (ENOTSUP); 953 } 954 955 static int 956 overlay_m_start(void *arg) 957 { 958 overlay_dev_t *odd = arg; 959 overlay_mux_t *mux; 960 int ret, domain, family, prot; 961 struct sockaddr_storage storage; 962 socklen_t slen; 963 964 mutex_enter(&odd->odd_lock); 965 if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) { 966 mutex_exit(&odd->odd_lock); 967 return (EAGAIN); 968 } 969 mutex_exit(&odd->odd_lock); 970 971 ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain, 972 &family, &prot, (struct sockaddr *)&storage, &slen); 973 if (ret != 0) 974 return (ret); 975 976 mux = overlay_mux_open(odd->odd_plugin, domain, family, prot, 977 (struct sockaddr *)&storage, slen, &ret); 978 if (mux == NULL) 979 return (ret); 980 981 overlay_mux_add_dev(mux, odd); 982 odd->odd_mux = mux; 983 mutex_enter(&odd->odd_lock); 984 ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX)); 985 odd->odd_flags |= OVERLAY_F_IN_MUX; 986 mutex_exit(&odd->odd_lock); 987 988 return (0); 989 } 990 991 static void 992 overlay_m_stop(void *arg) 993 { 994 overlay_dev_t *odd = arg; 995 996 /* 997 * The MAC Perimeter is held here, so we don't have to worry about 998 * synchronizing this with respect to metadata operations. 
999 */ 1000 mutex_enter(&odd->odd_lock); 1001 VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX); 1002 VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP)); 1003 odd->odd_flags |= OVERLAY_F_MDDROP; 1004 overlay_io_wait(odd, OVERLAY_F_IOMASK); 1005 mutex_exit(&odd->odd_lock); 1006 1007 overlay_mux_remove_dev(odd->odd_mux, odd); 1008 overlay_mux_close(odd->odd_mux); 1009 odd->odd_mux = NULL; 1010 1011 mutex_enter(&odd->odd_lock); 1012 odd->odd_flags &= ~OVERLAY_F_IN_MUX; 1013 odd->odd_flags &= ~OVERLAY_F_MDDROP; 1014 VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0); 1015 mutex_exit(&odd->odd_lock); 1016 } 1017 1018 /* 1019 * For more info on this, see the big theory statement. 1020 */ 1021 /* ARGSUSED */ 1022 static int 1023 overlay_m_promisc(void *arg, boolean_t on) 1024 { 1025 return (0); 1026 } 1027 1028 /* 1029 * For more info on this, see the big theory statement. 1030 */ 1031 /* ARGSUSED */ 1032 static int 1033 overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp) 1034 { 1035 return (0); 1036 } 1037 1038 /* 1039 * For more info on this, see the big theory statement. 
1040 */ 1041 /* ARGSUSED */ 1042 static int 1043 overlay_m_unicast(void *arg, const uint8_t *macaddr) 1044 { 1045 return (0); 1046 } 1047 1048 mblk_t * 1049 overlay_m_tx(void *arg, mblk_t *mp_chain) 1050 { 1051 overlay_dev_t *odd = arg; 1052 mblk_t *mp, *ep; 1053 int ret; 1054 ovep_encap_info_t einfo; 1055 struct msghdr hdr; 1056 1057 mutex_enter(&odd->odd_lock); 1058 if ((odd->odd_flags & OVERLAY_F_MDDROP) || 1059 !(odd->odd_flags & OVERLAY_F_IN_MUX)) { 1060 mutex_exit(&odd->odd_lock); 1061 freemsgchain(mp_chain); 1062 return (NULL); 1063 } 1064 overlay_io_start(odd, OVERLAY_F_IN_TX); 1065 mutex_exit(&odd->odd_lock); 1066 1067 bzero(&hdr, sizeof (struct msghdr)); 1068 1069 bzero(&einfo, sizeof (ovep_encap_info_t)); 1070 einfo.ovdi_id = odd->odd_vid; 1071 mp = mp_chain; 1072 while (mp != NULL) { 1073 socklen_t slen; 1074 struct sockaddr_storage storage; 1075 1076 mp_chain = mp->b_next; 1077 mp->b_next = NULL; 1078 ep = NULL; 1079 1080 ret = overlay_target_lookup(odd, mp, 1081 (struct sockaddr *)&storage, &slen); 1082 if (ret != OVERLAY_TARGET_OK) { 1083 if (ret == OVERLAY_TARGET_DROP) 1084 freemsg(mp); 1085 mp = mp_chain; 1086 continue; 1087 } 1088 1089 hdr.msg_name = &storage; 1090 hdr.msg_namelen = slen; 1091 1092 ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, 1093 &einfo, &ep); 1094 if (ret != 0 || ep == NULL) { 1095 freemsg(mp); 1096 goto out; 1097 } 1098 1099 ASSERT(ep->b_cont == mp || ep == mp); 1100 ret = overlay_mux_tx(odd->odd_mux, &hdr, ep); 1101 if (ret != 0) 1102 goto out; 1103 1104 mp = mp_chain; 1105 } 1106 1107 out: 1108 mutex_enter(&odd->odd_lock); 1109 overlay_io_done(odd, OVERLAY_F_IN_TX); 1110 mutex_exit(&odd->odd_lock); 1111 return (mp_chain); 1112 } 1113 1114 /* ARGSUSED */ 1115 static void 1116 overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 1117 { 1118 miocnak(q, mp, 0, ENOTSUP); 1119 } 1120 1121 /* ARGSUSED */ 1122 static boolean_t 1123 overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 1124 { 1125 /* 1126 * Tell 
MAC we're an overlay. 1127 */ 1128 if (cap == MAC_CAPAB_OVERLAY) 1129 return (B_TRUE); 1130 return (B_FALSE); 1131 } 1132 1133 /* ARGSUSED */ 1134 static int 1135 overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1136 uint_t pr_valsize, const void *pr_val) 1137 { 1138 uint32_t mtu, old; 1139 int err; 1140 overlay_dev_t *odd = arg; 1141 1142 if (pr_num != MAC_PROP_MTU) 1143 return (ENOTSUP); 1144 1145 bcopy(pr_val, &mtu, sizeof (mtu)); 1146 if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX) 1147 return (EINVAL); 1148 1149 mutex_enter(&odd->odd_lock); 1150 old = odd->odd_mtu; 1151 odd->odd_mtu = mtu; 1152 err = mac_maxsdu_update(odd->odd_mh, mtu); 1153 if (err != 0) 1154 odd->odd_mtu = old; 1155 mutex_exit(&odd->odd_lock); 1156 1157 return (err); 1158 } 1159 1160 /* ARGSUSED */ 1161 static int 1162 overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1163 uint_t pr_valsize, void *pr_val) 1164 { 1165 return (ENOTSUP); 1166 } 1167 1168 /* ARGSUSED */ 1169 static void 1170 overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1171 mac_prop_info_handle_t prh) 1172 { 1173 if (pr_num != MAC_PROP_MTU) 1174 return; 1175 1176 mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF); 1177 mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX); 1178 } 1179 1180 static mac_callbacks_t overlay_m_callbacks = { 1181 .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | 1182 MC_PROPINFO), 1183 .mc_getstat = overlay_m_stat, 1184 .mc_start = overlay_m_start, 1185 .mc_stop = overlay_m_stop, 1186 .mc_setpromisc = overlay_m_promisc, 1187 .mc_multicst = overlay_m_multicast, 1188 .mc_unicst = overlay_m_unicast, 1189 .mc_tx = overlay_m_tx, 1190 .mc_ioctl = overlay_m_ioctl, 1191 .mc_getcapab = overlay_m_getcapab, 1192 .mc_getprop = overlay_m_getprop, 1193 .mc_setprop = overlay_m_setprop, 1194 .mc_propinfo = overlay_m_propinfo 1195 }; 1196 1197 static boolean_t 1198 overlay_valid_name(const char *name, 
size_t buflen) 1199 { 1200 size_t actlen; 1201 int err, i; 1202 1203 for (i = 0; i < buflen; i++) { 1204 if (name[i] == '\0') 1205 break; 1206 } 1207 1208 if (i == 0 || i == buflen) 1209 return (B_FALSE); 1210 actlen = i; 1211 if (strchr(name, '/') != NULL) 1212 return (B_FALSE); 1213 if (u8_validate((char *)name, actlen, NULL, 1214 U8_VALIDATE_ENTIRE, &err) < 0) 1215 return (B_FALSE); 1216 return (B_TRUE); 1217 } 1218 1219 /* ARGSUSED */ 1220 static int 1221 overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) 1222 { 1223 int err; 1224 uint64_t maxid; 1225 overlay_dev_t *odd, *o; 1226 mac_register_t *mac; 1227 overlay_ioc_create_t *oicp = karg; 1228 1229 if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE) 1230 return (EINVAL); 1231 1232 odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP); 1233 odd->odd_linkid = oicp->oic_linkid; 1234 odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap); 1235 if (odd->odd_plugin == NULL) { 1236 kmem_free(odd, sizeof (overlay_dev_t)); 1237 return (ENOENT); 1238 } 1239 err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd, 1240 &odd->odd_pvoid); 1241 if (err != 0) { 1242 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1243 overlay_plugin_rele(odd->odd_plugin); 1244 kmem_free(odd, sizeof (overlay_dev_t)); 1245 return (EINVAL); 1246 } 1247 1248 /* 1249 * Make sure that our virtual network id is valid for the given plugin 1250 * that we're working with. 
1251 */ 1252 ASSERT(odd->odd_plugin->ovp_id_size <= 8); 1253 maxid = UINT64_MAX; 1254 if (odd->odd_plugin->ovp_id_size != 8) 1255 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL; 1256 if (oicp->oic_vnetid > maxid) { 1257 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1258 overlay_plugin_rele(odd->odd_plugin); 1259 kmem_free(odd, sizeof (overlay_dev_t)); 1260 return (EINVAL); 1261 } 1262 odd->odd_vid = oicp->oic_vnetid; 1263 1264 mac = mac_alloc(MAC_VERSION); 1265 if (mac == NULL) { 1266 mutex_exit(&overlay_dev_lock); 1267 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1268 overlay_plugin_rele(odd->odd_plugin); 1269 kmem_free(odd, sizeof (overlay_dev_t)); 1270 return (EINVAL); 1271 } 1272 1273 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1274 mac->m_driver = odd; 1275 mac->m_dip = overlay_dip; 1276 mac->m_dst_addr = NULL; 1277 mac->m_callbacks = &overlay_m_callbacks; 1278 mac->m_pdata = NULL; 1279 mac->m_pdata_size = 0; 1280 1281 mac->m_priv_props = NULL; 1282 1283 /* Let mac handle this itself. */ 1284 mac->m_instance = (uint_t)-1; 1285 1286 /* 1287 * There is no real source address that should be used here, but saying 1288 * that we're not ethernet is going to cause its own problems. At the 1289 * end of the say, this is fine. 1290 */ 1291 mac->m_src_addr = overlay_macaddr; 1292 1293 /* 1294 * Start with the default MTU as the max SDU. If the MTU is changed, the 1295 * SDU will be changed to reflect that. 1296 */ 1297 mac->m_min_sdu = 1; 1298 mac->m_max_sdu = OVERLAY_MTU_DEF; 1299 mac->m_multicast_sdu = 0; 1300 1301 /* 1302 * The underlying device doesn't matter, instead this comes from the 1303 * encapsulation protocol and whether or not they allow VLAN tags. 
1304 */ 1305 if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) { 1306 mac->m_margin = VLAN_TAGSZ; 1307 } else { 1308 mac->m_margin = 0; 1309 } 1310 1311 /* 1312 * Today, we have no MAC virtualization, it may make sense in the future 1313 * to go ahead and emulate some subset of this, but it doesn't today. 1314 */ 1315 mac->m_v12n = MAC_VIRT_NONE; 1316 1317 mutex_enter(&overlay_dev_lock); 1318 for (o = list_head(&overlay_dev_list); o != NULL; 1319 o = list_next(&overlay_dev_list, o)) { 1320 if (o->odd_linkid == oicp->oic_linkid) { 1321 mutex_exit(&overlay_dev_lock); 1322 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1323 overlay_plugin_rele(odd->odd_plugin); 1324 kmem_free(odd, sizeof (overlay_dev_t)); 1325 return (EEXIST); 1326 } 1327 1328 if (o->odd_vid == oicp->oic_vnetid && 1329 o->odd_plugin == odd->odd_plugin) { 1330 mutex_exit(&overlay_dev_lock); 1331 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1332 overlay_plugin_rele(odd->odd_plugin); 1333 kmem_free(odd, sizeof (overlay_dev_t)); 1334 return (EEXIST); 1335 } 1336 } 1337 1338 err = mac_register(mac, &odd->odd_mh); 1339 mac_free(mac); 1340 if (err != 0) { 1341 mutex_exit(&overlay_dev_lock); 1342 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1343 overlay_plugin_rele(odd->odd_plugin); 1344 kmem_free(odd, sizeof (overlay_dev_t)); 1345 return (err); 1346 } 1347 1348 err = dls_devnet_create(odd->odd_mh, odd->odd_linkid, 1349 crgetzoneid(cred)); 1350 if (err != 0) { 1351 mutex_exit(&overlay_dev_lock); 1352 (void) mac_unregister(odd->odd_mh); 1353 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1354 overlay_plugin_rele(odd->odd_plugin); 1355 kmem_free(odd, sizeof (overlay_dev_t)); 1356 return (err); 1357 } 1358 1359 mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL); 1360 cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL); 1361 odd->odd_ref = 0; 1362 odd->odd_flags = 0; 1363 list_insert_tail(&overlay_dev_list, odd); 1364 mutex_exit(&overlay_dev_lock); 1365 1366 return (0); 1367 } 1368 1369 
/* ARGSUSED */ 1370 static int 1371 overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) 1372 { 1373 int i, ret; 1374 overlay_dev_t *odd; 1375 mac_perim_handle_t mph; 1376 overlay_ioc_activate_t *oiap = karg; 1377 overlay_ioc_propinfo_t *infop; 1378 overlay_ioc_prop_t *oip; 1379 overlay_prop_handle_t phdl; 1380 1381 odd = overlay_hold_by_dlid(oiap->oia_linkid); 1382 if (odd == NULL) 1383 return (ENOENT); 1384 1385 infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP); 1386 oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP); 1387 phdl = (overlay_prop_handle_t)infop; 1388 1389 mac_perim_enter_by_mh(odd->odd_mh, &mph); 1390 mutex_enter(&odd->odd_lock); 1391 if (odd->odd_flags & OVERLAY_F_ACTIVATED) { 1392 mutex_exit(&odd->odd_lock); 1393 mac_perim_exit(mph); 1394 overlay_hold_rele(odd); 1395 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1396 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1397 return (EEXIST); 1398 } 1399 mutex_exit(&odd->odd_lock); 1400 1401 for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { 1402 const char *pname = odd->odd_plugin->ovp_props[i]; 1403 bzero(infop, sizeof (overlay_ioc_propinfo_t)); 1404 overlay_prop_init(phdl); 1405 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl); 1406 if (ret != 0) { 1407 mac_perim_exit(mph); 1408 overlay_hold_rele(odd); 1409 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1410 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1411 return (ret); 1412 } 1413 1414 if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0) 1415 continue; 1416 bzero(oip, sizeof (overlay_ioc_prop_t)); 1417 oip->oip_size = sizeof (oip->oip_value); 1418 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, 1419 pname, oip->oip_value, &oip->oip_size); 1420 if (ret != 0) { 1421 mac_perim_exit(mph); 1422 overlay_hold_rele(odd); 1423 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1424 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1425 return (ret); 1426 } 1427 if (oip->oip_size == 0) { 
1428 mac_perim_exit(mph); 1429 overlay_hold_rele(odd); 1430 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1431 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1432 return (EINVAL); 1433 } 1434 } 1435 1436 mutex_enter(&odd->odd_lock); 1437 if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) { 1438 mutex_exit(&odd->odd_lock); 1439 mac_perim_exit(mph); 1440 overlay_hold_rele(odd); 1441 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1442 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1443 return (ENXIO); 1444 } 1445 1446 ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0); 1447 odd->odd_flags |= OVERLAY_F_ACTIVATED; 1448 1449 /* 1450 * Now that we've activated ourselves, we should indicate to the world 1451 * that we're up. Note that we may not be able to perform lookups at 1452 * this time, but our notion of being 'up' isn't dependent on that 1453 * ability. 1454 */ 1455 mac_link_update(odd->odd_mh, LINK_STATE_UP); 1456 mutex_exit(&odd->odd_lock); 1457 1458 mac_perim_exit(mph); 1459 overlay_hold_rele(odd); 1460 kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); 1461 kmem_free(oip, sizeof (overlay_ioc_prop_t)); 1462 1463 return (0); 1464 } 1465 1466 /* ARGSUSED */ 1467 static int 1468 overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) 1469 { 1470 overlay_ioc_delete_t *oidp = karg; 1471 overlay_dev_t *odd; 1472 datalink_id_t tid; 1473 int ret; 1474 1475 odd = overlay_hold_by_dlid(oidp->oid_linkid); 1476 if (odd == NULL) { 1477 return (ENOENT); 1478 } 1479 1480 mutex_enter(&odd->odd_lock); 1481 /* If we're not the only hold, we're busy */ 1482 if (odd->odd_ref != 1) { 1483 mutex_exit(&odd->odd_lock); 1484 overlay_hold_rele(odd); 1485 return (EBUSY); 1486 } 1487 1488 if (odd->odd_flags & OVERLAY_F_IN_MUX) { 1489 mutex_exit(&odd->odd_lock); 1490 overlay_hold_rele(odd); 1491 return (EBUSY); 1492 } 1493 1494 /* 1495 * To remove this, we need to first remove it from dls and then remove 1496 * it from mac. 
The act of removing it from mac will check if there are 1497 * devices on top of this, eg. vnics. If there are, then that will fail 1498 * and we'll have to go through and recreate the dls entry. Only after 1499 * mac_unregister has succeeded, then we'll go through and actually free 1500 * everything and drop the dev lock. 1501 */ 1502 ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE); 1503 if (ret != 0) { 1504 overlay_hold_rele(odd); 1505 return (ret); 1506 } 1507 1508 ASSERT(oidp->oid_linkid == tid); 1509 ret = mac_disable(odd->odd_mh); 1510 if (ret != 0) { 1511 (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid, 1512 crgetzoneid(cred)); 1513 overlay_hold_rele(odd); 1514 return (ret); 1515 } 1516 1517 overlay_target_quiesce(odd->odd_target); 1518 1519 mutex_enter(&overlay_dev_lock); 1520 list_remove(&overlay_dev_list, odd); 1521 mutex_exit(&overlay_dev_lock); 1522 1523 cv_destroy(&odd->odd_iowait); 1524 mutex_destroy(&odd->odd_lock); 1525 overlay_target_free(odd); 1526 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); 1527 overlay_plugin_rele(odd->odd_plugin); 1528 kmem_free(odd, sizeof (overlay_dev_t)); 1529 1530 return (0); 1531 } 1532 1533 /* ARGSUSED */ 1534 static int 1535 overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred, 1536 int *rvalp) 1537 { 1538 overlay_dev_t *odd; 1539 overlay_ioc_nprops_t *on = karg; 1540 1541 odd = overlay_hold_by_dlid(on->oipn_linkid); 1542 if (odd == NULL) 1543 return (ENOENT); 1544 on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS; 1545 overlay_hold_rele(odd); 1546 1547 return (0); 1548 } 1549 1550 static int 1551 overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg) 1552 { 1553 overlay_prop_handle_t phdl = arg; 1554 overlay_prop_set_range_str(phdl, opp->ovp_name); 1555 return (0); 1556 } 1557 1558 static int 1559 overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id) 1560 { 1561 int i; 1562 1563 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { 1564 if 
(strcmp(overlay_dev_props[i], name) == 0) { 1565 *id = i; 1566 return (0); 1567 } 1568 } 1569 1570 for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { 1571 if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) { 1572 *id = i + OVERLAY_DEV_NPROPS; 1573 return (0); 1574 } 1575 } 1576 1577 return (ENOENT); 1578 } 1579 1580 static void 1581 overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl) 1582 { 1583 uint32_t def; 1584 mac_propval_range_t range; 1585 uint_t perm; 1586 1587 ASSERT(MAC_PERIM_HELD(odd->odd_mh)); 1588 1589 bzero(&range, sizeof (mac_propval_range_t)); 1590 range.mpr_count = 1; 1591 if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def, 1592 sizeof (def), &range, &perm) != 0) 1593 return; 1594 1595 if (perm == MAC_PROP_PERM_READ) 1596 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); 1597 else if (perm == MAC_PROP_PERM_WRITE) 1598 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE); 1599 else if (perm == MAC_PROP_PERM_RW) 1600 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); 1601 1602 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); 1603 overlay_prop_set_default(phdl, &def, sizeof (def)); 1604 overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min, 1605 range.mpr_range_uint32[0].mpur_max); 1606 } 1607 1608 /* ARGSUSED */ 1609 static int 1610 overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, 1611 int *rvalp) 1612 { 1613 overlay_dev_t *odd; 1614 int ret; 1615 mac_perim_handle_t mph; 1616 uint_t propid = UINT_MAX; 1617 overlay_ioc_propinfo_t *oip = karg; 1618 overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; 1619 1620 odd = overlay_hold_by_dlid(oip->oipi_linkid); 1621 if (odd == NULL) 1622 return (ENOENT); 1623 1624 overlay_prop_init(phdl); 1625 mac_perim_enter_by_mh(odd->odd_mh, &mph); 1626 1627 /* 1628 * If the id is -1, then the property that we're looking for is named in 1629 * oipi_name and we should fill in its id. 
Otherwise, we've been given 1630 * an id and we need to turn that into a name for our plugin's sake. The 1631 * id is our own fabrication for property discovery. 1632 */ 1633 if (oip->oipi_id == -1) { 1634 /* 1635 * Determine if it's a known generic property or it belongs to a 1636 * module by checking against the list of known names. 1637 */ 1638 oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0'; 1639 if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name, 1640 &propid)) != 0) { 1641 overlay_hold_rele(odd); 1642 mac_perim_exit(mph); 1643 return (ret); 1644 } 1645 oip->oipi_id = propid; 1646 if (propid >= OVERLAY_DEV_NPROPS) { 1647 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( 1648 oip->oipi_name, phdl); 1649 overlay_hold_rele(odd); 1650 mac_perim_exit(mph); 1651 return (ret); 1652 1653 } 1654 } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) { 1655 uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS; 1656 1657 if (id >= odd->odd_plugin->ovp_nprops) { 1658 overlay_hold_rele(odd); 1659 mac_perim_exit(mph); 1660 return (EINVAL); 1661 } 1662 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( 1663 odd->odd_plugin->ovp_props[id], phdl); 1664 overlay_hold_rele(odd); 1665 mac_perim_exit(mph); 1666 return (ret); 1667 } else if (oip->oipi_id < -1) { 1668 overlay_hold_rele(odd); 1669 mac_perim_exit(mph); 1670 return (EINVAL); 1671 } else { 1672 ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS); 1673 ASSERT(oip->oipi_id >= 0); 1674 propid = oip->oipi_id; 1675 (void) strlcpy(oip->oipi_name, overlay_dev_props[propid], 1676 sizeof (oip->oipi_name)); 1677 } 1678 1679 switch (propid) { 1680 case OVERLAY_DEV_P_MTU: 1681 overlay_i_propinfo_mtu(odd, phdl); 1682 break; 1683 case OVERLAY_DEV_P_VNETID: 1684 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); 1685 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); 1686 overlay_prop_set_nodefault(phdl); 1687 break; 1688 case OVERLAY_DEV_P_ENCAP: 1689 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); 1690 overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING); 1691 
overlay_prop_set_nodefault(phdl); 1692 overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl); 1693 break; 1694 case OVERLAY_DEV_P_VARPDID: 1695 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); 1696 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); 1697 overlay_prop_set_nodefault(phdl); 1698 break; 1699 default: 1700 overlay_hold_rele(odd); 1701 mac_perim_exit(mph); 1702 return (ENOENT); 1703 } 1704 1705 overlay_hold_rele(odd); 1706 mac_perim_exit(mph); 1707 return (0); 1708 } 1709 1710 /* ARGSUSED */ 1711 static int 1712 overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, 1713 int *rvalp) 1714 { 1715 int ret; 1716 overlay_dev_t *odd; 1717 mac_perim_handle_t mph; 1718 overlay_ioc_prop_t *oip = karg; 1719 uint_t propid, mtu; 1720 1721 odd = overlay_hold_by_dlid(oip->oip_linkid); 1722 if (odd == NULL) 1723 return (ENOENT); 1724 1725 mac_perim_enter_by_mh(odd->odd_mh, &mph); 1726 oip->oip_size = OVERLAY_PROP_SIZEMAX; 1727 oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; 1728 if (oip->oip_id == -1) { 1729 int i; 1730 1731 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { 1732 if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) 1733 break; 1734 if (i == OVERLAY_DEV_NPROPS) { 1735 ret = odd->odd_plugin->ovp_ops->ovpo_getprop( 1736 odd->odd_pvoid, oip->oip_name, 1737 oip->oip_value, &oip->oip_size); 1738 overlay_hold_rele(odd); 1739 mac_perim_exit(mph); 1740 return (ret); 1741 } 1742 } 1743 1744 propid = i; 1745 } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { 1746 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; 1747 1748 if (id > odd->odd_plugin->ovp_nprops) { 1749 overlay_hold_rele(odd); 1750 mac_perim_exit(mph); 1751 return (EINVAL); 1752 } 1753 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, 1754 odd->odd_plugin->ovp_props[id], oip->oip_value, 1755 &oip->oip_size); 1756 overlay_hold_rele(odd); 1757 mac_perim_exit(mph); 1758 return (ret); 1759 } else if (oip->oip_id < -1) { 1760 overlay_hold_rele(odd); 1761 mac_perim_exit(mph); 1762 return 
(EINVAL); 1763 } else { 1764 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); 1765 ASSERT(oip->oip_id >= 0); 1766 propid = oip->oip_id; 1767 } 1768 1769 ret = 0; 1770 switch (propid) { 1771 case OVERLAY_DEV_P_MTU: 1772 /* 1773 * The MTU is always set and retrieved through MAC, to allow for 1774 * MAC to do whatever it wants, as really that property belongs 1775 * to MAC. This is important for things where vnics have hold on 1776 * the MTU. 1777 */ 1778 mac_sdu_get(odd->odd_mh, NULL, &mtu); 1779 bcopy(&mtu, oip->oip_value, sizeof (uint_t)); 1780 oip->oip_size = sizeof (uint_t); 1781 break; 1782 case OVERLAY_DEV_P_VNETID: 1783 /* 1784 * While it's read-only while inside of a mux, we're not in a 1785 * context that can guarantee that. Therefore we always grab the 1786 * overlay_dev_t's odd_lock. 1787 */ 1788 mutex_enter(&odd->odd_lock); 1789 bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t)); 1790 mutex_exit(&odd->odd_lock); 1791 oip->oip_size = sizeof (uint64_t); 1792 break; 1793 case OVERLAY_DEV_P_ENCAP: 1794 oip->oip_size = strlcpy((char *)oip->oip_value, 1795 odd->odd_plugin->ovp_name, oip->oip_size); 1796 break; 1797 case OVERLAY_DEV_P_VARPDID: 1798 mutex_enter(&odd->odd_lock); 1799 if (odd->odd_flags & OVERLAY_F_VARPD) { 1800 const uint64_t val = odd->odd_target->ott_id; 1801 bcopy(&val, oip->oip_value, sizeof (uint64_t)); 1802 oip->oip_size = sizeof (uint64_t); 1803 } else { 1804 oip->oip_size = 0; 1805 } 1806 mutex_exit(&odd->odd_lock); 1807 break; 1808 default: 1809 ret = ENOENT; 1810 } 1811 1812 overlay_hold_rele(odd); 1813 mac_perim_exit(mph); 1814 return (ret); 1815 } 1816 1817 static void 1818 overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) 1819 { 1820 mutex_enter(&odd->odd_lock); 1821 1822 /* Simple case, not active */ 1823 if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { 1824 odd->odd_vid = vnetid; 1825 mutex_exit(&odd->odd_lock); 1826 return; 1827 } 1828 1829 /* 1830 * In the hard case, we need to set the drop flag, quiesce I/O and then 1831 * we 
can go ahead and do everything. 1832 */ 1833 odd->odd_flags |= OVERLAY_F_MDDROP; 1834 overlay_io_wait(odd, OVERLAY_F_IOMASK); 1835 mutex_exit(&odd->odd_lock); 1836 1837 overlay_mux_remove_dev(odd->odd_mux, odd); 1838 1839 mutex_enter(&odd->odd_lock); 1840 odd->odd_vid = vnetid; 1841 mutex_exit(&odd->odd_lock); 1842 1843 overlay_mux_add_dev(odd->odd_mux, odd); 1844 1845 mutex_enter(&odd->odd_lock); 1846 ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX); 1847 odd->odd_flags &= ~OVERLAY_F_MDDROP; 1848 mutex_exit(&odd->odd_lock); 1849 } 1850 1851 /* ARGSUSED */ 1852 static int 1853 overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, 1854 int *rvalp) 1855 { 1856 int ret; 1857 overlay_dev_t *odd; 1858 overlay_ioc_prop_t *oip = karg; 1859 uint_t propid = UINT_MAX; 1860 mac_perim_handle_t mph; 1861 uint64_t maxid, *vidp; 1862 1863 if (oip->oip_size > OVERLAY_PROP_SIZEMAX) 1864 return (EINVAL); 1865 1866 odd = overlay_hold_by_dlid(oip->oip_linkid); 1867 if (odd == NULL) 1868 return (ENOENT); 1869 1870 oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; 1871 mac_perim_enter_by_mh(odd->odd_mh, &mph); 1872 mutex_enter(&odd->odd_lock); 1873 if (odd->odd_flags & OVERLAY_F_ACTIVATED) { 1874 mac_perim_exit(mph); 1875 mutex_exit(&odd->odd_lock); 1876 return (ENOTSUP); 1877 } 1878 mutex_exit(&odd->odd_lock); 1879 if (oip->oip_id == -1) { 1880 int i; 1881 1882 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { 1883 if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) 1884 break; 1885 if (i == OVERLAY_DEV_NPROPS) { 1886 ret = odd->odd_plugin->ovp_ops->ovpo_setprop( 1887 odd->odd_pvoid, oip->oip_name, 1888 oip->oip_value, oip->oip_size); 1889 overlay_hold_rele(odd); 1890 mac_perim_exit(mph); 1891 return (ret); 1892 } 1893 } 1894 1895 propid = i; 1896 } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { 1897 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; 1898 1899 if (id > odd->odd_plugin->ovp_nprops) { 1900 mac_perim_exit(mph); 1901 overlay_hold_rele(odd); 1902 return (EINVAL); 1903 } 1904 ret = 
odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid, 1905 odd->odd_plugin->ovp_props[id], oip->oip_value, 1906 oip->oip_size); 1907 mac_perim_exit(mph); 1908 overlay_hold_rele(odd); 1909 return (ret); 1910 } else if (oip->oip_id < -1) { 1911 mac_perim_exit(mph); 1912 overlay_hold_rele(odd); 1913 return (EINVAL); 1914 } else { 1915 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); 1916 ASSERT(oip->oip_id >= 0); 1917 propid = oip->oip_id; 1918 } 1919 1920 ret = 0; 1921 switch (propid) { 1922 case OVERLAY_DEV_P_MTU: 1923 ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu", 1924 oip->oip_value, oip->oip_size); 1925 break; 1926 case OVERLAY_DEV_P_VNETID: 1927 if (oip->oip_size != sizeof (uint64_t)) { 1928 ret = EINVAL; 1929 break; 1930 } 1931 vidp = (uint64_t *)oip->oip_value; 1932 ASSERT(odd->odd_plugin->ovp_id_size <= 8); 1933 maxid = UINT64_MAX; 1934 if (odd->odd_plugin->ovp_id_size != 8) 1935 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1936 1ULL; 1937 if (*vidp >= maxid) { 1938 ret = EINVAL; 1939 break; 1940 } 1941 overlay_setprop_vnetid(odd, *vidp); 1942 break; 1943 case OVERLAY_DEV_P_ENCAP: 1944 case OVERLAY_DEV_P_VARPDID: 1945 ret = EPERM; 1946 break; 1947 default: 1948 ret = ENOENT; 1949 } 1950 1951 mac_perim_exit(mph); 1952 overlay_hold_rele(odd); 1953 return (ret); 1954 } 1955 1956 /* ARGSUSED */ 1957 static int 1958 overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred, 1959 int *rvalp) 1960 { 1961 overlay_dev_t *odd; 1962 overlay_ioc_status_t *os = karg; 1963 1964 odd = overlay_hold_by_dlid(os->ois_linkid); 1965 if (odd == NULL) 1966 return (ENOENT); 1967 1968 mutex_enter(&odd->odd_lock); 1969 if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) { 1970 os->ois_status = OVERLAY_I_DEGRADED; 1971 (void) strlcpy(os->ois_message, odd->odd_fmamsg, 1972 OVERLAY_STATUS_BUFLEN); 1973 } else { 1974 os->ois_status = OVERLAY_I_OK; 1975 os->ois_message[0] = '\0'; 1976 } 1977 mutex_exit(&odd->odd_lock); 1978 overlay_hold_rele(odd); 1979 1980 return (0); 1981 } 
1982 1983 static dld_ioc_info_t overlay_ioc_list[] = { 1984 { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t), 1985 overlay_i_create, secpolicy_dl_config }, 1986 { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t), 1987 overlay_i_activate, secpolicy_dl_config }, 1988 { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t), 1989 overlay_i_delete, secpolicy_dl_config }, 1990 { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT, 1991 sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo, 1992 secpolicy_dl_config }, 1993 { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT, 1994 sizeof (overlay_ioc_prop_t), overlay_i_getprop, 1995 secpolicy_dl_config }, 1996 { OVERLAY_IOC_SETPROP, DLDCOPYIN, 1997 sizeof (overlay_ioc_prop_t), overlay_i_setprop, 1998 secpolicy_dl_config }, 1999 { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT, 2000 sizeof (overlay_ioc_nprops_t), overlay_i_nprops, 2001 secpolicy_dl_config }, 2002 { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT, 2003 sizeof (overlay_ioc_status_t), overlay_i_status, 2004 NULL } 2005 }; 2006 2007 static int 2008 overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2009 { 2010 int fmcap = DDI_FM_EREPORT_CAPABLE; 2011 if (cmd != DDI_ATTACH) 2012 return (DDI_FAILURE); 2013 2014 if (overlay_dip != NULL || ddi_get_instance(dip) != 0) 2015 return (DDI_FAILURE); 2016 2017 ddi_fm_init(dip, &fmcap, NULL); 2018 2019 if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR, 2020 ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) 2021 return (DDI_FAILURE); 2022 2023 if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list, 2024 DLDIOCCNT(overlay_ioc_list)) != 0) { 2025 ddi_remove_minor_node(dip, OVERLAY_CTL); 2026 return (DDI_FAILURE); 2027 } 2028 2029 overlay_dip = dip; 2030 return (DDI_SUCCESS); 2031 } 2032 2033 /* ARGSUSED */ 2034 static int 2035 overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) 2036 { 2037 int error; 2038 2039 switch (cmd) { 2040 case DDI_INFO_DEVT2DEVINFO: 2041 *resp = (void 
*)overlay_dip; 2042 error = DDI_SUCCESS; 2043 break; 2044 case DDI_INFO_DEVT2INSTANCE: 2045 *resp = (void *)0; 2046 error = DDI_SUCCESS; 2047 break; 2048 default: 2049 error = DDI_FAILURE; 2050 break; 2051 } 2052 2053 return (error); 2054 } 2055 2056 static int 2057 overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2058 { 2059 if (cmd != DDI_DETACH) 2060 return (DDI_FAILURE); 2061 2062 mutex_enter(&overlay_dev_lock); 2063 if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) { 2064 mutex_exit(&overlay_dev_lock); 2065 return (EBUSY); 2066 } 2067 mutex_exit(&overlay_dev_lock); 2068 2069 2070 dld_ioc_unregister(OVERLAY_IOC); 2071 ddi_remove_minor_node(dip, OVERLAY_CTL); 2072 ddi_fm_fini(dip); 2073 overlay_dip = NULL; 2074 return (DDI_SUCCESS); 2075 } 2076 2077 static struct cb_ops overlay_cbops = { 2078 overlay_target_open, /* cb_open */ 2079 overlay_target_close, /* cb_close */ 2080 nodev, /* cb_strategy */ 2081 nodev, /* cb_print */ 2082 nodev, /* cb_dump */ 2083 nodev, /* cb_read */ 2084 nodev, /* cb_write */ 2085 overlay_target_ioctl, /* cb_ioctl */ 2086 nodev, /* cb_devmap */ 2087 nodev, /* cb_mmap */ 2088 nodev, /* cb_segmap */ 2089 nochpoll, /* cb_chpoll */ 2090 ddi_prop_op, /* cb_prop_op */ 2091 NULL, /* cb_stream */ 2092 D_MP, /* cb_flag */ 2093 CB_REV, /* cb_rev */ 2094 nodev, /* cb_aread */ 2095 nodev, /* cb_awrite */ 2096 }; 2097 2098 static struct dev_ops overlay_dev_ops = { 2099 DEVO_REV, /* devo_rev */ 2100 0, /* devo_refcnt */ 2101 overlay_getinfo, /* devo_getinfo */ 2102 nulldev, /* devo_identify */ 2103 nulldev, /* devo_probe */ 2104 overlay_attach, /* devo_attach */ 2105 overlay_detach, /* devo_detach */ 2106 nulldev, /* devo_reset */ 2107 &overlay_cbops, /* devo_cb_ops */ 2108 NULL, /* devo_bus_ops */ 2109 NULL, /* devo_power */ 2110 ddi_quiesce_not_supported /* devo_quiesce */ 2111 }; 2112 2113 static struct modldrv overlay_modldrv = { 2114 &mod_driverops, 2115 "Overlay Network Driver", 2116 &overlay_dev_ops 2117 }; 2118 2119 static 
struct modlinkage overlay_linkage = { 2120 MODREV_1, 2121 &overlay_modldrv 2122 }; 2123 2124 static int 2125 overlay_init(void) 2126 { 2127 mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL); 2128 list_create(&overlay_dev_list, sizeof (overlay_dev_t), 2129 offsetof(overlay_dev_t, odd_link)); 2130 overlay_mux_init(); 2131 overlay_plugin_init(); 2132 overlay_target_init(); 2133 2134 return (DDI_SUCCESS); 2135 } 2136 2137 static void 2138 overlay_fini(void) 2139 { 2140 overlay_target_fini(); 2141 overlay_plugin_fini(); 2142 overlay_mux_fini(); 2143 mutex_destroy(&overlay_dev_lock); 2144 list_destroy(&overlay_dev_list); 2145 } 2146 2147 int 2148 _init(void) 2149 { 2150 int err; 2151 2152 if ((err = overlay_init()) != DDI_SUCCESS) 2153 return (err); 2154 2155 mac_init_ops(NULL, "overlay"); 2156 err = mod_install(&overlay_linkage); 2157 if (err != DDI_SUCCESS) { 2158 overlay_fini(); 2159 return (err); 2160 } 2161 2162 return (0); 2163 } 2164 2165 int 2166 _info(struct modinfo *modinfop) 2167 { 2168 return (mod_info(&overlay_linkage, modinfop)); 2169 } 2170 2171 int 2172 _fini(void) 2173 { 2174 int err; 2175 2176 err = mod_remove(&overlay_linkage); 2177 if (err != 0) 2178 return (err); 2179 2180 overlay_fini(); 2181 return (0); 2182 } 2183