/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * MAC Services Module
 *
 * The GLDv3 framework locking - The MAC layer
 * --------------------------------------------
 *
 * The MAC layer is central to the GLD framework and can provide the locking
 * framework needed for itself and for the use of MAC clients. MAC end points
 * are fairly disjoint and don't share a lot of state. So a coarse-grained
 * multi-threading scheme is to single-thread all create/modify/delete (set)
 * control operations on a per mac end point basis while allowing data threads
 * to proceed concurrently.
 *
 * Control operations (set) that modify a mac end point are always serialized
 * on a per mac end point basis. We have at most one such thread per mac end
 * point at a time.
 *
 * All other operations that are not serialized are essentially multi-threaded.
 * For example, a control operation (get) like getting statistics may not care
 * about reading values atomically, and data threads may be sending or
 * receiving data concurrently. Mostly these types of operations don't modify
 * the control state. Any state these operations care about is protected using
 * traditional locks.
 *
 * The perimeter only serializes serial operations. It does not imply there
 * aren't any other concurrent operations. However a serialized operation may
 * sometimes need to make sure it is the only thread. In this case it needs
 * to use reference counting mechanisms to cv_wait until any current data
 * threads are done.
 *
 * The mac layer itself does not hold any locks across a call to another layer.
 * The perimeter is however held across a down call to the driver to make the
 * whole control operation atomic with respect to other control operations.
 * Also the data path and get type control operations may proceed concurrently.
 * These operations synchronize with the single serial operation on a given mac
 * end point using regular locks. The perimeter ensures that conflicting
 * operations like say a mac_multicast_add and a mac_multicast_remove on the
 * same mac end point don't interfere with each other and also ensures that the
 * changes in the mac layer and the call to the underlying driver to say add a
 * multicast address are done atomically without interference from a thread
 * trying to delete the same address.
 *
 * For example, consider
 * mac_multicast_add()
 * {
 *	mac_perimeter_enter();		serialize all control operations
 *
 *	grab list lock			protect against access by data threads
 *	add to list
 *	drop list lock
 *
 *	call driver's mi_multicst
 *
 *	mac_perimeter_exit();
 * }
 *
 * To lessen the number of serialization locks and simplify the lock hierarchy,
 * we serialize all the control operations on a per mac end point basis by
 * using a single serialization lock called the perimeter. We allow recursive
 * entry into the perimeter to facilitate use of this mechanism by both the mac
 * client and the MAC layer itself.
 *
 * MAC client means an entity that does an operation on a mac handle
 * obtained from a mac_open/mac_client_open. Similarly MAC driver means
 * an entity that does an operation on a mac handle obtained from a
 * mac_register. An entity could be both client and driver but on different
 * handles (e.g. aggr) and should only make the corresponding mac interface
 * calls, i.e. mac driver interface or mac client interface, as appropriate
 * for that mac handle.
 *
 * General rules.
 * -------------
 *
 * R1. The lock order of upcall threads is naturally opposite to downcall
 * threads. Hence upcalls must not hold any locks across layers for fear of
 * recursive lock enter and lock order violation. This applies to all layers.
 *
 * R2. The perimeter is just another lock. Since it is held in the down
 * direction, acquiring the perimeter in an upcall is prohibited as it would
 * cause a deadlock. This applies to all layers.
 *
 * Note that upcalls that need to grab the mac perimeter (for example
 * mac_notify upcalls) can still achieve that by posting the request to a
 * thread, which can then grab all the required perimeters and locks in the
 * right global order. Note that in the above example the mac layer itself
 * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
 * to the client must do that. Please see the aggr code for an example.
 *
 * MAC client rules
 * ----------------
 *
 * R3. A MAC client may use the MAC provided perimeter facility to serialize
 * control operations on a per mac end point basis. It does this by acquiring
 * and holding the perimeter across a sequence of calls to the mac layer.
 * This ensures atomicity across the entire block of mac calls. In this
 * model the MAC client must not hold any client locks across the calls to
 * the mac layer. This model is the preferred solution.
 *
 * R4. However if a MAC client has a lot of global state across all mac end
 * points the per mac end point serialization may not be sufficient. In this
 * case the client may choose to use global locks or use its own serialization.
 * To avoid deadlocks, these client layer locks held across the mac calls
 * in the control path must never be acquired by the data path for the reason
 * mentioned below.
 *
 * (Assume that a control operation that holds a client lock blocks in the
 * mac layer waiting for upcall reference counts to drop to zero. If an upcall
 * data thread that holds this reference count then tries to acquire the same
 * client lock, it will deadlock).
 *
 * A MAC client may follow either the R3 model or the R4 model, but can't
 * mix both. In the former, the hierarchy is Perim -> client locks, but in
 * the latter it is client locks -> Perim.
 *
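 * As an illustrative sketch (hypothetical, not code in this file), an
 * R3-style client could bracket a sequence of mac calls with the perimeter
 * entry points defined later in this file; mch, addr1 and addr2 are
 * placeholders:
 *
 *	mac_perim_enter_by_mh(mh, &mph);
 *	(void) mac_multicast_add(mch, addr1);
 *	(void) mac_multicast_add(mch, addr2);
 *	mac_perim_exit(mph);
 *
 * No client locks are held across the block, so the two adds are atomic
 * with respect to other control operations on the same mac end point.
 *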
 * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
 * context since they may block while trying to acquire the perimeter.
 * In addition some calls may block waiting for upcall refcnts to come down to
 * zero.
 *
 * R6. MAC clients must make sure that they are single threaded and all threads
 * from the top (in particular data threads) have finished before calling
 * mac_client_close. The MAC framework does not track the number of client
 * threads using the mac client handle. Also mac clients must make sure
 * they have undone all the control operations before calling mac_client_close.
 * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
 * mac_unicast_add/mac_multicast_add.
 *
 * MAC framework rules
 * -------------------
 *
 * R7. The mac layer itself must not hold any mac layer locks (except the mac
 * perimeter) across a call to any other layer from the mac layer. The call to
 * any other layer could be via mi_* entry points, classifier entry points into
 * the driver or via upcall pointers into layers above. The mac perimeter may
 * be acquired or held only in the down direction, e.g. when calling into
 * an mi_* driver entry point to provide atomicity of the operation.
 *
 * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
 * mac driver interfaces, the MAC layer must provide a cut out for control
 * interfaces like upcall notifications and start them in a separate thread.
 *
 * R9. Note that locking order also implies a plumbing order. For example
 * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
 * to plumb in any other order must be failed at mac_open time, otherwise it
 * could lead to deadlocks due to inverse locking order.
 *
 * R10. MAC driver interfaces must not block since the driver could call them
 * in interrupt context.
 *
 * R11. Walkers must preferably not hold any locks while calling walker
 * callbacks. Instead these can operate on reference counts. In simple
 * callbacks it may be ok to hold a lock and call the callbacks, but this is
 * harder to maintain in the general case of arbitrary callbacks.
 *
 * R12. The MAC layer must protect upcall notification callbacks using reference
 * counts rather than holding locks across the callbacks.
 *
 * R13. Given the variety of drivers, it is preferable if the MAC layer can make
 * sure that any pointers (such as mac ring pointers) it passes to the driver
 * remain valid until mac unregister time. Currently the mac layer achieves
 * this by using generation numbers for rings and freeing the mac rings only
 * at unregister time. The MAC layer must provide a layer of indirection and
 * must not expose underlying driver rings or driver data structures/pointers
 * directly to MAC clients.
 *
 * MAC driver rules
 * ----------------
 *
 * R14. It would be preferable if MAC drivers don't hold any locks across any
 * mac call. However at a minimum they must not hold any locks across data
 * upcalls. They must also make sure that all references to mac data structures
 * are cleaned up and that it is single threaded at mac_unregister time.
 *
 * R15. MAC driver interfaces don't block and so the action may be done
 * asynchronously in a separate thread, as for example handling notifications.
 * The driver must not assume that the action is complete when the call
 * returns.
 *
 * R16. Drivers must maintain a generation number per Rx ring, and pass it
 * back to mac_rx_ring(). They are expected to increment the generation
 * number whenever the ring's stop routine is invoked.
 * See comments in mac_rx_ring().
 *
 * R17. Similarly mi_stop is another synchronization point and the driver must
 * ensure that all upcalls are done and there won't be any future upcall
 * before returning from mi_stop.
 *
 * R18. The driver may assume that all set/modify control operations via
 * the mi_* entry points are single threaded on a per mac end point.
 *
 * Lock and Perimeter hierarchy scenarios
 * ---------------------------------------
 *
 * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
 *
 * ft_lock -> fe_lock [mac_flow_lookup]
 *
 * mi_rw_lock -> fe_lock [mac_bcast_send]
 *
 * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
 *
 * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
 *
 * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
 *
 * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
 * client to driver. In the case of clients that explicitly use the mac
 * provided perimeter mechanism for their serialization, the hierarchy is
 * Perimeter -> mac layer locks, since the client never holds any locks across
 * the mac calls. In the case of clients that use their own locks the hierarchy
 * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
 * calls mac_perim_enter/exit in this case.
 *
 * Subflow creation rules
 * ---------------------------
 * o If a user-specified cpulist is present on both the underlying link and
 *   a flow, the flow's cpulist must be a subset of the underlying link's.
 * o If a user-specified fanout mode is present on both link and flow, the
 *   subflow's fanout count has to be less than or equal to that of the
 *   underlying link. The cpu-bindings for the subflows will be a subset of
 *   the underlying link's.
 * o If no cpulist is specified on either the underlying link or the flow,
 *   the underlying link relies on a MAC tunable to provide out-of-the-box
 *   fanout. The subflow will have no cpulist (the subflow will be unbound).
 * o If no cpulist is specified on the underlying link, a subflow can carry
 *   either a user-specified cpulist or a fanout count. The cpu-bindings for
 *   the subflow need not be a subset of the underlying link's.
 * o If the underlying link carries either a user-specified cpulist or fanout
 *   mode and the subflow is unspecified, the subflow will be created unbound.
 * o While creating unbound subflows, bandwidth mode changes attempt to
 *   figure out a right fanout count. In such cases the fanout count will
 *   override the unbound cpu-binding behavior.
 * o In addition, while cycling between flow and link properties, we impose
 *   the restriction that if a link has a subflow with user-specified
 *   attributes, we will not allow changing the link property. The
 *   administrator needs to reset all the user-specified properties for the
 *   subflows before attempting a link property change.
 * Some of the above rules can be overridden by specifying additional command
 * line options while creating or modifying link or subflow properties.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/id_space.h>
#include <sys/esunddi.h>
#include <sys/stat.h>
#include <sys/mkdev.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/modhash.h>
#include <sys/mac_provider.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_impl.h>
#include <sys/mac.h>
#include <sys/dls.h>
#include <sys/dld.h>
#include <sys/modctl.h>
#include <sys/fs/dv_node.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/callb.h>
#include <sys/cpuvar.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/sdt.h>
#include <sys/mac_flow.h>
#include <sys/ddi_intr_impl.h>
#include <sys/disp.h>
#include <sys/vnic.h>
#include <sys/vnic_impl.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <sys/exacct.h>
#include <sys/exacct_impl.h>
#include <inet/nd.h>
#include <sys/ethernet.h>

#define	IMPL_HASHSZ	67	/* prime */

kmem_cache_t		*i_mac_impl_cachep;
mod_hash_t		*i_mac_impl_hash;
krwlock_t		i_mac_impl_lock;
uint_t			i_mac_impl_count;
static kmem_cache_t	*mac_ring_cache;
static id_space_t	*minor_ids;
static uint32_t		minor_count;

/*
 * Logging stuff. Perhaps mac_logging_interval could be broken into
 * mac_flow_log_interval and mac_link_log_interval if we want to be
 * able to schedule them differently.
 */
uint_t			mac_logging_interval;
boolean_t		mac_flow_log_enable;
boolean_t		mac_link_log_enable;
timeout_id_t		mac_logging_timer;

/* for debugging, see MAC_DBG_PRT() in mac_impl.h */
int mac_dbg = 0;

#define	MACTYPE_KMODDIR	"mac"
#define	MACTYPE_HASHSZ	67
static mod_hash_t	*i_mactype_hash;
/*
 * i_mactype_lock synchronizes threads that obtain references to mactype_t
 * structures through i_mactype_getplugin().
 */
static kmutex_t		i_mactype_lock;

/*
 * mac_tx_percpu_cnt
 *
 * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
 * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
 * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
 * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
 */
int mac_tx_percpu_cnt;
int mac_tx_percpu_cnt_max = 128;
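/*
 * A hypothetical sketch of how a transmit path might pick its per cpu lock
 * using the 2**N - 1 value computed in mac_init() below (the actual macros
 * live with the mac client code; the exact indexing shown here is an
 * assumption for illustration only):
 *
 *	index = CPU->cpu_seqid & mac_tx_percpu_cnt;
 *	mutex_enter(&mcip->mci_tx_pcpu[index].pcpu_tx_lock);
 *	mcip->mci_tx_pcpu[index].pcpu_tx_refcnt++;
 *	mutex_exit(&mcip->mci_tx_pcpu[index].pcpu_tx_lock);
 *
 * Keeping the count of the form 2**N - 1 lets it serve directly as a mask.
 */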
static int i_mac_constructor(void *, void *, int);
static void i_mac_destructor(void *, void *);
static int i_mac_ring_ctor(void *, void *, int);
static void i_mac_ring_dtor(void *, void *);
static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
void mac_tx_client_flush(mac_client_impl_t *);
void mac_tx_client_block(mac_client_impl_t *);
static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
static int mac_start_group_and_rings(mac_group_t *);
static void mac_stop_group_and_rings(mac_group_t *);

/*
 * Module initialization functions.
 */

void
mac_init(void)
{
	mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
	    boot_max_ncpus);

	/* Upper bound is mac_tx_percpu_cnt_max */
	if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
		mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;

	if (mac_tx_percpu_cnt < 1) {
		/* Someone set mac_tx_percpu_cnt_max to 0 or less */
		mac_tx_percpu_cnt = 1;
	}

	ASSERT(mac_tx_percpu_cnt >= 1);
	mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
	/*
	 * Make it of the form 2**N - 1 in the range
	 * [0 .. mac_tx_percpu_cnt_max - 1]
	 */
	mac_tx_percpu_cnt--;

	i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
	    sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
	    NULL, NULL, NULL, 0);
	ASSERT(i_mac_impl_cachep != NULL);

	mac_ring_cache = kmem_cache_create("mac_ring_cache",
	    sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
	    NULL, NULL, 0);
	ASSERT(mac_ring_cache != NULL);

	i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);

	mac_flow_init();
	mac_soft_ring_init();
	mac_bcast_init();
	mac_client_init();

	i_mac_impl_count = 0;

	i_mactype_hash = mod_hash_create_extended("mactype_hash",
	    MACTYPE_HASHSZ,
	    mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);

	/*
	 * Allocate an id space to manage minor numbers. The range of the
	 * space will be from MAC_MAX_MINOR+1 to MAXMIN32 (the maximum legal
	 * minor number is MAXMIN, but id_t is an integer type and cannot
	 * accommodate MAXMIN).
	 */
	minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1, MAXMIN32);
	ASSERT(minor_ids != NULL);
	minor_count = 0;

	/* Let's default to 20 seconds */
	mac_logging_interval = 20;
	mac_flow_log_enable = B_FALSE;
	mac_link_log_enable = B_FALSE;
	mac_logging_timer = 0;
}

int
mac_fini(void)
{
	if (i_mac_impl_count > 0 || minor_count > 0)
		return (EBUSY);

	id_space_destroy(minor_ids);
	mac_flow_fini();

	mod_hash_destroy_hash(i_mac_impl_hash);
	rw_destroy(&i_mac_impl_lock);

	mac_client_fini();
	kmem_cache_destroy(mac_ring_cache);

	mod_hash_destroy_hash(i_mactype_hash);
	mac_soft_ring_finish();
	return (0);
}

void
mac_init_ops(struct dev_ops *ops, const char *name)
{
	dld_init_ops(ops, name);
}

void
mac_fini_ops(struct dev_ops *ops)
{
	dld_fini_ops(ops);
}

/*ARGSUSED*/
static int
i_mac_constructor(void *buf, void *arg, int kmflag)
{
	mac_impl_t	*mip = buf;

	bzero(buf, sizeof (mac_impl_t));

	mip->mi_linkstate = LINK_STATE_UNKNOWN;
	mip->mi_nclients = 0;

	mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL);
	rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
	mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);

	mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
	cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
	mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
	cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
	return (0);
}
/*ARGSUSED*/
static void
i_mac_destructor(void *buf, void *arg)
{
	mac_impl_t	*mip = buf;
	mac_cb_info_t	*mcbi;

	ASSERT(mip->mi_ref == 0);
	ASSERT(mip->mi_active == 0);
	ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
	ASSERT(mip->mi_devpromisc == 0);
	ASSERT(mip->mi_promisc == 0);
	ASSERT(mip->mi_ksp == NULL);
	ASSERT(mip->mi_kstat_count == 0);
	ASSERT(mip->mi_nclients == 0);
	ASSERT(mip->mi_nactiveclients == 0);
	ASSERT(mip->mi_single_active_client == NULL);
	ASSERT(mip->mi_state_flags == 0);
	ASSERT(mip->mi_factory_addr == NULL);
	ASSERT(mip->mi_factory_addr_num == 0);
	ASSERT(mip->mi_default_tx_ring == NULL);

	mcbi = &mip->mi_notify_cb_info;
	ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
	ASSERT(mip->mi_notify_bits == 0);
	ASSERT(mip->mi_notify_thread == NULL);
	ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
	mcbi->mcbi_lockp = NULL;

	mcbi = &mip->mi_promisc_cb_info;
	ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
	ASSERT(mip->mi_promisc_list == NULL);
	ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
	mcbi->mcbi_lockp = NULL;

	ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
	ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);

	mutex_destroy(&mip->mi_lock);
	rw_destroy(&mip->mi_rw_lock);

	mutex_destroy(&mip->mi_promisc_lock);
	cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
	mutex_destroy(&mip->mi_notify_lock);
	cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
	mutex_destroy(&mip->mi_ring_lock);
}

/* ARGSUSED */
static int
i_mac_ring_ctor(void *buf, void *arg, int kmflag)
{
	mac_ring_t *ring = (mac_ring_t *)buf;

	bzero(ring, sizeof (mac_ring_t));
	cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
	ring->mr_state = MR_FREE;
	return (0);
}

/* ARGSUSED */
static void
i_mac_ring_dtor(void *buf, void *arg)
{
	mac_ring_t *ring = (mac_ring_t *)buf;

	cv_destroy(&ring->mr_cv);
	mutex_destroy(&ring->mr_lock);
}

/*
 * Common functions to do mac callback addition and deletion. Currently this is
 * used by promisc callbacks and notify callbacks. List addition and deletion
 * need to take care of list walkers. List walkers in general, can't hold list
 * locks and make upcall callbacks due to potential lock order and recursive
 * reentry issues. Instead list walkers increment the list walker count to mark
 * the presence of a walker thread. Addition can be carefully done to ensure
 * that the list walker always sees either the old list or the new list.
 * However the deletion can't be done while the walker is active, instead the
 * deleting thread simply marks the entry as logically deleted. The last walker
 * physically deletes and frees up the logically deleted entries when the walk
 * is complete.
 */
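/*
 * A hypothetical walker following this protocol might look like the sketch
 * below (the real walkers live with the promisc and notify code; the loop
 * shown here is illustrative only). Note the list lock is dropped across
 * the callbacks:
 *
 *	mutex_enter(mcbi->mcbi_lockp);
 *	mcbi->mcbi_walker_cnt++;
 *	mutex_exit(mcbi->mcbi_lockp);
 *
 *	for (mcb = *mcb_head; mcb != NULL; mcb = mcb->mcb_nextp) {
 *		if (!(mcb->mcb_flags & MCB_CONDEMNED))
 *			(invoke the callback carried in mcb->mcb_objp);
 *	}
 *
 *	mutex_enter(mcbi->mcbi_lockp);
 *	if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0)
 *		rmlist = mac_callback_walker_cleanup(mcbi, mcb_head);
 *	mutex_exit(mcbi->mcbi_lockp);
 *	mac_callback_free(rmlist);
 */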
void
mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
    mac_cb_t *mcb_elem)
{
	mac_cb_t	*p;
	mac_cb_t	**pp;

	/* Verify it is not already in the list */
	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
		if (p == mcb_elem)
			break;
	}
	VERIFY(p == NULL);

	/*
	 * Add it to the head of the callback list. The membar ensures that
	 * the following list pointer manipulations reach global visibility
	 * in exactly the program order below.
	 */
	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));

	mcb_elem->mcb_nextp = *mcb_head;
	membar_producer();
	*mcb_head = mcb_elem;
}

/*
 * Mark the entry as logically deleted. If there aren't any walkers unlink
 * from the list. In either case return the corresponding status.
 */
boolean_t
mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
    mac_cb_t *mcb_elem)
{
	mac_cb_t	*p;
	mac_cb_t	**pp;

	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	/*
	 * Search the callback list for the entry to be removed
	 */
	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
		if (p == mcb_elem)
			break;
	}
	VERIFY(p != NULL);

	/*
	 * If there are walkers just mark it as deleted and the last walker
	 * will remove from the list and free it.
	 */
	if (mcbi->mcbi_walker_cnt != 0) {
		p->mcb_flags |= MCB_CONDEMNED;
		mcbi->mcbi_del_cnt++;
		return (B_FALSE);
	}

	ASSERT(mcbi->mcbi_del_cnt == 0);
	*pp = p->mcb_nextp;
	p->mcb_nextp = NULL;
	return (B_TRUE);
}

/*
 * Wait for all pending callback removals to be completed
 */
void
mac_callback_remove_wait(mac_cb_info_t *mcbi)
{
	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	while (mcbi->mcbi_del_cnt != 0) {
		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
	}
}

/*
 * The last mac callback walker does the cleanup. Walk the list and unlink
 * all the logically deleted entries and construct a temporary list of
 * removed entries. Return the list of removed entries to the caller.
 */
mac_cb_t *
mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
{
	mac_cb_t	*p;
	mac_cb_t	**pp;
	mac_cb_t	*rmlist = NULL;		/* List of removed elements */
	int	cnt = 0;

	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);

	pp = mcb_head;
	while (*pp != NULL) {
		if ((*pp)->mcb_flags & MCB_CONDEMNED) {
			p = *pp;
			*pp = p->mcb_nextp;
			p->mcb_nextp = rmlist;
			rmlist = p;
			cnt++;
			continue;
		}
		pp = &(*pp)->mcb_nextp;
	}

	ASSERT(mcbi->mcbi_del_cnt == cnt);
	mcbi->mcbi_del_cnt = 0;
	return (rmlist);
}

boolean_t
mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
{
	mac_cb_t	*mcb;

	/* Check whether it is in the list */
	for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
		if (mcb == mcb_elem)
			return (B_TRUE);
	}

	return (B_FALSE);
}

boolean_t
mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
{
	boolean_t	found;

	mutex_enter(mcbi->mcbi_lockp);
	found = mac_callback_lookup(mcb_headp, mcb_elem);
	mutex_exit(mcbi->mcbi_lockp);

	return (found);
}

/* Free the list of removed callbacks */
void
mac_callback_free(mac_cb_t *rmlist)
{
	mac_cb_t	*mcb;
	mac_cb_t	*mcb_next;

	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
		mcb_next = mcb->mcb_nextp;
		kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
	}
}
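/*
 * A hypothetical removal path pairing the two functions above (sketch only;
 * the actual callers are in the promisc and notify teardown code):
 *
 *	mutex_enter(mcbi->mcbi_lockp);
 *	if (!mac_callback_remove(mcbi, &listhead, elem)) {
 *		(a walker is active; wait for the last walker to unlink)
 *		mac_callback_remove_wait(mcbi);
 *	}
 *	mutex_exit(mcbi->mcbi_lockp);
 */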
/*
 * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
 * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
 * is only a single shared total walker count, and an entry can't be physically
 * unlinked if a walker is active on either list. The last walker does this
 * cleanup of logically deleted entries.
 */
void
i_mac_promisc_walker_cleanup(mac_impl_t *mip)
{
	mac_cb_t	*rmlist;
	mac_cb_t	*mcb;
	mac_cb_t	*mcb_next;
	mac_promisc_impl_t	*mpip;

	/*
	 * Construct a temporary list of deleted callbacks by walking the
	 * mi_promisc_list. Then for each entry in the temporary list,
	 * remove it from the mci_promisc_list and free the entry.
	 */
	rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
	    &mip->mi_promisc_list);

	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
		mcb_next = mcb->mcb_nextp;
		mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
		VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
		    &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
		mcb->mcb_flags = 0;
		mcb->mcb_nextp = NULL;
		kmem_cache_free(mac_promisc_impl_cache, mpip);
	}
}

void
i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
{
	mac_cb_info_t	*mcbi;

	/*
	 * Signal the notify thread even after mi_ref has become zero and
	 * mi_disabled is set. The synchronization with the notify thread
	 * happens in mac_unregister and that implies the driver must make
	 * sure it is single-threaded (with respect to mac calls) and that
	 * all pending mac calls have returned before it calls mac_unregister.
	 */
	rw_enter(&i_mac_impl_lock, RW_READER);
	if (mip->mi_state_flags & MIS_DISABLED)
		goto exit;

	/*
	 * Guard against incorrect notifications. (Running a newer
	 * mac client against an older implementation?)
	 */
	if (type >= MAC_NNOTE)
		goto exit;

	mcbi = &mip->mi_notify_cb_info;
	mutex_enter(mcbi->mcbi_lockp);
	mip->mi_notify_bits |= (1 << type);
	cv_broadcast(&mcbi->mcbi_cv);
	mutex_exit(mcbi->mcbi_lockp);

exit:
	rw_exit(&i_mac_impl_lock);
}

/*
 * Mac serialization primitives. Please see the block comment at the
 * top of the file.
 */
void
i_mac_perim_enter(mac_impl_t *mip)
{
	mac_client_impl_t	*mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Use the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}

	mutex_enter(&mip->mi_perim_lock);
	if (mip->mi_perim_owner == curthread) {
		mip->mi_perim_ocnt++;
		mutex_exit(&mip->mi_perim_lock);
		return;
	}

	while (mip->mi_perim_owner != NULL)
		cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);

	mip->mi_perim_owner = curthread;
	ASSERT(mip->mi_perim_ocnt == 0);
	mip->mi_perim_ocnt++;
#ifdef DEBUG
	mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
	    MAC_PERIM_STACK_DEPTH);
#endif
	mutex_exit(&mip->mi_perim_lock);
}
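/*
 * The perimeter may be entered recursively by its owner, so a nested
 * sequence such as the following (illustrative sketch) is safe in a single
 * thread and requires a matching number of exits:
 *
 *	i_mac_perim_enter(mip);		mi_perim_ocnt == 1
 *	i_mac_perim_enter(mip);		mi_perim_ocnt == 2, no deadlock
 *	i_mac_perim_exit(mip);		mi_perim_ocnt == 1
 *	i_mac_perim_exit(mip);		owner cleared, waiters signaled
 */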
int
i_mac_perim_enter_nowait(mac_impl_t *mip)
{
	/*
	 * The vnic is a special case, since the serialization is done based
	 * on the lower mac. If the lower mac is busy, it does not imply the
	 * vnic can't be unregistered. But in the case of other drivers,
	 * a busy perimeter or open mac handles implies that the mac is busy
	 * and can't be unregistered.
	 */
	if (mip->mi_state_flags & MIS_IS_VNIC) {
		i_mac_perim_enter(mip);
		return (0);
	}

	mutex_enter(&mip->mi_perim_lock);
	if (mip->mi_perim_owner != NULL) {
		mutex_exit(&mip->mi_perim_lock);
		return (EBUSY);
	}
	ASSERT(mip->mi_perim_ocnt == 0);
	mip->mi_perim_owner = curthread;
	mip->mi_perim_ocnt++;
	mutex_exit(&mip->mi_perim_lock);

	return (0);
}

void
i_mac_perim_exit(mac_impl_t *mip)
{
	mac_client_impl_t	*mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Use the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}

	ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);

	mutex_enter(&mip->mi_perim_lock);
	if (--mip->mi_perim_ocnt == 0) {
		mip->mi_perim_owner = NULL;
		cv_signal(&mip->mi_perim_cv);
	}
	mutex_exit(&mip->mi_perim_lock);
}

/*
 * Returns whether the current thread holds the mac perimeter. Used in making
 * assertions.
 */
boolean_t
mac_perim_held(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	mac_client_impl_t	*mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Use the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}
	return (mip->mi_perim_owner == curthread);
}

/*
 * mac client interfaces to enter the mac perimeter of a mac end point, given
 * its mac handle, or macname or linkid.
 */
void
mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	i_mac_perim_enter(mip);
	/*
	 * The mac_perim_handle_t returned encodes the 'mip' and whether a
	 * mac_open has been done internally while entering the perimeter.
	 * This information is used in mac_perim_exit.
	 */
	MAC_ENCODE_MPH(*mphp, mip, 0);
}

int
mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
{
	int	err;
	mac_handle_t	mh;

	if ((err = mac_open(name, &mh)) != 0)
		return (err);

	mac_perim_enter_by_mh(mh, mphp);
	MAC_ENCODE_MPH(*mphp, mh, 1);
	return (0);
}

int
mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
{
	int	err;
	mac_handle_t	mh;

	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
		return (err);

	mac_perim_enter_by_mh(mh, mphp);
	MAC_ENCODE_MPH(*mphp, mh, 1);
	return (0);
}

void
mac_perim_exit(mac_perim_handle_t mph)
{
	mac_impl_t	*mip;
	boolean_t	need_close;

	MAC_DECODE_MPH(mph, mip, need_close);
	i_mac_perim_exit(mip);
	if (need_close)
		mac_close((mac_handle_t)mip);
}
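/*
 * For example, a hypothetical administrative operation that knows only the
 * linkid could bracket a sequence of control calls as follows (sketch):
 *
 *	mac_perim_handle_t	mph;
 *
 *	if ((err = mac_perim_enter_by_linkid(linkid, &mph)) != 0)
 *		return (err);
 *	(sequence of mac control calls; callees may use
 *	    ASSERT(mac_perim_held(mh)) to verify the caller's context)
 *	mac_perim_exit(mph);
 *
 * mac_perim_exit also undoes the mac_open done internally by the lookup.
 */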
int
mac_hold(const char *macname, mac_impl_t **pmip)
{
	mac_impl_t	*mip;
	int		err;

	/*
	 * Check the device name length to make sure it won't overflow our
	 * buffer.
	 */
	if (strlen(macname) >= MAXNAMELEN)
		return (EINVAL);

	/*
	 * Look up its entry in the global hash table.
	 */
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
	    (mod_hash_val_t *)&mip);

	if (err != 0) {
		rw_exit(&i_mac_impl_lock);
		return (ENOENT);
	}

	if (mip->mi_state_flags & MIS_DISABLED) {
		rw_exit(&i_mac_impl_lock);
		return (ENOENT);
	}

	if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
		rw_exit(&i_mac_impl_lock);
		return (EBUSY);
	}

	mip->mi_ref++;
	rw_exit(&i_mac_impl_lock);

	*pmip = mip;
	return (0);
}

void
mac_rele(mac_impl_t *mip)
{
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	ASSERT(mip->mi_ref != 0);
	if (--mip->mi_ref == 0) {
		ASSERT(mip->mi_nactiveclients == 0 &&
		    !(mip->mi_state_flags & MIS_EXCLUSIVE));
	}
	rw_exit(&i_mac_impl_lock);
}

/*
 * This function is called only by mac_client_open.
 */
int
mac_start(mac_impl_t *mip)
{
	int	err = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_start != NULL);

	/*
	 * Check whether the device is already started.
	 */
	if (mip->mi_active++ == 0) {
		mac_ring_t *ring = NULL;

		/*
		 * Start the device.
		 */
		err = mip->mi_start(mip->mi_driver);
		if (err != 0) {
			mip->mi_active--;
			return (err);
		}

		/*
		 * Start the default tx ring.
		 */
		if (mip->mi_default_tx_ring != NULL) {

			ring = (mac_ring_t *)mip->mi_default_tx_ring;
			err = mac_start_ring(ring);
			if (err != 0) {
				mip->mi_active--;
				return (err);
			}
			ring->mr_state = MR_INUSE;
		}

		if (mip->mi_rx_groups != NULL) {
			/*
			 * Start the default ring, since it will be needed
			 * to receive broadcast and multicast traffic for
			 * both primary and non-primary MAC clients.
			 */
			mac_group_t *grp = &mip->mi_rx_groups[0];

			ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
			err = mac_start_group_and_rings(grp);
			if (err != 0) {
				mip->mi_active--;
				if (ring != NULL) {
					mac_stop_ring(ring);
					ring->mr_state = MR_FREE;
				}
				return (err);
			}
			mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED);
		}
	}

	return (err);
}

/*
 * This function is called only by mac_client_close.
 */
void
mac_stop(mac_impl_t *mip)
{
	ASSERT(mip->mi_stop != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * Check whether the device is still needed.
	 */
	ASSERT(mip->mi_active != 0);
	if (--mip->mi_active == 0) {
		if (mip->mi_rx_groups != NULL) {
			/*
			 * There should be no more active clients since the
			 * MAC is being stopped. Stop the default RX group
			 * and transition it back to registered state.
			 */
			mac_group_t *grp = &mip->mi_rx_groups[0];

			/*
			 * When clients are torn down, the groups
			 * are released via mac_release_rx_group which
			 * knows that the default group is always in
			 * started mode since broadcast uses it. So
			 * we can assert that there are no clients
			 * (since mac_bcast_add doesn't register itself
			 * as a client) and the group is in SHARED state.
			 */
			ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
			ASSERT(MAC_RX_GROUP_NO_CLIENT(grp) &&
			    mip->mi_nactiveclients == 0);
			mac_stop_group_and_rings(grp);
			mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
		}

		if (mip->mi_default_tx_ring != NULL) {
			mac_ring_t *ring;

			ring = (mac_ring_t *)mip->mi_default_tx_ring;
			mac_stop_ring(ring);
			ring->mr_state = MR_FREE;
		}

		/*
		 * Stop the device.
		 */
		mip->mi_stop(mip->mi_driver);
	}
}

int
i_mac_promisc_set(mac_impl_t *mip, boolean_t on, mac_promisc_type_t ptype)
{
	int	err = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_setpromisc != NULL);
	ASSERT(ptype == MAC_DEVPROMISC || ptype == MAC_PROMISC);

	/*
	 * Determine whether we should enable or disable promiscuous mode.
	 * For details on the distinction between "device promiscuous mode"
	 * and "MAC promiscuous mode", see PSARC/2005/289.
	 */
	if (on) {
		/*
		 * Enable promiscuous mode on the device if not yet enabled.
		 */
		if (mip->mi_devpromisc++ == 0) {
			err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
			if (err != 0) {
				mip->mi_devpromisc--;
				return (err);
			}
			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
		}

		/*
		 * Enable promiscuous mode on the MAC if not yet enabled.
		 */
		if (ptype == MAC_PROMISC && mip->mi_promisc++ == 0)
			i_mac_notify(mip, MAC_NOTE_PROMISC);
	} else {
		if (mip->mi_devpromisc == 0)
			return (EPROTO);

		/*
		 * Disable promiscuous mode on the device if this is the last
		 * enabling.
		 */
		if (--mip->mi_devpromisc == 0) {
			err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
			if (err != 0) {
				mip->mi_devpromisc++;
				return (err);
			}
			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
		}

		/*
		 * Disable promiscuous mode on the MAC if this is the last
		 * enabling.
		 */
		if (ptype == MAC_PROMISC && --mip->mi_promisc == 0)
			i_mac_notify(mip, MAC_NOTE_PROMISC);
	}

	return (0);
}

int
mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	int		rv;

	i_mac_perim_enter(mip);
	rv = i_mac_promisc_set(mip, on, ptype);
	i_mac_perim_exit(mip);

	return (rv);
}

/*
 * The promiscuity state can change any time. If the caller needs to take
 * actions that are atomic with the promiscuity state, then the caller needs
 * to bracket the entire sequence with mac_perim_enter/exit.
 */
boolean_t
mac_promisc_get(mac_handle_t mh, mac_promisc_type_t ptype)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	ASSERT(ptype == MAC_DEVPROMISC || ptype == MAC_PROMISC);

	/*
	 * Return the current promiscuity.
	 */
	if (ptype == MAC_DEVPROMISC)
		return (mip->mi_devpromisc != 0);
	else
		return (mip->mi_promisc != 0);
}
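/*
 * A hypothetical caller that must act atomically on the promiscuity state
 * would bracket the check and the update with the perimeter, for example:
 *
 *	mac_perim_enter_by_mh(mh, &mph);
 *	if (!mac_promisc_get(mh, MAC_DEVPROMISC))
 *		err = mac_promisc_set(mh, B_TRUE, MAC_DEVPROMISC);
 *	mac_perim_exit(mph);
 *
 * Since the perimeter is recursive, the i_mac_perim_enter done inside
 * mac_promisc_set simply nests under the caller's entry.
 */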
/*
 * Invoked at MAC instance attach time to initialize the list
 * of factory MAC addresses supported by a MAC instance. This function
 * builds a local cache in the mac_impl_t for the MAC addresses
 * supported by the underlying hardware. The MAC clients themselves
 * use the mac_addr_factory*() functions to query and reserve
 * factory MAC addresses.
 */
void
mac_addr_factory_init(mac_impl_t *mip)
{
	mac_capab_multifactaddr_t	capab;
	uint8_t	*addr;
	int	i;

	/*
	 * First round to see how many factory MAC addresses are available.
	 */
	bzero(&capab, sizeof (capab));
	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
	    &capab) || (capab.mcm_naddr == 0)) {
		/*
		 * The MAC instance doesn't support multiple factory
		 * MAC addresses, we're done here.
		 */
		return;
	}

	/*
	 * Allocate the space and get all the factory addresses.
	 */
	addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
	capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);

	mip->mi_factory_addr_num = capab.mcm_naddr;
	mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
	    sizeof (mac_factory_addr_t), KM_SLEEP);

	for (i = 0; i < capab.mcm_naddr; i++) {
		bcopy(addr + i * MAXMACADDRLEN,
		    mip->mi_factory_addr[i].mfa_addr,
		    mip->mi_type->mt_addr_length);
		mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
	}

	kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
}

void
mac_addr_factory_fini(mac_impl_t *mip)
{
	if (mip->mi_factory_addr == NULL) {
		ASSERT(mip->mi_factory_addr_num == 0);
		return;
	}

	kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
	    sizeof (mac_factory_addr_t));

	mip->mi_factory_addr = NULL;
	mip->mi_factory_addr_num = 0;
}

/*
 * Reserve a factory MAC address. If *slot is set to -1, the function
 * attempts to reserve any of the available factory MAC addresses and
 * returns the reserved slot id. If no slots are available, the function
 * returns ENOSPC. If *slot is not set to -1, the function reserves
 * the specified slot if it is available, or returns EBUSY if the slot
 * is already used. Returns ENOTSUP if the underlying MAC does not
 * support multiple factory addresses. If the slot number is not -1 but
 * is invalid, returns EINVAL.
 */
int
mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t	*mip = mcip->mci_mip;
	int	i, ret = 0;

	i_mac_perim_enter(mip);
	/*
	 * Protect against concurrent readers that may need a self-consistent
	 * view of the factory addresses.
	 */
	rw_enter(&mip->mi_rw_lock, RW_WRITER);

	if (mip->mi_factory_addr_num == 0) {
		ret = ENOTSUP;
		goto bail;
	}

	if (*slot != -1) {
		/* check the specified slot */
		if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
			ret = EINVAL;
			goto bail;
		}
		if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
			ret = EBUSY;
			goto bail;
		}
	} else {
		/* pick the next available slot */
		for (i = 0; i < mip->mi_factory_addr_num; i++) {
			if (!mip->mi_factory_addr[i].mfa_in_use)
				break;
		}

		if (i == mip->mi_factory_addr_num) {
			ret = ENOSPC;
			goto bail;
		}
		*slot = i+1;
	}

	mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
	mip->mi_factory_addr[*slot-1].mfa_client = mcip;

bail:
	rw_exit(&mip->mi_rw_lock);
	i_mac_perim_exit(mip);
	return (ret);
}
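/*
 * For example, a hypothetical client reserving any available factory
 * address and releasing it later (sketch):
 *
 *	int slot = -1;
 *
 *	if (mac_addr_factory_reserve(mch, &slot) == 0) {
 *		(use the address in the returned slot)
 *		mac_addr_factory_release(mch, slot);
 *	}
 *
 * Slot ids handed out by this interface are 1-based; slot 0 is never used.
 */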
/*
 * Release the specified factory MAC address slot.
 */
void
mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t	*mip = mcip->mci_mip;

	i_mac_perim_enter(mip);
	/*
	 * Protect against concurrent readers that may need a self-consistent
	 * view of the factory addresses.
	 */
	rw_enter(&mip->mi_rw_lock, RW_WRITER);

	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
	ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);

	mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;

	rw_exit(&mip->mi_rw_lock);
	i_mac_perim_exit(mip);
}

/*
 * Stores in mac_addr the value of the specified MAC address. The slot number
 * must be valid for the MAC; this is asserted rather than returned as an
 * error. The caller must provide a client_name buffer of at least
 * MAXNAMELEN bytes.
 */
void
mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
    uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	boolean_t	in_use;

	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);

	/*
	 * Readers need to hold mi_rw_lock. Writers need to hold the mac
	 * perimeter and mi_rw_lock.
	 */
	rw_enter(&mip->mi_rw_lock, RW_READER);
	bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
	*addr_len = mip->mi_type->mt_addr_length;
	in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
	if (in_use && client_name != NULL) {
		bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
		    client_name, MAXNAMELEN);
	}
	if (in_use_arg != NULL)
		*in_use_arg = in_use;
	rw_exit(&mip->mi_rw_lock);
}

/*
 * Returns the number of factory MAC addresses (in addition to the
 * primary MAC address), 0 if the underlying MAC doesn't support
 * that feature.
 */
uint_t
mac_addr_factory_num(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	return (mip->mi_factory_addr_num);
}


void
mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
{
	mac_ring_t	*ring;

	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
		ring->mr_flag &= ~flag;
}

/*
 * The following mac_hwrings_xxx() functions are private mac client functions
 * used by the aggr driver to access and control the underlying HW Rx group
 * and rings. In this case, the aggr driver has exclusive control of the
 * underlying HW Rx group/rings. It calls the following functions to
 * start/stop the HW Rx rings, disable/enable polling, add/remove MAC
 * addresses, or set up the Rx callback.
 */
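/*
 * A hypothetical exclusive client (aggr-like) might drive these interfaces
 * roughly as follows (sketch only, error handling elided):
 *
 *	mac_group_handle_t	hwgh;
 *	mac_ring_handle_t	hwrh[MAX_RINGS_PER_GROUP];
 *	int			i, cnt;
 *
 *	cnt = mac_hwrings_get(mch, &hwgh, hwrh);
 *	for (i = 0; i < cnt; i++)
 *		(void) mac_hwring_start(hwrh[i]);
 *	if (hwgh != NULL)
 *		(void) mac_hwgroup_addmac(hwgh, macaddr);
 */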
/* ARGSUSED */
static void
mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
    mblk_t *mp_chain, boolean_t loopback)
{
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	mac_direct_rx_t		proc;
	void			*arg1;
	mac_resource_handle_t	arg2;

	proc = srs_rx->sr_func;
	arg1 = srs_rx->sr_arg1;
	arg2 = mac_srs->srs_mrh;

	proc(arg1, arg2, mp_chain, NULL);
}

/*
 * This function is called to get the list of HW rings that are reserved by
 * an exclusive mac client.
 *
 * Return value: the number of HW rings.
 */
int
mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
    mac_ring_handle_t *hwrh)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	flow_entry_t		*flent = mcip->mci_flent;
	mac_group_t		*grp = flent->fe_rx_ring_group;
	mac_ring_t		*ring;
	int			cnt = 0;

	/*
	 * The mac client did not reserve any RX group, return directly.
	 * This is probably because the underlying MAC does not support
	 * any RX groups.
	 */
	*hwgh = NULL;
	if (grp == NULL)
		return (0);

	/*
	 * This RX group must be reserved by this mac client.
	 */
	ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
	    (mch == (mac_client_handle_t)(MAC_RX_GROUP_ONLY_CLIENT(grp))));

	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next) {
		ASSERT(cnt < MAX_RINGS_PER_GROUP);
		hwrh[cnt++] = (mac_ring_handle_t)ring;
	}
	*hwgh = (mac_group_handle_t)grp;
	return (cnt);
}

/*
 * Set up the RX callback of the mac client which exclusively controls a
 * HW ring.
 */
void
mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh)
{
	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;

	mac_srs->srs_mrh = prh;
	mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
}

void
mac_hwring_teardown(mac_ring_handle_t hwrh)
{
	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;

	mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
	mac_srs->srs_mrh = NULL;
}

int
mac_hwring_disable_intr(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;
	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;

	return (intr->mi_disable(intr->mi_handle));
}

int
mac_hwring_enable_intr(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;
	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;

	return (intr->mi_enable(intr->mi_handle));
}

int
mac_hwring_start(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;

	MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
	return (0);
}

void
mac_hwring_stop(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;

	mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
}

mblk_t *
mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
{
	mac_ring_t	*rr_ring = (mac_ring_t *)rh;
	mac_ring_info_t	*info = &rr_ring->mr_info;

	return (info->mri_poll(info->mri_driver, bytes_to_pickup));
}

int
mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
{
	mac_group_t *group = (mac_group_t *)gh;

	return (mac_group_addmac(group, addr));
}

int
mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
{
	mac_group_t *group = (mac_group_t *)gh;

	return (mac_group_remmac(group, addr));
}
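/*
 * A hypothetical polling sequence on an exclusively owned ring (sketch):
 *
 *	(void) mac_hwring_disable_intr(rh);
 *	while ((mp = mac_hwring_poll(rh, bytes)) != NULL)
 *		(process the mblk chain);
 *	(void) mac_hwring_enable_intr(rh);
 */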
/*
 * Set the RX group to be shared/reserved. Note that the group must be
 * started/stopped outside of this function.
 */
void
mac_set_rx_group_state(mac_group_t *grp, mac_group_state_t state)
{
	/*
	 * If there is no change in the group state, just return.
	 */
	if (grp->mrg_state == state)
		return;

	switch (state) {
	case MAC_GROUP_STATE_RESERVED:
		/*
		 * Successfully reserved the group.
		 *
		 * Given that there is an exclusive client controlling this
		 * group, we enable the group level polling when available,
		 * so that SRSs get to turn on/off individual rings they're
		 * assigned to.
		 */
		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));

		if (GROUP_INTR_DISABLE_FUNC(grp) != NULL)
			GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));

		break;

	case MAC_GROUP_STATE_SHARED:
		/*
		 * Set all rings of this group to software classified.
		 * If the group has an overriding interrupt, then re-enable it.
		 */
		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));

		if (GROUP_INTR_ENABLE_FUNC(grp) != NULL)
			GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));

		/* The ring is not available for reservations any more */
		break;

	case MAC_GROUP_STATE_REGISTERED:
		/* Also callable from mac_register, perim is not held */
		break;

	default:
		ASSERT(B_FALSE);
		break;
	}

	grp->mrg_state = state;
}

/*
 * Quiesce future hardware classified packets for the specified Rx ring.
 */
static void
mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
{
	ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
	ASSERT(ring_flag == MR_CONDEMNED || ring_flag == MR_QUIESCE);

	mutex_enter(&rx_ring->mr_lock);
	rx_ring->mr_flag |= ring_flag;
	while (rx_ring->mr_refcnt != 0)
		cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
	mutex_exit(&rx_ring->mr_lock);
}

/*
 * Please see mac_tx for details about the per cpu locking scheme.
 */
static void
mac_tx_lock_all(mac_client_impl_t *mcip)
{
	int	i;

	for (i = 0; i <= mac_tx_percpu_cnt; i++)
		mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}

static void
mac_tx_unlock_all(mac_client_impl_t *mcip)
{
	int	i;

	for (i = mac_tx_percpu_cnt; i >= 0; i--)
		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}

static void
mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
{
	int	i;

	for (i = mac_tx_percpu_cnt; i > 0; i--)
		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}

static int
mac_tx_sum_refcnt(mac_client_impl_t *mcip)
{
	int	i;
	int	refcnt = 0;

	for (i = 0; i <= mac_tx_percpu_cnt; i++)
		refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;

	return (refcnt);
}

/*
 * Stop future Tx packets coming down from the client in preparation for
 * quiescing the Tx side. This is needed for dynamic reclaim and
 * reassignment of rings between clients.
 */
void
mac_tx_client_block(mac_client_impl_t *mcip)
{
	mac_tx_lock_all(mcip);
	mcip->mci_tx_flag |= MCI_TX_QUIESCE;
	while (mac_tx_sum_refcnt(mcip) != 0) {
		mac_tx_unlock_allbutzero(mcip);
		cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
		mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
		mac_tx_lock_all(mcip);
	}
	mac_tx_unlock_all(mcip);
}

void
mac_tx_client_unblock(mac_client_impl_t *mcip)
{
	mac_tx_lock_all(mcip);
	mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
	mac_tx_unlock_all(mcip);
	/*
	 * We may fail to disable flow control for the last MAC_NOTE_TX
	 * notification because the MAC client is quiesced. Send the
	 * notification again.
	 */
	i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
}

/*
 * Wait for an SRS to quiesce. The SRS worker will signal us when the
 * quiesce is done.
 */
static void
mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
{
	mutex_enter(&srs->srs_lock);
	while (!(srs->srs_state & srs_flag))
		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
	mutex_exit(&srs->srs_lock);
}

/*
 * Quiescing an Rx SRS is achieved by the following sequence. The protocol
 * works bottom up by cutting off packet flow from the bottommost point in the
 * mac, then the SRS, and then the soft rings. There are 2 use cases of this
 * mechanism. One is a temporary quiesce of the SRS, such as say while changing
 * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
 * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
 * for the SRS and MR flags. In the former case the threads pause waiting for
 * a restart, while in the latter case the threads exit. The Tx SRS teardown
 * is also mostly similar to the above.
 *
 * 1. Stop future hardware classified packets at the lowest level in the mac.
 *    Remove any hardware classification rule (CONDEMNED case) and mark the
 *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
 *    from increasing. Upcalls from the driver that come through hardware
 *    classification will be dropped in mac_rx from now on. Then we wait for
 *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
 *    sure there aren't any upcall threads from the driver through hardware
 *    classification. In the case of SRS teardown we also remove the
 *    classification rule in the driver.
 *
 * 2. Stop future software classified packets by marking the flow entry with
 *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
 *    increasing. We also remove the flow entry from the table in the latter
 *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
 *    that indicates there aren't any active threads using that flow entry.
 *
 * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
 *    SRS worker thread, and the soft ring threads are quiesced in sequence
 *    with the SRS worker thread serving as a master controller. This
 *    mechanism is explained in mac_srs_worker_quiesce().
 *
 * The restart mechanism to reactivate the SRS and softrings is explained
 * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
 * restart sequence.
 */
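/*
 * For example, a hypothetical control operation that must pause a client's
 * Rx path while changing its callbacks might do (sketch):
 *
 *	mac_rx_client_quiesce(mch);
 *	(swap the Rx callbacks or rearrange the rings)
 *	mac_rx_client_restart(mch);
 *
 * Both calls are made with the mac perimeter held; see the functions
 * further below.
 */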
 *
 * For the restart we just signal the SRS worker to start the restart
 * sequence.
 */
void
mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
{
	flow_entry_t	*flent = srs->srs_flent;
	uint_t		mr_flag, srs_done_flag;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
	ASSERT(!(srs->srs_type & SRST_TX));

	if (srs_quiesce_flag == SRS_CONDEMNED) {
		mr_flag = MR_CONDEMNED;
		srs_done_flag = SRS_CONDEMNED_DONE;
		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
			mac_srs_client_poll_disable(srs->srs_mcip, srs);
	} else {
		ASSERT(srs_quiesce_flag == SRS_QUIESCE);
		mr_flag = MR_QUIESCE;
		srs_done_flag = SRS_QUIESCE_DONE;
		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
			mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
	}

	if (srs->srs_ring != NULL) {
		mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
	} else {
		/*
		 * SRS is driven by software classification. In case
		 * of CONDEMNED, the top level teardown functions will
		 * deal with flow removal.
		 */
		if (srs_quiesce_flag != SRS_CONDEMNED) {
			FLOW_MARK(flent, FE_QUIESCE);
			mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
		}
	}

	/*
	 * Signal the SRS to quiesce itself, and then cv_wait for the
	 * SRS quiesce to complete. The SRS worker thread will wake us
	 * up when the quiesce is complete.
	 */
	mac_srs_signal(srs, srs_quiesce_flag);
	mac_srs_quiesce_wait(srs, srs_done_flag);
}

/*
 * Remove an SRS.
 */
void
mac_rx_srs_remove(mac_soft_ring_set_t *srs)
{
	flow_entry_t *flent = srs->srs_flent;
	int i;

	mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
	/*
	 * Locate and remove our entry in the fe_rx_srs[] array, and
	 * adjust the fe_rx_srs array entries and array count by
	 * moving the last entry into the vacated spot.
	 */
	mutex_enter(&flent->fe_lock);
	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
		if (flent->fe_rx_srs[i] == srs)
			break;
	}

	ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
	if (i != flent->fe_rx_srs_cnt - 1) {
		flent->fe_rx_srs[i] =
		    flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
		i = flent->fe_rx_srs_cnt - 1;
	}

	flent->fe_rx_srs[i] = NULL;
	flent->fe_rx_srs_cnt--;
	mutex_exit(&flent->fe_lock);

	mac_srs_free(srs);
}

static void
mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
{
	mutex_enter(&srs->srs_lock);
	srs->srs_state &= ~flag;
	mutex_exit(&srs->srs_lock);
}

void
mac_rx_srs_restart(mac_soft_ring_set_t *srs)
{
	flow_entry_t	*flent = srs->srs_flent;
	mac_ring_t	*mr;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
	ASSERT((srs->srs_type & SRST_TX) == 0);

	/*
	 * This handles a change in the number of SRSs between the quiesce
	 * and restart operations of a flow.
	 */
	if (!SRS_QUIESCED(srs))
		return;

	/*
	 * Signal the SRS to restart itself, and wait for the restart to
	 * complete. Note that we only restart the SRS if it is not marked
	 * as permanently quiesced.
1881 */ 1882 if (!SRS_QUIESCED_PERMANENT(srs)) { 1883 mac_srs_signal(srs, SRS_RESTART); 1884 mac_srs_quiesce_wait(srs, SRS_RESTART_DONE); 1885 mac_srs_clear_flag(srs, SRS_RESTART_DONE); 1886 1887 mac_srs_client_poll_restart(srs->srs_mcip, srs); 1888 } 1889 1890 /* Finally clear the flags to let the packets in */ 1891 mr = srs->srs_ring; 1892 if (mr != NULL) { 1893 MAC_RING_UNMARK(mr, MR_QUIESCE); 1894 /* In case the ring was stopped, safely restart it */ 1895 (void) mac_start_ring(mr); 1896 } else { 1897 FLOW_UNMARK(flent, FE_QUIESCE); 1898 } 1899 } 1900 1901 /* 1902 * Temporary quiesce of a flow and associated Rx SRS. 1903 * Please see block comment above mac_rx_classify_flow_rem. 1904 */ 1905 /* ARGSUSED */ 1906 int 1907 mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg) 1908 { 1909 int i; 1910 1911 for (i = 0; i < flent->fe_rx_srs_cnt; i++) { 1912 mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i], 1913 SRS_QUIESCE); 1914 } 1915 return (0); 1916 } 1917 1918 /* 1919 * Restart a flow and associated Rx SRS that has been quiesced temporarily 1920 * Please see block comment above mac_rx_classify_flow_rem 1921 */ 1922 /* ARGSUSED */ 1923 int 1924 mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg) 1925 { 1926 int i; 1927 1928 for (i = 0; i < flent->fe_rx_srs_cnt; i++) 1929 mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]); 1930 1931 return (0); 1932 } 1933 1934 void 1935 mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on) 1936 { 1937 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1938 flow_entry_t *flent = mcip->mci_flent; 1939 mac_impl_t *mip = mcip->mci_mip; 1940 mac_soft_ring_set_t *mac_srs; 1941 int i; 1942 1943 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 1944 1945 if (flent == NULL) 1946 return; 1947 1948 for (i = 0; i < flent->fe_rx_srs_cnt; i++) { 1949 mac_srs = flent->fe_rx_srs[i]; 1950 mutex_enter(&mac_srs->srs_lock); 1951 if (on) 1952 mac_srs->srs_state |= SRS_QUIESCE_PERM; 1953 else 1954 mac_srs->srs_state &= ~SRS_QUIESCE_PERM; 1955 mutex_exit(&mac_srs->srs_lock); 1956 } 1957 } 1958 1959 void 1960 mac_rx_client_quiesce(mac_client_handle_t mch) 1961 { 1962 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1963 mac_impl_t *mip = mcip->mci_mip; 1964 1965 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 1966 1967 if (MCIP_DATAPATH_SETUP(mcip)) { 1968 (void) mac_rx_classify_flow_quiesce(mcip->mci_flent, 1969 NULL); 1970 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, 1971 mac_rx_classify_flow_quiesce, NULL); 1972 } 1973 } 1974 1975 void 1976 mac_rx_client_restart(mac_client_handle_t mch) 1977 { 1978 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1979 mac_impl_t *mip = mcip->mci_mip; 1980 1981 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 1982 1983 if (MCIP_DATAPATH_SETUP(mcip)) { 1984 (void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL); 1985 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, 1986 mac_rx_classify_flow_restart, NULL); 1987 } 1988 } 1989 1990 /* 1991 * This function only quiesces the Tx SRS and softring worker threads. Callers 1992 * need to make sure that there aren't any mac client threads doing current or 1993 * future transmits in the mac before calling this function. 
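 *
 * For example, the usual order is the one used by mac_tx_client_quiesce()
 * further below (a sketch):
 *
 *	mac_tx_client_block(mcip);			stop new transmits
 *	mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), SRS_QUIESCE);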
 */
void
mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
{
	mac_client_impl_t *mcip = srs->srs_mcip;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	ASSERT(srs->srs_type & SRST_TX);
	ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
	    srs_quiesce_flag == SRS_QUIESCE);

	/*
	 * Signal the SRS to quiesce itself, and then cv_wait for the
	 * SRS quiesce to complete. The SRS worker thread will wake us
	 * up when the quiesce is complete.
	 */
	mac_srs_signal(srs, srs_quiesce_flag);
	mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
	    SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
}

void
mac_tx_srs_restart(mac_soft_ring_set_t *srs)
{
	/*
	 * Resizing the fanout could result in the creation of new SRSs.
	 * They may not necessarily be in the quiesced state, in which
	 * case no restart is needed.
	 */
	if (!SRS_QUIESCED(srs))
		return;

	mac_srs_signal(srs, SRS_RESTART);
	mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
	mac_srs_clear_flag(srs, SRS_RESTART_DONE);
}

/*
 * Temporary quiesce of a flow and its associated Tx SRS.
 * Please see the block comment above mac_rx_srs_quiesce().
 */
/* ARGSUSED */
int
mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
{
	/*
	 * The fe_tx_srs is null for a subflow on an interface that is
	 * not plumbed.
	 */
	if (flent->fe_tx_srs != NULL)
		mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
	return (0);
}

/* ARGSUSED */
int
mac_tx_flow_restart(flow_entry_t *flent, void *arg)
{
	/*
	 * The fe_tx_srs is null for a subflow on an interface that is
	 * not plumbed.
	 */
	if (flent->fe_tx_srs != NULL)
		mac_tx_srs_restart(flent->fe_tx_srs);
	return (0);
}

void
mac_tx_client_quiesce(mac_client_impl_t *mcip, uint_t srs_quiesce_flag)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	mac_tx_client_block(mcip);
	if (MCIP_TX_SRS(mcip) != NULL) {
		mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
		    mac_tx_flow_quiesce, NULL);
	}
}

void
mac_tx_client_restart(mac_client_impl_t *mcip)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	mac_tx_client_unblock(mcip);
	if (MCIP_TX_SRS(mcip) != NULL) {
		mac_tx_srs_restart(MCIP_TX_SRS(mcip));
		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
		    mac_tx_flow_restart, NULL);
	}
}

void
mac_tx_client_flush(mac_client_impl_t *mcip)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
	mac_tx_client_restart(mcip);
}

void
mac_client_quiesce(mac_client_impl_t *mcip)
{
	mac_rx_client_quiesce((mac_client_handle_t)mcip);
	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
}

void
mac_client_restart(mac_client_impl_t *mcip)
{
	mac_rx_client_restart((mac_client_handle_t)mcip);
	mac_tx_client_restart(mcip);
}

/*
 * Allocate a minor number.
 */
minor_t
mac_minor_hold(boolean_t sleep)
{
	minor_t	minor;

	/*
	 * Grab a value from the arena.
2121 */ 2122 atomic_add_32(&minor_count, 1); 2123 2124 if (sleep) 2125 minor = (uint_t)id_alloc(minor_ids); 2126 else 2127 minor = (uint_t)id_alloc_nosleep(minor_ids); 2128 2129 if (minor == 0) { 2130 atomic_add_32(&minor_count, -1); 2131 return (0); 2132 } 2133 2134 return (minor); 2135 } 2136 2137 /* 2138 * Release a previously allocated minor number. 2139 */ 2140 void 2141 mac_minor_rele(minor_t minor) 2142 { 2143 /* 2144 * Return the value to the arena. 2145 */ 2146 id_free(minor_ids, minor); 2147 atomic_add_32(&minor_count, -1); 2148 } 2149 2150 uint32_t 2151 mac_no_notification(mac_handle_t mh) 2152 { 2153 mac_impl_t *mip = (mac_impl_t *)mh; 2154 return (mip->mi_unsup_note); 2155 } 2156 2157 /* 2158 * Prevent any new opens of this mac in preparation for unregister 2159 */ 2160 int 2161 i_mac_disable(mac_impl_t *mip) 2162 { 2163 mac_client_impl_t *mcip; 2164 2165 rw_enter(&i_mac_impl_lock, RW_WRITER); 2166 if (mip->mi_state_flags & MIS_DISABLED) { 2167 /* Already disabled, return success */ 2168 rw_exit(&i_mac_impl_lock); 2169 return (0); 2170 } 2171 /* 2172 * See if there are any other references to this mac_t (e.g., VLAN's). 2173 * If so return failure. If all the other checks below pass, then 2174 * set mi_disabled atomically under the i_mac_impl_lock to prevent 2175 * any new VLAN's from being created or new mac client opens of this 2176 * mac end point. 2177 */ 2178 if (mip->mi_ref > 0) { 2179 rw_exit(&i_mac_impl_lock); 2180 return (EBUSY); 2181 } 2182 2183 /* 2184 * mac clients must delete all multicast groups they join before 2185 * closing. bcast groups are reference counted, the last client 2186 * to delete the group will wait till the group is physically 2187 * deleted. Since all clients have closed this mac end point 2188 * mi_bcast_ngrps must be zero at this point 2189 */ 2190 ASSERT(mip->mi_bcast_ngrps == 0); 2191 2192 /* 2193 * Don't let go of this if it has some flows. 2194 * All other code guarantees no flows are added to a disabled 2195 * mac, therefore it is sufficient to check for the flow table 2196 * only here. 2197 */ 2198 mcip = mac_primary_client_handle(mip); 2199 if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) { 2200 rw_exit(&i_mac_impl_lock); 2201 return (ENOTEMPTY); 2202 } 2203 2204 mip->mi_state_flags |= MIS_DISABLED; 2205 rw_exit(&i_mac_impl_lock); 2206 return (0); 2207 } 2208 2209 int 2210 mac_disable_nowait(mac_handle_t mh) 2211 { 2212 mac_impl_t *mip = (mac_impl_t *)mh; 2213 int err; 2214 2215 if ((err = i_mac_perim_enter_nowait(mip)) != 0) 2216 return (err); 2217 err = i_mac_disable(mip); 2218 i_mac_perim_exit(mip); 2219 return (err); 2220 } 2221 2222 int 2223 mac_disable(mac_handle_t mh) 2224 { 2225 mac_impl_t *mip = (mac_impl_t *)mh; 2226 int err; 2227 2228 i_mac_perim_enter(mip); 2229 err = i_mac_disable(mip); 2230 i_mac_perim_exit(mip); 2231 2232 /* 2233 * Clean up notification thread and wait for it to exit. 2234 */ 2235 if (err == 0) 2236 i_mac_notify_exit(mip); 2237 2238 return (err); 2239 } 2240 2241 /* 2242 * Called when the MAC instance has a non empty flow table, to de-multiplex 2243 * incoming packets to the right flow. 2244 * The MAC's rw lock is assumed held as a READER. 
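 *
 * For example, a hypothetical caller could hand a receive chain to
 * mac_rx_flow() below and fall back to its default delivery path for
 * whatever could not be classified (a sketch, not lifted from an actual
 * caller):
 *
 *	unclaimed = mac_rx_flow(mh, mrh, mp_chain);
 *	if (unclaimed != NULL)
 *		deliver 'unclaimed' through the default Rx path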
2245 */ 2246 /* ARGSUSED */ 2247 static mblk_t * 2248 mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp) 2249 { 2250 flow_entry_t *flent = NULL; 2251 uint_t flags = FLOW_INBOUND; 2252 int err; 2253 2254 /* 2255 * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN 2256 * to mac_flow_lookup() so that the VLAN packets can be successfully 2257 * passed to the non-VLAN aggregation flows. 2258 * 2259 * Note that there is possibly a race between this and 2260 * mac_unicast_remove/add() and VLAN packets could be incorrectly 2261 * classified to non-VLAN flows of non-aggregation mac clients. These 2262 * VLAN packets will be then filtered out by the mac module. 2263 */ 2264 if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0) 2265 flags |= FLOW_IGNORE_VLAN; 2266 2267 err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent); 2268 if (err != 0) { 2269 /* no registered receive function */ 2270 return (mp); 2271 } else { 2272 mac_client_impl_t *mcip; 2273 2274 /* 2275 * This flent might just be an additional one on the MAC client, 2276 * i.e. for classification purposes (different fdesc), however 2277 * the resources, SRS et. al., are in the mci_flent, so if 2278 * this isn't the mci_flent, we need to get it. 2279 */ 2280 if ((mcip = flent->fe_mcip) != NULL && 2281 mcip->mci_flent != flent) { 2282 FLOW_REFRELE(flent); 2283 flent = mcip->mci_flent; 2284 FLOW_TRY_REFHOLD(flent, err); 2285 if (err != 0) 2286 return (mp); 2287 } 2288 (flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp, 2289 B_FALSE); 2290 FLOW_REFRELE(flent); 2291 } 2292 return (NULL); 2293 } 2294 2295 mblk_t * 2296 mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) 2297 { 2298 mac_impl_t *mip = (mac_impl_t *)mh; 2299 mblk_t *bp, *bp1, **bpp, *list = NULL; 2300 2301 /* 2302 * We walk the chain and attempt to classify each packet. 2303 * The packets that couldn't be classified will be returned 2304 * back to the caller. 2305 */ 2306 bp = mp_chain; 2307 bpp = &list; 2308 while (bp != NULL) { 2309 bp1 = bp; 2310 bp = bp->b_next; 2311 bp1->b_next = NULL; 2312 2313 if (mac_rx_classify(mip, mrh, bp1) != NULL) { 2314 *bpp = bp1; 2315 bpp = &bp1->b_next; 2316 } 2317 } 2318 return (list); 2319 } 2320 2321 static int 2322 mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg) 2323 { 2324 mac_ring_handle_t ring = arg; 2325 2326 if (flent->fe_tx_srs) 2327 mac_tx_srs_wakeup(flent->fe_tx_srs, ring); 2328 return (0); 2329 } 2330 2331 void 2332 i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring) 2333 { 2334 mac_client_impl_t *cclient; 2335 mac_soft_ring_set_t *mac_srs; 2336 2337 /* 2338 * After grabbing the mi_rw_lock, the list of clients can't change. 2339 * If there are any clients mi_disabled must be B_FALSE and can't 2340 * get set since there are clients. If there aren't any clients we 2341 * don't do anything. In any case the mip has to be valid. The driver 2342 * must make sure that it goes single threaded (with respect to mac 2343 * calls) and wait for all pending mac calls to finish before calling 2344 * mac_unregister. 2345 */ 2346 rw_enter(&i_mac_impl_lock, RW_READER); 2347 if (mip->mi_state_flags & MIS_DISABLED) { 2348 rw_exit(&i_mac_impl_lock); 2349 return; 2350 } 2351 2352 /* 2353 * Get MAC tx srs from walking mac_client_handle list. 
 */
	rw_enter(&mip->mi_rw_lock, RW_READER);
	for (cclient = mip->mi_clients_list; cclient != NULL;
	    cclient = cclient->mci_client_next) {
		if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL)
			mac_tx_srs_wakeup(mac_srs, ring);
		(void) mac_flow_walk(cclient->mci_subflow_tab,
		    mac_tx_flow_srs_wakeup, ring);
	}
	rw_exit(&mip->mi_rw_lock);
	rw_exit(&i_mac_impl_lock);
}

/* ARGSUSED */
void
mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
    boolean_t add)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	i_mac_perim_enter((mac_impl_t *)mh);
	/*
	 * If no specific refresh function was given then default to the
	 * driver's m_multicst entry point.
	 */
	if (refresh == NULL) {
		refresh = mip->mi_multicst;
		arg = mip->mi_driver;
	}

	mac_bcast_refresh(mip, refresh, arg, add);
	i_mac_perim_exit((mac_impl_t *)mh);
}

void
mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	/*
	 * If no specific refresh function was given then default to the
	 * driver's m_promisc entry point.
	 */
	if (refresh == NULL) {
		refresh = mip->mi_setpromisc;
		arg = mip->mi_driver;
	}
	ASSERT(refresh != NULL);

	/*
	 * Call the refresh function with the current promiscuity.
	 */
	refresh(arg, (mip->mi_devpromisc != 0));
}

/*
 * A mac client requests that the mac not change its margin size to less
 * than the specified value. If "current" is B_TRUE, then the client
 * requests that the mac not change its margin size to smaller than the
 * current size. Further, return the current margin size value in this
 * case.
 *
 * We keep every requested size in an ordered list from largest to smallest.
 */
int
mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
{
	mac_impl_t		*mip = (mac_impl_t *)mh;
	mac_margin_req_t	**pp, *p;
	int			err = 0;

	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
	if (current)
		*marginp = mip->mi_margin;

	/*
	 * If the current margin value cannot satisfy the margin requested,
	 * return ENOTSUP directly.
	 */
	if (*marginp > mip->mi_margin) {
		err = ENOTSUP;
		goto done;
	}

	/*
	 * Check whether the given margin is already in the list. If so,
	 * bump the reference count.
	 */
	for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
		if (p->mmr_margin == *marginp) {
			/*
			 * The margin requested is already in the list,
			 * so just bump the reference count.
			 */
			p->mmr_ref++;
			goto done;
		}
		if (p->mmr_margin < *marginp)
			break;
	}

	p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
	p->mmr_margin = *marginp;
	p->mmr_ref++;
	p->mmr_nextp = *pp;
	*pp = p;

done:
	rw_exit(&(mip->mi_rw_lock));
	return (err);
}

/*
 * A mac client requests to cancel its previous mac_margin_add() request.
 * We remove the requested margin size from the list.
 */
int
mac_margin_remove(mac_handle_t mh, uint32_t margin)
{
	mac_impl_t		*mip = (mac_impl_t *)mh;
	mac_margin_req_t	**pp, *p;
	int			err = 0;

	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
	/*
	 * Find the entry in the list for the given margin.
	 */
	for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
		if (p->mmr_margin == margin) {
			if (--p->mmr_ref == 0)
				break;

			/*
			 * There is still a reference to this margin, so
			 * there's nothing more to do.
			 */
			goto done;
		}
	}

	/*
	 * We did not find an entry for the given margin.
	 */
	if (p == NULL) {
		err = ENOENT;
		goto done;
	}

	ASSERT(p->mmr_ref == 0);

	/*
	 * Remove it from the list.
	 */
	*pp = p->mmr_nextp;
	kmem_free(p, sizeof (mac_margin_req_t));
done:
	rw_exit(&(mip->mi_rw_lock));
	return (err);
}

boolean_t
mac_margin_update(mac_handle_t mh, uint32_t margin)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	uint32_t	margin_needed = 0;

	rw_enter(&(mip->mi_rw_lock), RW_WRITER);

	if (mip->mi_mmrp != NULL)
		margin_needed = mip->mi_mmrp->mmr_margin;

	if (margin_needed <= margin)
		mip->mi_margin = margin;

	rw_exit(&(mip->mi_rw_lock));

	if (margin_needed <= margin)
		i_mac_notify(mip, MAC_NOTE_MARGIN);

	return (margin_needed <= margin);
}

/*
 * MAC Type Plugin functions.
 */

mactype_t *
mactype_getplugin(const char *pname)
{
	mactype_t	*mtype = NULL;
	boolean_t	tried_modload = B_FALSE;

	mutex_enter(&i_mactype_lock);

find_registered_mactype:
	if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
	    (mod_hash_val_t *)&mtype) != 0) {
		if (!tried_modload) {
			/*
			 * If the plugin has not yet been loaded, then
			 * attempt to load it now. If modload() succeeds,
			 * the plugin should have registered using
			 * mactype_register(), in which case we can go back
			 * and attempt to find it again.
			 */
			if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
				tried_modload = B_TRUE;
				goto find_registered_mactype;
			}
		}
	} else {
		/*
		 * Note that there's no danger that the plugin we've loaded
		 * could be unloaded between the modload() step and the
		 * reference count bump here, as we're holding
		 * i_mactype_lock, which mactype_unregister() also holds.
		 */
		atomic_inc_32(&mtype->mt_ref);
	}

	mutex_exit(&i_mactype_lock);
	return (mtype);
}

mactype_register_t *
mactype_alloc(uint_t mactype_version)
{
	mactype_register_t *mtrp;

	/*
	 * Make sure there isn't a version mismatch between the plugin and
	 * the framework. In the future, if multiple versions are
	 * supported, this check could become more sophisticated.
	 */
	if (mactype_version != MACTYPE_VERSION)
		return (NULL);

	mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
	mtrp->mtr_version = mactype_version;
	return (mtrp);
}

void
mactype_free(mactype_register_t *mtrp)
{
	kmem_free(mtrp, sizeof (mactype_register_t));
}

int
mactype_register(mactype_register_t *mtrp)
{
	mactype_t	*mtp;
	mactype_ops_t	*ops = mtrp->mtr_ops;

	/* Do some sanity checking before we register this MAC type. */
	if (mtrp->mtr_ident == NULL || ops == NULL)
		return (EINVAL);

	/*
	 * Verify that all mandatory callbacks are set in the ops
	 * vector.
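	 *
	 * For example, a plugin's _init() routine would typically fill in
	 * an ops vector providing at least mtops_unicst_verify,
	 * mtops_multicst_verify, mtops_sap_verify, mtops_header and
	 * mtops_header_info, then register along these lines (a sketch;
	 * my_type_ops is illustrative only):
	 *
	 *	mtrp = mactype_alloc(MACTYPE_VERSION);
	 *	mtrp->mtr_ident = MAC_PLUGIN_IDENT_ETHER;
	 *	mtrp->mtr_ops = &my_type_ops;
	 *	err = mactype_register(mtrp);
	 *	mactype_free(mtrp);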
2615 */ 2616 if (ops->mtops_unicst_verify == NULL || 2617 ops->mtops_multicst_verify == NULL || 2618 ops->mtops_sap_verify == NULL || 2619 ops->mtops_header == NULL || 2620 ops->mtops_header_info == NULL) { 2621 return (EINVAL); 2622 } 2623 2624 mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP); 2625 mtp->mt_ident = mtrp->mtr_ident; 2626 mtp->mt_ops = *ops; 2627 mtp->mt_type = mtrp->mtr_mactype; 2628 mtp->mt_nativetype = mtrp->mtr_nativetype; 2629 mtp->mt_addr_length = mtrp->mtr_addrlen; 2630 if (mtrp->mtr_brdcst_addr != NULL) { 2631 mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP); 2632 bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr, 2633 mtrp->mtr_addrlen); 2634 } 2635 2636 mtp->mt_stats = mtrp->mtr_stats; 2637 mtp->mt_statcount = mtrp->mtr_statcount; 2638 2639 mtp->mt_mapping = mtrp->mtr_mapping; 2640 mtp->mt_mappingcount = mtrp->mtr_mappingcount; 2641 2642 if (mod_hash_insert(i_mactype_hash, 2643 (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) { 2644 kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length); 2645 kmem_free(mtp, sizeof (*mtp)); 2646 return (EEXIST); 2647 } 2648 return (0); 2649 } 2650 2651 int 2652 mactype_unregister(const char *ident) 2653 { 2654 mactype_t *mtp; 2655 mod_hash_val_t val; 2656 int err; 2657 2658 /* 2659 * Let's not allow MAC drivers to use this plugin while we're 2660 * trying to unregister it. Holding i_mactype_lock also prevents a 2661 * plugin from unregistering while a MAC driver is attempting to 2662 * hold a reference to it in i_mactype_getplugin(). 2663 */ 2664 mutex_enter(&i_mactype_lock); 2665 2666 if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident, 2667 (mod_hash_val_t *)&mtp)) != 0) { 2668 /* A plugin is trying to unregister, but it never registered. */ 2669 err = ENXIO; 2670 goto done; 2671 } 2672 2673 if (mtp->mt_ref != 0) { 2674 err = EBUSY; 2675 goto done; 2676 } 2677 2678 err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val); 2679 ASSERT(err == 0); 2680 if (err != 0) { 2681 /* This should never happen, thus the ASSERT() above. */ 2682 err = EINVAL; 2683 goto done; 2684 } 2685 ASSERT(mtp == (mactype_t *)val); 2686 2687 kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length); 2688 kmem_free(mtp, sizeof (mactype_t)); 2689 done: 2690 mutex_exit(&i_mactype_lock); 2691 return (err); 2692 } 2693 2694 /* 2695 * Returns TRUE when the specified property is intended for the MAC framework, 2696 * as opposed to driver defined properties. 2697 */ 2698 static boolean_t 2699 mac_is_macprop(mac_prop_t *macprop) 2700 { 2701 switch (macprop->mp_id) { 2702 case MAC_PROP_MAXBW: 2703 case MAC_PROP_PRIO: 2704 case MAC_PROP_BIND_CPU: 2705 return (B_TRUE); 2706 default: 2707 return (B_FALSE); 2708 } 2709 } 2710 2711 /* 2712 * mac_set_prop() sets mac or hardware driver properties: 2713 * mac properties include maxbw, priority, and cpu binding list. Driver 2714 * properties are private properties to the hardware, such as mtu, speed 2715 * etc. 2716 * If the property is a driver property, mac_set_prop() calls driver's callback 2717 * function to set it. 2718 * If the property is a mac property, mac_set_prop() invokes mac_set_resources() 2719 * which will cache the property value in mac_impl_t and may call 2720 * mac_client_set_resource() to update property value of the primary mac client, 2721 * if it exists. 
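 *
 * For example, setting the MTU might look like this (a sketch with error
 * handling elided; the mac_prop_t setup shown is illustrative only):
 *
 *	uint32_t mtu = 9000;
 *
 *	macprop.mp_id = MAC_PROP_MTU;
 *	err = mac_set_prop(mh, &macprop, &mtu, sizeof (mtu));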
2722 */ 2723 int 2724 mac_set_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize) 2725 { 2726 int err = ENOTSUP; 2727 mac_impl_t *mip = (mac_impl_t *)mh; 2728 2729 ASSERT(MAC_PERIM_HELD(mh)); 2730 2731 /* If it is mac property, call mac_set_resources() */ 2732 if (mac_is_macprop(macprop)) { 2733 mac_resource_props_t mrp; 2734 2735 if (valsize < sizeof (mac_resource_props_t)) 2736 return (EINVAL); 2737 bzero(&mrp, sizeof (mac_resource_props_t)); 2738 bcopy(val, &mrp, sizeof (mrp)); 2739 return (mac_set_resources(mh, &mrp)); 2740 } 2741 switch (macprop->mp_id) { 2742 case MAC_PROP_MTU: { 2743 uint32_t mtu; 2744 2745 if (valsize < sizeof (mtu)) 2746 return (EINVAL); 2747 bcopy(val, &mtu, sizeof (mtu)); 2748 err = mac_set_mtu(mh, mtu, NULL); 2749 break; 2750 } 2751 default: 2752 /* For other driver properties, call driver's callback */ 2753 if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) { 2754 err = mip->mi_callbacks->mc_setprop(mip->mi_driver, 2755 macprop->mp_name, macprop->mp_id, valsize, val); 2756 } 2757 } 2758 return (err); 2759 } 2760 2761 /* 2762 * mac_get_prop() gets mac or hardware driver properties. 2763 * 2764 * If the property is a driver property, mac_get_prop() calls driver's callback 2765 * function to get it. 2766 * If the property is a mac property, mac_get_prop() invokes mac_get_resources() 2767 * which returns the cached value in mac_impl_t. 2768 */ 2769 int 2770 mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize, 2771 uint_t *perm) 2772 { 2773 int err = ENOTSUP; 2774 mac_impl_t *mip = (mac_impl_t *)mh; 2775 uint32_t sdu; 2776 link_state_t link_state; 2777 2778 /* If mac property, read from cache */ 2779 if (mac_is_macprop(macprop)) { 2780 mac_resource_props_t mrp; 2781 2782 if (valsize < sizeof (mac_resource_props_t)) 2783 return (EINVAL); 2784 bzero(&mrp, sizeof (mac_resource_props_t)); 2785 mac_get_resources(mh, &mrp); 2786 bcopy(&mrp, val, sizeof (mac_resource_props_t)); 2787 return (0); 2788 } 2789 2790 switch (macprop->mp_id) { 2791 case MAC_PROP_MTU: 2792 if (valsize < sizeof (sdu)) 2793 return (EINVAL); 2794 if ((macprop->mp_flags & MAC_PROP_DEFAULT) == 0) { 2795 mac_sdu_get(mh, NULL, &sdu); 2796 bcopy(&sdu, val, sizeof (sdu)); 2797 if ((mip->mi_callbacks->mc_callbacks & MC_SETPROP) && 2798 (mip->mi_callbacks->mc_setprop(mip->mi_driver, 2799 macprop->mp_name, macprop->mp_id, valsize, 2800 val) == 0)) { 2801 *perm = MAC_PROP_PERM_RW; 2802 } else { 2803 *perm = MAC_PROP_PERM_READ; 2804 } 2805 return (0); 2806 } else { 2807 if (mip->mi_info.mi_media == DL_ETHER) { 2808 sdu = ETHERMTU; 2809 bcopy(&sdu, val, sizeof (sdu)); 2810 2811 return (0); 2812 } 2813 /* 2814 * ask driver for its default. 
			 */
			break;
		}
	case MAC_PROP_STATUS:
		if (valsize < sizeof (link_state))
			return (EINVAL);
		*perm = MAC_PROP_PERM_READ;
		link_state = mac_link_get(mh);
		bcopy(&link_state, val, sizeof (link_state));
		return (0);
	default:
		break;
	}

	/* If driver property, request from driver */
	if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) {
		err = mip->mi_callbacks->mc_getprop(mip->mi_driver,
		    macprop->mp_name, macprop->mp_id, macprop->mp_flags,
		    valsize, val, perm);
	}
	return (err);
}

void
mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
{
	mac_priv_prop_t *mpriv;

	if (mpp == NULL)
		return;

	mpriv = kmem_zalloc(nprop * sizeof (*mpriv), KM_SLEEP);
	(void) memcpy(mpriv, mpp, nprop * sizeof (*mpriv));
	mip->mi_priv_prop = mpriv;
	mip->mi_priv_prop_count = nprop;
}

void
mac_unregister_priv_prop(mac_impl_t *mip)
{
	mac_priv_prop_t	*mpriv;

	mpriv = mip->mi_priv_prop;
	if (mpriv != NULL) {
		kmem_free(mpriv, mip->mi_priv_prop_count * sizeof (*mpriv));
		mip->mi_priv_prop = NULL;
	}
	mip->mi_priv_prop_count = 0;
}

/*
 * mac_ring_t 'mr' macros. Some rogue drivers may access the ring structure
 * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
 * cases, if MAC freed the ring structure after mac_stop_ring(), any
 * illegal access to the ring structure coming from the driver would panic
 * the system. In order to protect the system from such inadvertent access,
 * we maintain a cache of rings in the mac_impl_t after they get freed.
 * When packets are received on freed rings, MAC (through the generation
 * count mechanism) will drop such packets.
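 *
 * For example (a sketch, not the exact code): the driver passes back the
 * generation number it was started with on each receive, and the receive
 * path drops the chain when that number no longer matches the ring's
 * current mr_gen_num:
 *
 *	if (ring->mr_gen_num != gen_num) {
 *		freemsgchain(mp_chain);		stale ring, drop
 *		return;
 *	}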
2874 */ 2875 static mac_ring_t * 2876 mac_ring_alloc(mac_impl_t *mip, mac_capab_rings_t *cap_rings) 2877 { 2878 mac_ring_t *ring; 2879 2880 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2881 mutex_enter(&mip->mi_ring_lock); 2882 if (mip->mi_ring_freelist != NULL) { 2883 ring = mip->mi_ring_freelist; 2884 mip->mi_ring_freelist = ring->mr_next; 2885 bzero(ring, sizeof (mac_ring_t)); 2886 } else { 2887 ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP); 2888 } 2889 mutex_exit(&mip->mi_ring_lock); 2890 } else { 2891 ring = kmem_zalloc(sizeof (mac_ring_t), KM_SLEEP); 2892 } 2893 ASSERT((ring != NULL) && (ring->mr_state == MR_FREE)); 2894 return (ring); 2895 } 2896 2897 static void 2898 mac_ring_free(mac_impl_t *mip, mac_ring_t *ring) 2899 { 2900 if (ring->mr_type == MAC_RING_TYPE_RX) { 2901 mutex_enter(&mip->mi_ring_lock); 2902 ring->mr_state = MR_FREE; 2903 ring->mr_flag = 0; 2904 ring->mr_next = mip->mi_ring_freelist; 2905 mip->mi_ring_freelist = ring; 2906 mutex_exit(&mip->mi_ring_lock); 2907 } else { 2908 kmem_free(ring, sizeof (mac_ring_t)); 2909 } 2910 } 2911 2912 static void 2913 mac_ring_freeall(mac_impl_t *mip) 2914 { 2915 mac_ring_t *ring_next; 2916 mutex_enter(&mip->mi_ring_lock); 2917 mac_ring_t *ring = mip->mi_ring_freelist; 2918 while (ring != NULL) { 2919 ring_next = ring->mr_next; 2920 kmem_cache_free(mac_ring_cache, ring); 2921 ring = ring_next; 2922 } 2923 mip->mi_ring_freelist = NULL; 2924 mutex_exit(&mip->mi_ring_lock); 2925 } 2926 2927 int 2928 mac_start_ring(mac_ring_t *ring) 2929 { 2930 int rv = 0; 2931 2932 if (ring->mr_start != NULL) 2933 rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num); 2934 2935 return (rv); 2936 } 2937 2938 void 2939 mac_stop_ring(mac_ring_t *ring) 2940 { 2941 if (ring->mr_stop != NULL) 2942 ring->mr_stop(ring->mr_driver); 2943 2944 /* 2945 * Increment the ring generation number for this ring. 2946 */ 2947 ring->mr_gen_num++; 2948 } 2949 2950 int 2951 mac_start_group(mac_group_t *group) 2952 { 2953 int rv = 0; 2954 2955 if (group->mrg_start != NULL) 2956 rv = group->mrg_start(group->mrg_driver); 2957 2958 return (rv); 2959 } 2960 2961 void 2962 mac_stop_group(mac_group_t *group) 2963 { 2964 if (group->mrg_stop != NULL) 2965 group->mrg_stop(group->mrg_driver); 2966 } 2967 2968 /* 2969 * Called from mac_start() on the default Rx group. Broadcast and multicast 2970 * packets are received only on the default group. Hence the default group 2971 * needs to be up even if the primary client is not up, for the other groups 2972 * to be functional. We do this by calling this function at mac_start time 2973 * itself. However the broadcast packets that are received can't make their 2974 * way beyond mac_rx until a mac client creates a broadcast flow. 
2975 */ 2976 static int 2977 mac_start_group_and_rings(mac_group_t *group) 2978 { 2979 mac_ring_t *ring; 2980 int rv = 0; 2981 2982 ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED); 2983 if ((rv = mac_start_group(group)) != 0) 2984 return (rv); 2985 2986 for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { 2987 ASSERT(ring->mr_state == MR_FREE); 2988 if ((rv = mac_start_ring(ring)) != 0) 2989 goto error; 2990 ring->mr_state = MR_INUSE; 2991 ring->mr_classify_type = MAC_SW_CLASSIFIER; 2992 } 2993 return (0); 2994 2995 error: 2996 mac_stop_group_and_rings(group); 2997 return (rv); 2998 } 2999 3000 /* Called from mac_stop on the default Rx group */ 3001 static void 3002 mac_stop_group_and_rings(mac_group_t *group) 3003 { 3004 mac_ring_t *ring; 3005 3006 for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { 3007 if (ring->mr_state != MR_FREE) { 3008 mac_stop_ring(ring); 3009 ring->mr_state = MR_FREE; 3010 ring->mr_flag = 0; 3011 ring->mr_classify_type = MAC_NO_CLASSIFIER; 3012 } 3013 } 3014 mac_stop_group(group); 3015 } 3016 3017 3018 static mac_ring_t * 3019 mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index, 3020 mac_capab_rings_t *cap_rings) 3021 { 3022 mac_ring_t *ring; 3023 mac_ring_info_t ring_info; 3024 3025 ring = mac_ring_alloc(mip, cap_rings); 3026 3027 /* Prepare basic information of ring */ 3028 ring->mr_index = index; 3029 ring->mr_type = group->mrg_type; 3030 ring->mr_gh = (mac_group_handle_t)group; 3031 3032 /* Insert the new ring to the list. */ 3033 ring->mr_next = group->mrg_rings; 3034 group->mrg_rings = ring; 3035 3036 /* Zero to reuse the info data structure */ 3037 bzero(&ring_info, sizeof (ring_info)); 3038 3039 /* Query ring information from driver */ 3040 cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index, 3041 index, &ring_info, (mac_ring_handle_t)ring); 3042 3043 ring->mr_info = ring_info; 3044 3045 /* Update ring's status */ 3046 ring->mr_state = MR_FREE; 3047 ring->mr_flag = 0; 3048 3049 /* Update the ring count of the group */ 3050 group->mrg_cur_count++; 3051 return (ring); 3052 } 3053 3054 /* 3055 * Rings are chained together for easy regrouping. 3056 */ 3057 static void 3058 mac_init_group(mac_impl_t *mip, mac_group_t *group, int size, 3059 mac_capab_rings_t *cap_rings) 3060 { 3061 int index; 3062 3063 /* 3064 * Initialize all ring members of this group. Size of zero will not 3065 * enter the loop, so it's safe for initializing an empty group. 3066 */ 3067 for (index = size - 1; index >= 0; index--) 3068 (void) mac_init_ring(mip, group, index, cap_rings); 3069 } 3070 3071 int 3072 mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype) 3073 { 3074 mac_capab_rings_t *cap_rings; 3075 mac_group_t *group, *groups; 3076 mac_group_info_t group_info; 3077 uint_t group_free = 0; 3078 uint_t ring_left; 3079 mac_ring_t *ring; 3080 int g, err = 0; 3081 3082 switch (rtype) { 3083 case MAC_RING_TYPE_RX: 3084 ASSERT(mip->mi_rx_groups == NULL); 3085 3086 cap_rings = &mip->mi_rx_rings_cap; 3087 cap_rings->mr_type = MAC_RING_TYPE_RX; 3088 break; 3089 case MAC_RING_TYPE_TX: 3090 ASSERT(mip->mi_tx_groups == NULL); 3091 3092 cap_rings = &mip->mi_tx_rings_cap; 3093 cap_rings->mr_type = MAC_RING_TYPE_TX; 3094 break; 3095 default: 3096 ASSERT(B_FALSE); 3097 } 3098 3099 if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS, 3100 cap_rings)) 3101 return (0); 3102 3103 /* 3104 * Allocate a contiguous buffer for all groups. 
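 * The buffer holds the cap_rings->mr_gnum driver-advertised groups plus
 * one trailing pseudo-group (mrg_index of -1) that pools any rings the
 * driver left ungrouped.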
 */
	groups = kmem_zalloc(sizeof (mac_group_t) * (cap_rings->mr_gnum + 1),
	    KM_SLEEP);

	ring_left = cap_rings->mr_rnum;

	/*
	 * Get all ring groups if any, and get their ring members
	 * if any.
	 */
	for (g = 0; g < cap_rings->mr_gnum; g++) {
		group = groups + g;

		/* Prepare basic information of the group */
		group->mrg_index = g;
		group->mrg_type = rtype;
		group->mrg_state = MAC_GROUP_STATE_UNINIT;
		group->mrg_mh = (mac_handle_t)mip;
		group->mrg_next = group + 1;

		/* Zero to reuse the info data structure */
		bzero(&group_info, sizeof (group_info));

		/* Query group information from driver */
		cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
		    (mac_group_handle_t)group);

		switch (cap_rings->mr_group_type) {
		case MAC_GROUP_TYPE_DYNAMIC:
			if (cap_rings->mr_gaddring == NULL ||
			    cap_rings->mr_gremring == NULL) {
				DTRACE_PROBE3(
				    mac__init__rings_no_addremring,
				    char *, mip->mi_name,
				    mac_group_add_ring_t,
				    cap_rings->mr_gaddring,
				    mac_group_add_ring_t,
				    cap_rings->mr_gremring);
				err = EINVAL;
				goto bail;
			}

			switch (rtype) {
			case MAC_RING_TYPE_RX:
				/*
				 * The first RX group must have non-zero
				 * rings, and the following groups must
				 * have zero rings.
				 */
				if (g == 0 && group_info.mgi_count == 0) {
					DTRACE_PROBE1(
					    mac__init__rings__rx__def__zero,
					    char *, mip->mi_name);
					err = EINVAL;
					goto bail;
				}
				if (g > 0 && group_info.mgi_count != 0) {
					DTRACE_PROBE3(
					    mac__init__rings__rx__nonzero,
					    char *, mip->mi_name,
					    int, g, int, group_info.mgi_count);
					err = EINVAL;
					goto bail;
				}
				break;
			case MAC_RING_TYPE_TX:
				/*
				 * All TX ring groups must have zero rings.
				 */
				if (group_info.mgi_count != 0) {
					DTRACE_PROBE3(
					    mac__init__rings__tx__nonzero,
					    char *, mip->mi_name,
					    int, g, int, group_info.mgi_count);
					err = EINVAL;
					goto bail;
				}
				break;
			}
			break;
		case MAC_GROUP_TYPE_STATIC:
			/*
			 * Note that an empty group is allowed, e.g., an aggr
			 * would start with an empty group.
			 */
			break;
		default:
			/* unknown group type */
			DTRACE_PROBE2(mac__init__rings__unknown__type,
			    char *, mip->mi_name,
			    int, cap_rings->mr_group_type);
			err = EINVAL;
			goto bail;
		}

		/*
		 * The driver must register mgi_addmac()/mgi_remmac() for Rx
		 * groups to support multiple MAC addresses.
		 */
		if (rtype == MAC_RING_TYPE_RX) {
			if (group_info.mgi_addmac == NULL ||
			    group_info.mgi_remmac == NULL) {
				err = EINVAL;
				goto bail;
			}
		}

		/* Cache driver-supplied information */
		group->mrg_info = group_info;

		/* Update the group's status and group count. */
		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
		group_free++;

		group->mrg_rings = NULL;
		group->mrg_cur_count = 0;
		mac_init_group(mip, group, group_info.mgi_count, cap_rings);
		ring_left -= group_info.mgi_count;

		/* The current group size should equal the default value */
		ASSERT(group->mrg_cur_count == group_info.mgi_count);
	}

	/* Build up a dummy group for free resources as a pool */
	group = groups + cap_rings->mr_gnum;

	/* Prepare basic information of the group */
	group->mrg_index = -1;
	group->mrg_type = rtype;
	group->mrg_state = MAC_GROUP_STATE_UNINIT;
	group->mrg_mh = (mac_handle_t)mip;
	group->mrg_next = NULL;

	/*
	 * If there are ungrouped rings, chain them into the dummy group
	 * as a pool of spare resources.
	 */
	if (ring_left != 0) {
		group->mrg_rings = NULL;
		group->mrg_cur_count = 0;
		mac_init_group(mip, group, ring_left, cap_rings);

		/* The current group size should be equal to ring_left */
		ASSERT(group->mrg_cur_count == ring_left);

		ring_left = 0;

		/* Update this group's status */
		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
	} else
		group->mrg_rings = NULL;

	ASSERT(ring_left == 0);

bail:
	/* Cache other important information to finalize the initialization */
	switch (rtype) {
	case MAC_RING_TYPE_RX:
		mip->mi_rx_group_type = cap_rings->mr_group_type;
		mip->mi_rx_group_count = cap_rings->mr_gnum;
		mip->mi_rx_groups = groups;
		break;
	case MAC_RING_TYPE_TX:
		mip->mi_tx_group_type = cap_rings->mr_group_type;
		mip->mi_tx_group_count = cap_rings->mr_gnum;
		mip->mi_tx_group_free = group_free;
		mip->mi_tx_groups = groups;

		/*
		 * Ring 0 is used as the default ring and it could be
		 * assigned to a client as well.
		 */
		group = groups + cap_rings->mr_gnum;
		ring = group->mrg_rings;
		while ((ring->mr_index != 0) && (ring->mr_next != NULL))
			ring = ring->mr_next;
		ASSERT(ring->mr_index == 0);
		mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
		break;
	default:
		ASSERT(B_FALSE);
	}

	if (err != 0)
		mac_free_rings(mip, rtype);

	return (err);
}

/*
 * Called to free all ring groups of a particular type. It is assumed that
 * all the groups have been released by the client.
 */
void
mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
{
	mac_group_t *group, *groups;
	uint_t group_count;

	switch (rtype) {
	case MAC_RING_TYPE_RX:
		if (mip->mi_rx_groups == NULL)
			return;

		groups = mip->mi_rx_groups;
		group_count = mip->mi_rx_group_count;

		mip->mi_rx_groups = NULL;
		mip->mi_rx_group_count = 0;
		break;
	case MAC_RING_TYPE_TX:
		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);

		if (mip->mi_tx_groups == NULL)
			return;

		groups = mip->mi_tx_groups;
		group_count = mip->mi_tx_group_count;

		mip->mi_tx_groups = NULL;
		mip->mi_tx_group_count = 0;
		mip->mi_tx_group_free = 0;
		mip->mi_default_tx_ring = NULL;
		break;
	default:
		ASSERT(B_FALSE);
	}

	for (group = groups; group != NULL; group = group->mrg_next) {
		mac_ring_t *ring;

		if (group->mrg_cur_count == 0)
			continue;

		ASSERT(group->mrg_rings != NULL);

		while ((ring = group->mrg_rings) != NULL) {
			group->mrg_rings = ring->mr_next;
			mac_ring_free(mip, ring);
		}
	}

	/* Free all the cached rings */
	mac_ring_freeall(mip);
	/* Free the block of group data structures */
	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
}

/*
 * Associate a MAC address with a receive group.
 *
 * The return value of this function should always be checked properly,
 * because any type of failure could cause unexpected results. A MAC address
 * can be added to or removed from a group only after the group has been
 * reserved. Ideally, a successful reservation always leads to calling
 * mac_group_addmac() to steer desired traffic. Failure to add a unicast MAC
 * address doesn't always imply that the group is functioning abnormally.
 *
 * Currently this function is called everywhere, and it reflects assumptions
 * about MAC addresses in the implementation. CR 6735196.
 */
int
mac_group_addmac(mac_group_t *group, const uint8_t *addr)
{
	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
	ASSERT(group->mrg_info.mgi_addmac != NULL);

	return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
}

/*
 * Remove the association between MAC address and receive group.
 */
int
mac_group_remmac(mac_group_t *group, const uint8_t *addr)
{
	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
	ASSERT(group->mrg_info.mgi_remmac != NULL);

	return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
}

/*
 * Release a ring in use by marking it MR_FREE.
 * Any other client may reserve it for its use.
 */
void
mac_release_tx_ring(mac_ring_handle_t rh)
{
	mac_ring_t *ring = (mac_ring_t *)rh;
	mac_group_t *group = (mac_group_t *)ring->mr_gh;
	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(ring->mr_state != MR_FREE);

	/*
	 * The default tx ring will be released by mac_stop().
	 */
	if (rh == mip->mi_default_tx_ring)
		return;

	mac_stop_ring(ring);

	ring->mr_state = MR_FREE;
	ring->mr_flag = 0;
}

/*
 * Send packets through a selected tx ring.
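 *
 * For example (a sketch): a client that has reserved a ring can send a
 * chain directly and requeue whatever the driver could not accept:
 *
 *	mp = mac_ring_tx(rh, mp);
 *	if (mp != NULL)
 *		out of Tx descriptors; requeue mp and retry later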
 */
mblk_t *
mac_ring_tx(mac_ring_handle_t rh, mblk_t *mp)
{
	mac_ring_t *ring = (mac_ring_t *)rh;
	mac_ring_info_t *info = &ring->mr_info;

	ASSERT(ring->mr_type == MAC_RING_TYPE_TX);
	ASSERT(ring->mr_state >= MR_INUSE);
	ASSERT(info->mri_tx != NULL);

	return (info->mri_tx(info->mri_driver, mp));
}

/*
 * Find a ring from its index.
 */
mac_ring_t *
mac_find_ring(mac_group_t *group, int index)
{
	mac_ring_t *ring;

	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
		if (ring->mr_index == index)
			break;

	return (ring);
}

/*
 * Add a ring to an existing group.
 *
 * The ring must be either passed directly (for example if the ring
 * movement is initiated by the framework), or specified through a driver
 * index (for example when the ring is added by the driver).
 *
 * The caller needs to call mac_perim_enter() before calling this function.
 */
int
i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
{
	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
	mac_capab_rings_t *cap_rings;
	boolean_t driver_call = (ring == NULL);
	mac_group_type_t group_type;
	int ret = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	switch (group->mrg_type) {
	case MAC_RING_TYPE_RX:
		cap_rings = &mip->mi_rx_rings_cap;
		group_type = mip->mi_rx_group_type;
		break;
	case MAC_RING_TYPE_TX:
		cap_rings = &mip->mi_tx_rings_cap;
		group_type = mip->mi_tx_group_type;
		break;
	default:
		ASSERT(B_FALSE);
	}

	/*
	 * There should be no ring with the same ring index in the target
	 * group.
	 */
	ASSERT(mac_find_ring(group, driver_call ? index : ring->mr_index) ==
	    NULL);

	if (driver_call) {
		/*
		 * The function is called as a result of a request from
		 * a driver to add a ring to an existing group, for example
		 * from the aggregation driver. Allocate a new mac_ring_t
		 * for that ring.
		 */
		ring = mac_init_ring(mip, group, index, cap_rings);
		ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
	} else {
		/*
		 * The function is called as a result of a MAC layer request
		 * to add a ring to an existing group. In this case the
		 * ring is being moved between groups, which requires
		 * the underlying driver to support dynamic grouping,
		 * and the mac_ring_t already exists.
		 */
		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
		ASSERT(cap_rings->mr_gaddring != NULL);
		ASSERT(ring->mr_gh == NULL);
	}

	/*
	 * At this point the ring should not be in use, and it should be
	 * of the right type for the target group.
	 */
	ASSERT(ring->mr_state < MR_INUSE);
	ASSERT(ring->mr_srs == NULL);
	ASSERT(ring->mr_type == group->mrg_type);

	if (!driver_call) {
		/*
		 * Add the driver level hardware ring if the process was not
		 * initiated by the driver, and the target group is not the
		 * default group.
		 */
		if (group->mrg_driver != NULL) {
			cap_rings->mr_gaddring(group->mrg_driver,
			    ring->mr_driver, ring->mr_type);
		}

		/*
		 * Insert the ring ahead of the existing rings.
		 */
		ring->mr_next = group->mrg_rings;
		group->mrg_rings = ring;
		ring->mr_gh = (mac_group_handle_t)group;
		group->mrg_cur_count++;
	}

	/*
	 * If the group has not been actively used, we're done.
	 */
	if (group->mrg_index != -1 &&
	    group->mrg_state < MAC_GROUP_STATE_RESERVED)
		return (0);

	/*
	 * Set up the SRS/SR according to the ring type.
	 */
	switch (ring->mr_type) {
	case MAC_RING_TYPE_RX:
		/*
		 * Set up an SRS on top of the new ring if the group is
		 * reserved for someone's exclusive use.
		 */
		if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
			flow_entry_t *flent;
			mac_client_impl_t *mcip;

			mcip = MAC_RX_GROUP_ONLY_CLIENT(group);
			ASSERT(mcip != NULL);
			flent = mcip->mci_flent;
			ASSERT(flent->fe_rx_srs_cnt > 0);
			mac_srs_group_setup(mcip, flent, group, SRST_LINK);
		}
		break;
	case MAC_RING_TYPE_TX:
		/*
		 * For TX this function is only invoked during the
		 * initial creation of a group when a share is
		 * associated with a MAC client. So the datapath is not
		 * yet setup, and will be setup later after the
		 * group has been reserved and populated.
		 */
		break;
	default:
		ASSERT(B_FALSE);
	}

	/*
	 * Start the ring if needed. On failure, undo the grouping action.
	 */
	if ((ret = mac_start_ring(ring)) != 0) {
		if (ring->mr_type == MAC_RING_TYPE_RX) {
			if (ring->mr_srs != NULL) {
				mac_rx_srs_remove(ring->mr_srs);
				ring->mr_srs = NULL;
			}
		}
		if (!driver_call) {
			cap_rings->mr_gremring(group->mrg_driver,
			    ring->mr_driver, ring->mr_type);
		}
		group->mrg_cur_count--;
		group->mrg_rings = ring->mr_next;

		ring->mr_gh = NULL;

		if (driver_call)
			mac_ring_free(mip, ring);

		return (ret);
	}

	/*
	 * Update the ring's state.
	 */
	ring->mr_state = MR_INUSE;
	MAC_RING_UNMARK(ring, MR_INCIPIENT);
	return (0);
}

/*
 * Remove a ring from its current group. MAC internal function for dynamic
 * grouping.
 *
 * The caller needs to call mac_perim_enter() before calling this function.
 */
void
i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
    boolean_t driver_call)
{
	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
	mac_capab_rings_t *cap_rings = NULL;
	mac_group_type_t group_type;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	ASSERT(mac_find_ring(group, ring->mr_index) == ring);
	ASSERT((mac_group_t *)ring->mr_gh == group);
	ASSERT(ring->mr_type == group->mrg_type);

	switch (ring->mr_type) {
	case MAC_RING_TYPE_RX:
		group_type = mip->mi_rx_group_type;
		cap_rings = &mip->mi_rx_rings_cap;

		if (group->mrg_state >= MAC_GROUP_STATE_RESERVED)
			mac_stop_ring(ring);

		/*
		 * Only hardware classified packets hold a reference to the
		 * ring all the way up the Rx path. mac_rx_srs_remove()
		 * will take care of quiescing the Rx path and removing the
		 * SRS. The software classified path neither holds a reference
		 * nor any association with the ring in mac_rx.
		 */
		if (ring->mr_srs != NULL) {
			mac_rx_srs_remove(ring->mr_srs);
			ring->mr_srs = NULL;
		}
		ring->mr_state = MR_FREE;
		ring->mr_flag = 0;

		break;
	case MAC_RING_TYPE_TX:
		/*
		 * For TX this function is only invoked in two
		 * cases:
		 *
		 * 1) In the case of a failure during the
		 * initial creation of a group when a share is
		 * associated with a MAC client. So the SRS is not
		 * yet setup, and will be setup later after the
		 * group has been reserved and populated.
		 *
		 * 2) From mac_release_tx_group() when freeing
		 * a TX SRS.
		 *
		 * In both cases the SRS and its soft rings are
		 * already quiesced.
		 */
		ASSERT(!driver_call);
		group_type = mip->mi_tx_group_type;
		cap_rings = &mip->mi_tx_rings_cap;
		break;
	default:
		ASSERT(B_FALSE);
	}

	/*
	 * Remove the ring from the group.
	 */
	if (ring == group->mrg_rings)
		group->mrg_rings = ring->mr_next;
	else {
		mac_ring_t *pre;

		pre = group->mrg_rings;
		while (pre->mr_next != ring)
			pre = pre->mr_next;
		pre->mr_next = ring->mr_next;
	}
	group->mrg_cur_count--;

	if (!driver_call) {
		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
		ASSERT(cap_rings->mr_gremring != NULL);

		/*
		 * Remove the driver level hardware ring.
		 */
		if (group->mrg_driver != NULL) {
			cap_rings->mr_gremring(group->mrg_driver,
			    ring->mr_driver, ring->mr_type);
		}
	}

	ring->mr_gh = NULL;
	if (driver_call) {
		mac_ring_free(mip, ring);
	} else {
		ring->mr_state = MR_FREE;
		ring->mr_flag = 0;
	}
}

/*
 * Move a ring to the target group. If needed, remove the ring from the group
 * that it currently belongs to.
 *
 * The caller needs to enter the MAC's perimeter by calling mac_perim_enter().
 */
static int
mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
{
	mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
	int rv;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(d_group != NULL);
	ASSERT(s_group->mrg_mh == d_group->mrg_mh);

	if (s_group == d_group)
		return (0);

	/*
	 * Remove it from the current group first.
	 */
	if (s_group != NULL)
		i_mac_group_rem_ring(s_group, ring, B_FALSE);

	/*
	 * Add it to the new group.
	 */
	rv = i_mac_group_add_ring(d_group, ring, 0);
	if (rv != 0) {
		/*
		 * Failed to add the ring to the destination group.
		 * Try to add it back to the source group; if that
		 * also fails, the ring is stuck in limbo, so log a
		 * message.
		 */
		if (i_mac_group_add_ring(s_group, ring, 0)) {
			cmn_err(CE_WARN, "%s: failed to move ring %p\n",
			    mip->mi_name, (void *)ring);
		}
	}

	return (rv);
}

/*
 * Find a MAC address according to its value.
 */
mac_address_t *
mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
{
	mac_address_t *map;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
		if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
			break;
	}

	return (map);
}

/*
 * Check whether the MAC address is shared by multiple clients.
 */
boolean_t
mac_check_macaddr_shared(mac_address_t *map)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));

	return (map->ma_nusers > 1);
}

/*
 * Remove the specified MAC address from the MAC address list and free it.
 */
static void
mac_free_macaddr(mac_address_t *map)
{
	mac_impl_t *mip = map->ma_mip;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_addresses != NULL);

	map = mac_find_macaddr(mip, map->ma_addr);

	ASSERT(map != NULL);
	ASSERT(map->ma_nusers == 0);

	if (map == mip->mi_addresses) {
		mip->mi_addresses = map->ma_next;
	} else {
		mac_address_t *pre;

		pre = mip->mi_addresses;
		while (pre->ma_next != map)
			pre = pre->ma_next;
		pre->ma_next = map->ma_next;
	}

	kmem_free(map, sizeof (mac_address_t));
}

/*
 * Add a MAC address reference for a client. If the desired MAC address
 * exists, add a reference to it. Otherwise, add the new address by adding
 * it to a reserved group or by setting promiscuous mode. We won't try a
 * different group if the given group is non-NULL, so the caller must
 * explicitly share the default group when needed.
 *
 * Note that the primary MAC address is initialized at registration time, so
 * adding it to the default group only requires activating it if its
 * reference count is still zero. Also, some drivers may not have advertised
 * the RINGS capability.
 */
int
mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
    boolean_t use_hw)
{
	mac_address_t *map;
	int err = 0;
	boolean_t allocated_map = B_FALSE;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	map = mac_find_macaddr(mip, mac_addr);

	/*
	 * If the new MAC address has not been added yet, allocate a new
	 * entry and set it up.
	 */
	if (map == NULL) {
		map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
		map->ma_len = mip->mi_type->mt_addr_length;
		bcopy(mac_addr, map->ma_addr, map->ma_len);
		map->ma_nusers = 0;
		map->ma_group = group;
		map->ma_mip = mip;

		/* add the new MAC address to the head of the address list */
		map->ma_next = mip->mi_addresses;
		mip->mi_addresses = map;

		allocated_map = B_TRUE;
	}

	ASSERT(map->ma_group == group);

	/*
	 * If the MAC address is already in use, simply account for the
	 * new client.
	 */
	if (map->ma_nusers++ > 0)
		return (0);

	/*
	 * Activate this MAC address by adding it to the reserved group.
	 */
	if (group != NULL) {
		err = mac_group_addmac(group, (const uint8_t *)mac_addr);
		if (err == 0) {
			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
			return (0);
		}
	}

	/*
	 * The MAC address addition failed. If the client requires a
	 * hardware classified MAC address, fail the operation.
	 */
	if (use_hw) {
		err = ENOSPC;
		goto bail;
	}

	/*
	 * Try promiscuous mode.
	 *
	 * For drivers that don't advertise the RINGS capability, do
	 * nothing for the primary address.
	 */
	if ((group == NULL) &&
	    (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
		return (0);
	}

	/*
	 * Enable promiscuous mode in order to receive traffic
	 * to the new MAC address.
/*
 * Remove a reference to a MAC address. This may cause the MAC address to
 * be removed from its associated group, or promiscuous mode to be turned
 * off. The caller needs to handle the failure properly.
 */
int
mac_remove_macaddr(mac_address_t *map)
{
	mac_impl_t *mip = map->ma_mip;
	int err = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	ASSERT(map == mac_find_macaddr(mip, map->ma_addr));

	/*
	 * If it's not the last client using this MAC address, only update
	 * the MAC clients count.
	 */
	if (--map->ma_nusers > 0)
		return (0);

	/*
	 * The MAC address is no longer used by any MAC client, so remove
	 * it from its associated group, or turn off promiscuous mode
	 * if it was enabled for the MAC address.
	 */
	switch (map->ma_type) {
	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
		/*
		 * Don't free the preset primary address for drivers that
		 * don't advertise the RINGS capability.
		 */
		if (map->ma_group == NULL)
			return (0);

		err = mac_group_remmac(map->ma_group, map->ma_addr);
		break;
	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
		err = i_mac_promisc_set(mip, B_FALSE, MAC_DEVPROMISC);
		break;
	default:
		ASSERT(B_FALSE);
	}

	if (err != 0)
		return (err);

	/*
	 * We created the entry for the primary MAC address at registration,
	 * so we don't free it here. mac_fini_macaddr() will take care of it.
	 */
	if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
		mac_free_macaddr(map);

	return (0);
}

/*
 * Update an existing MAC address. The caller needs to make sure that the
 * new value has not been used.
 */
int
mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
{
	mac_impl_t *mip = map->ma_mip;
	int err = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);

	switch (map->ma_type) {
	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
		/*
		 * Update the primary address for drivers that are not
		 * RINGS capable.
		 */
		if (map->ma_group == NULL) {
			err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
			    mac_addr);
			if (err != 0)
				return (err);
			break;
		}

		/*
		 * If this MAC address is not currently in use,
		 * simply break out and update the value.
		 */
		if (map->ma_nusers == 0)
			break;

		/*
		 * Need to replace the MAC address associated with a group.
		 */
		err = mac_group_remmac(map->ma_group, map->ma_addr);
		if (err != 0)
			return (err);

		err = mac_group_addmac(map->ma_group, mac_addr);

		/*
		 * A failure hints at a hardware error. The MAC layer needs
		 * an error notification facility to handle this. For now,
		 * simply try to restore the old value.
		 */
		if (err != 0)
			(void) mac_group_addmac(map->ma_group, map->ma_addr);

		break;
	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
		/*
		 * Nothing more to do if in promiscuous mode.
		 */
		break;
	default:
		ASSERT(B_FALSE);
	}

	/*
	 * Successfully replaced the MAC address.
	 */
	if (err == 0)
		bcopy(mac_addr, map->ma_addr, map->ma_len);

	return (err);
}
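
/*
 * Illustrative sketch (hypothetical caller, not compiled): changing an
 * address value under the perimeter. mac_update_macaddr() performs the
 * remmac/addmac swap shown above, and restores the old value if the new
 * one cannot be programmed into the hardware:
 *
 *	i_mac_perim_enter(mip);
 *	if (mac_find_macaddr(mip, new_addr) == NULL)
 *		err = mac_update_macaddr(map, new_addr);
 *	i_mac_perim_exit(mip);
 */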
/*
 * Freshen the MAC address with a new value. The caller must have updated
 * the hardware MAC address before calling this function.
 * This function is meant to handle the MAC address change notification
 * from underlying drivers.
 */
void
mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
{
	mac_impl_t *mip = map->ma_mip;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);

	/*
	 * Freshen the MAC address with the new value.
	 */
	bcopy(mac_addr, map->ma_addr, map->ma_len);
	bcopy(mac_addr, mip->mi_addr, map->ma_len);

	/*
	 * Update all MAC clients that share this MAC address.
	 */
	mac_unicast_update_clients(mip, map);
}

/*
 * Set up the primary MAC address.
 */
void
mac_init_macaddr(mac_impl_t *mip)
{
	mac_address_t *map;

	/*
	 * The reference count is initialized to zero, until it's really
	 * activated.
	 */
	map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
	map->ma_len = mip->mi_type->mt_addr_length;
	bcopy(mip->mi_addr, map->ma_addr, map->ma_len);

	/*
	 * If the driver advertises the RINGS capability, it shouldn't have
	 * initialized its primary MAC address. For other drivers, including
	 * VNIC, the primary address must work after registration.
	 */
	if (mip->mi_rx_groups == NULL)
		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;

	/*
	 * The primary MAC address is reserved for the default group
	 * according to the current design.
	 */
	map->ma_group = mip->mi_rx_groups;
	map->ma_mip = mip;

	mip->mi_addresses = map;
}

/*
 * Clean up the primary MAC address. Note, only one primary MAC address
 * is allowed. All other MAC addresses must have been freed appropriately.
 */
void
mac_fini_macaddr(mac_impl_t *mip)
{
	mac_address_t *map = mip->mi_addresses;

	if (map == NULL)
		return;

	/*
	 * If mi_addresses is initialized, there should be exactly one
	 * entry left on the list with no users.
	 */
	ASSERT(map->ma_nusers == 0);
	ASSERT(map->ma_next == NULL);

	kmem_free(map, sizeof (mac_address_t));
	mip->mi_addresses = NULL;
}
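
/*
 * Illustrative lifecycle of the primary address entry (hypothetical
 * pseudocode; the registration call sites are assumed, not shown here):
 *
 *	registration    -> mac_init_macaddr()    ma_nusers == 0
 *	client add      -> mac_add_macaddr()     ma_nusers == 1
 *	client remove   -> mac_remove_macaddr()  ma_nusers == 0
 *	unregistration  -> mac_fini_macaddr()    entry freed
 *
 * The pairing of init/fini with add/remove is what the asserts in
 * mac_fini_macaddr() rely on.
 */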
/*
 * Logging related functions.
 */

/* Write the Flow description to the log file */
int
mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
{
	flow_desc_t *fdesc;
	mac_resource_props_t *mrp;
	net_desc_t ndesc;

	bzero(&ndesc, sizeof (net_desc_t));

	/*
	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
	 * Updates to the fe_flow_desc are done under the fe_lock.
	 */
	mutex_enter(&flent->fe_lock);
	fdesc = &flent->fe_flow_desc;
	mrp = &flent->fe_resource_props;

	ndesc.nd_name = flent->fe_flow_name;
	ndesc.nd_devname = mcip->mci_name;
	bcopy(fdesc->fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
	bcopy(fdesc->fd_dst_mac, ndesc.nd_edest, ETHERADDRL);
	ndesc.nd_sap = htonl(fdesc->fd_sap);
	ndesc.nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
	ndesc.nd_bw_limit = mrp->mrp_maxbw;
	if (ndesc.nd_isv4) {
		ndesc.nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
		ndesc.nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
	} else {
		bcopy(&fdesc->fd_local_addr, ndesc.nd_saddr, IPV6_ADDR_LEN);
		bcopy(&fdesc->fd_remote_addr, ndesc.nd_daddr, IPV6_ADDR_LEN);
	}
	ndesc.nd_sport = htons(fdesc->fd_local_port);
	ndesc.nd_dport = htons(fdesc->fd_remote_port);
	ndesc.nd_protocol = (uint8_t)fdesc->fd_protocol;
	mutex_exit(&flent->fe_lock);

	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_FLDESC_REC));
}

/* Write the Flow statistics to the log file */
int
mac_write_flow_stats(flow_entry_t *flent)
{
	flow_stats_t *fl_stats;
	net_stat_t nstat;

	fl_stats = &flent->fe_flowstats;
	nstat.ns_name = flent->fe_flow_name;
	nstat.ns_ibytes = fl_stats->fs_rbytes;
	nstat.ns_obytes = fl_stats->fs_obytes;
	nstat.ns_ipackets = fl_stats->fs_ipackets;
	nstat.ns_opackets = fl_stats->fs_opackets;
	nstat.ns_ierrors = fl_stats->fs_ierrors;
	nstat.ns_oerrors = fl_stats->fs_oerrors;

	return (exacct_commit_netinfo((void *)&nstat, EX_NET_FLSTAT_REC));
}

/* Write the Link Description to the log file */
int
mac_write_link_desc(mac_client_impl_t *mcip)
{
	net_desc_t ndesc;
	flow_entry_t *flent = mcip->mci_flent;

	bzero(&ndesc, sizeof (net_desc_t));

	ndesc.nd_name = mcip->mci_name;
	ndesc.nd_devname = mcip->mci_name;
	ndesc.nd_isv4 = B_TRUE;
	/*
	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
	 * Updates to the fe_flow_desc are done under the fe_lock
	 * after removing the flent from the flow table.
	 */
	mutex_enter(&flent->fe_lock);
	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
	mutex_exit(&flent->fe_lock);

	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_LNDESC_REC));
}

/* Write the Link statistics to the log file */
int
mac_write_link_stats(mac_client_impl_t *mcip)
{
	net_stat_t nstat;

	nstat.ns_name = mcip->mci_name;
	nstat.ns_ibytes = mcip->mci_stat_ibytes;
	nstat.ns_obytes = mcip->mci_stat_obytes;
	nstat.ns_ipackets = mcip->mci_stat_ipackets;
	nstat.ns_opackets = mcip->mci_stat_opackets;
	nstat.ns_ierrors = mcip->mci_stat_ierrors;
	nstat.ns_oerrors = mcip->mci_stat_oerrors;

	return (exacct_commit_netinfo((void *)&nstat, EX_NET_LNSTAT_REC));
}
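
/*
 * The writers above share one locking pattern: take fe_lock, copy the
 * fields of interest into a local record, drop the lock, and only then
 * commit the record. A minimal sketch of the same snapshot pattern for
 * some hypothetical consumer (illustrative only, use_snapshot() is made
 * up):
 *
 *	mutex_enter(&flent->fe_lock);
 *	local_copy = flent->fe_flow_desc.fd_sap;
 *	mutex_exit(&flent->fe_lock);
 *	use_snapshot(local_copy);	(no lock held across the call)
 */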
/*
 * For a given flow, if the description has not been logged before, do it
 * now. If it is a VNIC, then we have collected information about it from
 * the MAC table, so skip it.
 */
/*ARGSUSED*/
static int
mac_log_flowinfo(flow_entry_t *flent, void *args)
{
	mac_client_impl_t *mcip = flent->fe_mcip;

	if (mcip == NULL)
		return (0);

	/*
	 * If the name starts with "vnic", and fe_user_generated is true
	 * (to exclude the mcast and active flow entries created implicitly
	 * for a vnic), it is a VNIC flow. E.g. vnic1 is a vnic flow,
	 * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
	 */
	if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
	    (flent->fe_type & FLOW_USER) != 0) {
		return (0);
	}

	if (!flent->fe_desc_logged) {
		/*
		 * We don't return an error because we want to continue the
		 * walk in case this is the last walk, which means we
		 * need to reset fe_desc_logged in all the flows.
		 */
		if (mac_write_flow_desc(flent, mcip) != 0)
			return (0);
		flent->fe_desc_logged = B_TRUE;
	}

	/*
	 * Regardless of the error, we want to proceed in case we have to
	 * reset fe_desc_logged.
	 */
	(void) mac_write_flow_stats(flent);

	if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED))
		flent->fe_desc_logged = B_FALSE;

	return (0);
}

typedef struct i_mac_log_state_s {
	boolean_t mi_last;
	int mi_fenable;
	int mi_lenable;
} i_mac_log_state_t;
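
/*
 * i_mac_log_state_t is passed as the private argument to mod_hash_walk().
 * A minimal sketch of how a walk is driven (it mirrors mac_log_linkinfo()
 * below; illustrative only):
 *
 *	i_mac_log_state_t lstate;
 *
 *	lstate.mi_fenable = mac_flow_log_enable;
 *	lstate.mi_lenable = mac_link_log_enable;
 *	lstate.mi_last = B_FALSE;
 *	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
 *
 * The walker returns MH_WALK_CONTINUE to visit the next mac_impl_t and
 * MH_WALK_TERMINATE to stop the walk early.
 */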
/*
 * Walk the mac_impl_ts and log the description for each mac client of
 * this mac, if it hasn't already been done. Additionally, log statistics
 * for the link as well. Walk the flow table and log information for each
 * flow as well. If it is the last walk (mi_last), then we turn off
 * MCIS_DESC_LOGGED (and also fe_desc_logged, if flow logging is on) since
 * we want to log the description if and when logging is restarted.
 */
/*ARGSUSED*/
static uint_t
i_mac_log_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
{
	mac_impl_t *mip = (mac_impl_t *)val;
	i_mac_log_state_t *lstate = (i_mac_log_state_t *)arg;
	int ret;
	mac_client_impl_t *mcip;

	/*
	 * Only walk the client list for NIC and etherstub.
	 */
	if ((mip->mi_state_flags & MIS_DISABLED) ||
	    ((mip->mi_state_flags & MIS_IS_VNIC) &&
	    (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL)))
		return (MH_WALK_CONTINUE);

	for (mcip = mip->mi_clients_list; mcip != NULL;
	    mcip = mcip->mci_client_next) {
		if (!MCIP_DATAPATH_SETUP(mcip))
			continue;
		if (lstate->mi_lenable) {
			if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
				ret = mac_write_link_desc(mcip);
				if (ret != 0) {
					/*
					 * We can't terminate it if this is
					 * the last walk, else there might be
					 * some links with MCIS_DESC_LOGGED
					 * still set, which means their
					 * description won't be logged the
					 * next time logging is started
					 * (similarly for the flows within
					 * such links). We can continue
					 * without walking the flow table
					 * (i.e. to set fe_desc_logged to
					 * false) because we won't have
					 * written any flow stuff for this
					 * link as we haven't logged the
					 * link itself.
					 */
					if (lstate->mi_last)
						return (MH_WALK_CONTINUE);
					else
						return (MH_WALK_TERMINATE);
				}
				mcip->mci_state_flags |= MCIS_DESC_LOGGED;
			}
		}

		if (mac_write_link_stats(mcip) != 0 && !lstate->mi_last)
			return (MH_WALK_TERMINATE);

		if (lstate->mi_last)
			mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;

		if (lstate->mi_fenable) {
			if (mcip->mci_subflow_tab != NULL) {
				(void) mac_flow_walk(mcip->mci_subflow_tab,
				    mac_log_flowinfo, mip);
			}
		}
	}
	return (MH_WALK_CONTINUE);
}

/*
 * The timer thread that runs every mac_logging_interval seconds and logs
 * link and/or flow information.
 */
/* ARGSUSED */
void
mac_log_linkinfo(void *arg)
{
	i_mac_log_state_t lstate;

	rw_enter(&i_mac_impl_lock, RW_READER);
	if (!mac_flow_log_enable && !mac_link_log_enable) {
		rw_exit(&i_mac_impl_lock);
		return;
	}
	lstate.mi_fenable = mac_flow_log_enable;
	lstate.mi_lenable = mac_link_log_enable;
	lstate.mi_last = B_FALSE;
	rw_exit(&i_mac_impl_lock);

	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);

	rw_enter(&i_mac_impl_lock, RW_WRITER);
	if (mac_flow_log_enable || mac_link_log_enable) {
		mac_logging_timer = timeout(mac_log_linkinfo, NULL,
		    SEC_TO_TICK(mac_logging_interval));
	}
	rw_exit(&i_mac_impl_lock);
}

/*
 * Start the logging timer.
 */
void
mac_start_logusage(mac_logtype_t type, uint_t interval)
{
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	switch (type) {
	case MAC_LOGTYPE_FLOW:
		if (mac_flow_log_enable) {
			rw_exit(&i_mac_impl_lock);
			return;
		}
		mac_flow_log_enable = B_TRUE;
		/* FALLTHRU */
	case MAC_LOGTYPE_LINK:
		if (mac_link_log_enable) {
			rw_exit(&i_mac_impl_lock);
			return;
		}
		mac_link_log_enable = B_TRUE;
		break;
	default:
		ASSERT(0);
	}
	mac_logging_interval = interval;
	rw_exit(&i_mac_impl_lock);
	mac_log_linkinfo(NULL);
}

/*
 * Stop the logging timer if both link and flow logging are turned off.
 */
void
mac_stop_logusage(mac_logtype_t type)
{
	i_mac_log_state_t lstate;

	rw_enter(&i_mac_impl_lock, RW_WRITER);
	lstate.mi_fenable = mac_flow_log_enable;
	lstate.mi_lenable = mac_link_log_enable;

	/* Last walk */
	lstate.mi_last = B_TRUE;

	switch (type) {
	case MAC_LOGTYPE_FLOW:
		if (lstate.mi_fenable) {
			ASSERT(mac_link_log_enable);
			mac_flow_log_enable = B_FALSE;
			mac_link_log_enable = B_FALSE;
			break;
		}
		/* FALLTHRU */
	case MAC_LOGTYPE_LINK:
		if (!lstate.mi_lenable || mac_flow_log_enable) {
			rw_exit(&i_mac_impl_lock);
			return;
		}
		mac_link_log_enable = B_FALSE;
		break;
	default:
		ASSERT(0);
	}
	rw_exit(&i_mac_impl_lock);
	(void) untimeout(mac_logging_timer);
	mac_logging_timer = 0;

	/* Last walk */
	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
}
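
/*
 * Illustrative usage (the interval value is hypothetical): link logging
 * is started with an interval in seconds, and mac_log_linkinfo() then
 * re-arms itself via timeout() until logging is stopped:
 *
 *	mac_start_logusage(MAC_LOGTYPE_LINK, 20);
 *	...
 *	mac_stop_logusage(MAC_LOGTYPE_LINK);
 *
 * Stopping triggers one final walk with mi_last set, so that the
 * MCIS_DESC_LOGGED and fe_desc_logged markers are reset for the next
 * logging session.
 */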
/*
 * Walk the RX and TX SRSes/soft rings of a flow and update their priority
 * value.
 */
void
mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
{
	pri_t pri;
	int count;
	mac_soft_ring_set_t *mac_srs;

	if (flent->fe_rx_srs_cnt <= 0)
		return;

	if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
	    SRST_FLOW) {
		pri = FLOW_PRIORITY(mcip->mci_min_pri,
		    mcip->mci_max_pri,
		    flent->fe_resource_props.mrp_priority);
	} else {
		pri = mcip->mci_max_pri;
	}

	for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
		mac_srs = flent->fe_rx_srs[count];
		mac_update_srs_priority(mac_srs, pri);
	}
	/*
	 * If we have a Tx SRS, we need to modify all the threads associated
	 * with it.
	 */
	if (flent->fe_tx_srs != NULL)
		mac_update_srs_priority(flent->fe_tx_srs, pri);
}

/*
 * RX and TX rings are reserved according to different semantics depending
 * on the requests from the MAC clients and the type of rings:
 *
 * On the TX side, by default we reserve individual rings, independently
 * from the groups.
 *
 * On the RX side, the reservation is at the granularity of the group
 * of rings, and is used for v12n level 1 only. It has a special case for
 * the primary client.
 *
 * If a share is allocated to a MAC client, we allocate a TX group and an
 * RX group to the client, and assign TX rings and RX rings to these
 * groups according to information gathered from the driver through
 * the share capability.
 *
 * The foreseeable evolution of RX rings will handle v12n level 2 and
 * higher to allocate individual rings out of a group and program the hw
 * classifier based on IP address or higher level criteria.
 */
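
/*
 * Illustrative summary of the two reservation styles described above
 * (hypothetical pseudocode, not compiled):
 *
 *	TX:	ring = mac_reserve_tx_ring(mip, NULL);		per-ring
 *	RX:	grp = mac_reserve_rx_group(mcip, addr,
 *		    MAC_RX_RESERVE_NONDEFAULT);			per-group
 *
 * With a share, both sides are group based: mac_reserve_tx_group() and
 * mac_reserve_rx_group() populate their groups with rings selected via
 * the driver's ms_squery() share capability entry point.
 */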
/*
 * mac_reserve_tx_ring()
 * Reserve an unused ring by marking it with the MR_INUSE state.
 * As reserved, the ring is ready to function.
 *
 * Notes for Hybrid I/O:
 *
 * If a specific ring is needed, it is specified through the desired_ring
 * argument. Otherwise that argument is set to NULL.
 * If the desired ring was previously allocated to another client, this
 * function swaps it with a new ring from the group of unassigned rings.
 */
mac_ring_t *
mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
{
	mac_group_t *group;
	mac_ring_t *ring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	if (mip->mi_tx_groups == NULL)
		return (NULL);

	/*
	 * Find an available ring and start it before changing its status.
	 * The unassigned rings are at the end of the mi_tx_groups
	 * array.
	 */
	group = mip->mi_tx_groups + mip->mi_tx_group_count;

	for (ring = group->mrg_rings; ring != NULL;
	    ring = ring->mr_next) {
		if (desired_ring == NULL) {
			if (ring->mr_state == MR_FREE)
				/* wanted any free ring and found one */
				break;
		} else {
			mac_ring_t *sring;
			mac_client_impl_t *client;
			mac_soft_ring_set_t *srs;

			if (ring != desired_ring)
				/* wants a desired ring but this isn't it */
				continue;

			if (ring->mr_state == MR_FREE)
				break;

			/*
			 * Found the desired ring but it's already in use.
			 * Swap it with a new ring.
			 */

			/* find the client which owns that ring */
			for (client = mip->mi_clients_list; client != NULL;
			    client = client->mci_client_next) {
				srs = MCIP_TX_SRS(client);
				if (srs != NULL && mac_tx_srs_ring_present(srs,
				    desired_ring)) {
					/* found our ring */
					break;
				}
			}
			if (client == NULL) {
				/*
				 * The TX ring is in use, but it's not
				 * associated with any clients, so it
				 * has to be the default ring. In that
				 * case we can simply assign a new ring
				 * as the default ring, and we're done.
				 */
				ASSERT(mip->mi_default_tx_ring ==
				    (mac_ring_handle_t)desired_ring);

				/*
				 * Quiesce all clients on top of
				 * the NIC to make sure there are no
				 * pending threads still relying on
				 * that default ring, for example
				 * the multicast path.
				 */
				for (client = mip->mi_clients_list;
				    client != NULL;
				    client = client->mci_client_next) {
					mac_tx_client_quiesce(client,
					    SRS_QUIESCE);
				}

				mip->mi_default_tx_ring = (mac_ring_handle_t)
				    mac_reserve_tx_ring(mip, NULL);

				/* resume the clients */
				for (client = mip->mi_clients_list;
				    client != NULL;
				    client = client->mci_client_next)
					mac_tx_client_restart(client);

				break;
			}

			/*
			 * Note that we cannot simply invoke the group
			 * add/rem routines since the client doesn't have a
			 * TX group. So we need to instead add/remove
			 * the rings from the SRS.
			 */
			ASSERT(client->mci_share == NULL);

			/* first quiesce the client */
			mac_tx_client_quiesce(client, SRS_QUIESCE);

			/*
			 * Give a new ring to the client. If no other ring
			 * is available on this MAC instance, the client
			 * will fall back to the shared TX ring.
			 */
			sring = mac_reserve_tx_ring(mip, NULL);
			if (sring != NULL)
				mac_tx_srs_add_ring(srs, sring);

			/* ... in exchange for our desired ring */
			mac_tx_srs_del_ring(srs, desired_ring);

			/* restart the client */
			mac_tx_client_restart(client);

			if (mip->mi_default_tx_ring ==
			    (mac_ring_handle_t)desired_ring) {
				/*
				 * The desired ring is the default ring,
				 * and there are one or more clients
				 * using that default ring directly.
				 */
				mip->mi_default_tx_ring =
				    (mac_ring_handle_t)sring;
				/*
				 * Find clients using the default ring and
				 * swap it with the new default ring.
				 */
				for (client = mip->mi_clients_list;
				    client != NULL;
				    client = client->mci_client_next) {
					srs = MCIP_TX_SRS(client);
					if (srs != NULL &&
					    mac_tx_srs_ring_present(srs,
					    desired_ring)) {
						/* quiesce the client first */
						mac_tx_client_quiesce(client,
						    SRS_QUIESCE);

						/*
						 * Give it the new default
						 * ring, and remove the old
						 * one.
						 */
						if (sring != NULL) {
							mac_tx_srs_add_ring(
							    srs, sring);
						}
						mac_tx_srs_del_ring(srs,
						    desired_ring);

						/* restart the client */
						mac_tx_client_restart(client);
					}
				}
			}
			break;
		}
	}

	if (ring != NULL) {
		if (mac_start_ring(ring) != 0)
			return (NULL);
		ring->mr_state = MR_INUSE;
	}

	return (ring);
}
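
/*
 * Illustrative sketch of the swap protocol used above when taking a ring
 * away from a client (hypothetical caller, not compiled):
 *
 *	mac_tx_client_quiesce(client, SRS_QUIESCE);
 *	sring = mac_reserve_tx_ring(mip, NULL);	(replacement, may be NULL)
 *	if (sring != NULL)
 *		mac_tx_srs_add_ring(srs, sring);
 *	mac_tx_srs_del_ring(srs, desired_ring);
 *	mac_tx_client_restart(client);
 *
 * The quiesce/restart bracket guarantees that no send thread is using the
 * SRS ring set while it is being modified.
 */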
/*
 * Minimum number of rings to leave in the default RX group when allocating
 * rings to new clients.
 */
static uint_t mac_min_rx_default_rings = 1;

/*
 * Populate a zero-ring group with rings. If the share is non-NULL,
 * the rings are chosen according to that share.
 * Invoked after allocating a new RX or TX group through
 * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
 * Returns zero on success, an errno otherwise.
 */
int
i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
    mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share)
{
	mac_ring_t **rings, *tmp_ring[1], *ring;
	uint_t nrings;
	int rv, i, j;

	ASSERT(mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC &&
	    mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
	ASSERT(new_group->mrg_cur_count == 0);

	/*
	 * First find the rings to allocate to the group.
	 */
	if (share != NULL) {
		/* get rings through ms_squery() */
		mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
		ASSERT(nrings != 0);
		rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
		    KM_SLEEP);
		mip->mi_share_capab.ms_squery(share, ring_type,
		    (mac_ring_handle_t *)rings, &nrings);
	} else {
		/*
		 * For TX, this function is only invoked with a share,
		 * so a NULL share implies RX.
		 */
		ASSERT(ring_type == MAC_RING_TYPE_RX);
		/*
		 * Pick one ring from the default group.
		 *
		 * For now pick the second ring, which requires the first
		 * ring at index 0 to stay in the default group, since it
		 * is the ring which carries the multicast traffic.
		 * We need a better way for a driver to indicate this,
		 * for example a per-ring flag.
		 */
		for (ring = src_group->mrg_rings; ring != NULL;
		    ring = ring->mr_next) {
			if (ring->mr_index != 0)
				break;
		}
		ASSERT(ring != NULL);
		nrings = 1;
		tmp_ring[0] = ring;
		rings = tmp_ring;
	}

	switch (ring_type) {
	case MAC_RING_TYPE_RX:
		if (src_group->mrg_cur_count - nrings <
		    mac_min_rx_default_rings) {
			/* we ran out of rings */
			if (share != NULL) {
				/* don't leak the ms_squery() rings array */
				kmem_free(rings,
				    nrings * sizeof (mac_ring_handle_t));
			}
			return (ENOSPC);
		}

		/* move receive rings to new group */
		for (i = 0; i < nrings; i++) {
			rv = mac_group_mov_ring(mip, new_group, rings[i]);
			if (rv != 0) {
				/* move rings back on failure */
				for (j = 0; j < i; j++) {
					(void) mac_group_mov_ring(mip,
					    src_group, rings[j]);
				}
				if (share != NULL) {
					kmem_free(rings, nrings *
					    sizeof (mac_ring_handle_t));
				}
				return (rv);
			}
		}
		break;

	case MAC_RING_TYPE_TX: {
		mac_ring_t *tmp_ring;

		/* move the TX rings to the new group */
		ASSERT(src_group == NULL);
		for (i = 0; i < nrings; i++) {
			/* get the desired ring */
			tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
			ASSERT(tmp_ring == rings[i]);
			rv = mac_group_mov_ring(mip, new_group, rings[i]);
			if (rv != 0) {
				/* cleanup on failure */
				for (j = 0; j < i; j++) {
					(void) mac_group_mov_ring(mip,
					    mip->mi_tx_groups +
					    mip->mi_tx_group_count, rings[j]);
				}
			}
		}
		break;
	}
	}

	if (share != NULL) {
		/* add group to share */
		mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
		/* free temporary array of rings */
		kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
	}

	return (0);
}

void
mac_rx_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
{
	mac_grp_client_t *mgcp;

	for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
		if (mgcp->mgc_client == mcip)
			break;
	}
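/*
 * Illustrative call into i_mac_group_allocate_rings() (it mirrors
 * mac_reserve_rx_group() below): populate a freshly reserved, empty RX
 * group with rings taken from the default group at index 0:
 *
 *	err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
 *	    &mip->mi_rx_groups[0], grp, share);
 *	if (err != 0)
 *		mac_stop_group(grp);
 */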
	VERIFY(mgcp == NULL);

	mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
	mgcp->mgc_client = mcip;
	mgcp->mgc_next = grp->mrg_clients;
	grp->mrg_clients = mgcp;
}

void
mac_rx_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
{
	mac_grp_client_t *mgcp, **pprev;

	for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
	    pprev = &mgcp->mgc_next, mgcp = *pprev) {
		if (mgcp->mgc_client == mcip)
			break;
	}

	ASSERT(mgcp != NULL);

	*pprev = mgcp->mgc_next;
	kmem_free(mgcp, sizeof (mac_grp_client_t));
}

/*
 * mac_reserve_rx_group()
 *
 * Finds an available group and exclusively reserves it for a client.
 * The group is chosen to suit the flow's resource controls (bandwidth and
 * fanout requirements) and the address type.
 * If the requestor is the primary MAC, then return the group with the
 * largest number of rings; otherwise return the default group when
 * available.
 */
mac_group_t *
mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr,
    mac_rx_group_reserve_type_t rtype)
{
	mac_share_handle_t share = mcip->mci_share;
	mac_impl_t *mip = mcip->mci_mip;
	mac_group_t *grp = NULL;
	int i, start, loopcount;
	int err;
	mac_address_t *map;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/* Check if a group already has this MAC address (case of VLANs) */
	if ((map = mac_find_macaddr(mip, mac_addr)) != NULL)
		return (map->ma_group);

	if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0 ||
	    rtype == MAC_RX_NO_RESERVE)
		return (NULL);

	/*
	 * Try to exclusively reserve an RX group.
	 *
	 * For flows that require SW_RING, always go to the default group
	 * (until we can explicitly call out default groups (CR 6695600),
	 * we assume that the default group is always at position zero).
	 *
	 * For flows that require HW_DEFAULT_RING (the unicast flow of the
	 * primary client), try to reserve the default RX group only.
	 *
	 * For flows that require HW_RING (the unicast flow of other
	 * clients), try to reserve a non-default RX group first, then the
	 * default group.
	 */
	switch (rtype) {
	case MAC_RX_RESERVE_DEFAULT:
		start = 0;
		loopcount = 1;
		break;
	case MAC_RX_RESERVE_NONDEFAULT:
		start = 1;
		loopcount = mip->mi_rx_group_count;
		break;
	}

	for (i = start; i < start + loopcount; i++) {
		grp = &mip->mi_rx_groups[i % mip->mi_rx_group_count];

		DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
		    int, grp->mrg_index, mac_group_state_t, grp->mrg_state);

		/*
		 * Check to see whether this mac client is the only client
		 * on this RX group. If not, we cannot exclusively reserve
		 * this RX group.
		 */
		if (!MAC_RX_GROUP_NO_CLIENT(grp) &&
		    (MAC_RX_GROUP_ONLY_CLIENT(grp) != mcip)) {
			continue;
		}

		/*
		 * This group could already be SHARED by other multicast
		 * flows on this client. In that case the group has
		 * already been started.
		 */
		ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);

		if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
		    (mac_start_group(grp) != 0)) {
			continue;
		}

		if ((i % mip->mi_rx_group_count) == 0 ||
		    mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
			break;
		}

		ASSERT(grp->mrg_cur_count == 0);

		/*
		 * Populate the group. Rings should be taken
		 * from the default group at position 0 for now.
		 */

		err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
		    &mip->mi_rx_groups[0], grp, share);
		if (err == 0)
			break;

		DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
		    mip->mi_name, int, grp->mrg_index, int, err);

		/*
		 * It's a dynamic group but the grouping operation failed.
		 */
		mac_stop_group(grp);
	}

	if (i == start + loopcount)
		return (NULL);

	ASSERT(grp != NULL);

	DTRACE_PROBE2(rx__group__reserved,
	    char *, mip->mi_name, int, grp->mrg_index);
	return (grp);
}
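
/*
 * Illustrative pairing (hypothetical caller, not compiled): a client
 * reserves a group for its unicast address, tracks itself on the group,
 * and releases the group once it is the last client gone:
 *
 *	grp = mac_reserve_rx_group(mcip, mac_addr,
 *	    MAC_RX_RESERVE_NONDEFAULT);
 *	if (grp != NULL)
 *		mac_rx_group_add_client(grp, mcip);
 *	...
 *	mac_rx_group_remove_client(grp, mcip);
 *	if (MAC_RX_GROUP_NO_CLIENT(grp))
 *		mac_release_rx_group(mcip, grp);
 */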
/*
 * mac_release_rx_group()
 *
 * This is called when there are no clients left for the group.
 * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
 * and if it is a non-default group, the shares are removed and
 * all rings are assigned back to the default group.
 */
void
mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
{
	mac_impl_t *mip = mcip->mci_mip;
	mac_ring_t *ring;

	ASSERT(group != &mip->mi_rx_groups[0]);

	/*
	 * This is the case where there are no clients left. Any
	 * SRS etc. on this group have also been quiesced.
	 */
	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
		if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
			ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
			/*
			 * Remove the SRS associated with the HW ring.
			 * As a result, polling will be disabled.
			 */
			ring->mr_srs = NULL;
		}
		ASSERT(ring->mr_state == MR_INUSE);
		mac_stop_ring(ring);
		ring->mr_state = MR_FREE;
		ring->mr_flag = 0;
	}

	/* remove group from share */
	if (mcip->mci_share != NULL) {
		mip->mi_share_capab.ms_sremove(mcip->mci_share,
		    group->mrg_driver);
	}

	if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
		/*
		 * Rings were dynamically allocated to the group.
		 * Move the rings back to the default group.
		 */
		while ((ring = group->mrg_rings) != NULL) {
			(void) mac_group_mov_ring(mip,
			    &mip->mi_rx_groups[0], ring);
		}
	}
	mac_stop_group(group);
	/*
	 * Possible improvement: see if we can assign the group just released
	 * to another client of the mip.
	 */
}

/*
 * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
 * when a share was allocated to the client.
 */
mac_group_t *
mac_reserve_tx_group(mac_impl_t *mip, mac_share_handle_t share)
{
	mac_group_t *grp;
	int rv, i;

	/*
	 * TX groups are currently allocated only to MAC clients
	 * which are associated with a share. Since we have a fixed
	 * number of shares and groups, and we already successfully
	 * allocated a share, find an available TX group.
	 */
	ASSERT(share != NULL);
	ASSERT(mip->mi_tx_group_free > 0);

	for (i = 0; i < mip->mi_tx_group_count; i++) {
		grp = &mip->mi_tx_groups[i];

		if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
		    (grp->mrg_state == MAC_GROUP_STATE_UNINIT))
			continue;

		rv = mac_start_group(grp);
		ASSERT(rv == 0);

		grp->mrg_state = MAC_GROUP_STATE_RESERVED;
		break;
	}

	ASSERT(grp != NULL);

	/*
	 * Populate the group. Rings should be taken from the group
	 * of unassigned rings, which is past the array of TX
	 * groups advertised by the driver.
	 */
	rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, NULL,
	    grp, share);
	if (rv != 0) {
		DTRACE_PROBE3(tx__group__reserve__alloc__rings,
		    char *, mip->mi_name, int, grp->mrg_index, int, rv);

		mac_stop_group(grp);
		grp->mrg_state = MAC_GROUP_STATE_UNINIT;

		return (NULL);
	}

	mip->mi_tx_group_free--;

	return (grp);
}
void
mac_release_tx_group(mac_impl_t *mip, mac_group_t *grp)
{
	mac_client_impl_t *mcip = grp->mrg_tx_client;
	mac_share_handle_t share = mcip->mci_share;
	mac_ring_t *ring;

	ASSERT(mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
	ASSERT(share != NULL);
	ASSERT(grp->mrg_state == MAC_GROUP_STATE_RESERVED);

	mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
	while ((ring = grp->mrg_rings) != NULL) {
		/* move the ring back to the pool */
		(void) mac_group_mov_ring(mip, mip->mi_tx_groups +
		    mip->mi_tx_group_count, ring);
	}
	mac_stop_group(grp);
	mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
	grp->mrg_tx_client = NULL;
	mip->mi_tx_group_free++;
}
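
/*
 * Illustrative pairing (hypothetical caller; the assignment of
 * mrg_tx_client is assumed to happen in the TX SRS setup path): a
 * share-backed client gets a dedicated TX group and gives it back when
 * the share is torn down:
 *
 *	grp = mac_reserve_tx_group(mip, share);
 *	if (grp != NULL)
 *		grp->mrg_tx_client = mcip;
 *	...
 *	mac_release_tx_group(mip, grp);
 */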
/*
 * This is a 1-time control path activity initiated by the client (IP).
 * The mac perimeter protects against other simultaneous control activities,
 * for example an ioctl that attempts to change the degree of fanout and
 * increase or decrease the number of softrings associated with this Tx SRS.
 */
static mac_tx_notify_cb_t *
mac_client_tx_notify_add(mac_client_impl_t *mcip,
    mac_tx_notify_t notify, void *arg)
{
	mac_cb_info_t *mcbi;
	mac_tx_notify_cb_t *mtnfp;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
	mtnfp->mtnf_fn = notify;
	mtnfp->mtnf_arg = arg;
	mtnfp->mtnf_link.mcb_objp = mtnfp;
	mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
	mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;

	mcbi = &mcip->mci_tx_notify_cb_info;
	mutex_enter(mcbi->mcbi_lockp);
	mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
	mutex_exit(mcbi->mcbi_lockp);
	return (mtnfp);
}

static void
mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
{
	mac_cb_info_t *mcbi;
	mac_cb_t **cblist;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
	    &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
		cmn_err(CE_WARN,
		    "mac_client_tx_notify_remove: callback not "
		    "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
		return;
	}

	mcbi = &mcip->mci_tx_notify_cb_info;
	cblist = &mcip->mci_tx_notify_cb_list;
	mutex_enter(mcbi->mcbi_lockp);
	if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
		kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
	else
		mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
	mutex_exit(mcbi->mcbi_lockp);
}

/*
 * mac_client_tx_notify():
 * Add or remove a flow-control callback routine.
 */
mac_tx_notify_handle_t
mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
    void *ptr)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_tx_notify_cb_t *mtnfp = NULL;

	i_mac_perim_enter(mcip->mci_mip);

	if (callb_func != NULL) {
		/* Add a notify callback */
		mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
	} else {
		mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
	}
	i_mac_perim_exit(mcip->mci_mip);

	return ((mac_tx_notify_handle_t)mtnfp);
}
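
/*
 * Illustrative usage of mac_client_tx_notify() (hypothetical client code):
 * registering passes a callback and returns a handle; removal passes a
 * NULL callback and the previously returned handle as 'ptr':
 *
 *	handle = mac_client_tx_notify(mch, my_tx_notify_cb, my_arg);
 *	...
 *	(void) mac_client_tx_notify(mch, NULL, (void *)handle);
 */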