xref: /titanic_52/usr/src/uts/common/io/mac/mac_sched.c (revision 8ad9a34f3f25cf12dcaeb58068e149313836bb7a)
1da14cebeSEric Cheng /*
2da14cebeSEric Cheng  * CDDL HEADER START
3da14cebeSEric Cheng  *
4da14cebeSEric Cheng  * The contents of this file are subject to the terms of the
5da14cebeSEric Cheng  * Common Development and Distribution License (the "License").
6da14cebeSEric Cheng  * You may not use this file except in compliance with the License.
7da14cebeSEric Cheng  *
8da14cebeSEric Cheng  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9da14cebeSEric Cheng  * or http://www.opensolaris.org/os/licensing.
10da14cebeSEric Cheng  * See the License for the specific language governing permissions
11da14cebeSEric Cheng  * and limitations under the License.
12da14cebeSEric Cheng  *
13da14cebeSEric Cheng  * When distributing Covered Code, include this CDDL HEADER in each
14da14cebeSEric Cheng  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15da14cebeSEric Cheng  * If applicable, add the following below this CDDL HEADER, with the
16da14cebeSEric Cheng  * fields enclosed by brackets "[]" replaced with your own identifying
17da14cebeSEric Cheng  * information: Portions Copyright [yyyy] [name of copyright owner]
18da14cebeSEric Cheng  *
19da14cebeSEric Cheng  * CDDL HEADER END
20da14cebeSEric Cheng  */
21da14cebeSEric Cheng /*
229820c710SBaban Kenkre  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23da14cebeSEric Cheng  * Use is subject to license terms.
24*8ad9a34fSRyan Zezeski  * Copyright 2017 Joyent, Inc.
253cc3202eSDan McDonald  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
26e2ea9c96SRobert Mustacchi  */
27da14cebeSEric Cheng 
28bc44a933SRobert Mustacchi /*
29bc44a933SRobert Mustacchi  * MAC data path
30bc44a933SRobert Mustacchi  *
31bc44a933SRobert Mustacchi  * The MAC data path is concerned with the flow of traffic from mac clients --
32bc44a933SRobert Mustacchi  * DLS, IP, etc. -- to various GLDv3 device drivers -- e1000g, vnic, aggr,
33bc44a933SRobert Mustacchi  * ixgbe, etc. -- and from the GLDv3 device drivers back to clients.
34bc44a933SRobert Mustacchi  *
35bc44a933SRobert Mustacchi  * -----------
36bc44a933SRobert Mustacchi  * Terminology
37bc44a933SRobert Mustacchi  * -----------
38bc44a933SRobert Mustacchi  *
39bc44a933SRobert Mustacchi  * MAC uses a lot of different, but related terms that are associated with the
40bc44a933SRobert Mustacchi  * design and structure of the data path. Before we cover other aspects, first
41bc44a933SRobert Mustacchi  * let's review the terminology that MAC uses.
42bc44a933SRobert Mustacchi  *
43bc44a933SRobert Mustacchi  * MAC
44bc44a933SRobert Mustacchi  *
45bc44a933SRobert Mustacchi  * 	This driver. It interfaces with device drivers and provides abstractions
46bc44a933SRobert Mustacchi  * 	that the rest of the system consumes. All data links -- things managed
 47bc44a933SRobert Mustacchi  * 	with dladm(1M) -- are accessed through MAC.
48bc44a933SRobert Mustacchi  *
49bc44a933SRobert Mustacchi  * GLDv3 DEVICE DRIVER
50bc44a933SRobert Mustacchi  *
51bc44a933SRobert Mustacchi  * 	A GLDv3 device driver refers to a driver, both for pseudo-devices and
52bc44a933SRobert Mustacchi  * 	real devices, which implement the GLDv3 driver API. Common examples of
53bc44a933SRobert Mustacchi  * 	these are igb and ixgbe, which are drivers for various Intel networking
54bc44a933SRobert Mustacchi  * 	cards. These devices may or may not have various features, such as
55bc44a933SRobert Mustacchi  * 	hardware rings and checksum offloading. For MAC, a GLDv3 device is the
56bc44a933SRobert Mustacchi  * 	final point for the transmission of a packet and the starting point for
57bc44a933SRobert Mustacchi  * 	the receipt of a packet.
58bc44a933SRobert Mustacchi  *
59bc44a933SRobert Mustacchi  * FLOWS
60bc44a933SRobert Mustacchi  *
61bc44a933SRobert Mustacchi  * 	At a high level, a flow refers to a series of packets that are related.
62bc44a933SRobert Mustacchi  * 	Often times the term is used in the context of TCP to indicate a unique
63bc44a933SRobert Mustacchi  * 	TCP connection and the traffic over it. However, a flow can exist at
64bc44a933SRobert Mustacchi  * 	other levels of the system as well. MAC has a notion of a default flow
65bc44a933SRobert Mustacchi  * 	which is used for all unicast traffic addressed to the address of a MAC
66bc44a933SRobert Mustacchi  * 	device. For example, when a VNIC is created, a default flow is created
67bc44a933SRobert Mustacchi  * 	for the VNIC's MAC address. In addition, flows are created for broadcast
68bc44a933SRobert Mustacchi  * 	groups and a user may create a flow with flowadm(1M).
69bc44a933SRobert Mustacchi  *
70bc44a933SRobert Mustacchi  * CLASSIFICATION
71bc44a933SRobert Mustacchi  *
72bc44a933SRobert Mustacchi  * 	Classification refers to the notion of identifying an incoming frame
73bc44a933SRobert Mustacchi  * 	based on its destination address and optionally its source addresses and
74bc44a933SRobert Mustacchi  * 	doing different processing based on that information. Classification can
75bc44a933SRobert Mustacchi  * 	be done in both hardware and software. In general, we usually only
 76bc44a933SRobert Mustacchi  * 	classify based on the layer two destination, e.g., for Ethernet, the
77bc44a933SRobert Mustacchi  * 	destination MAC address.
78bc44a933SRobert Mustacchi  *
79bc44a933SRobert Mustacchi  * 	The system also will do classification based on layer three and layer
80bc44a933SRobert Mustacchi  * 	four properties. This is used to support things like flowadm(1M), which
81bc44a933SRobert Mustacchi  * 	allows setting QoS and other properties on a per-flow basis.
82bc44a933SRobert Mustacchi  *
83bc44a933SRobert Mustacchi  * RING
84bc44a933SRobert Mustacchi  *
85bc44a933SRobert Mustacchi  * 	Conceptually, a ring represents a series of framed messages, often in a
86bc44a933SRobert Mustacchi  * 	contiguous chunk of memory that acts as a circular buffer. Rings come in
87bc44a933SRobert Mustacchi  * 	a couple of forms. Generally they are either a hardware construct (hw
88bc44a933SRobert Mustacchi  * 	ring) or they are a software construct (sw ring) maintained by MAC.
89bc44a933SRobert Mustacchi  *
90bc44a933SRobert Mustacchi  * HW RING
91bc44a933SRobert Mustacchi  *
92bc44a933SRobert Mustacchi  * 	A hardware ring is a set of resources provided by a GLDv3 device driver
93bc44a933SRobert Mustacchi  * 	(even if it is a pseudo-device). A hardware ring comes in two different
94bc44a933SRobert Mustacchi  * 	forms: receive (rx) rings and transmit (tx) rings. An rx hw ring is
95bc44a933SRobert Mustacchi  * 	something that has a unique DMA (direct memory access) region and
96bc44a933SRobert Mustacchi  * 	generally supports some form of classification (though it isn't always
97bc44a933SRobert Mustacchi  * 	used), as well as a means of generating an interrupt specific to that
98bc44a933SRobert Mustacchi  * 	ring. For example, the device may generate a specific MSI-X for a PCI
99bc44a933SRobert Mustacchi  * 	express device. A tx ring is similar, except that it is dedicated to
100bc44a933SRobert Mustacchi  * 	transmission. It may also be a vector for enabling features such as VLAN
101bc44a933SRobert Mustacchi  * 	tagging and large transmit offloading. It usually has its own dedicated
102bc44a933SRobert Mustacchi  * 	interrupts for transmit being completed.
103bc44a933SRobert Mustacchi  *
104bc44a933SRobert Mustacchi  * SW RING
105bc44a933SRobert Mustacchi  *
106bc44a933SRobert Mustacchi  * 	A software ring is a construction of MAC. It represents the same thing
107bc44a933SRobert Mustacchi  * 	that a hardware ring generally does, a collection of frames. However,
108bc44a933SRobert Mustacchi  * 	instead of being in a contiguous ring of memory, they're instead linked
109bc44a933SRobert Mustacchi  * 	by using the mblk_t's b_next pointer. Each frame may itself be multiple
110bc44a933SRobert Mustacchi  * 	mblk_t's linked together by the b_cont pointer. A software ring always
111bc44a933SRobert Mustacchi  * 	represents a collection of classified packets; however, it varies as to
112bc44a933SRobert Mustacchi  * 	whether it uses only layer two information, or a combination of that and
113bc44a933SRobert Mustacchi  * 	additional layer three and layer four data.
114bc44a933SRobert Mustacchi  *
115bc44a933SRobert Mustacchi  * FANOUT
116bc44a933SRobert Mustacchi  *
117bc44a933SRobert Mustacchi  * 	Fanout is the idea of spreading out the load of processing frames based
118bc44a933SRobert Mustacchi  * 	on the source and destination information contained in the layer two,
119bc44a933SRobert Mustacchi  * 	three, and four headers, such that the data can then be processed in
120bc44a933SRobert Mustacchi  * 	parallel using multiple hardware threads.
121bc44a933SRobert Mustacchi  *
122bc44a933SRobert Mustacchi  * 	A fanout algorithm hashes the headers and uses that to place different
123bc44a933SRobert Mustacchi  * 	flows into a bucket. The most important thing is that packets that are
124bc44a933SRobert Mustacchi  * 	in the same flow end up in the same bucket. If they do not, performance
125bc44a933SRobert Mustacchi  * 	can be adversely affected. Consider the case of TCP.  TCP severely
126bc44a933SRobert Mustacchi  * 	penalizes a connection if the data arrives out of order. If a given flow
127bc44a933SRobert Mustacchi  * 	is processed on different CPUs, then the data will appear out of order,
128bc44a933SRobert Mustacchi  * 	hence the invariant that fanout always hash a given flow to the same
129bc44a933SRobert Mustacchi  * 	bucket and thus get processed on the same CPU.
130bc44a933SRobert Mustacchi  *
131bc44a933SRobert Mustacchi  * RECEIVE SIDE SCALING (RSS)
 132bc44a933SRobert Mustacchi  *
134bc44a933SRobert Mustacchi  * 	Receive side scaling is a term that isn't common in illumos, but is used
135bc44a933SRobert Mustacchi  * 	by vendors and was popularized by Microsoft. It refers to the idea of
136bc44a933SRobert Mustacchi  * 	spreading the incoming receive load out across multiple interrupts which
137bc44a933SRobert Mustacchi  * 	can be directed to different CPUs. This allows a device to leverage
138bc44a933SRobert Mustacchi  * 	hardware rings even when it doesn't support hardware classification. The
139bc44a933SRobert Mustacchi  * 	hardware uses an algorithm to perform fanout that ensures the flow
140bc44a933SRobert Mustacchi  * 	invariant is maintained.
141bc44a933SRobert Mustacchi  *
142bc44a933SRobert Mustacchi  * SOFT RING SET
143bc44a933SRobert Mustacchi  *
144bc44a933SRobert Mustacchi  * 	A soft ring set, commonly abbreviated SRS, is a collection of rings and
145bc44a933SRobert Mustacchi  * 	is used for both transmitting and receiving. It is maintained in the
146bc44a933SRobert Mustacchi  * 	structure mac_soft_ring_set_t. A soft ring set is usually associated
147bc44a933SRobert Mustacchi  * 	with flows, and coordinates both the use of hardware and software rings.
148bc44a933SRobert Mustacchi  * 	Because the use of hardware rings can change as devices such as VNICs
149bc44a933SRobert Mustacchi  * 	come and go, we always ensure that the set has software classification
150bc44a933SRobert Mustacchi  * 	rules that correspond to the hardware classification rules from rings.
151bc44a933SRobert Mustacchi  *
152bc44a933SRobert Mustacchi  * 	Soft ring sets are also used for the enforcement of various QoS
153bc44a933SRobert Mustacchi  * 	properties. For example, if a bandwidth limit has been placed on a
154bc44a933SRobert Mustacchi  * 	specific flow or device, then that will be enforced by the soft ring
155bc44a933SRobert Mustacchi  * 	set.
156bc44a933SRobert Mustacchi  *
157bc44a933SRobert Mustacchi  * SERVICE ATTACHMENT POINT (SAP)
158bc44a933SRobert Mustacchi  *
159bc44a933SRobert Mustacchi  * 	The service attachment point is a DLPI (Data Link Provider Interface)
160bc44a933SRobert Mustacchi  * 	concept; however, it comes up quite often in MAC. Most MAC devices speak
161bc44a933SRobert Mustacchi  * 	a protocol that has some notion of different channels or message type
162bc44a933SRobert Mustacchi  * 	identifiers. For example, Ethernet defines an EtherType which is a part
163bc44a933SRobert Mustacchi  * 	of the Ethernet header and defines the particular protocol of the data
164bc44a933SRobert Mustacchi  * 	payload. If the EtherType is set to 0x0800, then it defines that the
165bc44a933SRobert Mustacchi  * 	contents of that Ethernet frame is IPv4 traffic. For Ethernet, the
166bc44a933SRobert Mustacchi  * 	EtherType is the SAP.
167bc44a933SRobert Mustacchi  *
168bc44a933SRobert Mustacchi  * 	In DLPI, a given consumer attaches to a specific SAP. In illumos, the ip
169bc44a933SRobert Mustacchi  * 	and arp drivers attach to the EtherTypes for IPv4, IPv6, and ARP. Using
170bc44a933SRobert Mustacchi  * 	libdlpi(3LIB) user software can attach to arbitrary SAPs. With the
171bc44a933SRobert Mustacchi  * 	exception of 802.1Q VLAN tagged traffic, MAC itself does not directly
172bc44a933SRobert Mustacchi  * 	consume the SAP; however, it uses that information as part of hashing
173bc44a933SRobert Mustacchi  * 	and it may be used as part of the construction of flows.
174bc44a933SRobert Mustacchi  *
175bc44a933SRobert Mustacchi  * PRIMARY MAC CLIENT
176bc44a933SRobert Mustacchi  *
177bc44a933SRobert Mustacchi  * 	The primary mac client refers to a mac client whose unicast address
178bc44a933SRobert Mustacchi  * 	matches the address of the device itself. For example, if the system has
 179bc44a933SRobert Mustacchi  * 	instances of the e1000g driver such as e1000g0, e1000g1, etc., the
180bc44a933SRobert Mustacchi  * 	primary mac client is the one named after the device itself. VNICs that
181bc44a933SRobert Mustacchi  * 	are created on top of such devices are not the primary client.
182bc44a933SRobert Mustacchi  *
183bc44a933SRobert Mustacchi  * TRANSMIT DESCRIPTORS
184bc44a933SRobert Mustacchi  *
185bc44a933SRobert Mustacchi  * 	Transmit descriptors are a resource that most GLDv3 device drivers have.
186bc44a933SRobert Mustacchi  * 	Generally, a GLDv3 device driver takes a frame that's meant to be output
187bc44a933SRobert Mustacchi  * 	and puts a copy of it into a region of memory. Each region of memory
188bc44a933SRobert Mustacchi  * 	usually has an associated descriptor that the device uses to manage
189bc44a933SRobert Mustacchi  * 	properties of the frames. Devices have a limited number of such
190bc44a933SRobert Mustacchi  * 	descriptors. They get reclaimed once the device finishes putting the
191bc44a933SRobert Mustacchi  * 	frame on the wire.
192bc44a933SRobert Mustacchi  *
193bc44a933SRobert Mustacchi  * 	If the driver runs out of transmit descriptors, for example, the OS is
194bc44a933SRobert Mustacchi  * 	generating more frames than it can put on the wire, then it will return
195bc44a933SRobert Mustacchi  * 	them back to the MAC layer.
196bc44a933SRobert Mustacchi  *
197bc44a933SRobert Mustacchi  * ---------------------------------
198bc44a933SRobert Mustacchi  * Rings, Classification, and Fanout
199bc44a933SRobert Mustacchi  * ---------------------------------
200bc44a933SRobert Mustacchi  *
201bc44a933SRobert Mustacchi  * The heart of MAC is made up of rings, and not those that Elven-kings wear.
202bc44a933SRobert Mustacchi  * When receiving a packet, MAC breaks the work into two different, though
203bc44a933SRobert Mustacchi  * interrelated phases. The first phase is generally classification and then the
204bc44a933SRobert Mustacchi  * second phase is generally fanout. When a frame comes in from a GLDv3 Device,
205bc44a933SRobert Mustacchi  * MAC needs to determine where that frame should be delivered. If it's a
206bc44a933SRobert Mustacchi  * unicast frame (say a normal TCP/IP packet), then it will be delivered to a
207bc44a933SRobert Mustacchi  * single MAC client; however, if it's a broadcast or multicast frame, then MAC
208bc44a933SRobert Mustacchi  * may need to deliver it to multiple MAC clients.
209bc44a933SRobert Mustacchi  *
210bc44a933SRobert Mustacchi  * On transmit, classification isn't quite as important, but may still be used.
211bc44a933SRobert Mustacchi  * Unlike with the receive path, the classification is not used to determine
212bc44a933SRobert Mustacchi  * devices that should transmit something, but rather is used for special
 213bc44a933SRobert Mustacchi  * properties of a flow, e.g., bandwidth limits for a given IP address, device,
214bc44a933SRobert Mustacchi  * connection.
215bc44a933SRobert Mustacchi  *
216bc44a933SRobert Mustacchi  * MAC employs a software classifier and leverages hardware classification as
217bc44a933SRobert Mustacchi  * well. The software classifier can leverage the full layer two information,
218bc44a933SRobert Mustacchi  * source, destination, VLAN, and SAP. If the SAP indicates that IP traffic is
219bc44a933SRobert Mustacchi  * being sent, it can classify based on the IP header, and finally, it also
220bc44a933SRobert Mustacchi  * knows how to classify based on the local and remote ports of TCP, UDP, and
221bc44a933SRobert Mustacchi  * SCTP.
222bc44a933SRobert Mustacchi  *
223bc44a933SRobert Mustacchi  * Hardware classifiers vary in capability. Generally all hardware classifiers
224bc44a933SRobert Mustacchi  * provide the capability to classify based on the destination MAC address. Some
225bc44a933SRobert Mustacchi  * hardware has additional filters built in for performing more in-depth
226bc44a933SRobert Mustacchi  * classification; however, it often has much more limited resources for these
227bc44a933SRobert Mustacchi  * activities as compared to the layer two destination address classification.
228bc44a933SRobert Mustacchi  *
229bc44a933SRobert Mustacchi  * The modus operandi in MAC is to always ensure that we have software-based
230bc44a933SRobert Mustacchi  * capabilities and rules in place and then to supplement that with hardware
231bc44a933SRobert Mustacchi  * resources when available. In general, simple layer two classification is
232bc44a933SRobert Mustacchi  * sufficient and nothing else is used, unless a specific flow is created with
233bc44a933SRobert Mustacchi  * tools such as flowadm(1M) or bandwidth limits are set on a device with
234bc44a933SRobert Mustacchi  * dladm(1M).
235bc44a933SRobert Mustacchi  *
236bc44a933SRobert Mustacchi  * RINGS AND GROUPS
237bc44a933SRobert Mustacchi  *
238bc44a933SRobert Mustacchi  * To get into how rings and classification play together, it's first important
239bc44a933SRobert Mustacchi  * to understand how hardware devices commonly associate rings and allow them to
240bc44a933SRobert Mustacchi  * be programmed. Recall that a hardware ring should be thought of as a DMA
241bc44a933SRobert Mustacchi  * buffer and an interrupt resource. Rings are then collected into groups. A
242bc44a933SRobert Mustacchi  * group itself has a series of classification rules. One or more MAC addresses
243bc44a933SRobert Mustacchi  * are assigned to a group.
244bc44a933SRobert Mustacchi  *
245bc44a933SRobert Mustacchi  * Hardware devices vary in terms of what capabilities they provide. Sometimes
246bc44a933SRobert Mustacchi  * they allow for a dynamic assignment of rings to a group and sometimes they
247bc44a933SRobert Mustacchi  * have a static assignment of rings to a group. For example, the ixgbe driver
248bc44a933SRobert Mustacchi  * has a static assignment of rings to groups such that every group has exactly
249bc44a933SRobert Mustacchi  * one ring and the number of groups is equal to the number of rings.
250bc44a933SRobert Mustacchi  *
251bc44a933SRobert Mustacchi  * Classification and receive side scaling both come into play with how a device
252bc44a933SRobert Mustacchi  * advertises itself to MAC and how MAC uses it. If a device supports layer two
253bc44a933SRobert Mustacchi  * classification of frames, then MAC will assign MAC addresses to a group as a
254bc44a933SRobert Mustacchi  * form of primary classification. If a single MAC address is assigned to a
255bc44a933SRobert Mustacchi  * group, a common case, then MAC will consider packets that come in from rings
256bc44a933SRobert Mustacchi  * on that group to be fully classified and will not need to do any software
257bc44a933SRobert Mustacchi  * classification unless a specific flow has been created.
258bc44a933SRobert Mustacchi  *
259bc44a933SRobert Mustacchi  * If a device supports receive side scaling, then it may advertise or support
260bc44a933SRobert Mustacchi  * groups with multiple rings. In those cases, then receive side scaling will
261bc44a933SRobert Mustacchi  * come into play and MAC will use that as a means of fanning out received
262bc44a933SRobert Mustacchi  * frames across multiple CPUs. This can also be combined with groups that
263bc44a933SRobert Mustacchi  * support layer two classification.
264bc44a933SRobert Mustacchi  *
265bc44a933SRobert Mustacchi  * If a device supports dynamic assignments of rings to groups, then MAC will
266bc44a933SRobert Mustacchi  * change around the way that rings are assigned to various groups as devices
267bc44a933SRobert Mustacchi  * come and go from the system. For example, when a VNIC is created, a new flow
268bc44a933SRobert Mustacchi  * will be created for the VNIC's MAC address. If a hardware ring is available,
269bc44a933SRobert Mustacchi  * MAC may opt to reassign it from one group to another.
270bc44a933SRobert Mustacchi  *
271bc44a933SRobert Mustacchi  * ASSIGNMENT OF HARDWARE RINGS
272bc44a933SRobert Mustacchi  *
273bc44a933SRobert Mustacchi  * This is a bit of a complicated subject that varies depending on the device,
 274bc44a933SRobert Mustacchi  * the use of aggregations, and the special nature of the primary mac client.
275bc44a933SRobert Mustacchi  * section deserves being fleshed out.
276bc44a933SRobert Mustacchi  *
277bc44a933SRobert Mustacchi  * FANOUT
278bc44a933SRobert Mustacchi  *
279bc44a933SRobert Mustacchi  * illumos uses fanout to help spread out the incoming processing load of chains
280bc44a933SRobert Mustacchi  * of frames away from a single CPU. If a device supports receive side scaling,
281bc44a933SRobert Mustacchi  * then that provides an initial form of fanout; however, what we're concerned
282bc44a933SRobert Mustacchi  * with all happens after the context of a given set of frames being classified
283bc44a933SRobert Mustacchi  * to a soft ring set.
284bc44a933SRobert Mustacchi  *
285bc44a933SRobert Mustacchi  * After frames reach a soft ring set and account for any potential bandwidth
286bc44a933SRobert Mustacchi  * related accounting, they may be fanned out based on one of the following
287bc44a933SRobert Mustacchi  * three modes:
288bc44a933SRobert Mustacchi  *
289bc44a933SRobert Mustacchi  *     o No Fanout
290bc44a933SRobert Mustacchi  *     o Protocol level fanout
291bc44a933SRobert Mustacchi  *     o Full software ring protocol fanout
292bc44a933SRobert Mustacchi  *
293bc44a933SRobert Mustacchi  * MAC makes the determination as to which of these modes a given soft ring set
294bc44a933SRobert Mustacchi  * obtains based on parameters such as whether or not it's the primary mac
295bc44a933SRobert Mustacchi  * client, whether it's on a 10 GbE or faster device, user controlled dladm(1M)
296bc44a933SRobert Mustacchi  * properties, and the nature of the hardware and the resources that it has.
297bc44a933SRobert Mustacchi  *
298bc44a933SRobert Mustacchi  * When there is no fanout, MAC does not create any soft rings for a device and
299bc44a933SRobert Mustacchi  * the device has frames delivered directly to the MAC client.
300bc44a933SRobert Mustacchi  *
301bc44a933SRobert Mustacchi  * Otherwise, all fanout is performed by software. MAC divides incoming frames
302bc44a933SRobert Mustacchi  * into one of three buckets -- IPv4 TCP traffic, IPv4 UDP traffic, and
303bc44a933SRobert Mustacchi  * everything else. Note, VLAN tagged traffic is considered other, regardless of
304bc44a933SRobert Mustacchi  * the interior EtherType. Regardless of the type of fanout, these three
305bc44a933SRobert Mustacchi  * categories or buckets are always used.
306bc44a933SRobert Mustacchi  *
307bc44a933SRobert Mustacchi  * The difference between protocol level fanout and full software ring protocol
308bc44a933SRobert Mustacchi  * fanout is the number of software rings that end up getting created. The
309bc44a933SRobert Mustacchi  * system always uses the same number of software rings per protocol bucket. So
310bc44a933SRobert Mustacchi  * in the first case when we're just doing protocol level fanout, we just create
311bc44a933SRobert Mustacchi  * one software ring each for IPv4 TCP traffic, IPv4 UDP traffic, and everything
312bc44a933SRobert Mustacchi  * else.
313bc44a933SRobert Mustacchi  *
314bc44a933SRobert Mustacchi  * In the case where we do full software ring protocol fanout, we generally use
315bc44a933SRobert Mustacchi  * mac_compute_soft_ring_count() to determine the number of rings. There are
316bc44a933SRobert Mustacchi  * other combinations of properties and devices that may send us down other
317bc44a933SRobert Mustacchi  * paths, but this is a common starting point. If it's a non-bandwidth enforced
318bc44a933SRobert Mustacchi  * device and we're on at least a 10 GbE link, then we'll use eight soft rings
319bc44a933SRobert Mustacchi  * per protocol bucket as a starting point. See mac_compute_soft_ring_count()
320bc44a933SRobert Mustacchi  * for more information on the total number.
321bc44a933SRobert Mustacchi  *
322bc44a933SRobert Mustacchi  * For each of these rings, we create a mac_soft_ring_t and an associated worker
323bc44a933SRobert Mustacchi  * thread. Particularly when doing full software ring protocol fanout, we bind
324bc44a933SRobert Mustacchi  * each of the worker threads to individual CPUs.
325bc44a933SRobert Mustacchi  *
326bc44a933SRobert Mustacchi  * The other advantage of these software rings is that it allows upper layers to
327bc44a933SRobert Mustacchi  * optionally poll on them. For example, TCP can leverage an squeue to poll on
328bc44a933SRobert Mustacchi  * the software ring, see squeue.c for more information.
329bc44a933SRobert Mustacchi  *
330bc44a933SRobert Mustacchi  * DLS BYPASS
331bc44a933SRobert Mustacchi  *
332bc44a933SRobert Mustacchi  * DLS is the data link services module. It interfaces with DLPI, which is the
333bc44a933SRobert Mustacchi  * primary way that other parts of the system such as IP interface with the MAC
334bc44a933SRobert Mustacchi  * layer. While DLS is traditionally a STREAMS-based interface, it allows for
335bc44a933SRobert Mustacchi  * certain modules such as IP to negotiate various more modern interfaces to be
336bc44a933SRobert Mustacchi  * used, which are useful for higher performance and allow it to use direct
337bc44a933SRobert Mustacchi  * function calls to DLS instead of using STREAMS.
338bc44a933SRobert Mustacchi  *
339bc44a933SRobert Mustacchi  * When we have IPv4 TCP or UDP software rings, then traffic on those rings is
 340bc44a933SRobert Mustacchi  * eligible for what we call the dls bypass. In those cases, rather than
 341bc44a933SRobert Mustacchi  * going through mac_rx_deliver() to DLS, frames are delivered directly
 342bc44a933SRobert Mustacchi  * via the callback registered with DLS, generally ip_input().
343bc44a933SRobert Mustacchi  *
344bc44a933SRobert Mustacchi  * HARDWARE RING POLLING
345bc44a933SRobert Mustacchi  *
346bc44a933SRobert Mustacchi  * GLDv3 devices with hardware rings generally deliver chains of messages
347bc44a933SRobert Mustacchi  * (mblk_t chain) during the context of a single interrupt. However, interrupts
348bc44a933SRobert Mustacchi  * are not the only way that these devices may be used. As part of implementing
349bc44a933SRobert Mustacchi  * ring support, a GLDv3 device driver must have a way to disable the generation
350bc44a933SRobert Mustacchi  * of that interrupt and allow for the operating system to poll on that ring.
351bc44a933SRobert Mustacchi  *
352bc44a933SRobert Mustacchi  * To implement this, every soft ring set has a worker thread and a polling
353bc44a933SRobert Mustacchi  * thread. If a sufficient packet rate comes into the system, MAC will 'blank'
354bc44a933SRobert Mustacchi  * (disable) interrupts on that specific ring and the polling thread will start
355bc44a933SRobert Mustacchi  * consuming packets from the hardware device and deliver them to the soft ring
356bc44a933SRobert Mustacchi  * set, where the worker thread will take over.
357bc44a933SRobert Mustacchi  *
358bc44a933SRobert Mustacchi  * Once the rate of packet intake drops down below a certain threshold, then
359bc44a933SRobert Mustacchi  * polling on the hardware ring will be quiesced and interrupts will be
360bc44a933SRobert Mustacchi  * re-enabled for the given ring. This effectively allows the system to shift
361bc44a933SRobert Mustacchi  * how it handles a ring based on its load. At high packet rates, polling on the
362bc44a933SRobert Mustacchi  * device as opposed to relying on interrupts can actually reduce overall system
363bc44a933SRobert Mustacchi  * load due to the minimization of interrupt activity.
364bc44a933SRobert Mustacchi  *
365bc44a933SRobert Mustacchi  * Note the importance of each ring having its own interrupt source. The whole
366bc44a933SRobert Mustacchi  * idea here is that we do not disable interrupts on the device as a whole, but
367bc44a933SRobert Mustacchi  * rather each ring can be independently toggled.
368bc44a933SRobert Mustacchi  *
369bc44a933SRobert Mustacchi  * USE OF WORKER THREADS
370bc44a933SRobert Mustacchi  *
371bc44a933SRobert Mustacchi  * Both the soft ring set and individual soft rings have a worker thread
372bc44a933SRobert Mustacchi  * associated with them that may be bound to a specific CPU in the system. Any
373bc44a933SRobert Mustacchi  * such assignment will get reassessed as part of dynamic reconfiguration events
374bc44a933SRobert Mustacchi  * in the system such as the onlining and offlining of CPUs and the creation of
375bc44a933SRobert Mustacchi  * CPU partitions.
376bc44a933SRobert Mustacchi  *
377bc44a933SRobert Mustacchi  * In many cases, while in an interrupt, we try to deliver a frame all the way
378bc44a933SRobert Mustacchi  * through the stack in the context of the interrupt itself. However, if the
379bc44a933SRobert Mustacchi  * amount of queued frames has exceeded a threshold, then we instead defer to
380bc44a933SRobert Mustacchi  * the worker thread to do this work and signal it. This is particularly useful
381bc44a933SRobert Mustacchi  * when you have the soft ring set delivering frames into multiple software
382bc44a933SRobert Mustacchi  * rings. If it was only delivering frames into a single software ring then
383bc44a933SRobert Mustacchi  * there'd be no need to have another thread take over. However, if it's
384bc44a933SRobert Mustacchi  * delivering chains of frames to multiple rings, then it's worthwhile to have
385bc44a933SRobert Mustacchi  * the worker for the software ring take over so that the different software
386bc44a933SRobert Mustacchi  * rings can be processed in parallel.
387bc44a933SRobert Mustacchi  *
388bc44a933SRobert Mustacchi  * In a similar fashion to the hardware polling thread, if we don't have a
389bc44a933SRobert Mustacchi  * backlog or there's nothing to do, then the worker thread will go back to
390bc44a933SRobert Mustacchi  * sleep and frames can be delivered all the way from an interrupt. This
391bc44a933SRobert Mustacchi  * behavior is useful as it's designed to minimize latency and the default
392bc44a933SRobert Mustacchi  * disposition of MAC is to optimize for latency.
393bc44a933SRobert Mustacchi  *
394bc44a933SRobert Mustacchi  * MAINTAINING CHAINS
395bc44a933SRobert Mustacchi  *
396bc44a933SRobert Mustacchi  * Another useful idea that MAC uses is to try and maintain frames in chains for
397bc44a933SRobert Mustacchi  * as long as possible. The idea is that all of MAC can handle chains of frames
398bc44a933SRobert Mustacchi  * structured as a series of mblk_t structures linked with the b_next pointer.
399bc44a933SRobert Mustacchi  * When performing software classification and software fanout, MAC does not
400bc44a933SRobert Mustacchi  * simply determine the destination and send the frame along. Instead, in the
401bc44a933SRobert Mustacchi  * case of classification, it tries to maintain a chain for as long as possible
402bc44a933SRobert Mustacchi  * before passing it along and performing additional processing.
403bc44a933SRobert Mustacchi  *
404bc44a933SRobert Mustacchi  * In the case of fanout, MAC first determines what the target software ring is
405bc44a933SRobert Mustacchi  * for every frame in the original chain and constructs a new chain for each
406bc44a933SRobert Mustacchi  * target. MAC then delivers the new chain to each software ring in succession.
407bc44a933SRobert Mustacchi  *
408bc44a933SRobert Mustacchi  * The whole rationale for doing this is that we want to try and maintain the
409bc44a933SRobert Mustacchi  * pipe as much as possible and deliver as many frames through the stack at once
410bc44a933SRobert Mustacchi  * that we can, rather than just pushing a single frame through. This can often
411bc44a933SRobert Mustacchi  * help bring down latency and allows MAC to get a better sense of the overall
412bc44a933SRobert Mustacchi  * activity in the system and properly engage worker threads.
413bc44a933SRobert Mustacchi  *
414bc44a933SRobert Mustacchi  * --------------------
415bc44a933SRobert Mustacchi  * Bandwidth Management
416bc44a933SRobert Mustacchi  * --------------------
417bc44a933SRobert Mustacchi  *
418bc44a933SRobert Mustacchi  * Bandwidth management is something that's built into the soft ring set itself.
419bc44a933SRobert Mustacchi  * When bandwidth limits are placed on a flow, a corresponding soft ring set is
420bc44a933SRobert Mustacchi  * toggled into bandwidth mode. This changes how we transmit and receive the
421bc44a933SRobert Mustacchi  * frames in question.
422bc44a933SRobert Mustacchi  *
423bc44a933SRobert Mustacchi  * Bandwidth management is done on a per-tick basis. We translate the user's
424bc44a933SRobert Mustacchi  * requested bandwidth from a quantity per-second into a quantity per-tick. MAC
425bc44a933SRobert Mustacchi  * cannot process a frame across more than one tick, thus it sets a lower bound
426bc44a933SRobert Mustacchi  * for the bandwidth cap to be a single MTU. This also means that when
427bc44a933SRobert Mustacchi  * hires ticks are enabled (hz is set to 1000), that the minimum amount of
428bc44a933SRobert Mustacchi  * bandwidth is higher, because the number of ticks has increased and MAC has to
429bc44a933SRobert Mustacchi  * go from accepting 100 packets / sec to 1000 / sec.
430bc44a933SRobert Mustacchi  *
431bc44a933SRobert Mustacchi  * The bandwidth counter is reset by either the soft ring set's worker thread or
432bc44a933SRobert Mustacchi  * a thread that is doing an inline transmit or receive if they discover that
433bc44a933SRobert Mustacchi  * the current tick is in the future from the recorded tick.
434bc44a933SRobert Mustacchi  *
435bc44a933SRobert Mustacchi  * Whenever we're receiving or transmitting data, we end up leaving most of the
436bc44a933SRobert Mustacchi  * work to the soft ring set's worker thread. This forces data inserted into the
437bc44a933SRobert Mustacchi  * soft ring set to be effectively serialized and allows us to exhume bandwidth
438bc44a933SRobert Mustacchi  * at a reasonable rate. If there is nothing in the soft ring set at the moment
439bc44a933SRobert Mustacchi  * and the set has available bandwidth, then it may be processed inline.
440bc44a933SRobert Mustacchi  * Otherwise, the worker is responsible for taking care of the soft ring set.
441bc44a933SRobert Mustacchi  *
442bc44a933SRobert Mustacchi  * ---------------------
443bc44a933SRobert Mustacchi  * The Receive Data Path
444bc44a933SRobert Mustacchi  * ---------------------
445bc44a933SRobert Mustacchi  *
446bc44a933SRobert Mustacchi  * The following series of ASCII art images breaks apart the way that a frame
447bc44a933SRobert Mustacchi  * comes in and is processed in MAC.
448bc44a933SRobert Mustacchi  *
449bc44a933SRobert Mustacchi  * Part 1 -- Initial frame receipt, SRS classification
450bc44a933SRobert Mustacchi  *
451bc44a933SRobert Mustacchi  * Here, a frame is received by a GLDv3 driver, generally in the context of an
452bc44a933SRobert Mustacchi  * interrupt, and it ends up in mac_rx_common(). A driver calls either mac_rx or
453bc44a933SRobert Mustacchi  * mac_rx_ring, depending on whether or not it supports rings and can identify
454bc44a933SRobert Mustacchi  * the interrupt as having come from a specific ring. Here we determine whether
455bc44a933SRobert Mustacchi  * or not it's fully classified and perform software classification as
456bc44a933SRobert Mustacchi  * appropriate. From here, everything always ends up going to either entry [A]
457bc44a933SRobert Mustacchi  * or entry [B] based on whether or not they have subflow processing needed. We
458bc44a933SRobert Mustacchi  * leave via fanout or delivery.
459bc44a933SRobert Mustacchi  *
460bc44a933SRobert Mustacchi  *           +===========+
461bc44a933SRobert Mustacchi  *           v hardware  v
462bc44a933SRobert Mustacchi  *           v interrupt v
463bc44a933SRobert Mustacchi  *           +===========+
464bc44a933SRobert Mustacchi  *                 |
465bc44a933SRobert Mustacchi  *                 * . . appropriate
466bc44a933SRobert Mustacchi  *                 |     upcall made
467bc44a933SRobert Mustacchi  *                 |     by GLDv3 driver  . . always
468bc44a933SRobert Mustacchi  *                 |                      .
469bc44a933SRobert Mustacchi  *  +--------+     |     +----------+     .    +---------------+
470bc44a933SRobert Mustacchi  *  | GLDv3  |     +---->| mac_rx   |-----*--->| mac_rx_common |
471bc44a933SRobert Mustacchi  *  | Driver |-->--+     +----------+          +---------------+
472bc44a933SRobert Mustacchi  *  +--------+     |        ^                         |
473bc44a933SRobert Mustacchi  *      |          |        ^                         v
474bc44a933SRobert Mustacchi  *      ^          |        * . . always   +----------------------+
475bc44a933SRobert Mustacchi  *      |          |        |              | mac_promisc_dispatch |
476bc44a933SRobert Mustacchi  *      |          |    +-------------+    +----------------------+
477bc44a933SRobert Mustacchi  *      |          +--->| mac_rx_ring |               |
478bc44a933SRobert Mustacchi  *      |               +-------------+               * . . hw classified
479bc44a933SRobert Mustacchi  *      |                                             v     or single flow?
480bc44a933SRobert Mustacchi  *      |                                             |
481bc44a933SRobert Mustacchi  *      |                                   +--------++--------------+
482bc44a933SRobert Mustacchi  *      |                                   |        |               * hw class,
483bc44a933SRobert Mustacchi  *      |                                   |        * hw classified | subflows
484bc44a933SRobert Mustacchi  *      |                 no hw class and . *        | or single     | exist
485bc44a933SRobert Mustacchi  *      |                 subflows          |        | flow          |
486bc44a933SRobert Mustacchi  *      |                                   |        v               v
487bc44a933SRobert Mustacchi  *      |                                   |   +-----------+   +-----------+
488bc44a933SRobert Mustacchi  *      |                                   |   |   goto    |   |  goto     |
489bc44a933SRobert Mustacchi  *      |                                   |   | entry [A] |   | entry [B] |
490bc44a933SRobert Mustacchi  *      |                                   |   +-----------+   +-----------+
491bc44a933SRobert Mustacchi  *      |                                   v          ^
492bc44a933SRobert Mustacchi  *      |                            +-------------+   |
493bc44a933SRobert Mustacchi  *      |                            | mac_rx_flow |   * SRS and flow found,
494bc44a933SRobert Mustacchi  *      |                            +-------------+   | call flow cb
495bc44a933SRobert Mustacchi  *      |                                   |          +------+
496bc44a933SRobert Mustacchi  *      |                                   v                 |
497bc44a933SRobert Mustacchi  *      v                             +==========+    +-----------------+
498bc44a933SRobert Mustacchi  *      |                             v For each v--->| mac_rx_classify |
499bc44a933SRobert Mustacchi  * +----------+                       v  mblk_t  v    +-----------------+
500bc44a933SRobert Mustacchi  * |   srs    |                       +==========+
501bc44a933SRobert Mustacchi  * | polling  |
502bc44a933SRobert Mustacchi  * |  thread  |->------------------------------------------+
503bc44a933SRobert Mustacchi  * +----------+                                            |
504bc44a933SRobert Mustacchi  *                                                         v       . inline
505bc44a933SRobert Mustacchi  *            +--------------------+   +----------+   +---------+  .
506bc44a933SRobert Mustacchi  *    [A]---->| mac_rx_srs_process |-->| check bw |-->| enqueue |--*---------+
507bc44a933SRobert Mustacchi  *            +--------------------+   |  limits  |   | frames  |            |
508bc44a933SRobert Mustacchi  *               ^                     +----------+   | to SRS  |            |
509bc44a933SRobert Mustacchi  *               |                                    +---------+            |
510bc44a933SRobert Mustacchi  *               |  send chain              +--------+    |                  |
511bc44a933SRobert Mustacchi  *               *  when classified         | signal |    * BW limits,       |
512bc44a933SRobert Mustacchi  *               |  flow changes            |  srs   |<---+ loopback,        |
513bc44a933SRobert Mustacchi  *               |                          | worker |      stack too        |
514bc44a933SRobert Mustacchi  *               |                          +--------+      deep             |
515bc44a933SRobert Mustacchi  *      +-----------------+        +--------+                                |
516bc44a933SRobert Mustacchi  *      | mac_flow_lookup |        |  srs   |     +---------------------+    |
517bc44a933SRobert Mustacchi  *      +-----------------+        | worker |---->| mac_rx_srs_drain    |<---+
518bc44a933SRobert Mustacchi  *               ^                 | thread |     | mac_rx_srs_drain_bw |
519bc44a933SRobert Mustacchi  *               |                 +--------+     +---------------------+
520bc44a933SRobert Mustacchi  *               |                                          |
521bc44a933SRobert Mustacchi  *         +----------------------------+                   * software rings
522bc44a933SRobert Mustacchi  *   [B]-->| mac_rx_srs_subflow_process |                   | for fanout?
523bc44a933SRobert Mustacchi  *         +----------------------------+                   |
524bc44a933SRobert Mustacchi  *                                               +----------+-----------+
525bc44a933SRobert Mustacchi  *                                               |                      |
526bc44a933SRobert Mustacchi  *                                               v                      v
527bc44a933SRobert Mustacchi  *                                          +--------+             +--------+
528bc44a933SRobert Mustacchi  *                                          |  goto  |             |  goto  |
529bc44a933SRobert Mustacchi  *                                          | Part 2 |             | Part 3 |
530bc44a933SRobert Mustacchi  *                                          +--------+             +--------+
531bc44a933SRobert Mustacchi  *
532bc44a933SRobert Mustacchi  * Part 2 -- Fanout
533bc44a933SRobert Mustacchi  *
534bc44a933SRobert Mustacchi  * This part is concerned with using software fanout to assign frames to
535bc44a933SRobert Mustacchi  * software rings and then deliver them to MAC clients or allow those rings to
536bc44a933SRobert Mustacchi  * be polled upon. While there are two different primary fanout entry points,
537bc44a933SRobert Mustacchi  * mac_rx_fanout and mac_rx_proto_fanout, they behave in similar ways, and aside
538bc44a933SRobert Mustacchi  * from some of the individual hashing techniques used, most of the general
539bc44a933SRobert Mustacchi  * flow is the same.
540bc44a933SRobert Mustacchi  *
541bc44a933SRobert Mustacchi  *  +--------+              +-------------------+
542bc44a933SRobert Mustacchi  *  |  From  |---+--------->| mac_rx_srs_fanout |----+
543bc44a933SRobert Mustacchi  *  | Part 1 |   |          +-------------------+    |    +=================+
544bc44a933SRobert Mustacchi  *  +--------+   |                                   |    v for each mblk_t v
545bc44a933SRobert Mustacchi  *               * . . protocol only                 +--->v assign to new   v
546bc44a933SRobert Mustacchi  *               |     fanout                        |    v chain based on  v
547bc44a933SRobert Mustacchi  *               |                                   |    v hash % nrings   v
548bc44a933SRobert Mustacchi  *               |    +-------------------------+    |    +=================+
549bc44a933SRobert Mustacchi  *               +--->| mac_rx_srs_proto_fanout |----+             |
550bc44a933SRobert Mustacchi  *                    +-------------------------+                  |
551bc44a933SRobert Mustacchi  *                                                                 v
552bc44a933SRobert Mustacchi  *    +------------+    +--------------------------+       +================+
553bc44a933SRobert Mustacchi  *    | enqueue in |<---| mac_rx_soft_ring_process |<------v for each chain v
554bc44a933SRobert Mustacchi  *    | soft ring  |    +--------------------------+       +================+
555bc44a933SRobert Mustacchi  *    +------------+
556bc44a933SRobert Mustacchi  *         |                                    +-----------+
557bc44a933SRobert Mustacchi  *         * soft ring set                      | soft ring |
558bc44a933SRobert Mustacchi  *         | empty and no                       |  worker   |
559bc44a933SRobert Mustacchi  *         | worker?                            |  thread   |
560bc44a933SRobert Mustacchi  *         |                                    +-----------+
561bc44a933SRobert Mustacchi  *         +------*----------------+                  |
562bc44a933SRobert Mustacchi  *         |      .                |                  v
563bc44a933SRobert Mustacchi  *    No . *      . Yes            |       +------------------------+
564bc44a933SRobert Mustacchi  *         |                       +----<--| mac_rx_soft_ring_drain |
565bc44a933SRobert Mustacchi  *         |                       |       +------------------------+
566bc44a933SRobert Mustacchi  *         v                       |
567bc44a933SRobert Mustacchi  *   +-----------+                 v
568bc44a933SRobert Mustacchi  *   |   signal  |         +---------------+
569bc44a933SRobert Mustacchi  *   | soft ring |         | Deliver chain |
570bc44a933SRobert Mustacchi  *   |   worker  |         | goto Part 3   |
571bc44a933SRobert Mustacchi  *   +-----------+         +---------------+
572bc44a933SRobert Mustacchi  *
573bc44a933SRobert Mustacchi  *
574bc44a933SRobert Mustacchi  * Part 3 -- Packet Delivery
575bc44a933SRobert Mustacchi  *
576bc44a933SRobert Mustacchi  * Here, we go through and deliver the mblk_t chain directly to a given
577bc44a933SRobert Mustacchi  * processing function. In a lot of cases this is mac_rx_deliver(). In the case
578bc44a933SRobert Mustacchi  * of DLS bypass being used, then instead we end up going ahead and delivering it
579bc44a933SRobert Mustacchi  * to the direct callback registered with DLS, generally ip_input.
580bc44a933SRobert Mustacchi  *
581bc44a933SRobert Mustacchi  *
582bc44a933SRobert Mustacchi  *   +---------+            +----------------+    +------------------+
583bc44a933SRobert Mustacchi  *   |  From   |---+------->| mac_rx_deliver |--->| Off to DLS, or   |
584bc44a933SRobert Mustacchi  *   | Parts 1 |   |        +----------------+    | other MAC client |
585bc44a933SRobert Mustacchi  *   |  and 2  |   * DLS bypass                   +------------------+
586bc44a933SRobert Mustacchi  *   +---------+   | enabled   +----------+    +-------------+
587bc44a933SRobert Mustacchi  *                 +---------->| ip_input |--->|    To IP    |
588bc44a933SRobert Mustacchi  *                             +----------+    | and beyond! |
589bc44a933SRobert Mustacchi  *                                             +-------------+
590bc44a933SRobert Mustacchi  *
591bc44a933SRobert Mustacchi  * ----------------------
592bc44a933SRobert Mustacchi  * The Transmit Data Path
593bc44a933SRobert Mustacchi  * ----------------------
594bc44a933SRobert Mustacchi  *
595bc44a933SRobert Mustacchi  * Before we go into the images, it's worth talking about a problem that is a
596bc44a933SRobert Mustacchi  * bit different from the receive data path. GLDv3 device drivers have a finite
597bc44a933SRobert Mustacchi  * amount of transmit descriptors. When they run out, they return unused frames
598bc44a933SRobert Mustacchi  * back to MAC. MAC, at this point has several options about what it will do,
599bc44a933SRobert Mustacchi  * which vary based upon the settings that the client uses.
600bc44a933SRobert Mustacchi  *
601bc44a933SRobert Mustacchi  * When a device runs out of descriptors, the next thing that MAC does is
602bc44a933SRobert Mustacchi  * enqueue them off of the soft ring set or a software ring, depending on the
603bc44a933SRobert Mustacchi  * configuration of the soft ring set. MAC will enqueue up to a high watermark
604bc44a933SRobert Mustacchi  * of mblk_t chains, at which point it will indicate flow control back to the
605bc44a933SRobert Mustacchi  * client. Once this condition is reached, any mblk_t chains that were not
606bc44a933SRobert Mustacchi  * enqueued will be returned to the caller and they will have to decide what to
607bc44a933SRobert Mustacchi  * do with them. There are various flags that control this behavior that a
608bc44a933SRobert Mustacchi  * client may pass, which are discussed below.
609bc44a933SRobert Mustacchi  *
610bc44a933SRobert Mustacchi  * When this condition is hit, MAC also returns a cookie to the client in
611bc44a933SRobert Mustacchi  * addition to unconsumed frames. Clients can poll on that cookie and register a
612bc44a933SRobert Mustacchi  * callback with MAC to be notified when they are no longer subject to flow
613bc44a933SRobert Mustacchi  * control, at which point they may continue to call mac_tx(). This flow control
614bc44a933SRobert Mustacchi  * actually manages to work itself all the way up the stack, back through dls,
615bc44a933SRobert Mustacchi  * to ip, through the various protocols, and to sockfs.
616bc44a933SRobert Mustacchi  *
617bc44a933SRobert Mustacchi  * While the behavior described above is the default, this behavior can be
618bc44a933SRobert Mustacchi  * modified. There are two alternate modes, described below, which are
619bc44a933SRobert Mustacchi  * controlled with flags.
620bc44a933SRobert Mustacchi  *
621bc44a933SRobert Mustacchi  * DROP MODE
622bc44a933SRobert Mustacchi  *
623bc44a933SRobert Mustacchi  * This mode is controlled by having the client pass the MAC_DROP_ON_NO_DESC
624bc44a933SRobert Mustacchi  * flag. When this is passed, if a device driver runs out of transmit
625bc44a933SRobert Mustacchi  * descriptors, then the MAC layer will drop any unsent traffic. The client in
626bc44a933SRobert Mustacchi  * this case will never have any frames returned to it.
627bc44a933SRobert Mustacchi  *
628bc44a933SRobert Mustacchi  * DON'T ENQUEUE
629bc44a933SRobert Mustacchi  *
630bc44a933SRobert Mustacchi  * This mode is controlled by having the client pass the MAC_TX_NO_ENQUEUE flag.
631bc44a933SRobert Mustacchi  * If the MAC_DROP_ON_NO_DESC flag is also passed, it takes precedence. In this
632bc44a933SRobert Mustacchi  * mode, when we hit a case where a driver runs out of transmit descriptors,
633bc44a933SRobert Mustacchi  * then instead of enqueuing packets in a soft ring set or software ring, we
634bc44a933SRobert Mustacchi  * instead return the mblk_t chain back to the caller and immediately put the
635bc44a933SRobert Mustacchi  * soft ring set into flow control mode.
636bc44a933SRobert Mustacchi  *
637bc44a933SRobert Mustacchi  * The following series of ASCII art images describe the transmit data path that
638bc44a933SRobert Mustacchi  * MAC clients enter into based on calling into mac_tx(). A soft ring set has a
639bc44a933SRobert Mustacchi  * transmission function associated with it. There are seven possible
640bc44a933SRobert Mustacchi  * transmission modes, some of which share function entry points. The one that a
641bc44a933SRobert Mustacchi  * soft ring set gets depends on properties such as whether there are
642bc44a933SRobert Mustacchi  * transmission rings for fanout, whether the device involves aggregations,
643bc44a933SRobert Mustacchi  * whether any bandwidth limits exist, etc.
644bc44a933SRobert Mustacchi  *
645bc44a933SRobert Mustacchi  *
646bc44a933SRobert Mustacchi  * Part 1 -- Initial checks
647bc44a933SRobert Mustacchi  *
648bc44a933SRobert Mustacchi  *      * . called by
649bc44a933SRobert Mustacchi  *      |   MAC clients
650bc44a933SRobert Mustacchi  *      v                     . . No
651bc44a933SRobert Mustacchi  *  +--------+  +-----------+ .   +-------------------+  +====================+
652bc44a933SRobert Mustacchi  *  | mac_tx |->| device    |-*-->| mac_protect_check |->v Is this the simple v
653bc44a933SRobert Mustacchi  *  +--------+  | quiesced? |     +-------------------+  v case? See [1]      v
654bc44a933SRobert Mustacchi  *              +-----------+            |               +====================+
655bc44a933SRobert Mustacchi  *                  * . Yes              * failed                 |
656bc44a933SRobert Mustacchi  *                  v                    | frames                 |
657bc44a933SRobert Mustacchi  *             +--------------+          |                +-------+---------+
658bc44a933SRobert Mustacchi  *             | freemsgchain |<---------+          Yes . *            No . *
659bc44a933SRobert Mustacchi  *             +--------------+                           v                 v
660bc44a933SRobert Mustacchi  *                                                  +-----------+     +--------+
661bc44a933SRobert Mustacchi  *                                                  |   goto    |     |  goto  |
662bc44a933SRobert Mustacchi  *                                                  |  Part 2   |     | SRS TX |
663bc44a933SRobert Mustacchi  *                                                  | Entry [A] |     |  func  |
664bc44a933SRobert Mustacchi  *                                                  +-----------+     +--------+
665bc44a933SRobert Mustacchi  *                                                        |                 |
666bc44a933SRobert Mustacchi  *                                                        |                 v
667bc44a933SRobert Mustacchi  *                                                        |           +--------+
668bc44a933SRobert Mustacchi  *                                                        +---------->| return |
669bc44a933SRobert Mustacchi  *                                                                    | cookie |
670bc44a933SRobert Mustacchi  *                                                                    +--------+
671bc44a933SRobert Mustacchi  *
672bc44a933SRobert Mustacchi  * [1] The simple case refers to the SRS being configured with the
673bc44a933SRobert Mustacchi  * SRS_TX_DEFAULT transmission mode, having a single mblk_t (not a chain), there
674bc44a933SRobert Mustacchi  * being only a single active client, and not having a backlog in the srs.
675bc44a933SRobert Mustacchi  *
676bc44a933SRobert Mustacchi  *
677bc44a933SRobert Mustacchi  * Part 2 -- The SRS transmission functions
678bc44a933SRobert Mustacchi  *
679bc44a933SRobert Mustacchi  * This part is a bit more complicated. The different transmission paths often
680bc44a933SRobert Mustacchi  * leverage one another. In this case, we'll draw out the more common ones
681bc44a933SRobert Mustacchi  * before the parts that depend upon them. Here, we're going to start with the
682bc44a933SRobert Mustacchi  * workings of mac_tx_send() a common function that most of the others end up
683bc44a933SRobert Mustacchi  * calling.
684bc44a933SRobert Mustacchi  *
685bc44a933SRobert Mustacchi  *      +-------------+
686bc44a933SRobert Mustacchi  *      | mac_tx_send |
687bc44a933SRobert Mustacchi  *      +-------------+
688bc44a933SRobert Mustacchi  *            |
689bc44a933SRobert Mustacchi  *            v
690bc44a933SRobert Mustacchi  *      +=============+    +==============+
691bc44a933SRobert Mustacchi  *      v  more than  v--->v    check     v
692bc44a933SRobert Mustacchi  *      v one client? v    v VLAN and add v
693bc44a933SRobert Mustacchi  *      +=============+    v  VLAN tags   v
694bc44a933SRobert Mustacchi  *            |            +==============+
695bc44a933SRobert Mustacchi  *            |                  |
696bc44a933SRobert Mustacchi  *            +------------------+
697bc44a933SRobert Mustacchi  *            |
698bc44a933SRobert Mustacchi  *            |                 [A]
699bc44a933SRobert Mustacchi  *            v                  |
700bc44a933SRobert Mustacchi  *       +============+ . No     v
701bc44a933SRobert Mustacchi  *       v more than  v .     +==========+     +--------------------------+
702bc44a933SRobert Mustacchi  *       v one active v-*---->v for each v---->| mac_promisc_dispatch_one |---+
703bc44a933SRobert Mustacchi  *       v  client?   v       v mblk_t   v     +--------------------------+   |
704bc44a933SRobert Mustacchi  *       +============+       +==========+        ^                           |
705bc44a933SRobert Mustacchi  *            |                                   |       +==========+        |
706bc44a933SRobert Mustacchi  *            * . Yes                             |       v hardware v<-------+
707bc44a933SRobert Mustacchi  *            v                      +------------+       v  rings?  v
708bc44a933SRobert Mustacchi  *       +==========+                |                    +==========+
709bc44a933SRobert Mustacchi  *       v for each v       No . . . *                         |
710bc44a933SRobert Mustacchi  *       v mblk_t   v       specific |                         |
711bc44a933SRobert Mustacchi  *       +==========+       flow     |                   +-----+-----+
712bc44a933SRobert Mustacchi  *            |                      |                   |           |
713bc44a933SRobert Mustacchi  *            v                      |                   v           v
714bc44a933SRobert Mustacchi  *    +-----------------+            |               +-------+  +---------+
715bc44a933SRobert Mustacchi  *    | mac_tx_classify |------------+               | GLDv3 |  |  GLDv3  |
716bc44a933SRobert Mustacchi  *    +-----------------+                            |TX func|  | ring tx |
717bc44a933SRobert Mustacchi  *            |                                      +-------+  |  func   |
718bc44a933SRobert Mustacchi  *            * Specific flow, generally                 |      +---------+
719bc44a933SRobert Mustacchi  *            | bcast, mcast, loopback                   |           |
720bc44a933SRobert Mustacchi  *            v                                          +-----+-----+
721bc44a933SRobert Mustacchi  *      +==========+       +---------+                         |
722bc44a933SRobert Mustacchi  *      v valid L2 v--*--->| freemsg |                         v
723bc44a933SRobert Mustacchi  *      v  header  v  . No +---------+               +-------------------+
724bc44a933SRobert Mustacchi  *      +==========+                                 | return unconsumed |
725bc44a933SRobert Mustacchi  *            * . Yes                                |   frames to the   |
726bc44a933SRobert Mustacchi  *            v                                      |      caller       |
727bc44a933SRobert Mustacchi  *      +===========+                                +-------------------+
728bc44a933SRobert Mustacchi  *      v broadcast v      +----------------+                  ^
729bc44a933SRobert Mustacchi  *      v   flow?   v--*-->| mac_bcast_send |------------------+
730bc44a933SRobert Mustacchi  *      +===========+  .   +----------------+                  |
731bc44a933SRobert Mustacchi  *            |        . . Yes                                 |
732bc44a933SRobert Mustacchi  *       No . *                                                v
733bc44a933SRobert Mustacchi  *            |  +---------------------+  +---------------+  +----------+
734bc44a933SRobert Mustacchi  *            +->|mac_promisc_dispatch |->| mac_fix_cksum |->|   flow   |
735bc44a933SRobert Mustacchi  *               +---------------------+  +---------------+  | callback |
736bc44a933SRobert Mustacchi  *                                                           +----------+
737bc44a933SRobert Mustacchi  *
738bc44a933SRobert Mustacchi  *
739bc44a933SRobert Mustacchi  * In addition, many, but not all, of the routines rely on
740bc44a933SRobert Mustacchi  * mac_tx_softring_process as an entry point.
741bc44a933SRobert Mustacchi  *
742bc44a933SRobert Mustacchi  *
743bc44a933SRobert Mustacchi  *                                           . No             . No
744bc44a933SRobert Mustacchi  * +--------------------------+   +========+ .  +===========+ .  +-------------+
745bc44a933SRobert Mustacchi  * | mac_tx_soft_ring_process |-->v worker v-*->v out of tx v-*->|    goto     |
746bc44a933SRobert Mustacchi  * +--------------------------+   v only?  v    v  descr.?  v    | mac_tx_send |
747bc44a933SRobert Mustacchi  *                                +========+    +===========+    +-------------+
748bc44a933SRobert Mustacchi  *                              Yes . *               * . Yes           |
749bc44a933SRobert Mustacchi  *                   . No             v               |                 v
750bc44a933SRobert Mustacchi  *     v=========+   .          +===========+ . Yes   |     Yes .  +==========+
751bc44a933SRobert Mustacchi  *     v append  v<--*----------v out of tx v-*-------+---------*--v returned v
752bc44a933SRobert Mustacchi  *     v mblk_t  v              v  descr.?  v         |            v frames?  v
753bc44a933SRobert Mustacchi  *     v chain   v              +===========+         |            +==========+
754bc44a933SRobert Mustacchi  *     +=========+                                    |                 *. No
755bc44a933SRobert Mustacchi  *         |                                          |                 v
756bc44a933SRobert Mustacchi  *         v                                          v           +------------+
757bc44a933SRobert Mustacchi  * +===================+           +----------------------+       |   done     |
758bc44a933SRobert Mustacchi  * v worker scheduled? v           | mac_tx_sring_enqueue |       | processing |
759bc44a933SRobert Mustacchi  * v Out of tx descr?  v           +----------------------+       +------------+
760bc44a933SRobert Mustacchi  * +===================+                      |
761bc44a933SRobert Mustacchi  *    |           |           . Yes           v
762bc44a933SRobert Mustacchi  *    * Yes       * No        .         +============+
763bc44a933SRobert Mustacchi  *    |           v         +-*---------v drop on no v
764bc44a933SRobert Mustacchi  *    |      +========+     v           v  TX desc?  v
765bc44a933SRobert Mustacchi  *    |      v  wake  v  +----------+   +============+
766bc44a933SRobert Mustacchi  *    |      v worker v  | mac_pkt_ |         * . No
767bc44a933SRobert Mustacchi  *    |      +========+  | drop     |         |         . Yes         . No
768bc44a933SRobert Mustacchi  *    |           |      +----------+         v         .             .
769bc44a933SRobert Mustacchi  *    |           |         v   ^     +===============+ .  +========+ .
770bc44a933SRobert Mustacchi  *    +--+--------+---------+   |     v Don't enqueue v-*->v ring   v-*----+
771bc44a933SRobert Mustacchi  *       |                      |     v     Set?      v    v empty? v      |
772bc44a933SRobert Mustacchi  *       |      +---------------+     +===============+    +========+      |
773bc44a933SRobert Mustacchi  *       |      |                            |                |            |
774bc44a933SRobert Mustacchi  *       |      |        +-------------------+                |            |
775bc44a933SRobert Mustacchi  *       |      *. Yes   |                          +---------+            |
776bc44a933SRobert Mustacchi  *       |      |        v                          v                      v
777bc44a933SRobert Mustacchi  *       |      |  +===========+               +========+      +--------------+
778bc44a933SRobert Mustacchi  *       |      +<-v At hiwat? v               v append v      |    return    |
779bc44a933SRobert Mustacchi  *       |         +===========+               v mblk_t v      | mblk_t chain |
780bc44a933SRobert Mustacchi  *       |                  * No               v chain  v      |   and flow   |
781bc44a933SRobert Mustacchi  *       |                  v                  +========+      |    control   |
782bc44a933SRobert Mustacchi  *       |               +=========+                |          |    cookie    |
783bc44a933SRobert Mustacchi  *       |               v  append v                v          +--------------+
784bc44a933SRobert Mustacchi  *       |               v  mblk_t v           +========+
785bc44a933SRobert Mustacchi  *       |               v  chain  v           v  wake  v   +------------+
786bc44a933SRobert Mustacchi  *       |               +=========+           v worker v-->|    done    |
787bc44a933SRobert Mustacchi  *       |                    |                +========+   | processing |
788bc44a933SRobert Mustacchi  *       |                    v       .. Yes                +------------+
789bc44a933SRobert Mustacchi  *       |               +=========+  .   +========+
790bc44a933SRobert Mustacchi  *       |               v  first  v--*-->v  wake  v
791bc44a933SRobert Mustacchi  *       |               v append? v      v worker v
792bc44a933SRobert Mustacchi  *       |               +=========+      +========+
793bc44a933SRobert Mustacchi  *       |                   |                |
794bc44a933SRobert Mustacchi  *       |              No . *                |
795bc44a933SRobert Mustacchi  *       |                   v                |
796bc44a933SRobert Mustacchi  *       |       +--------------+             |
797bc44a933SRobert Mustacchi  *       +------>|   Return     |             |
798bc44a933SRobert Mustacchi  *               | flow control |<------------+
799bc44a933SRobert Mustacchi  *               |   cookie     |
800bc44a933SRobert Mustacchi  *               +--------------+
801bc44a933SRobert Mustacchi  *
802bc44a933SRobert Mustacchi  *
803bc44a933SRobert Mustacchi  * The remaining images are all specific to each of the different transmission
804bc44a933SRobert Mustacchi  * modes.
805bc44a933SRobert Mustacchi  *
806bc44a933SRobert Mustacchi  * SRS TX DEFAULT
807bc44a933SRobert Mustacchi  *
808bc44a933SRobert Mustacchi  *      [ From Part 1 ]
809bc44a933SRobert Mustacchi  *             |
810bc44a933SRobert Mustacchi  *             v
811bc44a933SRobert Mustacchi  * +-------------------------+
812bc44a933SRobert Mustacchi  * | mac_tx_single_ring_mode |
813bc44a933SRobert Mustacchi  * +-------------------------+
814bc44a933SRobert Mustacchi  *            |
815bc44a933SRobert Mustacchi  *            |       . Yes
816bc44a933SRobert Mustacchi  *            v       .
817bc44a933SRobert Mustacchi  *       +==========+ .  +============+
818bc44a933SRobert Mustacchi  *       v   SRS    v-*->v   Try to   v---->---------------------+
819bc44a933SRobert Mustacchi  *       v backlog? v    v enqueue in v                          |
820bc44a933SRobert Mustacchi  *       +==========+    v     SRS    v-->------+                * . . Queue too
821bc44a933SRobert Mustacchi  *            |          +============+         * don't enqueue  |     deep or
822bc44a933SRobert Mustacchi  *            * . No         ^     |            | flag or at     |     drop flag
823bc44a933SRobert Mustacchi  *            |              |     v            | hiwat,         |
824bc44a933SRobert Mustacchi  *            v              |     |            | return    +---------+
825bc44a933SRobert Mustacchi  *     +-------------+       |     |            | cookie    | freemsg |
826bc44a933SRobert Mustacchi  *     |    goto     |-*-----+     |            |           +---------+
827bc44a933SRobert Mustacchi  *     | mac_tx_send | . returned  |            |                |
828bc44a933SRobert Mustacchi  *     +-------------+   mblk_t    |            |                |
829bc44a933SRobert Mustacchi  *            |                    |            |                |
830bc44a933SRobert Mustacchi  *            |                    |            |                |
831bc44a933SRobert Mustacchi  *            * . . all mblk_t     * queued,    |                |
832bc44a933SRobert Mustacchi  *            v     consumed       | may return |                |
833bc44a933SRobert Mustacchi  *     +-------------+             | tx cookie  |                |
834bc44a933SRobert Mustacchi  *     | SRS TX func |<------------+------------+----------------+
835bc44a933SRobert Mustacchi  *     |  completed  |
836bc44a933SRobert Mustacchi  *     +-------------+
837bc44a933SRobert Mustacchi  *
838bc44a933SRobert Mustacchi  * SRS_TX_SERIALIZE
839bc44a933SRobert Mustacchi  *
840bc44a933SRobert Mustacchi  *   +------------------------+
841bc44a933SRobert Mustacchi  *   | mac_tx_serializer_mode |
842bc44a933SRobert Mustacchi  *   +------------------------+
843bc44a933SRobert Mustacchi  *               |
844bc44a933SRobert Mustacchi  *               |        . No
845bc44a933SRobert Mustacchi  *               v        .
846bc44a933SRobert Mustacchi  *         +============+ .  +============+    +-------------+   +============+
847bc44a933SRobert Mustacchi  *         v srs being  v-*->v  set SRS   v--->|    goto     |-->v remove SRS v
848bc44a933SRobert Mustacchi  *         v processed? v    v proc flags v    | mac_tx_send |   v proc flag  v
849bc44a933SRobert Mustacchi  *         +============+    +============+    +-------------+   +============+
850bc44a933SRobert Mustacchi  *               |                                                     |
851bc44a933SRobert Mustacchi  *               * Yes                                                 |
852bc44a933SRobert Mustacchi  *               v                                       . No          v
853bc44a933SRobert Mustacchi  *      +--------------------+                           .        +==========+
854bc44a933SRobert Mustacchi  *      | mac_tx_srs_enqueue |  +------------------------*-----<--v returned v
855bc44a933SRobert Mustacchi  *      +--------------------+  |                                 v frames?  v
856bc44a933SRobert Mustacchi  *               |              |   . Yes                         +==========+
857bc44a933SRobert Mustacchi  *               |              |   .                                  |
858bc44a933SRobert Mustacchi  *               |              |   . +=========+                      v
859bc44a933SRobert Mustacchi  *               v              +-<-*-v queued  v     +--------------------+
860bc44a933SRobert Mustacchi  *        +-------------+       |     v frames? v<----| mac_tx_srs_enqueue |
861bc44a933SRobert Mustacchi  *        | SRS TX func |       |     +=========+     +--------------------+
862bc44a933SRobert Mustacchi  *        | completed,  |<------+         * . Yes
863bc44a933SRobert Mustacchi  *        | may return  |       |         v
864bc44a933SRobert Mustacchi  *        |   cookie    |       |     +========+
865bc44a933SRobert Mustacchi  *        +-------------+       +-<---v  wake  v
866bc44a933SRobert Mustacchi  *                                    v worker v
867bc44a933SRobert Mustacchi  *                                    +========+
868bc44a933SRobert Mustacchi  *
869bc44a933SRobert Mustacchi  *
870bc44a933SRobert Mustacchi  * SRS_TX_FANOUT
871bc44a933SRobert Mustacchi  *
872bc44a933SRobert Mustacchi  *                                             . Yes
873bc44a933SRobert Mustacchi  *   +--------------------+    +=============+ .   +--------------------------+
874bc44a933SRobert Mustacchi  *   | mac_tx_fanout_mode |--->v Have fanout v-*-->|           goto           |
875bc44a933SRobert Mustacchi  *   +--------------------+    v   hint?     v     | mac_tx_soft_ring_process |
876bc44a933SRobert Mustacchi  *                             +=============+     +--------------------------+
877bc44a933SRobert Mustacchi  *                                   * . No                    |
878bc44a933SRobert Mustacchi  *                                   v                         ^
879bc44a933SRobert Mustacchi  *                             +===========+                   |
880bc44a933SRobert Mustacchi  *                        +--->v for each  v           +===============+
881bc44a933SRobert Mustacchi  *                        |    v   mblk_t  v           v pick softring v
882bc44a933SRobert Mustacchi  *                 same   *    +===========+           v   from hash   v
883bc44a933SRobert Mustacchi  *                 hash   |          |                 +===============+
884bc44a933SRobert Mustacchi  *                        |          v                         |
885bc44a933SRobert Mustacchi  *                        |   +--------------+                 |
886bc44a933SRobert Mustacchi  *                        +---| mac_pkt_hash |--->*------------+
887bc44a933SRobert Mustacchi  *                            +--------------+    . different
888bc44a933SRobert Mustacchi  *                                                  hash or
889bc44a933SRobert Mustacchi  *                                                  done proc.
890bc44a933SRobert Mustacchi  * SRS_TX_AGGR                                      chain
891bc44a933SRobert Mustacchi  *
892bc44a933SRobert Mustacchi  *   +------------------+    +================================+
893bc44a933SRobert Mustacchi  *   | mac_tx_aggr_mode |--->v Use aggr capab function to     v
894bc44a933SRobert Mustacchi  *   +------------------+    v find appropriate tx ring.      v
895bc44a933SRobert Mustacchi  *                           v Applies hash based on aggr     v
896bc44a933SRobert Mustacchi  *                           v policy, see mac_tx_aggr_mode() v
897bc44a933SRobert Mustacchi  *                           +================================+
898bc44a933SRobert Mustacchi  *                                          |
899bc44a933SRobert Mustacchi  *                                          v
900bc44a933SRobert Mustacchi  *                           +-------------------------------+
901bc44a933SRobert Mustacchi  *                           |            goto               |
902bc44a933SRobert Mustacchi  *                           |   mac_tx_soft_ring_process    |
903bc44a933SRobert Mustacchi  *                           +-------------------------------+
904bc44a933SRobert Mustacchi  *
905bc44a933SRobert Mustacchi  *
906bc44a933SRobert Mustacchi  * SRS_TX_BW, SRS_TX_BW_FANOUT, SRS_TX_BW_AGGR
907bc44a933SRobert Mustacchi  *
908bc44a933SRobert Mustacchi  * Note, all three of these tx functions start from the same place --
909bc44a933SRobert Mustacchi  * mac_tx_bw_mode().
910bc44a933SRobert Mustacchi  *
911bc44a933SRobert Mustacchi  *  +----------------+
912bc44a933SRobert Mustacchi  *  | mac_tx_bw_mode |
913bc44a933SRobert Mustacchi  *  +----------------+
914bc44a933SRobert Mustacchi  *         |
915bc44a933SRobert Mustacchi  *         v          . No               . No               . Yes
916bc44a933SRobert Mustacchi  *  +==============+  .  +============+  .  +=============+ .  +=========+
917bc44a933SRobert Mustacchi  *  v  Out of BW?  v--*->v SRS empty? v--*->v  reset BW   v-*->v Bump BW v
918bc44a933SRobert Mustacchi  *  +==============+     +============+     v tick count? v    v Usage   v
919bc44a933SRobert Mustacchi  *         |                   |            +=============+    +=========+
920bc44a933SRobert Mustacchi  *         |         +---------+                   |                |
921bc44a933SRobert Mustacchi  *         |         |        +--------------------+                |
922bc44a933SRobert Mustacchi  *         |         |        |              +----------------------+
923bc44a933SRobert Mustacchi  *         v         |        v              v
924bc44a933SRobert Mustacchi  * +===============+ |  +==========+   +==========+      +------------------+
925bc44a933SRobert Mustacchi  * v Don't enqueue v |  v  set bw  v   v Is aggr? v--*-->|       goto       |
926bc44a933SRobert Mustacchi  * v   flag set?   v |  v enforced v   +==========+  .   | mac_tx_aggr_mode |-+
927bc44a933SRobert Mustacchi  * +===============+ |  +==========+         |       .   +------------------+ |
928bc44a933SRobert Mustacchi  *   |    Yes .*     |        |         No . *       .                        |
929bc44a933SRobert Mustacchi  *   |         |     |        |              |       . Yes                    |
930bc44a933SRobert Mustacchi  *   * . No    |     |        v              |                                |
931bc44a933SRobert Mustacchi  *   |  +---------+  |   +========+          v              +======+          |
932bc44a933SRobert Mustacchi  *   |  | freemsg |  |   v append v   +============+  . Yes v pick v          |
933bc44a933SRobert Mustacchi  *   |  +---------+  |   v mblk_t v   v Is fanout? v--*---->v ring v          |
934bc44a933SRobert Mustacchi  *   |      |        |   v chain  v   +============+        +======+          |
935bc44a933SRobert Mustacchi  *   +------+        |   +========+          |                  |             |
936bc44a933SRobert Mustacchi  *          v        |        |              v                  v             |
937bc44a933SRobert Mustacchi  *    +---------+    |        v       +-------------+ +--------------------+  |
938bc44a933SRobert Mustacchi  *    | return  |    |   +========+   |    goto     | |       goto         |  |
939bc44a933SRobert Mustacchi  *    |  flow   |    |   v wakeup v   | mac_tx_send | | mac_tx_fanout_mode |  |
940bc44a933SRobert Mustacchi  *    | control |    |   v worker v   +-------------+ +--------------------+  |
941bc44a933SRobert Mustacchi  *    | cookie  |    |   +========+          |                  |             |
942bc44a933SRobert Mustacchi  *    +---------+    |        |              |                  +------+------+
943bc44a933SRobert Mustacchi  *                   |        v              |                         |
944bc44a933SRobert Mustacchi  *                   |   +---------+         |                         v
945bc44a933SRobert Mustacchi  *                   |   | return  |   +============+           +------------+
946bc44a933SRobert Mustacchi  *                   |   |  flow   |   v unconsumed v-------+   |   done     |
947bc44a933SRobert Mustacchi  *                   |   | control |   v   frames?  v       |   | processing |
948bc44a933SRobert Mustacchi  *                   |   | cookie  |   +============+       |   +------------+
949bc44a933SRobert Mustacchi  *                   |   +---------+         |              |
950bc44a933SRobert Mustacchi  *                   |                  Yes  *              |
951bc44a933SRobert Mustacchi  *                   |                       |              |
952bc44a933SRobert Mustacchi  *                   |                 +===========+        |
953bc44a933SRobert Mustacchi  *                   |                 v subtract  v        |
954bc44a933SRobert Mustacchi  *                   |                 v unused bw v        |
955bc44a933SRobert Mustacchi  *                   |                 +===========+        |
956bc44a933SRobert Mustacchi  *                   |                       |              |
957bc44a933SRobert Mustacchi  *                   |                       v              |
958bc44a933SRobert Mustacchi  *                   |              +--------------------+  |
959bc44a933SRobert Mustacchi  *                   +------------->| mac_tx_srs_enqueue |  |
960bc44a933SRobert Mustacchi  *                                  +--------------------+  |
961bc44a933SRobert Mustacchi  *                                           |              |
962bc44a933SRobert Mustacchi  *                                           |              |
963bc44a933SRobert Mustacchi  *                                     +------------+       |
964bc44a933SRobert Mustacchi  *                                     |  return fc |       |
965bc44a933SRobert Mustacchi  *                                     | cookie and |<------+
966bc44a933SRobert Mustacchi  *                                     |    mblk_t  |
967bc44a933SRobert Mustacchi  *                                     +------------+
968bc44a933SRobert Mustacchi  */
969bc44a933SRobert Mustacchi 
970da14cebeSEric Cheng #include <sys/types.h>
971da14cebeSEric Cheng #include <sys/callb.h>
972da14cebeSEric Cheng #include <sys/sdt.h>
973da14cebeSEric Cheng #include <sys/strsubr.h>
974da14cebeSEric Cheng #include <sys/strsun.h>
975da14cebeSEric Cheng #include <sys/vlan.h>
976e2ea9c96SRobert Mustacchi #include <sys/stack.h>
977e2ea9c96SRobert Mustacchi #include <sys/archsystm.h>
978da14cebeSEric Cheng #include <inet/ipsec_impl.h>
979da14cebeSEric Cheng #include <inet/ip_impl.h>
980da14cebeSEric Cheng #include <inet/sadb.h>
981da14cebeSEric Cheng #include <inet/ipsecesp.h>
982da14cebeSEric Cheng #include <inet/ipsecah.h>
983da14cebeSEric Cheng #include <inet/ip6.h>
984da14cebeSEric Cheng 
985da14cebeSEric Cheng #include <sys/mac_impl.h>
986da14cebeSEric Cheng #include <sys/mac_client_impl.h>
987da14cebeSEric Cheng #include <sys/mac_client_priv.h>
988da14cebeSEric Cheng #include <sys/mac_soft_ring.h>
989da14cebeSEric Cheng #include <sys/mac_flow_impl.h>
990da14cebeSEric Cheng 
991da14cebeSEric Cheng static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
992da14cebeSEric Cheng     uintptr_t, uint16_t, mblk_t **);
993da14cebeSEric Cheng static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
994da14cebeSEric Cheng     uintptr_t, uint16_t, mblk_t **);
995da14cebeSEric Cheng static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
996da14cebeSEric Cheng     uintptr_t, uint16_t, mblk_t **);
997da14cebeSEric Cheng static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
998da14cebeSEric Cheng     uintptr_t, uint16_t, mblk_t **);
9990dc2366fSVenugopal Iyer static mac_tx_cookie_t mac_tx_aggr_mode(mac_soft_ring_set_t *, mblk_t *,
10000dc2366fSVenugopal Iyer     uintptr_t, uint16_t, mblk_t **);
1001da14cebeSEric Cheng 
/*
 * Pairs a Tx SRS mode identifier with the transmit routine that
 * implements that mode; see mac_tx_mode_list[] below for the table.
 */
1002da14cebeSEric Cheng typedef struct mac_tx_mode_s {
1003da14cebeSEric Cheng 	mac_tx_srs_mode_t	mac_tx_mode;	/* SRS_TX_* mode id */
1004da14cebeSEric Cheng 	mac_tx_func_t		mac_tx_func;	/* transmit routine */
1005da14cebeSEric Cheng } mac_tx_mode_t;
1006da14cebeSEric Cheng 
1007da14cebeSEric Cheng /*
10080dc2366fSVenugopal Iyer  * There are seven modes of operation on the Tx side. These modes get set
1009da14cebeSEric Cheng  * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
1010da14cebeSEric Cheng  * none of the other modes are user configurable. They get selected by
1011da14cebeSEric Cheng  * the system depending upon whether the link (or flow) has multiple Tx
10120dc2366fSVenugopal Iyer  * rings or a bandwidth configured, or if the link is an aggr, etc.
10130dc2366fSVenugopal Iyer  *
10140dc2366fSVenugopal Iyer  * When the Tx SRS is operating in aggr mode (st_mode) or if there are
10150dc2366fSVenugopal Iyer  * multiple Tx rings owned by Tx SRS, then each Tx ring (pseudo or
10160dc2366fSVenugopal Iyer  * otherwise) will have a soft ring associated with it. These soft rings
10170dc2366fSVenugopal Iyer  * are stored in srs_tx_soft_rings[] array.
10180dc2366fSVenugopal Iyer  *
10190dc2366fSVenugopal Iyer  * Additionally in the case of aggr, there is the st_soft_rings[] array
10200dc2366fSVenugopal Iyer  * in the mac_srs_tx_t structure. This array is used to store the same
10210dc2366fSVenugopal Iyer  * set of soft rings that are present in srs_tx_soft_rings[] array but
10220dc2366fSVenugopal Iyer  * in a different manner. The soft ring associated with the pseudo Tx
10230dc2366fSVenugopal Iyer  * ring is saved at mr_index (of the pseudo ring) in st_soft_rings[]
10240dc2366fSVenugopal Iyer  * array. This helps in quickly getting the soft ring associated with the
10250dc2366fSVenugopal Iyer  * Tx ring when aggr_find_tx_ring() returns the pseudo Tx ring that is to
10260dc2366fSVenugopal Iyer  * be used for transmit.
1027da14cebeSEric Cheng  */
/*
 * Table mapping each Tx SRS mode to its transmit function (the modes are
 * selected in mac_tx_srs_setup(), per the block comment above).  Note that
 * SRS_TX_BW, SRS_TX_BW_FANOUT and SRS_TX_BW_AGGR all enter through
 * mac_tx_bw_mode(), which dispatches further itself.
 */
1028da14cebeSEric Cheng mac_tx_mode_t mac_tx_mode_list[] = {
1029da14cebeSEric Cheng 	{SRS_TX_DEFAULT,	mac_tx_single_ring_mode},
1030da14cebeSEric Cheng 	{SRS_TX_SERIALIZE,	mac_tx_serializer_mode},
1031da14cebeSEric Cheng 	{SRS_TX_FANOUT,		mac_tx_fanout_mode},
1032da14cebeSEric Cheng 	{SRS_TX_BW,		mac_tx_bw_mode},
10330dc2366fSVenugopal Iyer 	{SRS_TX_BW_FANOUT,	mac_tx_bw_mode},
10340dc2366fSVenugopal Iyer 	{SRS_TX_AGGR,		mac_tx_aggr_mode},
10350dc2366fSVenugopal Iyer 	{SRS_TX_BW_AGGR,	mac_tx_bw_mode}
1036da14cebeSEric Cheng };
1037da14cebeSEric Cheng 
1038da14cebeSEric Cheng /*
1039da14cebeSEric Cheng  * Soft Ring Set (SRS) - The Run time code that deals with
1040da14cebeSEric Cheng  * dynamic polling from the hardware, bandwidth enforcement,
1041da14cebeSEric Cheng  * fanout etc.
1042da14cebeSEric Cheng  *
1043da14cebeSEric Cheng  * We try to use H/W classification on NIC and assign traffic for
1044da14cebeSEric Cheng  * a MAC address to a particular Rx ring or ring group. There is a
1045da14cebeSEric Cheng  * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
1046da14cebeSEric Cheng  * switches the underlying Rx ring between interrupt and
1047da14cebeSEric Cheng  * polling mode and enforces any specified B/W control.
1048da14cebeSEric Cheng  *
1049da14cebeSEric Cheng  * There is always a SRS created and tied to each H/W and S/W rule.
1050da14cebeSEric Cheng  * Whenever we create a H/W rule, we always add the same rule to
1051da14cebeSEric Cheng  * S/W classifier and tie a SRS to it.
1052da14cebeSEric Cheng  *
1053da14cebeSEric Cheng  * In case a B/W control is specified, it is broken into bytes
1054da14cebeSEric Cheng  * per ticks and as soon as the quota for a tick is exhausted,
1055da14cebeSEric Cheng  * the underlying Rx ring is forced into poll mode for remainder of
1056da14cebeSEric Cheng  * the tick. The SRS poll thread only polls for bytes that are
1057da14cebeSEric Cheng  * allowed to come in the SRS. We typically let 4x the configured
1058da14cebeSEric Cheng  * B/W worth of packets to come in the SRS (to prevent unnecessary
1059da14cebeSEric Cheng  * drops due to bursts) but only process the specified amount.
1060da14cebeSEric Cheng  *
1061da14cebeSEric Cheng  * A MAC client (e.g. a VNIC or aggr) can have 1 or more
1062da14cebeSEric Cheng  * Rx rings (and corresponding SRSs) assigned to it. The SRS
1063da14cebeSEric Cheng  * in turn can have softrings to do protocol level fanout or
1064da14cebeSEric Cheng  * softrings to do S/W based fanout or both. In case the NIC
1065da14cebeSEric Cheng  * has no Rx rings, we do S/W classification to respective SRS.
1066da14cebeSEric Cheng  * The S/W classification rule is always setup and ready. This
1067da14cebeSEric Cheng  * allows the MAC layer to reassign Rx rings whenever needed
1068da14cebeSEric Cheng  * but packets still continue to flow via the default path and
1069da14cebeSEric Cheng  * getting S/W classified to correct SRS.
1070da14cebeSEric Cheng  *
1071da14cebeSEric Cheng  * The SRS's are used on both Tx and Rx side. They use the same
1072da14cebeSEric Cheng  * data structure but the processing routines have slightly different
1073da14cebeSEric Cheng  * semantics due to the fact that Rx side needs to do dynamic
1074da14cebeSEric Cheng  * polling etc.
1075da14cebeSEric Cheng  *
1076da14cebeSEric Cheng  * Dynamic Polling Notes
1077da14cebeSEric Cheng  * =====================
1078da14cebeSEric Cheng  *
1079da14cebeSEric Cheng  * Each Soft ring set is capable of switching its Rx ring between
1080da14cebeSEric Cheng  * interrupt and poll mode and actively 'polls' for packets in
1081da14cebeSEric Cheng  * poll mode. If the SRS is implementing a B/W limit, it makes
1082da14cebeSEric Cheng  * sure that only Max allowed packets are pulled in poll mode
1083da14cebeSEric Cheng  * and goes to poll mode as soon as B/W limit is exceeded. As
1084da14cebeSEric Cheng  * such, there are no overheads to implement B/W limits.
1085da14cebeSEric Cheng  *
1086da14cebeSEric Cheng  * In poll mode, its better to keep the pipeline going where the
1087da14cebeSEric Cheng  * SRS worker thread keeps processing packets and poll thread
1088da14cebeSEric Cheng  * keeps bringing more packets (specially if they get to run
1089da14cebeSEric Cheng  * on different CPUs). This also prevents the overheads associated
1090da14cebeSEric Cheng  * by excessive signalling (on NUMA machines, this can be
1091da14cebeSEric Cheng  * pretty devastating). The exception is latency optimized case
1092da14cebeSEric Cheng  * where worker thread does no work and interrupt and poll thread
1093da14cebeSEric Cheng  * are allowed to do their own drain.
1094da14cebeSEric Cheng  *
1095da14cebeSEric Cheng  * We use the following policy to control Dynamic Polling:
1096da14cebeSEric Cheng  * 1) We switch to poll mode anytime the processing
1097da14cebeSEric Cheng  *    thread causes a backlog to build up in SRS and
1098da14cebeSEric Cheng  *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
1099da14cebeSEric Cheng  * 2) As long as the backlog stays under the low water
1100da14cebeSEric Cheng  *    mark (sr_lowat), we poll the H/W for more packets.
1101da14cebeSEric Cheng  * 3) If the backlog (sr_poll_pkt_cnt) exceeds low
1102da14cebeSEric Cheng  *    water mark, we stay in poll mode but don't poll
1103da14cebeSEric Cheng  *    the H/W for more packets.
1104da14cebeSEric Cheng  * 4) Anytime in polling mode, if we poll the H/W for
1105da14cebeSEric Cheng  *    packets and find nothing plus we have an existing
1106da14cebeSEric Cheng  *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
1107da14cebeSEric Cheng  *    mode but don't poll the H/W for packets anymore
1108da14cebeSEric Cheng  *    (let the polling thread go to sleep).
1109da14cebeSEric Cheng  * 5) Once the backlog is relieved (packets are processed)
1110da14cebeSEric Cheng  *    we reenable polling (by signalling the poll thread)
1111da14cebeSEric Cheng  *    only when the backlog dips below sr_poll_thres.
1112da14cebeSEric Cheng  * 6) sr_hiwat is used exclusively when we are not
1113da14cebeSEric Cheng  *    polling capable and is used to decide when to
1114da14cebeSEric Cheng  *    drop packets so the SRS queue length doesn't grow
1115da14cebeSEric Cheng  *    infinitely.
1116da14cebeSEric Cheng  *
1117da14cebeSEric Cheng  * NOTE: Also see the block level comment on top of mac_soft_ring.c
1118da14cebeSEric Cheng  */
1119da14cebeSEric Cheng 
1120da14cebeSEric Cheng /*
1121da14cebeSEric Cheng  * mac_latency_optimize
1122da14cebeSEric Cheng  *
1123da14cebeSEric Cheng  * Controls whether the poll thread can process the packets inline
1124da14cebeSEric Cheng  * or let the SRS worker thread do the processing. This applies if
1125da14cebeSEric Cheng  * the SRS was not being processed. For latency sensitive traffic,
1126da14cebeSEric Cheng  * this needs to be true to allow inline processing. For throughput
1127da14cebeSEric Cheng  * under load, this should be false.
1128da14cebeSEric Cheng  *
1129da14cebeSEric Cheng  * This (and other similar) tunable should be rolled into a link
1130da14cebeSEric Cheng  * or flow specific workload hint that can be set using dladm
1131da14cebeSEric Cheng  * linkprop (instead of multiple such tunables).
1132da14cebeSEric Cheng  */
1133da14cebeSEric Cheng boolean_t mac_latency_optimize = B_TRUE;	/* tunable; see comment above */
1134da14cebeSEric Cheng 
1135da14cebeSEric Cheng /*
1136da14cebeSEric Cheng  * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
1137da14cebeSEric Cheng  *
1138da14cebeSEric Cheng  * queue a mp or chain in soft ring set and increment the
1139da14cebeSEric Cheng  * local count (srs_count) for the SRS and the shared counter
1140da14cebeSEric Cheng  * (srs_poll_pkt_cnt - shared between SRS and its soft rings
1141da14cebeSEric Cheng  * to track the total unprocessed packets for polling to work
1142da14cebeSEric Cheng  * correctly).
1143da14cebeSEric Cheng  *
1144da14cebeSEric Cheng  * The size (total bytes queued) counters are incremented only
1145da14cebeSEric Cheng  * if we are doing B/W control.
1146da14cebeSEric Cheng  */
/*
 * Append the mblk_t chain (head .. tail) to the SRS queue and add count
 * to srs_count.  The sz argument is unused here; the Rx/Tx wrappers below
 * account for queued bytes when B/W control is configured.  Caller must
 * hold srs_lock.  (count is parenthesized like the other arguments to
 * keep the expansion safe for arbitrary argument expressions.)
 */
1147da14cebeSEric Cheng #define	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {		\
1148da14cebeSEric Cheng 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
1149da14cebeSEric Cheng 	if ((mac_srs)->srs_last != NULL)				\
1150da14cebeSEric Cheng 		(mac_srs)->srs_last->b_next = (head);			\
1151da14cebeSEric Cheng 	else								\
1152da14cebeSEric Cheng 		(mac_srs)->srs_first = (head);				\
1153da14cebeSEric Cheng 	(mac_srs)->srs_last = (tail);					\
1154da14cebeSEric Cheng 	(mac_srs)->srs_count += (count);				\
1155da14cebeSEric Cheng }
1156da14cebeSEric Cheng 
/*
 * Rx variant of MAC_SRS_ENQUEUE_CHAIN: additionally bumps the shared
 * sr_poll_pkt_cnt counter (shared between the SRS and its soft rings,
 * used by the dynamic polling logic) and, when B/W control is
 * configured, accounts for the queued bytes under mac_bw_lock.
 * Caller must hold srs_lock.
 */
1157da14cebeSEric Cheng #define	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
1158da14cebeSEric Cheng 	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx;			\
1159da14cebeSEric Cheng 									\
1160da14cebeSEric Cheng 	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);		\
1161da14cebeSEric Cheng 	srs_rx->sr_poll_pkt_cnt += count;				\
1162da14cebeSEric Cheng 	ASSERT(srs_rx->sr_poll_pkt_cnt > 0);				\
1163da14cebeSEric Cheng 	if ((mac_srs)->srs_type & SRST_BW_CONTROL) {			\
1164da14cebeSEric Cheng 		(mac_srs)->srs_size += (sz);				\
1165da14cebeSEric Cheng 		mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock);		\
1166da14cebeSEric Cheng 		(mac_srs)->srs_bw->mac_bw_sz += (sz);			\
1167da14cebeSEric Cheng 		mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock);		\
1168da14cebeSEric Cheng 	}								\
1169da14cebeSEric Cheng }
1170da14cebeSEric Cheng 
/*
 * Tx variant of MAC_SRS_ENQUEUE_CHAIN: also marks the SRS as having
 * packets enqueued (SRS_ENQUEUED) and, under B/W control, accounts for
 * the queued bytes.  (mac_srs is parenthesized on its first use for
 * macro-argument hygiene, consistent with every other use.)
 * NOTE(review): unlike the Rx variant, srs_bw->mac_bw_sz is updated here
 * without taking mac_bw_lock -- presumably srs_lock suffices on the Tx
 * side; confirm against the Tx drain paths.
 */
1171da14cebeSEric Cheng #define	MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
1172da14cebeSEric Cheng 	(mac_srs)->srs_state |= SRS_ENQUEUED;				\
1173da14cebeSEric Cheng 	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);		\
1174da14cebeSEric Cheng 	if ((mac_srs)->srs_type & SRST_BW_CONTROL) {			\
1175da14cebeSEric Cheng 		(mac_srs)->srs_size += (sz);				\
1176da14cebeSEric Cheng 		(mac_srs)->srs_bw->mac_bw_sz += (sz);			\
1177da14cebeSEric Cheng 	}								\
1178da14cebeSEric Cheng }
1179da14cebeSEric Cheng 
1180da14cebeSEric Cheng /*
1181da14cebeSEric Cheng  * Turn polling on routines
1182da14cebeSEric Cheng  */
/*
 * If the SRS is polling capable (SRS_POLLING_CAPAB) and not already in
 * poll mode, switch it to poll mode: set SRS_POLLING, disable the Rx
 * ring's interrupt, and count the transition (sr_poll_on).  Caller must
 * hold srs_lock.
 */
1183da14cebeSEric Cheng #define	MAC_SRS_POLLING_ON(mac_srs) {					\
1184da14cebeSEric Cheng 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
1185da14cebeSEric Cheng 	if (((mac_srs)->srs_state &					\
1186da14cebeSEric Cheng 	    (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) {	\
1187da14cebeSEric Cheng 		(mac_srs)->srs_state |= SRS_POLLING;			\
1188da14cebeSEric Cheng 		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
1189da14cebeSEric Cheng 		    (mac_srs)->srs_ring);				\
1190da14cebeSEric Cheng 		(mac_srs)->srs_rx.sr_poll_on++;				\
1191da14cebeSEric Cheng 	}								\
1192da14cebeSEric Cheng }
1193da14cebeSEric Cheng 
/*
 * MAC_SRS_WORKER_POLLING_ON
 *
 * Worker-thread variant of MAC_SRS_POLLING_ON: transition to polling
 * only when the worker is active (SRS_WORKER set) and polling is not
 * already on; counts the transition in sr_worker_poll_on rather than
 * sr_poll_on.  Caller must hold srs_lock (asserted).
 */
#define	MAC_SRS_WORKER_POLLING_ON(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if (((mac_srs)->srs_state &					\
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == 		\
	    (SRS_POLLING_CAPAB|SRS_WORKER)) {				\
		(mac_srs)->srs_state |= SRS_POLLING;			\
		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
		    (mac_srs)->srs_ring);				\
		(mac_srs)->srs_rx.sr_worker_poll_on++;			\
	}								\
}
1205da14cebeSEric Cheng 
/*
 * MAC_SRS_POLL_RING
 *
 * Signal the SRS poll thread to poll the underlying H/W ring
 * provided it wasn't already polling (SRS_GET_PKTS was set).
 *
 * Poll thread gets to run only from mac_rx_srs_drain() and only
 * if the drain was being done by the worker thread.
 */
#define	MAC_SRS_POLL_RING(mac_srs) {					\
	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx;			\
									\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	srs_rx->sr_poll_thr_sig++;					\
	if (((mac_srs)->srs_state & 					\
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) ==		\
		(SRS_WORKER|SRS_POLLING_CAPAB)) {			\
		/* Claim the poll (SRS_GET_PKTS) and wake the poller. */ \
		(mac_srs)->srs_state |= SRS_GET_PKTS;			\
		cv_signal(&(mac_srs)->srs_cv);   			\
	} else {							\
		/* Poll thread already at work; just count the miss. */	\
		srs_rx->sr_poll_thr_busy++;				\
	}								\
}
1229da14cebeSEric Cheng 
/*
 * MAC_SRS_CHECK_BW_CONTROL
 *
 * Check to see if next tick has started so we can reset the
 * SRS_BW_ENFORCED flag and allow more packets to come in the
 * system.
 *
 * Locking: srs_lock is always required; for Rx (non-SRST_TX) SRSes the
 * bandwidth-state lock must be held as well (both asserted below).
 */
#define	MAC_SRS_CHECK_BW_CONTROL(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	ASSERT(((mac_srs)->srs_type & SRST_TX) ||			\
	    MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock));		\
	clock_t now = ddi_get_lbolt();					\
	if ((mac_srs)->srs_bw->mac_bw_curr_time != now) {		\
		/* A new tick: reset usage and lift enforcement. */	\
		(mac_srs)->srs_bw->mac_bw_curr_time = now;		\
		(mac_srs)->srs_bw->mac_bw_used = 0;	       		\
		if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED)	\
			(mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
	}								\
}
1249da14cebeSEric Cheng 
/*
 * MAC_SRS_WORKER_WAKEUP
 *
 * Wake up the SRS worker thread to process the queue as long as
 * no one else is processing the queue. If we are optimizing for
 * latency, we wake up the worker thread immediately or else we
 * wait mac_srs_worker_wakeup_ticks before worker thread gets
 * woken up.
 */
/* Delay (in ticks) before waking the worker; 0 wakes it immediately. */
int mac_srs_worker_wakeup_ticks = 0;
#define	MAC_SRS_WORKER_WAKEUP(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	/* Skip if already processing or a wakeup timer is pending. */	\
	if (!((mac_srs)->srs_state & SRS_PROC) &&			\
		(mac_srs)->srs_tid == NULL) {				\
		if (((mac_srs)->srs_state & SRS_LATENCY_OPT) ||		\
			(mac_srs_worker_wakeup_ticks == 0))		\
			cv_signal(&(mac_srs)->srs_async);		\
		else							\
			/* Deferred wakeup: mac_srs_fire() signals. */	\
			(mac_srs)->srs_tid =				\
				timeout(mac_srs_fire, (mac_srs),	\
					mac_srs_worker_wakeup_ticks);	\
	}								\
}
1273da14cebeSEric Cheng 
/* True if this Tx SRS runs in any of the bandwidth-limited Tx modes. */
#define	TX_BANDWIDTH_MODE(mac_srs)				\
	((mac_srs)->srs_tx.st_mode == SRS_TX_BW ||		\
	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT ||	\
	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_AGGR)
1278da14cebeSEric Cheng 
/*
 * Dispatch a Tx chain to the mode-appropriate soft-ring routine.
 *
 * NOTE(review): this macro reads a local variable named "tx_mode" from
 * the caller's scope -- it is not a macro parameter.  It can only be
 * used where such a local exists; confirm at call sites.
 */
#define	TX_SRS_TO_SOFT_RING(mac_srs, head, hint) {			\
	if (tx_mode == SRS_TX_BW_FANOUT)				\
		(void) mac_tx_fanout_mode(mac_srs, head, hint, 0, NULL);\
	else								\
		(void) mac_tx_aggr_mode(mac_srs, head, hint, 0, NULL);	\
}
1285da14cebeSEric Cheng 
/*
 * MAC_TX_SRS_BLOCK
 *
 * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
 * will be set only if srs_tx_woken_up is FALSE. If
 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 * attempt to transmit again and not setting SRS_TX_BLOCKED does
 * that.
 */
#define	MAC_TX_SRS_BLOCK(srs, mp)	{			\
	ASSERT(MUTEX_HELD(&(srs)->srs_lock));			\
	if ((srs)->srs_tx.st_woken_up) {			\
		/* A wakeup raced us; consume it and retry. */	\
		(srs)->srs_tx.st_woken_up = B_FALSE;		\
	} else {						\
		ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED));	\
		(srs)->srs_state |= SRS_TX_BLOCKED;		\
		(srs)->srs_tx.st_stat.mts_blockcnt++;		\
	}							\
}
1306da14cebeSEric Cheng 
/*
 * MAC_TX_SRS_TEST_HIWAT
 *
 * Called before queueing a packet onto Tx SRS to test and set
 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.  If the queue has
 * also exceeded st_max_q_cnt, the chain is dropped instead of queued.
 *
 * Fix: use B_TRUE/B_FALSE for the boolean_t local instead of raw
 * integer constants, matching the file's convention (see st_woken_up).
 */
#define	MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) {		\
	boolean_t enqueue = B_TRUE;					\
									\
	if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) {		\
		/*							\
		 * flow-controlled. Store srs in cookie so that it	\
		 * can be returned as mac_tx_cookie_t to client		\
		 */							\
		(srs)->srs_state |= SRS_TX_HIWAT;			\
		cookie = (mac_tx_cookie_t)srs;				\
		(srs)->srs_tx.st_hiwat_cnt++;				\
		if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) {	\
			/* increment freed stats */			\
			(srs)->srs_tx.st_stat.mts_sdrops += cnt;	\
			/*						\
			 * b_prev may be set to the fanout hint		\
			 * hence can't use freemsg directly.		\
			 * NOTE(review): this drops "mp_chain", a	\
			 * caller-scope name, not the "mp" argument --	\
			 * confirm all callers pass mp_chain as mp.	\
			 */						\
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);	\
			DTRACE_PROBE1(tx_queued_hiwat,			\
			    mac_soft_ring_set_t *, srs);		\
			enqueue = B_FALSE;				\
		}							\
	}								\
	if (enqueue)							\
		MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz);	\
}
1340da14cebeSEric Cheng 
/*
 * Some utility macros.
 *
 * Acquire/release the SRS bandwidth-state lock, skipping Tx SRSes
 * (SRST_TX), which do not take mac_bw_lock on this path.
 *
 * Fix: wrap the bare if-statements in do { } while (0) and
 * parenthesize the argument, so each macro behaves as one statement
 * and cannot mis-bind in a dangling if/else at a call site.
 */
#define	MAC_SRS_BW_LOCK(srs)						\
	do {								\
		if (!((srs)->srs_type & SRST_TX))			\
			mutex_enter(&(srs)->srs_bw->mac_bw_lock);	\
	} while (0)

#define	MAC_SRS_BW_UNLOCK(srs)						\
	do {								\
		if (!((srs)->srs_type & SRST_TX))			\
			mutex_exit(&(srs)->srs_bw->mac_bw_lock);	\
	} while (0)
1349da14cebeSEric Cheng 
/*
 * MAC_TX_SRS_DROP_MESSAGE
 *
 * Free the given chain, charge the drop to the SRS's Tx software-drop
 * statistic, and return the SRS to the caller as the flow-control
 * cookie.
 *
 * Fix: the stat update referenced "mac_srs" -- a caller-scope name --
 * rather than the macro's srs argument; use the argument so the drop
 * is always charged to the SRS actually passed in.
 */
#define	MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) {		\
	mac_pkt_drop(NULL, NULL, (mp), B_FALSE);		\
	/* increment freed stats */				\
	(srs)->srs_tx.st_stat.mts_sdrops++;			\
	cookie = (mac_tx_cookie_t)(srs);			\
}
1356da14cebeSEric Cheng 
/*
 * MAC_TX_SET_NO_ENQUEUE
 *
 * Refuse to queue: mark the SRS so the client gets a wakeup later
 * (SRS_TX_WAKEUP_CLIENT), return the SRS as the flow-control cookie,
 * and hand the un-enqueued chain back through ret_mp.
 *
 * Fix: the state update referenced caller-scope "mac_srs" instead of
 * the srs argument; use the argument for macro hygiene.
 */
#define	MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) {		\
	(srs)->srs_state |= SRS_TX_WAKEUP_CLIENT;			\
	cookie = (mac_tx_cookie_t)(srs);				\
	*(ret_mp) = (mp_chain);						\
}
1362da14cebeSEric Cheng 
/*
 * MAC_RX_SRS_TOODEEP
 *
 * Macro called as part of receive-side processing to determine if handling
 * can occur in situ (in the interrupt thread) or if it should be left to a
 * worker thread.  Note that the constant used to make this determination is
 * not entirely made-up, and is a result of some empirical validation. That
 * said, the constant is left as a static variable to allow it to be
 * dynamically tuned in the field if and as needed.
 */
/* Minimum remaining stack (bytes) required to process in situ; tunable. */
static uintptr_t mac_rx_srs_stack_needed = 10240;
/* Number of times processing was deferred for lack of stack. */
static uint_t mac_rx_srs_stack_toodeep;

#ifndef STACK_GROWTH_DOWN
#error Downward stack growth assumed.
#endif

/*
 * True when the space between the current frame pointer and the stack
 * base is below mac_rx_srs_stack_needed; also counts each occurrence.
 */
#define	MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
	(uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \
	++mac_rx_srs_stack_toodeep)
1383e2ea9c96SRobert Mustacchi 
1384e2ea9c96SRobert Mustacchi 
1385e2ea9c96SRobert Mustacchi /*
1386da14cebeSEric Cheng  * Drop the rx packet and advance to the next one in the chain.
1387da14cebeSEric Cheng  */
1388da14cebeSEric Cheng static void
1389da14cebeSEric Cheng mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
1390da14cebeSEric Cheng {
1391da14cebeSEric Cheng 	mac_srs_rx_t	*srs_rx = &srs->srs_rx;
1392da14cebeSEric Cheng 
1393da14cebeSEric Cheng 	ASSERT(mp->b_next == NULL);
1394da14cebeSEric Cheng 	mutex_enter(&srs->srs_lock);
1395da14cebeSEric Cheng 	MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
1396da14cebeSEric Cheng 	MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
1397da14cebeSEric Cheng 	mutex_exit(&srs->srs_lock);
1398da14cebeSEric Cheng 
13990dc2366fSVenugopal Iyer 	srs_rx->sr_stat.mrs_sdrops++;
1400da14cebeSEric Cheng 	freemsg(mp);
1401da14cebeSEric Cheng }
1402da14cebeSEric Cheng 
1403da14cebeSEric Cheng /* DATAPATH RUNTIME ROUTINES */
1404da14cebeSEric Cheng 
1405da14cebeSEric Cheng /*
1406da14cebeSEric Cheng  * mac_srs_fire
1407da14cebeSEric Cheng  *
1408da14cebeSEric Cheng  * Timer callback routine for waking up the SRS worker thread.
1409da14cebeSEric Cheng  */
1410da14cebeSEric Cheng static void
1411da14cebeSEric Cheng mac_srs_fire(void *arg)
1412da14cebeSEric Cheng {
1413da14cebeSEric Cheng 	mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
1414da14cebeSEric Cheng 
1415da14cebeSEric Cheng 	mutex_enter(&mac_srs->srs_lock);
1416*8ad9a34fSRyan Zezeski 	if (mac_srs->srs_tid == NULL) {
1417da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_lock);
1418da14cebeSEric Cheng 		return;
1419da14cebeSEric Cheng 	}
1420da14cebeSEric Cheng 
1421*8ad9a34fSRyan Zezeski 	mac_srs->srs_tid = NULL;
1422da14cebeSEric Cheng 	if (!(mac_srs->srs_state & SRS_PROC))
1423da14cebeSEric Cheng 		cv_signal(&mac_srs->srs_async);
1424da14cebeSEric Cheng 
1425da14cebeSEric Cheng 	mutex_exit(&mac_srs->srs_lock);
1426da14cebeSEric Cheng }
1427da14cebeSEric Cheng 
/*
 * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack,
 * and it is used on the TX path.
 */
/* Fold the higher bytes of the hint down into the low bits for hashing. */
#define	HASH_HINT(hint)	\
	((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))
1435da14cebeSEric Cheng 
/*
 * hash based on the src address, dst address and the port information.
 */
#define	HASH_ADDR(src, dst, ports)					\
	(ntohl((src) + (dst)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^	\
	((ports) >> 8) ^ (ports))

/*
 * Map a hash key onto a fanout index.
 * Fix: parenthesize both arguments -- the old (key % sz) expansion
 * mis-evaluated compound-expression arguments (e.g. a + b) because
 * '%' binds tighter than '+'.
 */
#define	COMPUTE_INDEX(key, sz)	((key) % (sz))
1444da14cebeSEric Cheng 
/*
 * FANOUT_ENQUEUE_MP
 *
 * Append mp to the head/tail sub-chain being built for one fanout
 * bucket and bump the bucket's packet count.  Only when bandwidth
 * control is in effect (bw_ctl) is the packet's size (sz0) added to
 * the bucket's byte total.
 */
#define	FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) {	\
	if ((tail) != NULL) {						\
		ASSERT((tail)->b_next == NULL);				\
		(tail)->b_next = (mp);					\
	} else {							\
		ASSERT((head) == NULL);					\
		(head) = (mp);						\
	}								\
	(tail) = (mp);							\
	(cnt)++;							\
	if ((bw_ctl))							\
		(sz) += (sz0);						\
}
1458da14cebeSEric Cheng 
#define	MAC_FANOUT_DEFAULT	0
#define	MAC_FANOUT_RND_ROBIN	1
/* Global fanout policy tunable: MAC_FANOUT_DEFAULT or MAC_FANOUT_RND_ROBIN. */
int mac_fanout_type = MAC_FANOUT_DEFAULT;
1462da14cebeSEric Cheng 
/* Number of real soft-ring buckets (UNDEF below is only a marker). */
#define	MAX_SR_TYPES	3
/* fanout types for port based hashing */
enum pkt_type {
	V4_TCP = 0,
	V4_UDP,
	OTH,
	UNDEF		/* transient: fast-path candidate, not a bucket */
};

/*
 * Pair of local and remote ports in the transport header
 */
#define	PORTS_SIZE 4
1476da14cebeSEric Cheng 
/*
 * mac_rx_srs_proto_fanout
 *
 * This routine delivers packets destined to an SRS into one of the
 * protocol soft rings.
 *
 * Given a chain of packets we need to split it up into multiple sub chains
 * destined into TCP, UDP or OTH soft ring. Instead of entering
 * the soft ring one packet at a time, we want to enter it in the form of a
 * chain otherwise we get this start/stop behaviour where the worker thread
 * goes to sleep and then the next packet comes in forcing it to wake up etc.
 */
static void
mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
	struct ether_header		*ehp;
	struct ether_vlan_header	*evhp;
	uint32_t			sap;
	ipha_t				*ipha;
	uint8_t				*dstaddr;
	size_t				hdrsize;
	mblk_t				*mp;
	mblk_t				*headmp[MAX_SR_TYPES];	/* sub-chain heads */
	mblk_t				*tailmp[MAX_SR_TYPES];	/* sub-chain tails */
	int				cnt[MAX_SR_TYPES];	/* pkts per bucket */
	size_t				sz[MAX_SR_TYPES];	/* bytes per bucket */
	size_t				sz1;
	boolean_t			bw_ctl;
	boolean_t			hw_classified;
	boolean_t			dls_bypass;
	boolean_t			is_ether;
	boolean_t			is_unicast;
	enum pkt_type			type;
	mac_client_impl_t		*mcip = mac_srs->srs_mcip;

	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

	/*
	 * If we don't have a Rx ring, S/W classification would have done
	 * its job and its a packet meant for us. If we were polling on
	 * the default ring (i.e. there was a ring assigned to this SRS),
	 * then we need to make sure that the mac address really belongs
	 * to us.
	 */
	hw_classified = mac_srs->srs_ring != NULL &&
	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

	/*
	 * Special clients (eg. VLAN, non ether, etc) need DLS
	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
	 * such SRSs. Another way of disabling bypass is to set the
	 * MCIS_RX_BYPASS_DISABLE flag.
	 */
	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);

	/* Start with all fanout buckets empty. */
	bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
	bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
	bzero(cnt, MAX_SR_TYPES * sizeof (int));
	bzero(sz, MAX_SR_TYPES * sizeof (size_t));

	/*
	 * We got a chain from SRS that we need to send to the soft rings.
	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
	 * performance reasons), we need to separate out v4_tcp, v4_udp
	 * and the rest goes in other.
	 */
	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		type = OTH;
		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

		if (is_ether) {
			/*
			 * At this point we can be sure the packet at least
			 * has an ether header.
			 */
			if (sz1 < sizeof (struct ether_header)) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			ehp = (struct ether_header *)mp->b_rptr;

			/*
			 * Determine if this is a VLAN or non-VLAN packet.
			 */
			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
				evhp = (struct ether_vlan_header *)mp->b_rptr;
				sap = ntohs(evhp->ether_type);
				hdrsize = sizeof (struct ether_vlan_header);
				/*
				 * Check if the VID of the packet, if any,
				 * belongs to this client.
				 */
				if (!mac_client_check_flow_vid(mcip,
				    VLAN_ID(ntohs(evhp->ether_tci)))) {
					mac_rx_drop_pkt(mac_srs, mp);
					continue;
				}
			} else {
				hdrsize = sizeof (struct ether_header);
			}
			/* Multicast bit clear => unicast destination. */
			is_unicast =
			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
			dstaddr = (uint8_t *)&ehp->ether_dhost;
		} else {
			mac_header_info_t		mhi;

			if (mac_header_info((mac_handle_t)mcip->mci_mip,
			    mp, &mhi) != 0) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			hdrsize = mhi.mhi_hdrsize;
			sap = mhi.mhi_bindsap;
			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
			dstaddr = (uint8_t *)mhi.mhi_daddr;
		}

		/* Without DLS bypass, everything goes to the OTH ring. */
		if (!dls_bypass) {
			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
			    cnt[type], bw_ctl, sz[type], sz1, mp);
			continue;
		}

		if (sap == ETHERTYPE_IP) {
			/*
			 * If we are H/W classified, but we have promisc
			 * on, then we need to check for the unicast address.
			 */
			if (hw_classified && mcip->mci_promisc_list != NULL) {
				mac_address_t		*map;

				rw_enter(&mcip->mci_rw_lock, RW_READER);
				map = mcip->mci_unicast;
				if (bcmp(dstaddr, map->ma_addr,
				    map->ma_len) == 0)
					type = UNDEF;
				rw_exit(&mcip->mci_rw_lock);
			} else if (is_unicast) {
				/* UNDEF marks a fast-path candidate. */
				type = UNDEF;
			}
		}

		/*
		 * This needs to become a contract with the driver for
		 * the fast path.
		 *
		 * In the normal case the packet will have at least the L2
		 * header and the IP + Transport header in the same mblk.
		 * This is usually the case when the NIC driver sends up
		 * the packet. This is also true when the stack generates
		 * a packet that is looped back and when the stack uses the
		 * fastpath mechanism. The normal case is optimized for
		 * performance and may bypass DLS. All other cases go through
		 * the 'OTH' type path without DLS bypass.
		 */

		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
			type = OTH;

		if (type == OTH) {
			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
			    cnt[type], bw_ctl, sz[type], sz1, mp);
			continue;
		}

		ASSERT(type == UNDEF);
		/*
		 * We look for at least 4 bytes past the IP header to get
		 * the port information. If we get an IP fragment, we don't
		 * have the port information, and we use just the protocol
		 * information.
		 */
		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			type = V4_TCP;
			/* Strip the L2 header for the bypass path. */
			mp->b_rptr += hdrsize;
			break;
		case IPPROTO_UDP:
			type = V4_UDP;
			mp->b_rptr += hdrsize;
			break;
		default:
			type = OTH;
			break;
		}

		FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
		    bw_ctl, sz[type], sz1, mp);
	}

	/*
	 * Hand each non-empty sub-chain to its protocol soft ring.
	 * The loop only visits V4_TCP/V4_UDP/OTH, so the switch below
	 * covers all reachable cases despite lacking a default.
	 */
	for (type = V4_TCP; type < UNDEF; type++) {
		if (headmp[type] != NULL) {
			mac_soft_ring_t			*softring;

			ASSERT(tailmp[type]->b_next == NULL);
			switch (type) {
			case V4_TCP:
				softring = mac_srs->srs_tcp_soft_rings[0];
				break;
			case V4_UDP:
				softring = mac_srs->srs_udp_soft_rings[0];
				break;
			case OTH:
				softring = mac_srs->srs_oth_soft_rings[0];
			}
			mac_rx_soft_ring_process(mcip, softring,
			    headmp[type], tailmp[type], cnt[type], sz[type]);
		}
	}
}
1694da14cebeSEric Cheng 
/* Count of packets fanned out to the default ring due to unaligned IP headers. */
int	fanout_unaligned = 0;
1696da14cebeSEric Cheng 
1697da14cebeSEric Cheng /*
1698da14cebeSEric Cheng  * mac_rx_srs_long_fanout
1699da14cebeSEric Cheng  *
17003cc3202eSDan McDonald  * The fanout routine for VLANs, and for anything else that isn't performing
17013cc3202eSDan McDonald  * explicit dls bypass.  Returns -1 on an error (drop the packet due to a
17023cc3202eSDan McDonald  * malformed packet), 0 on success, with values written in *indx and *type.
1703da14cebeSEric Cheng  */
1704da14cebeSEric Cheng static int
1705da14cebeSEric Cheng mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
1706ae6aa22aSVenugopal Iyer     uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
1707da14cebeSEric Cheng {
1708da14cebeSEric Cheng 	ip6_t		*ip6h;
17093cc3202eSDan McDonald 	ipha_t		*ipha;
1710da14cebeSEric Cheng 	uint8_t		*whereptr;
1711da14cebeSEric Cheng 	uint_t		hash;
1712da14cebeSEric Cheng 	uint16_t	remlen;
1713da14cebeSEric Cheng 	uint8_t		nexthdr;
1714da14cebeSEric Cheng 	uint16_t	hdr_len;
1715dea290bfSSaso Kiselkov 	uint32_t	src_val, dst_val;
1716da14cebeSEric Cheng 	boolean_t	modifiable = B_TRUE;
17173cc3202eSDan McDonald 	boolean_t	v6;
1718da14cebeSEric Cheng 
1719ae6aa22aSVenugopal Iyer 	ASSERT(MBLKL(mp) >= hdrsize);
1720da14cebeSEric Cheng 
17213cc3202eSDan McDonald 	if (sap == ETHERTYPE_IPV6) {
17223cc3202eSDan McDonald 		v6 = B_TRUE;
17233cc3202eSDan McDonald 		hdr_len = IPV6_HDR_LEN;
17243cc3202eSDan McDonald 	} else if (sap == ETHERTYPE_IP) {
17253cc3202eSDan McDonald 		v6 = B_FALSE;
17263cc3202eSDan McDonald 		hdr_len = IP_SIMPLE_HDR_LENGTH;
17273cc3202eSDan McDonald 	} else {
17283cc3202eSDan McDonald 		*indx = 0;
17293cc3202eSDan McDonald 		*type = OTH;
17303cc3202eSDan McDonald 		return (0);
17313cc3202eSDan McDonald 	}
17323cc3202eSDan McDonald 
1733ae6aa22aSVenugopal Iyer 	ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
17343cc3202eSDan McDonald 	ipha = (ipha_t *)ip6h;
17353cc3202eSDan McDonald 
17363cc3202eSDan McDonald 	if ((uint8_t *)ip6h == mp->b_wptr) {
1737da14cebeSEric Cheng 		/*
1738ae6aa22aSVenugopal Iyer 		 * The first mblk_t only includes the mac header.
1739da14cebeSEric Cheng 		 * Note that it is safe to change the mp pointer here,
1740da14cebeSEric Cheng 		 * as the subsequent operation does not assume mp
1741ae6aa22aSVenugopal Iyer 		 * points to the start of the mac header.
1742da14cebeSEric Cheng 		 */
1743da14cebeSEric Cheng 		mp = mp->b_cont;
1744da14cebeSEric Cheng 
1745da14cebeSEric Cheng 		/*
17463cc3202eSDan McDonald 		 * Make sure the IP header points to an entire one.
1747da14cebeSEric Cheng 		 */
1748da14cebeSEric Cheng 		if (mp == NULL)
1749da14cebeSEric Cheng 			return (-1);
1750da14cebeSEric Cheng 
17513cc3202eSDan McDonald 		if (MBLKL(mp) < hdr_len) {
1752da14cebeSEric Cheng 			modifiable = (DB_REF(mp) == 1);
1753da14cebeSEric Cheng 
17543cc3202eSDan McDonald 			if (modifiable && !pullupmsg(mp, hdr_len))
1755da14cebeSEric Cheng 				return (-1);
1756da14cebeSEric Cheng 		}
1757da14cebeSEric Cheng 
1758da14cebeSEric Cheng 		ip6h = (ip6_t *)mp->b_rptr;
17593cc3202eSDan McDonald 		ipha = (ipha_t *)ip6h;
1760da14cebeSEric Cheng 	}
1761da14cebeSEric Cheng 
1762da14cebeSEric Cheng 	if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
17633cc3202eSDan McDonald 	    ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
1764da14cebeSEric Cheng 		/*
17653cc3202eSDan McDonald 		 * If either the IP header is not aligned, or it does not hold
17663cc3202eSDan McDonald 		 * the complete simple structure (a pullupmsg() is not an
17673cc3202eSDan McDonald 		 * option since it would result in an unaligned IP header),
17683cc3202eSDan McDonald 		 * fanout to the default ring.
17693cc3202eSDan McDonald 		 *
17703cc3202eSDan McDonald 		 * Note that this may cause packet reordering.
1771da14cebeSEric Cheng 		 */
1772da14cebeSEric Cheng 		*indx = 0;
1773da14cebeSEric Cheng 		*type = OTH;
17743cc3202eSDan McDonald 		fanout_unaligned++;
1775da14cebeSEric Cheng 		return (0);
1776da14cebeSEric Cheng 	}
1777da14cebeSEric Cheng 
17783cc3202eSDan McDonald 	/*
17793cc3202eSDan McDonald 	 * Extract next-header, full header length, and source-hash value
17803cc3202eSDan McDonald 	 * using v4/v6 specific fields.
17813cc3202eSDan McDonald 	 */
17823cc3202eSDan McDonald 	if (v6) {
1783da14cebeSEric Cheng 		remlen = ntohs(ip6h->ip6_plen);
1784da14cebeSEric Cheng 		nexthdr = ip6h->ip6_nxt;
17853cc3202eSDan McDonald 		src_val = V4_PART_OF_V6(ip6h->ip6_src);
1786dea290bfSSaso Kiselkov 		dst_val = V4_PART_OF_V6(ip6h->ip6_dst);
1787da14cebeSEric Cheng 		/*
1788da14cebeSEric Cheng 		 * Do src based fanout if below tunable is set to B_TRUE or
1789da14cebeSEric Cheng 		 * when mac_ip_hdr_length_v6() fails because of malformed
17903cc3202eSDan McDonald 		 * packets or because mblks need to be concatenated using
1791da14cebeSEric Cheng 		 * pullupmsg().
1792da14cebeSEric Cheng 		 */
1793dea290bfSSaso Kiselkov 		if (!mac_ip_hdr_length_v6(ip6h, mp->b_wptr, &hdr_len, &nexthdr,
1794dea290bfSSaso Kiselkov 		    NULL)) {
1795dea290bfSSaso Kiselkov 			goto src_dst_based_fanout;
1796da14cebeSEric Cheng 		}
17973cc3202eSDan McDonald 	} else {
17983cc3202eSDan McDonald 		hdr_len = IPH_HDR_LENGTH(ipha);
17993cc3202eSDan McDonald 		remlen = ntohs(ipha->ipha_length) - hdr_len;
18003cc3202eSDan McDonald 		nexthdr = ipha->ipha_protocol;
18013cc3202eSDan McDonald 		src_val = (uint32_t)ipha->ipha_src;
1802dea290bfSSaso Kiselkov 		dst_val = (uint32_t)ipha->ipha_dst;
18033cc3202eSDan McDonald 		/*
18043cc3202eSDan McDonald 		 * Catch IPv4 fragment case here.  IPv6 has nexthdr == FRAG
18053cc3202eSDan McDonald 		 * for its equivalent case.
18063cc3202eSDan McDonald 		 */
1807dea290bfSSaso Kiselkov 		if ((ntohs(ipha->ipha_fragment_offset_and_flags) &
18083cc3202eSDan McDonald 		    (IPH_MF | IPH_OFFSET)) != 0) {
1809dea290bfSSaso Kiselkov 			goto src_dst_based_fanout;
18103cc3202eSDan McDonald 		}
18113cc3202eSDan McDonald 	}
18123cc3202eSDan McDonald 	if (remlen < MIN_EHDR_LEN)
18133cc3202eSDan McDonald 		return (-1);
1814da14cebeSEric Cheng 	whereptr = (uint8_t *)ip6h + hdr_len;
1815da14cebeSEric Cheng 
18163cc3202eSDan McDonald 	/* If the transport is one of below, we do port/SPI based fanout */
1817da14cebeSEric Cheng 	switch (nexthdr) {
1818da14cebeSEric Cheng 	case IPPROTO_TCP:
1819da14cebeSEric Cheng 	case IPPROTO_UDP:
1820da14cebeSEric Cheng 	case IPPROTO_SCTP:
1821da14cebeSEric Cheng 	case IPPROTO_ESP:
1822da14cebeSEric Cheng 		/*
18233cc3202eSDan McDonald 		 * If the ports or SPI in the transport header is not part of
1824da14cebeSEric Cheng 		 * the mblk, do src_based_fanout, instead of calling
1825da14cebeSEric Cheng 		 * pullupmsg().
1826da14cebeSEric Cheng 		 */
18273cc3202eSDan McDonald 		if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
18283cc3202eSDan McDonald 			break;	/* out of switch... */
18293cc3202eSDan McDonald 		/* FALLTHRU */
1830da14cebeSEric Cheng 	default:
1831dea290bfSSaso Kiselkov 		goto src_dst_based_fanout;
1832da14cebeSEric Cheng 	}
1833da14cebeSEric Cheng 
1834da14cebeSEric Cheng 	switch (nexthdr) {
1835da14cebeSEric Cheng 	case IPPROTO_TCP:
1836dea290bfSSaso Kiselkov 		hash = HASH_ADDR(src_val, dst_val, *(uint32_t *)whereptr);
18373cc3202eSDan McDonald 		*indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
1838da14cebeSEric Cheng 		*type = OTH;
1839da14cebeSEric Cheng 		break;
1840da14cebeSEric Cheng 	case IPPROTO_UDP:
1841da14cebeSEric Cheng 	case IPPROTO_SCTP:
1842da14cebeSEric Cheng 	case IPPROTO_ESP:
1843da14cebeSEric Cheng 		if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
1844dea290bfSSaso Kiselkov 			hash = HASH_ADDR(src_val, dst_val,
1845dea290bfSSaso Kiselkov 			    *(uint32_t *)whereptr);
1846da14cebeSEric Cheng 			*indx = COMPUTE_INDEX(hash,
1847da14cebeSEric Cheng 			    mac_srs->srs_udp_ring_count);
1848da14cebeSEric Cheng 		} else {
18493cc3202eSDan McDonald 			*indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
1850da14cebeSEric Cheng 			mac_srs->srs_ind++;
1851da14cebeSEric Cheng 		}
1852da14cebeSEric Cheng 		*type = OTH;
1853da14cebeSEric Cheng 		break;
1854da14cebeSEric Cheng 	}
1855da14cebeSEric Cheng 	return (0);
1856da14cebeSEric Cheng 
1857dea290bfSSaso Kiselkov src_dst_based_fanout:
1858dea290bfSSaso Kiselkov 	hash = HASH_ADDR(src_val, dst_val, (uint32_t)0);
1859da14cebeSEric Cheng 	*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
1860da14cebeSEric Cheng 	*type = OTH;
1861da14cebeSEric Cheng 	return (0);
1862da14cebeSEric Cheng }
1863da14cebeSEric Cheng 
1864da14cebeSEric Cheng /*
1865da14cebeSEric Cheng  * mac_rx_srs_fanout
1866da14cebeSEric Cheng  *
1867da14cebeSEric Cheng  * This routine delivers packets destined to an SRS into a soft ring member
1868da14cebeSEric Cheng  * of the set.
1869da14cebeSEric Cheng  *
1870da14cebeSEric Cheng  * Given a chain of packets we need to split it up into multiple sub chains
1871da14cebeSEric Cheng  * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
1872da14cebeSEric Cheng  * the soft ring one packet at a time, we want to enter it in the form of a
1873da14cebeSEric Cheng  * chain otherwise we get this start/stop behaviour where the worker thread
1874da14cebeSEric Cheng  * goes to sleep and then next packets comes in forcing it to wake up etc.
1875da14cebeSEric Cheng  *
1876da14cebeSEric Cheng  * Note:
1877da14cebeSEric Cheng  * Since we know what is the maximum fanout possible, we create a 2D array
1878da14cebeSEric Cheng  * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
1879da14cebeSEric Cheng  * variables so that we can enter the softrings with chain. We need the
1880da14cebeSEric Cheng  * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
1881da14cebeSEric Cheng  * for each packet would be expensive). If we ever want to have the
1882da14cebeSEric Cheng  * ability to have unlimited fanout, we should probably declare a head,
1883da14cebeSEric Cheng  * tail, cnt, sz with each soft ring (a data struct which contains a softring
1884da14cebeSEric Cheng  * along with these members) and create an array of this uber struct so we
1885da14cebeSEric Cheng  * don't have to do kmem_alloc.
1886da14cebeSEric Cheng  */
1887da14cebeSEric Cheng int	fanout_oth1 = 0;
1888da14cebeSEric Cheng int	fanout_oth2 = 0;
1889da14cebeSEric Cheng int	fanout_oth3 = 0;
1890da14cebeSEric Cheng int	fanout_oth4 = 0;
1891da14cebeSEric Cheng int	fanout_oth5 = 0;
1892da14cebeSEric Cheng 
1893da14cebeSEric Cheng static void
1894da14cebeSEric Cheng mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
1895da14cebeSEric Cheng {
1896da14cebeSEric Cheng 	struct ether_header		*ehp;
1897ae6aa22aSVenugopal Iyer 	struct ether_vlan_header	*evhp;
1898ae6aa22aSVenugopal Iyer 	uint32_t			sap;
1899da14cebeSEric Cheng 	ipha_t				*ipha;
1900ae6aa22aSVenugopal Iyer 	uint8_t				*dstaddr;
1901da14cebeSEric Cheng 	uint_t				indx;
1902ae6aa22aSVenugopal Iyer 	size_t				ports_offset;
1903ae6aa22aSVenugopal Iyer 	size_t				ipha_len;
1904ae6aa22aSVenugopal Iyer 	size_t				hdrsize;
1905da14cebeSEric Cheng 	uint_t				hash;
1906da14cebeSEric Cheng 	mblk_t				*mp;
1907da14cebeSEric Cheng 	mblk_t				*headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
1908da14cebeSEric Cheng 	mblk_t				*tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
1909da14cebeSEric Cheng 	int				cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
1910da14cebeSEric Cheng 	size_t				sz[MAX_SR_TYPES][MAX_SR_FANOUT];
1911da14cebeSEric Cheng 	size_t				sz1;
1912ae6aa22aSVenugopal Iyer 	boolean_t			bw_ctl;
1913da14cebeSEric Cheng 	boolean_t			hw_classified;
1914ae6aa22aSVenugopal Iyer 	boolean_t			dls_bypass;
1915ae6aa22aSVenugopal Iyer 	boolean_t			is_ether;
1916ae6aa22aSVenugopal Iyer 	boolean_t			is_unicast;
1917da14cebeSEric Cheng 	int				fanout_cnt;
1918da14cebeSEric Cheng 	enum pkt_type			type;
1919da14cebeSEric Cheng 	mac_client_impl_t		*mcip = mac_srs->srs_mcip;
1920da14cebeSEric Cheng 
1921ae6aa22aSVenugopal Iyer 	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
1922ae6aa22aSVenugopal Iyer 	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
1923da14cebeSEric Cheng 
1924da14cebeSEric Cheng 	/*
1925da14cebeSEric Cheng 	 * If we don't have a Rx ring, S/W classification would have done
1926da14cebeSEric Cheng 	 * its job and its a packet meant for us. If we were polling on
1927da14cebeSEric Cheng 	 * the default ring (i.e. there was a ring assigned to this SRS),
1928da14cebeSEric Cheng 	 * then we need to make sure that the mac address really belongs
1929da14cebeSEric Cheng 	 * to us.
1930da14cebeSEric Cheng 	 */
1931da14cebeSEric Cheng 	hw_classified = mac_srs->srs_ring != NULL &&
1932da14cebeSEric Cheng 	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
1933da14cebeSEric Cheng 
1934da14cebeSEric Cheng 	/*
1935da14cebeSEric Cheng 	 * Special clients (eg. VLAN, non ether, etc) need DLS
1936da14cebeSEric Cheng 	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
19378d4cf8d8S 	 * such SRSs. Another way of disabling bypass is to set the
19388d4cf8d8S 	 * MCIS_RX_BYPASS_DISABLE flag.
1939da14cebeSEric Cheng 	 */
19408d4cf8d8S 	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
19418d4cf8d8S 	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
1942da14cebeSEric Cheng 
1943da14cebeSEric Cheng 	/*
1944da14cebeSEric Cheng 	 * Since the softrings are never destroyed and we always
1945da14cebeSEric Cheng 	 * create equal number of softrings for TCP, UDP and rest,
1946da14cebeSEric Cheng 	 * its OK to check one of them for count and use it without
1947da14cebeSEric Cheng 	 * any lock. In future, if soft rings get destroyed because
1948da14cebeSEric Cheng 	 * of reduction in fanout, we will need to ensure that happens
1949da14cebeSEric Cheng 	 * behind the SRS_PROC.
1950da14cebeSEric Cheng 	 */
1951da14cebeSEric Cheng 	fanout_cnt = mac_srs->srs_tcp_ring_count;
1952da14cebeSEric Cheng 
1953da14cebeSEric Cheng 	bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1954da14cebeSEric Cheng 	bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1955da14cebeSEric Cheng 	bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
1956da14cebeSEric Cheng 	bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
1957da14cebeSEric Cheng 
1958da14cebeSEric Cheng 	/*
1959da14cebeSEric Cheng 	 * We got a chain from SRS that we need to send to the soft rings.
1960da14cebeSEric Cheng 	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
1961da14cebeSEric Cheng 	 * performance reasons), we need to separate out v4_tcp, v4_udp
1962da14cebeSEric Cheng 	 * and the rest goes in other.
1963da14cebeSEric Cheng 	 */
1964da14cebeSEric Cheng 	while (head != NULL) {
1965da14cebeSEric Cheng 		mp = head;
1966da14cebeSEric Cheng 		head = head->b_next;
1967da14cebeSEric Cheng 		mp->b_next = NULL;
1968da14cebeSEric Cheng 
1969da14cebeSEric Cheng 		type = OTH;
1970ae6aa22aSVenugopal Iyer 		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1971da14cebeSEric Cheng 
1972ae6aa22aSVenugopal Iyer 		if (is_ether) {
1973ae6aa22aSVenugopal Iyer 			/*
1974ae6aa22aSVenugopal Iyer 			 * At this point we can be sure the packet at least
1975ae6aa22aSVenugopal Iyer 			 * has an ether header.
1976ae6aa22aSVenugopal Iyer 			 */
1977ae6aa22aSVenugopal Iyer 			if (sz1 < sizeof (struct ether_header)) {
1978ae6aa22aSVenugopal Iyer 				mac_rx_drop_pkt(mac_srs, mp);
1979ae6aa22aSVenugopal Iyer 				continue;
1980ae6aa22aSVenugopal Iyer 			}
1981da14cebeSEric Cheng 			ehp = (struct ether_header *)mp->b_rptr;
1982ae6aa22aSVenugopal Iyer 
1983da14cebeSEric Cheng 			/*
1984ae6aa22aSVenugopal Iyer 			 * Determine if this is a VLAN or non-VLAN packet.
1985da14cebeSEric Cheng 			 */
1986ae6aa22aSVenugopal Iyer 			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1987ae6aa22aSVenugopal Iyer 				evhp = (struct ether_vlan_header *)mp->b_rptr;
1988ae6aa22aSVenugopal Iyer 				sap = ntohs(evhp->ether_type);
1989ae6aa22aSVenugopal Iyer 				hdrsize = sizeof (struct ether_vlan_header);
1990da14cebeSEric Cheng 				/*
1991ae6aa22aSVenugopal Iyer 				 * Check if the VID of the packet, if any,
1992ae6aa22aSVenugopal Iyer 				 * belongs to this client.
1993da14cebeSEric Cheng 				 */
1994da14cebeSEric Cheng 				if (!mac_client_check_flow_vid(mcip,
1995da14cebeSEric Cheng 				    VLAN_ID(ntohs(evhp->ether_tci)))) {
1996da14cebeSEric Cheng 					mac_rx_drop_pkt(mac_srs, mp);
1997da14cebeSEric Cheng 					continue;
1998da14cebeSEric Cheng 				}
1999ae6aa22aSVenugopal Iyer 			} else {
2000ae6aa22aSVenugopal Iyer 				hdrsize = sizeof (struct ether_header);
2001da14cebeSEric Cheng 			}
2002ae6aa22aSVenugopal Iyer 			is_unicast =
2003ae6aa22aSVenugopal Iyer 			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
2004ae6aa22aSVenugopal Iyer 			dstaddr = (uint8_t *)&ehp->ether_dhost;
2005ae6aa22aSVenugopal Iyer 		} else {
2006ae6aa22aSVenugopal Iyer 			mac_header_info_t		mhi;
2007ae6aa22aSVenugopal Iyer 
2008ae6aa22aSVenugopal Iyer 			if (mac_header_info((mac_handle_t)mcip->mci_mip,
2009ae6aa22aSVenugopal Iyer 			    mp, &mhi) != 0) {
2010da14cebeSEric Cheng 				mac_rx_drop_pkt(mac_srs, mp);
2011da14cebeSEric Cheng 				continue;
2012da14cebeSEric Cheng 			}
2013ae6aa22aSVenugopal Iyer 			hdrsize = mhi.mhi_hdrsize;
2014ae6aa22aSVenugopal Iyer 			sap = mhi.mhi_bindsap;
2015ae6aa22aSVenugopal Iyer 			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
2016ae6aa22aSVenugopal Iyer 			dstaddr = (uint8_t *)mhi.mhi_daddr;
2017ae6aa22aSVenugopal Iyer 		}
2018ae6aa22aSVenugopal Iyer 
2019ae6aa22aSVenugopal Iyer 		if (!dls_bypass) {
2020ae6aa22aSVenugopal Iyer 			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
2021ae6aa22aSVenugopal Iyer 			    hdrsize, &type, &indx) == -1) {
2022ae6aa22aSVenugopal Iyer 				mac_rx_drop_pkt(mac_srs, mp);
2023ae6aa22aSVenugopal Iyer 				continue;
2024da14cebeSEric Cheng 			}
2025da14cebeSEric Cheng 
2026da14cebeSEric Cheng 			FANOUT_ENQUEUE_MP(headmp[type][indx],
2027da14cebeSEric Cheng 			    tailmp[type][indx], cnt[type][indx], bw_ctl,
2028da14cebeSEric Cheng 			    sz[type][indx], sz1, mp);
2029da14cebeSEric Cheng 			continue;
2030da14cebeSEric Cheng 		}
2031da14cebeSEric Cheng 
2032da14cebeSEric Cheng 
2033da14cebeSEric Cheng 		/*
2034da14cebeSEric Cheng 		 * If we are using the default Rx ring where H/W or S/W
2035da14cebeSEric Cheng 		 * classification has not happened, we need to verify if
2036da14cebeSEric Cheng 		 * this unicast packet really belongs to us.
2037da14cebeSEric Cheng 		 */
2038ae6aa22aSVenugopal Iyer 		if (sap == ETHERTYPE_IP) {
2039da14cebeSEric Cheng 			/*
2040da14cebeSEric Cheng 			 * If we are H/W classified, but we have promisc
2041da14cebeSEric Cheng 			 * on, then we need to check for the unicast address.
2042da14cebeSEric Cheng 			 */
2043da14cebeSEric Cheng 			if (hw_classified && mcip->mci_promisc_list != NULL) {
2044da14cebeSEric Cheng 				mac_address_t		*map;
2045da14cebeSEric Cheng 
2046da14cebeSEric Cheng 				rw_enter(&mcip->mci_rw_lock, RW_READER);
2047da14cebeSEric Cheng 				map = mcip->mci_unicast;
2048ae6aa22aSVenugopal Iyer 				if (bcmp(dstaddr, map->ma_addr,
2049da14cebeSEric Cheng 				    map->ma_len) == 0)
2050da14cebeSEric Cheng 					type = UNDEF;
2051da14cebeSEric Cheng 				rw_exit(&mcip->mci_rw_lock);
2052ae6aa22aSVenugopal Iyer 			} else if (is_unicast) {
2053da14cebeSEric Cheng 				type = UNDEF;
2054da14cebeSEric Cheng 			}
2055da14cebeSEric Cheng 		}
2056da14cebeSEric Cheng 
2057da14cebeSEric Cheng 		/*
2058da14cebeSEric Cheng 		 * This needs to become a contract with the driver for
2059da14cebeSEric Cheng 		 * the fast path.
2060da14cebeSEric Cheng 		 */
2061da14cebeSEric Cheng 
2062ae6aa22aSVenugopal Iyer 		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
2063da14cebeSEric Cheng 		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
2064da14cebeSEric Cheng 			type = OTH;
2065da14cebeSEric Cheng 			fanout_oth1++;
2066da14cebeSEric Cheng 		}
2067da14cebeSEric Cheng 
2068da14cebeSEric Cheng 		if (type != OTH) {
2069ae6aa22aSVenugopal Iyer 			uint16_t	frag_offset_flags;
2070ae6aa22aSVenugopal Iyer 
2071da14cebeSEric Cheng 			switch (ipha->ipha_protocol) {
2072da14cebeSEric Cheng 			case IPPROTO_TCP:
2073da14cebeSEric Cheng 			case IPPROTO_UDP:
2074da14cebeSEric Cheng 			case IPPROTO_SCTP:
2075da14cebeSEric Cheng 			case IPPROTO_ESP:
2076da14cebeSEric Cheng 				ipha_len = IPH_HDR_LENGTH(ipha);
2077da14cebeSEric Cheng 				if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
2078da14cebeSEric Cheng 				    mp->b_wptr) {
2079da14cebeSEric Cheng 					type = OTH;
2080da14cebeSEric Cheng 					break;
2081da14cebeSEric Cheng 				}
2082da14cebeSEric Cheng 				frag_offset_flags =
2083da14cebeSEric Cheng 				    ntohs(ipha->ipha_fragment_offset_and_flags);
2084da14cebeSEric Cheng 				if ((frag_offset_flags &
2085da14cebeSEric Cheng 				    (IPH_MF | IPH_OFFSET)) != 0) {
2086da14cebeSEric Cheng 					type = OTH;
2087da14cebeSEric Cheng 					fanout_oth3++;
2088da14cebeSEric Cheng 					break;
2089da14cebeSEric Cheng 				}
2090ae6aa22aSVenugopal Iyer 				ports_offset = hdrsize + ipha_len;
2091da14cebeSEric Cheng 				break;
2092da14cebeSEric Cheng 			default:
2093da14cebeSEric Cheng 				type = OTH;
2094da14cebeSEric Cheng 				fanout_oth4++;
2095da14cebeSEric Cheng 				break;
2096da14cebeSEric Cheng 			}
2097da14cebeSEric Cheng 		}
2098da14cebeSEric Cheng 
2099da14cebeSEric Cheng 		if (type == OTH) {
2100ae6aa22aSVenugopal Iyer 			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
2101ae6aa22aSVenugopal Iyer 			    hdrsize, &type, &indx) == -1) {
2102da14cebeSEric Cheng 				mac_rx_drop_pkt(mac_srs, mp);
2103da14cebeSEric Cheng 				continue;
2104da14cebeSEric Cheng 			}
2105da14cebeSEric Cheng 
2106da14cebeSEric Cheng 			FANOUT_ENQUEUE_MP(headmp[type][indx],
2107da14cebeSEric Cheng 			    tailmp[type][indx], cnt[type][indx], bw_ctl,
2108da14cebeSEric Cheng 			    sz[type][indx], sz1, mp);
2109da14cebeSEric Cheng 			continue;
2110da14cebeSEric Cheng 		}
2111da14cebeSEric Cheng 
2112da14cebeSEric Cheng 		ASSERT(type == UNDEF);
2113da14cebeSEric Cheng 
2114da14cebeSEric Cheng 		/*
2115da14cebeSEric Cheng 		 * XXX-Sunay: We should hold srs_lock since ring_count
2116da14cebeSEric Cheng 		 * below can change. But if we are always called from
2117da14cebeSEric Cheng 		 * mac_rx_srs_drain and SRS_PROC is set, then we can
2118da14cebeSEric Cheng 		 * enforce that ring_count can't be changed i.e.
2119da14cebeSEric Cheng 		 * to change fanout type or ring count, the calling
2120da14cebeSEric Cheng 		 * thread needs to be behind SRS_PROC.
2121da14cebeSEric Cheng 		 */
2122da14cebeSEric Cheng 		switch (ipha->ipha_protocol) {
2123da14cebeSEric Cheng 		case IPPROTO_TCP:
2124da14cebeSEric Cheng 			/*
2125da14cebeSEric Cheng 			 * Note that for ESP, we fanout on SPI and it is at the
2126da14cebeSEric Cheng 			 * same offset as the 2x16-bit ports. So it is clumped
2127da14cebeSEric Cheng 			 * along with TCP, UDP and SCTP.
2128da14cebeSEric Cheng 			 */
2129dea290bfSSaso Kiselkov 			hash = HASH_ADDR(ipha->ipha_src, ipha->ipha_dst,
2130da14cebeSEric Cheng 			    *(uint32_t *)(mp->b_rptr + ports_offset));
2131da14cebeSEric Cheng 			indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
2132da14cebeSEric Cheng 			type = V4_TCP;
2133ae6aa22aSVenugopal Iyer 			mp->b_rptr += hdrsize;
2134da14cebeSEric Cheng 			break;
2135da14cebeSEric Cheng 		case IPPROTO_UDP:
2136da14cebeSEric Cheng 		case IPPROTO_SCTP:
2137da14cebeSEric Cheng 		case IPPROTO_ESP:
2138da14cebeSEric Cheng 			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
2139dea290bfSSaso Kiselkov 				hash = HASH_ADDR(ipha->ipha_src, ipha->ipha_dst,
2140da14cebeSEric Cheng 				    *(uint32_t *)(mp->b_rptr + ports_offset));
2141da14cebeSEric Cheng 				indx = COMPUTE_INDEX(hash,
2142da14cebeSEric Cheng 				    mac_srs->srs_udp_ring_count);
2143da14cebeSEric Cheng 			} else {
2144da14cebeSEric Cheng 				indx = mac_srs->srs_ind %
2145da14cebeSEric Cheng 				    mac_srs->srs_udp_ring_count;
2146da14cebeSEric Cheng 				mac_srs->srs_ind++;
2147da14cebeSEric Cheng 			}
2148da14cebeSEric Cheng 			type = V4_UDP;
2149ae6aa22aSVenugopal Iyer 			mp->b_rptr += hdrsize;
2150da14cebeSEric Cheng 			break;
2151ae6aa22aSVenugopal Iyer 		default:
2152ae6aa22aSVenugopal Iyer 			indx = 0;
2153ae6aa22aSVenugopal Iyer 			type = OTH;
2154da14cebeSEric Cheng 		}
2155da14cebeSEric Cheng 
2156da14cebeSEric Cheng 		FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
2157da14cebeSEric Cheng 		    cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
2158da14cebeSEric Cheng 	}
2159da14cebeSEric Cheng 
2160da14cebeSEric Cheng 	for (type = V4_TCP; type < UNDEF; type++) {
2161ae6aa22aSVenugopal Iyer 		int	i;
2162ae6aa22aSVenugopal Iyer 
2163da14cebeSEric Cheng 		for (i = 0; i < fanout_cnt; i++) {
2164da14cebeSEric Cheng 			if (headmp[type][i] != NULL) {
2165ae6aa22aSVenugopal Iyer 				mac_soft_ring_t	*softring;
2166ae6aa22aSVenugopal Iyer 
2167da14cebeSEric Cheng 				ASSERT(tailmp[type][i]->b_next == NULL);
2168da14cebeSEric Cheng 				switch (type) {
2169da14cebeSEric Cheng 				case V4_TCP:
2170da14cebeSEric Cheng 					softring =
2171da14cebeSEric Cheng 					    mac_srs->srs_tcp_soft_rings[i];
2172da14cebeSEric Cheng 					break;
2173da14cebeSEric Cheng 				case V4_UDP:
2174da14cebeSEric Cheng 					softring =
2175da14cebeSEric Cheng 					    mac_srs->srs_udp_soft_rings[i];
2176da14cebeSEric Cheng 					break;
2177da14cebeSEric Cheng 				case OTH:
2178da14cebeSEric Cheng 					softring =
2179da14cebeSEric Cheng 					    mac_srs->srs_oth_soft_rings[i];
2180da14cebeSEric Cheng 					break;
2181da14cebeSEric Cheng 				}
2182ae6aa22aSVenugopal Iyer 				mac_rx_soft_ring_process(mcip,
2183da14cebeSEric Cheng 				    softring, headmp[type][i], tailmp[type][i],
2184da14cebeSEric Cheng 				    cnt[type][i], sz[type][i]);
2185da14cebeSEric Cheng 			}
2186da14cebeSEric Cheng 		}
2187da14cebeSEric Cheng 	}
2188da14cebeSEric Cheng }
2189da14cebeSEric Cheng 
2190da14cebeSEric Cheng #define	SRS_BYTES_TO_PICKUP	150000
2191da14cebeSEric Cheng ssize_t	max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;
2192da14cebeSEric Cheng 
2193da14cebeSEric Cheng /*
2194da14cebeSEric Cheng  * mac_rx_srs_poll_ring
2195da14cebeSEric Cheng  *
2196da14cebeSEric Cheng  * This SRS Poll thread uses this routine to poll the underlying hardware
2197da14cebeSEric Cheng  * Rx ring to get a chain of packets. It can inline process that chain
2198da14cebeSEric Cheng  * if mac_latency_optimize is set (default) or signal the SRS worker thread
2199da14cebeSEric Cheng  * to do the remaining processing.
2200da14cebeSEric Cheng  *
2201da14cebeSEric Cheng  * Since packets come in the system via interrupt or poll path, we also
2202da14cebeSEric Cheng  * update the stats and deal with promiscous clients here.
2203da14cebeSEric Cheng  */
2204da14cebeSEric Cheng void
2205da14cebeSEric Cheng mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
2206da14cebeSEric Cheng {
2207da14cebeSEric Cheng 	kmutex_t 		*lock = &mac_srs->srs_lock;
2208da14cebeSEric Cheng 	kcondvar_t 		*async = &mac_srs->srs_cv;
2209da14cebeSEric Cheng 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
2210da14cebeSEric Cheng 	mblk_t 			*head, *tail, *mp;
2211da14cebeSEric Cheng 	callb_cpr_t 		cprinfo;
2212da14cebeSEric Cheng 	ssize_t 		bytes_to_pickup;
2213da14cebeSEric Cheng 	size_t 			sz;
2214da14cebeSEric Cheng 	int			count;
2215da14cebeSEric Cheng 	mac_client_impl_t	*smcip;
2216da14cebeSEric Cheng 
2217da14cebeSEric Cheng 	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
2218da14cebeSEric Cheng 	mutex_enter(lock);
2219da14cebeSEric Cheng 
2220da14cebeSEric Cheng start:
2221da14cebeSEric Cheng 	for (;;) {
2222da14cebeSEric Cheng 		if (mac_srs->srs_state & SRS_PAUSE)
2223da14cebeSEric Cheng 			goto done;
2224da14cebeSEric Cheng 
2225da14cebeSEric Cheng 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2226da14cebeSEric Cheng 		cv_wait(async, lock);
2227da14cebeSEric Cheng 		CALLB_CPR_SAFE_END(&cprinfo, lock);
2228da14cebeSEric Cheng 
2229da14cebeSEric Cheng 		if (mac_srs->srs_state & SRS_PAUSE)
2230da14cebeSEric Cheng 			goto done;
2231da14cebeSEric Cheng 
2232da14cebeSEric Cheng check_again:
2233da14cebeSEric Cheng 		if (mac_srs->srs_type & SRST_BW_CONTROL) {
2234da14cebeSEric Cheng 			/*
2235da14cebeSEric Cheng 			 * We pick as many bytes as we are allowed to queue.
2236da14cebeSEric Cheng 			 * Its possible that we will exceed the total
2237da14cebeSEric Cheng 			 * packets queued in case this SRS is part of the
2238da14cebeSEric Cheng 			 * Rx ring group since > 1 poll thread can be pulling
2239da14cebeSEric Cheng 			 * upto the max allowed packets at the same time
2240da14cebeSEric Cheng 			 * but that should be OK.
2241da14cebeSEric Cheng 			 */
2242da14cebeSEric Cheng 			mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2243da14cebeSEric Cheng 			bytes_to_pickup =
2244da14cebeSEric Cheng 			    mac_srs->srs_bw->mac_bw_drop_threshold -
2245da14cebeSEric Cheng 			    mac_srs->srs_bw->mac_bw_sz;
2246da14cebeSEric Cheng 			/*
2247da14cebeSEric Cheng 			 * We shouldn't have been signalled if we
2248da14cebeSEric Cheng 			 * have 0 or less bytes to pick but since
2249da14cebeSEric Cheng 			 * some of the bytes accounting is driver
2250da14cebeSEric Cheng 			 * dependant, we do the safety check.
2251da14cebeSEric Cheng 			 */
2252da14cebeSEric Cheng 			if (bytes_to_pickup < 0)
2253da14cebeSEric Cheng 				bytes_to_pickup = 0;
2254da14cebeSEric Cheng 			mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2255da14cebeSEric Cheng 		} else {
2256da14cebeSEric Cheng 			/*
2257da14cebeSEric Cheng 			 * ToDO: Need to change the polling API
2258da14cebeSEric Cheng 			 * to add a packet count and a flag which
2259da14cebeSEric Cheng 			 * tells the driver whether we want packets
2260da14cebeSEric Cheng 			 * based on a count, or bytes, or all the
2261da14cebeSEric Cheng 			 * packets queued in the driver/HW. This
2262da14cebeSEric Cheng 			 * way, we never have to check the limits
2263da14cebeSEric Cheng 			 * on poll path. We truly let only as many
2264da14cebeSEric Cheng 			 * packets enter the system as we are willing
2265da14cebeSEric Cheng 			 * to process or queue.
2266da14cebeSEric Cheng 			 *
2267da14cebeSEric Cheng 			 * Something along the lines of
2268da14cebeSEric Cheng 			 * pkts_to_pickup = mac_soft_ring_max_q_cnt -
2269da14cebeSEric Cheng 			 *	mac_srs->srs_poll_pkt_cnt
2270da14cebeSEric Cheng 			 */
2271da14cebeSEric Cheng 
2272da14cebeSEric Cheng 			/*
2273da14cebeSEric Cheng 			 * Since we are not doing B/W control, pick
2274da14cebeSEric Cheng 			 * as many packets as allowed.
2275da14cebeSEric Cheng 			 */
2276da14cebeSEric Cheng 			bytes_to_pickup = max_bytes_to_pickup;
2277da14cebeSEric Cheng 		}
2278da14cebeSEric Cheng 
2279da14cebeSEric Cheng 		/* Poll the underlying Hardware */
2280da14cebeSEric Cheng 		mutex_exit(lock);
2281da14cebeSEric Cheng 		head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
2282da14cebeSEric Cheng 		mutex_enter(lock);
2283da14cebeSEric Cheng 
2284da14cebeSEric Cheng 		ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
2285da14cebeSEric Cheng 		    SRS_POLL_THR_OWNER);
2286da14cebeSEric Cheng 
2287da14cebeSEric Cheng 		mp = tail = head;
2288da14cebeSEric Cheng 		count = 0;
2289da14cebeSEric Cheng 		sz = 0;
2290da14cebeSEric Cheng 		while (mp != NULL) {
2291da14cebeSEric Cheng 			tail = mp;
2292da14cebeSEric Cheng 			sz += msgdsize(mp);
2293da14cebeSEric Cheng 			mp = mp->b_next;
2294da14cebeSEric Cheng 			count++;
2295da14cebeSEric Cheng 		}
2296da14cebeSEric Cheng 
2297da14cebeSEric Cheng 		if (head != NULL) {
2298da14cebeSEric Cheng 			tail->b_next = NULL;
2299da14cebeSEric Cheng 			smcip = mac_srs->srs_mcip;
2300da14cebeSEric Cheng 
23010dc2366fSVenugopal Iyer 			SRS_RX_STAT_UPDATE(mac_srs, pollbytes, sz);
23020dc2366fSVenugopal Iyer 			SRS_RX_STAT_UPDATE(mac_srs, pollcnt, count);
2303da14cebeSEric Cheng 
2304da14cebeSEric Cheng 			/*
2305da14cebeSEric Cheng 			 * If there are any promiscuous mode callbacks
2306da14cebeSEric Cheng 			 * defined for this MAC client, pass them a copy
2307da14cebeSEric Cheng 			 * if appropriate and also update the counters.
2308da14cebeSEric Cheng 			 */
2309da14cebeSEric Cheng 			if (smcip != NULL) {
2310da14cebeSEric Cheng 				if (smcip->mci_mip->mi_promisc_list != NULL) {
2311da14cebeSEric Cheng 					mutex_exit(lock);
2312da14cebeSEric Cheng 					mac_promisc_dispatch(smcip->mci_mip,
2313da14cebeSEric Cheng 					    head, NULL);
2314da14cebeSEric Cheng 					mutex_enter(lock);
2315da14cebeSEric Cheng 				}
2316da14cebeSEric Cheng 			}
2317da14cebeSEric Cheng 			if (mac_srs->srs_type & SRST_BW_CONTROL) {
2318da14cebeSEric Cheng 				mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2319da14cebeSEric Cheng 				mac_srs->srs_bw->mac_bw_polled += sz;
2320da14cebeSEric Cheng 				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2321da14cebeSEric Cheng 			}
2322da14cebeSEric Cheng 			MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
2323da14cebeSEric Cheng 			    count, sz);
2324da14cebeSEric Cheng 			if (count <= 10)
23250dc2366fSVenugopal Iyer 				srs_rx->sr_stat.mrs_chaincntundr10++;
2326da14cebeSEric Cheng 			else if (count > 10 && count <= 50)
23270dc2366fSVenugopal Iyer 				srs_rx->sr_stat.mrs_chaincnt10to50++;
2328da14cebeSEric Cheng 			else
23290dc2366fSVenugopal Iyer 				srs_rx->sr_stat.mrs_chaincntover50++;
2330da14cebeSEric Cheng 		}
2331da14cebeSEric Cheng 
2332da14cebeSEric Cheng 		/*
2333da14cebeSEric Cheng 		 * We are guaranteed that SRS_PROC will be set if we
2334da14cebeSEric Cheng 		 * are here. Also, poll thread gets to run only if
2335da14cebeSEric Cheng 		 * the drain was being done by a worker thread although
2336da14cebeSEric Cheng 		 * its possible that worker thread is still running
2337da14cebeSEric Cheng 		 * and poll thread was sent down to keep the pipeline
2338da14cebeSEric Cheng 		 * going instead of doing a complete drain and then
2339da14cebeSEric Cheng 		 * trying to poll the NIC.
2340da14cebeSEric Cheng 		 *
2341da14cebeSEric Cheng 		 * So we need to check SRS_WORKER flag to make sure
2342da14cebeSEric Cheng 		 * that the worker thread is not processing the queue
2343da14cebeSEric Cheng 		 * in parallel to us. The flags and conditions are
2344da14cebeSEric Cheng 		 * protected by the srs_lock to prevent any race. We
2345da14cebeSEric Cheng 		 * ensure that we don't drop the srs_lock from now
2346da14cebeSEric Cheng 		 * till the end and similarly we don't drop the srs_lock
2347da14cebeSEric Cheng 		 * in mac_rx_srs_drain() till similar condition check
2348da14cebeSEric Cheng 		 * are complete. The mac_rx_srs_drain() needs to ensure
2349da14cebeSEric Cheng 		 * that SRS_WORKER flag remains set as long as its
2350da14cebeSEric Cheng 		 * processing the queue.
2351da14cebeSEric Cheng 		 */
2352da14cebeSEric Cheng 		if (!(mac_srs->srs_state & SRS_WORKER) &&
2353da14cebeSEric Cheng 		    (mac_srs->srs_first != NULL)) {
2354da14cebeSEric Cheng 			/*
2355da14cebeSEric Cheng 			 * We have packets to process and worker thread
2356da14cebeSEric Cheng 			 * is not running. Check to see if poll thread is
2357ae6aa22aSVenugopal Iyer 			 * allowed to process.
2358da14cebeSEric Cheng 			 */
2359ae6aa22aSVenugopal Iyer 			if (mac_srs->srs_state & SRS_LATENCY_OPT) {
2360da14cebeSEric Cheng 				mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
23617507fc2cSEric Cheng 				if (!(mac_srs->srs_state & SRS_PAUSE) &&
23627507fc2cSEric Cheng 				    srs_rx->sr_poll_pkt_cnt <=
2363da14cebeSEric Cheng 				    srs_rx->sr_lowat) {
2364da14cebeSEric Cheng 					srs_rx->sr_poll_again++;
2365da14cebeSEric Cheng 					goto check_again;
2366ae6aa22aSVenugopal Iyer 				}
2367da14cebeSEric Cheng 				/*
2368da14cebeSEric Cheng 				 * We are already above low water mark
2369da14cebeSEric Cheng 				 * so stay in the polling mode but no
2370da14cebeSEric Cheng 				 * need to poll. Once we dip below
2371da14cebeSEric Cheng 				 * the polling threshold, the processing
2372da14cebeSEric Cheng 				 * thread (soft ring) will signal us
2373da14cebeSEric Cheng 				 * to poll again (MAC_UPDATE_SRS_COUNT)
2374da14cebeSEric Cheng 				 */
2375da14cebeSEric Cheng 				srs_rx->sr_poll_drain_no_poll++;
2376ae6aa22aSVenugopal Iyer 				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
2377da14cebeSEric Cheng 				/*
2378da14cebeSEric Cheng 				 * In B/W control case, its possible
2379da14cebeSEric Cheng 				 * that the backlog built up due to
2380da14cebeSEric Cheng 				 * B/W limit being reached and packets
2381da14cebeSEric Cheng 				 * are queued only in SRS. In this case,
2382da14cebeSEric Cheng 				 * we should schedule worker thread
2383da14cebeSEric Cheng 				 * since no one else will wake us up.
2384da14cebeSEric Cheng 				 */
2385ae6aa22aSVenugopal Iyer 				if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
2386da14cebeSEric Cheng 				    (mac_srs->srs_tid == NULL)) {
2387da14cebeSEric Cheng 					mac_srs->srs_tid =
2388ae6aa22aSVenugopal Iyer 					    timeout(mac_srs_fire, mac_srs, 1);
2389da14cebeSEric Cheng 					srs_rx->sr_poll_worker_wakeup++;
2390da14cebeSEric Cheng 				}
2391da14cebeSEric Cheng 			} else {
2392da14cebeSEric Cheng 				/*
2393da14cebeSEric Cheng 				 * Wakeup the worker thread for more processing.
2394da14cebeSEric Cheng 				 * We optimize for throughput in this case.
2395da14cebeSEric Cheng 				 */
2396da14cebeSEric Cheng 				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
2397da14cebeSEric Cheng 				MAC_SRS_WORKER_WAKEUP(mac_srs);
2398da14cebeSEric Cheng 				srs_rx->sr_poll_sig_worker++;
2399da14cebeSEric Cheng 			}
2400da14cebeSEric Cheng 		} else if ((mac_srs->srs_first == NULL) &&
2401da14cebeSEric Cheng 		    !(mac_srs->srs_state & SRS_WORKER)) {
2402da14cebeSEric Cheng 			/*
2403da14cebeSEric Cheng 			 * There is nothing queued in SRS and
2404da14cebeSEric Cheng 			 * no worker thread running. Plus we
2405da14cebeSEric Cheng 			 * didn't get anything from the H/W
2406da14cebeSEric Cheng 			 * as well (head == NULL);
2407da14cebeSEric Cheng 			 */
2408da14cebeSEric Cheng 			ASSERT(head == NULL);
2409da14cebeSEric Cheng 			mac_srs->srs_state &=
2410da14cebeSEric Cheng 			    ~(SRS_PROC|SRS_GET_PKTS);
2411da14cebeSEric Cheng 
2412da14cebeSEric Cheng 			/*
2413da14cebeSEric Cheng 			 * If we have a packets in soft ring, don't allow
2414da14cebeSEric Cheng 			 * more packets to come into this SRS by keeping the
2415da14cebeSEric Cheng 			 * interrupts off but not polling the H/W. The
2416da14cebeSEric Cheng 			 * poll thread will get signaled as soon as
2417da14cebeSEric Cheng 			 * srs_poll_pkt_cnt dips below poll threshold.
2418da14cebeSEric Cheng 			 */
2419da14cebeSEric Cheng 			if (srs_rx->sr_poll_pkt_cnt == 0) {
2420da14cebeSEric Cheng 				srs_rx->sr_poll_intr_enable++;
2421da14cebeSEric Cheng 				MAC_SRS_POLLING_OFF(mac_srs);
2422da14cebeSEric Cheng 			} else {
2423da14cebeSEric Cheng 				/*
2424da14cebeSEric Cheng 				 * We know nothing is queued in SRS
2425da14cebeSEric Cheng 				 * since we are here after checking
2426da14cebeSEric Cheng 				 * srs_first is NULL. The backlog
2427da14cebeSEric Cheng 				 * is entirely due to packets queued
2428da14cebeSEric Cheng 				 * in Soft ring which will wake us up
2429da14cebeSEric Cheng 				 * and get the interface out of polling
2430da14cebeSEric Cheng 				 * mode once the backlog dips below
2431da14cebeSEric Cheng 				 * sr_poll_thres.
2432da14cebeSEric Cheng 				 */
2433da14cebeSEric Cheng 				srs_rx->sr_poll_no_poll++;
2434da14cebeSEric Cheng 			}
2435da14cebeSEric Cheng 		} else {
2436da14cebeSEric Cheng 			/*
2437da14cebeSEric Cheng 			 * Worker thread is already running.
2438da14cebeSEric Cheng 			 * Nothing much to do. If the polling
2439da14cebeSEric Cheng 			 * was enabled, worker thread will deal
2440da14cebeSEric Cheng 			 * with that.
2441da14cebeSEric Cheng 			 */
2442da14cebeSEric Cheng 			mac_srs->srs_state &= ~SRS_GET_PKTS;
2443da14cebeSEric Cheng 			srs_rx->sr_poll_goto_sleep++;
2444da14cebeSEric Cheng 		}
2445da14cebeSEric Cheng 	}
2446da14cebeSEric Cheng done:
2447da14cebeSEric Cheng 	mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
2448da14cebeSEric Cheng 	cv_signal(&mac_srs->srs_async);
2449da14cebeSEric Cheng 	/*
2450da14cebeSEric Cheng 	 * If this is a temporary quiesce then wait for the restart signal
2451da14cebeSEric Cheng 	 * from the srs worker. Then clear the flags and signal the srs worker
2452da14cebeSEric Cheng 	 * to ensure a positive handshake and go back to start.
2453da14cebeSEric Cheng 	 */
2454da14cebeSEric Cheng 	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
2455da14cebeSEric Cheng 		cv_wait(async, lock);
2456da14cebeSEric Cheng 	if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
2457da14cebeSEric Cheng 		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
2458da14cebeSEric Cheng 		mac_srs->srs_state &=
2459da14cebeSEric Cheng 		    ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
2460da14cebeSEric Cheng 		cv_signal(&mac_srs->srs_async);
2461da14cebeSEric Cheng 		goto start;
2462da14cebeSEric Cheng 	} else {
2463da14cebeSEric Cheng 		mac_srs->srs_state |= SRS_POLL_THR_EXITED;
2464da14cebeSEric Cheng 		cv_signal(&mac_srs->srs_async);
2465da14cebeSEric Cheng 		CALLB_CPR_EXIT(&cprinfo);
2466da14cebeSEric Cheng 		thread_exit();
2467da14cebeSEric Cheng 	}
2468da14cebeSEric Cheng }
2469da14cebeSEric Cheng 
2470da14cebeSEric Cheng /*
2471da14cebeSEric Cheng  * mac_srs_pick_chain
2472da14cebeSEric Cheng  *
 * In the bandwidth control case, checks how many packets can be processed
 * and returns them in a sub chain.
2475da14cebeSEric Cheng  */
static mblk_t *
mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
    size_t *chain_sz, int *chain_cnt)
{
	mblk_t 			*head = NULL;
	mblk_t 			*tail = NULL;
	size_t			sz;
	size_t 			tsz = 0;	/* bytes picked so far */
	int			cnt = 0;	/* packets picked so far */
	mblk_t 			*mp;

	/*
	 * Caller must hold srs_lock: we read and modify the SRS queue
	 * (srs_first/srs_last/srs_count/srs_size) below.
	 */
	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	/*
	 * Fast path: the entire queued backlog fits within the
	 * remaining bandwidth for this tick (or no limit is
	 * configured). Hand the whole chain back to the caller and
	 * reset the SRS queue in one shot.
	 */
	if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
	    mac_srs->srs_bw->mac_bw_limit) ||
	    (mac_srs->srs_bw->mac_bw_limit == 0)) {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		head = mac_srs->srs_first;
		mac_srs->srs_first = NULL;
		*chain_tail = mac_srs->srs_last;
		mac_srs->srs_last = NULL;
		*chain_sz = mac_srs->srs_size;
		*chain_cnt = mac_srs->srs_count;
		mac_srs->srs_count = 0;
		mac_srs->srs_size = 0;
		return (head);
	}

	/*
	 * Can't clear the entire backlog.
	 * Need to find how many packets to pick
	 */
	ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
	while ((mp = mac_srs->srs_first) != NULL) {
		sz = msgdsize(mp);
		/*
		 * Stop as soon as the next packet would exceed the
		 * bandwidth limit for this tick; mark the SRS as
		 * bandwidth-enforced so callers back off until the
		 * limit is reset.
		 */
		if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
		    mac_srs->srs_bw->mac_bw_limit) {
			if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
				mac_srs->srs_bw->mac_bw_state |=
				    SRS_BW_ENFORCED;
			break;
		}

		/*
		 * The _size & cnt is  decremented from the softrings
		 * when they send up the packet for polling to work
		 * properly.
		 */
		tsz += sz;
		cnt++;
		mac_srs->srs_count--;
		mac_srs->srs_size -= sz;
		/* Unlink mp from the SRS queue and append it to the chain. */
		if (tail != NULL)
			tail->b_next = mp;
		else
			head = mp;
		tail = mp;
		mac_srs->srs_first = mac_srs->srs_first->b_next;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	/* If the queue drained completely, clear the stale tail pointer. */
	if (mac_srs->srs_first == NULL)
		mac_srs->srs_last = NULL;

	/* Terminate the sub chain and report its tail, count and size. */
	if (tail != NULL)
		tail->b_next = NULL;
	*chain_tail = tail;
	*chain_cnt = cnt;
	*chain_sz = tsz;

	return (head);
}
2547da14cebeSEric Cheng 
2548da14cebeSEric Cheng /*
2549da14cebeSEric Cheng  * mac_rx_srs_drain
2550da14cebeSEric Cheng  *
 * The SRS drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in bandwidth control mode
 * mac_rx_srs_drain_bw. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain_bw as well.
2562da14cebeSEric Cheng  */
void
mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t 			*head;
	mblk_t			*tail;
	timeout_id_t 		tid;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	/* Entered with srs_lock held; the B/W variant handles SRST_BW_CONTROL. */
	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto out;
	}

	/* Nothing queued; fall through to the polling-state cleanup. */
	if (mac_srs->srs_first == NULL)
		goto out;

	if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
	    (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
		/*
		 * In the normal case, the SRS worker thread does no
		 * work and we wait for a backlog to build up before
		 * we switch into polling mode. In case we are
		 * optimizing for throughput, we use the worker thread
		 * as well. The goal is to let worker thread process
		 * the queue and poll thread to feed packets into
		 * the queue. As such, we should signal the poll
		 * thread to try and get more packets.
		 *
		 * We could have pulled this check in the POLL_RING
		 * macro itself but keeping it explicit here makes
		 * the architecture more human understandable.
		 */
		MAC_SRS_POLL_RING(mac_srs);
	}

again:
	/* Detach the entire queued chain from the SRS under srs_lock. */
	head = mac_srs->srs_first;
	mac_srs->srs_first = NULL;
	tail = mac_srs->srs_last;
	mac_srs->srs_last = NULL;
	cnt = mac_srs->srs_count;
	mac_srs->srs_count = 0;

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	/*
	 * Claim any pending worker-wakeup timeout; it is cancelled
	 * below (via untimeout) once srs_lock has been dropped.
	 */
	if ((tid = mac_srs->srs_tid) != NULL)
		mac_srs->srs_tid = NULL;

	/* Mark who is draining (worker/poll/interrupt) while we process. */
	mac_srs->srs_state |= (SRS_PROC|proc_type);

	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL) {
		if (mcip->mci_promisc_list != NULL) {
			/* srs_lock is dropped around upcalls to avoid deadlock. */
			mutex_exit(&mac_srs->srs_lock);
			mac_promisc_client_dispatch(mcip, head);
			mutex_enter(&mac_srs->srs_lock);
		}
		if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
			mutex_exit(&mac_srs->srs_lock);
			mac_protect_intercept_dynamic(mcip, head);
			mutex_enter(&mac_srs->srs_lock);
		}
	}

	/*
	 * Check if SRS itself is doing the processing.
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resources constraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t		proc;
		void			*arg1;
		mac_resource_handle_t	arg2;

		/*
		 * This is the case when a Rx is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		/* Cancel the pending worker timeout now that we hold no lock. */
		if (tid != NULL) {
			(void) untimeout(tid);
			tid = NULL;
		}

		/* Deliver the whole chain to the client in one upcall. */
		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		/* Wake any thread waiting for client processing to finish. */
		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != NULL) {
			(void) untimeout(tid);
			tid = NULL;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
	    (mac_srs->srs_first != NULL)) {
		/*
		 * More packets arrived while we were clearing the
		 * SRS. This can be possible because of one of
		 * three conditions below:
		 * 1) The driver is using multiple worker threads
		 *    to send the packets to us.
		 * 2) The driver has a race in switching
		 *    between interrupt and polling mode or
		 * 3) Packets are arriving in this SRS via the
		 *    S/W classification as well.
		 *
		 * We should switch to polling mode and see if we
		 * need to send the poll thread down. Also, signal
		 * the worker thread to process what's just arrived.
		 */
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
			srs_rx->sr_drain_poll_sig++;
			MAC_SRS_POLL_RING(mac_srs);
		}

		/*
		 * If we didn't signal the poll thread, we need
		 * to deal with the pending packets ourselves.
		 */
		if (proc_type == SRS_WORKER) {
			/* Worker context: loop back and drain again in place. */
			srs_rx->sr_drain_again++;
			goto again;
		} else {
			/* Interrupt/poll context: hand off to the worker. */
			srs_rx->sr_drain_worker_sig++;
			cv_signal(&mac_srs->srs_async);
		}
	}

out:
	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave the
		 * SRS_PROC set and hand over the control to
		 * poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		srs_rx->sr_drain_poll_running++;
		return;
	}

	/*
	 * Even if there are no packets queued in SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog.
	 *
	 * As long as there are packets queued either
	 * in soft ring set or its soft rings, we will leave
	 * the interface in polling mode (even if the drain
	 * was done by the interrupt thread). We signal
	 * the poll thread as well if we have dipped below
	 * low water mark.
	 *
	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
	 * since that turn polling on only for worker thread.
	 * Its not worth turning polling on for interrupt
	 * thread (since NIC will not issue another interrupt)
	 * unless a backlog builds up.
	 */
	if ((srs_rx->sr_poll_pkt_cnt > 0) &&
	    (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		srs_rx->sr_drain_keep_polling++;
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
			MAC_SRS_POLL_RING(mac_srs);
		return;
	}

	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
	srs_rx->sr_drain_finish_intr++;
}
2787da14cebeSEric Cheng 
2788da14cebeSEric Cheng /*
2789da14cebeSEric Cheng  * mac_rx_srs_drain_bw
 * The SRS BW drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in non bandwidth control mode
 * mac_rx_srs_drain. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain as well.
2802da14cebeSEric Cheng  */
2803da14cebeSEric Cheng void
2804da14cebeSEric Cheng mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
2805da14cebeSEric Cheng {
2806da14cebeSEric Cheng 	mblk_t 			*head;
2807da14cebeSEric Cheng 	mblk_t			*tail;
2808da14cebeSEric Cheng 	timeout_id_t 		tid;
2809da14cebeSEric Cheng 	size_t			sz = 0;
2810da14cebeSEric Cheng 	int			cnt = 0;
2811da14cebeSEric Cheng 	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
2812da14cebeSEric Cheng 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
2813d3d50737SRafael Vanoni 	clock_t			now;
2814da14cebeSEric Cheng 
2815da14cebeSEric Cheng 	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
2816da14cebeSEric Cheng 	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
2817da14cebeSEric Cheng again:
2818da14cebeSEric Cheng 	/* Check if we are doing B/W control */
2819da14cebeSEric Cheng 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2820d3d50737SRafael Vanoni 	now = ddi_get_lbolt();
2821d3d50737SRafael Vanoni 	if (mac_srs->srs_bw->mac_bw_curr_time != now) {
2822d3d50737SRafael Vanoni 		mac_srs->srs_bw->mac_bw_curr_time = now;
2823da14cebeSEric Cheng 		mac_srs->srs_bw->mac_bw_used = 0;
2824da14cebeSEric Cheng 		if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
2825da14cebeSEric Cheng 			mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
2826da14cebeSEric Cheng 	} else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
2827da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2828da14cebeSEric Cheng 		goto done;
2829da14cebeSEric Cheng 	} else if (mac_srs->srs_bw->mac_bw_used >
2830da14cebeSEric Cheng 	    mac_srs->srs_bw->mac_bw_limit) {
2831da14cebeSEric Cheng 		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
2832da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2833da14cebeSEric Cheng 		goto done;
2834da14cebeSEric Cheng 	}
2835da14cebeSEric Cheng 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2836da14cebeSEric Cheng 
2837da14cebeSEric Cheng 	/* If we are blanked i.e. can't do upcalls, then we are done */
2838da14cebeSEric Cheng 	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
2839da14cebeSEric Cheng 		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
2840da14cebeSEric Cheng 		    (mac_srs->srs_state & SRS_PAUSE));
2841da14cebeSEric Cheng 		goto done;
2842da14cebeSEric Cheng 	}
2843da14cebeSEric Cheng 
2844da14cebeSEric Cheng 	sz = 0;
2845da14cebeSEric Cheng 	cnt = 0;
2846da14cebeSEric Cheng 	if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
2847da14cebeSEric Cheng 		/*
2848da14cebeSEric Cheng 		 * We couldn't pick up a single packet.
2849da14cebeSEric Cheng 		 */
2850da14cebeSEric Cheng 		mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2851da14cebeSEric Cheng 		if ((mac_srs->srs_bw->mac_bw_used == 0) &&
2852da14cebeSEric Cheng 		    (mac_srs->srs_size != 0) &&
2853da14cebeSEric Cheng 		    !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2854da14cebeSEric Cheng 			/*
2855da14cebeSEric Cheng 			 * Seems like configured B/W doesn't
2856da14cebeSEric Cheng 			 * even allow processing of 1 packet
2857da14cebeSEric Cheng 			 * per tick.
2858da14cebeSEric Cheng 			 *
2859da14cebeSEric Cheng 			 * XXX: raise the limit to processing
2860da14cebeSEric Cheng 			 * at least 1 packet per tick.
2861da14cebeSEric Cheng 			 */
2862da14cebeSEric Cheng 			mac_srs->srs_bw->mac_bw_limit +=
2863da14cebeSEric Cheng 			    mac_srs->srs_bw->mac_bw_limit;
2864da14cebeSEric Cheng 			mac_srs->srs_bw->mac_bw_drop_threshold +=
2865da14cebeSEric Cheng 			    mac_srs->srs_bw->mac_bw_drop_threshold;
2866da14cebeSEric Cheng 			cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
2867da14cebeSEric Cheng 			    "raised B/W limit to %d since not even a "
2868da14cebeSEric Cheng 			    "single packet can be processed per "
2869da14cebeSEric Cheng 			    "tick %d\n", (void *)mac_srs,
2870da14cebeSEric Cheng 			    (int)mac_srs->srs_bw->mac_bw_limit,
2871da14cebeSEric Cheng 			    (int)msgdsize(mac_srs->srs_first));
2872da14cebeSEric Cheng 		}
2873da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2874da14cebeSEric Cheng 		goto done;
2875da14cebeSEric Cheng 	}
2876da14cebeSEric Cheng 
2877da14cebeSEric Cheng 	ASSERT(head != NULL);
2878da14cebeSEric Cheng 	ASSERT(tail != NULL);
2879da14cebeSEric Cheng 
2880da14cebeSEric Cheng 	/* zero bandwidth: drop all and return to interrupt mode */
2881da14cebeSEric Cheng 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2882da14cebeSEric Cheng 	if (mac_srs->srs_bw->mac_bw_limit == 0) {
28830dc2366fSVenugopal Iyer 		srs_rx->sr_stat.mrs_sdrops += cnt;
2884da14cebeSEric Cheng 		ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
2885da14cebeSEric Cheng 		mac_srs->srs_bw->mac_bw_sz -= sz;
2886da14cebeSEric Cheng 		mac_srs->srs_bw->mac_bw_drop_bytes += sz;
2887da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2888da14cebeSEric Cheng 		mac_pkt_drop(NULL, NULL, head, B_FALSE);
2889da14cebeSEric Cheng 		goto leave_poll;
2890da14cebeSEric Cheng 	} else {
2891da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2892da14cebeSEric Cheng 	}
2893da14cebeSEric Cheng 
2894*8ad9a34fSRyan Zezeski 	if ((tid = mac_srs->srs_tid) != NULL)
2895*8ad9a34fSRyan Zezeski 		mac_srs->srs_tid = NULL;
2896da14cebeSEric Cheng 
2897da14cebeSEric Cheng 	mac_srs->srs_state |= (SRS_PROC|proc_type);
2898da14cebeSEric Cheng 	MAC_SRS_WORKER_POLLING_ON(mac_srs);
2899da14cebeSEric Cheng 
2900da14cebeSEric Cheng 	/*
2901da14cebeSEric Cheng 	 * mcip is NULL for broadcast and multicast flows. The promisc
2902da14cebeSEric Cheng 	 * callbacks for broadcast and multicast packets are delivered from
2903da14cebeSEric Cheng 	 * mac_rx() and we don't need to worry about that case in this path
2904da14cebeSEric Cheng 	 */
29050dc2366fSVenugopal Iyer 	if (mcip != NULL) {
29060dc2366fSVenugopal Iyer 		if (mcip->mci_promisc_list != NULL) {
2907da14cebeSEric Cheng 			mutex_exit(&mac_srs->srs_lock);
2908da14cebeSEric Cheng 			mac_promisc_client_dispatch(mcip, head);
2909da14cebeSEric Cheng 			mutex_enter(&mac_srs->srs_lock);
2910da14cebeSEric Cheng 		}
29110dc2366fSVenugopal Iyer 		if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
29120dc2366fSVenugopal Iyer 			mutex_exit(&mac_srs->srs_lock);
29134e6f6c83SCody Peter Mello 			mac_protect_intercept_dynamic(mcip, head);
29140dc2366fSVenugopal Iyer 			mutex_enter(&mac_srs->srs_lock);
29150dc2366fSVenugopal Iyer 		}
29160dc2366fSVenugopal Iyer 	}
2917da14cebeSEric Cheng 
2918da14cebeSEric Cheng 	/*
2919da14cebeSEric Cheng 	 * Check if SRS itself is doing the processing
2920da14cebeSEric Cheng 	 * This direct path does not apply when subflows are present. In this
2921da14cebeSEric Cheng 	 * case, packets need to be dispatched to a soft ring according to the
2922da14cebeSEric Cheng 	 * flow's bandwidth and other resources contraints.
2923da14cebeSEric Cheng 	 */
2924da14cebeSEric Cheng 	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
2925da14cebeSEric Cheng 		mac_direct_rx_t		proc;
2926da14cebeSEric Cheng 		void			*arg1;
2927da14cebeSEric Cheng 		mac_resource_handle_t	arg2;
2928da14cebeSEric Cheng 
2929da14cebeSEric Cheng 		/*
2930da14cebeSEric Cheng 		 * This is the case when a Rx is directly
2931da14cebeSEric Cheng 		 * assigned and we have a fully classified
2932da14cebeSEric Cheng 		 * protocol chain. We can deal with it in
2933da14cebeSEric Cheng 		 * one shot.
2934da14cebeSEric Cheng 		 */
2935da14cebeSEric Cheng 		proc = srs_rx->sr_func;
2936da14cebeSEric Cheng 		arg1 = srs_rx->sr_arg1;
2937da14cebeSEric Cheng 		arg2 = srs_rx->sr_arg2;
2938da14cebeSEric Cheng 
2939da14cebeSEric Cheng 		mac_srs->srs_state |= SRS_CLIENT_PROC;
2940da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_lock);
2941*8ad9a34fSRyan Zezeski 		if (tid != NULL) {
2942da14cebeSEric Cheng 			(void) untimeout(tid);
2943*8ad9a34fSRyan Zezeski 			tid = NULL;
2944da14cebeSEric Cheng 		}
2945da14cebeSEric Cheng 
2946da14cebeSEric Cheng 		proc(arg1, arg2, head, NULL);
2947da14cebeSEric Cheng 		/*
2948da14cebeSEric Cheng 		 * Decrement the size and count here itelf
2949da14cebeSEric Cheng 		 * since the packet has been processed.
2950da14cebeSEric Cheng 		 */
2951da14cebeSEric Cheng 		mutex_enter(&mac_srs->srs_lock);
2952da14cebeSEric Cheng 		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
2953da14cebeSEric Cheng 		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
2954da14cebeSEric Cheng 
2955da14cebeSEric Cheng 		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
2956da14cebeSEric Cheng 			cv_signal(&mac_srs->srs_client_cv);
2957da14cebeSEric Cheng 		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
2958da14cebeSEric Cheng 	} else {
2959da14cebeSEric Cheng 		/* Some kind of softrings based fanout is required */
2960da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_lock);
2961*8ad9a34fSRyan Zezeski 		if (tid != NULL) {
2962da14cebeSEric Cheng 			(void) untimeout(tid);
2963*8ad9a34fSRyan Zezeski 			tid = NULL;
2964da14cebeSEric Cheng 		}
2965da14cebeSEric Cheng 
2966da14cebeSEric Cheng 		/*
2967da14cebeSEric Cheng 		 * Since the fanout routines can deal with chains,
2968da14cebeSEric Cheng 		 * shoot the entire chain up.
2969da14cebeSEric Cheng 		 */
2970da14cebeSEric Cheng 		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
2971da14cebeSEric Cheng 			mac_rx_srs_fanout(mac_srs, head);
2972da14cebeSEric Cheng 		else
2973da14cebeSEric Cheng 			mac_rx_srs_proto_fanout(mac_srs, head);
2974da14cebeSEric Cheng 		mutex_enter(&mac_srs->srs_lock);
2975da14cebeSEric Cheng 	}
2976da14cebeSEric Cheng 
2977da14cebeSEric Cheng 	/*
2978da14cebeSEric Cheng 	 * Send the poll thread to pick up any packets arrived
2979da14cebeSEric Cheng 	 * so far. This also serves as the last check in case
2980da14cebeSEric Cheng 	 * nothing else is queued in the SRS. The poll thread
2981da14cebeSEric Cheng 	 * is signalled only in the case the drain was done
2982da14cebeSEric Cheng 	 * by the worker thread and SRS_WORKER is set. The
2983da14cebeSEric Cheng 	 * worker thread can run in parallel as long as the
	 * SRS_WORKER flag is set. If we have nothing else to
2985da14cebeSEric Cheng 	 * process, we can exit while leaving SRS_PROC set
2986da14cebeSEric Cheng 	 * which gives the poll thread control to process and
2987da14cebeSEric Cheng 	 * cleanup once it returns from the NIC.
2988da14cebeSEric Cheng 	 *
2989da14cebeSEric Cheng 	 * If we have nothing else to process, we need to
2990da14cebeSEric Cheng 	 * ensure that we keep holding the srs_lock till
2991da14cebeSEric Cheng 	 * all the checks below are done and control is
2992da14cebeSEric Cheng 	 * handed to the poll thread if it was running.
2993da14cebeSEric Cheng 	 */
2994da14cebeSEric Cheng 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2995da14cebeSEric Cheng 	if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2996da14cebeSEric Cheng 		if (mac_srs->srs_first != NULL) {
2997da14cebeSEric Cheng 			if (proc_type == SRS_WORKER) {
2998da14cebeSEric Cheng 				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2999da14cebeSEric Cheng 				if (srs_rx->sr_poll_pkt_cnt <=
3000da14cebeSEric Cheng 				    srs_rx->sr_lowat)
3001da14cebeSEric Cheng 					MAC_SRS_POLL_RING(mac_srs);
3002da14cebeSEric Cheng 				goto again;
3003da14cebeSEric Cheng 			} else {
3004da14cebeSEric Cheng 				cv_signal(&mac_srs->srs_async);
3005da14cebeSEric Cheng 			}
3006da14cebeSEric Cheng 		}
3007da14cebeSEric Cheng 	}
3008da14cebeSEric Cheng 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
3009da14cebeSEric Cheng 
3010da14cebeSEric Cheng done:
3011da14cebeSEric Cheng 
3012da14cebeSEric Cheng 	if (mac_srs->srs_state & SRS_GET_PKTS) {
3013da14cebeSEric Cheng 		/*
3014da14cebeSEric Cheng 		 * Poll thread is already running. Leave the
		 * SRS_PROC set and hand over the control to
3016da14cebeSEric Cheng 		 * poll thread.
3017da14cebeSEric Cheng 		 */
3018da14cebeSEric Cheng 		mac_srs->srs_state &= ~proc_type;
3019da14cebeSEric Cheng 		return;
3020da14cebeSEric Cheng 	}
3021da14cebeSEric Cheng 
3022da14cebeSEric Cheng 	/*
3023da14cebeSEric Cheng 	 * If we can't process packets because we have exceeded
3024da14cebeSEric Cheng 	 * B/W limit for this tick, just set the timeout
3025da14cebeSEric Cheng 	 * and leave.
3026da14cebeSEric Cheng 	 *
3027da14cebeSEric Cheng 	 * Even if there are no packets queued in SRS, we
3028da14cebeSEric Cheng 	 * need to make sure that the shared counter is
3029da14cebeSEric Cheng 	 * clear and any associated softrings have cleared
3030da14cebeSEric Cheng 	 * all the backlog. Otherwise, leave the interface
3031da14cebeSEric Cheng 	 * in polling mode and the poll thread will get
3032da14cebeSEric Cheng 	 * signalled once the count goes down to zero.
3033da14cebeSEric Cheng 	 *
3034da14cebeSEric Cheng 	 * If someone is already draining the queue (SRS_PROC is
3035da14cebeSEric Cheng 	 * set) when the srs_poll_pkt_cnt goes down to zero,
3036da14cebeSEric Cheng 	 * then it means that drain is already running and we
3037da14cebeSEric Cheng 	 * will turn off polling at that time if there is
3038da14cebeSEric Cheng 	 * no backlog. As long as there are packets queued either
3039da14cebeSEric Cheng 	 * is soft ring set or its soft rings, we will leave
3040da14cebeSEric Cheng 	 * the interface in polling mode.
3041da14cebeSEric Cheng 	 */
3042da14cebeSEric Cheng 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
3043da14cebeSEric Cheng 	if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
3044da14cebeSEric Cheng 	    ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
3045da14cebeSEric Cheng 	    (srs_rx->sr_poll_pkt_cnt > 0))) {
3046da14cebeSEric Cheng 		MAC_SRS_POLLING_ON(mac_srs);
3047da14cebeSEric Cheng 		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
3048da14cebeSEric Cheng 		if ((mac_srs->srs_first != NULL) &&
3049da14cebeSEric Cheng 		    (mac_srs->srs_tid == NULL))
3050da14cebeSEric Cheng 			mac_srs->srs_tid = timeout(mac_srs_fire,
3051da14cebeSEric Cheng 			    mac_srs, 1);
3052da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
3053da14cebeSEric Cheng 		return;
3054da14cebeSEric Cheng 	}
3055da14cebeSEric Cheng 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
3056da14cebeSEric Cheng 
3057da14cebeSEric Cheng leave_poll:
3058da14cebeSEric Cheng 
3059da14cebeSEric Cheng 	/* Nothing else to do. Get out of poll mode */
3060da14cebeSEric Cheng 	MAC_SRS_POLLING_OFF(mac_srs);
3061da14cebeSEric Cheng 	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
3062da14cebeSEric Cheng }
3063da14cebeSEric Cheng 
/*
 * mac_srs_worker
 *
 * The SRS worker routine. Drains the queue when no one else is
 * processing it.
 *
 * Runs as a dedicated kernel thread for one soft ring set. It sleeps
 * on srs_async until there is work to do: packets queued on srs_first,
 * no other thread currently draining (SRS_PROC clear), the Tx side not
 * blocked on descriptors (SRS_TX_BLOCKED clear), and bandwidth control
 * not currently enforced. When woken with work available it calls the
 * SRS drain function with the SRS_WORKER proc type. On SRS_PAUSE it
 * drops into the quiesce protocol and either restarts (SRS_RESTART)
 * or exits the thread (SRS_CONDEMNED).
 *
 * Entered and exited with srs_lock handling managed internally; the
 * lock is released only inside cv_wait and by CALLB_CPR_EXIT.
 */
void
mac_srs_worker(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t 		*lock = &mac_srs->srs_lock;
	kcondvar_t 		*async = &mac_srs->srs_async;
	callb_cpr_t		cprinfo;
	boolean_t		bw_ctl_flag;

	/*
	 * Register with the CPR (suspend/resume) framework; srs_lock
	 * protects this thread's CPR-safe state transitions.
	 */
	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
	mutex_enter(lock);

start:
	for (;;) {
		/*
		 * Sample the bandwidth-enforcement state under the B/W
		 * lock; bw_ctl_flag gates whether we may drain now.
		 */
		bw_ctl_flag = B_FALSE;
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			MAC_SRS_BW_LOCK(mac_srs);
			MAC_SRS_CHECK_BW_CONTROL(mac_srs);
			if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
				bw_ctl_flag = B_TRUE;
			MAC_SRS_BW_UNLOCK(mac_srs);
		}
		/*
		 * The SRS_BW_ENFORCED flag may change since we have dropped
		 * the mac_bw_lock. However the drain function can handle both
		 * a drainable SRS or a bandwidth controlled SRS, and the
		 * effect of scheduling a timeout is to wakeup the worker
		 * thread which in turn will call the drain function. Since
		 * we release the srs_lock atomically only in the cv_wait there
		 * isn't a fear of waiting for ever.
		 */
		while (((mac_srs->srs_state & SRS_PROC) ||
		    (mac_srs->srs_first == NULL) || bw_ctl_flag ||
		    (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
		    !(mac_srs->srs_state & SRS_PAUSE)) {
			/*
			 * If we have packets queued and we are here
			 * because B/W control is in place, we better
			 * schedule the worker wakeup after 1 tick
			 * to see if bandwidth control can be relaxed.
			 */
			if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
				/*
				 * We need to ensure that a timer  is already
				 * scheduled or we force  schedule one for
				 * later so that we can continue processing
				 * after this  quanta is over.
				 */
				mac_srs->srs_tid = timeout(mac_srs_fire,
				    mac_srs, 1);
			}
wait:
			/* Sleep CPR-safely; cv_wait drops and reacquires lock. */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(async, lock);
			CALLB_CPR_SAFE_END(&cprinfo, lock);

			if (mac_srs->srs_state & SRS_PAUSE)
				goto done;
			/* Someone else took over the drain; keep waiting. */
			if (mac_srs->srs_state & SRS_PROC)
				goto wait;

			/*
			 * Re-evaluate bandwidth enforcement after waking;
			 * another tick may have relaxed (or re-imposed) it.
			 */
			if (mac_srs->srs_first != NULL &&
			    mac_srs->srs_type & SRST_BW_CONTROL) {
				MAC_SRS_BW_LOCK(mac_srs);
				if (mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED) {
					MAC_SRS_CHECK_BW_CONTROL(mac_srs);
				}
				bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED;
				MAC_SRS_BW_UNLOCK(mac_srs);
			}
		}

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;
		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
	}
done:
	/*
	 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
	 * from both hard and soft classifications and waits for such threads
	 * to finish before signaling the worker. So at this point the only
	 * thread left that could be competing with the worker is the poll
	 * thread. In the case of Tx, there shouldn't be any thread holding
	 * SRS_PROC at this point.
	 */
	if (!(mac_srs->srs_state & SRS_PROC)) {
		mac_srs->srs_state |= SRS_PROC;
	} else {
		ASSERT((mac_srs->srs_type & SRST_TX) == 0);
		/*
		 * Poll thread still owns the SRS and is still running
		 */
		ASSERT((mac_srs->srs_poll_thr == NULL) ||
		    ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER));
	}
	mac_srs_worker_quiesce(mac_srs);
	/*
	 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
	 * of the quiesce operation
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);

	if (mac_srs->srs_state & SRS_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs_worker_restart(mac_srs);
		mac_srs->srs_state &= ~SRS_PROC;
		goto start;
	}

	/* Condemned: run the quiesce-done step once, then exit the thread. */
	if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
		mac_srs_worker_quiesce(mac_srs);

	mac_srs->srs_state &= ~SRS_PROC;
	/* The macro drops the srs_lock */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
3190da14cebeSEric Cheng 
/*
 * mac_rx_srs_subflow_process
 *
 * Receive side routine called from interrupt path when there are
 * sub flows present on this SRS.
 *
 * Walks mp_chain packet by packet, classifying each against the
 * client's subflow table. Consecutive packets that match the same
 * flow entry are batched into one sub-chain; each completed batch is
 * delivered either to the matching flow's callback (fe_cb_fn) or, when
 * there is no usable subflow match, back to mac_rx_srs_process() on
 * this SRS. Flow entry references taken by mac_flow_lookup() are
 * released with FLOW_REFRELE once the batch is handed off.
 */
/* ARGSUSED */
void
mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
    mblk_t *mp_chain, boolean_t loopback)
{
	flow_entry_t		*flent = NULL;
	flow_entry_t		*prev_flent = NULL;
	mblk_t			*mp = NULL;
	mblk_t			*tail = NULL;
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mac_client_impl_t	*mcip;

	mcip = mac_srs->srs_mcip;
	ASSERT(mcip != NULL);

	/*
	 * We need to determine the SRS for every packet
	 * by walking the flow table, if we don't get any,
	 * then we proceed using the SRS we came with.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {

		/*
		 * We will increment the stats for the matching subflow.
		 * when we get the bytes/pkt count for the classified packets
		 * later in mac_rx_srs_process.
		 */
		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
		    FLOW_INBOUND, &flent);

		/*
		 * First packet of the chain, or same flow as the previous
		 * packet: extend the current batch and keep walking. The
		 * duplicate reference on the flow entry is dropped.
		 */
		if (mp == mp_chain || flent == prev_flent) {
			if (prev_flent != NULL)
				FLOW_REFRELE(prev_flent);
			prev_flent = flent;
			flent = NULL;
			tail = mp;
			mp = mp->b_next;
			continue;
		}
		/* Flow changed: terminate the current batch and deliver it. */
		tail->b_next = NULL;
		/*
		 * A null indicates, this is for the mac_srs itself.
		 * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
		 */
		if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
			mac_rx_srs_process(arg,
			    (mac_resource_handle_t)mac_srs, mp_chain,
			    loopback);
		} else {
			(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
			    prev_flent->fe_cb_arg2, mp_chain, loopback);
			FLOW_REFRELE(prev_flent);
		}
		/* Start a new batch at the current packet. */
		prev_flent = flent;
		flent = NULL;
		mp_chain = mp;
		tail = mp;
		mp = mp->b_next;
	}
	/* Last chain */
	ASSERT(mp_chain != NULL);
	if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
		mac_rx_srs_process(arg,
		    (mac_resource_handle_t)mac_srs, mp_chain, loopback);
	} else {
		(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
		    prev_flent->fe_cb_arg2, mp_chain, loopback);
		FLOW_REFRELE(prev_flent);
	}
}
3268da14cebeSEric Cheng 
/*
 * mac_rx_srs_process
 *
 * Receive side routine called from the interrupt path.
 *
 * Counts and sizes the incoming chain, updates the Rx statistics,
 * then either enqueues the chain on the SRS (possibly trimming or
 * dropping packets to honor bandwidth-control drop thresholds or the
 * poll high-water mark) or, when the SRS is idle and latency
 * optimization permits, drains it inline in this thread.
 *
 * loopback is set to force a context switch on the loopback
 * path between MAC clients.
 */
/* ARGSUSED */
void
mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
    boolean_t loopback)
{
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mblk_t			*mp, *tail, *head;
	int			count = 0;
	int			count1;
	size_t			sz = 0;
	size_t			chain_sz, sz1;
	mac_bw_ctl_t		*mac_bw;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	/*
	 * Set the tail, count and sz. We set the sz irrespective
	 * of whether we are doing B/W control or not for the
	 * purpose of updating the stats.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {
		tail = mp;
		count++;
		sz += msgdsize(mp);
		mp = mp->b_next;
	}

	mutex_enter(&mac_srs->srs_lock);

	/* Account the chain as local (loopback) or interrupt traffic. */
	if (loopback) {
		SRS_RX_STAT_UPDATE(mac_srs, lclbytes, sz);
		SRS_RX_STAT_UPDATE(mac_srs, lclcnt, count);

	} else {
		SRS_RX_STAT_UPDATE(mac_srs, intrbytes, sz);
		SRS_RX_STAT_UPDATE(mac_srs, intrcnt, count);
	}

	/*
	 * If the SRS is already being processed; has been blanked;
	 * can be processed by worker thread only; or the B/W limit
	 * has been reached, then queue the chain and check if
	 * worker thread needs to be awakened.
	 */
	if (mac_srs->srs_type & SRST_BW_CONTROL) {
		mac_bw = mac_srs->srs_bw;
		ASSERT(mac_bw != NULL);
		mutex_enter(&mac_bw->mac_bw_lock);
		mac_bw->mac_bw_intr += sz;
		if (mac_bw->mac_bw_limit == 0) {
			/* zero bandwidth: drop all */
			srs_rx->sr_stat.mrs_sdrops += count;
			mac_bw->mac_bw_drop_bytes += sz;
			mutex_exit(&mac_bw->mac_bw_lock);
			mutex_exit(&mac_srs->srs_lock);
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
			return;
		} else {
			if ((mac_bw->mac_bw_sz + sz) <=
			    mac_bw->mac_bw_drop_threshold) {
				/* Whole chain fits under the drop threshold. */
				mutex_exit(&mac_bw->mac_bw_lock);
				MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
				    tail, count, sz);
			} else {
				/*
				 * Walk the chain and take only as many
				 * packets as fit under the drop threshold;
				 * the remainder (head onward) is dropped.
				 */
				mp = mp_chain;
				chain_sz = 0;
				count1 = 0;
				tail = NULL;
				head = NULL;
				while (mp != NULL) {
					sz1 = msgdsize(mp);
					if (mac_bw->mac_bw_sz + chain_sz + sz1 >
					    mac_bw->mac_bw_drop_threshold)
						break;
					chain_sz += sz1;
					count1++;
					tail = mp;
					mp = mp->b_next;
				}
				mutex_exit(&mac_bw->mac_bw_lock);
				if (tail != NULL) {
					head = tail->b_next;
					tail->b_next = NULL;
					MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
					    mp_chain, tail, count1, chain_sz);
					sz -= chain_sz;
					count -= count1;
				} else {
					/* Can't pick up any */
					head = mp_chain;
				}
				if (head != NULL) {
					/* Drop any packet over the threshold */
					srs_rx->sr_stat.mrs_sdrops += count;
					mutex_enter(&mac_bw->mac_bw_lock);
					mac_bw->mac_bw_drop_bytes += sz;
					mutex_exit(&mac_bw->mac_bw_lock);
					freemsgchain(head);
				}
			}
			MAC_SRS_WORKER_WAKEUP(mac_srs);
			mutex_exit(&mac_srs->srs_lock);
			return;
		}
	}

	/*
	 * If the total number of packets queued in the SRS and
	 * its associated soft rings exceeds the max allowed,
	 * then drop the chain. If we are polling capable, this
	 * shouldn't be happening.
	 */
	if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
	    (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
		mac_bw = mac_srs->srs_bw;
		srs_rx->sr_stat.mrs_sdrops += count;
		mutex_enter(&mac_bw->mac_bw_lock);
		mac_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_bw->mac_bw_lock);
		freemsgchain(mp_chain);
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);

	if (!(mac_srs->srs_state & SRS_PROC)) {
		/*
		 * If we are coming via loopback, if we are not optimizing for
		 * latency, or if our stack is running deep, we should signal
		 * the worker thread.
		 */
		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT) ||
		    MAC_RX_SRS_TOODEEP()) {
			/*
			 * For loopback, We need to let the worker take
			 * over as we don't want to continue in the same
			 * thread even if we can. This could lead to stack
			 * overflows and may also end up using
			 * resources (cpu) incorrectly.
			 */
			cv_signal(&mac_srs->srs_async);
		} else {
			/*
			 * Seems like no one is processing the SRS and
			 * there is no backlog. We also inline process
			 * our packet if its a single packet in non
			 * latency optimized case (in latency optimized
			 * case, we inline process chains of any size).
			 */
			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
		}
	}
	mutex_exit(&mac_srs->srs_lock);
}
3432da14cebeSEric Cheng 
3433da14cebeSEric Cheng /* TX SIDE ROUTINES (RUNTIME) */
3434da14cebeSEric Cheng 
/*
 * mac_tx_srs_no_desc
 *
 * This routine is called by Tx single ring default mode
 * when Tx ring runs out of descs.
 *
 * Depending on 'flag' the chain is either dropped
 * (MAC_DROP_ON_NO_DESC), handed back to the caller via 'ret_mp'
 * (MAC_TX_NO_ENQUEUE, when packets are already queued), or queued on
 * the SRS for the worker to drain once descriptors are available.
 * Returns a mac_tx_cookie_t (NULL when nothing was dropped/returned);
 * the cookie value is produced by the MAC_TX_* macros.
 * Caller is expected to hold srs_lock (not verifiable from this
 * chunk alone -- the macros operate on SRS queue state).
 */
mac_tx_cookie_t
mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
	boolean_t wakeup_worker = B_TRUE;
	uint32_t tx_mode = srs_tx->st_mode;
	int cnt, sz;
	mblk_t *tail;

	ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
	if (flag & MAC_DROP_ON_NO_DESC) {
		MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
	} else {
		/*
		 * If packets are already queued, the worker has been (or
		 * will be) signalled already; don't signal it again.
		 */
		if (mac_srs->srs_first != NULL)
			wakeup_worker = B_FALSE;
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		if (flag & MAC_TX_NO_ENQUEUE) {
			/*
			 * If TX_QUEUED is not set, queue the
			 * packet and let mac_tx_srs_drain()
			 * set the TX_BLOCKED bit for the
			 * reasons explained above. Otherwise,
			 * return the mblks.
			 */
			if (wakeup_worker) {
				MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
				    mp_chain, tail, cnt, sz);
			} else {
				MAC_TX_SET_NO_ENQUEUE(mac_srs,
				    mp_chain, ret_mp, cookie);
			}
		} else {
			MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
			    tail, cnt, sz, cookie);
		}
		if (wakeup_worker)
			cv_signal(&mac_srs->srs_async);
	}
	return (cookie);
}
3483da14cebeSEric Cheng 
/*
 * mac_tx_srs_enqueue
 *
 * This routine is called when Tx SRS is operating in either serializer
 * or bandwidth mode. In serializer mode, a packet will get enqueued
 * when a thread cannot enter SRS exclusively. In bandwidth mode,
 * packets gets queued if allowed byte-count limit for a tick is
 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
 * MAC_TX_NO_ENQUEUE is set is different than when operating in either
 * the default mode or fanout mode. Here packets get dropped or
 * returned back to the caller only after hi-watermark worth of data
 * is queued.
 *
 * fanout_hint is preserved across the enqueue in mp_chain->b_prev so
 * the drain path can fan the chain out later; it is ignored when the
 * SRS has no Tx soft rings. Returns a mac_tx_cookie_t (NULL unless a
 * MAC_TX_* macro dropped or returned packets).
 */
static mac_tx_cookie_t
mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	int cnt, sz;
	mblk_t *tail;
	boolean_t wakeup_worker = B_TRUE;

	/*
	 * Ignore fanout hint if we don't have multiple tx rings.
	 */
	if (!MAC_TX_SOFT_RINGS(mac_srs))
		fanout_hint = 0;

	/* An existing backlog means the worker is already pending. */
	if (mac_srs->srs_first != NULL)
		wakeup_worker = B_FALSE;
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	if (flag & MAC_DROP_ON_NO_DESC) {
		if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
		} else {
			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
			    mp_chain, tail, cnt, sz);
		}
	} else if (flag & MAC_TX_NO_ENQUEUE) {
		if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
		    (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
			MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
			    ret_mp, cookie);
		} else {
			mp_chain->b_prev = (mblk_t *)fanout_hint;
			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
			    mp_chain, tail, cnt, sz);
		}
	} else {
		/*
		 * If you are BW_ENFORCED, just enqueue the
		 * packet. srs_worker will drain it at the
		 * prescribed rate. Before enqueueing, save
		 * the fanout hint.
		 */
		mp_chain->b_prev = (mblk_t *)fanout_hint;
		MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
		    tail, cnt, sz, cookie);
	}
	if (wakeup_worker)
		cv_signal(&mac_srs->srs_async);
	return (cookie);
}
3547da14cebeSEric Cheng 
3548da14cebeSEric Cheng /*
35490dc2366fSVenugopal Iyer  * There are seven tx modes:
3550da14cebeSEric Cheng  *
3551da14cebeSEric Cheng  * 1) Default mode (SRS_TX_DEFAULT)
3552da14cebeSEric Cheng  * 2) Serialization mode (SRS_TX_SERIALIZE)
3553da14cebeSEric Cheng  * 3) Fanout mode (SRS_TX_FANOUT)
 * 4) Bandwidth mode (SRS_TX_BW)
3555da14cebeSEric Cheng  * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
35560dc2366fSVenugopal Iyer  * 6) aggr Tx mode (SRS_TX_AGGR)
35570dc2366fSVenugopal Iyer  * 7) aggr Tx bw mode (SRS_TX_BW_AGGR)
3558da14cebeSEric Cheng  *
3559da14cebeSEric Cheng  * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
3560da14cebeSEric Cheng  * based on the number of Tx rings requested for an SRS and whether
3561da14cebeSEric Cheng  * bandwidth control is requested or not.
3562da14cebeSEric Cheng  *
35630dc2366fSVenugopal Iyer  * The default mode (i.e., no fanout/no bandwidth) is used when the
35640dc2366fSVenugopal Iyer  * underlying NIC does not have Tx rings or just one Tx ring. In this mode,
35650dc2366fSVenugopal Iyer  * the SRS acts as a pass-thru. Packets will go directly to mac_tx_send().
35660dc2366fSVenugopal Iyer  * When the underlying Tx ring runs out of Tx descs, it starts queueing up
35670dc2366fSVenugopal Iyer  * packets in SRS. When flow-control is relieved, the srs_worker drains
35680dc2366fSVenugopal Iyer  * the queued packets and informs blocked clients to restart sending
35690dc2366fSVenugopal Iyer  * packets.
3570da14cebeSEric Cheng  *
35710dc2366fSVenugopal Iyer  * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. This
35720dc2366fSVenugopal Iyer  * mode is used when the link has no Tx rings or only one Tx ring.
3573da14cebeSEric Cheng  *
3574da14cebeSEric Cheng  * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
3575da14cebeSEric Cheng  * Tx rings. Each Tx ring will have a soft ring associated with it.
3576da14cebeSEric Cheng  * These soft rings will be hung off the Tx SRS. Queueing if it happens
3577da14cebeSEric Cheng  * due to lack of Tx desc will be in individual soft ring (and not srs)
3578da14cebeSEric Cheng  * associated with Tx ring.
3579da14cebeSEric Cheng  *
3580da14cebeSEric Cheng  * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
3581da14cebeSEric Cheng  * only if bw is available. Otherwise the packets will be queued in
3582da14cebeSEric Cheng  * SRS. If fanout to multiple Tx rings is configured, the packets will
3583da14cebeSEric Cheng  * be fanned out among the soft rings associated with the Tx rings.
3584da14cebeSEric Cheng  *
35850dc2366fSVenugopal Iyer  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
35860dc2366fSVenugopal Iyer  * invokes an aggr function, aggr_find_tx_ring(), to find a pseudo Tx ring
35870dc2366fSVenugopal Iyer  * belonging to a port on which the packet has to be sent. Aggr will
35880dc2366fSVenugopal Iyer  * always have a pseudo Tx ring associated with it even when it is an
35890dc2366fSVenugopal Iyer  * aggregation over a single NIC that has no Tx rings. Even in such a
35900dc2366fSVenugopal Iyer  * case, the single pseudo Tx ring will have a soft ring associated with
35910dc2366fSVenugopal Iyer  * it and the soft ring will hang off the SRS.
35920dc2366fSVenugopal Iyer  *
 * If a bandwidth is specified for an aggr, SRS_TX_BW_AGGR mode is used.
 * In this mode, the bandwidth is first applied on the outgoing packets
 * and later mac_tx_aggr_mode() is called to send the packet out
 * of one of the pseudo Tx rings.
35970dc2366fSVenugopal Iyer  *
 * Three flags are used in srs_state for indicating flow control
 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
3600da14cebeSEric Cheng  * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
3601da14cebeSEric Cheng  * driver below.
3602da14cebeSEric Cheng  * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
3603da14cebeSEric Cheng  * and flow-control pressure is applied back to clients. The clients expect
3604da14cebeSEric Cheng  * wakeup when flow-control is relieved.
3605da14cebeSEric Cheng  * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk
3606da14cebeSEric Cheng  * got returned back to client either due to lack of Tx descs or due to bw
3607da14cebeSEric Cheng  * control reasons. The clients expect a wakeup when condition is relieved.
3608da14cebeSEric Cheng  *
3609da14cebeSEric Cheng  * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
3610da14cebeSEric Cheng  * some clients set the following values too: MAC_DROP_ON_NO_DESC,
3611da14cebeSEric Cheng  * MAC_TX_NO_ENQUEUE
3612da14cebeSEric Cheng  * Mac clients that do not want packets to be enqueued in the mac layer set
3613da14cebeSEric Cheng  * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
3614da14cebeSEric Cheng  * Tx soft rings but instead get dropped when the NIC runs out of desc. The
3615da14cebeSEric Cheng  * behaviour of this flag is different when the Tx is running in serializer
3616da14cebeSEric Cheng  * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet
3617da14cebeSEric Cheng  * get dropped when Tx high watermark is reached.
3618da14cebeSEric Cheng  * There are some mac clients like vsw, aggr that want the mblks to be
3619da14cebeSEric Cheng  * returned back to clients instead of being queued in Tx SRS (or Tx soft
3620da14cebeSEric Cheng  * rings) under flow-control (i.e., out of desc or exceeding bw limits)
3621da14cebeSEric Cheng  * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
3622da14cebeSEric Cheng  * In the default and Tx fanout mode, the un-transmitted mblks will be
3623da14cebeSEric Cheng  * returned back to the clients when the driver runs out of Tx descs.
3624da14cebeSEric Cheng  * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
3625da14cebeSEric Cheng  * soft ring) so that the clients can be woken up when Tx desc become
 * available. When running in serializer or bandwidth mode,
3627da14cebeSEric Cheng  * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
3628da14cebeSEric Cheng  */
3629da14cebeSEric Cheng 
/*
 * Return the Tx entry point registered for the given SRS Tx mode (an
 * SRS_TX_* value used as an index into mac_tx_mode_list[]).
 */
mac_tx_func_t
mac_tx_get_func(uint32_t mode)
{
	return (mac_tx_mode_list[mode].mac_tx_func);
}
3635da14cebeSEric Cheng 
/* ARGSUSED */
/*
 * SRS_TX_DEFAULT entry point: the SRS acts as a pass-thru in front of a
 * single Tx ring (or a driver with no Tx rings).  The chain is handed
 * straight to mac_tx_send(); packets are queued in the SRS only while
 * the driver is flow-controlled.  Returns NULL when the whole chain was
 * sent, otherwise a cookie identifying the blocked SRS (unsent mblks
 * are queued, dropped, or handed back via 'ret_mp' depending on 'flag').
 */
static mac_tx_cookie_t
mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
	mac_tx_stats_t		stats;
	mac_tx_cookie_t		cookie = NULL;

	ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);

	/* Regular case with a single Tx ring */
	/*
	 * SRS_ENQUEUED is set when the underlying NIC runs
	 * out of Tx descs and messages start getting
	 * queued. It won't get reset until
	 * mac_tx_srs_drain() completely drains out the
	 * messages.
	 */
	if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
		/*
		 * Tx descs/resources not available.  Re-check under
		 * srs_lock: the lockless test above may be stale.
		 */
		mutex_enter(&mac_srs->srs_lock);
		if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
			cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
			    flag, ret_mp);
			mutex_exit(&mac_srs->srs_lock);
			return (cookie);
		}
		/*
		 * While we were computing mblk count, the
		 * flow control condition got relieved.
		 * Continue with the transmission.
		 */
		mutex_exit(&mac_srs->srs_lock);
	}

	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
	    mp_chain, &stats);

	/*
	 * Multiple threads could be here sending packets.
	 * Under such conditions, it is not possible to
	 * atomically set SRS_TX_BLOCKED bit to indicate
	 * out of tx desc condition. To atomically set
	 * this, we queue the returned packet and do
	 * the setting of SRS_TX_BLOCKED in
	 * mac_tx_srs_drain().
	 */
	if (mp_chain != NULL) {
		mutex_enter(&mac_srs->srs_lock);
		cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}
	/* Entire chain went out: account for it. */
	SRS_TX_STATS_UPDATE(mac_srs, &stats);

	return (NULL);
}
3694da14cebeSEric Cheng 
/*
 * mac_tx_serialize_mode
 *
 * This is an experimental mode implemented as per the request of PAE.
 * In this mode, all callers attempting to send a packet to the NIC
 * will get serialized. Only one thread at any time will access the
 * NIC to send the packet out.
 */
/* ARGSUSED */
static mac_tx_cookie_t
mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	mac_tx_stats_t		stats;
	mac_tx_cookie_t		cookie = NULL;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;

	/* Single ring, serialize below */
	ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
	mutex_enter(&mac_srs->srs_lock);
	/*
	 * SRS_PROC doubles as the "someone is transmitting" flag here;
	 * if another thread is sending, or packets are already queued
	 * ahead of us, we must enqueue to preserve ordering.
	 */
	if ((mac_srs->srs_first != NULL) ||
	    (mac_srs->srs_state & SRS_PROC)) {
		/*
		 * In serialization mode, queue all packets until
		 * TX_HIWAT is set.
		 * If drop bit is set, drop if TX_HIWAT is set.
		 * If no_enqueue is set, still enqueue until hiwat
		 * is set and return mblks after TX_HIWAT is set.
		 */
		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
		    flag, NULL, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}
	/*
	 * No packets queued, nothing on proc and no flow
	 * control condition. Fast-path, ok. Do inline
	 * processing.
	 */
	mac_srs->srs_state |= SRS_PROC;
	mutex_exit(&mac_srs->srs_lock);

	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
	    mp_chain, &stats);

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state &= ~SRS_PROC;
	/* Driver could not take the whole chain: queue the remainder. */
	if (mp_chain != NULL) {
		cookie = mac_tx_srs_enqueue(mac_srs,
		    mp_chain, flag, NULL, ret_mp);
	}
	if (mac_srs->srs_first != NULL) {
		/*
		 * We processed inline our packet and a new
		 * packet/s got queued while we were
		 * processing. Wakeup srs worker
		 */
		cv_signal(&mac_srs->srs_async);
	}
	mutex_exit(&mac_srs->srs_lock);

	/* Stats are only counted when nothing had to be queued/returned. */
	if (cookie == NULL)
		SRS_TX_STATS_UPDATE(mac_srs, &stats);

	return (cookie);
}
3761da14cebeSEric Cheng 
3762da14cebeSEric Cheng /*
3763da14cebeSEric Cheng  * mac_tx_fanout_mode
3764da14cebeSEric Cheng  *
3765da14cebeSEric Cheng  * In this mode, the SRS will have access to multiple Tx rings to send
3766da14cebeSEric Cheng  * the packet out. The fanout hint that is passed as an argument is
3767da14cebeSEric Cheng  * used to find an appropriate ring to fanout the traffic. Each Tx
3768da14cebeSEric Cheng  * ring, in turn,  will have a soft ring associated with it. If a Tx
3769da14cebeSEric Cheng  * ring runs out of Tx desc's the returned packet will be queued in
3770da14cebeSEric Cheng  * the soft ring associated with that Tx ring. The srs itself will not
3771da14cebeSEric Cheng  * queue any packets.
3772da14cebeSEric Cheng  */
3773ae6aa22aSVenugopal Iyer 
/*
 * Fan the given mblk chain out to one of the Tx soft rings hanging off
 * the SRS: reduce 'hash' to a soft-ring index, pick the ring, and hand
 * the chain to mac_tx_soft_ring_process(), recording the choice with an
 * SDT probe.  Relies on the locals 'hash', 'index', 'softring' and
 * 'cookie' (plus 'mac_srs', 'flag' and 'ret_mp') declared by the
 * calling function.
 *
 * Fixed: the first statement previously ended in a comma (comma
 * operator) rather than a semicolon — it worked only by accident.  The
 * body is now wrapped in the standard do/while (0) so the macro behaves
 * as a single statement at every call site.
 */
#define	MAC_TX_SOFT_RING_PROCESS(chain) do {			       	\
	index = COMPUTE_INDEX(hash, mac_srs->srs_tx_ring_count);	\
	softring = mac_srs->srs_tx_soft_rings[index];			\
	cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \
	DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index);	\
} while (0)
3780ae6aa22aSVenugopal Iyer 
/*
 * SRS_TX_FANOUT / SRS_TX_BW_FANOUT send path: pick a Tx soft ring per
 * conversation and hand each (sub)chain to it.  With a non-zero
 * 'fanout_hint' the whole chain maps to one ring; with no hint the
 * chain is hashed per-packet and split into same-hash subchains.
 */
static mac_tx_cookie_t
mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	mac_soft_ring_t		*softring;
	uint64_t		hash;
	uint_t			index;
	mac_tx_cookie_t		cookie = NULL;

	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
	if (fanout_hint != 0) {
		/*
		 * The hint is specified by the caller, simply pass the
		 * whole chain to the soft ring.
		 */
		hash = HASH_HINT(fanout_hint);
		MAC_TX_SOFT_RING_PROCESS(mp_chain);
	} else {
		mblk_t *last_mp, *cur_mp, *sub_chain;
		uint64_t last_hash = 0;
		uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;

		/*
		 * Compute the hash from the contents (headers) of the
		 * packets of the mblk chain. Split the chains into
		 * subchains of the same conversation.
		 *
		 * Since there may be more than one ring used for
		 * sub-chains of the same call, and since the caller
		 * does not maintain per conversation state since it
		 * passed a zero hint, unsent subchains will be
		 * dropped.
		 */

		/* Force drop semantics: no per-subchain queueing/return. */
		flag |= MAC_DROP_ON_NO_DESC;
		ret_mp = NULL;

		/* NOTE(review): trivially true right after the store above. */
		ASSERT(ret_mp == NULL);

		sub_chain = NULL;
		last_mp = NULL;

		for (cur_mp = mp_chain; cur_mp != NULL;
		    cur_mp = cur_mp->b_next) {
			hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4,
			    B_TRUE);
			if (last_hash != 0 && hash != last_hash) {
				/*
				 * Starting a different subchain, send current
				 * chain out.
				 */
				ASSERT(last_mp != NULL);
				last_mp->b_next = NULL;
				MAC_TX_SOFT_RING_PROCESS(sub_chain);
				sub_chain = NULL;
			}

			/* add packet to subchain */
			if (sub_chain == NULL)
				sub_chain = cur_mp;
			last_mp = cur_mp;
			last_hash = hash;
		}

		if (sub_chain != NULL) {
			/* send last subchain */
			ASSERT(last_mp != NULL);
			last_mp->b_next = NULL;
			MAC_TX_SOFT_RING_PROCESS(sub_chain);
		}

		cookie = NULL;
	}

	return (cookie);
}
3858da14cebeSEric Cheng 
/*
 * mac_tx_bw_mode
 *
 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
 * only if bw is available. Otherwise the packets will be queued in
 * SRS. If the SRS has multiple Tx rings, then packets will get fanned
 * out to the Tx rings.
 */
static mac_tx_cookie_t
mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	int			cnt, sz;
	mblk_t			*tail;
	mac_tx_cookie_t		cookie = NULL;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
	clock_t			now;

	ASSERT(TX_BANDWIDTH_MODE(mac_srs));
	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
	mutex_enter(&mac_srs->srs_lock);
	if (mac_srs->srs_bw->mac_bw_limit == 0) {
		/*
		 * zero bandwidth, no traffic is sent: drop the packets,
		 * or return the whole chain if the caller requests all
		 * unsent packets back.
		 */
		if (flag & MAC_TX_NO_ENQUEUE) {
			cookie = (mac_tx_cookie_t)mac_srs;
			*ret_mp = mp_chain;
		} else {
			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
		}
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	} else if ((mac_srs->srs_first != NULL) ||
	    (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
		/* Packets already queued, or bw exhausted: keep ordering. */
		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
		    fanout_hint, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	/* Start a fresh bandwidth accounting window on each new tick. */
	now = ddi_get_lbolt();
	if (mac_srs->srs_bw->mac_bw_curr_time != now) {
		mac_srs->srs_bw->mac_bw_curr_time = now;
		mac_srs->srs_bw->mac_bw_used = 0;
	} else if (mac_srs->srs_bw->mac_bw_used >
	    mac_srs->srs_bw->mac_bw_limit) {
		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
		MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
		    mp_chain, tail, cnt, sz);
		/*
		 * Wakeup worker thread. Note that worker
		 * thread has to be woken up so that it
		 * can fire up the timer to be woken up
		 * on the next tick. Also once
		 * BW_ENFORCED is set, it can only be
		 * reset by srs_worker thread. Until then
		 * all packets will get queued up in SRS
		 * and hence this code path won't be
		 * entered until BW_ENFORCED is reset.
		 */
		cv_signal(&mac_srs->srs_async);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}

	/* Charge the whole chain up front; rolled back below if unsent. */
	mac_srs->srs_bw->mac_bw_used += sz;
	mutex_exit(&mac_srs->srs_lock);

	if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
		mac_soft_ring_t *softring;
		uint_t indx, hash;

		/* Pick a Tx soft ring from the caller's fanout hint. */
		hash = HASH_HINT(fanout_hint);
		indx = COMPUTE_INDEX(hash,
		    mac_srs->srs_tx_ring_count);
		softring = mac_srs->srs_tx_soft_rings[indx];
		return (mac_tx_soft_ring_process(softring, mp_chain, flag,
		    ret_mp));
	} else if (srs_tx->st_mode == SRS_TX_BW_AGGR) {
		return (mac_tx_aggr_mode(mac_srs, mp_chain,
		    fanout_hint, flag, ret_mp));
	} else {
		mac_tx_stats_t		stats;

		mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
		    mp_chain, &stats);

		if (mp_chain != NULL) {
			/*
			 * Partial send: refund the bandwidth charged for
			 * the unsent portion, then queue it.
			 */
			mutex_enter(&mac_srs->srs_lock);
			MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
			if (mac_srs->srs_bw->mac_bw_used > sz)
				mac_srs->srs_bw->mac_bw_used -= sz;
			else
				mac_srs->srs_bw->mac_bw_used = 0;
			cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
			    fanout_hint, ret_mp);
			mutex_exit(&mac_srs->srs_lock);
			return (cookie);
		}
		SRS_TX_STATS_UPDATE(mac_srs, &stats);

		return (NULL);
	}
}
3966da14cebeSEric Cheng 
/*
 * mac_tx_aggr_mode
 *
 * This routine invokes an aggr function, aggr_find_tx_ring(), to find
 * a (pseudo) Tx ring belonging to a port on which the packet has to
 * be sent. aggr_find_tx_ring() first finds the outgoing port based on
 * L2/L3/L4 policy and then uses the fanout_hint passed to it to pick
 * a Tx ring from the selected port.
 *
 * Note that a port can be deleted from the aggregation. In such a case,
 * the aggregation layer first separates the port from the rest of the
 * ports making sure that port (and thus any Tx rings associated with
 * it) won't get selected in the call to aggr_find_tx_ring() function.
 * Later calls are made to mac_group_rem_ring() passing pseudo Tx ring
 * handles one by one which in turn will quiesce the Tx SRS and remove
 * the soft ring associated with the pseudo Tx ring. Unlike Rx side
 * where a cookie is used to protect against mac_rx_ring() calls on
 * rings that have been removed, no such cookie is needed on the Tx
 * side as the pseudo Tx ring won't be available anymore to
 * aggr_find_tx_ring() once the port has been removed.
 */
static mac_tx_cookie_t
mac_tx_aggr_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
	mac_tx_ring_fn_t	find_tx_ring_fn;
	mac_ring_handle_t	ring = NULL;
	void			*arg;
	mac_soft_ring_t		*sringp;

	find_tx_ring_fn = srs_tx->st_capab_aggr.mca_find_tx_ring_fn;
	arg = srs_tx->st_capab_aggr.mca_arg;
	/*
	 * NOTE(review): a NULL return presumably means the aggr layer
	 * disposed of the chain itself (e.g. no eligible port) -- confirm
	 * against aggr_find_tx_ring()'s contract.
	 */
	if (find_tx_ring_fn(arg, mp_chain, fanout_hint, &ring) == NULL)
		return (NULL);
	/* Map the chosen pseudo Tx ring to its soft ring and send. */
	sringp = srs_tx->st_soft_rings[((mac_ring_t *)ring)->mr_index];
	return (mac_tx_soft_ring_process(sringp, mp_chain, flag, ret_mp));
}
40050dc2366fSVenugopal Iyer 
40060dc2366fSVenugopal Iyer void
40070dc2366fSVenugopal Iyer mac_tx_invoke_callbacks(mac_client_impl_t *mcip, mac_tx_cookie_t cookie)
40080dc2366fSVenugopal Iyer {
40090dc2366fSVenugopal Iyer 	mac_cb_t *mcb;
40100dc2366fSVenugopal Iyer 	mac_tx_notify_cb_t *mtnfp;
40110dc2366fSVenugopal Iyer 
40120dc2366fSVenugopal Iyer 	/* Wakeup callback registered clients */
40130dc2366fSVenugopal Iyer 	MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
40140dc2366fSVenugopal Iyer 	for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
40150dc2366fSVenugopal Iyer 	    mcb = mcb->mcb_nextp) {
40160dc2366fSVenugopal Iyer 		mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
40170dc2366fSVenugopal Iyer 		mtnfp->mtnf_fn(mtnfp->mtnf_arg, cookie);
40180dc2366fSVenugopal Iyer 	}
40190dc2366fSVenugopal Iyer 	MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
40200dc2366fSVenugopal Iyer 	    &mcip->mci_tx_notify_cb_list);
40210dc2366fSVenugopal Iyer }
40220dc2366fSVenugopal Iyer 
4023da14cebeSEric Cheng /* ARGSUSED */
4024da14cebeSEric Cheng void
4025da14cebeSEric Cheng mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
4026da14cebeSEric Cheng {
4027da14cebeSEric Cheng 	mblk_t			*head, *tail;
4028da14cebeSEric Cheng 	size_t			sz;
4029da14cebeSEric Cheng 	uint32_t		tx_mode;
4030da14cebeSEric Cheng 	uint_t			saved_pkt_count;
4031da14cebeSEric Cheng 	mac_tx_stats_t		stats;
4032da14cebeSEric Cheng 	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
4033d3d50737SRafael Vanoni 	clock_t			now;
4034da14cebeSEric Cheng 
4035da14cebeSEric Cheng 	saved_pkt_count = 0;
4036da14cebeSEric Cheng 	ASSERT(mutex_owned(&mac_srs->srs_lock));
4037da14cebeSEric Cheng 	ASSERT(!(mac_srs->srs_state & SRS_PROC));
4038da14cebeSEric Cheng 
4039da14cebeSEric Cheng 	mac_srs->srs_state |= SRS_PROC;
4040da14cebeSEric Cheng 
4041da14cebeSEric Cheng 	tx_mode = srs_tx->st_mode;
4042da14cebeSEric Cheng 	if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
4043da14cebeSEric Cheng 		if (mac_srs->srs_first != NULL) {
4044da14cebeSEric Cheng 			head = mac_srs->srs_first;
4045da14cebeSEric Cheng 			tail = mac_srs->srs_last;
4046da14cebeSEric Cheng 			saved_pkt_count = mac_srs->srs_count;
4047da14cebeSEric Cheng 			mac_srs->srs_first = NULL;
4048da14cebeSEric Cheng 			mac_srs->srs_last = NULL;
4049da14cebeSEric Cheng 			mac_srs->srs_count = 0;
4050da14cebeSEric Cheng 			mutex_exit(&mac_srs->srs_lock);
4051da14cebeSEric Cheng 
4052da14cebeSEric Cheng 			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
4053da14cebeSEric Cheng 			    head, &stats);
4054da14cebeSEric Cheng 
4055da14cebeSEric Cheng 			mutex_enter(&mac_srs->srs_lock);
4056da14cebeSEric Cheng 			if (head != NULL) {
4057da14cebeSEric Cheng 				/* Device out of tx desc, set block */
4058da14cebeSEric Cheng 				if (head->b_next == NULL)
4059da14cebeSEric Cheng 					VERIFY(head == tail);
4060da14cebeSEric Cheng 				tail->b_next = mac_srs->srs_first;
4061da14cebeSEric Cheng 				mac_srs->srs_first = head;
4062da14cebeSEric Cheng 				mac_srs->srs_count +=
40630dc2366fSVenugopal Iyer 				    (saved_pkt_count - stats.mts_opackets);
4064da14cebeSEric Cheng 				if (mac_srs->srs_last == NULL)
4065da14cebeSEric Cheng 					mac_srs->srs_last = tail;
4066da14cebeSEric Cheng 				MAC_TX_SRS_BLOCK(mac_srs, head);
4067da14cebeSEric Cheng 			} else {
4068da14cebeSEric Cheng 				srs_tx->st_woken_up = B_FALSE;
40690dc2366fSVenugopal Iyer 				SRS_TX_STATS_UPDATE(mac_srs, &stats);
4070da14cebeSEric Cheng 			}
4071da14cebeSEric Cheng 		}
4072da14cebeSEric Cheng 	} else if (tx_mode == SRS_TX_BW) {
4073da14cebeSEric Cheng 		/*
4074da14cebeSEric Cheng 		 * We are here because the timer fired and we have some data
4075da14cebeSEric Cheng 		 * to tranmit. Also mac_tx_srs_worker should have reset
4076da14cebeSEric Cheng 		 * SRS_BW_ENFORCED flag
4077da14cebeSEric Cheng 		 */
4078da14cebeSEric Cheng 		ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
4079da14cebeSEric Cheng 		head = tail = mac_srs->srs_first;
4080da14cebeSEric Cheng 		while (mac_srs->srs_first != NULL) {
4081da14cebeSEric Cheng 			tail = mac_srs->srs_first;
4082da14cebeSEric Cheng 			tail->b_prev = NULL;
4083da14cebeSEric Cheng 			mac_srs->srs_first = tail->b_next;
4084da14cebeSEric Cheng 			if (mac_srs->srs_first == NULL)
4085da14cebeSEric Cheng 				mac_srs->srs_last = NULL;
4086da14cebeSEric Cheng 			mac_srs->srs_count--;
4087da14cebeSEric Cheng 			sz = msgdsize(tail);
4088da14cebeSEric Cheng 			mac_srs->srs_size -= sz;
4089da14cebeSEric Cheng 			saved_pkt_count++;
4090da14cebeSEric Cheng 			MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
4091da14cebeSEric Cheng 
4092da14cebeSEric Cheng 			if (mac_srs->srs_bw->mac_bw_used <
4093da14cebeSEric Cheng 			    mac_srs->srs_bw->mac_bw_limit)
4094da14cebeSEric Cheng 				continue;
4095da14cebeSEric Cheng 
4096d3d50737SRafael Vanoni 			now = ddi_get_lbolt();
4097d3d50737SRafael Vanoni 			if (mac_srs->srs_bw->mac_bw_curr_time != now) {
4098d3d50737SRafael Vanoni 				mac_srs->srs_bw->mac_bw_curr_time = now;
4099da14cebeSEric Cheng 				mac_srs->srs_bw->mac_bw_used = sz;
4100da14cebeSEric Cheng 				continue;
4101da14cebeSEric Cheng 			}
4102da14cebeSEric Cheng 			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
4103da14cebeSEric Cheng 			break;
4104da14cebeSEric Cheng 		}
4105da14cebeSEric Cheng 
4106da14cebeSEric Cheng 		ASSERT((head == NULL && tail == NULL) ||
4107da14cebeSEric Cheng 		    (head != NULL && tail != NULL));
4108da14cebeSEric Cheng 		if (tail != NULL) {
4109da14cebeSEric Cheng 			tail->b_next = NULL;
4110da14cebeSEric Cheng 			mutex_exit(&mac_srs->srs_lock);
4111da14cebeSEric Cheng 
4112da14cebeSEric Cheng 			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
4113da14cebeSEric Cheng 			    head, &stats);
4114da14cebeSEric Cheng 
4115da14cebeSEric Cheng 			mutex_enter(&mac_srs->srs_lock);
4116da14cebeSEric Cheng 			if (head != NULL) {
4117da14cebeSEric Cheng 				uint_t size_sent;
4118da14cebeSEric Cheng 
4119da14cebeSEric Cheng 				/* Device out of tx desc, set block */
4120da14cebeSEric Cheng 				if (head->b_next == NULL)
4121da14cebeSEric Cheng 					VERIFY(head == tail);
4122da14cebeSEric Cheng 				tail->b_next = mac_srs->srs_first;
4123da14cebeSEric Cheng 				mac_srs->srs_first = head;
4124da14cebeSEric Cheng 				mac_srs->srs_count +=
41250dc2366fSVenugopal Iyer 				    (saved_pkt_count - stats.mts_opackets);
4126da14cebeSEric Cheng 				if (mac_srs->srs_last == NULL)
4127da14cebeSEric Cheng 					mac_srs->srs_last = tail;
41280dc2366fSVenugopal Iyer 				size_sent = sz - stats.mts_obytes;
4129da14cebeSEric Cheng 				mac_srs->srs_size += size_sent;
4130da14cebeSEric Cheng 				mac_srs->srs_bw->mac_bw_sz += size_sent;
4131da14cebeSEric Cheng 				if (mac_srs->srs_bw->mac_bw_used > size_sent) {
4132da14cebeSEric Cheng 					mac_srs->srs_bw->mac_bw_used -=
4133da14cebeSEric Cheng 					    size_sent;
4134da14cebeSEric Cheng 				} else {
4135da14cebeSEric Cheng 					mac_srs->srs_bw->mac_bw_used = 0;
4136da14cebeSEric Cheng 				}
4137da14cebeSEric Cheng 				MAC_TX_SRS_BLOCK(mac_srs, head);
4138da14cebeSEric Cheng 			} else {
4139da14cebeSEric Cheng 				srs_tx->st_woken_up = B_FALSE;
41400dc2366fSVenugopal Iyer 				SRS_TX_STATS_UPDATE(mac_srs, &stats);
4141da14cebeSEric Cheng 			}
4142da14cebeSEric Cheng 		}
41430dc2366fSVenugopal Iyer 	} else if (tx_mode == SRS_TX_BW_FANOUT || tx_mode == SRS_TX_BW_AGGR) {
4144da14cebeSEric Cheng 		mblk_t *prev;
4145da14cebeSEric Cheng 		uint64_t hint;
4146da14cebeSEric Cheng 
4147da14cebeSEric Cheng 		/*
4148da14cebeSEric Cheng 		 * We are here because the timer fired and we
4149da14cebeSEric Cheng 		 * have some quota to tranmit.
4150da14cebeSEric Cheng 		 */
4151da14cebeSEric Cheng 		prev = NULL;
4152da14cebeSEric Cheng 		head = tail = mac_srs->srs_first;
4153da14cebeSEric Cheng 		while (mac_srs->srs_first != NULL) {
4154da14cebeSEric Cheng 			tail = mac_srs->srs_first;
4155da14cebeSEric Cheng 			mac_srs->srs_first = tail->b_next;
4156da14cebeSEric Cheng 			if (mac_srs->srs_first == NULL)
4157da14cebeSEric Cheng 				mac_srs->srs_last = NULL;
4158da14cebeSEric Cheng 			mac_srs->srs_count--;
4159da14cebeSEric Cheng 			sz = msgdsize(tail);
4160da14cebeSEric Cheng 			mac_srs->srs_size -= sz;
4161da14cebeSEric Cheng 			mac_srs->srs_bw->mac_bw_used += sz;
4162da14cebeSEric Cheng 			if (prev == NULL)
4163da14cebeSEric Cheng 				hint = (ulong_t)tail->b_prev;
4164da14cebeSEric Cheng 			if (hint != (ulong_t)tail->b_prev) {
4165da14cebeSEric Cheng 				prev->b_next = NULL;
4166da14cebeSEric Cheng 				mutex_exit(&mac_srs->srs_lock);
4167da14cebeSEric Cheng 				TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
4168da14cebeSEric Cheng 				head = tail;
4169da14cebeSEric Cheng 				hint = (ulong_t)tail->b_prev;
4170da14cebeSEric Cheng 				mutex_enter(&mac_srs->srs_lock);
4171da14cebeSEric Cheng 			}
4172da14cebeSEric Cheng 
4173da14cebeSEric Cheng 			prev = tail;
4174da14cebeSEric Cheng 			tail->b_prev = NULL;
4175da14cebeSEric Cheng 			if (mac_srs->srs_bw->mac_bw_used <
4176da14cebeSEric Cheng 			    mac_srs->srs_bw->mac_bw_limit)
4177da14cebeSEric Cheng 				continue;
4178da14cebeSEric Cheng 
4179d3d50737SRafael Vanoni 			now = ddi_get_lbolt();
4180d3d50737SRafael Vanoni 			if (mac_srs->srs_bw->mac_bw_curr_time != now) {
4181d3d50737SRafael Vanoni 				mac_srs->srs_bw->mac_bw_curr_time = now;
4182da14cebeSEric Cheng 				mac_srs->srs_bw->mac_bw_used = 0;
4183da14cebeSEric Cheng 				continue;
4184da14cebeSEric Cheng 			}
4185da14cebeSEric Cheng 			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
4186da14cebeSEric Cheng 			break;
4187da14cebeSEric Cheng 		}
4188da14cebeSEric Cheng 		ASSERT((head == NULL && tail == NULL) ||
4189da14cebeSEric Cheng 		    (head != NULL && tail != NULL));
4190da14cebeSEric Cheng 		if (tail != NULL) {
4191da14cebeSEric Cheng 			tail->b_next = NULL;
4192da14cebeSEric Cheng 			mutex_exit(&mac_srs->srs_lock);
4193da14cebeSEric Cheng 			TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
4194da14cebeSEric Cheng 			mutex_enter(&mac_srs->srs_lock);
4195da14cebeSEric Cheng 		}
4196da14cebeSEric Cheng 	}
4197da14cebeSEric Cheng 	/*
4198da14cebeSEric Cheng 	 * SRS_TX_FANOUT case not considered here because packets
4199da14cebeSEric Cheng 	 * won't be queued in the SRS for this case. Packets will
4200da14cebeSEric Cheng 	 * be sent directly to soft rings underneath and if there
4201da14cebeSEric Cheng 	 * is any queueing at all, it would be in Tx side soft
4202da14cebeSEric Cheng 	 * rings.
4203da14cebeSEric Cheng 	 */
4204da14cebeSEric Cheng 
4205da14cebeSEric Cheng 	/*
4206da14cebeSEric Cheng 	 * When srs_count becomes 0, reset SRS_TX_HIWAT and
4207da14cebeSEric Cheng 	 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
4208da14cebeSEric Cheng 	 */
4209da14cebeSEric Cheng 	if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
4210da14cebeSEric Cheng 	    (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
4211da14cebeSEric Cheng 		mac_client_impl_t *mcip = mac_srs->srs_mcip;
4212da14cebeSEric Cheng 		boolean_t wakeup_required = B_FALSE;
4213da14cebeSEric Cheng 
4214da14cebeSEric Cheng 		if (mac_srs->srs_state &
4215da14cebeSEric Cheng 		    (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
4216da14cebeSEric Cheng 			wakeup_required = B_TRUE;
4217da14cebeSEric Cheng 		}
4218da14cebeSEric Cheng 		mac_srs->srs_state &= ~(SRS_TX_HIWAT |
4219da14cebeSEric Cheng 		    SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
4220da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_lock);
4221da14cebeSEric Cheng 		if (wakeup_required) {
42220dc2366fSVenugopal Iyer 			mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)mac_srs);
4223da14cebeSEric Cheng 			/*
4224da14cebeSEric Cheng 			 * If the client is not the primary MAC client, then we
4225da14cebeSEric Cheng 			 * need to send the notification to the clients upper
4226da14cebeSEric Cheng 			 * MAC, i.e. mci_upper_mip.
4227da14cebeSEric Cheng 			 */
4228da14cebeSEric Cheng 			mac_tx_notify(mcip->mci_upper_mip != NULL ?
4229da14cebeSEric Cheng 			    mcip->mci_upper_mip : mcip->mci_mip);
4230da14cebeSEric Cheng 		}
4231da14cebeSEric Cheng 		mutex_enter(&mac_srs->srs_lock);
4232da14cebeSEric Cheng 	}
4233da14cebeSEric Cheng 	mac_srs->srs_state &= ~SRS_PROC;
4234da14cebeSEric Cheng }
4235da14cebeSEric Cheng 
4236da14cebeSEric Cheng /*
4237da14cebeSEric Cheng  * Given a packet, get the flow_entry that identifies the flow
4238da14cebeSEric Cheng  * to which that packet belongs. The flow_entry will contain
4239da14cebeSEric Cheng  * the transmit function to be used to send the packet. If the
4240da14cebeSEric Cheng  * function returns NULL, the packet should be sent using the
4241da14cebeSEric Cheng  * underlying NIC.
4242da14cebeSEric Cheng  */
4243da14cebeSEric Cheng static flow_entry_t *
4244da14cebeSEric Cheng mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
4245da14cebeSEric Cheng {
4246da14cebeSEric Cheng 	flow_entry_t		*flent = NULL;
4247da14cebeSEric Cheng 	mac_client_impl_t	*mcip;
4248da14cebeSEric Cheng 	int	err;
4249da14cebeSEric Cheng 
4250da14cebeSEric Cheng 	/*
4251da14cebeSEric Cheng 	 * Do classification on the packet.
4252da14cebeSEric Cheng 	 */
4253da14cebeSEric Cheng 	err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
4254da14cebeSEric Cheng 	if (err != 0)
4255da14cebeSEric Cheng 		return (NULL);
4256da14cebeSEric Cheng 
4257da14cebeSEric Cheng 	/*
4258da14cebeSEric Cheng 	 * This flent might just be an additional one on the MAC client,
4259da14cebeSEric Cheng 	 * i.e. for classification purposes (different fdesc), however
4260da14cebeSEric Cheng 	 * the resources, SRS et. al., are in the mci_flent, so if
4261da14cebeSEric Cheng 	 * this isn't the mci_flent, we need to get it.
4262da14cebeSEric Cheng 	 */
4263da14cebeSEric Cheng 	if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
4264da14cebeSEric Cheng 		FLOW_REFRELE(flent);
4265da14cebeSEric Cheng 		flent = mcip->mci_flent;
4266da14cebeSEric Cheng 		FLOW_TRY_REFHOLD(flent, err);
4267da14cebeSEric Cheng 		if (err != 0)
4268da14cebeSEric Cheng 			return (NULL);
4269da14cebeSEric Cheng 	}
4270da14cebeSEric Cheng 
4271da14cebeSEric Cheng 	return (flent);
4272da14cebeSEric Cheng }
4273da14cebeSEric Cheng 
4274da14cebeSEric Cheng /*
4275da14cebeSEric Cheng  * This macro is only meant to be used by mac_tx_send().
4276da14cebeSEric Cheng  */
4277da14cebeSEric Cheng #define	CHECK_VID_AND_ADD_TAG(mp) {			\
4278da14cebeSEric Cheng 	if (vid_check) {				\
4279da14cebeSEric Cheng 		int err = 0;				\
4280da14cebeSEric Cheng 							\
4281da14cebeSEric Cheng 		MAC_VID_CHECK(src_mcip, (mp), err);	\
4282da14cebeSEric Cheng 		if (err != 0) {				\
4283da14cebeSEric Cheng 			freemsg((mp));			\
4284da14cebeSEric Cheng 			(mp) = next;			\
4285da14cebeSEric Cheng 			oerrors++;			\
4286da14cebeSEric Cheng 			continue;			\
4287da14cebeSEric Cheng 		}					\
4288da14cebeSEric Cheng 	}						\
4289da14cebeSEric Cheng 	if (add_tag) {					\
4290da14cebeSEric Cheng 		(mp) = mac_add_vlan_tag((mp), 0, vid);	\
4291da14cebeSEric Cheng 		if ((mp) == NULL) {			\
4292da14cebeSEric Cheng 			(mp) = next;			\
4293da14cebeSEric Cheng 			oerrors++;			\
4294da14cebeSEric Cheng 			continue;			\
4295da14cebeSEric Cheng 		}					\
4296da14cebeSEric Cheng 	}						\
4297da14cebeSEric Cheng }
4298da14cebeSEric Cheng 
4299da14cebeSEric Cheng mblk_t *
4300da14cebeSEric Cheng mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
4301da14cebeSEric Cheng     mac_tx_stats_t *stats)
4302da14cebeSEric Cheng {
4303da14cebeSEric Cheng 	mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
4304da14cebeSEric Cheng 	mac_impl_t *mip = src_mcip->mci_mip;
4305da14cebeSEric Cheng 	uint_t obytes = 0, opackets = 0, oerrors = 0;
4306da14cebeSEric Cheng 	mblk_t *mp = NULL, *next;
4307da14cebeSEric Cheng 	boolean_t vid_check, add_tag;
4308da14cebeSEric Cheng 	uint16_t vid = 0;
4309da14cebeSEric Cheng 
4310da14cebeSEric Cheng 	if (mip->mi_nclients > 1) {
4311da14cebeSEric Cheng 		vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
4312da14cebeSEric Cheng 		add_tag = MAC_TAG_NEEDED(src_mcip);
4313da14cebeSEric Cheng 		if (add_tag)
4314da14cebeSEric Cheng 			vid = mac_client_vid(mch);
4315da14cebeSEric Cheng 	} else {
4316da14cebeSEric Cheng 		ASSERT(mip->mi_nclients == 1);
4317da14cebeSEric Cheng 		vid_check = add_tag = B_FALSE;
4318da14cebeSEric Cheng 	}
4319da14cebeSEric Cheng 
4320da14cebeSEric Cheng 	/*
43210dc2366fSVenugopal Iyer 	 * Fastpath: if there's only one client, we simply send
43220dc2366fSVenugopal Iyer 	 * the packet down to the underlying NIC.
4323da14cebeSEric Cheng 	 */
43240dc2366fSVenugopal Iyer 	if (mip->mi_nactiveclients == 1) {
4325da14cebeSEric Cheng 		DTRACE_PROBE2(fastpath,
4326da14cebeSEric Cheng 		    mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);
4327da14cebeSEric Cheng 
4328da14cebeSEric Cheng 		mp = mp_chain;
4329da14cebeSEric Cheng 		while (mp != NULL) {
4330da14cebeSEric Cheng 			next = mp->b_next;
4331da14cebeSEric Cheng 			mp->b_next = NULL;
4332da14cebeSEric Cheng 			opackets++;
4333da14cebeSEric Cheng 			obytes += (mp->b_cont == NULL ? MBLKL(mp) :
4334da14cebeSEric Cheng 			    msgdsize(mp));
4335da14cebeSEric Cheng 
4336da14cebeSEric Cheng 			CHECK_VID_AND_ADD_TAG(mp);
43370dc2366fSVenugopal Iyer 			MAC_TX(mip, ring, mp, src_mcip);
4338da14cebeSEric Cheng 
4339da14cebeSEric Cheng 			/*
4340da14cebeSEric Cheng 			 * If the driver is out of descriptors and does a
4341da14cebeSEric Cheng 			 * partial send it will return a chain of unsent
4342da14cebeSEric Cheng 			 * mblks. Adjust the accounting stats.
4343da14cebeSEric Cheng 			 */
4344da14cebeSEric Cheng 			if (mp != NULL) {
4345da14cebeSEric Cheng 				opackets--;
4346da14cebeSEric Cheng 				obytes -= msgdsize(mp);
4347da14cebeSEric Cheng 				mp->b_next = next;
4348da14cebeSEric Cheng 				break;
4349da14cebeSEric Cheng 			}
4350da14cebeSEric Cheng 			mp = next;
4351da14cebeSEric Cheng 		}
4352da14cebeSEric Cheng 		goto done;
4353da14cebeSEric Cheng 	}
4354da14cebeSEric Cheng 
4355da14cebeSEric Cheng 	/*
4356da14cebeSEric Cheng 	 * No fastpath, we either have more than one MAC client
4357da14cebeSEric Cheng 	 * defined on top of the same MAC, or one or more MAC
4358da14cebeSEric Cheng 	 * client promiscuous callbacks.
4359da14cebeSEric Cheng 	 */
4360da14cebeSEric Cheng 	DTRACE_PROBE3(slowpath, mac_client_impl_t *,
4361da14cebeSEric Cheng 	    src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
4362da14cebeSEric Cheng 
4363da14cebeSEric Cheng 	mp = mp_chain;
4364da14cebeSEric Cheng 	while (mp != NULL) {
4365da14cebeSEric Cheng 		flow_entry_t *dst_flow_ent;
4366da14cebeSEric Cheng 		void *flow_cookie;
4367da14cebeSEric Cheng 		size_t	pkt_size;
4368da14cebeSEric Cheng 		mblk_t *mp1;
4369da14cebeSEric Cheng 
4370da14cebeSEric Cheng 		next = mp->b_next;
4371da14cebeSEric Cheng 		mp->b_next = NULL;
4372da14cebeSEric Cheng 		opackets++;
4373da14cebeSEric Cheng 		pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
4374da14cebeSEric Cheng 		obytes += pkt_size;
4375da14cebeSEric Cheng 		CHECK_VID_AND_ADD_TAG(mp);
4376da14cebeSEric Cheng 
4377da14cebeSEric Cheng 		/*
4378da14cebeSEric Cheng 		 * Find the destination.
4379da14cebeSEric Cheng 		 */
4380da14cebeSEric Cheng 		dst_flow_ent = mac_tx_classify(mip, mp);
4381da14cebeSEric Cheng 
4382da14cebeSEric Cheng 		if (dst_flow_ent != NULL) {
4383da14cebeSEric Cheng 			size_t	hdrsize;
4384da14cebeSEric Cheng 			int	err = 0;
4385da14cebeSEric Cheng 
4386da14cebeSEric Cheng 			if (mip->mi_info.mi_nativemedia == DL_ETHER) {
4387da14cebeSEric Cheng 				struct ether_vlan_header *evhp =
4388da14cebeSEric Cheng 				    (struct ether_vlan_header *)mp->b_rptr;
4389da14cebeSEric Cheng 
4390da14cebeSEric Cheng 				if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
4391da14cebeSEric Cheng 					hdrsize = sizeof (*evhp);
4392da14cebeSEric Cheng 				else
4393da14cebeSEric Cheng 					hdrsize = sizeof (struct ether_header);
4394da14cebeSEric Cheng 			} else {
4395da14cebeSEric Cheng 				mac_header_info_t	mhi;
4396da14cebeSEric Cheng 
4397da14cebeSEric Cheng 				err = mac_header_info((mac_handle_t)mip,
4398da14cebeSEric Cheng 				    mp, &mhi);
4399da14cebeSEric Cheng 				if (err == 0)
4400da14cebeSEric Cheng 					hdrsize = mhi.mhi_hdrsize;
4401da14cebeSEric Cheng 			}
4402da14cebeSEric Cheng 
4403da14cebeSEric Cheng 			/*
4404da14cebeSEric Cheng 			 * Got a matching flow. It's either another
4405da14cebeSEric Cheng 			 * MAC client, or a broadcast/multicast flow.
4406da14cebeSEric Cheng 			 * Make sure the packet size is within the
4407da14cebeSEric Cheng 			 * allowed size. If not drop the packet and
4408da14cebeSEric Cheng 			 * move to next packet.
4409da14cebeSEric Cheng 			 */
4410da14cebeSEric Cheng 			if (err != 0 ||
4411da14cebeSEric Cheng 			    (pkt_size - hdrsize) > mip->mi_sdu_max) {
4412da14cebeSEric Cheng 				oerrors++;
4413da14cebeSEric Cheng 				DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
4414da14cebeSEric Cheng 				    mblk_t *, mp);
4415da14cebeSEric Cheng 				freemsg(mp);
4416da14cebeSEric Cheng 				mp = next;
4417da14cebeSEric Cheng 				FLOW_REFRELE(dst_flow_ent);
4418da14cebeSEric Cheng 				continue;
4419da14cebeSEric Cheng 			}
4420da14cebeSEric Cheng 			flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
4421da14cebeSEric Cheng 			if (flow_cookie != NULL) {
4422da14cebeSEric Cheng 				/*
4423da14cebeSEric Cheng 				 * The vnic_bcast_send function expects
4424da14cebeSEric Cheng 				 * to receive the sender MAC client
4425da14cebeSEric Cheng 				 * as value for arg2.
4426da14cebeSEric Cheng 				 */
4427da14cebeSEric Cheng 				mac_bcast_send(flow_cookie, src_mcip, mp,
4428da14cebeSEric Cheng 				    B_TRUE);
4429da14cebeSEric Cheng 			} else {
4430da14cebeSEric Cheng 				/*
44310dc2366fSVenugopal Iyer 				 * loopback the packet to a local MAC
44320dc2366fSVenugopal Iyer 				 * client. We force a context switch
44330dc2366fSVenugopal Iyer 				 * if both source and destination MAC
44340dc2366fSVenugopal Iyer 				 * clients are used by IP, i.e.
44350dc2366fSVenugopal Iyer 				 * bypass is set.
4436da14cebeSEric Cheng 				 */
4437da14cebeSEric Cheng 				boolean_t do_switch;
4438da14cebeSEric Cheng 				mac_client_impl_t *dst_mcip =
4439da14cebeSEric Cheng 				    dst_flow_ent->fe_mcip;
4440da14cebeSEric Cheng 
44410dc2366fSVenugopal Iyer 				/*
44420dc2366fSVenugopal Iyer 				 * Check if there are promiscuous mode
44430dc2366fSVenugopal Iyer 				 * callbacks defined. This check is
44440dc2366fSVenugopal Iyer 				 * done here in the 'else' case and
44450dc2366fSVenugopal Iyer 				 * not in other cases because this
44460dc2366fSVenugopal Iyer 				 * path is for local loopback
44470dc2366fSVenugopal Iyer 				 * communication which does not go
44480dc2366fSVenugopal Iyer 				 * through MAC_TX(). For paths that go
44490dc2366fSVenugopal Iyer 				 * through MAC_TX(), the promisc_list
44500dc2366fSVenugopal Iyer 				 * check is done inside the MAC_TX()
44510dc2366fSVenugopal Iyer 				 * macro.
44520dc2366fSVenugopal Iyer 				 */
44530dc2366fSVenugopal Iyer 				if (mip->mi_promisc_list != NULL)
44540dc2366fSVenugopal Iyer 					mac_promisc_dispatch(mip, mp, src_mcip);
44550dc2366fSVenugopal Iyer 
4456da14cebeSEric Cheng 				do_switch = ((src_mcip->mci_state_flags &
4457da14cebeSEric Cheng 				    dst_mcip->mci_state_flags &
4458da14cebeSEric Cheng 				    MCIS_CLIENT_POLL_CAPABLE) != 0);
4459da14cebeSEric Cheng 
4460da14cebeSEric Cheng 				if ((mp1 = mac_fix_cksum(mp)) != NULL) {
4461da14cebeSEric Cheng 					(dst_flow_ent->fe_cb_fn)(
4462da14cebeSEric Cheng 					    dst_flow_ent->fe_cb_arg1,
4463da14cebeSEric Cheng 					    dst_flow_ent->fe_cb_arg2,
4464da14cebeSEric Cheng 					    mp1, do_switch);
4465da14cebeSEric Cheng 				}
4466da14cebeSEric Cheng 			}
4467da14cebeSEric Cheng 			FLOW_REFRELE(dst_flow_ent);
4468da14cebeSEric Cheng 		} else {
4469da14cebeSEric Cheng 			/*
4470da14cebeSEric Cheng 			 * Unknown destination, send via the underlying
4471da14cebeSEric Cheng 			 * NIC.
4472da14cebeSEric Cheng 			 */
44730dc2366fSVenugopal Iyer 			MAC_TX(mip, ring, mp, src_mcip);
4474da14cebeSEric Cheng 			if (mp != NULL) {
4475da14cebeSEric Cheng 				/*
4476da14cebeSEric Cheng 				 * Adjust for the last packet that
4477da14cebeSEric Cheng 				 * could not be transmitted
4478da14cebeSEric Cheng 				 */
4479da14cebeSEric Cheng 				opackets--;
4480da14cebeSEric Cheng 				obytes -= pkt_size;
4481da14cebeSEric Cheng 				mp->b_next = next;
4482da14cebeSEric Cheng 				break;
4483da14cebeSEric Cheng 			}
4484da14cebeSEric Cheng 		}
4485da14cebeSEric Cheng 		mp = next;
4486da14cebeSEric Cheng 	}
4487da14cebeSEric Cheng 
4488da14cebeSEric Cheng done:
44890dc2366fSVenugopal Iyer 	stats->mts_obytes = obytes;
44900dc2366fSVenugopal Iyer 	stats->mts_opackets = opackets;
44910dc2366fSVenugopal Iyer 	stats->mts_oerrors = oerrors;
4492da14cebeSEric Cheng 	return (mp);
4493da14cebeSEric Cheng }
4494da14cebeSEric Cheng 
4495da14cebeSEric Cheng /*
4496da14cebeSEric Cheng  * mac_tx_srs_ring_present
4497da14cebeSEric Cheng  *
4498da14cebeSEric Cheng  * Returns whether the specified ring is part of the specified SRS.
4499da14cebeSEric Cheng  */
4500da14cebeSEric Cheng boolean_t
4501da14cebeSEric Cheng mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
4502da14cebeSEric Cheng {
4503da14cebeSEric Cheng 	int i;
4504da14cebeSEric Cheng 	mac_soft_ring_t *soft_ring;
4505da14cebeSEric Cheng 
4506da14cebeSEric Cheng 	if (srs->srs_tx.st_arg2 == tx_ring)
4507da14cebeSEric Cheng 		return (B_TRUE);
4508da14cebeSEric Cheng 
45090dc2366fSVenugopal Iyer 	for (i = 0; i < srs->srs_tx_ring_count; i++) {
45100dc2366fSVenugopal Iyer 		soft_ring =  srs->srs_tx_soft_rings[i];
4511da14cebeSEric Cheng 		if (soft_ring->s_ring_tx_arg2 == tx_ring)
4512da14cebeSEric Cheng 			return (B_TRUE);
4513da14cebeSEric Cheng 	}
4514da14cebeSEric Cheng 
4515da14cebeSEric Cheng 	return (B_FALSE);
4516da14cebeSEric Cheng }
4517da14cebeSEric Cheng 
4518da14cebeSEric Cheng /*
45190dc2366fSVenugopal Iyer  * mac_tx_srs_get_soft_ring
45200dc2366fSVenugopal Iyer  *
45210dc2366fSVenugopal Iyer  * Returns the TX soft ring associated with the given ring, if present.
45220dc2366fSVenugopal Iyer  */
45230dc2366fSVenugopal Iyer mac_soft_ring_t *
45240dc2366fSVenugopal Iyer mac_tx_srs_get_soft_ring(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
45250dc2366fSVenugopal Iyer {
45260dc2366fSVenugopal Iyer 	int		i;
45270dc2366fSVenugopal Iyer 	mac_soft_ring_t	*soft_ring;
45280dc2366fSVenugopal Iyer 
45290dc2366fSVenugopal Iyer 	if (srs->srs_tx.st_arg2 == tx_ring)
45300dc2366fSVenugopal Iyer 		return (NULL);
45310dc2366fSVenugopal Iyer 
45320dc2366fSVenugopal Iyer 	for (i = 0; i < srs->srs_tx_ring_count; i++) {
45330dc2366fSVenugopal Iyer 		soft_ring =  srs->srs_tx_soft_rings[i];
45340dc2366fSVenugopal Iyer 		if (soft_ring->s_ring_tx_arg2 == tx_ring)
45350dc2366fSVenugopal Iyer 			return (soft_ring);
45360dc2366fSVenugopal Iyer 	}
45370dc2366fSVenugopal Iyer 
45380dc2366fSVenugopal Iyer 	return (NULL);
45390dc2366fSVenugopal Iyer }
45400dc2366fSVenugopal Iyer 
45410dc2366fSVenugopal Iyer /*
4542da14cebeSEric Cheng  * mac_tx_srs_wakeup
4543da14cebeSEric Cheng  *
4544da14cebeSEric Cheng  * Called when Tx desc become available. Wakeup the appropriate worker
4545da14cebeSEric Cheng  * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
4546da14cebeSEric Cheng  * state field.
4547da14cebeSEric Cheng  */
4548da14cebeSEric Cheng void
4549da14cebeSEric Cheng mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
4550da14cebeSEric Cheng {
4551da14cebeSEric Cheng 	int i;
4552da14cebeSEric Cheng 	mac_soft_ring_t *sringp;
4553da14cebeSEric Cheng 	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
4554da14cebeSEric Cheng 
4555da14cebeSEric Cheng 	mutex_enter(&mac_srs->srs_lock);
45560dc2366fSVenugopal Iyer 	/*
45570dc2366fSVenugopal Iyer 	 * srs_tx_ring_count == 0 is the single ring mode case. In
45580dc2366fSVenugopal Iyer 	 * this mode, there will not be Tx soft rings associated
45590dc2366fSVenugopal Iyer 	 * with the SRS.
45600dc2366fSVenugopal Iyer 	 */
45610dc2366fSVenugopal Iyer 	if (!MAC_TX_SOFT_RINGS(mac_srs)) {
4562da14cebeSEric Cheng 		if (srs_tx->st_arg2 == ring &&
4563da14cebeSEric Cheng 		    mac_srs->srs_state & SRS_TX_BLOCKED) {
4564da14cebeSEric Cheng 			mac_srs->srs_state &= ~SRS_TX_BLOCKED;
45650dc2366fSVenugopal Iyer 			srs_tx->st_stat.mts_unblockcnt++;
4566da14cebeSEric Cheng 			cv_signal(&mac_srs->srs_async);
4567da14cebeSEric Cheng 		}
4568da14cebeSEric Cheng 		/*
4569da14cebeSEric Cheng 		 * A wakeup can come before tx_srs_drain() could
4570da14cebeSEric Cheng 		 * grab srs lock and set SRS_TX_BLOCKED. So
4571da14cebeSEric Cheng 		 * always set woken_up flag when we come here.
4572da14cebeSEric Cheng 		 */
4573da14cebeSEric Cheng 		srs_tx->st_woken_up = B_TRUE;
4574da14cebeSEric Cheng 		mutex_exit(&mac_srs->srs_lock);
4575da14cebeSEric Cheng 		return;
4576da14cebeSEric Cheng 	}
4577da14cebeSEric Cheng 
45780dc2366fSVenugopal Iyer 	/*
45790dc2366fSVenugopal Iyer 	 * If you are here, it is for FANOUT, BW_FANOUT,
45800dc2366fSVenugopal Iyer 	 * AGGR_MODE or AGGR_BW_MODE case
45810dc2366fSVenugopal Iyer 	 */
45820dc2366fSVenugopal Iyer 	for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
45830dc2366fSVenugopal Iyer 		sringp = mac_srs->srs_tx_soft_rings[i];
4584da14cebeSEric Cheng 		mutex_enter(&sringp->s_ring_lock);
4585da14cebeSEric Cheng 		if (sringp->s_ring_tx_arg2 == ring) {
4586da14cebeSEric Cheng 			if (sringp->s_ring_state & S_RING_BLOCK) {
4587da14cebeSEric Cheng 				sringp->s_ring_state &= ~S_RING_BLOCK;
45880dc2366fSVenugopal Iyer 				sringp->s_st_stat.mts_unblockcnt++;
4589da14cebeSEric Cheng 				cv_signal(&sringp->s_ring_async);
4590da14cebeSEric Cheng 			}
4591da14cebeSEric Cheng 			sringp->s_ring_tx_woken_up = B_TRUE;
4592da14cebeSEric Cheng 		}
4593da14cebeSEric Cheng 		mutex_exit(&sringp->s_ring_lock);
4594da14cebeSEric Cheng 	}
4595da14cebeSEric Cheng 	mutex_exit(&mac_srs->srs_lock);
4596da14cebeSEric Cheng }
4597da14cebeSEric Cheng 
4598da14cebeSEric Cheng /*
4599da14cebeSEric Cheng  * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
4600da14cebeSEric Cheng  * the blocked clients again.
4601da14cebeSEric Cheng  */
4602da14cebeSEric Cheng void
4603da14cebeSEric Cheng mac_tx_notify(mac_impl_t *mip)
4604da14cebeSEric Cheng {
4605da14cebeSEric Cheng 	i_mac_notify(mip, MAC_NOTE_TX);
4606da14cebeSEric Cheng }
4607da14cebeSEric Cheng 
4608da14cebeSEric Cheng /*
4609da14cebeSEric Cheng  * RX SOFTRING RELATED FUNCTIONS
4610da14cebeSEric Cheng  *
4611da14cebeSEric Cheng  * These functions really belong in mac_soft_ring.c and here for
4612da14cebeSEric Cheng  * a short period.
4613da14cebeSEric Cheng  */
4614da14cebeSEric Cheng 
4615da14cebeSEric Cheng #define	SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {	       	\
4616da14cebeSEric Cheng 	/*								\
4617da14cebeSEric Cheng 	 * Enqueue our mblk chain.					\
4618da14cebeSEric Cheng 	 */								\
4619da14cebeSEric Cheng 	ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
4620da14cebeSEric Cheng 									\
4621da14cebeSEric Cheng 	if ((ringp)->s_ring_last != NULL)				\
4622da14cebeSEric Cheng 		(ringp)->s_ring_last->b_next = (mp);			\
4623da14cebeSEric Cheng 	else								\
4624da14cebeSEric Cheng 		(ringp)->s_ring_first = (mp);				\
4625da14cebeSEric Cheng 	(ringp)->s_ring_last = (tail);					\
4626da14cebeSEric Cheng 	(ringp)->s_ring_count += (cnt);					\
4627da14cebeSEric Cheng 	ASSERT((ringp)->s_ring_count > 0);				\
4628da14cebeSEric Cheng 	if ((ringp)->s_ring_type & ST_RING_BW_CTL) {			\
4629da14cebeSEric Cheng 		(ringp)->s_ring_size += sz;				\
4630da14cebeSEric Cheng 	}								\
4631da14cebeSEric Cheng }
4632da14cebeSEric Cheng 
4633da14cebeSEric Cheng /*
4634da14cebeSEric Cheng  * Default entry point to deliver a packet chain to a MAC client.
4635da14cebeSEric Cheng  * If the MAC client has flows, do the classification with these
4636da14cebeSEric Cheng  * flows as well.
4637da14cebeSEric Cheng  */
4638da14cebeSEric Cheng /* ARGSUSED */
4639da14cebeSEric Cheng void
4640da14cebeSEric Cheng mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
4641da14cebeSEric Cheng     mac_header_info_t *arg3)
4642da14cebeSEric Cheng {
4643da14cebeSEric Cheng 	mac_client_impl_t *mcip = arg1;
4644da14cebeSEric Cheng 
4645da14cebeSEric Cheng 	if (mcip->mci_nvids == 1 &&
4646dbe999caSVenugopal Iyer 	    !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
4647da14cebeSEric Cheng 		/*
4648da14cebeSEric Cheng 		 * If the client has exactly one VID associated with it
4649da14cebeSEric Cheng 		 * and striping of VLAN header is not disabled,
4650da14cebeSEric Cheng 		 * remove the VLAN tag from the packet before
4651da14cebeSEric Cheng 		 * passing it on to the client's receive callback.
4652da14cebeSEric Cheng 		 * Note that this needs to be done after we dispatch
4653da14cebeSEric Cheng 		 * the packet to the promiscuous listeners of the
4654da14cebeSEric Cheng 		 * client, since they expect to see the whole
4655da14cebeSEric Cheng 		 * frame including the VLAN headers.
4656da14cebeSEric Cheng 		 */
4657da14cebeSEric Cheng 		mp_chain = mac_strip_vlan_tag_chain(mp_chain);
4658da14cebeSEric Cheng 	}
4659da14cebeSEric Cheng 
4660da14cebeSEric Cheng 	mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
4661da14cebeSEric Cheng }
4662da14cebeSEric Cheng 
4663da14cebeSEric Cheng /*
4664da14cebeSEric Cheng  * mac_rx_soft_ring_process
4665da14cebeSEric Cheng  *
4666da14cebeSEric Cheng  * process a chain for a given soft ring. The number of packets queued
4667da14cebeSEric Cheng  * in the SRS and its associated soft rings (including this one) is
4668da14cebeSEric Cheng  * very small (tracked by srs_poll_pkt_cnt), then allow the entering
4669da14cebeSEric Cheng  * thread (interrupt or poll thread) to do inline processing. This
4670da14cebeSEric Cheng  * helps keep the latency down under low load.
4671da14cebeSEric Cheng  *
4672da14cebeSEric Cheng  * The proc and arg for each mblk is already stored in the mblk in
4673da14cebeSEric Cheng  * appropriate places.
4674da14cebeSEric Cheng  */
4675da14cebeSEric Cheng /* ARGSUSED */
4676da14cebeSEric Cheng void
4677da14cebeSEric Cheng mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
4678da14cebeSEric Cheng     mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
4679da14cebeSEric Cheng {
4680da14cebeSEric Cheng 	mac_direct_rx_t		proc;
4681da14cebeSEric Cheng 	void			*arg1;
4682da14cebeSEric Cheng 	mac_resource_handle_t	arg2;
4683da14cebeSEric Cheng 	mac_soft_ring_set_t	*mac_srs = ringp->s_ring_set;
4684da14cebeSEric Cheng 
4685da14cebeSEric Cheng 	ASSERT(ringp != NULL);
4686da14cebeSEric Cheng 	ASSERT(mp_chain != NULL);
4687da14cebeSEric Cheng 	ASSERT(tail != NULL);
4688da14cebeSEric Cheng 	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
4689da14cebeSEric Cheng 
4690da14cebeSEric Cheng 	mutex_enter(&ringp->s_ring_lock);
4691da14cebeSEric Cheng 	ringp->s_ring_total_inpkt += cnt;
46920dc2366fSVenugopal Iyer 	ringp->s_ring_total_rbytes += sz;
4693ae6aa22aSVenugopal Iyer 	if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
4694ae6aa22aSVenugopal Iyer 	    !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
4695da14cebeSEric Cheng 		/* If on processor or blanking on, then enqueue and return */
4696da14cebeSEric Cheng 		if (ringp->s_ring_state & S_RING_BLANK ||
4697da14cebeSEric Cheng 		    ringp->s_ring_state & S_RING_PROC) {
4698da14cebeSEric Cheng 			SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
4699da14cebeSEric Cheng 			mutex_exit(&ringp->s_ring_lock);
4700da14cebeSEric Cheng 			return;
4701da14cebeSEric Cheng 		}
4702da14cebeSEric Cheng 		proc = ringp->s_ring_rx_func;
4703da14cebeSEric Cheng 		arg1 = ringp->s_ring_rx_arg1;
4704da14cebeSEric Cheng 		arg2 = ringp->s_ring_rx_arg2;
4705da14cebeSEric Cheng 		/*
4706da14cebeSEric Cheng 		 * See if anything is already queued. If we are the
4707da14cebeSEric Cheng 		 * first packet, do inline processing else queue the
4708da14cebeSEric Cheng 		 * packet and do the drain.
4709da14cebeSEric Cheng 		 */
4710da14cebeSEric Cheng 		if (ringp->s_ring_first == NULL) {
4711da14cebeSEric Cheng 			/*
4712da14cebeSEric Cheng 			 * Fast-path, ok to process and nothing queued.
4713da14cebeSEric Cheng 			 */
4714da14cebeSEric Cheng 			ringp->s_ring_run = curthread;
4715da14cebeSEric Cheng 			ringp->s_ring_state |= (S_RING_PROC);
4716da14cebeSEric Cheng 
4717da14cebeSEric Cheng 			mutex_exit(&ringp->s_ring_lock);
4718da14cebeSEric Cheng 
4719da14cebeSEric Cheng 			/*
4720da14cebeSEric Cheng 			 * We are the chain of 1 packet so
4721da14cebeSEric Cheng 			 * go through this fast path.
4722da14cebeSEric Cheng 			 */
4723da14cebeSEric Cheng 			ASSERT(mp_chain->b_next == NULL);
4724da14cebeSEric Cheng 
4725da14cebeSEric Cheng 			(*proc)(arg1, arg2, mp_chain, NULL);
4726da14cebeSEric Cheng 
4727da14cebeSEric Cheng 			ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
4728da14cebeSEric Cheng 			/*
4729da14cebeSEric Cheng 			 * If we have a soft ring set which is doing
4730da14cebeSEric Cheng 			 * bandwidth control, we need to decrement
4731da14cebeSEric Cheng 			 * srs_size and count so it the SRS can have a
4732da14cebeSEric Cheng 			 * accurate idea of what is the real data
4733da14cebeSEric Cheng 			 * queued between SRS and its soft rings. We
4734da14cebeSEric Cheng 			 * decrement the counters only when the packet
4735da14cebeSEric Cheng 			 * gets processed by both SRS and the soft ring.
4736da14cebeSEric Cheng 			 */
4737da14cebeSEric Cheng 			mutex_enter(&mac_srs->srs_lock);
4738da14cebeSEric Cheng 			MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
4739da14cebeSEric Cheng 			MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
4740da14cebeSEric Cheng 			mutex_exit(&mac_srs->srs_lock);
4741da14cebeSEric Cheng 
4742da14cebeSEric Cheng 			mutex_enter(&ringp->s_ring_lock);
4743da14cebeSEric Cheng 			ringp->s_ring_run = NULL;
4744da14cebeSEric Cheng 			ringp->s_ring_state &= ~S_RING_PROC;
4745da14cebeSEric Cheng 			if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
4746da14cebeSEric Cheng 				cv_signal(&ringp->s_ring_client_cv);
4747da14cebeSEric Cheng 
4748da14cebeSEric Cheng 			if ((ringp->s_ring_first == NULL) ||
4749da14cebeSEric Cheng 			    (ringp->s_ring_state & S_RING_BLANK)) {
4750da14cebeSEric Cheng 				/*
4751da14cebeSEric Cheng 				 * We processed inline our packet and
4752da14cebeSEric Cheng 				 * nothing new has arrived or our
4753da14cebeSEric Cheng 				 * receiver doesn't want to receive
4754da14cebeSEric Cheng 				 * any packets. We are done.
4755da14cebeSEric Cheng 				 */
4756da14cebeSEric Cheng 				mutex_exit(&ringp->s_ring_lock);
4757da14cebeSEric Cheng 				return;
4758da14cebeSEric Cheng 			}
4759da14cebeSEric Cheng 		} else {
4760da14cebeSEric Cheng 			SOFT_RING_ENQUEUE_CHAIN(ringp,
4761da14cebeSEric Cheng 			    mp_chain, tail, cnt, sz);
4762da14cebeSEric Cheng 		}
4763da14cebeSEric Cheng 
4764da14cebeSEric Cheng 		/*
4765da14cebeSEric Cheng 		 * We are here because either we couldn't do inline
4766da14cebeSEric Cheng 		 * processing (because something was already
4767da14cebeSEric Cheng 		 * queued), or we had a chain of more than one
4768da14cebeSEric Cheng 		 * packet, or something else arrived after we were
4769da14cebeSEric Cheng 		 * done with inline processing.
4770da14cebeSEric Cheng 		 */
4771da14cebeSEric Cheng 		ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
4772da14cebeSEric Cheng 		ASSERT(ringp->s_ring_first != NULL);
4773da14cebeSEric Cheng 
4774da14cebeSEric Cheng 		ringp->s_ring_drain_func(ringp);
4775da14cebeSEric Cheng 		mutex_exit(&ringp->s_ring_lock);
4776da14cebeSEric Cheng 		return;
4777da14cebeSEric Cheng 	} else {
4778da14cebeSEric Cheng 		/* ST_RING_WORKER_ONLY case */
4779da14cebeSEric Cheng 		SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
4780da14cebeSEric Cheng 		mac_soft_ring_worker_wakeup(ringp);
4781da14cebeSEric Cheng 		mutex_exit(&ringp->s_ring_lock);
4782da14cebeSEric Cheng 	}
4783da14cebeSEric Cheng }
4784da14cebeSEric Cheng 
4785da14cebeSEric Cheng /*
4786da14cebeSEric Cheng  * TX SOFTRING RELATED FUNCTIONS
4787da14cebeSEric Cheng  *
4788da14cebeSEric Cheng  * These functions really belong in mac_soft_ring.c and here for
4789da14cebeSEric Cheng  * a short period.
4790da14cebeSEric Cheng  */
4791da14cebeSEric Cheng 
4792da14cebeSEric Cheng #define	TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {	       	\
4793da14cebeSEric Cheng 	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));			\
4794da14cebeSEric Cheng 	ringp->s_ring_state |= S_RING_ENQUEUED;				\
4795da14cebeSEric Cheng 	SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);	\
4796da14cebeSEric Cheng }
4797da14cebeSEric Cheng 
4798da14cebeSEric Cheng /*
4799da14cebeSEric Cheng  * mac_tx_sring_queued
4800da14cebeSEric Cheng  *
4801da14cebeSEric Cheng  * When we are out of transmit descriptors and we already have a
4802da14cebeSEric Cheng  * queue that exceeds hiwat (or the client called us with
4803da14cebeSEric Cheng  * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
4804da14cebeSEric Cheng  * soft ring pointer as the opaque cookie for the client enable
4805da14cebeSEric Cheng  * flow control.
4806da14cebeSEric Cheng  */
4807da14cebeSEric Cheng static mac_tx_cookie_t
4808da14cebeSEric Cheng mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
4809da14cebeSEric Cheng     mblk_t **ret_mp)
4810da14cebeSEric Cheng {
4811da14cebeSEric Cheng 	int cnt;
4812da14cebeSEric Cheng 	size_t sz;
4813da14cebeSEric Cheng 	mblk_t *tail;
4814da14cebeSEric Cheng 	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
4815da14cebeSEric Cheng 	mac_tx_cookie_t cookie = NULL;
4816da14cebeSEric Cheng 	boolean_t wakeup_worker = B_TRUE;
4817da14cebeSEric Cheng 
4818da14cebeSEric Cheng 	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
4819da14cebeSEric Cheng 	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
4820da14cebeSEric Cheng 	if (flag & MAC_DROP_ON_NO_DESC) {
4821da14cebeSEric Cheng 		mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
4822da14cebeSEric Cheng 		/* increment freed stats */
4823da14cebeSEric Cheng 		ringp->s_ring_drops += cnt;
4824da14cebeSEric Cheng 		cookie = (mac_tx_cookie_t)ringp;
4825da14cebeSEric Cheng 	} else {
4826da14cebeSEric Cheng 		if (ringp->s_ring_first != NULL)
4827da14cebeSEric Cheng 			wakeup_worker = B_FALSE;
4828da14cebeSEric Cheng 
4829da14cebeSEric Cheng 		if (flag & MAC_TX_NO_ENQUEUE) {
4830da14cebeSEric Cheng 			/*
4831da14cebeSEric Cheng 			 * If QUEUED is not set, queue the packet
4832da14cebeSEric Cheng 			 * and let mac_tx_soft_ring_drain() set
4833da14cebeSEric Cheng 			 * the TX_BLOCKED bit for the reasons
4834da14cebeSEric Cheng 			 * explained above. Otherwise, return the
4835da14cebeSEric Cheng 			 * mblks.
4836da14cebeSEric Cheng 			 */
4837da14cebeSEric Cheng 			if (wakeup_worker) {
4838da14cebeSEric Cheng 				TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
4839da14cebeSEric Cheng 				    mp_chain, tail, cnt, sz);
4840da14cebeSEric Cheng 			} else {
4841da14cebeSEric Cheng 				ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
4842da14cebeSEric Cheng 				cookie = (mac_tx_cookie_t)ringp;
4843da14cebeSEric Cheng 				*ret_mp = mp_chain;
4844da14cebeSEric Cheng 			}
4845da14cebeSEric Cheng 		} else {
4846da14cebeSEric Cheng 			boolean_t enqueue = B_TRUE;
4847da14cebeSEric Cheng 
4848da14cebeSEric Cheng 			if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
4849da14cebeSEric Cheng 				/*
4850da14cebeSEric Cheng 				 * flow-controlled. Store ringp in cookie
4851da14cebeSEric Cheng 				 * so that it can be returned as
4852da14cebeSEric Cheng 				 * mac_tx_cookie_t to client
4853da14cebeSEric Cheng 				 */
4854da14cebeSEric Cheng 				ringp->s_ring_state |= S_RING_TX_HIWAT;
4855da14cebeSEric Cheng 				cookie = (mac_tx_cookie_t)ringp;
4856da14cebeSEric Cheng 				ringp->s_ring_hiwat_cnt++;
4857da14cebeSEric Cheng 				if (ringp->s_ring_count >
4858da14cebeSEric Cheng 				    ringp->s_ring_tx_max_q_cnt) {
4859da14cebeSEric Cheng 					/* increment freed stats */
4860da14cebeSEric Cheng 					ringp->s_ring_drops += cnt;
4861da14cebeSEric Cheng 					/*
4862da14cebeSEric Cheng 					 * b_prev may be set to the fanout hint
4863da14cebeSEric Cheng 					 * hence can't use freemsg directly
4864da14cebeSEric Cheng 					 */
4865da14cebeSEric Cheng 					mac_pkt_drop(NULL, NULL,
4866da14cebeSEric Cheng 					    mp_chain, B_FALSE);
4867da14cebeSEric Cheng 					DTRACE_PROBE1(tx_queued_hiwat,
4868da14cebeSEric Cheng 					    mac_soft_ring_t *, ringp);
4869da14cebeSEric Cheng 					enqueue = B_FALSE;
4870da14cebeSEric Cheng 				}
4871da14cebeSEric Cheng 			}
4872da14cebeSEric Cheng 			if (enqueue) {
4873da14cebeSEric Cheng 				TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
4874da14cebeSEric Cheng 				    tail, cnt, sz);
4875da14cebeSEric Cheng 			}
4876da14cebeSEric Cheng 		}
4877da14cebeSEric Cheng 		if (wakeup_worker)
4878da14cebeSEric Cheng 			cv_signal(&ringp->s_ring_async);
4879da14cebeSEric Cheng 	}
4880da14cebeSEric Cheng 	return (cookie);
4881da14cebeSEric Cheng }
4882da14cebeSEric Cheng 
4883da14cebeSEric Cheng 
4884da14cebeSEric Cheng /*
4885da14cebeSEric Cheng  * mac_tx_soft_ring_process
4886da14cebeSEric Cheng  *
4887da14cebeSEric Cheng  * This routine is called when fanning out outgoing traffic among
4888da14cebeSEric Cheng  * multipe Tx rings.
4889da14cebeSEric Cheng  * Note that a soft ring is associated with a h/w Tx ring.
4890da14cebeSEric Cheng  */
4891da14cebeSEric Cheng mac_tx_cookie_t
4892da14cebeSEric Cheng mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
4893da14cebeSEric Cheng     uint16_t flag, mblk_t **ret_mp)
4894da14cebeSEric Cheng {
4895da14cebeSEric Cheng 	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
4896da14cebeSEric Cheng 	int	cnt;
4897da14cebeSEric Cheng 	size_t	sz;
4898da14cebeSEric Cheng 	mblk_t	*tail;
4899da14cebeSEric Cheng 	mac_tx_cookie_t cookie = NULL;
4900da14cebeSEric Cheng 
4901da14cebeSEric Cheng 	ASSERT(ringp != NULL);
4902da14cebeSEric Cheng 	ASSERT(mp_chain != NULL);
4903da14cebeSEric Cheng 	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
4904da14cebeSEric Cheng 	/*
49050dc2366fSVenugopal Iyer 	 * The following modes can come here: SRS_TX_BW_FANOUT,
49060dc2366fSVenugopal Iyer 	 * SRS_TX_FANOUT, SRS_TX_AGGR, SRS_TX_BW_AGGR.
4907da14cebeSEric Cheng 	 */
49080dc2366fSVenugopal Iyer 	ASSERT(MAC_TX_SOFT_RINGS(mac_srs));
4909da14cebeSEric Cheng 	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
49100dc2366fSVenugopal Iyer 	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT ||
49110dc2366fSVenugopal Iyer 	    mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
49120dc2366fSVenugopal Iyer 	    mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
4913da14cebeSEric Cheng 
4914da14cebeSEric Cheng 	if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
4915da14cebeSEric Cheng 		/* Serialization mode */
4916da14cebeSEric Cheng 
4917da14cebeSEric Cheng 		mutex_enter(&ringp->s_ring_lock);
4918da14cebeSEric Cheng 		if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
4919da14cebeSEric Cheng 			cookie = mac_tx_sring_enqueue(ringp, mp_chain,
4920da14cebeSEric Cheng 			    flag, ret_mp);
4921da14cebeSEric Cheng 			mutex_exit(&ringp->s_ring_lock);
4922da14cebeSEric Cheng 			return (cookie);
4923da14cebeSEric Cheng 		}
4924da14cebeSEric Cheng 		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
4925da14cebeSEric Cheng 		TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
4926da14cebeSEric Cheng 		if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
4927da14cebeSEric Cheng 			/*
4928da14cebeSEric Cheng 			 * If ring is blocked due to lack of Tx
4929da14cebeSEric Cheng 			 * descs, just return. Worker thread
4930da14cebeSEric Cheng 			 * will get scheduled when Tx desc's
4931da14cebeSEric Cheng 			 * become available.
4932da14cebeSEric Cheng 			 */
4933da14cebeSEric Cheng 			mutex_exit(&ringp->s_ring_lock);
4934da14cebeSEric Cheng 			return (cookie);
4935da14cebeSEric Cheng 		}
4936da14cebeSEric Cheng 		mac_soft_ring_worker_wakeup(ringp);
4937da14cebeSEric Cheng 		mutex_exit(&ringp->s_ring_lock);
4938da14cebeSEric Cheng 		return (cookie);
4939da14cebeSEric Cheng 	} else {
4940da14cebeSEric Cheng 		/* Default fanout mode */
4941da14cebeSEric Cheng 		/*
4942da14cebeSEric Cheng 		 * S_RING_BLOCKED is set when underlying NIC runs
4943da14cebeSEric Cheng 		 * out of Tx descs and messages start getting
4944da14cebeSEric Cheng 		 * queued. It won't get reset until
4945da14cebeSEric Cheng 		 * tx_srs_drain() completely drains out the
4946da14cebeSEric Cheng 		 * messages.
4947da14cebeSEric Cheng 		 */
4948da14cebeSEric Cheng 		mac_tx_stats_t		stats;
4949da14cebeSEric Cheng 
4950da14cebeSEric Cheng 		if (ringp->s_ring_state & S_RING_ENQUEUED) {
4951da14cebeSEric Cheng 			/* Tx descs/resources not available */
4952da14cebeSEric Cheng 			mutex_enter(&ringp->s_ring_lock);
4953da14cebeSEric Cheng 			if (ringp->s_ring_state & S_RING_ENQUEUED) {
4954da14cebeSEric Cheng 				cookie = mac_tx_sring_enqueue(ringp, mp_chain,
4955da14cebeSEric Cheng 				    flag, ret_mp);
4956da14cebeSEric Cheng 				mutex_exit(&ringp->s_ring_lock);
4957da14cebeSEric Cheng 				return (cookie);
4958da14cebeSEric Cheng 			}
4959da14cebeSEric Cheng 			/*
4960da14cebeSEric Cheng 			 * While we were computing mblk count, the
4961da14cebeSEric Cheng 			 * flow control condition got relieved.
4962da14cebeSEric Cheng 			 * Continue with the transmission.
4963da14cebeSEric Cheng 			 */
4964da14cebeSEric Cheng 			mutex_exit(&ringp->s_ring_lock);
4965da14cebeSEric Cheng 		}
4966da14cebeSEric Cheng 
4967da14cebeSEric Cheng 		mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
49680dc2366fSVenugopal Iyer 		    ringp->s_ring_tx_arg2, mp_chain, &stats);
4969da14cebeSEric Cheng 
4970da14cebeSEric Cheng 		/*
4971da14cebeSEric Cheng 		 * Multiple threads could be here sending packets.
4972da14cebeSEric Cheng 		 * Under such conditions, it is not possible to
4973da14cebeSEric Cheng 		 * automically set S_RING_BLOCKED bit to indicate
4974da14cebeSEric Cheng 		 * out of tx desc condition. To atomically set
4975da14cebeSEric Cheng 		 * this, we queue the returned packet and do
4976da14cebeSEric Cheng 		 * the setting of S_RING_BLOCKED in
4977da14cebeSEric Cheng 		 * mac_tx_soft_ring_drain().
4978da14cebeSEric Cheng 		 */
4979da14cebeSEric Cheng 		if (mp_chain != NULL) {
4980da14cebeSEric Cheng 			mutex_enter(&ringp->s_ring_lock);
4981da14cebeSEric Cheng 			cookie =
4982da14cebeSEric Cheng 			    mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
4983da14cebeSEric Cheng 			mutex_exit(&ringp->s_ring_lock);
4984da14cebeSEric Cheng 			return (cookie);
4985da14cebeSEric Cheng 		}
49860dc2366fSVenugopal Iyer 		SRS_TX_STATS_UPDATE(mac_srs, &stats);
49870dc2366fSVenugopal Iyer 		SOFTRING_TX_STATS_UPDATE(ringp, &stats);
49880dc2366fSVenugopal Iyer 
4989da14cebeSEric Cheng 		return (NULL);
4990da14cebeSEric Cheng 	}
4991da14cebeSEric Cheng }
4992