1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/conf.h>
28 #include <sys/devops.h>
29 #include <sys/kmem.h>
30 #include <sys/ksynch.h>
31 #include <sys/modctl.h>
32 #include <sys/stat.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/mac_provider.h>
36 #include <sys/mac_ether.h>
37
38 #include <sys/ib/clients/eoib/eib_impl.h>
39
40 /*
41 * Declarations private to this file
42 */
43 static void eib_rb_mac_start(eib_t *, eib_vnic_t *);
44
45 /*
46 * This set of routines are used to set/clear the condition that the
47 * caller is about to do something that affects the state of the nic.
48 * If there's already someone doing either a start or a stop (possibly
49 * due to the async handler, a plumb or a dlpi_open happening, or an
50 * unplumb or dlpi_close coming in), we wait until that's done.
51 */
52 void
eib_mac_set_nic_state(eib_t * ss,uint_t flags)53 eib_mac_set_nic_state(eib_t *ss, uint_t flags)
54 {
55 eib_node_state_t *ns = ss->ei_node_state;
56
57 mutex_enter(&ns->ns_lock);
58
59 while ((ns->ns_nic_state & EIB_NIC_STARTING) ||
60 (ns->ns_nic_state & EIB_NIC_STOPPING)) {
61 cv_wait(&ns->ns_cv, &ns->ns_lock);
62 }
63 ns->ns_nic_state |= flags;
64
65 mutex_exit(&ns->ns_lock);
66 }
67
68 void
eib_mac_clr_nic_state(eib_t * ss,uint_t flags)69 eib_mac_clr_nic_state(eib_t *ss, uint_t flags)
70 {
71 eib_node_state_t *ns = ss->ei_node_state;
72
73 mutex_enter(&ns->ns_lock);
74
75 ns->ns_nic_state &= (~flags);
76
77 cv_broadcast(&ns->ns_cv);
78 mutex_exit(&ns->ns_lock);
79 }
80
81 void
eib_mac_upd_nic_state(eib_t * ss,uint_t clr_flags,uint_t set_flags)82 eib_mac_upd_nic_state(eib_t *ss, uint_t clr_flags, uint_t set_flags)
83 {
84 eib_node_state_t *ns = ss->ei_node_state;
85
86 mutex_enter(&ns->ns_lock);
87
88 ns->ns_nic_state &= (~clr_flags);
89 ns->ns_nic_state |= set_flags;
90
91 cv_broadcast(&ns->ns_cv);
92 mutex_exit(&ns->ns_lock);
93 }
94
95 uint_t
eib_mac_get_nic_state(eib_t * ss)96 eib_mac_get_nic_state(eib_t *ss)
97 {
98 eib_node_state_t *ns = ss->ei_node_state;
99 uint_t nic_state;
100
101 mutex_enter(&ns->ns_lock);
102 nic_state = ns->ns_nic_state;
103 mutex_exit(&ns->ns_lock);
104
105 return (nic_state);
106 }
107
108 void
eib_mac_link_state(eib_t * ss,link_state_t new_link_state,boolean_t force)109 eib_mac_link_state(eib_t *ss, link_state_t new_link_state,
110 boolean_t force)
111 {
112 eib_node_state_t *ns = ss->ei_node_state;
113 boolean_t state_changed = B_FALSE;
114
115 mutex_enter(&ns->ns_lock);
116
117 /*
118 * We track the link state only if the current link state is
119 * not unknown. Obviously therefore, the first calls to set
120 * the link state from eib_mac_start() have to pass an explicit
121 * 'force' flag to force the state change tracking.
122 */
123 if (ns->ns_link_state != LINK_STATE_UNKNOWN)
124 force = B_TRUE;
125
126 if ((force) && (new_link_state != ns->ns_link_state)) {
127 ns->ns_link_state = new_link_state;
128 state_changed = B_TRUE;
129 }
130 mutex_exit(&ns->ns_lock);
131
132 if (state_changed) {
133 EIB_DPRINTF_DEBUG(ss->ei_instance,
134 "eib_mac_link_state: changing link state to %d",
135 new_link_state);
136
137 mac_link_update(ss->ei_mac_hdl, new_link_state);
138 } else {
139 EIB_DPRINTF_DEBUG(ss->ei_instance,
140 "eib_mac_link_state: link state already %d",
141 new_link_state);
142 }
143 }
144
145 void
eib_mac_link_up(eib_t * ss,boolean_t force)146 eib_mac_link_up(eib_t *ss, boolean_t force)
147 {
148 eib_mac_link_state(ss, LINK_STATE_UP, force);
149 }
150
151 void
eib_mac_link_down(eib_t * ss,boolean_t force)152 eib_mac_link_down(eib_t *ss, boolean_t force)
153 {
154 eib_mac_link_state(ss, LINK_STATE_DOWN, force);
155 }
156
157 int
eib_mac_start(eib_t * ss)158 eib_mac_start(eib_t *ss)
159 {
160 eib_vnic_t *vnic0 = NULL;
161 eib_login_data_t *ld;
162 int err;
163
164 /*
165 * Perform HCA related initializations
166 */
167 if (eib_ibt_hca_init(ss) != EIB_E_SUCCESS)
168 goto start_fail;
169
170 /*
171 * Make sure port is up. Also record the port base lid if it's up.
172 */
173 if (eib_mac_hca_portstate(ss, &ss->ei_props->ep_blid,
174 &err) != EIB_E_SUCCESS) {
175 goto start_fail;
176 }
177
178 /*
179 * Set up tx and rx buffer pools
180 */
181 if (eib_rsrc_setup_bufs(ss, &err) != EIB_E_SUCCESS)
182 goto start_fail;
183
184 /*
185 * Set up admin qp for logins and logouts
186 */
187 if (eib_adm_setup_qp(ss, &err) != EIB_E_SUCCESS)
188 goto start_fail;
189
190 /*
191 * Create the vnic for physlink (instance 0)
192 */
193 if (eib_vnic_create(ss, 0, 0, &vnic0, &err) != EIB_E_SUCCESS)
194 goto start_fail;
195
196 /*
197 * Update the mac layer about the correct values for MTU and
198 * unicast MAC address. Note that we've already verified that the
199 * vhub mtu (plus the eoib encapsulation header) is not greater
200 * than our port mtu, so we can go ahead and report the vhub mtu
201 * (of vnic0) directly.
202 */
203 ld = &(vnic0->vn_login_data);
204 (void) mac_maxsdu_update(ss->ei_mac_hdl, ld->ld_vhub_mtu);
205 mac_unicst_update(ss->ei_mac_hdl, ld->ld_assigned_mac);
206
207 /*
208 * Report that the link is up and ready
209 */
210 eib_mac_link_up(ss, B_TRUE);
211 return (0);
212
213 start_fail:
214 eib_rb_mac_start(ss, vnic0);
215 eib_mac_link_down(ss, B_TRUE);
216 return (err);
217 }
218
219 void
eib_mac_stop(eib_t * ss)220 eib_mac_stop(eib_t *ss)
221 {
222 eib_vnic_t *vnic;
223 link_state_t cur_link_state = ss->ei_node_state->ns_link_state;
224 int ndx;
225
226 /*
227 * Stopping an EoIB device instance is somewhat different from starting
228 * it. Between the time the device instance was started and the call to
229 * eib_m_stop() now, a number of vnics could've been created. All of
230 * these will need to be destroyed before we can stop the device.
231 */
232 for (ndx = EIB_MAX_VNICS - 1; ndx >= 0; ndx--) {
233 if ((vnic = ss->ei_vnic[ndx]) != NULL)
234 eib_vnic_delete(ss, vnic);
235 }
236
237 /*
238 * And now, to undo the things we did in start (other than creation
239 * of vnics itself)
240 */
241 eib_rb_mac_start(ss, NULL);
242
243 /*
244 * Now that we're completed stopped, there's no mac address assigned
245 * to us. Update the mac layer with this information. Note that we
246 * can let the old max mtu information remain as-is, since we're likely
247 * to get that same mtu on a later plumb.
248 */
249 mac_unicst_update(ss->ei_mac_hdl, eib_zero_mac);
250
251 /*
252 * If our link state was up when the eib_m_stop() callback was called,
253 * we'll mark the link state as unknown now. Otherwise, we'll leave
254 * the link state as-is (down).
255 */
256 if (cur_link_state == LINK_STATE_UP)
257 eib_mac_link_state(ss, LINK_STATE_UNKNOWN, B_TRUE);
258 }
259
260 int
eib_mac_multicast(eib_t * ss,boolean_t add,uint8_t * mcast_mac)261 eib_mac_multicast(eib_t *ss, boolean_t add, uint8_t *mcast_mac)
262 {
263 int ret = EIB_E_SUCCESS;
264 int err = 0;
265
266 /*
267 * If it's a broadcast group join, each vnic needs to and is always
268 * joined to the broadcast address, so we return success immediately.
269 * If it's a broadcast group leave, we fail immediately for the same
270 * reason as above.
271 */
272 if (bcmp(mcast_mac, eib_broadcast_mac, ETHERADDRL) == 0) {
273 if (add)
274 return (0);
275 else
276 return (EINVAL);
277 }
278
279 if (ss->ei_vnic[0]) {
280 if (add) {
281 ret = eib_vnic_join_data_mcg(ss, ss->ei_vnic[0],
282 mcast_mac, B_FALSE, &err);
283 } else {
284 eib_vnic_leave_data_mcg(ss, ss->ei_vnic[0], mcast_mac);
285 ret = EIB_E_SUCCESS;
286 }
287 }
288
289 if (ret == EIB_E_SUCCESS)
290 return (0);
291 else
292 return (err);
293 }
294
295 int
eib_mac_promisc(eib_t * ss,boolean_t set)296 eib_mac_promisc(eib_t *ss, boolean_t set)
297 {
298 int ret = EIB_E_SUCCESS;
299 int err = 0;
300
301 if (ss->ei_vnic[0]) {
302 if (set) {
303 ret = eib_vnic_join_data_mcg(ss, ss->ei_vnic[0],
304 eib_zero_mac, B_FALSE, &err);
305 } else {
306 eib_vnic_leave_data_mcg(ss, ss->ei_vnic[0],
307 eib_zero_mac);
308 ret = EIB_E_SUCCESS;
309 }
310 }
311
312 if (ret == EIB_E_SUCCESS)
313 return (0);
314 else
315 return (err);
316 }
317
318 int
eib_mac_tx(eib_t * ss,mblk_t * mp)319 eib_mac_tx(eib_t *ss, mblk_t *mp)
320 {
321 eib_ether_hdr_t evh;
322 eib_vnic_t *vnic = NULL;
323 eib_wqe_t *swqe = NULL;
324 boolean_t failed_vnic;
325 int found;
326 int ret;
327
328 /*
329 * Grab a send wqe. If we cannot get one, wake up a service
330 * thread to monitor the swqe status and let the mac layer know
331 * as soon as we have enough tx wqes to start the traffic again.
332 */
333 if ((swqe = eib_rsrc_grab_swqe(ss, EIB_WPRI_LO)) == NULL) {
334 EIB_DPRINTF_WARN(ss->ei_instance, "eib_mac_tx: "
335 "no swqe available, holding tx until resource "
336 "becomes available");
337 eib_rsrc_txwqes_needed(ss);
338 return (EIB_E_FAILURE);
339 }
340
341 /*
342 * Determine dmac, smac and vlan information
343 */
344 eib_data_parse_ether_hdr(mp, &evh);
345
346 /*
347 * Lookup the {smac, vlan} tuple in our vnic list. If it isn't
348 * there, this is obviously a new packet on a vnic/vlan that
349 * we haven't been informed about. So go ahead and file a request
350 * to create a new vnic. This is obviously not a clean thing to
351 * do - we should be informed when a vnic/vlan is being created
352 * and should be given a proper opportunity to login to the gateway
353 * and do the creation. But we don't have that luxury now, and
354 * this is the next best thing to do. Note that we return failure
355 * from here, so tx flow control should prevent further packets
356 * from coming in until the vnic creation has completed.
357 */
358 found = eib_data_lookup_vnic(ss, evh.eh_smac, evh.eh_vlan, &vnic,
359 &failed_vnic);
360 if (found != EIB_E_SUCCESS) {
361 uint8_t *m = evh.eh_smac;
362
363 /*
364 * Return the swqe back to the pool
365 */
366 eib_rsrc_return_swqe(ss, swqe, NULL);
367
368 /*
369 * If we had previously tried creating this vnic and had
370 * failed, we'll simply drop the packets on this vnic.
371 * Otherwise, we'll queue up a request to create this vnic.
372 */
373 if (failed_vnic) {
374 EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_mac_tx: "
375 "vnic creation for mac=%x:%x:%x:%x:%x:%x "
376 "vlan=0x%x failed previously, dropping pkt",
377 m[0], m[1], m[2], m[3], m[4], m[5], evh.eh_vlan);
378 return (EIB_E_SUCCESS);
379 } else {
380 eib_vnic_need_new(ss, evh.eh_smac, evh.eh_vlan);
381 return (EIB_E_FAILURE);
382 }
383 }
384
385 /*
386 * We'll try to setup the destination in the swqe for this dmac
387 * and vlan. If we don't succeed, there's no need to undo any
388 * vnic-creation we might've made above (if we didn't find the
389 * vnic corresponding to the {smac, vlan} originally). Note that
390 * this is not a resource issue, so we'll issue a warning and
391 * drop the packet, but won't return failure from here.
392 */
393 ret = eib_vnic_setup_dest(vnic, swqe, evh.eh_dmac, evh.eh_vlan);
394 if (ret != EIB_E_SUCCESS) {
395 uint8_t *dmac;
396
397 dmac = evh.eh_dmac;
398 EIB_DPRINTF_WARN(ss->ei_instance, "eib_mac_tx: "
399 "eib_vnic_setup_dest() failed for mac=%x:%x:%x:%x:%x:%x, "
400 "vlan=0x%x, dropping pkt", dmac[0], dmac[1], dmac[2],
401 dmac[3], dmac[4], dmac[5]);
402
403 eib_rsrc_return_swqe(ss, swqe, NULL);
404 return (EIB_E_SUCCESS);
405 }
406
407 /*
408 * The only reason why this would fail is if we needed LSO buffer(s)
409 * to prepare this frame and couldn't find enough of those.
410 */
411 ret = eib_data_prepare_frame(vnic, swqe, mp, &evh);
412 if (ret != EIB_E_SUCCESS) {
413 EIB_DPRINTF_WARN(ss->ei_instance, "eib_mac_tx: "
414 "eib_data_prepare_frame() failed (no LSO bufs?), "
415 "holding tx until resource becomes available");
416
417 eib_rsrc_return_swqe(ss, swqe, NULL);
418 eib_rsrc_lsobufs_needed(ss);
419 return (EIB_E_FAILURE);
420 }
421
422 eib_data_post_tx(vnic, swqe);
423
424 return (EIB_E_SUCCESS);
425 }
426
427 int
eib_mac_hca_portstate(eib_t * ss,ib_lid_t * blid,int * err)428 eib_mac_hca_portstate(eib_t *ss, ib_lid_t *blid, int *err)
429 {
430 ibt_hca_portinfo_t *pi;
431 ibt_status_t ret;
432 uint_t num_pi;
433 uint_t sz_pi;
434
435 ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num,
436 &pi, &num_pi, &sz_pi);
437 if (ret != IBT_SUCCESS) {
438 EIB_DPRINTF_ERR(ss->ei_instance,
439 "ibt_query_hca_ports(hca_hdl=0x%llx, "
440 "port=0x%x) failed, ret=%d", ss->ei_hca_hdl,
441 ss->ei_props->ep_port_num, ret);
442 goto mac_hca_portstate_fail;
443 }
444 if (num_pi != 1) {
445 EIB_DPRINTF_ERR(ss->ei_instance,
446 "ibt_query_hca_ports(hca_hdl=0x%llx, "
447 "port=0x%x) returned num_pi=%d", ss->ei_hca_hdl,
448 ss->ei_props->ep_port_num, num_pi);
449 goto mac_hca_portstate_fail;
450 }
451
452 if (pi->p_linkstate != IBT_PORT_ACTIVE)
453 goto mac_hca_portstate_fail;
454
455 /*
456 * Return the port's base lid if asked
457 */
458 if (blid) {
459 *blid = pi->p_base_lid;
460 }
461
462 ibt_free_portinfo(pi, sz_pi);
463 return (EIB_E_SUCCESS);
464
465 mac_hca_portstate_fail:
466 if (pi) {
467 ibt_free_portinfo(pi, sz_pi);
468 }
469 if (err) {
470 *err = ENETDOWN;
471 }
472 return (EIB_E_FAILURE);
473 }
474
475 static void
eib_rb_mac_start(eib_t * ss,eib_vnic_t * vnic0)476 eib_rb_mac_start(eib_t *ss, eib_vnic_t *vnic0)
477 {
478 int ntries;
479
480 /*
481 * If vnic0 is non-null, delete it
482 */
483 if (vnic0) {
484 eib_rb_vnic_create(ss, vnic0, ~0);
485 }
486
487 /*
488 * At this point, we're pretty much done with all communication that
489 * we need to do for vnic-logout, etc. so we can get rid of any address
490 * vectors we might've allocated to send control/data packets.
491 */
492 eib_ibt_free_avects(ss);
493
494 /*
495 * Tear down the rest of it
496 */
497 if (ss->ei_admin_chan) {
498 eib_rb_adm_setup_qp(ss);
499 }
500
501 /*
502 * If (say) the network layer has been holding onto our rx buffers, we
503 * wait a reasonable time for it to hand them back to us. If we don't
504 * get it still, we have nothing to do but avoid rolling back hca init
505 * since we cannot unregister the memory, release the pd or close the
506 * hca. We'll try to reuse it if there's a plumb again.
507 */
508 for (ntries = 0; ntries < EIB_MAX_ATTEMPTS; ntries++) {
509 eib_rb_rsrc_setup_bufs(ss, B_FALSE);
510 if ((ss->ei_tx == NULL) && (ss->ei_rx == NULL) &&
511 (ss->ei_lso == NULL)) {
512 break;
513 }
514
515 delay(drv_usectohz(EIB_DELAY_HALF_SECOND));
516 }
517
518 if (ntries == EIB_MAX_ATTEMPTS) {
519 EIB_DPRINTF_WARN(ss->ei_instance, "eib_rb_mac_start: "
520 "bufs outstanding, tx=0x%llx, rx=0x%llx, lso=0x%llx",
521 ss->ei_tx, ss->ei_rx, ss->ei_lso);
522 } else if (ss->ei_hca_hdl) {
523 eib_rb_ibt_hca_init(ss, ~0);
524 }
525 ss->ei_props->ep_blid = 0;
526
527 /*
528 * Pending vnic creation requests (and failed-vnic records) will have
529 * to be cleaned up in any case
530 */
531 eib_flush_vnic_reqs(ss);
532 }
533