1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/errno.h>
28 #include <sys/debug.h>
29 #include <sys/time.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/user.h>
33 #include <sys/stropts.h>
34 #include <sys/stream.h>
35 #include <sys/strlog.h>
36 #include <sys/strsubr.h>
37 #include <sys/cmn_err.h>
38 #include <sys/cpu.h>
39 #include <sys/kmem.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/ksynch.h>
44 #include <sys/stat.h>
45 #include <sys/kstat.h>
46 #include <sys/vtrace.h>
47 #include <sys/strsun.h>
48 #include <sys/dlpi.h>
49 #include <sys/ethernet.h>
50 #include <net/if.h>
51 #include <sys/varargs.h>
52 #include <sys/machsystm.h>
53 #include <sys/modctl.h>
54 #include <sys/modhash.h>
55 #include <sys/mac.h>
56 #include <sys/mac_ether.h>
57 #include <sys/taskq.h>
58 #include <sys/note.h>
59 #include <sys/mach_descrip.h>
60 #include <sys/mdeg.h>
61 #include <sys/ldc.h>
62 #include <sys/vsw_fdb.h>
63 #include <sys/vsw.h>
64 #include <sys/vio_mailbox.h>
65 #include <sys/vnet_mailbox.h>
66 #include <sys/vnet_common.h>
67 #include <sys/vio_util.h>
68 #include <sys/sdt.h>
69 #include <sys/atomic.h>
70 #include <sys/callb.h>
71 #include <sys/vlan.h>
72
73 /* Port add/deletion/etc routines */
74 static void vsw_port_delete(vsw_port_t *port);
75 static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
76 static void vsw_ldc_detach(vsw_ldc_t *ldcp);
77 static int vsw_ldc_init(vsw_ldc_t *ldcp);
78 static void vsw_ldc_uninit(vsw_ldc_t *ldcp);
79 static void vsw_ldc_drain(vsw_ldc_t *ldcp);
80 static void vsw_drain_port_taskq(vsw_port_t *port);
81 static void vsw_marker_task(void *);
82 static int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
83 void vsw_detach_ports(vsw_t *vswp);
84 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
85 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
86 int vsw_port_detach(vsw_t *vswp, int p_instance);
87 int vsw_portsend(vsw_port_t *port, mblk_t *mp);
88 int vsw_port_attach(vsw_port_t *portp);
89 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
90 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
91 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
92 void vsw_reset_ports(vsw_t *vswp);
93 void vsw_port_reset(vsw_port_t *portp);
94 void vsw_physlink_update_ports(vsw_t *vswp);
95 static void vsw_port_physlink_update(vsw_port_t *portp);
96
97 /* Interrupt routines */
98 static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
99
100 /* Handshake routines */
101 static void vsw_ldc_reinit(vsw_ldc_t *);
102 static void vsw_conn_task(void *);
103 static int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
104 static void vsw_next_milestone(vsw_ldc_t *);
105 static int vsw_supported_version(vio_ver_msg_t *);
106 static void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
107 static void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
108 void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
109
110 /* Data processing routines */
111 void vsw_process_pkt(void *);
112 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *, int);
113 static void vsw_process_ctrl_pkt(void *);
114 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
115 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
116 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
117 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
121 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
122 uint32_t);
123 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
124 static void vsw_process_pkt_data(void *, void *, uint32_t);
125 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
126 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
127 static void vsw_process_evt_read(vsw_ldc_t *ldcp);
128 static void vsw_ldc_rcv(vsw_ldc_t *ldcp);
129
130 /* Switching/data transmit routines */
131 static int vsw_descrsend(vsw_ldc_t *, mblk_t *);
132 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
133 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
134 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
135 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
136
137 /* Packet creation routines */
138 static void vsw_send_ver(void *);
139 static void vsw_send_attr(vsw_ldc_t *);
140 static void vsw_send_dring_info(vsw_ldc_t *);
141 static void vsw_send_rdx(vsw_ldc_t *);
142 static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);
143
144 /* Dring routines */
145 static void vsw_create_privring(vsw_ldc_t *);
146 static dring_info_t *vsw_map_dring(vsw_ldc_t *ldcp, void *pkt);
147 static void vsw_unmap_dring(vsw_ldc_t *ldcp);
148 static void vsw_destroy_dring(vsw_ldc_t *ldcp);
149 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
150 static int vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt);
151 static void vsw_set_lane_attr(vsw_t *, lane_t *);
152 dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp,
153 vio_dring_reg_msg_t *dring_pkt);
154 static int vsw_mapin_avail(vsw_ldc_t *ldcp);
155
156 /* tx/msg/rcv thread routines */
157 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
158 static void vsw_ldc_tx_worker(void *arg);
159
160 /* Misc support routines */
161 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
162 static int vsw_get_same_dest_list(struct ether_header *ehp,
163 mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
164 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
165
166 /* Debugging routines */
167 static void dump_flags(uint64_t);
168 static void display_state(void);
169 static void display_lane(lane_t *);
170 static void display_ring(dring_info_t *);
171
172 /*
173 * Functions imported from other files.
174 */
175 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
176 extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
177 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
178 extern void vsw_del_mcst_port(vsw_port_t *port);
179 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
180 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
181 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
182 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
183 extern void vsw_create_vlans(void *arg, int type);
184 extern void vsw_destroy_vlans(void *arg, int type);
185 extern void vsw_vlan_add_ids(void *arg, int type);
186 extern void vsw_vlan_remove_ids(void *arg, int type);
187 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
188 struct ether_header *ehp, uint16_t *vidp);
189 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
190 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
191 mblk_t **npt);
192 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
193 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
194 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
195 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
196 extern void vsw_hio_stop_port(vsw_port_t *portp);
197 extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
198 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
199 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
200 extern void vsw_destroy_rxpools(void *arg);
201 extern void vsw_stop_msg_thread(vsw_ldc_t *ldcp);
202 extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
203 extern int vsw_dringsend(vsw_ldc_t *, mblk_t *);
204 extern int vsw_reclaim_dring(dring_info_t *dp, int start);
205 extern int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
206 int *);
207 extern vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *);
208 extern int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp);
209 extern void vsw_destroy_tx_dring(vsw_ldc_t *ldcp);
210 extern dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt);
211 extern void vsw_unmap_rx_dring(vsw_ldc_t *ldcp);
212 extern void vsw_ldc_msg_worker(void *arg);
213 extern void vsw_process_dringdata(void *, void *);
214 extern vio_dring_reg_msg_t *vsw_create_rx_dring_info(vsw_ldc_t *);
215 extern void vsw_destroy_rx_dring(vsw_ldc_t *ldcp);
216 extern dring_info_t *vsw_map_tx_dring(vsw_ldc_t *ldcp, void *pkt);
217 extern void vsw_unmap_tx_dring(vsw_ldc_t *ldcp);
218 extern void vsw_ldc_rcv_worker(void *arg);
219 extern void vsw_stop_rcv_thread(vsw_ldc_t *ldcp);
220 extern int vsw_dringsend_shm(vsw_ldc_t *, mblk_t *);
221 extern void vsw_process_dringdata_shm(void *, void *);
222
223 /*
224 * Tunables used in this file.
225 */
226 extern int vsw_num_handshakes;
227 extern int vsw_ldc_tx_delay;
228 extern int vsw_ldc_tx_retries;
229 extern int vsw_ldc_retries;
230 extern int vsw_ldc_delay;
231 extern boolean_t vsw_ldc_rxthr_enabled;
232 extern boolean_t vsw_ldc_txthr_enabled;
233 extern uint32_t vsw_num_descriptors;
234 extern uint8_t vsw_dring_mode;
235 extern uint32_t vsw_max_tx_qcount;
236 extern boolean_t vsw_obp_ver_proto_workaround;
237 extern uint32_t vsw_publish_macaddr_count;
238 extern uint32_t vsw_nrbufs_factor;
239
240 #define LDC_ENTER_LOCK(ldcp) \
241 mutex_enter(&((ldcp)->ldc_cblock));\
242 mutex_enter(&((ldcp)->ldc_rxlock));\
243 mutex_enter(&((ldcp)->ldc_txlock));
244 #define LDC_EXIT_LOCK(ldcp) \
245 mutex_exit(&((ldcp)->ldc_txlock));\
246 mutex_exit(&((ldcp)->ldc_rxlock));\
247 mutex_exit(&((ldcp)->ldc_cblock));
248
249 #define VSW_VER_EQ(ldcp, major, minor) \
250 ((ldcp)->lane_out.ver_major == (major) && \
251 (ldcp)->lane_out.ver_minor == (minor))
252
253 #define VSW_VER_LT(ldcp, major, minor) \
254 (((ldcp)->lane_out.ver_major < (major)) || \
255 ((ldcp)->lane_out.ver_major == (major) && \
256 (ldcp)->lane_out.ver_minor < (minor)))
257
258 #define VSW_VER_GTEQ(ldcp, major, minor) \
259 (((ldcp)->lane_out.ver_major > (major)) || \
260 ((ldcp)->lane_out.ver_major == (major) && \
261 (ldcp)->lane_out.ver_minor >= (minor)))
262
263 #define VSW_VER_LTEQ(ldcp, major, minor) \
264 (((ldcp)->lane_out.ver_major < (major)) || \
265 ((ldcp)->lane_out.ver_major == (major) && \
266 (ldcp)->lane_out.ver_minor <= (minor)))
267
268 /*
269 * VIO Protocol Version Info:
270 *
271 * The version specified below represents the version of protocol currently
272 * supported in the driver. It means the driver can negotiate with peers with
273 * versions <= this version. Here is a summary of the feature(s) that are
274 * supported at each version of the protocol:
275 *
276 * 1.0 Basic VIO protocol.
277 * 1.1 vDisk protocol update (no virtual network update).
278 * 1.2 Support for priority frames (priority-ether-types).
279 * 1.3 VLAN and HybridIO support.
280 * 1.4 Jumbo Frame support.
281 * 1.5 Link State Notification support with optional support
282 * for Physical Link information.
283 * 1.6 Support for RxDringData mode.
284 */
285 static ver_sup_t vsw_versions[] = { {1, 6} };
286
287 /*
288 * For the moment the state dump routines have their own
289 * private flag.
290 */
291 #define DUMP_STATE 0
292
293 #if DUMP_STATE
294
295 #define DUMP_TAG(tag) \
296 { \
297 D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
298 D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \
299 D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \
300 }
301
302 #define DUMP_TAG_PTR(tag) \
303 { \
304 D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
305 D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \
306 D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \
307 }
308
309 #define DUMP_FLAGS(flags) dump_flags(flags);
310 #define DISPLAY_STATE() display_state()
311
312 #else
313
314 #define DUMP_TAG(tag)
315 #define DUMP_TAG_PTR(tag)
316 #define DUMP_FLAGS(state)
317 #define DISPLAY_STATE()
318
319 #endif /* DUMP_STATE */
320
/*
 * Attach the specified port.
 *
 * Checks that the port instance is not already attached, initializes the
 * port's locks and state, attaches its LDC, optionally opens a mac client
 * (only if the underlying network device has been set up), adds the fdb
 * entry, creates the port's VLANs, links the port into the vsw instance's
 * port list and finally brings up the channel.
 *
 * Returns 0 on success, 1 on failure.  On failure the port structure
 * itself is freed.
 */
int
vsw_port_attach(vsw_port_t *port)
{
	vsw_t			*vswp = port->p_vswp;
	vsw_port_list_t		*plist = &vswp->plist;
	vsw_port_t		*p, **pp;
	int			nids = port->num_ldcs;
	uint64_t		*ldcids;
	int			rv;

	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);

	/* port already exists? */
	READ_ENTER(&plist->lockrw);
	for (p = plist->head; p != NULL; p = p->p_next) {
		if (p->p_instance == port->p_instance) {
			DWARN(vswp, "%s: port instance %d already attached",
			    __func__, p->p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}
	}
	RW_EXIT(&plist->lockrw);

	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);

	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
	port->state = VSW_PORT_INIT;

	D2(vswp, "%s: %d nids", __func__, nids);
	ldcids = port->ldc_ids;
	D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[0]);
	/* only the first ldc id is used; a port has a single channel */
	if (vsw_ldc_attach(port, (uint64_t)ldcids[0]) != 0) {
		DERR(vswp, "%s: ldc_attach failed", __func__);
		goto exit_error;
	}

	if (vswp->switching_setup_done == B_TRUE) {
		/*
		 * If the underlying network device has been setup,
		 * then open a mac client and program the mac address
		 * for this port.
		 */
		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
		if (rv != 0) {
			goto exit_error;
		}
	}

	/* create the fdb entry for this port/mac address */
	vsw_fdbe_add(vswp, port);

	vsw_create_vlans(port, VSW_VNETPORT);

	WRITE_ENTER(&plist->lockrw);

	/* link it into the list of ports for this vsw instance */
	pp = (vsw_port_t **)(&plist->head);
	port->p_next = *pp;
	*pp = port;
	plist->num_ports++;

	RW_EXIT(&plist->lockrw);

	/*
	 * Initialise the port and any ldc's under it.
	 */
	(void) vsw_ldc_init(port->ldcp);

	/* announce macaddr of vnet to the physical switch */
	if (vsw_publish_macaddr_count != 0) {	/* enabled */
		vsw_publish_macaddr(vswp, port);
	}

	D1(vswp, "%s: exit", __func__);
	return (0);

exit_error:

	/*
	 * NOTE(review): if vsw_mac_client_init() fails after
	 * vsw_ldc_attach() succeeded, the attached ldc is not detached
	 * here before the port is freed -- looks like a resource leak;
	 * confirm against vsw_ldc_detach() requirements.
	 */
	cv_destroy(&port->state_cv);
	mutex_destroy(&port->state_lock);

	rw_destroy(&port->maccl_rwlock);
	mutex_destroy(&port->tx_lock);
	mutex_destroy(&port->mca_lock);
	kmem_free(port, sizeof (vsw_port_t));
	return (1);
}
417
418 /*
419 * Detach the specified port.
420 *
421 * Returns 0 on success, 1 on failure.
422 */
423 int
vsw_port_detach(vsw_t * vswp,int p_instance)424 vsw_port_detach(vsw_t *vswp, int p_instance)
425 {
426 vsw_port_t *port = NULL;
427 vsw_port_list_t *plist = &vswp->plist;
428
429 D1(vswp, "%s: enter: port id %d", __func__, p_instance);
430
431 WRITE_ENTER(&plist->lockrw);
432
433 if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
434 RW_EXIT(&plist->lockrw);
435 return (1);
436 }
437
438 if (vsw_plist_del_node(vswp, port)) {
439 RW_EXIT(&plist->lockrw);
440 return (1);
441 }
442
443 /* cleanup any HybridIO for this port */
444 vsw_hio_stop_port(port);
445
446 /*
447 * No longer need to hold writer lock on port list now
448 * that we have unlinked the target port from the list.
449 */
450 RW_EXIT(&plist->lockrw);
451
452 /* Cleanup and close the mac client */
453 vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
454
455 /* Remove the fdb entry for this port/mac address */
456 vsw_fdbe_del(vswp, &(port->p_macaddr));
457 vsw_destroy_vlans(port, VSW_VNETPORT);
458
459 /* Remove any multicast addresses.. */
460 vsw_del_mcst_port(port);
461
462 vsw_port_delete(port);
463
464 D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
465 return (0);
466 }
467
468 /*
469 * Detach all active ports.
470 */
471 void
vsw_detach_ports(vsw_t * vswp)472 vsw_detach_ports(vsw_t *vswp)
473 {
474 vsw_port_list_t *plist = &vswp->plist;
475 vsw_port_t *port = NULL;
476
477 D1(vswp, "%s: enter", __func__);
478
479 WRITE_ENTER(&plist->lockrw);
480
481 while ((port = plist->head) != NULL) {
482 (void) vsw_plist_del_node(vswp, port);
483
484 /* cleanup any HybridIO for this port */
485 vsw_hio_stop_port(port);
486
487 /* Cleanup and close the mac client */
488 vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
489
490 /* Remove the fdb entry for this port/mac address */
491 vsw_fdbe_del(vswp, &(port->p_macaddr));
492 vsw_destroy_vlans(port, VSW_VNETPORT);
493
494 /* Remove any multicast addresses.. */
495 vsw_del_mcst_port(port);
496
497 /*
498 * No longer need to hold the lock on the port list
499 * now that we have unlinked the target port from the
500 * list.
501 */
502 RW_EXIT(&plist->lockrw);
503 vsw_port_delete(port);
504 WRITE_ENTER(&plist->lockrw);
505 }
506 RW_EXIT(&plist->lockrw);
507
508 D1(vswp, "%s: exit", __func__);
509 }
510
511 /*
512 * Delete the specified port.
513 */
514 static void
vsw_port_delete(vsw_port_t * port)515 vsw_port_delete(vsw_port_t *port)
516 {
517 vsw_t *vswp = port->p_vswp;
518
519 D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
520
521 vsw_ldc_uninit(port->ldcp);
522
523 /*
524 * Wait for any pending ctrl msg tasks which reference this
525 * port to finish.
526 */
527 vsw_drain_port_taskq(port);
528
529 /*
530 * Wait for any active callbacks to finish
531 */
532 vsw_ldc_drain(port->ldcp);
533
534 vsw_ldc_detach(port->ldcp);
535
536 rw_destroy(&port->maccl_rwlock);
537 mutex_destroy(&port->mca_lock);
538 mutex_destroy(&port->tx_lock);
539
540 cv_destroy(&port->state_cv);
541 mutex_destroy(&port->state_lock);
542
543 if (port->num_ldcs != 0) {
544 kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
545 port->num_ldcs = 0;
546 }
547
548 if (port->nvids != 0) {
549 kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
550 }
551
552 kmem_free(port, sizeof (vsw_port_t));
553
554 D1(vswp, "%s: exit", __func__);
555 }
556
557 /*
558 * Attach a logical domain channel (ldc) under a specified port.
559 *
560 * Returns 0 on success, 1 on failure.
561 */
562 static int
vsw_ldc_attach(vsw_port_t * port,uint64_t ldc_id)563 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
564 {
565 vsw_t *vswp = port->p_vswp;
566 vsw_ldc_t *ldcp = NULL;
567 ldc_attr_t attr;
568 ldc_status_t istatus;
569 int status = DDI_FAILURE;
570 char kname[MAXNAMELEN];
571 enum { PROG_init = 0x0,
572 PROG_callback = 0x1,
573 PROG_tx_thread = 0x2}
574 progress;
575
576 progress = PROG_init;
577
578 D1(vswp, "%s: enter", __func__);
579
580 ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
581 if (ldcp == NULL) {
582 DERR(vswp, "%s: kmem_zalloc failed", __func__);
583 return (1);
584 }
585 ldcp->ldc_id = ldc_id;
586
587 mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
588 mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
589 mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
590 ldcp->msg_thr_flags = 0;
591 mutex_init(&ldcp->msg_thr_lock, NULL, MUTEX_DRIVER, NULL);
592 cv_init(&ldcp->msg_thr_cv, NULL, CV_DRIVER, NULL);
593 ldcp->rcv_thr_flags = 0;
594 mutex_init(&ldcp->rcv_thr_lock, NULL, MUTEX_DRIVER, NULL);
595 cv_init(&ldcp->rcv_thr_cv, NULL, CV_DRIVER, NULL);
596 mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
597 cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
598
599 /* required for handshake with peer */
600 ldcp->local_session = (uint64_t)ddi_get_lbolt();
601 ldcp->peer_session = 0;
602 ldcp->session_status = 0;
603 ldcp->hss_id = 1; /* Initial handshake session id */
604 ldcp->hphase = VSW_MILESTONE0;
605
606 (void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
607
608 /* only set for outbound lane, inbound set by peer */
609 vsw_set_lane_attr(vswp, &ldcp->lane_out);
610
611 attr.devclass = LDC_DEV_NT_SVC;
612 attr.instance = ddi_get_instance(vswp->dip);
613 attr.mode = LDC_MODE_UNRELIABLE;
614 attr.mtu = VSW_LDC_MTU;
615 status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
616 if (status != 0) {
617 DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
618 __func__, ldc_id, status);
619 goto ldc_attach_fail;
620 }
621
622 if (vsw_ldc_txthr_enabled) {
623 ldcp->tx_thr_flags = 0;
624 ldcp->tx_mhead = ldcp->tx_mtail = NULL;
625
626 mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
627 cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
628 ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
629 vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
630
631 progress |= PROG_tx_thread;
632 if (ldcp->tx_thread == NULL) {
633 DWARN(vswp, "%s(%lld): Failed to create worker thread",
634 __func__, ldc_id);
635 goto ldc_attach_fail;
636 }
637 }
638
639 status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
640 if (status != 0) {
641 DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
642 __func__, ldc_id, status);
643 (void) ldc_fini(ldcp->ldc_handle);
644 goto ldc_attach_fail;
645 }
646 /*
647 * allocate a message for ldc_read()s, big enough to hold ctrl and
648 * data msgs, including raw data msgs used to recv priority frames.
649 */
650 ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
651 ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
652
653 progress |= PROG_callback;
654
655 mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
656
657 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
658 DERR(vswp, "%s: ldc_status failed", __func__);
659 mutex_destroy(&ldcp->status_lock);
660 goto ldc_attach_fail;
661 }
662
663 ldcp->ldc_status = istatus;
664 ldcp->ldc_port = port;
665 ldcp->ldc_vswp = vswp;
666
667 vsw_reset_vnet_proto_ops(ldcp);
668
669 (void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
670 ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
671 kname, &ldcp->ldc_stats);
672 if (ldcp->ksp == NULL) {
673 DERR(vswp, "%s: kstats setup failed", __func__);
674 goto ldc_attach_fail;
675 }
676
677 /* link it into this port */
678 port->ldcp = ldcp;
679
680 D1(vswp, "%s: exit", __func__);
681 return (0);
682
683 ldc_attach_fail:
684
685 if (progress & PROG_callback) {
686 (void) ldc_unreg_callback(ldcp->ldc_handle);
687 kmem_free(ldcp->ldcmsg, ldcp->msglen);
688 }
689
690 if (progress & PROG_tx_thread) {
691 if (ldcp->tx_thread != NULL) {
692 vsw_stop_tx_thread(ldcp);
693 }
694 mutex_destroy(&ldcp->tx_thr_lock);
695 cv_destroy(&ldcp->tx_thr_cv);
696 }
697 if (ldcp->ksp != NULL) {
698 vgen_destroy_kstats(ldcp->ksp);
699 }
700 mutex_destroy(&ldcp->msg_thr_lock);
701 mutex_destroy(&ldcp->rcv_thr_lock);
702 mutex_destroy(&ldcp->ldc_txlock);
703 mutex_destroy(&ldcp->ldc_rxlock);
704 mutex_destroy(&ldcp->ldc_cblock);
705 mutex_destroy(&ldcp->drain_cv_lock);
706 cv_destroy(&ldcp->msg_thr_cv);
707 cv_destroy(&ldcp->rcv_thr_cv);
708 cv_destroy(&ldcp->drain_cv);
709
710 kmem_free(ldcp, sizeof (vsw_ldc_t));
711
712 return (1);
713 }
714
/*
 * Detach a logical domain channel (ldc) belonging to a
 * particular port.
 *
 * Stops the channel's worker threads, frees mapped lane resources on
 * both lanes, closes the channel (retrying on EAGAIN), destroys all
 * channel locks/cvs and frees the vsw_ldc_t itself.
 */
static void
vsw_ldc_detach(vsw_ldc_t *ldcp)
{
	int		rv;
	vsw_t		*vswp = ldcp->ldc_port->p_vswp;
	int		retries = 0;

	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);

	/* Stop msg/rcv thread (only one of the two is ever running) */
	if (ldcp->rcv_thread != NULL) {
		vsw_stop_rcv_thread(ldcp);
	} else if (ldcp->msg_thread != NULL) {
		vsw_stop_msg_thread(ldcp);
	}
	kmem_free(ldcp->ldcmsg, ldcp->msglen);

	/* Stop the tx thread and discard any frames still queued for it */
	if (ldcp->tx_thread != NULL) {
		vsw_stop_tx_thread(ldcp);
		mutex_destroy(&ldcp->tx_thr_lock);
		cv_destroy(&ldcp->tx_thr_cv);
		if (ldcp->tx_mhead != NULL) {
			freemsgchain(ldcp->tx_mhead);
			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
			ldcp->tx_cnt = 0;
		}
	}

	/* Destroy kstats */
	vgen_destroy_kstats(ldcp->ksp);

	/*
	 * Before we can close the channel we must release any mapped
	 * resources (e.g. drings).
	 */
	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);

	/*
	 * Close the channel, retry on EAGAIN.
	 */
	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
		if (++retries > vsw_ldc_retries) {
			break;
		}
		drv_usecwait(vsw_ldc_delay);
	}
	if (rv != 0) {
		cmn_err(CE_NOTE,
		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
		    vswp->instance, rv, ldcp->ldc_id);
	}

	(void) ldc_fini(ldcp->ldc_handle);

	ldcp->ldc_status = LDC_INIT;
	ldcp->ldc_handle = NULL;
	ldcp->ldc_vswp = NULL;

	mutex_destroy(&ldcp->msg_thr_lock);
	mutex_destroy(&ldcp->rcv_thr_lock);
	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_rxlock);
	mutex_destroy(&ldcp->ldc_cblock);
	mutex_destroy(&ldcp->drain_cv_lock);
	mutex_destroy(&ldcp->status_lock);
	cv_destroy(&ldcp->msg_thr_cv);
	cv_destroy(&ldcp->rcv_thr_cv);
	cv_destroy(&ldcp->drain_cv);

	kmem_free(ldcp, sizeof (vsw_ldc_t));
}
792
793 /*
794 * Open and attempt to bring up the channel. Note that channel
795 * can only be brought up if peer has also opened channel.
796 *
797 * Returns 0 if can open and bring up channel, otherwise
798 * returns 1.
799 */
800 static int
vsw_ldc_init(vsw_ldc_t * ldcp)801 vsw_ldc_init(vsw_ldc_t *ldcp)
802 {
803 vsw_t *vswp = ldcp->ldc_vswp;
804 ldc_status_t istatus = 0;
805 int rv;
806
807 D1(vswp, "%s: enter", __func__);
808
809 LDC_ENTER_LOCK(ldcp);
810
811 /* don't start at 0 in case clients don't like that */
812 ldcp->next_ident = 1;
813
814 rv = ldc_open(ldcp->ldc_handle);
815 if (rv != 0) {
816 DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
817 __func__, ldcp->ldc_id, rv);
818 LDC_EXIT_LOCK(ldcp);
819 return (1);
820 }
821
822 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
823 DERR(vswp, "%s: unable to get status", __func__);
824 LDC_EXIT_LOCK(ldcp);
825 return (1);
826
827 } else if (istatus != LDC_OPEN && istatus != LDC_READY) {
828 DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
829 __func__, ldcp->ldc_id, istatus);
830 LDC_EXIT_LOCK(ldcp);
831 return (1);
832 }
833
834 mutex_enter(&ldcp->status_lock);
835 ldcp->ldc_status = istatus;
836 mutex_exit(&ldcp->status_lock);
837
838 rv = ldc_up(ldcp->ldc_handle);
839 if (rv != 0) {
840 /*
841 * Not a fatal error for ldc_up() to fail, as peer
842 * end point may simply not be ready yet.
843 */
844 D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
845 ldcp->ldc_id, rv);
846 LDC_EXIT_LOCK(ldcp);
847 return (1);
848 }
849
850 /*
851 * ldc_up() call is non-blocking so need to explicitly
852 * check channel status to see if in fact the channel
853 * is UP.
854 */
855 mutex_enter(&ldcp->status_lock);
856 if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
857 DERR(vswp, "%s: unable to get status", __func__);
858 mutex_exit(&ldcp->status_lock);
859 LDC_EXIT_LOCK(ldcp);
860 return (1);
861
862 }
863
864 if (ldcp->ldc_status == LDC_UP) {
865 D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
866 ldcp->ldc_id, istatus);
867 mutex_exit(&ldcp->status_lock);
868 LDC_EXIT_LOCK(ldcp);
869
870 vsw_process_conn_evt(ldcp, VSW_CONN_UP);
871 return (0);
872 }
873
874 mutex_exit(&ldcp->status_lock);
875 LDC_EXIT_LOCK(ldcp);
876
877 D1(vswp, "%s: exit", __func__);
878 return (0);
879 }
880
881 /* disable callbacks on the channel */
882 static void
vsw_ldc_uninit(vsw_ldc_t * ldcp)883 vsw_ldc_uninit(vsw_ldc_t *ldcp)
884 {
885 vsw_t *vswp = ldcp->ldc_vswp;
886 int rv;
887
888 D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
889
890 LDC_ENTER_LOCK(ldcp);
891
892 rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
893 if (rv != 0) {
894 cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
895 "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
896 }
897
898 mutex_enter(&ldcp->status_lock);
899 ldcp->ldc_status = LDC_INIT;
900 mutex_exit(&ldcp->status_lock);
901
902 LDC_EXIT_LOCK(ldcp);
903
904 D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
905 }
906
907 /*
908 * Wait until the callback(s) associated with the ldcs under the specified
909 * port have completed.
910 *
911 * Prior to this function being invoked each channel under this port
912 * should have been quiesced via ldc_set_cb_mode(DISABLE).
913 *
 * A short explanation of what we are doing below:
915 *
916 * The simplest approach would be to have a reference counter in
 * the ldc structure which is incremented/decremented by the callbacks as
918 * they use the channel. The drain function could then simply disable any
919 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
920 * there is a tiny window here - before the callback is able to get the lock
921 * on the channel it is interrupted and this function gets to execute. It
922 * sees that the ref count is zero and believes its free to delete the
923 * associated data structures.
924 *
925 * We get around this by taking advantage of the fact that before the ldc
926 * framework invokes a callback it sets a flag to indicate that there is a
927 * callback active (or about to become active). If when we attempt to
928 * unregister a callback when this active flag is set then the unregister
929 * will fail with EWOULDBLOCK.
930 *
931 * If the unregister fails we do a cv_timedwait. We will either be signaled
932 * by the callback as it is exiting (note we have to wait a short period to
933 * allow the callback to return fully to the ldc framework and it to clear
934 * the active flag), or by the timer expiring. In either case we again attempt
 * the unregister. We repeat this until we can successfully unregister the
936 * callback.
937 *
938 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
939 * the case where the callback has finished but the ldc framework has not yet
940 * cleared the active flag. In this case we would never get a cv_signal.
941 */
static void
vsw_ldc_drain(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * If we can unregister the channel callback then we
	 * know that there is no callback either running or
	 * scheduled to run for this channel so move on to next
	 * channel in the list.
	 */
	mutex_enter(&ldcp->drain_cv_lock);

	/* prompt active callbacks to quit */
	ldcp->drain_state = VSW_LDC_DRAINING;

	if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
		/* no callback active or pending: drained already */
		D2(vswp, "%s: unreg callback for chan %ld", __func__,
		    ldcp->ldc_id);
		mutex_exit(&ldcp->drain_cv_lock);
	} else {
		/*
		 * If we end up here we know that either 1) a callback
		 * is currently executing, 2) is about to start (i.e.
		 * the ldc framework has set the active flag but
		 * has not actually invoked the callback yet, or 3)
		 * has finished and has returned to the ldc framework
		 * but the ldc framework has not yet cleared the
		 * active bit.
		 *
		 * Wait for it to finish.
		 */
		while (ldc_unreg_callback(ldcp->ldc_handle) == EWOULDBLOCK) {
			/*
			 * Timed wait (1 sec): the exiting callback
			 * signals drain_cv, but the timeout covers the
			 * window where the callback has finished and
			 * the framework has not yet cleared the active
			 * flag (no signal would ever arrive).
			 */
			(void) cv_timedwait(&ldcp->drain_cv,
			    &ldcp->drain_cv_lock, ddi_get_lbolt() + hz);
		}

		mutex_exit(&ldcp->drain_cv_lock);
		D2(vswp, "%s: unreg callback for chan %ld after "
		    "timeout", __func__, ldcp->ldc_id);
	}

	D1(vswp, "%s: exit", __func__);
}
988
989 /*
990 * Wait until all tasks which reference this port have completed.
991 *
992 * Prior to this function being invoked each channel under this port
993 * should have been quiesced via ldc_set_cb_mode(DISABLE).
994 */
995 static void
vsw_drain_port_taskq(vsw_port_t * port)996 vsw_drain_port_taskq(vsw_port_t *port)
997 {
998 vsw_t *vswp = port->p_vswp;
999
1000 D1(vswp, "%s: enter", __func__);
1001
1002 /*
1003 * Mark the port as in the process of being detached, and
1004 * dispatch a marker task to the queue so we know when all
1005 * relevant tasks have completed.
1006 */
1007 mutex_enter(&port->state_lock);
1008 port->state = VSW_PORT_DETACHING;
1009
1010 if ((vswp->taskq_p == NULL) ||
1011 (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1012 port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1013 cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
1014 vswp->instance);
1015 mutex_exit(&port->state_lock);
1016 return;
1017 }
1018
1019 /*
1020 * Wait for the marker task to finish.
1021 */
1022 while (port->state != VSW_PORT_DETACHABLE)
1023 cv_wait(&port->state_cv, &port->state_lock);
1024
1025 mutex_exit(&port->state_lock);
1026
1027 D1(vswp, "%s: exit", __func__);
1028 }
1029
1030 static void
vsw_marker_task(void * arg)1031 vsw_marker_task(void *arg)
1032 {
1033 vsw_port_t *port = arg;
1034 vsw_t *vswp = port->p_vswp;
1035
1036 D1(vswp, "%s: enter", __func__);
1037
1038 mutex_enter(&port->state_lock);
1039
1040 /*
1041 * No further tasks should be dispatched which reference
1042 * this port so ok to mark it as safe to detach.
1043 */
1044 port->state = VSW_PORT_DETACHABLE;
1045
1046 cv_signal(&port->state_cv);
1047
1048 mutex_exit(&port->state_lock);
1049
1050 D1(vswp, "%s: exit", __func__);
1051 }
1052
1053 vsw_port_t *
vsw_lookup_port(vsw_t * vswp,int p_instance)1054 vsw_lookup_port(vsw_t *vswp, int p_instance)
1055 {
1056 vsw_port_list_t *plist = &vswp->plist;
1057 vsw_port_t *port;
1058
1059 for (port = plist->head; port != NULL; port = port->p_next) {
1060 if (port->p_instance == p_instance) {
1061 D2(vswp, "vsw_lookup_port: found p_instance\n");
1062 return (port);
1063 }
1064 }
1065
1066 return (NULL);
1067 }
1068
1069 void
vsw_vlan_unaware_port_reset(vsw_port_t * portp)1070 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1071 {
1072 vsw_ldc_t *ldcp = portp->ldcp;
1073
1074 mutex_enter(&ldcp->ldc_cblock);
1075
1076 /*
1077 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1078 * the connection. See comments in vsw_set_vnet_proto_ops().
1079 */
1080 if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1081 portp->nvids != 0) {
1082 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1083 }
1084
1085 mutex_exit(&ldcp->ldc_cblock);
1086 }
1087
1088 void
vsw_hio_port_reset(vsw_port_t * portp,boolean_t immediate)1089 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1090 {
1091 vsw_ldc_t *ldcp = portp->ldcp;
1092
1093 mutex_enter(&ldcp->ldc_cblock);
1094
1095 /*
1096 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1097 * to trigger re-negotiation, which inturn trigger HybridIO
1098 * setup/cleanup.
1099 */
1100 if ((ldcp->hphase == VSW_MILESTONE4) &&
1101 (portp->p_hio_capable == B_TRUE)) {
1102 if (immediate == B_TRUE) {
1103 (void) ldc_down(ldcp->ldc_handle);
1104 } else {
1105 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1106 }
1107 }
1108
1109 mutex_exit(&ldcp->ldc_cblock);
1110 }
1111
1112 void
vsw_port_reset(vsw_port_t * portp)1113 vsw_port_reset(vsw_port_t *portp)
1114 {
1115 vsw_ldc_t *ldcp = portp->ldcp;
1116
1117 mutex_enter(&ldcp->ldc_cblock);
1118
1119 /*
1120 * reset channel and terminate the connection.
1121 */
1122 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1123
1124 mutex_exit(&ldcp->ldc_cblock);
1125 }
1126
1127 void
vsw_reset_ports(vsw_t * vswp)1128 vsw_reset_ports(vsw_t *vswp)
1129 {
1130 vsw_port_list_t *plist = &vswp->plist;
1131 vsw_port_t *portp;
1132
1133 READ_ENTER(&plist->lockrw);
1134 for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1135 if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1136 vsw_hio_stop_port(portp);
1137 }
1138 vsw_port_reset(portp);
1139 }
1140 RW_EXIT(&plist->lockrw);
1141 }
1142
1143 static void
vsw_send_physlink_msg(vsw_ldc_t * ldcp,link_state_t plink_state)1144 vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
1145 {
1146 vnet_physlink_msg_t msg;
1147 vnet_physlink_msg_t *msgp = &msg;
1148 uint32_t physlink_info = 0;
1149
1150 if (plink_state == LINK_STATE_UP) {
1151 physlink_info |= VNET_PHYSLINK_STATE_UP;
1152 } else {
1153 physlink_info |= VNET_PHYSLINK_STATE_DOWN;
1154 }
1155
1156 msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
1157 msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
1158 msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
1159 msgp->tag.vio_sid = ldcp->local_session;
1160 msgp->physlink_info = physlink_info;
1161
1162 (void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
1163 }
1164
1165 static void
vsw_port_physlink_update(vsw_port_t * portp)1166 vsw_port_physlink_update(vsw_port_t *portp)
1167 {
1168 vsw_ldc_t *ldcp;
1169 vsw_t *vswp;
1170
1171 vswp = portp->p_vswp;
1172 ldcp = portp->ldcp;
1173
1174 mutex_enter(&ldcp->ldc_cblock);
1175
1176 /*
1177 * If handshake has completed successfully and if the vnet device
1178 * has negotiated to get physical link state updates, send a message
1179 * with the current state.
1180 */
1181 if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
1182 vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
1183 }
1184
1185 mutex_exit(&ldcp->ldc_cblock);
1186 }
1187
1188 void
vsw_physlink_update_ports(vsw_t * vswp)1189 vsw_physlink_update_ports(vsw_t *vswp)
1190 {
1191 vsw_port_list_t *plist = &vswp->plist;
1192 vsw_port_t *portp;
1193
1194 READ_ENTER(&plist->lockrw);
1195 for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1196 vsw_port_physlink_update(portp);
1197 }
1198 RW_EXIT(&plist->lockrw);
1199 }
1200
1201 /*
1202 * Search for and remove the specified port from the port
1203 * list. Returns 0 if able to locate and remove port, otherwise
1204 * returns 1.
1205 */
1206 static int
vsw_plist_del_node(vsw_t * vswp,vsw_port_t * port)1207 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1208 {
1209 vsw_port_list_t *plist = &vswp->plist;
1210 vsw_port_t *curr_p, *prev_p;
1211
1212 if (plist->head == NULL)
1213 return (1);
1214
1215 curr_p = prev_p = plist->head;
1216
1217 while (curr_p != NULL) {
1218 if (curr_p == port) {
1219 if (prev_p == curr_p) {
1220 plist->head = curr_p->p_next;
1221 } else {
1222 prev_p->p_next = curr_p->p_next;
1223 }
1224 plist->num_ports--;
1225 break;
1226 } else {
1227 prev_p = curr_p;
1228 curr_p = curr_p->p_next;
1229 }
1230 }
1231 return (0);
1232 }
1233
1234 /*
1235 * Interrupt handler for ldc messages.
1236 */
1237 static uint_t
vsw_ldc_cb(uint64_t event,caddr_t arg)1238 vsw_ldc_cb(uint64_t event, caddr_t arg)
1239 {
1240 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
1241 vsw_t *vswp = ldcp->ldc_vswp;
1242
1243 D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1244
1245 mutex_enter(&ldcp->ldc_cblock);
1246 ldcp->ldc_stats.callbacks++;
1247
1248 mutex_enter(&ldcp->status_lock);
1249 if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1250 mutex_exit(&ldcp->status_lock);
1251 mutex_exit(&ldcp->ldc_cblock);
1252 return (LDC_SUCCESS);
1253 }
1254 mutex_exit(&ldcp->status_lock);
1255
1256 if (event & LDC_EVT_UP) {
1257 /*
1258 * Channel has come up.
1259 */
1260 D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1261 __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1262
1263 vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1264
1265 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1266 }
1267
1268 if (event & LDC_EVT_READ) {
1269 /*
1270 * Data available for reading.
1271 */
1272 D2(vswp, "%s: id(ld) event(%llx) data READ",
1273 __func__, ldcp->ldc_id, event);
1274
1275 vsw_process_evt_read(ldcp);
1276
1277 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1278
1279 goto vsw_cb_exit;
1280 }
1281
1282 if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1283 D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1284 __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1285
1286 vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1287 }
1288
1289 /*
1290 * Catch either LDC_EVT_WRITE which we don't support or any
1291 * unknown event.
1292 */
1293 if (event &
1294 ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1295 DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1296 __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1297 }
1298
1299 vsw_cb_exit:
1300 mutex_exit(&ldcp->ldc_cblock);
1301
1302 /*
1303 * Let the drain function know we are finishing if it
1304 * is waiting.
1305 */
1306 mutex_enter(&ldcp->drain_cv_lock);
1307 if (ldcp->drain_state == VSW_LDC_DRAINING)
1308 cv_signal(&ldcp->drain_cv);
1309 mutex_exit(&ldcp->drain_cv_lock);
1310
1311 return (LDC_SUCCESS);
1312 }
1313
1314 /*
1315 * Reinitialise data structures associated with the channel.
1316 */
1317 static void
vsw_ldc_reinit(vsw_ldc_t * ldcp)1318 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1319 {
1320 vsw_t *vswp = ldcp->ldc_vswp;
1321 vsw_port_t *port;
1322
1323 D1(vswp, "%s: enter", __func__);
1324
1325 port = ldcp->ldc_port;
1326
1327 D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1328 ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1329
1330 vsw_free_lane_resources(ldcp, INBOUND);
1331 vsw_free_lane_resources(ldcp, OUTBOUND);
1332
1333 ldcp->lane_in.lstate = 0;
1334 ldcp->lane_out.lstate = 0;
1335
1336 /*
1337 * Remove parent port from any multicast groups
1338 * it may have registered with. Client must resend
1339 * multicast add command after handshake completes.
1340 */
1341 vsw_del_mcst_port(port);
1342
1343 ldcp->peer_session = 0;
1344 ldcp->session_status = 0;
1345 ldcp->hcnt = 0;
1346 ldcp->hphase = VSW_MILESTONE0;
1347
1348 vsw_reset_vnet_proto_ops(ldcp);
1349
1350 D1(vswp, "%s: exit", __func__);
1351 }
1352
1353 /*
1354 * Process a connection event.
1355 */
void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_conn_evt_t	*conn = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check if either a reset or restart event is pending
	 * or in progress. If so just return.
	 *
	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
	 * being received by the callback handler, or a ECONNRESET error
	 * code being returned from a ldc_read() or ldc_write() call.
	 *
	 * A VSW_CONN_RESTART event occurs when some error checking code
	 * decides that there is a problem with data from the channel,
	 * and that the handshake should be restarted.
	 *
	 * ldstub() is an atomic test-and-set on reset_active, so only
	 * one thread can win the race to dispatch a reset/restart task;
	 * the flag is cleared by vsw_conn_task() (or below on failure).
	 */
	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
	    (ldstub((uint8_t *)&ldcp->reset_active)))
		return;

	/*
	 * If it is an LDC_UP event we first check the recorded
	 * state of the channel. If this is UP then we know that
	 * the channel moving to the UP state has already been dealt
	 * with and don't need to dispatch a new task.
	 *
	 * The reason for this check is that when we do a ldc_up(),
	 * depending on the state of the peer, we may or may not get
	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
	 * every time we do ldc_up() we explicitly check the channel
	 * status to see has it come up (ldc_up() is asynch and will
	 * complete at some undefined time), and take the appropriate
	 * action.
	 *
	 * The flip side of this is that we may get a LDC_UP event
	 * when we have already seen that the channel is up and have
	 * dealt with that.
	 */
	mutex_enter(&ldcp->status_lock);
	if (evt == VSW_CONN_UP) {
		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
			mutex_exit(&ldcp->status_lock);
			return;
		}
	}
	mutex_exit(&ldcp->status_lock);

	/*
	 * The transaction group id allows us to identify and discard
	 * any tasks which are still pending on the taskq and refer
	 * to the handshake session we are about to restart or reset.
	 * These stale messages no longer have any real meaning.
	 */
	(void) atomic_inc_32(&ldcp->hss_id);

	ASSERT(vswp->taskq_p != NULL);

	/* NOSLEEP: we may be called from interrupt (callback) context. */
	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
		    " connection event", vswp->instance);
		goto err_exit;
	}

	conn->evt = evt;
	conn->ldcp = ldcp;

	/* vsw_conn_task() takes ownership of conn and frees it. */
	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
	    DDI_NOSLEEP) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
		    vswp->instance);

		kmem_free(conn, sizeof (vsw_conn_evt_t));
		goto err_exit;
	}

	D1(vswp, "%s: exit", __func__);
	return;

err_exit:
	/*
	 * Have most likely failed due to memory shortage. Clear the flag so
	 * that future requests will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;
}
1447
1448 /*
1449 * Deal with events relating to a connection. Invoked from a taskq.
1450 */
static void
vsw_conn_task(void *arg)
{
	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
	vsw_ldc_t	*ldcp = NULL;
	vsw_port_t	*portp;
	vsw_t		*vswp = NULL;
	uint16_t	evt;
	ldc_status_t	curr_status;

	ldcp = conn->ldcp;
	evt = conn->evt;
	vswp = ldcp->ldc_vswp;
	portp = ldcp->ldc_port;

	D1(vswp, "%s: enter", __func__);

	/* can safely free now have copied out data */
	kmem_free(conn, sizeof (vsw_conn_evt_t));

	/* Quiesce whichever worker thread is servicing this channel. */
	if (ldcp->rcv_thread != NULL) {
		vsw_stop_rcv_thread(ldcp);
	} else if (ldcp->msg_thread != NULL) {
		vsw_stop_msg_thread(ldcp);
	}

	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/*
	 * If we wish to restart the handshake on this channel, then if
	 * the channel is UP we bring it DOWN to flush the underlying
	 * ldc queue.
	 */
	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
		(void) ldc_down(ldcp->ldc_handle);

	/* Tear down HybridIO state before re-negotiating. */
	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
		vsw_hio_stop(vswp, ldcp);
	}

	/*
	 * re-init all the associated data structures.
	 */
	vsw_ldc_reinit(ldcp);

	/*
	 * Bring the channel back up (note it does no harm to
	 * do this even if the channel is already UP, Just
	 * becomes effectively a no-op).
	 */
	(void) ldc_up(ldcp->ldc_handle);

	/*
	 * Check if channel is now UP. This will only happen if
	 * peer has also done a ldc_up().
	 */
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	ldcp->ldc_status = curr_status;

	/* channel UP so restart handshake by sending version info */
	if (curr_status == LDC_UP) {
		/*
		 * Give up after vsw_num_handshakes attempts; hcnt is
		 * reset when a handshake finally completes.
		 */
		if (ldcp->hcnt++ > vsw_num_handshakes) {
			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
			    " handshake attempts (%d) on channel %ld",
			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
			mutex_exit(&ldcp->status_lock);
			return;
		}

		if (vsw_obp_ver_proto_workaround == B_FALSE &&
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
		    DDI_NOSLEEP) != DDI_SUCCESS)) {
			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
			    vswp->instance);

			/*
			 * Don't count as valid restart attempt if couldn't
			 * send version msg.
			 */
			if (ldcp->hcnt > 0)
				ldcp->hcnt--;
		}
	}

	/*
	 * Mark that the process is complete by clearing the flag.
	 *
	 * Note is it possible that the taskq dispatch above may have failed,
	 * most likely due to memory shortage. We still clear the flag so
	 * future attempts will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;

	mutex_exit(&ldcp->status_lock);

	D1(vswp, "%s: exit", __func__);
}
1562
1563 /*
1564 * returns 0 if legal for event signified by flag to have
 * occurred at the time it did. Otherwise returns 1.
1566 */
1567 int
vsw_check_flag(vsw_ldc_t * ldcp,int dir,uint64_t flag)1568 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1569 {
1570 vsw_t *vswp = ldcp->ldc_vswp;
1571 uint64_t state;
1572 uint64_t phase;
1573
1574 if (dir == INBOUND)
1575 state = ldcp->lane_in.lstate;
1576 else
1577 state = ldcp->lane_out.lstate;
1578
1579 phase = ldcp->hphase;
1580
1581 switch (flag) {
1582 case VSW_VER_INFO_RECV:
1583 if (phase > VSW_MILESTONE0) {
1584 DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1585 " when in state %d\n", ldcp->ldc_id, phase);
1586 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1587 return (1);
1588 }
1589 break;
1590
1591 case VSW_VER_ACK_RECV:
1592 case VSW_VER_NACK_RECV:
1593 if (!(state & VSW_VER_INFO_SENT)) {
1594 DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1595 "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1596 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1597 return (1);
1598 } else
1599 state &= ~VSW_VER_INFO_SENT;
1600 break;
1601
1602 case VSW_ATTR_INFO_RECV:
1603 if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1604 DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1605 " when in state %d\n", ldcp->ldc_id, phase);
1606 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1607 return (1);
1608 }
1609 break;
1610
1611 case VSW_ATTR_ACK_RECV:
1612 case VSW_ATTR_NACK_RECV:
1613 if (!(state & VSW_ATTR_INFO_SENT)) {
1614 DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1615 " or ATTR_NACK when in state %d\n",
1616 ldcp->ldc_id, phase);
1617 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1618 return (1);
1619 } else
1620 state &= ~VSW_ATTR_INFO_SENT;
1621 break;
1622
1623 case VSW_DRING_INFO_RECV:
1624 if (phase < VSW_MILESTONE1) {
1625 DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1626 " when in state %d\n", ldcp->ldc_id, phase);
1627 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1628 return (1);
1629 }
1630 break;
1631
1632 case VSW_DRING_ACK_RECV:
1633 case VSW_DRING_NACK_RECV:
1634 if (!(state & VSW_DRING_INFO_SENT)) {
1635 DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1636 " or DRING_NACK when in state %d\n",
1637 ldcp->ldc_id, phase);
1638 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1639 return (1);
1640 } else
1641 state &= ~VSW_DRING_INFO_SENT;
1642 break;
1643
1644 case VSW_RDX_INFO_RECV:
1645 if (phase < VSW_MILESTONE3) {
1646 DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1647 " when in state %d\n", ldcp->ldc_id, phase);
1648 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1649 return (1);
1650 }
1651 break;
1652
1653 case VSW_RDX_ACK_RECV:
1654 case VSW_RDX_NACK_RECV:
1655 if (!(state & VSW_RDX_INFO_SENT)) {
1656 DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1657 "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1658 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1659 return (1);
1660 } else
1661 state &= ~VSW_RDX_INFO_SENT;
1662 break;
1663
1664 case VSW_MCST_INFO_RECV:
1665 if (phase < VSW_MILESTONE3) {
1666 DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1667 " when in state %d\n", ldcp->ldc_id, phase);
1668 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1669 return (1);
1670 }
1671 break;
1672
1673 default:
1674 DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1675 ldcp->ldc_id, flag);
1676 return (1);
1677 }
1678
1679 if (dir == INBOUND)
1680 ldcp->lane_in.lstate = state;
1681 else
1682 ldcp->lane_out.lstate = state;
1683
1684 D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1685
1686 return (0);
1687 }
1688
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*portp = ldcp->ldc_port;
	lane_t		*lane_out = &ldcp->lane_out;
	lane_t		*lane_in = &ldcp->lane_in;

	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);

	DUMP_FLAGS(lane_in->lstate);
	DUMP_FLAGS(lane_out->lstate);

	/*
	 * Advance the handshake state machine (hphase) when the lane
	 * state shows the current milestone's exchange has completed.
	 */
	switch (ldcp->hphase) {

	case VSW_MILESTONE0:
		/*
		 * If we haven't started to handshake with our peer,
		 * start to do so now.
		 */
		if (lane_out->lstate == 0) {
			D2(vswp, "%s: (chan %lld) starting handshake "
			    "with peer", __func__, ldcp->ldc_id);
			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		}

		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated version info.
		 */
		if ((lane_in->lstate & VSW_VER_ACK_SENT) &&
		    (lane_out->lstate & VSW_VER_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 0",
			    __func__, ldcp->ldc_id);

			/* Bind protocol handlers to the agreed version. */
			vsw_set_vnet_proto_ops(ldcp);

			/*
			 * Next milestone is passed when attribute
			 * information has been successfully exchanged.
			 */
			ldcp->hphase = VSW_MILESTONE1;
			vsw_send_attr(ldcp);

		}
		break;

	case VSW_MILESTONE1:
		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated attribute information, in both directions.
		 */
		if (!((lane_in->lstate & VSW_ATTR_ACK_SENT) &&
		    (lane_out->lstate & VSW_ATTR_ACK_RECV))) {
			break;
		}

		ldcp->hphase = VSW_MILESTONE2;

		/*
		 * If the peer device has said it wishes to
		 * use descriptor rings then we send it our ring
		 * info, otherwise we just set up a private ring
		 * which we use an internal buffer
		 */
		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    (VSW_VER_LT(ldcp, 1, 2) &&
		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
			vsw_send_dring_info(ldcp);
			break;
		}

		/*
		 * The peer doesn't operate in dring mode; we
		 * can simply fallthru to the RDX phase from
		 * here.
		 */
		/*FALLTHRU*/

	case VSW_MILESTONE2:
		/*
		 * If peer has indicated in its attribute message that
		 * it wishes to use descriptor rings then the only way
		 * to pass this milestone is for us to have received
		 * valid dring info.
		 *
		 * If peer is not using descriptor rings then just fall
		 * through.
		 */
		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    (VSW_VER_LT(ldcp, 1, 2) &&
		    (lane_in->xfer_mode ==
		    VIO_DRING_MODE_V1_0))) {
			if (!(lane_in->lstate & VSW_DRING_ACK_SENT))
				break;
		}

		D2(vswp, "%s: (chan %lld) leaving milestone 2",
		    __func__, ldcp->ldc_id);

		ldcp->hphase = VSW_MILESTONE3;
		vsw_send_rdx(ldcp);
		break;

	case VSW_MILESTONE3:
		/*
		 * Pass this milestone when all parameters have been
		 * successfully exchanged and RDX sent in both directions.
		 *
		 * Mark the relevant lane as available to transmit data. In
		 * RxDringData mode, lane_in is associated with transmit and
		 * lane_out is associated with receive. It is the reverse in
		 * TxDring mode.
		 */
		if ((lane_out->lstate & VSW_RDX_ACK_SENT) &&
		    (lane_in->lstate & VSW_RDX_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 3",
			    __func__, ldcp->ldc_id);
			D2(vswp, "%s: ** handshake complete (0x%llx : "
			    "0x%llx) **", __func__, lane_in->lstate,
			    lane_out->lstate);
			if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
				lane_in->lstate |= VSW_LANE_ACTIVE;
			} else {
				lane_out->lstate |= VSW_LANE_ACTIVE;
			}
			ldcp->hphase = VSW_MILESTONE4;
			ldcp->hcnt = 0;
			DISPLAY_STATE();
			/* Start HIO if enabled and capable */
			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
				D2(vswp, "%s: start HybridIO setup", __func__);
				vsw_hio_start(vswp, ldcp);
			}

			if (ldcp->pls_negotiated == B_TRUE) {
				/*
				 * The vnet device has negotiated to get phys
				 * link updates. Now that the handshake with
				 * the vnet device is complete, send an initial
				 * update with the current physical link state.
				 */
				vsw_send_physlink_msg(ldcp,
				    vswp->phys_link_state);
			}

		} else {
			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
			    __func__, lane_in->lstate,
			    lane_out->lstate);
		}
		break;

	case VSW_MILESTONE4:
		/* Handshake already complete; nothing more to advance. */
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
	    ldcp->hphase);
}
1860
1861 /*
1862 * Check if major version is supported.
1863 *
1864 * Returns 0 if finds supported major number, and if necessary
1865 * adjusts the minor field.
1866 *
1867 * Returns 1 if can't match major number exactly. Sets mjor/minor
1868 * to next lowest support values, or to zero if no other values possible.
1869 */
1870 static int
vsw_supported_version(vio_ver_msg_t * vp)1871 vsw_supported_version(vio_ver_msg_t *vp)
1872 {
1873 int i;
1874
1875 D1(NULL, "vsw_supported_version: enter");
1876
1877 for (i = 0; i < VSW_NUM_VER; i++) {
1878 if (vsw_versions[i].ver_major == vp->ver_major) {
1879 /*
1880 * Matching or lower major version found. Update
1881 * minor number if necessary.
1882 */
1883 if (vp->ver_minor > vsw_versions[i].ver_minor) {
1884 D2(NULL, "%s: adjusting minor value from %d "
1885 "to %d", __func__, vp->ver_minor,
1886 vsw_versions[i].ver_minor);
1887 vp->ver_minor = vsw_versions[i].ver_minor;
1888 }
1889
1890 return (0);
1891 }
1892
1893 /*
1894 * If the message contains a higher major version number, set
1895 * the message's major/minor versions to the current values
1896 * and return false, so this message will get resent with
1897 * these values.
1898 */
1899 if (vsw_versions[i].ver_major < vp->ver_major) {
1900 D2(NULL, "%s: adjusting major and minor "
1901 "values to %d, %d\n",
1902 __func__, vsw_versions[i].ver_major,
1903 vsw_versions[i].ver_minor);
1904 vp->ver_major = vsw_versions[i].ver_major;
1905 vp->ver_minor = vsw_versions[i].ver_minor;
1906 return (1);
1907 }
1908 }
1909
1910 /* No match was possible, zero out fields */
1911 vp->ver_major = 0;
1912 vp->ver_minor = 0;
1913
1914 D1(NULL, "vsw_supported_version: exit");
1915
1916 return (1);
1917 }
1918
1919 /*
1920 * Set vnet-protocol-version dependent functions based on version.
1921 */
1922 static void
vsw_set_vnet_proto_ops(vsw_ldc_t * ldcp)1923 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1924 {
1925 vsw_t *vswp = ldcp->ldc_vswp;
1926 lane_t *lp = &ldcp->lane_out;
1927
1928 /*
1929 * Setup the appropriate dring data processing routine and any
1930 * associated thread based on the version.
1931 *
1932 * In versions < 1.6, we support only TxDring mode. In this mode, the
1933 * msg worker thread processes all types of VIO msgs (ctrl and data).
1934 *
1935 * In versions >= 1.6, we also support RxDringData mode. In this mode,
1936 * the rcv worker thread processes dring data messages (msgtype:
1937 * VIO_TYPE_DATA, subtype: VIO_SUBTYPE_INFO, env: VIO_DRING_DATA). The
1938 * rest of the data messages (including acks) and ctrl messages are
1939 * handled directly by the callback (intr) thread.
1940 *
1941 * However, for versions >= 1.6, we could still fallback to TxDring
1942 * mode. This could happen if RxDringData mode has been disabled (see
1943 * below) on this guest or on the peer guest. This info is determined
1944 * as part of attr exchange phase of handshake. Hence, we setup these
1945 * pointers for v1.6 after attr msg phase completes during handshake.
1946 */
1947 if (VSW_VER_GTEQ(ldcp, 1, 6)) {
1948 /*
1949 * Set data dring mode for vsw_send_attr(). We setup msg worker
1950 * thread in TxDring mode or rcv worker thread in RxDringData
1951 * mode when attr phase of handshake completes.
1952 */
1953 if (vsw_mapin_avail(ldcp) == B_TRUE) {
1954 lp->dring_mode = (VIO_RX_DRING_DATA | VIO_TX_DRING);
1955 } else {
1956 lp->dring_mode = VIO_TX_DRING;
1957 }
1958 } else {
1959 lp->dring_mode = VIO_TX_DRING;
1960 }
1961
1962 /*
1963 * Setup the MTU for attribute negotiation based on the version.
1964 */
1965 if (VSW_VER_GTEQ(ldcp, 1, 4)) {
1966 /*
1967 * If the version negotiated with peer is >= 1.4(Jumbo Frame
1968 * Support), set the mtu in our attributes to max_frame_size.
1969 */
1970 lp->mtu = vswp->max_frame_size;
1971 } else if (VSW_VER_EQ(ldcp, 1, 3)) {
1972 /*
1973 * If the version negotiated with peer is == 1.3 (Vlan Tag
1974 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
1975 */
1976 lp->mtu = ETHERMAX + VLAN_TAGSZ;
1977 } else {
1978 vsw_port_t *portp = ldcp->ldc_port;
1979 /*
1980 * Pre-1.3 peers expect max frame size of ETHERMAX.
1981 * We can negotiate that size with those peers provided only
1982 * pvid is defined for our peer and there are no vids. Then we
1983 * can send/recv only untagged frames of max size ETHERMAX.
1984 * Note that pvid of the peer can be different, as vsw has to
1985 * serve the vnet in that vlan even if itself is not assigned
1986 * to that vlan.
1987 */
1988 if (portp->nvids == 0) {
1989 lp->mtu = ETHERMAX;
1990 }
1991 }
1992
1993 /*
1994 * Setup version dependent data processing functions.
1995 */
1996 if (VSW_VER_GTEQ(ldcp, 1, 2)) {
1997 /* Versions >= 1.2 */
1998
1999 if (VSW_PRI_ETH_DEFINED(vswp)) {
2000 /*
2001 * enable priority routines and pkt mode only if
2002 * at least one pri-eth-type is specified in MD.
2003 */
2004 ldcp->tx = vsw_ldctx_pri;
2005 ldcp->rx_pktdata = vsw_process_pkt_data;
2006
2007 /* set xfer mode for vsw_send_attr() */
2008 lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2009 } else {
2010 /* no priority eth types defined in MD */
2011
2012 ldcp->tx = vsw_ldctx;
2013 ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2014
2015 /* set xfer mode for vsw_send_attr() */
2016 lp->xfer_mode = VIO_DRING_MODE_V1_2;
2017 }
2018
2019 } else {
2020 /* Versions prior to 1.2 */
2021
2022 vsw_reset_vnet_proto_ops(ldcp);
2023 }
2024 }
2025
2026 /*
2027 * Reset vnet-protocol-version dependent functions to v1.0.
2028 */
2029 static void
vsw_reset_vnet_proto_ops(vsw_ldc_t * ldcp)2030 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2031 {
2032 lane_t *lp = &ldcp->lane_out;
2033
2034 ldcp->tx = vsw_ldctx;
2035 ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2036
2037 /* set xfer mode for vsw_send_attr() */
2038 lp->xfer_mode = VIO_DRING_MODE_V1_0;
2039 }
2040
2041 static void
vsw_process_evt_read(vsw_ldc_t * ldcp)2042 vsw_process_evt_read(vsw_ldc_t *ldcp)
2043 {
2044 if (ldcp->msg_thread != NULL) {
2045 /*
2046 * TxDring mode; wakeup message worker
2047 * thread to process the VIO messages.
2048 */
2049 mutex_exit(&ldcp->ldc_cblock);
2050 mutex_enter(&ldcp->msg_thr_lock);
2051 if (!(ldcp->msg_thr_flags & VSW_WTHR_DATARCVD)) {
2052 ldcp->msg_thr_flags |= VSW_WTHR_DATARCVD;
2053 cv_signal(&ldcp->msg_thr_cv);
2054 }
2055 mutex_exit(&ldcp->msg_thr_lock);
2056 mutex_enter(&ldcp->ldc_cblock);
2057 } else {
2058 /*
2059 * We invoke vsw_process_pkt() in the context of the LDC
2060 * callback (vsw_ldc_cb()) during handshake, until the dring
2061 * mode is negotiated. After the dring mode is negotiated, the
2062 * msgs are processed by the msg worker thread (above case) if
2063 * the dring mode is TxDring. Otherwise (in RxDringData mode)
2064 * we continue to process the msgs directly in the callback
2065 * context.
2066 */
2067 vsw_process_pkt(ldcp);
2068 }
2069 }
2070
2071 /*
2072 * Main routine for processing messages received over LDC.
2073 */
2074 void
vsw_process_pkt(void * arg)2075 vsw_process_pkt(void *arg)
2076 {
2077 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
2078 vsw_t *vswp = ldcp->ldc_vswp;
2079 size_t msglen;
2080 vio_msg_tag_t *tagp;
2081 uint64_t *ldcmsg;
2082 int rv = 0;
2083
2084
2085 D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2086
2087 ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2088
2089 ldcmsg = ldcp->ldcmsg;
2090 /*
2091 * If channel is up read messages until channel is empty.
2092 */
2093 do {
2094 msglen = ldcp->msglen;
2095 rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2096
2097 if (rv != 0) {
2098 DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2099 __func__, ldcp->ldc_id, rv, msglen);
2100 }
2101
2102 /* channel has been reset */
2103 if (rv == ECONNRESET) {
2104 vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2105 break;
2106 }
2107
2108 if (msglen == 0) {
2109 D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2110 ldcp->ldc_id);
2111 break;
2112 }
2113
2114 D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2115 ldcp->ldc_id, msglen);
2116
2117 /*
2118 * Figure out what sort of packet we have gotten by
2119 * examining the msg tag, and then switch it appropriately.
2120 */
2121 tagp = (vio_msg_tag_t *)ldcmsg;
2122
2123 switch (tagp->vio_msgtype) {
2124 case VIO_TYPE_CTRL:
2125 vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp, msglen);
2126 break;
2127 case VIO_TYPE_DATA:
2128 vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2129 break;
2130 case VIO_TYPE_ERR:
2131 vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2132 break;
2133 default:
2134 DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2135 "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2136 break;
2137 }
2138 } while (msglen);
2139
2140 D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2141 }
2142
2143 /*
2144 * Dispatch a task to process a VIO control message.
2145 */
2146 static void
vsw_dispatch_ctrl_task(vsw_ldc_t * ldcp,void * cpkt,vio_msg_tag_t * tagp,int msglen)2147 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp,
2148 int msglen)
2149 {
2150 vsw_ctrl_task_t *ctaskp = NULL;
2151 vsw_port_t *port = ldcp->ldc_port;
2152 vsw_t *vswp = port->p_vswp;
2153
2154 D1(vswp, "%s: enter", __func__);
2155
2156 /*
2157 * We need to handle RDX ACK messages in-band as once they
2158 * are exchanged it is possible that we will get an
2159 * immediate (legitimate) data packet.
2160 */
2161 if ((tagp->vio_subtype_env == VIO_RDX) &&
2162 (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2163
2164 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2165 return;
2166
2167 ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2168 D2(vswp, "%s (%ld) handling RDX_ACK in place "
2169 "(ostate 0x%llx : hphase %d)", __func__,
2170 ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2171 vsw_next_milestone(ldcp);
2172 return;
2173 }
2174
2175 ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2176
2177 if (ctaskp == NULL) {
2178 DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2179 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2180 return;
2181 }
2182
2183 ctaskp->ldcp = ldcp;
2184 bcopy((def_msg_t *)cpkt, &ctaskp->pktp, msglen);
2185 ctaskp->hss_id = ldcp->hss_id;
2186
2187 /*
2188 * Dispatch task to processing taskq if port is not in
2189 * the process of being detached.
2190 */
2191 mutex_enter(&port->state_lock);
2192 if (port->state == VSW_PORT_INIT) {
2193 if ((vswp->taskq_p == NULL) ||
2194 (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2195 ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2196 mutex_exit(&port->state_lock);
2197 DERR(vswp, "%s: unable to dispatch task to taskq",
2198 __func__);
2199 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2200 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2201 return;
2202 }
2203 } else {
2204 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2205 DWARN(vswp, "%s: port %d detaching, not dispatching "
2206 "task", __func__, port->p_instance);
2207 }
2208
2209 mutex_exit(&port->state_lock);
2210
2211 D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2212 ldcp->ldc_id);
2213 D1(vswp, "%s: exit", __func__);
2214 }
2215
2216 /*
2217 * Process a VIO ctrl message. Invoked from taskq.
2218 */
2219 static void
vsw_process_ctrl_pkt(void * arg)2220 vsw_process_ctrl_pkt(void *arg)
2221 {
2222 vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg;
2223 vsw_ldc_t *ldcp = ctaskp->ldcp;
2224 vsw_t *vswp = ldcp->ldc_vswp;
2225 vio_msg_tag_t tag;
2226 uint16_t env;
2227
2228 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2229
2230 bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2231 env = tag.vio_subtype_env;
2232
2233 /* stale pkt check */
2234 if (ctaskp->hss_id < ldcp->hss_id) {
2235 DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2236 " (%ld) handshake session", __func__, ctaskp->hss_id);
2237 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2238 return;
2239 }
2240
2241 /* session id check */
2242 if (ldcp->session_status & VSW_PEER_SESSION) {
2243 if (ldcp->peer_session != tag.vio_sid) {
2244 DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2245 __func__, ldcp->ldc_id, tag.vio_sid);
2246 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2247 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2248 return;
2249 }
2250 }
2251
2252 /*
2253 * Switch on vio_subtype envelope, then let lower routines
2254 * decide if its an INFO, ACK or NACK packet.
2255 */
2256 switch (env) {
2257 case VIO_VER_INFO:
2258 vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2259 break;
2260 case VIO_DRING_REG:
2261 vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2262 break;
2263 case VIO_DRING_UNREG:
2264 vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2265 break;
2266 case VIO_ATTR_INFO:
2267 vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2268 break;
2269 case VNET_MCAST_INFO:
2270 vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2271 break;
2272 case VIO_RDX:
2273 vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2274 break;
2275 case VIO_DDS_INFO:
2276 vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2277 break;
2278
2279 case VNET_PHYSLINK_INFO:
2280 vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
2281 break;
2282 default:
2283 DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2284 }
2285
2286 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2287 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2288 }
2289
2290 /*
2291 * Version negotiation. We can end up here either because our peer
2292 * has responded to a handshake message we have sent it, or our peer
2293 * has initiated a handshake with us. If its the former then can only
2294 * be ACK or NACK, if its the later can only be INFO.
2295 *
2296 * If its an ACK we move to the next stage of the handshake, namely
2297 * attribute exchange. If its a NACK we see if we can specify another
2298 * version, if we can't we stop.
2299 *
2300 * If it is an INFO we reset all params associated with communication
2301 * in that direction over this channel (remember connection is
2302 * essentially 2 independent simplex channels).
2303 */
2304 void
vsw_process_ctrl_ver_pkt(vsw_ldc_t * ldcp,void * pkt)2305 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2306 {
2307 vio_ver_msg_t *ver_pkt;
2308 vsw_t *vswp = ldcp->ldc_vswp;
2309
2310 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2311
2312 /*
2313 * We know this is a ctrl/version packet so
2314 * cast it into the correct structure.
2315 */
2316 ver_pkt = (vio_ver_msg_t *)pkt;
2317
2318 switch (ver_pkt->tag.vio_subtype) {
2319 case VIO_SUBTYPE_INFO:
2320 D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2321
2322 /*
2323 * Record the session id, which we will use from now
2324 * until we see another VER_INFO msg. Even then the
2325 * session id in most cases will be unchanged, execpt
2326 * if channel was reset.
2327 */
2328 if ((ldcp->session_status & VSW_PEER_SESSION) &&
2329 (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2330 DERR(vswp, "%s: updating session id for chan %lld "
2331 "from %llx to %llx", __func__, ldcp->ldc_id,
2332 ldcp->peer_session, ver_pkt->tag.vio_sid);
2333 }
2334
2335 ldcp->peer_session = ver_pkt->tag.vio_sid;
2336 ldcp->session_status |= VSW_PEER_SESSION;
2337
2338 /* Legal message at this time ? */
2339 if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2340 return;
2341
2342 /*
2343 * First check the device class. Currently only expect
2344 * to be talking to a network device. In the future may
2345 * also talk to another switch.
2346 */
2347 if (ver_pkt->dev_class != VDEV_NETWORK) {
2348 DERR(vswp, "%s: illegal device class %d", __func__,
2349 ver_pkt->dev_class);
2350
2351 ver_pkt->tag.vio_sid = ldcp->local_session;
2352 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2353
2354 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2355
2356 (void) vsw_send_msg(ldcp, (void *)ver_pkt,
2357 sizeof (vio_ver_msg_t), B_TRUE);
2358
2359 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2360 vsw_next_milestone(ldcp);
2361 return;
2362 } else {
2363 ldcp->dev_class = ver_pkt->dev_class;
2364 }
2365
2366 /*
2367 * Now check the version.
2368 */
2369 if (vsw_supported_version(ver_pkt) == 0) {
2370 /*
2371 * Support this major version and possibly
2372 * adjusted minor version.
2373 */
2374
2375 D2(vswp, "%s: accepted ver %d:%d", __func__,
2376 ver_pkt->ver_major, ver_pkt->ver_minor);
2377
2378 /* Store accepted values */
2379 ldcp->lane_in.ver_major = ver_pkt->ver_major;
2380 ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2381
2382 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2383
2384 ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2385
2386 if (vsw_obp_ver_proto_workaround == B_TRUE) {
2387 /*
2388 * Send a version info message
2389 * using the accepted version that
2390 * we are about to ack. Also note that
2391 * we send our ver info before we ack.
2392 * Otherwise, as soon as receiving the
2393 * ack, obp sends attr info msg, which
2394 * breaks vsw_check_flag() invoked
2395 * from vsw_process_ctrl_attr_pkt();
2396 * as we also need VSW_VER_ACK_RECV to
2397 * be set in lane_out.lstate, before
2398 * we can receive attr info.
2399 */
2400 vsw_send_ver(ldcp);
2401 }
2402 } else {
2403 /*
2404 * NACK back with the next lower major/minor
2405 * pairing we support (if don't suuport any more
2406 * versions then they will be set to zero.
2407 */
2408
2409 D2(vswp, "%s: replying with ver %d:%d", __func__,
2410 ver_pkt->ver_major, ver_pkt->ver_minor);
2411
2412 /* Store updated values */
2413 ldcp->lane_in.ver_major = ver_pkt->ver_major;
2414 ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2415
2416 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2417
2418 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2419 }
2420
2421 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2422 ver_pkt->tag.vio_sid = ldcp->local_session;
2423 (void) vsw_send_msg(ldcp, (void *)ver_pkt,
2424 sizeof (vio_ver_msg_t), B_TRUE);
2425
2426 vsw_next_milestone(ldcp);
2427 break;
2428
2429 case VIO_SUBTYPE_ACK:
2430 D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2431
2432 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2433 return;
2434
2435 /* Store updated values */
2436 ldcp->lane_out.ver_major = ver_pkt->ver_major;
2437 ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2438
2439 ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2440 vsw_next_milestone(ldcp);
2441
2442 break;
2443
2444 case VIO_SUBTYPE_NACK:
2445 D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2446
2447 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2448 return;
2449
2450 /*
2451 * If our peer sent us a NACK with the ver fields set to
2452 * zero then there is nothing more we can do. Otherwise see
2453 * if we support either the version suggested, or a lesser
2454 * one.
2455 */
2456 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2457 DERR(vswp, "%s: peer unable to negotiate any "
2458 "further.", __func__);
2459 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2460 vsw_next_milestone(ldcp);
2461 return;
2462 }
2463
2464 /*
2465 * Check to see if we support this major version or
2466 * a lower one. If we don't then maj/min will be set
2467 * to zero.
2468 */
2469 (void) vsw_supported_version(ver_pkt);
2470 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2471 /* Nothing more we can do */
2472 DERR(vswp, "%s: version negotiation failed.\n",
2473 __func__);
2474 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2475 vsw_next_milestone(ldcp);
2476 } else {
2477 /* found a supported major version */
2478 ldcp->lane_out.ver_major = ver_pkt->ver_major;
2479 ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2480
2481 D2(vswp, "%s: resending with updated values (%x, %x)",
2482 __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2483
2484 ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2485 ver_pkt->tag.vio_sid = ldcp->local_session;
2486 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2487
2488 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2489
2490 (void) vsw_send_msg(ldcp, (void *)ver_pkt,
2491 sizeof (vio_ver_msg_t), B_TRUE);
2492
2493 vsw_next_milestone(ldcp);
2494
2495 }
2496 break;
2497
2498 default:
2499 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2500 ver_pkt->tag.vio_subtype);
2501 }
2502
2503 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2504 }
2505
2506 static int
vsw_process_attr_info(vsw_ldc_t * ldcp,vnet_attr_msg_t * msg)2507 vsw_process_attr_info(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2508 {
2509 vsw_t *vswp = ldcp->ldc_vswp;
2510 vsw_port_t *port = ldcp->ldc_port;
2511 struct ether_addr ea;
2512 uint64_t macaddr = 0;
2513 lane_t *lane_out = &ldcp->lane_out;
2514 lane_t *lane_in = &ldcp->lane_in;
2515 uint32_t mtu;
2516 int i;
2517 uint8_t dring_mode;
2518
2519 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2520
2521 if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) {
2522 return (1);
2523 }
2524
2525 if ((msg->xfer_mode != VIO_DESC_MODE) &&
2526 (msg->xfer_mode != lane_out->xfer_mode)) {
2527 D2(NULL, "%s: unknown mode %x\n", __func__, msg->xfer_mode);
2528 return (1);
2529 }
2530
2531 /* Only support MAC addresses at moment. */
2532 if ((msg->addr_type != ADDR_TYPE_MAC) || (msg->addr == 0)) {
2533 D2(NULL, "%s: invalid addr_type %x, or address 0x%llx\n",
2534 __func__, msg->addr_type, msg->addr);
2535 return (1);
2536 }
2537
2538 /*
2539 * MAC address supplied by device should match that stored
2540 * in the vsw-port OBP node. Need to decide what to do if they
2541 * don't match, for the moment just warn but don't fail.
2542 */
2543 vnet_macaddr_ultostr(msg->addr, ea.ether_addr_octet);
2544 if (ether_cmp(&ea, &port->p_macaddr) != 0) {
2545 DERR(NULL, "%s: device supplied address "
2546 "0x%llx doesn't match node address 0x%llx\n",
2547 __func__, msg->addr, port->p_macaddr);
2548 }
2549
2550 /*
2551 * Ack freq only makes sense in pkt mode, in shared
2552 * mode the ring descriptors say whether or not to
2553 * send back an ACK.
2554 */
2555 if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2556 (msg->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2557 (VSW_VER_LT(ldcp, 1, 2) &&
2558 (msg->xfer_mode == VIO_DRING_MODE_V1_0))) {
2559 if (msg->ack_freq > 0) {
2560 D2(NULL, "%s: non zero ack freq in SHM mode\n",
2561 __func__);
2562 return (1);
2563 }
2564 }
2565
2566 /*
2567 * Process dring mode attribute.
2568 */
2569 if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2570 /*
2571 * Versions >= 1.6:
2572 * Though we are operating in v1.6 mode, it is possible that
2573 * RxDringData mode has been disabled either on this guest or
2574 * on the peer guest. If so, we revert to pre v1.6 behavior of
2575 * TxDring mode. But this must be agreed upon in both
2576 * directions of attr exchange. We first determine the mode
2577 * that can be negotiated.
2578 */
2579 if ((msg->options & VIO_RX_DRING_DATA) != 0 &&
2580 vsw_mapin_avail(ldcp) == B_TRUE) {
2581 /*
2582 * The peer is capable of handling RxDringData AND we
2583 * are also capable of it; we enable RxDringData mode
2584 * on this channel.
2585 */
2586 dring_mode = VIO_RX_DRING_DATA;
2587 } else if ((msg->options & VIO_TX_DRING) != 0) {
2588 /*
2589 * If the peer is capable of TxDring mode, we
2590 * negotiate TxDring mode on this channel.
2591 */
2592 dring_mode = VIO_TX_DRING;
2593 } else {
2594 /*
2595 * We support only VIO_TX_DRING and VIO_RX_DRING_DATA
2596 * modes. We don't support VIO_RX_DRING mode.
2597 */
2598 return (1);
2599 }
2600
2601 /*
2602 * If we have received an ack for the attr info that we sent,
2603 * then check if the dring mode matches what the peer had ack'd
2604 * (saved in lane_out). If they don't match, we fail the
2605 * handshake.
2606 */
2607 if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2608 if (msg->options != lane_out->dring_mode) {
2609 /* send NACK */
2610 return (1);
2611 }
2612 } else {
2613 /*
2614 * Save the negotiated dring mode in our attr
2615 * parameters, so it gets sent in the attr info from us
2616 * to the peer.
2617 */
2618 lane_out->dring_mode = dring_mode;
2619 }
2620
2621 /* save the negotiated dring mode in the msg to be replied */
2622 msg->options = dring_mode;
2623 }
2624
2625 /*
2626 * Process MTU attribute.
2627 */
2628 if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2629 /*
2630 * Versions >= 1.4:
2631 * Validate mtu of the peer is at least ETHERMAX. Then, the mtu
2632 * is negotiated down to the minimum of our mtu and peer's mtu.
2633 */
2634 if (msg->mtu < ETHERMAX) {
2635 return (1);
2636 }
2637
2638 mtu = MIN(msg->mtu, vswp->max_frame_size);
2639
2640 /*
2641 * If we have received an ack for the attr info
2642 * that we sent, then check if the mtu computed
2643 * above matches the mtu that the peer had ack'd
2644 * (saved in local hparams). If they don't
2645 * match, we fail the handshake.
2646 */
2647 if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2648 if (mtu != lane_out->mtu) {
2649 /* send NACK */
2650 return (1);
2651 }
2652 } else {
2653 /*
2654 * Save the mtu computed above in our
2655 * attr parameters, so it gets sent in
2656 * the attr info from us to the peer.
2657 */
2658 lane_out->mtu = mtu;
2659 }
2660
2661 /* save the MIN mtu in the msg to be replied */
2662 msg->mtu = mtu;
2663 } else {
2664 /* Versions < 1.4, mtu must match */
2665 if (msg->mtu != lane_out->mtu) {
2666 D2(NULL, "%s: invalid MTU (0x%llx)\n",
2667 __func__, msg->mtu);
2668 return (1);
2669 }
2670 }
2671
2672 /*
2673 * Otherwise store attributes for this lane and update
2674 * lane state.
2675 */
2676 lane_in->mtu = msg->mtu;
2677 lane_in->addr = msg->addr;
2678 lane_in->addr_type = msg->addr_type;
2679 lane_in->xfer_mode = msg->xfer_mode;
2680 lane_in->ack_freq = msg->ack_freq;
2681 lane_in->physlink_update = msg->physlink_update;
2682 lane_in->dring_mode = msg->options;
2683
2684 /*
2685 * Check if the client has requested physlink state updates.
2686 * If there is a physical device bound to this vswitch (L2
2687 * mode), set the ack bits to indicate it is supported.
2688 * Otherwise, set the nack bits.
2689 */
2690 if (VSW_VER_GTEQ(ldcp, 1, 5)) { /* Protocol ver >= 1.5 */
2691
2692 /* Does the vnet need phys link state updates ? */
2693 if ((lane_in->physlink_update &
2694 PHYSLINK_UPDATE_STATE_MASK) ==
2695 PHYSLINK_UPDATE_STATE) {
2696
2697 if (vswp->smode & VSW_LAYER2) {
2698 /* is a net-dev assigned to us ? */
2699 msg->physlink_update =
2700 PHYSLINK_UPDATE_STATE_ACK;
2701 ldcp->pls_negotiated = B_TRUE;
2702 } else {
2703 /* not in L2 mode */
2704 msg->physlink_update =
2705 PHYSLINK_UPDATE_STATE_NACK;
2706 ldcp->pls_negotiated = B_FALSE;
2707 }
2708
2709 } else {
2710 msg->physlink_update =
2711 PHYSLINK_UPDATE_NONE;
2712 ldcp->pls_negotiated = B_FALSE;
2713 }
2714
2715 } else {
2716 /*
2717 * physlink_update bits are ignored
2718 * if set by clients < v1.5 protocol.
2719 */
2720 msg->physlink_update = PHYSLINK_UPDATE_NONE;
2721 ldcp->pls_negotiated = B_FALSE;
2722 }
2723
2724 macaddr = lane_in->addr;
2725 for (i = ETHERADDRL - 1; i >= 0; i--) {
2726 port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2727 macaddr >>= 8;
2728 }
2729
2730 /*
2731 * Setup device specific xmit routines. Note this could be changed
2732 * further in vsw_send_dring_info() for versions >= 1.6 if operating in
2733 * RxDringData mode.
2734 */
2735 mutex_enter(&port->tx_lock);
2736
2737 if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2738 (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2739 (VSW_VER_LT(ldcp, 1, 2) &&
2740 (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
2741 D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2742 port->transmit = vsw_dringsend;
2743 } else if (lane_in->xfer_mode == VIO_DESC_MODE) {
2744 D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2745 vsw_create_privring(ldcp);
2746 port->transmit = vsw_descrsend;
2747 lane_out->xfer_mode = VIO_DESC_MODE;
2748 }
2749
2750 /*
2751 * HybridIO is supported only vnet, not by OBP.
2752 * So, set hio_capable to true only when in DRING mode.
2753 */
2754 if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2755 (lane_in->xfer_mode != VIO_DESC_MODE)) {
2756 (void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2757 } else {
2758 (void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2759 }
2760
2761 mutex_exit(&port->tx_lock);
2762
2763 return (0);
2764 }
2765
2766 static int
vsw_process_attr_ack(vsw_ldc_t * ldcp,vnet_attr_msg_t * msg)2767 vsw_process_attr_ack(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2768 {
2769 vsw_t *vswp = ldcp->ldc_vswp;
2770 lane_t *lane_out = &ldcp->lane_out;
2771 lane_t *lane_in = &ldcp->lane_in;
2772
2773 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2774
2775 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) {
2776 return (1);
2777 }
2778
2779 /*
2780 * Process dring mode attribute.
2781 */
2782 if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2783 /*
2784 * Versions >= 1.6:
2785 * The ack msg sent by the peer contains the negotiated dring
2786 * mode between our capability (that we had sent in our attr
2787 * info) and the peer's capability.
2788 */
2789 if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2790 /*
2791 * If we have sent an ack for the attr info msg from
2792 * the peer, check if the dring mode that was
2793 * negotiated then (saved in lane_out) matches the
2794 * mode that the peer has ack'd. If they don't match,
2795 * we fail the handshake.
2796 */
2797 if (lane_out->dring_mode != msg->options) {
2798 return (1);
2799 }
2800 } else {
2801 if ((msg->options & lane_out->dring_mode) == 0) {
2802 /*
2803 * Peer ack'd with a mode that we don't
2804 * support; we fail the handshake.
2805 */
2806 return (1);
2807 }
2808 if ((msg->options & (VIO_TX_DRING|VIO_RX_DRING_DATA))
2809 == (VIO_TX_DRING|VIO_RX_DRING_DATA)) {
2810 /*
2811 * Peer must ack with only one negotiated mode.
2812 * Otherwise fail handshake.
2813 */
2814 return (1);
2815 }
2816
2817 /*
2818 * Save the negotiated mode, so we can validate it when
2819 * we receive attr info from the peer.
2820 */
2821 lane_out->dring_mode = msg->options;
2822 }
2823 }
2824
2825 /*
2826 * Process MTU attribute.
2827 */
2828 if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2829 /*
2830 * Versions >= 1.4:
2831 * The ack msg sent by the peer contains the minimum of
2832 * our mtu (that we had sent in our attr info) and the
2833 * peer's mtu.
2834 *
2835 * If we have sent an ack for the attr info msg from
2836 * the peer, check if the mtu that was computed then
2837 * (saved in lane_out params) matches the mtu that the
2838 * peer has ack'd. If they don't match, we fail the
2839 * handshake.
2840 */
2841 if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2842 if (lane_out->mtu != msg->mtu) {
2843 return (1);
2844 }
2845 } else {
2846 /*
2847 * If the mtu ack'd by the peer is > our mtu
2848 * fail handshake. Otherwise, save the mtu, so
2849 * we can validate it when we receive attr info
2850 * from our peer.
2851 */
2852 if (msg->mtu <= lane_out->mtu) {
2853 lane_out->mtu = msg->mtu;
2854 } else {
2855 return (1);
2856 }
2857 }
2858 }
2859
2860 return (0);
2861 }
2862
2863 /*
2864 * Process an attribute packet. We can end up here either because our peer
2865 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2866 * peer has sent us an attribute INFO message
2867 *
2868 * If its an ACK we then move to the next stage of the handshake which
2869 * is to send our descriptor ring info to our peer. If its a NACK then
2870 * there is nothing more we can (currently) do.
2871 *
2872 * If we get a valid/acceptable INFO packet (and we have already negotiated
2873 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2874 * NACK back and reset channel state to INACTIV.
2875 *
2876 * FUTURE: in time we will probably negotiate over attributes, but for
2877 * the moment unacceptable attributes are regarded as a fatal error.
2878 *
2879 */
2880 void
vsw_process_ctrl_attr_pkt(vsw_ldc_t * ldcp,void * pkt)2881 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2882 {
2883 vnet_attr_msg_t *attr_pkt;
2884 vsw_t *vswp = ldcp->ldc_vswp;
2885 lane_t *lane_out = &ldcp->lane_out;
2886 lane_t *lane_in = &ldcp->lane_in;
2887 int rv;
2888
2889 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2890
2891 /*
2892 * We know this is a ctrl/attr packet so
2893 * cast it into the correct structure.
2894 */
2895 attr_pkt = (vnet_attr_msg_t *)pkt;
2896
2897 switch (attr_pkt->tag.vio_subtype) {
2898 case VIO_SUBTYPE_INFO:
2899
2900 rv = vsw_process_attr_info(ldcp, attr_pkt);
2901 if (rv != 0) {
2902 vsw_free_lane_resources(ldcp, INBOUND);
2903 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2904 ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2905 } else {
2906 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2907 lane_in->lstate |= VSW_ATTR_ACK_SENT;
2908 }
2909 attr_pkt->tag.vio_sid = ldcp->local_session;
2910 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2911 (void) vsw_send_msg(ldcp, (void *)attr_pkt,
2912 sizeof (vnet_attr_msg_t), B_TRUE);
2913 vsw_next_milestone(ldcp);
2914 break;
2915
2916 case VIO_SUBTYPE_ACK:
2917
2918 rv = vsw_process_attr_ack(ldcp, attr_pkt);
2919 if (rv != 0) {
2920 return;
2921 }
2922 lane_out->lstate |= VSW_ATTR_ACK_RECV;
2923 vsw_next_milestone(ldcp);
2924 break;
2925
2926 case VIO_SUBTYPE_NACK:
2927 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2928
2929 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2930 return;
2931
2932 lane_out->lstate |= VSW_ATTR_NACK_RECV;
2933 vsw_next_milestone(ldcp);
2934 break;
2935
2936 default:
2937 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2938 attr_pkt->tag.vio_subtype);
2939 }
2940
2941 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2942 }
2943
2944 static int
vsw_process_dring_reg_info(vsw_ldc_t * ldcp,vio_msg_tag_t * tagp)2945 vsw_process_dring_reg_info(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2946 {
2947 int rv;
2948 vsw_t *vswp = ldcp->ldc_vswp;
2949 lane_t *lp = &ldcp->lane_out;
2950 dring_info_t *dp = NULL;
2951
2952 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2953
2954 rv = vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV);
2955 if (rv != 0) {
2956 return (1);
2957 }
2958
2959 if (VSW_VER_GTEQ(ldcp, 1, 6) &&
2960 (lp->dring_mode != ((vio_dring_reg_msg_t *)tagp)->options)) {
2961 /*
2962 * The earlier version of Solaris vnet driver doesn't set the
2963 * option (VIO_TX_DRING in its case) correctly in its dring reg
2964 * message. We workaround that here by doing the check only
2965 * for versions >= v1.6.
2966 */
2967 DWARN(vswp, "%s(%lld): Rcvd dring reg option (%d), "
2968 "negotiated mode (%d)\n", __func__, ldcp->ldc_id,
2969 ((vio_dring_reg_msg_t *)tagp)->options, lp->dring_mode);
2970 return (1);
2971 }
2972
2973 /*
2974 * Map dring exported by the peer.
2975 */
2976 dp = vsw_map_dring(ldcp, (void *)tagp);
2977 if (dp == NULL) {
2978 return (1);
2979 }
2980
2981 /*
2982 * Map data buffers exported by the peer if we are in RxDringData mode.
2983 */
2984 if (lp->dring_mode == VIO_RX_DRING_DATA) {
2985 rv = vsw_map_data(ldcp, dp, (void *)tagp);
2986 if (rv != 0) {
2987 vsw_unmap_dring(ldcp);
2988 return (1);
2989 }
2990 }
2991
2992 return (0);
2993 }
2994
2995 static int
vsw_process_dring_reg_ack(vsw_ldc_t * ldcp,vio_msg_tag_t * tagp)2996 vsw_process_dring_reg_ack(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2997 {
2998 vsw_t *vswp = ldcp->ldc_vswp;
2999 dring_info_t *dp;
3000
3001 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3002
3003 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) {
3004 return (1);
3005 }
3006
3007 dp = ldcp->lane_out.dringp;
3008
3009 /* save dring_ident acked by peer */
3010 dp->ident = ((vio_dring_reg_msg_t *)tagp)->dring_ident;
3011
3012 return (0);
3013 }
3014
3015 /*
3016 * Process a dring info packet. We can end up here either because our peer
3017 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
3018 * peer has sent us a dring INFO message.
3019 *
3020 * If we get a valid/acceptable INFO packet (and we have already negotiated
3021 * a version) we ACK back and update the lane state, otherwise we NACK back.
3022 *
3023 * FUTURE: nothing to stop client from sending us info on multiple dring's
3024 * but for the moment we will just use the first one we are given.
3025 *
3026 */
3027 void
vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t * ldcp,void * pkt)3028 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3029 {
3030 int rv;
3031 int msgsize;
3032 dring_info_t *dp;
3033 vio_msg_tag_t *tagp = (vio_msg_tag_t *)pkt;
3034 vsw_t *vswp = ldcp->ldc_vswp;
3035 lane_t *lane_out = &ldcp->lane_out;
3036 lane_t *lane_in = &ldcp->lane_in;
3037
3038 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3039
3040 switch (tagp->vio_subtype) {
3041 case VIO_SUBTYPE_INFO:
3042 rv = vsw_process_dring_reg_info(ldcp, tagp);
3043 if (rv != 0) {
3044 vsw_free_lane_resources(ldcp, INBOUND);
3045 tagp->vio_subtype = VIO_SUBTYPE_NACK;
3046 lane_in->lstate |= VSW_DRING_NACK_SENT;
3047 } else {
3048 tagp->vio_subtype = VIO_SUBTYPE_ACK;
3049 lane_in->lstate |= VSW_DRING_ACK_SENT;
3050 }
3051 tagp->vio_sid = ldcp->local_session;
3052 DUMP_TAG_PTR(tagp);
3053 if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
3054 dp = lane_in->dringp;
3055 msgsize =
3056 VNET_DRING_REG_EXT_MSG_SIZE(dp->data_ncookies);
3057 } else {
3058 msgsize = sizeof (vio_dring_reg_msg_t);
3059 }
3060 (void) vsw_send_msg(ldcp, (void *)tagp, msgsize, B_TRUE);
3061 vsw_next_milestone(ldcp);
3062 break;
3063
3064 case VIO_SUBTYPE_ACK:
3065 rv = vsw_process_dring_reg_ack(ldcp, tagp);
3066 if (rv != 0) {
3067 return;
3068 }
3069 lane_out->lstate |= VSW_DRING_ACK_RECV;
3070 vsw_next_milestone(ldcp);
3071 break;
3072
3073 case VIO_SUBTYPE_NACK:
3074 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3075
3076 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3077 return;
3078
3079 lane_out->lstate |= VSW_DRING_NACK_RECV;
3080 vsw_next_milestone(ldcp);
3081 break;
3082
3083 default:
3084 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3085 tagp->vio_subtype);
3086 }
3087
3088 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3089 }
3090
3091 /*
3092 * Process a request from peer to unregister a dring.
3093 *
3094 * For the moment we just restart the handshake if our
3095 * peer endpoint attempts to unregister a dring.
3096 */
3097 void
vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t * ldcp,void * pkt)3098 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3099 {
3100 vsw_t *vswp = ldcp->ldc_vswp;
3101 vio_dring_unreg_msg_t *dring_pkt;
3102
3103 /*
3104 * We know this is a ctrl/dring packet so
3105 * cast it into the correct structure.
3106 */
3107 dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3108
3109 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3110
3111 switch (dring_pkt->tag.vio_subtype) {
3112 case VIO_SUBTYPE_INFO:
3113 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3114
3115 DWARN(vswp, "%s: restarting handshake..", __func__);
3116 break;
3117
3118 case VIO_SUBTYPE_ACK:
3119 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3120
3121 DWARN(vswp, "%s: restarting handshake..", __func__);
3122 break;
3123
3124 case VIO_SUBTYPE_NACK:
3125 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3126
3127 DWARN(vswp, "%s: restarting handshake..", __func__);
3128 break;
3129
3130 default:
3131 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3132 dring_pkt->tag.vio_subtype);
3133 }
3134
3135 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3136
3137 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3138 }
3139
/*
 * Turn a multicast add/remove request around as a NACK to its sender.
 *
 * Wrapped in do { ... } while (0) so the macro expands to exactly one
 * statement and is safe to use in an unbraced if/else body (the previous
 * form expanded to three statements, a dangling-else hazard).
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		pkt->tag.vio_sid = ldcp->local_session; \
		(void) vsw_send_msg(ldcp, (void *)pkt, \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
3145
3146 /*
3147 * Process a multicast request from a vnet.
3148 *
3149 * Vnet's specify a multicast address that they are interested in. This
3150 * address is used as a key into the hash table which forms the multicast
3151 * forwarding database (mFDB).
3152 *
3153 * The table keys are the multicast addresses, while the table entries
3154 * are pointers to lists of ports which wish to receive packets for the
3155 * specified multicast address.
3156 *
3157 * When a multicast packet is being switched we use the address as a key
3158 * into the hash table, and then walk the appropriate port list forwarding
3159 * the pkt to each port in turn.
3160 *
3161 * If a vnet is no longer interested in a particular multicast grouping
3162 * we simply find the correct location in the hash table and then delete
3163 * the relevant port from the port list.
3164 *
3165 * To deal with the case whereby a port is being deleted without first
3166 * removing itself from the lists in the hash table, we maintain a list
3167 * of multicast addresses the port has registered an interest in, within
3168 * the port structure itself. We then simply walk that list of addresses
3169 * using them as keys into the hash table and remove the port from the
3170 * appropriate lists.
3171 */
3172 static void
vsw_process_ctrl_mcst_pkt(vsw_ldc_t * ldcp,void * pkt)3173 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3174 {
3175 vnet_mcast_msg_t *mcst_pkt;
3176 vsw_port_t *port = ldcp->ldc_port;
3177 vsw_t *vswp = ldcp->ldc_vswp;
3178 int i;
3179
3180 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3181
3182 /*
3183 * We know this is a ctrl/mcast packet so
3184 * cast it into the correct structure.
3185 */
3186 mcst_pkt = (vnet_mcast_msg_t *)pkt;
3187
3188 switch (mcst_pkt->tag.vio_subtype) {
3189 case VIO_SUBTYPE_INFO:
3190 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3191
3192 /*
3193 * Check if in correct state to receive a multicast
3194 * message (i.e. handshake complete). If not reset
3195 * the handshake.
3196 */
3197 if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3198 return;
3199
3200 /*
3201 * Before attempting to add or remove address check
3202 * that they are valid multicast addresses.
3203 * If not, then NACK back.
3204 */
3205 for (i = 0; i < mcst_pkt->count; i++) {
3206 if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3207 DERR(vswp, "%s: invalid multicast address",
3208 __func__);
3209 SND_MCST_NACK(ldcp, mcst_pkt);
3210 return;
3211 }
3212 }
3213
3214 /*
3215 * Now add/remove the addresses. If this fails we
3216 * NACK back.
3217 */
3218 if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3219 SND_MCST_NACK(ldcp, mcst_pkt);
3220 return;
3221 }
3222
3223 mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3224 mcst_pkt->tag.vio_sid = ldcp->local_session;
3225
3226 DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3227
3228 (void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3229 sizeof (vnet_mcast_msg_t), B_TRUE);
3230 break;
3231
3232 case VIO_SUBTYPE_ACK:
3233 DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3234
3235 /*
3236 * We shouldn't ever get a multicast ACK message as
3237 * at the moment we never request multicast addresses
3238 * to be set on some other device. This may change in
3239 * the future if we have cascading switches.
3240 */
3241 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3242 return;
3243
3244 /* Do nothing */
3245 break;
3246
3247 case VIO_SUBTYPE_NACK:
3248 DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3249
3250 /*
3251 * We shouldn't get a multicast NACK packet for the
3252 * same reasons as we shouldn't get a ACK packet.
3253 */
3254 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3255 return;
3256
3257 /* Do nothing */
3258 break;
3259
3260 default:
3261 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3262 mcst_pkt->tag.vio_subtype);
3263 }
3264
3265 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3266 }
3267
3268 static void
vsw_process_ctrl_rdx_pkt(vsw_ldc_t * ldcp,void * pkt)3269 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3270 {
3271 vio_rdx_msg_t *rdx_pkt;
3272 vsw_t *vswp = ldcp->ldc_vswp;
3273
3274 /*
3275 * We know this is a ctrl/rdx packet so
3276 * cast it into the correct structure.
3277 */
3278 rdx_pkt = (vio_rdx_msg_t *)pkt;
3279
3280 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3281
3282 switch (rdx_pkt->tag.vio_subtype) {
3283 case VIO_SUBTYPE_INFO:
3284 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3285
3286 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3287 return;
3288
3289 rdx_pkt->tag.vio_sid = ldcp->local_session;
3290 rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3291
3292 DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3293
3294 ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3295
3296 (void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3297 sizeof (vio_rdx_msg_t), B_TRUE);
3298
3299 vsw_next_milestone(ldcp);
3300 break;
3301
3302 case VIO_SUBTYPE_ACK:
3303 /*
3304 * Should be handled in-band by callback handler.
3305 */
3306 DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3307 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3308 break;
3309
3310 case VIO_SUBTYPE_NACK:
3311 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3312
3313 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3314 return;
3315
3316 ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3317 vsw_next_milestone(ldcp);
3318 break;
3319
3320 default:
3321 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3322 rdx_pkt->tag.vio_subtype);
3323 }
3324
3325 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3326 }
3327
3328 static void
vsw_process_physlink_msg(vsw_ldc_t * ldcp,void * pkt)3329 vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
3330 {
3331 vnet_physlink_msg_t *msgp;
3332 vsw_t *vswp = ldcp->ldc_vswp;
3333
3334 msgp = (vnet_physlink_msg_t *)pkt;
3335
3336 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3337
3338 switch (msgp->tag.vio_subtype) {
3339 case VIO_SUBTYPE_INFO:
3340
3341 /* vsw shouldn't recv physlink info */
3342 DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
3343 break;
3344
3345 case VIO_SUBTYPE_ACK:
3346
3347 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3348 break;
3349
3350 case VIO_SUBTYPE_NACK:
3351
3352 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3353 break;
3354
3355 default:
3356 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3357 msgp->tag.vio_subtype);
3358 }
3359
3360 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3361 }
3362
/*
 * Process an incoming data message received over the channel.
 *
 * Validates the peer's session id and that the handshake has completed
 * (restarting the connection otherwise), then dispatches on the
 * vio_subtype envelope to the dring-data, raw-packet or in-band
 * descriptor receive path.
 *
 * Entered with ldc_cblock held; see the TxDring comment below for the
 * lock juggling performed around the dispatch.
 */
static void
vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
    uint32_t msglen)
{
	uint16_t	env = tagp->vio_subtype_env;
	vsw_t		*vswp = ldcp->ldc_vswp;
	lane_t		*lp = &ldcp->lane_out;
	uint8_t		dring_mode = lp->dring_mode;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/* session id check: restart the connection on a sid mismatch */
	if (ldcp->session_status & VSW_PEER_SESSION) {
		if (ldcp->peer_session != tagp->vio_sid) {
			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
			    __func__, ldcp->ldc_id, tagp->vio_sid);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return;
		}
	}

	/*
	 * It is an error for us to be getting data packets
	 * before the handshake has completed.
	 */
	if (ldcp->hphase != VSW_MILESTONE4) {
		DERR(vswp, "%s: got data packet before handshake complete "
		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
		DUMP_FLAGS(ldcp->lane_in.lstate);
		DUMP_FLAGS(ldcp->lane_out.lstate);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		return;
	}
	if (dring_mode == VIO_TX_DRING) {
		/*
		 * To reduce the locking contention, release the ldc_cblock
		 * here and re-acquire it once we are done receiving packets.
		 * We do this only in TxDring mode to allow further callbacks
		 * to continue while the msg worker thread processes the
		 * messages. In RxDringData mode, we process the messages in
		 * the callback itself and wake up rcv worker thread to
		 * process only data info messages.
		 */
		mutex_exit(&ldcp->ldc_cblock);
		mutex_enter(&ldcp->ldc_rxlock);
	}

	/*
	 * Switch on vio_subtype envelope, then let lower routines
	 * decide if its an INFO, ACK or NACK packet.
	 */
	if (env == VIO_DRING_DATA) {
		ldcp->rx_dringdata(ldcp, dpkt);
	} else if (env == VIO_PKT_DATA) {
		ldcp->rx_pktdata(ldcp, dpkt, msglen);
	} else if (env == VIO_DESC_DATA) {
		vsw_process_data_ibnd_pkt(ldcp, dpkt);
	} else {
		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n",
		    __func__, env);
	}

	/* restore the lock state our caller expects (ldc_cblock held) */
	if (dring_mode == VIO_TX_DRING) {
		mutex_exit(&ldcp->ldc_rxlock);
		mutex_enter(&ldcp->ldc_cblock);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
3433
/*
 * Dummy pkt data handler function for vnet protocol version 1.0.
 *
 * Version 1.0 has no raw (priority) packet data support, so this
 * handler simply discards the message. NOTE(review): presumably
 * installed as ldcp->rx_pktdata for v1.0 peers — the assignment is
 * outside this chunk; confirm at the version-negotiation site.
 */
static void
vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
{
	_NOTE(ARGUNUSED(arg1, arg2, msglen))
}
3442
3443 /*
3444 * This function handles raw pkt data messages received over the channel.
3445 * Currently, only priority-eth-type frames are received through this mechanism.
3446 * In this case, the frame(data) is present within the message itself which
3447 * is copied into an mblk before switching it.
3448 */
3449 static void
vsw_process_pkt_data(void * arg1,void * arg2,uint32_t msglen)3450 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3451 {
3452 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg1;
3453 vio_raw_data_msg_t *dpkt = (vio_raw_data_msg_t *)arg2;
3454 uint32_t size;
3455 mblk_t *mp;
3456 vio_mblk_t *vmp;
3457 vsw_t *vswp = ldcp->ldc_vswp;
3458 vgen_stats_t *statsp = &ldcp->ldc_stats;
3459 lane_t *lp = &ldcp->lane_out;
3460
3461 size = msglen - VIO_PKT_DATA_HDRSIZE;
3462 if (size < ETHERMIN || size > lp->mtu) {
3463 (void) atomic_inc_32(&statsp->rx_pri_fail);
3464 DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3465 ldcp->ldc_id, size);
3466 return;
3467 }
3468
3469 vmp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3470 if (vmp == NULL) {
3471 mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3472 if (mp == NULL) {
3473 (void) atomic_inc_32(&statsp->rx_pri_fail);
3474 DWARN(vswp, "%s(%lld) allocb failure, "
3475 "unable to process priority frame\n", __func__,
3476 ldcp->ldc_id);
3477 return;
3478 }
3479 } else {
3480 mp = vmp->mp;
3481 }
3482
3483 /* skip over the extra space for vlan tag */
3484 mp->b_rptr += VLAN_TAGSZ;
3485
3486 /* copy the frame from the payload of raw data msg into the mblk */
3487 bcopy(dpkt->data, mp->b_rptr, size);
3488 mp->b_wptr = mp->b_rptr + size;
3489
3490 if (vmp != NULL) {
3491 vmp->state = VIO_MBLK_HAS_DATA;
3492 }
3493
3494 /* update stats */
3495 (void) atomic_inc_64(&statsp->rx_pri_packets);
3496 (void) atomic_add_64(&statsp->rx_pri_bytes, size);
3497
3498 /*
3499 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3500 */
3501 (void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3502
3503 /* switch the frame to destination */
3504 vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3505 }
3506
3507 /*
3508 * Process an in-band descriptor message (most likely from
3509 * OBP).
3510 */
static void
vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_ibnd_desc_t	*ibnd_desc;
	dring_info_t		*dp = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*mp = NULL;
	size_t			nbytes = 0;
	size_t			off = 0;
	uint64_t		idx = 0;
	uint32_t		num = 1, len, datalen = 0;
	uint64_t		ncookies = 0;
	int			i, rv;
	int			j = 0;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	ibnd_desc = (vnet_ibnd_desc_t *)pkt;

	switch (ibnd_desc->hdr.tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		/* reset the handshake if we're not in a suitable state */
		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
			return;

		/*
		 * Data is padded to align on a 8 byte boundary,
		 * nbytes is actual data length, i.e. minus that
		 * padding.
		 */
		datalen = ibnd_desc->nbytes;

		D2(vswp, "%s(%lld): processing inband desc : "
		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);

		ncookies = ibnd_desc->ncookies;

		/*
		 * allocb(9F) returns an aligned data block. We
		 * need to ensure that we ask ldc for an aligned
		 * number of bytes also.
		 */
		nbytes = datalen;
		if (nbytes & 0x7) {
			off = 8 - (nbytes & 0x7);
			nbytes += off;
		}

		/* alloc extra space for VLAN_TAG */
		mp = allocb(datalen + 8, BPRI_MED);
		if (mp == NULL) {
			DERR(vswp, "%s(%lld): allocb failed",
			    __func__, ldcp->ldc_id);
			ldcp->ldc_stats.rx_allocb_fail++;
			return;
		}

		/* skip over the extra space for VLAN_TAG */
		mp->b_rptr += 8;

		/* copy the frame in from the peer's exported memory */
		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
		    LDC_COPY_IN);

		if (rv != 0) {
			DERR(vswp, "%s(%d): unable to copy in data from "
			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
			freemsg(mp);
			ldcp->ldc_stats.ierrors++;
			return;
		}

		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
		    __func__, ldcp->ldc_id, nbytes, ncookies);

		/* point to the actual end of data */
		mp->b_wptr = mp->b_rptr + datalen;
		ldcp->ldc_stats.ipackets++;
		ldcp->ldc_stats.rbytes += datalen;

		/*
		 * We ACK back every in-band descriptor message we process
		 */
		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
		    sizeof (vnet_ibnd_desc_t), B_TRUE);

		/*
		 * there is extra space alloc'd for VLAN_TAG
		 */
		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);

		/* send the packet to be switched */
		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
		    ldcp->ldc_port, NULL);

		break;

	case VIO_SUBTYPE_ACK:
		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		/* Verify the ACK is valid */
		idx = ibnd_desc->hdr.desc_handle;

		if (idx >= vsw_num_descriptors) {
			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
			    "(idx %ld)", vswp->instance, idx);
			return;
		}

		if ((dp = ldcp->lane_out.dringp) == NULL) {
			DERR(vswp, "%s: no dring found", __func__);
			return;
		}

		len = dp->num_descriptors;
		/*
		 * If the descriptor we are being ACK'ed for is not the
		 * one we expected, then pkts were lost somewhere, either
		 * when we tried to send a msg, or a previous ACK msg from
		 * our peer. In either case we now reclaim the descriptors
		 * in the range from the last ACK we received up to the
		 * current ACK.
		 */
		if (idx != dp->last_ack_recv) {
			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
			    __func__, dp->last_ack_recv, idx);
			/* wrap-aware count of descriptors to reclaim */
			num = idx >= dp->last_ack_recv ?
			    idx - dp->last_ack_recv + 1:
			    (len - dp->last_ack_recv + 1) + idx;
		}

		/*
		 * When we sent the in-band message to our peer we
		 * marked the copy in our private ring as READY. We now
		 * check that the descriptor we are being ACK'ed for is in
		 * fact READY, i.e. it is one we have shared with our peer.
		 *
		 * If its not we flag an error, but still reset the descr
		 * back to FREE.
		 */
		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
			mutex_enter(&priv_addr->dstate_lock);
			if (priv_addr->dstate != VIO_DESC_READY) {
				DERR(vswp, "%s: (%ld) desc at index %ld not "
				    "READY (0x%lx)", __func__,
				    ldcp->ldc_id, idx, priv_addr->dstate);
				DERR(vswp, "%s: bound %d: ncookies %ld : "
				    "datalen %ld", __func__,
				    priv_addr->bound, priv_addr->ncookies,
				    priv_addr->datalen);
			}
			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
			    ldcp->ldc_id, idx);
			/* release resources associated with sent msg */
			priv_addr->datalen = 0;
			priv_addr->dstate = VIO_DESC_FREE;
			mutex_exit(&priv_addr->dstate_lock);
		}
		/* update to next expected value */
		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;

		break;

	case VIO_SUBTYPE_NACK:
		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		/*
		 * We should only get a NACK if our peer doesn't like
		 * something about a message we have sent it. If this
		 * happens we just release the resources associated with
		 * the message. (We are relying on higher layers to decide
		 * whether or not to resend.)
		 */

		/* limit check */
		idx = ibnd_desc->hdr.desc_handle;

		if (idx >= vsw_num_descriptors) {
			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
			    __func__, idx);
			return;
		}

		if ((dp = ldcp->lane_out.dringp) == NULL) {
			DERR(vswp, "%s: no dring found", __func__);
			return;
		}

		priv_addr = (vsw_private_desc_t *)dp->priv_addr;

		/* move to correct location in ring */
		priv_addr += idx;

		/* release resources associated with sent msg */
		mutex_enter(&priv_addr->dstate_lock);
		priv_addr->datalen = 0;
		priv_addr->dstate = VIO_DESC_FREE;
		mutex_exit(&priv_addr->dstate_lock);

		break;

	default:
		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
3724
3725 static void
vsw_process_err_pkt(vsw_ldc_t * ldcp,void * epkt,vio_msg_tag_t * tagp)3726 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3727 {
3728 _NOTE(ARGUNUSED(epkt))
3729
3730 vsw_t *vswp = ldcp->ldc_vswp;
3731 uint16_t env = tagp->vio_subtype_env;
3732
3733 D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3734
3735 /*
3736 * Error vio_subtypes have yet to be defined. So for
3737 * the moment we can't do anything.
3738 */
3739 D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3740
3741 D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3742 }
3743
3744 /* transmit the packet over the given port */
3745 int
vsw_portsend(vsw_port_t * port,mblk_t * mp)3746 vsw_portsend(vsw_port_t *port, mblk_t *mp)
3747 {
3748 mblk_t *mpt;
3749 int count;
3750 vsw_ldc_t *ldcp = port->ldcp;
3751 int status = 0;
3752
3753 count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3754 if (count != 0) {
3755 status = ldcp->tx(ldcp, mp, mpt, count);
3756 }
3757 return (status);
3758 }
3759
3760 /*
 * Break up frames into 2 separate chains: normal and
3762 * priority, based on the frame type. The number of
3763 * priority frames is also counted and returned.
3764 *
3765 * Params:
3766 * vswp: pointer to the instance of vsw
3767 * np: head of packet chain to be broken
3768 * npt: tail of packet chain to be broken
3769 *
3770 * Returns:
3771 * np: head of normal data packets
3772 * npt: tail of normal data packets
3773 * hp: head of high priority packets
3774 * hpt: tail of high priority packets
3775 */
3776 static uint32_t
vsw_get_pri_packets(vsw_t * vswp,mblk_t ** np,mblk_t ** npt,mblk_t ** hp,mblk_t ** hpt)3777 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
3778 mblk_t **hp, mblk_t **hpt)
3779 {
3780 mblk_t *tmp = NULL;
3781 mblk_t *smp = NULL;
3782 mblk_t *hmp = NULL; /* high prio pkts head */
3783 mblk_t *hmpt = NULL; /* high prio pkts tail */
3784 mblk_t *nmp = NULL; /* normal pkts head */
3785 mblk_t *nmpt = NULL; /* normal pkts tail */
3786 uint32_t count = 0;
3787 int i;
3788 struct ether_header *ehp;
3789 uint32_t num_types;
3790 uint16_t *types;
3791
3792 tmp = *np;
3793 while (tmp != NULL) {
3794
3795 smp = tmp;
3796 tmp = tmp->b_next;
3797 smp->b_next = NULL;
3798 smp->b_prev = NULL;
3799
3800 ehp = (struct ether_header *)smp->b_rptr;
3801 num_types = vswp->pri_num_types;
3802 types = vswp->pri_types;
3803 for (i = 0; i < num_types; i++) {
3804 if (ehp->ether_type == types[i]) {
3805 /* high priority frame */
3806
3807 if (hmp != NULL) {
3808 hmpt->b_next = smp;
3809 hmpt = smp;
3810 } else {
3811 hmp = hmpt = smp;
3812 }
3813 count++;
3814 break;
3815 }
3816 }
3817 if (i == num_types) {
3818 /* normal data frame */
3819
3820 if (nmp != NULL) {
3821 nmpt->b_next = smp;
3822 nmpt = smp;
3823 } else {
3824 nmp = nmpt = smp;
3825 }
3826 }
3827 }
3828
3829 *hp = hmp;
3830 *hpt = hmpt;
3831 *np = nmp;
3832 *npt = nmpt;
3833
3834 return (count);
3835 }
3836
3837 /*
3838 * Wrapper function to transmit normal and/or priority frames over the channel.
3839 */
3840 static int
vsw_ldctx_pri(void * arg,mblk_t * mp,mblk_t * mpt,uint32_t count)3841 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3842 {
3843 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
3844 mblk_t *tmp;
3845 mblk_t *smp;
3846 mblk_t *hmp; /* high prio pkts head */
3847 mblk_t *hmpt; /* high prio pkts tail */
3848 mblk_t *nmp; /* normal pkts head */
3849 mblk_t *nmpt; /* normal pkts tail */
3850 uint32_t n = 0;
3851 vsw_t *vswp = ldcp->ldc_vswp;
3852
3853 ASSERT(VSW_PRI_ETH_DEFINED(vswp));
3854 ASSERT(count != 0);
3855
3856 nmp = mp;
3857 nmpt = mpt;
3858
3859 /* gather any priority frames from the chain of packets */
3860 n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
3861
3862 /* transmit priority frames */
3863 tmp = hmp;
3864 while (tmp != NULL) {
3865 smp = tmp;
3866 tmp = tmp->b_next;
3867 smp->b_next = NULL;
3868 vsw_ldcsend_pkt(ldcp, smp);
3869 }
3870
3871 count -= n;
3872
3873 if (count == 0) {
3874 /* no normal data frames to process */
3875 return (0);
3876 }
3877
3878 return (vsw_ldctx(ldcp, nmp, nmpt, count));
3879 }
3880
3881 /*
3882 * Wrapper function to transmit normal frames over the channel.
3883 */
3884 static int
vsw_ldctx(void * arg,mblk_t * mp,mblk_t * mpt,uint32_t count)3885 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3886 {
3887 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
3888 mblk_t *tmp = NULL;
3889
3890 ASSERT(count != 0);
3891 /*
3892 * If the TX thread is enabled, then queue the
3893 * ordinary frames and signal the tx thread.
3894 */
3895 if (ldcp->tx_thread != NULL) {
3896
3897 mutex_enter(&ldcp->tx_thr_lock);
3898
3899 if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
3900 /*
3901 * If we reached queue limit,
3902 * do not queue new packets,
3903 * drop them.
3904 */
3905 ldcp->ldc_stats.tx_qfull += count;
3906 mutex_exit(&ldcp->tx_thr_lock);
3907 freemsgchain(mp);
3908 goto exit;
3909 }
3910 if (ldcp->tx_mhead == NULL) {
3911 ldcp->tx_mhead = mp;
3912 ldcp->tx_mtail = mpt;
3913 cv_signal(&ldcp->tx_thr_cv);
3914 } else {
3915 ldcp->tx_mtail->b_next = mp;
3916 ldcp->tx_mtail = mpt;
3917 }
3918 ldcp->tx_cnt += count;
3919 mutex_exit(&ldcp->tx_thr_lock);
3920 } else {
3921 while (mp != NULL) {
3922 tmp = mp->b_next;
3923 mp->b_next = mp->b_prev = NULL;
3924 (void) vsw_ldcsend(ldcp, mp, 1);
3925 mp = tmp;
3926 }
3927 }
3928
3929 exit:
3930 return (0);
3931 }
3932
3933 /*
3934 * This function transmits the frame in the payload of a raw data
3935 * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
3936 * send special frames with high priorities, without going through
3937 * the normal data path which uses descriptor ring mechanism.
3938 */
3939 static void
vsw_ldcsend_pkt(vsw_ldc_t * ldcp,mblk_t * mp)3940 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
3941 {
3942 vio_raw_data_msg_t *pkt;
3943 mblk_t *bp;
3944 mblk_t *nmp = NULL;
3945 vio_mblk_t *vmp;
3946 caddr_t dst;
3947 uint32_t mblksz;
3948 uint32_t size;
3949 uint32_t nbytes;
3950 int rv;
3951 vsw_t *vswp = ldcp->ldc_vswp;
3952 vgen_stats_t *statsp = &ldcp->ldc_stats;
3953
3954 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
3955 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
3956 (void) atomic_inc_32(&statsp->tx_pri_fail);
3957 DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
3958 "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
3959 ldcp->lane_out.lstate);
3960 goto send_pkt_exit;
3961 }
3962
3963 size = msgsize(mp);
3964
3965 /* frame size bigger than available payload len of raw data msg ? */
3966 if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
3967 (void) atomic_inc_32(&statsp->tx_pri_fail);
3968 DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3969 ldcp->ldc_id, size);
3970 goto send_pkt_exit;
3971 }
3972
3973 if (size < ETHERMIN)
3974 size = ETHERMIN;
3975
3976 /* alloc space for a raw data message */
3977 vmp = vio_allocb(vswp->pri_tx_vmp);
3978 if (vmp == NULL) {
3979 (void) atomic_inc_32(&statsp->tx_pri_fail);
3980 DWARN(vswp, "vio_allocb failed\n");
3981 goto send_pkt_exit;
3982 } else {
3983 nmp = vmp->mp;
3984 }
3985 pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
3986
3987 /* copy frame into the payload of raw data message */
3988 dst = (caddr_t)pkt->data;
3989 for (bp = mp; bp != NULL; bp = bp->b_cont) {
3990 mblksz = MBLKL(bp);
3991 bcopy(bp->b_rptr, dst, mblksz);
3992 dst += mblksz;
3993 }
3994
3995 vmp->state = VIO_MBLK_HAS_DATA;
3996
3997 /* setup the raw data msg */
3998 pkt->tag.vio_msgtype = VIO_TYPE_DATA;
3999 pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4000 pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4001 pkt->tag.vio_sid = ldcp->local_session;
4002 nbytes = VIO_PKT_DATA_HDRSIZE + size;
4003
4004 /* send the msg over ldc */
4005 rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4006 if (rv != 0) {
4007 (void) atomic_inc_32(&statsp->tx_pri_fail);
4008 DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4009 ldcp->ldc_id);
4010 goto send_pkt_exit;
4011 }
4012
4013 /* update stats */
4014 (void) atomic_inc_64(&statsp->tx_pri_packets);
4015 (void) atomic_add_64(&statsp->tx_pri_packets, size);
4016
4017 send_pkt_exit:
4018 if (nmp != NULL)
4019 freemsg(nmp);
4020 freemsg(mp);
4021 }
4022
4023 /*
4024 * Transmit the packet over the given LDC channel.
4025 *
4026 * The 'retries' argument indicates how many times a packet
4027 * is retried before it is dropped. Note, the retry is done
4028 * only for a resource related failure, for all other failures
4029 * the packet is dropped immediately.
4030 */
4031 static int
vsw_ldcsend(vsw_ldc_t * ldcp,mblk_t * mp,uint32_t retries)4032 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4033 {
4034 int i;
4035 int rc;
4036 int status = 0;
4037 vsw_port_t *port = ldcp->ldc_port;
4038 dring_info_t *dp = NULL;
4039 lane_t *lp = &ldcp->lane_out;
4040
4041 for (i = 0; i < retries; ) {
4042 /*
4043 * Send the message out using the appropriate
4044 * transmit function which will free mblock when it
4045 * is finished with it.
4046 */
4047 mutex_enter(&port->tx_lock);
4048 if (port->transmit != NULL) {
4049 status = (*port->transmit)(ldcp, mp);
4050 }
4051 if (status == LDC_TX_SUCCESS) {
4052 mutex_exit(&port->tx_lock);
4053 break;
4054 }
4055 i++; /* increment the counter here */
4056
4057 /* If its the last retry, then update the oerror */
4058 if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4059 ldcp->ldc_stats.oerrors++;
4060 }
4061 mutex_exit(&port->tx_lock);
4062
4063 if (status != LDC_TX_NORESOURCES) {
4064 /*
4065 * No retrying required for errors un-related
4066 * to resources.
4067 */
4068 break;
4069 }
4070 if (((dp = ldcp->lane_out.dringp) != NULL) &&
4071 ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4072 (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4073 ((VSW_VER_LT(ldcp, 1, 2) &&
4074 (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4075
4076 /* Need to reclaim in TxDring mode. */
4077 if (lp->dring_mode == VIO_TX_DRING) {
4078 rc = vsw_reclaim_dring(dp, dp->end_idx);
4079 }
4080
4081 } else {
4082 /*
4083 * If there is no dring or the xfer_mode is
4084 * set to DESC_MODE(ie., OBP), then simply break here.
4085 */
4086 break;
4087 }
4088
4089 /*
4090 * Delay only if none were reclaimed
4091 * and its not the last retry.
4092 */
4093 if ((rc == 0) && (i < retries)) {
4094 delay(drv_usectohz(vsw_ldc_tx_delay));
4095 }
4096 }
4097 freemsg(mp);
4098 return (status);
4099 }
4100
4101 /*
4102 * Send an in-band descriptor message over ldc.
4103 */
static int
vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vnet_ibnd_desc_t	ibnd_msg;
	vsw_private_desc_t	*priv_desc = NULL;
	dring_info_t		*dp = NULL;
	size_t			n, size = 0;
	caddr_t			bufp;
	mblk_t			*bp;
	int			idx, i;
	int			status = LDC_TX_SUCCESS;
	/* rate-limit flag: log "no descriptor" only once per exhaustion */
	static int		warn_msg = 1;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	ASSERT(mp != NULL);

	/* drop the frame unless the lane is active and the channel is up */
	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
		    __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * The dring here is as an internal buffer,
	 * rather than a transfer channel.
	 */
	if ((dp = ldcp->lane_out.dringp) == NULL) {
		DERR(vswp, "%s(%lld): no dring for outbound lane",
		    __func__, ldcp->ldc_id);
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/* reject frames larger than the negotiated MTU */
	size = msgsize(mp);
	if (size > (size_t)lp->mtu) {
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor in our buffer ring
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		if (warn_msg) {
			DERR(vswp, "%s(%lld): no descriptor available for ring "
			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
			warn_msg = 0;
		}

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		goto vsw_descrsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos "
		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
		/* re-arm the warning for the next exhaustion */
		warn_msg = 1;
	}

	/* copy data into the descriptor */
	bufp = priv_desc->datap;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	/* pad runt frames up to the ethernet minimum */
	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	/* create and send the in-band descp msg */
	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;

	/*
	 * Copy the mem cookies describing the data from the
	 * private region of the descriptor ring into the inband
	 * descriptor.
	 */
	for (i = 0; i < priv_desc->ncookies; i++) {
		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
		    sizeof (ldc_mem_cookie_t));
	}

	/* the descriptor index doubles as the handle the peer ACKs with */
	ibnd_msg.hdr.desc_handle = idx;
	ibnd_msg.ncookies = priv_desc->ncookies;
	ibnd_msg.nbytes = size;

	ldcp->ldc_stats.opackets++;
	ldcp->ldc_stats.obytes += size;

	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
	    sizeof (vnet_ibnd_desc_t), B_TRUE);

vsw_descrsend_free_exit:

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
	return (status);
}
4213
4214 static void
vsw_send_ver(void * arg)4215 vsw_send_ver(void *arg)
4216 {
4217 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
4218 vsw_t *vswp = ldcp->ldc_vswp;
4219 lane_t *lp = &ldcp->lane_out;
4220 vio_ver_msg_t ver_msg;
4221
4222 D1(vswp, "%s enter", __func__);
4223
4224 ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4225 ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4226 ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4227 ver_msg.tag.vio_sid = ldcp->local_session;
4228
4229 if (vsw_obp_ver_proto_workaround == B_FALSE) {
4230 ver_msg.ver_major = vsw_versions[0].ver_major;
4231 ver_msg.ver_minor = vsw_versions[0].ver_minor;
4232 } else {
4233 /* use the major,minor that we've ack'd */
4234 lane_t *lpi = &ldcp->lane_in;
4235 ver_msg.ver_major = lpi->ver_major;
4236 ver_msg.ver_minor = lpi->ver_minor;
4237 }
4238 ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4239
4240 lp->lstate |= VSW_VER_INFO_SENT;
4241 lp->ver_major = ver_msg.ver_major;
4242 lp->ver_minor = ver_msg.ver_minor;
4243
4244 DUMP_TAG(ver_msg.tag);
4245
4246 (void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4247
4248 D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4249 }
4250
4251 static void
vsw_send_attr(vsw_ldc_t * ldcp)4252 vsw_send_attr(vsw_ldc_t *ldcp)
4253 {
4254 vsw_t *vswp = ldcp->ldc_vswp;
4255 lane_t *lp = &ldcp->lane_out;
4256 vnet_attr_msg_t attr_msg;
4257
4258 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4259
4260 /*
4261 * Subtype is set to INFO by default
4262 */
4263 attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4264 attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4265 attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4266 attr_msg.tag.vio_sid = ldcp->local_session;
4267
4268 /* payload copied from default settings for lane */
4269 attr_msg.mtu = lp->mtu;
4270 attr_msg.addr_type = lp->addr_type;
4271 attr_msg.xfer_mode = lp->xfer_mode;
4272 attr_msg.ack_freq = lp->xfer_mode;
4273 attr_msg.options = lp->dring_mode;
4274
4275 READ_ENTER(&vswp->if_lockrw);
4276 attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4277 RW_EXIT(&vswp->if_lockrw);
4278
4279 ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4280
4281 DUMP_TAG(attr_msg.tag);
4282
4283 (void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4284
4285 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4286 }
4287
/*
 * Build and send a dring registration (VIO_DRING_REG) message for the
 * dring mode negotiated during the attribute phase, and start the
 * worker thread appropriate to that mode.  The registration message is
 * kmem-allocated by the create routine and freed here after it has been
 * sent.
 */
static void
vsw_send_dring_info(vsw_ldc_t *ldcp)
{
	int msgsize;
	void *msg;
	vsw_t *vswp = ldcp->ldc_vswp;
	vsw_port_t *port = ldcp->ldc_port;
	lane_t *lp = &ldcp->lane_out;
	vgen_stats_t *statsp = &ldcp->ldc_stats;

	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);

	/* dring mode has been negotiated in attr phase; save in stats */
	statsp->dring_mode = lp->dring_mode;

	if (lp->dring_mode == VIO_RX_DRING_DATA) {
		/*
		 * Change the transmit routine for RxDringData mode.
		 */
		port->transmit = vsw_dringsend_shm;
		msg = (void *) vsw_create_rx_dring_info(ldcp);
		if (msg == NULL) {
			return;
		}
		/* extended reg msg also carries the data-area cookies */
		msgsize =
		    VNET_DRING_REG_EXT_MSG_SIZE(lp->dringp->data_ncookies);
		/* dedicated worker to receive/process LDC messages */
		ldcp->rcv_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_rcv_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
		ldcp->rx_dringdata = vsw_process_dringdata_shm;
	} else {
		msg = (void *) vsw_create_tx_dring_info(ldcp);
		if (msg == NULL) {
			return;
		}
		msgsize = sizeof (vio_dring_reg_msg_t);
		/* dedicated worker to process incoming dring-data msgs */
		ldcp->msg_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_msg_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
		ldcp->rx_dringdata = vsw_process_dringdata;
	}

	lp->lstate |= VSW_DRING_INFO_SENT;
	DUMP_TAG_PTR((vio_msg_tag_t *)msg);
	(void) vsw_send_msg(ldcp, msg, msgsize, B_TRUE);
	kmem_free(msg, msgsize);

	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
}
4335
4336 static void
vsw_send_rdx(vsw_ldc_t * ldcp)4337 vsw_send_rdx(vsw_ldc_t *ldcp)
4338 {
4339 vsw_t *vswp = ldcp->ldc_vswp;
4340 vio_rdx_msg_t rdx_msg;
4341
4342 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4343
4344 rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4345 rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4346 rdx_msg.tag.vio_subtype_env = VIO_RDX;
4347 rdx_msg.tag.vio_sid = ldcp->local_session;
4348
4349 ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4350
4351 DUMP_TAG(rdx_msg.tag);
4352
4353 (void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4354
4355 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4356 }
4357
4358 /*
4359 * Remove the specified address from the list of address maintained
4360 * in this port node.
4361 */
4362 mcst_addr_t *
vsw_del_addr(uint8_t devtype,void * arg,uint64_t addr)4363 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4364 {
4365 vsw_t *vswp = NULL;
4366 vsw_port_t *port = NULL;
4367 mcst_addr_t *prev_p = NULL;
4368 mcst_addr_t *curr_p = NULL;
4369
4370 D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4371 __func__, devtype, addr);
4372
4373 if (devtype == VSW_VNETPORT) {
4374 port = (vsw_port_t *)arg;
4375 mutex_enter(&port->mca_lock);
4376 prev_p = curr_p = port->mcap;
4377 } else {
4378 vswp = (vsw_t *)arg;
4379 mutex_enter(&vswp->mca_lock);
4380 prev_p = curr_p = vswp->mcap;
4381 }
4382
4383 while (curr_p != NULL) {
4384 if (curr_p->addr == addr) {
4385 D2(NULL, "%s: address found", __func__);
4386 /* match found */
4387 if (prev_p == curr_p) {
4388 /* list head */
4389 if (devtype == VSW_VNETPORT)
4390 port->mcap = curr_p->nextp;
4391 else
4392 vswp->mcap = curr_p->nextp;
4393 } else {
4394 prev_p->nextp = curr_p->nextp;
4395 }
4396 break;
4397 } else {
4398 prev_p = curr_p;
4399 curr_p = curr_p->nextp;
4400 }
4401 }
4402
4403 if (devtype == VSW_VNETPORT)
4404 mutex_exit(&port->mca_lock);
4405 else
4406 mutex_exit(&vswp->mca_lock);
4407
4408 D1(NULL, "%s: exit", __func__);
4409
4410 return (curr_p);
4411 }
4412
4413 /*
4414 * Create a ring consisting of just a private portion and link
4415 * it into the list of rings for the outbound lane.
4416 *
4417 * These type of rings are used primarily for temporary data
4418 * storage (i.e. as data buffers).
4419 */
4420 void
vsw_create_privring(vsw_ldc_t * ldcp)4421 vsw_create_privring(vsw_ldc_t *ldcp)
4422 {
4423 dring_info_t *dp;
4424 vsw_t *vswp = ldcp->ldc_vswp;
4425
4426 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4427
4428 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4429 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4430 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4431 ldcp->lane_out.dringp = dp;
4432
4433 /* no public section */
4434 dp->pub_addr = NULL;
4435 dp->priv_addr = kmem_zalloc(
4436 (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP);
4437 dp->num_descriptors = vsw_num_descriptors;
4438
4439 if (vsw_setup_tx_dring(ldcp, dp)) {
4440 DERR(vswp, "%s: setup of ring failed", __func__);
4441 vsw_destroy_tx_dring(ldcp);
4442 return;
4443 }
4444
4445 /* haven't used any descriptors yet */
4446 dp->end_idx = 0;
4447 dp->restart_reqd = B_TRUE;
4448
4449 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4450 }
4451
4452 /*
4453 * Set the default lane attributes. These are copied into
4454 * the attr msg we send to our peer. If they are not acceptable
4455 * then (currently) the handshake ends.
4456 */
4457 static void
vsw_set_lane_attr(vsw_t * vswp,lane_t * lp)4458 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
4459 {
4460 bzero(lp, sizeof (lane_t));
4461
4462 READ_ENTER(&vswp->if_lockrw);
4463 ether_copy(&(vswp->if_addr), &(lp->addr));
4464 RW_EXIT(&vswp->if_lockrw);
4465
4466 lp->mtu = vswp->max_frame_size;
4467 lp->addr_type = ADDR_TYPE_MAC;
4468 lp->xfer_mode = VIO_DRING_MODE_V1_0;
4469 lp->ack_freq = 0; /* for shared mode */
4470 lp->seq_num = VNET_ISS;
4471 }
4472
4473 /*
4474 * Map the descriptor ring exported by the peer.
4475 */
4476 static dring_info_t *
vsw_map_dring(vsw_ldc_t * ldcp,void * pkt)4477 vsw_map_dring(vsw_ldc_t *ldcp, void *pkt)
4478 {
4479 dring_info_t *dp = NULL;
4480 lane_t *lp = &ldcp->lane_out;
4481
4482 if (lp->dring_mode == VIO_RX_DRING_DATA) {
4483 /*
4484 * In RxDringData mode, dring that we map in
4485 * becomes our transmit descriptor ring.
4486 */
4487 dp = vsw_map_tx_dring(ldcp, pkt);
4488 } else {
4489 /*
4490 * In TxDring mode, dring that we map in
4491 * becomes our receive descriptor ring.
4492 */
4493 dp = vsw_map_rx_dring(ldcp, pkt);
4494 }
4495 return (dp);
4496 }
4497
4498 /*
4499 * Common dring mapping function used in both TxDring and RxDringData modes.
4500 */
dring_info_t *
vsw_map_dring_cmn(vsw_ldc_t *ldcp, vio_dring_reg_msg_t *dring_pkt)
{
	int rv;
	dring_info_t *dp;
	ldc_mem_info_t minfo;
	vsw_t *vswp = ldcp->ldc_vswp;

	/*
	 * If the dring params are unacceptable then we NACK back.
	 * (Returning NULL signals the caller to NACK the registration.)
	 */
	if ((dring_pkt->num_descriptors == 0) ||
	    (dring_pkt->descriptor_size == 0) ||
	    (dring_pkt->ncookies != 1)) {
		DERR(vswp, "%s (%lld): invalid dring info",
		    __func__, ldcp->ldc_id);
		return (NULL);
	}

	/* zalloc'd: dring_handle stays NULL until the map succeeds */
	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);

	dp->num_descriptors = dring_pkt->num_descriptors;
	dp->descriptor_size = dring_pkt->descriptor_size;
	dp->options = dring_pkt->options;
	dp->dring_ncookies = dring_pkt->ncookies;

	/*
	 * Note: should only get one cookie. Enforced in
	 * the ldc layer.
	 */
	bcopy(&dring_pkt->cookie[0], &dp->dring_cookie[0],
	    sizeof (ldc_mem_cookie_t));

	/* map the peer's exported ring into our address space */
	rv = ldc_mem_dring_map(ldcp->ldc_handle, &dp->dring_cookie[0],
	    dp->dring_ncookies, dp->num_descriptors, dp->descriptor_size,
	    LDC_DIRECT_MAP, &(dp->dring_handle));
	if (rv != 0) {
		goto fail;
	}

	/* query the mapping for its local virtual address and map type */
	rv = ldc_mem_dring_info(dp->dring_handle, &minfo);
	if (rv != 0) {
		goto fail;
	}
	/* store the address of the ring */
	dp->pub_addr = minfo.vaddr;

	/* cache the dring mtype */
	dp->dring_mtype = minfo.mtype;

	/* no private section as we are importing */
	dp->priv_addr = NULL;

	/*
	 * Using simple mono increasing int for ident at the moment.
	 */
	dp->ident = ldcp->next_ident;
	ldcp->next_ident++;

	/*
	 * Acknowledge it; we send back a unique dring identifier that
	 * the sending side will use in future to refer to this
	 * descriptor ring.
	 */
	dring_pkt->dring_ident = dp->ident;

	return (dp);
fail:
	/* unmap only if the map itself succeeded; then free the struct */
	if (dp->dring_handle != NULL) {
		(void) ldc_mem_dring_unmap(dp->dring_handle);
	}
	kmem_free(dp, sizeof (*dp));
	return (NULL);
}
4575
4576 /*
4577 * Unmap the descriptor ring exported by the peer.
4578 */
4579 static void
vsw_unmap_dring(vsw_ldc_t * ldcp)4580 vsw_unmap_dring(vsw_ldc_t *ldcp)
4581 {
4582 lane_t *lane_out = &ldcp->lane_out;
4583
4584 if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
4585 vsw_unmap_tx_dring(ldcp);
4586 } else {
4587 vsw_unmap_rx_dring(ldcp);
4588 }
4589 }
4590
4591 /*
4592 * Map the shared memory data buffer area exported by the peer.
4593 * Used in RxDringData mode only.
4594 */
static int
vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt)
{
	int rv;
	vio_dring_reg_ext_msg_t *emsg;
	vio_dring_reg_msg_t *msg = pkt;
	uint8_t *buf = (uint8_t *)msg->cookie;
	vsw_t *vswp = ldcp->ldc_vswp;
	ldc_mem_info_t minfo;

	/* skip over dring cookies; the extended message follows them */
	ASSERT(msg->ncookies == 1);
	buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));

	emsg = (vio_dring_reg_ext_msg_t *)buf;
	/* reject a peer advertising more cookies than we can store */
	if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
		return (1);
	}

	/* save # of data area cookies */
	dp->data_ncookies = emsg->data_ncookies;

	/* save data area size */
	dp->data_sz = emsg->data_area_size;

	/*
	 * NOTE(review): on the failure paths below, the handle allocated
	 * here (and any mapping made with it) is not released locally;
	 * presumably the channel teardown / dring-unmap path cleans up
	 * dp->data_handle -- confirm against the callers.
	 */
	/* allocate ldc mem handle for data area */
	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &dp->data_handle);
	if (rv != 0) {
		cmn_err(CE_WARN, "ldc_mem_alloc_handle failed\n");
		DWARN(vswp, "%s (%lld) ldc_mem_alloc_handle() failed: %d\n",
		    __func__, ldcp->ldc_id, rv);
		return (1);
	}

	/* map the data area (read-only, direct map) */
	rv = ldc_mem_map(dp->data_handle, emsg->data_cookie,
	    emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_R,
	    (caddr_t *)&dp->data_addr, NULL);
	if (rv != 0) {
		cmn_err(CE_WARN, "ldc_mem_map failed\n");
		DWARN(vswp, "%s (%lld) ldc_mem_map() failed: %d\n",
		    __func__, ldcp->ldc_id, rv);
		return (1);
	}

	/* get the map info */
	rv = ldc_mem_info(dp->data_handle, &minfo);
	if (rv != 0) {
		cmn_err(CE_WARN, "ldc_mem_info failed\n");
		DWARN(vswp, "%s (%lld) ldc_mem_info() failed: %d\n",
		    __func__, ldcp->ldc_id, rv);
		return (1);
	}

	/* only a direct mapping is usable for the shared data area */
	if (minfo.mtype != LDC_DIRECT_MAP) {
		DWARN(vswp, "%s (%lld) mtype(%d) is not direct map\n",
		    __func__, ldcp->ldc_id, minfo.mtype);
		return (1);
	}

	/* allocate memory for data area cookies */
	dp->data_cookie = kmem_zalloc(emsg->data_ncookies *
	    sizeof (ldc_mem_cookie_t), KM_SLEEP);

	/* save data area cookies */
	bcopy(emsg->data_cookie, dp->data_cookie,
	    emsg->data_ncookies * sizeof (ldc_mem_cookie_t));

	/* 0 == success; any failure above returns 1 */
	return (0);
}
4665
4666 /*
4667 * Reset and free all the resources associated with the channel.
4668 */
4669 static void
vsw_free_lane_resources(vsw_ldc_t * ldcp,uint64_t dir)4670 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
4671 {
4672 lane_t *lp;
4673
4674 D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
4675
4676 if (dir == INBOUND) {
4677 D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
4678 " of channel %lld", __func__, ldcp->ldc_id);
4679 lp = &ldcp->lane_in;
4680 } else {
4681 D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
4682 " of channel %lld", __func__, ldcp->ldc_id);
4683 lp = &ldcp->lane_out;
4684 }
4685
4686 lp->lstate = VSW_LANE_INACTIV;
4687 lp->seq_num = VNET_ISS;
4688
4689 if (dir == INBOUND) {
4690 /* Unmap the remote dring which is imported from the peer */
4691 vsw_unmap_dring(ldcp);
4692 } else {
4693 /* Destroy the local dring which is exported to the peer */
4694 vsw_destroy_dring(ldcp);
4695 }
4696
4697 D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
4698 }
4699
4700 /*
4701 * Destroy the descriptor ring.
4702 */
4703 static void
vsw_destroy_dring(vsw_ldc_t * ldcp)4704 vsw_destroy_dring(vsw_ldc_t *ldcp)
4705 {
4706 lane_t *lp = &ldcp->lane_out;
4707
4708 if (lp->dring_mode == VIO_RX_DRING_DATA) {
4709 vsw_destroy_rx_dring(ldcp);
4710 } else {
4711 vsw_destroy_tx_dring(ldcp);
4712 }
4713 }
4714
4715 /*
4716 * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
4717 * This thread is woken up by the vsw_portsend to transmit
4718 * packets.
4719 */
static void
vsw_ldc_tx_worker(void *arg)
{
	callb_cpr_t cprinfo;
	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
	vsw_t *vswp = ldcp->ldc_vswp;
	mblk_t *mp;
	mblk_t *tmp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	/* register with CPR so suspend/resume can account for this thread */
	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
	    "vnet_tx_thread");
	mutex_enter(&ldcp->tx_thr_lock);
	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {

		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * Wait until the data is received or a stop
		 * request is received.
		 */
		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
		    (ldcp->tx_mhead == NULL)) {
			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)

		/*
		 * First process the stop request.
		 */
		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
			D2(vswp, "%s(%lld):tx thread stopped\n",
			    __func__, ldcp->ldc_id);
			break;
		}
		/* detach the whole chain so we can transmit unlocked */
		mp = ldcp->tx_mhead;
		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
		ldcp->tx_cnt = 0;
		mutex_exit(&ldcp->tx_thr_lock);
		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
		    __func__, ldcp->ldc_id);
		/* send each mblk of the chain individually */
		while (mp != NULL) {
			tmp = mp->b_next;
			mp->b_next = mp->b_prev = NULL;
			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
			mp = tmp;
		}
		mutex_enter(&ldcp->tx_thr_lock);
	}

	/*
	 * Update the run status and wakeup the thread that
	 * has sent the stop request.
	 */
	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
	ldcp->tx_thread = NULL;
	/* CALLB_CPR_EXIT drops tx_thr_lock on the way out */
	CALLB_CPR_EXIT(&cprinfo);
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
	thread_exit();
}
4779
/* vsw_stop_tx_thread -- Co-ordinate with the transmit thread to stop it */
static void
vsw_stop_tx_thread(vsw_ldc_t *ldcp)
{
	kt_did_t tid = 0;
	vsw_t *vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	/*
	 * Send a stop request by setting the stop flag and
	 * wait until the transmit thread stops.  (The worker clears
	 * VSW_WTHR_STOP and NULLs ldcp->tx_thread on its way out.)
	 */
	mutex_enter(&ldcp->tx_thr_lock);
	if (ldcp->tx_thread != NULL) {
		tid = ldcp->tx_thread->t_did;
		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
		cv_signal(&ldcp->tx_thr_cv);
	}
	mutex_exit(&ldcp->tx_thr_lock);

	/* join outside the lock so the worker can re-acquire it and exit */
	if (tid != 0) {
		thread_join(tid);
	}

	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
}
4806
4807 static int
vsw_mapin_avail(vsw_ldc_t * ldcp)4808 vsw_mapin_avail(vsw_ldc_t *ldcp)
4809 {
4810 int rv;
4811 ldc_info_t info;
4812 uint64_t mapin_sz_req;
4813 uint64_t dblk_sz;
4814 vsw_t *vswp = ldcp->ldc_vswp;
4815
4816 rv = ldc_info(ldcp->ldc_handle, &info);
4817 if (rv != 0) {
4818 return (B_FALSE);
4819 }
4820
4821 dblk_sz = RXDRING_DBLK_SZ(vswp->max_frame_size);
4822 mapin_sz_req = (VSW_RXDRING_NRBUFS * dblk_sz);
4823
4824 if (info.direct_map_size_max >= mapin_sz_req) {
4825 return (B_TRUE);
4826 }
4827
4828 return (B_FALSE);
4829 }
4830
4831 /*
4832 * Debugging routines
4833 */
4834 static void
display_state(void)4835 display_state(void)
4836 {
4837 vsw_t *vswp;
4838 vsw_port_list_t *plist;
4839 vsw_port_t *port;
4840 vsw_ldc_t *ldcp;
4841 extern vsw_t *vsw_head;
4842
4843 cmn_err(CE_NOTE, "***** system state *****");
4844
4845 for (vswp = vsw_head; vswp; vswp = vswp->next) {
4846 plist = &vswp->plist;
4847 READ_ENTER(&plist->lockrw);
4848 cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
4849 vswp->instance, plist->num_ports);
4850
4851 for (port = plist->head; port != NULL; port = port->p_next) {
4852 cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
4853 port->p_instance, port->num_ldcs);
4854 ldcp = port->ldcp;
4855 cmn_err(CE_CONT, "chan %lu : dev %d : "
4856 "status %d : phase %u\n",
4857 ldcp->ldc_id, ldcp->dev_class,
4858 ldcp->ldc_status, ldcp->hphase);
4859 cmn_err(CE_CONT, "chan %lu : lsession %lu : "
4860 "psession %lu\n", ldcp->ldc_id,
4861 ldcp->local_session, ldcp->peer_session);
4862
4863 cmn_err(CE_CONT, "Inbound lane:\n");
4864 display_lane(&ldcp->lane_in);
4865 cmn_err(CE_CONT, "Outbound lane:\n");
4866 display_lane(&ldcp->lane_out);
4867 }
4868 RW_EXIT(&plist->lockrw);
4869 }
4870 cmn_err(CE_NOTE, "***** system state *****");
4871 }
4872
4873 static void
display_lane(lane_t * lp)4874 display_lane(lane_t *lp)
4875 {
4876 dring_info_t *drp = lp->dringp;
4877
4878 cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
4879 lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
4880 cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
4881 lp->addr_type, lp->addr, lp->xfer_mode);
4882 cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
4883
4884 cmn_err(CE_CONT, "Dring info:\n");
4885 cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
4886 drp->num_descriptors, drp->descriptor_size);
4887 cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->dring_handle);
4888 cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
4889 (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
4890 cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
4891 drp->ident, drp->end_idx);
4892 display_ring(drp);
4893 }
4894
4895 static void
display_ring(dring_info_t * dringp)4896 display_ring(dring_info_t *dringp)
4897 {
4898 uint64_t i;
4899 uint64_t priv_count = 0;
4900 uint64_t pub_count = 0;
4901 vnet_public_desc_t *pub_addr = NULL;
4902 vsw_private_desc_t *priv_addr = NULL;
4903
4904 for (i = 0; i < vsw_num_descriptors; i++) {
4905 if (dringp->pub_addr != NULL) {
4906 pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
4907
4908 if (pub_addr->hdr.dstate == VIO_DESC_FREE)
4909 pub_count++;
4910 }
4911
4912 if (dringp->priv_addr != NULL) {
4913 priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
4914
4915 if (priv_addr->dstate == VIO_DESC_FREE)
4916 priv_count++;
4917 }
4918 }
4919 cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
4920 i, priv_count, pub_count);
4921 }
4922
4923 static void
dump_flags(uint64_t state)4924 dump_flags(uint64_t state)
4925 {
4926 int i;
4927
4928 typedef struct flag_name {
4929 int flag_val;
4930 char *flag_name;
4931 } flag_name_t;
4932
4933 flag_name_t flags[] = {
4934 VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
4935 VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
4936 VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
4937 VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
4938 VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
4939 VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
4940 VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
4941 VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
4942 VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
4943 VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
4944 VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
4945 VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
4946 VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
4947 VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
4948 VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
4949 VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
4950 VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
4951 VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
4952 VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
4953 VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
4954 VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
4955 VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
4956 VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
4957 VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
4958 VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
4959 VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
4960 VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
4961 VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
4962 VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
4963 VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
4964 VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
4965
4966 DERR(NULL, "DUMP_FLAGS: %llx\n", state);
4967 for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
4968 if (state & flags[i].flag_val)
4969 DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
4970 }
4971 }
4972