1 /*
2 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 * Copyright (c) 2009 HNR Consulting. All rights reserved.
6 *
7 * This software is available to you under a choice of one of two
8 * licenses. You may choose to be licensed under the terms of the GNU
9 * General Public License (GPL) Version 2, available from the file
10 * COPYING in the main directory of this source tree, or the
11 * OpenIB.org BSD license below:
12 *
13 * Redistribution and use in source and binary forms, with or
14 * without modification, are permitted provided that the following
15 * conditions are met:
16 *
17 * - Redistributions of source code must retain the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer.
20 *
21 * - Redistributions in binary form must reproduce the above
22 * copyright notice, this list of conditions and the following
23 * disclaimer in the documentation and/or other materials
24 * provided with the distribution.
25 *
26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 * SOFTWARE.
34 *
35 */
36
37 /*
38 * Abstract:
39 * Implementation of osm_switch_t.
40 * This object represents an Infiniband switch.
41 * This object is part of the opensm family of objects.
42 */
43
44 #if HAVE_CONFIG_H
45 # include <config.h>
46 #endif /* HAVE_CONFIG_H */
47
48 #include <stdlib.h>
49 #include <string.h>
50 #include <complib/cl_math.h>
51 #include <iba/ib_types.h>
52 #include <opensm/osm_file_ids.h>
53 #define FILE_ID OSM_FILE_SWITCH_C
54 #include <opensm/osm_switch.h>
55
/*
 * One candidate egress port recorded by osm_switch_recommend_path()
 * while it scans the least-hop ports of a switch.  The records are
 * re-walked in a shifted order when port shifting is requested.
 */
struct switch_port_path {
	/* switch port number of the candidate */
	uint8_t port_num;
	/* path count read from the port profile of this port */
	uint32_t path_count;
	/* non-zero if the remote system GUID was already used (LMC routing) */
	int found_sys_guid;
	/* non-zero if the remote node GUID was already used (LMC routing) */
	int found_node_guid;
	/* forwarded_to counter of the matching remote node record, else 0 */
	uint32_t forwarded_to;
};
63
osm_switch_set_hops(IN osm_switch_t * p_sw,IN uint16_t lid_ho,IN uint8_t port_num,IN uint8_t num_hops)64 cl_status_t osm_switch_set_hops(IN osm_switch_t * p_sw, IN uint16_t lid_ho,
65 IN uint8_t port_num, IN uint8_t num_hops)
66 {
67 if (!lid_ho || lid_ho > p_sw->max_lid_ho)
68 return -1;
69 if (port_num >= p_sw->num_ports)
70 return -1;
71 if (!p_sw->hops[lid_ho]) {
72 p_sw->hops[lid_ho] = malloc(p_sw->num_ports);
73 if (!p_sw->hops[lid_ho])
74 return -1;
75 memset(p_sw->hops[lid_ho], OSM_NO_PATH, p_sw->num_ports);
76 }
77
78 p_sw->hops[lid_ho][port_num] = num_hops;
79 if (p_sw->hops[lid_ho][0] > num_hops)
80 p_sw->hops[lid_ho][0] = num_hops;
81
82 return 0;
83 }
84
/*
 * Destroy a switch object and release everything it owns: the multicast
 * table, the port profiles, the search ordering array, both forwarding
 * tables and every row of the hops matrix.
 *
 * pp_sw must point to the caller's switch pointer; that pointer is set
 * to NULL on return.  A NULL *pp_sw is accepted and is a no-op.
 */
void osm_switch_delete(IN OUT osm_switch_t ** pp_sw)
{
	osm_switch_t *p_sw = *pp_sw;
	unsigned i;

	/* nothing to release */
	if (!p_sw)
		return;

	osm_mcast_tbl_destroy(&p_sw->mcast_tbl);

	/* free(NULL) is a no-op, so unset members need no guard */
	free(p_sw->p_prof);
	free(p_sw->search_ordering_ports);
	free(p_sw->lft);
	free(p_sw->new_lft);

	if (p_sw->hops) {
		for (i = 0; i < p_sw->num_hops; i++)
			free(p_sw->hops[i]);
		free(p_sw->hops);
	}

	free(p_sw);
	*pp_sw = NULL;
}
108
osm_switch_new(IN osm_node_t * p_node,IN const osm_madw_t * p_madw)109 osm_switch_t *osm_switch_new(IN osm_node_t * p_node,
110 IN const osm_madw_t * p_madw)
111 {
112 osm_switch_t *p_sw;
113 ib_switch_info_t *p_si;
114 ib_smp_t *p_smp;
115 uint8_t num_ports;
116 uint32_t port_num;
117
118 CL_ASSERT(p_madw);
119 CL_ASSERT(p_node);
120
121 p_smp = osm_madw_get_smp_ptr(p_madw);
122 p_si = ib_smp_get_payload_ptr(p_smp);
123 num_ports = osm_node_get_num_physp(p_node);
124
125 CL_ASSERT(p_smp->attr_id == IB_MAD_ATTR_SWITCH_INFO);
126
127 if (!p_si->lin_cap) /* The switch doesn't support LFT */
128 return NULL;
129
130 p_sw = malloc(sizeof(*p_sw));
131 if (!p_sw)
132 return NULL;
133
134 memset(p_sw, 0, sizeof(*p_sw));
135
136 p_sw->p_node = p_node;
137 p_sw->switch_info = *p_si;
138 p_sw->num_ports = num_ports;
139 p_sw->need_update = 2;
140
141 p_sw->p_prof = malloc(sizeof(*p_sw->p_prof) * num_ports);
142 if (!p_sw->p_prof)
143 goto err;
144
145 memset(p_sw->p_prof, 0, sizeof(*p_sw->p_prof) * num_ports);
146
147 osm_mcast_tbl_init(&p_sw->mcast_tbl, osm_node_get_num_physp(p_node),
148 cl_ntoh16(p_si->mcast_cap));
149
150 for (port_num = 0; port_num < num_ports; port_num++)
151 osm_port_prof_construct(&p_sw->p_prof[port_num]);
152
153 return p_sw;
154
155 err:
156 osm_switch_delete(&p_sw);
157 return NULL;
158 }
159
osm_switch_get_lft_block(IN const osm_switch_t * p_sw,IN uint16_t block_id,OUT uint8_t * p_block)160 boolean_t osm_switch_get_lft_block(IN const osm_switch_t * p_sw,
161 IN uint16_t block_id, OUT uint8_t * p_block)
162 {
163 uint16_t base_lid_ho = block_id * IB_SMP_DATA_SIZE;
164
165 CL_ASSERT(p_sw);
166 CL_ASSERT(p_block);
167
168 if (base_lid_ho > p_sw->max_lid_ho)
169 return FALSE;
170
171 CL_ASSERT(base_lid_ho + IB_SMP_DATA_SIZE - 1 <= IB_LID_UCAST_END_HO);
172 memcpy(p_block, &(p_sw->new_lft[base_lid_ho]), IB_SMP_DATA_SIZE);
173 return TRUE;
174 }
175
176 static struct osm_remote_node *
switch_find_guid_common(IN const osm_switch_t * p_sw,IN struct osm_remote_guids_count * r,IN uint8_t port_num,IN int find_sys_guid,IN int find_node_guid)177 switch_find_guid_common(IN const osm_switch_t * p_sw,
178 IN struct osm_remote_guids_count *r,
179 IN uint8_t port_num, IN int find_sys_guid,
180 IN int find_node_guid)
181 {
182 struct osm_remote_node *p_remote_guid = NULL;
183 osm_physp_t *p_physp;
184 osm_physp_t *p_rem_physp;
185 osm_node_t *p_rem_node;
186 uint64_t sys_guid;
187 uint64_t node_guid;
188 unsigned int i;
189
190 CL_ASSERT(p_sw);
191
192 if (!r)
193 goto out;
194
195 p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
196 if (!p_physp)
197 goto out;
198
199 p_rem_physp = osm_physp_get_remote(p_physp);
200 p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
201 sys_guid = p_rem_node->node_info.sys_guid;
202 node_guid = p_rem_node->node_info.node_guid;
203
204 for (i = 0; i < r->count; i++) {
205 if ((!find_sys_guid
206 || r->guids[i].node->node_info.sys_guid == sys_guid)
207 && (!find_node_guid
208 || r->guids[i].node->node_info.node_guid == node_guid)) {
209 p_remote_guid = &r->guids[i];
210 break;
211 }
212 }
213
214 out:
215 return p_remote_guid;
216 }
217
218 static struct osm_remote_node *
switch_find_sys_guid_count(IN const osm_switch_t * p_sw,IN struct osm_remote_guids_count * r,IN uint8_t port_num)219 switch_find_sys_guid_count(IN const osm_switch_t * p_sw,
220 IN struct osm_remote_guids_count *r,
221 IN uint8_t port_num)
222 {
223 return switch_find_guid_common(p_sw, r, port_num, 1, 0);
224 }
225
226 static struct osm_remote_node *
switch_find_node_guid_count(IN const osm_switch_t * p_sw,IN struct osm_remote_guids_count * r,IN uint8_t port_num)227 switch_find_node_guid_count(IN const osm_switch_t * p_sw,
228 IN struct osm_remote_guids_count *r,
229 IN uint8_t port_num)
230 {
231 return switch_find_guid_common(p_sw, r, port_num, 0, 1);
232 }
233
/*
 * Recommend the port of p_sw through which traffic for lid_ho should
 * leave the switch.
 *
 * p_sw            - switch being routed
 * p_port          - destination port; with routing_for_lmc its priv
 *                   member is read as a struct osm_remote_guids_count
 * lid_ho          - destination LID (host order); used to look up an
 *                   existing forwarding table entry
 * start_from      - index at which the scan over the ports starts
 * ignore_existing - TRUE: do not reuse an existing forwarding entry
 * routing_for_lmc - TRUE: prefer ports leading to systems / nodes not
 *                   yet used for other LIDs of the same target
 * dor             - TRUE: only consider ports leading to the first
 *                   remote node found
 * port_shifting   - TRUE: re-evaluate the candidates in a shifted order
 * scatter_ports   - non-zero: pick randomly among the ports with the
 *                   lowest path count
 * lft_enum        - forwarding table selector passed to
 *                   osm_switch_get_port_by_lid()
 *
 * Returns 0 when the target is this switch itself, the attached port
 * number when the target hangs directly off this switch, OSM_NO_PATH
 * when no usable port exists, and the selected port number otherwise.
 */
uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
				  IN osm_port_t * p_port, IN uint16_t lid_ho,
				  IN unsigned start_from,
				  IN boolean_t ignore_existing,
				  IN boolean_t routing_for_lmc,
				  IN boolean_t dor,
				  IN boolean_t port_shifting,
				  IN uint32_t scatter_ports,
				  IN osm_lft_type_enum lft_enum)
{
	/*
	   We support an enhanced LMC aware routing mode:
	   In the case of LMC > 0, we can track the remote side
	   system and node for all of the lids of the target
	   and try and avoid routing again through the same
	   system / node.

	   Assume if routing_for_lmc is true that this procedure was
	   provided the tracking array and counter via p_port->priv,
	   and we can conduct this algorithm.
	 */
	uint16_t base_lid;
	uint8_t hops;
	uint8_t least_hops;
	uint8_t port_num;
	uint8_t num_ports;
	uint32_t least_paths = 0xFFFFFFFF;
	unsigned i;
	/*
	   The following will track the least paths if the
	   route should go through a new system/node
	 */
	uint32_t least_paths_other_sys = 0xFFFFFFFF;
	uint32_t least_paths_other_nodes = 0xFFFFFFFF;
	uint32_t least_forwarded_to = 0xFFFFFFFF;
	uint32_t check_count;
	uint8_t best_port = 0;
	/*
	   These vars track the best port if it connects to
	   not used system/node.
	 */
	uint8_t best_port_other_sys = 0;
	uint8_t best_port_other_node = 0;
	boolean_t port_found = FALSE;
	osm_physp_t *p_physp;
	osm_physp_t *p_rem_physp;
	osm_node_t *p_rem_node;
	osm_node_t *p_rem_node_first = NULL;
	struct osm_remote_node *p_remote_guid = NULL;
	struct osm_remote_node null_remote_node = {NULL, 0, 0};
	struct switch_port_path port_paths[IB_NODE_NUM_PORTS_MAX];
	unsigned int port_paths_total_paths = 0;
	unsigned int port_paths_count = 0;
	uint8_t scatter_possible_ports[IB_NODE_NUM_PORTS_MAX];
	unsigned int scatter_possible_ports_count = 0;
	int found_sys_guid = 0;
	int found_node_guid = 0;

	CL_ASSERT(lid_ho > 0);

	/*
	   Hop counts are looked up by the base LID of the switch that
	   owns the target: the target's own node if it has a switch
	   object, otherwise the switch on the other end of its link.
	 */
	if (p_port->p_node->sw) {
		if (p_port->p_node->sw == p_sw)
			return 0;
		base_lid = osm_port_get_base_lid(p_port);
	} else {
		p_physp = p_port->p_physp;
		if (!p_physp || !p_physp->p_remote_physp ||
		    !p_physp->p_remote_physp->p_node->sw)
			return OSM_NO_PATH;

		if (p_physp->p_remote_physp->p_node->sw == p_sw)
			return p_physp->p_remote_physp->port_num;
		base_lid =
		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
	}
	base_lid = cl_ntoh16(base_lid);

	num_ports = p_sw->num_ports;

	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
	if (least_hops == OSM_NO_PATH)
		return OSM_NO_PATH;

	/*
	   First, inquire with the forwarding table for an existing
	   route. If one is found, honor it only if:
	   1. the ignore existing flag is not set,
	   2. the physical port is a valid one and healthy,
	   3. the physical port has a remote port (the link is up),
	   4. the port has min-hops to the target (avoid loops).
	 */
	if (!ignore_existing) {
		port_num = osm_switch_get_port_by_lid(p_sw, lid_ho, lft_enum);

		if (port_num != OSM_NO_PATH) {
			CL_ASSERT(port_num < num_ports);

			/*
			   Don't be too trusting of the current forwarding table!
			   Verify that the port number is legal and that the
			   LID is reachable through this port.

			   The range test is repeated at run time because
			   CL_ASSERT may be compiled out; an entry naming a
			   port this switch does not have is simply ignored.
			 */
			p_physp = NULL;
			if (port_num < num_ports)
				p_physp =
				    osm_node_get_physp_ptr(p_sw->p_node,
							   port_num);

			if (p_physp && osm_physp_is_healthy(p_physp) &&
			    osm_physp_get_remote(p_physp)) {
				hops =
				    osm_switch_get_hop_count(p_sw, base_lid,
							     port_num);
				/*
				   If we aren't using pre-defined user routes
				   function, then we need to make sure that the
				   current path is the minimum one. In case of
				   having such a user function - this check will
				   not be done, and the old routing will be used.
				   Note: This means that it is the user's job to
				   clean all data in the forwarding tables that
				   he wants to be overridden by the minimum
				   hop function.
				 */
				if (hops == least_hops)
					return port_num;
			}
		}
	}

	/*
	   This algorithm selects a port based on a static load balanced
	   selection across equal hop-count ports.
	   There is lots of room for improved sophistication here,
	   possibly guided by user configuration info.
	 */

	/*
	   OpenSM routing is "local" - not considering a full lid to lid
	   path. As such we can not guarantee a path will not loop if we
	   do not always follow least hops.
	   So we must abort if not least hops.
	 */

	/* port number starts with one and num_ports is 1 + num phys ports */
	for (i = start_from; i < start_from + num_ports; i++) {
		port_num = osm_switch_get_dimn_port(p_sw, i % num_ports);
		if (!port_num ||
		    osm_switch_get_hop_count(p_sw, base_lid, port_num) !=
		    least_hops)
			continue;

		/* let us make sure it is not down or unhealthy */
		p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
		if (!p_physp || !osm_physp_is_healthy(p_physp) ||
		    /*
		       we require all - non sma ports to be linked
		       to be routed through
		     */
		    !osm_physp_get_remote(p_physp))
			continue;

		/*
		   We located a least-hop port, possibly one of many.
		   For this port, check the running total count of
		   the number of paths through this port. Select
		   the port routing the least number of paths.
		 */
		check_count =
		    osm_port_prof_path_count_get(&p_sw->p_prof[port_num]);

		if (dor) {
			/* Get the Remote Node */
			p_rem_physp = osm_physp_get_remote(p_physp);
			p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
			/* use the first dimension, but spread traffic
			 * out among the group of ports representing
			 * that dimension */
			if (!p_rem_node_first)
				p_rem_node_first = p_rem_node;
			else if (p_rem_node != p_rem_node_first)
				continue;
			if (routing_for_lmc) {
				struct osm_remote_guids_count *r = p_port->priv;
				uint8_t rem_port =
				    osm_physp_get_port_num(p_rem_physp);
				unsigned int j;

				/*
				   Default to the empty record; it is replaced
				   if the tracking array already holds this
				   remote node / port pair.  A missing tracking
				   array is treated like an empty one.
				 */
				p_remote_guid = &null_remote_node;
				if (r) {
					for (j = 0; j < r->count; j++) {
						if (r->guids[j].node == p_rem_node
						    && r->guids[j].port == rem_port) {
							p_remote_guid = &r->guids[j];
							break;
						}
					}
				}
			}
			/*
			   Advanced LMC routing requires tracking of the
			   best port by the node connected to the other side of
			   it.
			 */
		} else if (routing_for_lmc) {
			/* Is the sys guid already used ? */
			p_remote_guid = switch_find_sys_guid_count(p_sw,
								   p_port->priv,
								   port_num);

			/* If not update the least hops for this case */
			if (!p_remote_guid) {
				if (check_count < least_paths_other_sys) {
					least_paths_other_sys = check_count;
					best_port_other_sys = port_num;
					least_forwarded_to = 0;
				}
				found_sys_guid = 0;
				/*
				   The node lookup is skipped for an unused
				   system; clear the flag so the value recorded
				   for this port is not left over from the
				   previous port.
				 */
				found_node_guid = 0;
			} else {	/* same sys found - try node */

				/* Else is the node guid already used ? */
				p_remote_guid =
				    switch_find_node_guid_count(p_sw,
								p_port->priv,
								port_num);

				/* If not update the least hops for this case */
				if (!p_remote_guid
				    && check_count < least_paths_other_nodes) {
					least_paths_other_nodes = check_count;
					best_port_other_node = port_num;
					least_forwarded_to = 0;
				}
				/* else prior sys and node guid already used */

				if (!p_remote_guid)
					found_node_guid = 0;
				else
					found_node_guid = 1;
				found_sys_guid = 1;
			}	/* same sys found */
		}

		/* remember this candidate for the port shifting pass */
		port_paths[port_paths_count].port_num = port_num;
		port_paths[port_paths_count].path_count = check_count;
		if (routing_for_lmc) {
			port_paths[port_paths_count].found_sys_guid =
			    found_sys_guid;
			port_paths[port_paths_count].found_node_guid =
			    found_node_guid;
		}
		if (routing_for_lmc && p_remote_guid)
			port_paths[port_paths_count].forwarded_to =
			    p_remote_guid->forwarded_to;
		else
			port_paths[port_paths_count].forwarded_to = 0;
		port_paths_total_paths += check_count;
		port_paths_count++;

		/* routing for LMC mode */
		/*
		   the count is min but also lower then the max subscribed
		 */
		if (check_count < least_paths) {
			port_found = TRUE;
			best_port = port_num;
			least_paths = check_count;
			/* a new minimum restarts the scatter candidate list */
			scatter_possible_ports_count = 0;
			scatter_possible_ports[scatter_possible_ports_count++] =
			    port_num;
			if (routing_for_lmc
			    && p_remote_guid
			    && p_remote_guid->forwarded_to < least_forwarded_to)
				least_forwarded_to =
				    p_remote_guid->forwarded_to;
		} else if (scatter_ports && check_count == least_paths) {
			scatter_possible_ports[scatter_possible_ports_count++] =
			    port_num;
		} else if (routing_for_lmc
			   && p_remote_guid
			   && check_count == least_paths
			   && p_remote_guid->forwarded_to < least_forwarded_to) {
			least_forwarded_to = p_remote_guid->forwarded_to;
			best_port = port_num;
		}
	}

	if (port_found == FALSE)
		return OSM_NO_PATH;

	if (port_shifting && port_paths_count) {
		/* In the port_paths[] array, we now have all the ports that we
		 * can route out of.  Using some shifting math below, possibly
		 * select a different one so that lids won't align in LFTs
		 *
		 * If lmc > 0, we need to loop through these ports to find the
		 * least_forwarded_to port, best_port_other_sys, and
		 * best_port_other_node just like before but through the different
		 * ordering.
		 */

		least_paths = 0xFFFFFFFF;
		least_paths_other_sys = 0xFFFFFFFF;
		least_paths_other_nodes = 0xFFFFFFFF;
		least_forwarded_to = 0xFFFFFFFF;
		best_port = 0;
		best_port_other_sys = 0;
		best_port_other_node = 0;

		for (i = 0; i < port_paths_count; i++) {
			unsigned int idx;

			/* start at the average path count, wrap around */
			idx = (port_paths_total_paths / port_paths_count + i)
			    % port_paths_count;

			if (routing_for_lmc) {
				if (!port_paths[idx].found_sys_guid
				    && port_paths[idx].path_count <
				    least_paths_other_sys) {
					least_paths_other_sys =
					    port_paths[idx].path_count;
					best_port_other_sys =
					    port_paths[idx].port_num;
					least_forwarded_to = 0;
				} else if (!port_paths[idx].found_node_guid
					   && port_paths[idx].path_count <
					   least_paths_other_nodes) {
					least_paths_other_nodes =
					    port_paths[idx].path_count;
					best_port_other_node =
					    port_paths[idx].port_num;
					least_forwarded_to = 0;
				}
			}

			if (port_paths[idx].path_count < least_paths) {
				best_port = port_paths[idx].port_num;
				least_paths = port_paths[idx].path_count;
				if (routing_for_lmc
				    && (port_paths[idx].found_sys_guid
					|| port_paths[idx].found_node_guid)
				    && port_paths[idx].forwarded_to <
				    least_forwarded_to)
					least_forwarded_to =
					    port_paths[idx].forwarded_to;
			} else if (routing_for_lmc
				   && (port_paths[idx].found_sys_guid
				       || port_paths[idx].found_node_guid)
				   && port_paths[idx].path_count == least_paths
				   && port_paths[idx].forwarded_to <
				   least_forwarded_to) {
				least_forwarded_to =
				    port_paths[idx].forwarded_to;
				best_port = port_paths[idx].port_num;
			}
		}
	}

	/*
	   if we are in enhanced routing mode and the best port is not
	   the local port 0
	 */
	if (routing_for_lmc && best_port && !scatter_ports) {
		/* Select the least hop port of the non used sys first */
		if (best_port_other_sys)
			best_port = best_port_other_sys;
		else if (best_port_other_node)
			best_port = best_port_other_node;
	} else if (scatter_ports) {
		/*
		 * There is some danger that this random could "rebalance" the routes
		 * every time, to combat this there is a global srandom that
		 * occurs at the start of every sweep.
		 */
		unsigned int idx = random() % scatter_possible_ports_count;
		best_port = scatter_possible_ports[idx];
	}
	return best_port;
}
594
/*
 * Reset every allocated row of the hops matrix to OSM_NO_PATH.
 * Rows that were never allocated are left as they are.
 */
void osm_switch_clear_hops(IN osm_switch_t * p_sw)
{
	unsigned lid;

	for (lid = 0; lid < p_sw->num_hops; lid++) {
		uint8_t *p_row = p_sw->hops[lid];

		if (p_row != NULL)
			memset(p_row, OSM_NO_PATH, p_sw->num_ports);
	}
}
603
/*
 * Make sure the switch's lft table can hold entries for LIDs 0..lids.
 *
 * The table only ever grows; the newly added tail is filled with
 * OSM_NO_PATH and existing entries are kept.  Returns 0 on success and
 * -1 if the required size cannot be represented or the reallocation
 * fails (the old table then stays valid and unchanged).
 */
static int alloc_lft(IN osm_switch_t * p_sw, uint16_t lids)
{
	uint16_t lft_size;

	/* Ensure LFT is in units of LFT block size */
	lft_size = (lids / IB_SMP_DATA_SIZE + 1) * IB_SMP_DATA_SIZE;

	/*
	 * The rounded-up size is always larger than lids.  If it is not,
	 * the 16-bit value wrapped around; fail instead of silently
	 * keeping a table that is too small.
	 */
	if (lft_size <= lids)
		return -1;

	if (lft_size > p_sw->lft_size) {
		uint8_t *new_lft = realloc(p_sw->lft, lft_size);
		if (!new_lft)
			return -1;
		/* only the added tail needs initializing */
		memset(new_lft + p_sw->lft_size, OSM_NO_PATH,
		       lft_size - p_sw->lft_size);
		p_sw->lft = new_lft;
		p_sw->lft_size = lft_size;
	}

	return 0;
}
622
/*
 * Get the switch ready for a fresh routing calculation covering LIDs
 * 0..max_lids:
 *  - grow the lft table if needed,
 *  - re-construct every port profile,
 *  - reset all existing hop rows to OSM_NO_PATH,
 *  - (re)allocate new_lft to the lft size and fill it with OSM_NO_PATH,
 *  - make sure the hops matrix has max_lids + 1 row slots.
 *
 * Returns 0 on success and -1 if any allocation fails.
 */
int osm_switch_prepare_path_rebuild(IN osm_switch_t * p_sw, IN uint16_t max_lids)
{
	uint8_t **p_hops_tbl;
	uint8_t *p_lft_buf;
	unsigned port;

	if (alloc_lft(p_sw, max_lids) != 0)
		return -1;

	for (port = 0; port < p_sw->num_ports; port++)
		osm_port_prof_construct(&p_sw->p_prof[port]);

	osm_switch_clear_hops(p_sw);

	p_lft_buf = realloc(p_sw->new_lft, p_sw->lft_size);
	if (p_lft_buf == NULL)
		return -1;

	memset(p_lft_buf, OSM_NO_PATH, p_sw->lft_size);
	p_sw->new_lft = p_lft_buf;

	if (p_sw->hops == NULL) {
		/* first use: a table of NULL row pointers */
		p_hops_tbl = calloc(max_lids + 1, sizeof(p_hops_tbl[0]));
		if (p_hops_tbl == NULL)
			return -1;
		p_sw->hops = p_hops_tbl;
		p_sw->num_hops = max_lids + 1;
	} else if (max_lids + 1 > p_sw->num_hops) {
		/* grow the table; only the added slots are set to NULL */
		p_hops_tbl = realloc(p_sw->hops,
				     (max_lids + 1) * sizeof(p_hops_tbl[0]));
		if (p_hops_tbl == NULL)
			return -1;
		memset(p_hops_tbl + p_sw->num_hops, 0,
		       (max_lids + 1 - p_sw->num_hops) * sizeof(p_hops_tbl[0]));
		p_sw->hops = p_hops_tbl;
		p_sw->num_hops = max_lids + 1;
	}

	p_sw->max_lid_ho = max_lids;

	return 0;
}
664
/*
 * Return the least number of hops from p_sw to p_port.
 *
 * If the port's node has a switch object, the result is 0 for p_sw
 * itself, otherwise the least hops to that node's base LID.  If not,
 * the lookup goes through the switch on the other end of the port's
 * link and one hop is added for that link (1 if that switch is p_sw).
 * OSM_NO_PATH is returned when the port is not linked to a switch or
 * no path is known.
 */
uint8_t osm_switch_get_port_least_hops(IN const osm_switch_t * p_sw,
				       IN const osm_port_t * p_port)
{
	osm_physp_t *p_physp;
	osm_physp_t *p_remote;
	uint16_t lid;
	uint8_t hops;

	if (p_port->p_node->sw) {
		if (p_port->p_node->sw == p_sw)
			return 0;
		lid = osm_node_get_base_lid(p_port->p_node, 0);
		return osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
	}

	p_physp = p_port->p_physp;
	if (!p_physp)
		return OSM_NO_PATH;

	p_remote = p_physp->p_remote_physp;
	if (!p_remote || !p_remote->p_node->sw)
		return OSM_NO_PATH;

	/* attached directly to this switch: a single hop */
	if (p_remote->p_node->sw == p_sw)
		return 1;

	lid = osm_node_get_base_lid(p_remote->p_node, 0);
	hops = osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
	if (hops == OSM_NO_PATH)
		return OSM_NO_PATH;

	/* one more hop for the link from the remote switch to the port */
	return hops + 1;
}
688
/*
 * Recommend the port of p_sw through which multicast traffic for
 * mlid_ho should be sent towards p_port.
 *
 * p_sw            - switch being routed
 * p_port          - target port
 * mlid_ho         - multicast LID (host order)
 * ignore_existing - TRUE: do not prefer ports that are already in the
 *                   multicast table entry for mlid_ho
 *
 * Returns 0 when the target is this switch itself, the attached port
 * number when the target hangs directly off this switch, OSM_NO_PATH
 * when no path is known, and the selected port number otherwise.
 */
uint8_t osm_switch_recommend_mcast_path(IN osm_switch_t * p_sw,
					IN osm_port_t * p_port,
					IN uint16_t mlid_ho,
					IN boolean_t ignore_existing)
{
	uint16_t base_lid;
	uint8_t hops;
	uint8_t port_num;
	uint8_t num_ports;
	uint8_t least_hops;

	CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);

	if (p_port->p_node->sw) {
		if (p_port->p_node->sw == p_sw)
			return 0;
		base_lid = osm_port_get_base_lid(p_port);
	} else {
		osm_physp_t *p_physp = p_port->p_physp;
		if (!p_physp || !p_physp->p_remote_physp ||
		    !p_physp->p_remote_physp->p_node->sw)
			return OSM_NO_PATH;
		if (p_physp->p_remote_physp->p_node->sw == p_sw)
			return p_physp->p_remote_physp->port_num;
		base_lid =
		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
	}
	base_lid = cl_ntoh16(base_lid);
	num_ports = p_sw->num_ports;

	/*
	   If the user wants us to ignore existing multicast routes,
	   then simply return the shortest hop count path to the
	   target port.

	   Otherwise, return the first port that has a path to the target,
	   picking from the ports that are already in the multicast group.
	 */
	if (!ignore_existing) {
		for (port_num = 1; port_num < num_ports; port_num++) {
			if (!osm_mcast_tbl_is_port
			    (&p_sw->mcast_tbl, mlid_ho, port_num))
				continue;
			/*
			   Don't be too trusting of the current forwarding table!
			   Verify that the LID is reachable through this port.
			 */
			hops =
			    osm_switch_get_hop_count(p_sw, base_lid, port_num);
			if (hops != OSM_NO_PATH)
				return port_num;
		}
	}

	/*
	   Either no existing mcast paths reach this port or we are
	   ignoring existing paths.

	   Determine the best multicast path to the target.  Note that this
	   algorithm is slightly different from the one used for unicast route
	   recommendation.  In this case (multicast), we must NOT
	   perform any sort of load balancing.  We MUST take the FIRST
	   port found that has <= the lowest hop count path.  This prevents
	   more than one multicast path to the same remote switch which
	   prevents a multicast loop.  Multicast loops are bad since the same
	   multicast packet will go around and around, inevitably creating
	   a black hole that will destroy the Earth in a fiery conflagration.
	 */
	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
	if (least_hops == OSM_NO_PATH)
		return OSM_NO_PATH;
	for (port_num = 1; port_num < num_ports; port_num++)
		if (osm_switch_get_hop_count(p_sw, base_lid, port_num) ==
		    least_hops)
			break;

	CL_ASSERT(port_num < num_ports);
	/*
	   If no port carries the least hop count, port_num has run off
	   the end of the port range.  Report "no path" rather than hand
	   an invalid port number to the caller.
	 */
	if (port_num >= num_ports)
		return OSM_NO_PATH;
	return port_num;
}
768