1 /*
2 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 * Copyright (c) 2009-2015 ZIH, TU Dresden, Federal Republic of Germany. All rights reserved.
6 * Copyright (C) 2012-2017 Tokyo Institute of Technology. All rights reserved.
7 *
8 * This software is available to you under a choice of one of two
9 * licenses. You may choose to be licensed under the terms of the GNU
10 * General Public License (GPL) Version 2, available from the file
11 * COPYING in the main directory of this source tree, or the
12 * OpenIB.org BSD license below:
13 *
14 * Redistribution and use in source and binary forms, with or
15 * without modification, are permitted provided that the following
16 * conditions are met:
17 *
18 * - Redistributions of source code must retain the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer.
21 *
22 * - Redistributions in binary form must reproduce the above
23 * copyright notice, this list of conditions and the following
24 * disclaimer in the documentation and/or other materials
25 * provided with the distribution.
26 *
27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34 * SOFTWARE.
35 *
36 */
37
38 /*
39 * Abstract:
40 * Implementation of OpenSM (deadlock-free) single-source-shortest-path routing
41 * (with dijkstra algorithm)
42 */
43
44 #if HAVE_CONFIG_H
45 # include <config.h>
46 #endif /* HAVE_CONFIG_H */
47
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <opensm/osm_file_ids.h>
52 #define FILE_ID OSM_FILE_UCAST_DFSSSP_C
53 #include <opensm/osm_ucast_mgr.h>
54 #include <opensm/osm_opensm.h>
55 #include <opensm/osm_node.h>
56 #include <opensm/osm_multicast.h>
57 #include <opensm/osm_mcast_mgr.h>
58
59 /* "infinity" for dijkstra */
60 #define INF 0x7FFFFFFF
61
62 enum {
63 UNDISCOVERED = 0,
64 DISCOVERED
65 };
66
67 enum {
68 UNKNOWN = 0,
69 GRAY,
70 BLACK,
71 };
72
73 typedef struct link {
74 uint64_t guid; /* guid of the neighbor behind the link */
75 uint32_t from; /* base_index in the adjazenz list (start of the link) */
76 uint8_t from_port; /* port on the base_side (needed for weight update to identify the correct link for multigraphs) */
77 uint32_t to; /* index of the neighbor in the adjazenz list (end of the link) */
78 uint8_t to_port; /* port on the side of the neighbor (needed for the LFT) */
79 uint64_t weight; /* link weight */
80 struct link *next;
81 } link_t;
82
83 typedef struct vertex {
84 /* informations of the fabric */
85 uint64_t guid;
86 uint16_t lid; /* for lft filling */
87 uint32_t num_hca; /* numbers of Hca/LIDs on the switch, for weight calculation */
88 link_t *links;
89 uint8_t hops;
90 /* for dijkstra routing */
91 link_t *used_link; /* link between the vertex discovered before and this vertex */
92 uint64_t distance; /* distance from source to this vertex */
93 uint8_t state;
94 /* for the binary heap */
95 uint32_t heap_id;
96 /* for LFT writing and debug */
97 osm_switch_t *sw; /* selfpointer */
98 boolean_t dropped; /* indicate dropped switches (w/ ucast cache) */
99 } vertex_t;
100
101 typedef struct binary_heap {
102 uint32_t size; /* size of the heap */
103 vertex_t **nodes; /* array with pointers to elements of the adj_list */
104 } binary_heap_t;
105
106 typedef struct vltable {
107 uint64_t num_lids; /* size of the lids array */
108 uint16_t *lids; /* sorted array of all lids in the subnet */
109 uint8_t *vls; /* matrix form assignment lid X lid -> virtual lane */
110 } vltable_t;
111
112 typedef struct cdg_link {
113 struct cdg_node *node;
114 uint32_t num_pairs; /* number of src->dest pairs incremented in path adding step */
115 uint32_t max_len; /* length of the srcdest array */
116 uint32_t removed; /* number of pairs removed in path deletion step */
117 uint32_t *srcdest_pairs;
118 struct cdg_link *next;
119 } cdg_link_t;
120
121 /* struct for a node of a binary tree with additional parent pointer */
122 typedef struct cdg_node {
123 uint64_t channelID; /* unique key consist of src lid + port + dest lid + port */
124 cdg_link_t *linklist; /* edges to adjazent nodes */
125 uint8_t status; /* node status in cycle search to avoid recursive function */
126 uint8_t visited; /* needed to traverse the binary tree */
127 struct cdg_node *pre; /* to save the path in cycle detection algorithm */
128 struct cdg_node *left, *right, *parent;
129 } cdg_node_t;
130
131 typedef struct dfsssp_context {
132 osm_routing_engine_type_t routing_type;
133 osm_ucast_mgr_t *p_mgr;
134 vertex_t *adj_list;
135 uint32_t adj_list_size;
136 vltable_t *srcdest2vl_table;
137 uint8_t *vl_split_count;
138 } dfsssp_context_t;
139
140 /**************** set initial values for structs **********************
141 **********************************************************************/
set_default_link(link_t * link)142 static inline void set_default_link(link_t * link)
143 {
144 link->guid = 0;
145 link->from = 0;
146 link->from_port = 0;
147 link->to = 0;
148 link->to_port = 0;
149 link->weight = 0;
150 link->next = NULL;
151 }
152
set_default_vertex(vertex_t * vertex)153 static inline void set_default_vertex(vertex_t * vertex)
154 {
155 vertex->guid = 0;
156 vertex->lid = 0;
157 vertex->num_hca = 0;
158 vertex->links = NULL;
159 vertex->hops = 0;
160 vertex->used_link = NULL;
161 vertex->distance = 0;
162 vertex->state = UNDISCOVERED;
163 vertex->heap_id = 0;
164 vertex->sw = NULL;
165 vertex->dropped = FALSE;
166 }
167
set_default_cdg_node(cdg_node_t * node)168 static inline void set_default_cdg_node(cdg_node_t * node)
169 {
170 node->channelID = 0;
171 node->linklist = NULL;
172 node->status = UNKNOWN;
173 node->visited = 0;
174 node->pre = NULL;
175 node->left = NULL;
176 node->right = NULL;
177 node->parent = NULL;
178 }
179
180 /**********************************************************************
181 **********************************************************************/
182
183 /************ helper functions for heap in dijkstra *******************
184 **********************************************************************/
185 /* returns true if element 1 is smaller than element 2 */
heap_smaller(binary_heap_t * heap,uint32_t i,uint32_t j)186 static inline uint32_t heap_smaller(binary_heap_t * heap, uint32_t i,
187 uint32_t j)
188 {
189 return (heap->nodes[i]->distance < heap->nodes[j]->distance) ? 1 : 0;
190 }
191
192 /* swap two elements */
heap_exchange(binary_heap_t * heap,uint32_t i,uint32_t j)193 static void heap_exchange(binary_heap_t * heap, uint32_t i, uint32_t j)
194 {
195 uint32_t tmp_heap_id = 0;
196 vertex_t *tmp_node = NULL;
197
198 /* 1. swap the heap_id */
199 tmp_heap_id = heap->nodes[i]->heap_id;
200 heap->nodes[i]->heap_id = heap->nodes[j]->heap_id;
201 heap->nodes[j]->heap_id = tmp_heap_id;
202 /* 2. swap pointers */
203 tmp_node = heap->nodes[i];
204 heap->nodes[i] = heap->nodes[j];
205 heap->nodes[j] = tmp_node;
206 }
207
208 /* changes position of element with parent until children are bigger */
heap_up(binary_heap_t * heap,uint32_t i)209 static uint32_t heap_up(binary_heap_t * heap, uint32_t i)
210 {
211 uint32_t curr = i, father = 0;
212
213 if (curr > 0) {
214 father = (curr - 1) >> 1;
215 while (heap_smaller(heap, curr, father)) {
216 heap_exchange(heap, curr, father);
217 /* try to go up when we arent already root */
218 curr = father;
219 if (curr > 0)
220 father = (curr - 1) >> 1;
221 }
222 }
223
224 return curr;
225 }
226
227 /* changes position of element with children until parent is smaller */
heap_down(binary_heap_t * heap,uint32_t i)228 static uint32_t heap_down(binary_heap_t * heap, uint32_t i)
229 {
230 uint32_t curr = i;
231 uint32_t son1 = 0, son2 = 0, smaller_son = 0;
232 uint32_t exchanged = 0;
233
234 do {
235 son1 = ((curr + 1) << 1) - 1;
236 son2 = (curr + 1) << 1;
237 exchanged = 0;
238
239 /* exchange with smaller son */
240 if (son1 < heap->size && son2 < heap->size) {
241 if (heap_smaller(heap, son1, son2))
242 smaller_son = son1;
243 else
244 smaller_son = son2;
245 } else if (son1 < heap->size) {
246 /* only one son */
247 smaller_son = son1;
248 } else {
249 /* finished */
250 break;
251 }
252
253 /* only exchange when smaller */
254 if (heap_smaller(heap, smaller_son, curr)) {
255 heap_exchange(heap, curr, smaller_son);
256 exchanged = 1;
257 curr = smaller_son;
258 }
259 } while (exchanged);
260
261 return curr;
262 }
263
264 /* reheapify element */
heap_heapify(binary_heap_t * heap,uint32_t i)265 static inline void heap_heapify(binary_heap_t * heap, uint32_t i)
266 {
267 heap_down(heap, heap_up(heap, i));
268 }
269
270 /* creates heap for graph */
heap_create(vertex_t * adj_list,uint32_t adj_list_size,binary_heap_t ** binheap)271 static int heap_create(vertex_t * adj_list, uint32_t adj_list_size,
272 binary_heap_t ** binheap)
273 {
274 binary_heap_t *heap = NULL;
275 uint32_t i = 0;
276
277 /* allocate the memory for the heap object */
278 heap = (binary_heap_t *) malloc(sizeof(binary_heap_t));
279 if (!heap)
280 return 1;
281
282 /* the heap size is equivalent to the size of the adj_list */
283 heap->size = adj_list_size;
284
285 /* allocate the pointer array, fill with the pointers to the elements of the adj_list and set the initial heap_id */
286 heap->nodes = (vertex_t **) malloc(heap->size * sizeof(vertex_t *));
287 if (!heap->nodes) {
288 free(heap);
289 return 1;
290 }
291 for (i = 0; i < heap->size; i++) {
292 heap->nodes[i] = &adj_list[i];
293 heap->nodes[i]->heap_id = i;
294 }
295
296 /* sort elements */
297 for (i = heap->size; i > 0; i--)
298 heap_down(heap, i - 1);
299
300 *binheap = heap;
301 return 0;
302 }
303
304 /* returns current minimum and removes it from heap */
heap_getmin(binary_heap_t * heap)305 static vertex_t *heap_getmin(binary_heap_t * heap)
306 {
307 vertex_t *min = NULL;
308
309 if (heap->size > 0)
310 min = heap->nodes[0];
311
312 if (min == NULL)
313 return min;
314
315 if (heap->size > 0) {
316 if (heap->size > 1) {
317 heap_exchange(heap, 0, heap->size - 1);
318 heap->size--;
319 heap_down(heap, 0);
320 } else {
321 heap->size--;
322 }
323 }
324
325 return min;
326 }
327
328 /* cleanup heap */
heap_free(binary_heap_t * heap)329 static void heap_free(binary_heap_t * heap)
330 {
331 if (heap) {
332 if (heap->nodes) {
333 free(heap->nodes);
334 heap->nodes = NULL;
335 }
336 free(heap);
337 }
338 }
339
340 /**********************************************************************
341 **********************************************************************/
342
343 /************ helper functions to save src/dest X vl combination ******
344 **********************************************************************/
345 /* compare function of two lids for stdlib qsort */
cmp_lids(const void * l1,const void * l2)346 static int cmp_lids(const void *l1, const void *l2)
347 {
348 ib_net16_t lid1 = *((ib_net16_t *) l1), lid2 = *((ib_net16_t *) l2);
349
350 if (lid1 < lid2)
351 return -1;
352 else if (lid1 > lid2)
353 return 1;
354 else
355 return 0;
356 }
357
358 /* use stdlib to sort the lid array */
vltable_sort_lids(vltable_t * vltable)359 static inline void vltable_sort_lids(vltable_t * vltable)
360 {
361 qsort(vltable->lids, vltable->num_lids, sizeof(ib_net16_t), cmp_lids);
362 }
363
364 /* use stdlib to get index of key in lid array;
365 return -1 if lid isn't found in lids array
366 */
vltable_get_lidindex(ib_net16_t * key,vltable_t * vltable)367 static inline int64_t vltable_get_lidindex(ib_net16_t * key, vltable_t * vltable)
368 {
369 ib_net16_t *found_lid = NULL;
370
371 found_lid =
372 (ib_net16_t *) bsearch(key, vltable->lids, vltable->num_lids,
373 sizeof(ib_net16_t), cmp_lids);
374 if (found_lid)
375 return found_lid - vltable->lids;
376 else
377 return -1;
378 }
379
380 /* get virtual lane from src lid X dest lid combination;
381 return -1 for invalid lids
382 */
vltable_get_vl(vltable_t * vltable,ib_net16_t slid,ib_net16_t dlid)383 static int32_t vltable_get_vl(vltable_t * vltable, ib_net16_t slid, ib_net16_t dlid)
384 {
385 int64_t ind1 = vltable_get_lidindex(&slid, vltable);
386 int64_t ind2 = vltable_get_lidindex(&dlid, vltable);
387
388 if (ind1 > -1 && ind2 > -1)
389 return (int32_t) (vltable->
390 vls[ind1 + ind2 * vltable->num_lids]);
391 else
392 return -1;
393 }
394
395 /* set a virtual lane in the matrix */
vltable_insert(vltable_t * vltable,ib_net16_t slid,ib_net16_t dlid,uint8_t vl)396 static inline void vltable_insert(vltable_t * vltable, ib_net16_t slid,
397 ib_net16_t dlid, uint8_t vl)
398 {
399 int64_t ind1 = vltable_get_lidindex(&slid, vltable);
400 int64_t ind2 = vltable_get_lidindex(&dlid, vltable);
401
402 if (ind1 > -1 && ind2 > -1)
403 vltable->vls[ind1 + ind2 * vltable->num_lids] = vl;
404 }
405
406 /* change a number of lanes from lane xy to lane yz */
vltable_change_vl(vltable_t * vltable,uint8_t from,uint8_t to,uint64_t count)407 static void vltable_change_vl(vltable_t * vltable, uint8_t from, uint8_t to,
408 uint64_t count)
409 {
410 uint64_t set = 0, stop = 0;
411 uint64_t ind1 = 0, ind2 = 0;
412
413 for (ind1 = 0; ind1 < vltable->num_lids; ind1++) {
414 for (ind2 = 0; ind2 < vltable->num_lids; ind2++) {
415 if (set == count) {
416 stop = 1;
417 break;
418 }
419 if (ind1 != ind2) {
420 if (vltable->
421 vls[ind1 + ind2 * vltable->num_lids] ==
422 from) {
423 vltable->vls[ind1 +
424 ind2 * vltable->num_lids] =
425 to;
426 set++;
427 }
428 }
429 }
430 if (stop)
431 break;
432 }
433 }
434
vltable_print(osm_ucast_mgr_t * p_mgr,vltable_t * vltable)435 static void vltable_print(osm_ucast_mgr_t * p_mgr, vltable_t * vltable)
436 {
437 uint64_t ind1 = 0, ind2 = 0;
438
439 for (ind1 = 0; ind1 < vltable->num_lids; ind1++) {
440 for (ind2 = 0; ind2 < vltable->num_lids; ind2++) {
441 if (ind1 != ind2) {
442 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
443 " route from src_lid=%" PRIu16
444 " to dest_lid=%" PRIu16 " on vl=%" PRIu8
445 "\n", cl_ntoh16(vltable->lids[ind1]),
446 cl_ntoh16(vltable->lids[ind2]),
447 vltable->vls[ind1 +
448 ind2 * vltable->num_lids]);
449 }
450 }
451 }
452 }
453
vltable_dealloc(vltable_t ** vltable)454 static void vltable_dealloc(vltable_t ** vltable)
455 {
456 if (*vltable) {
457 if ((*vltable)->lids)
458 free((*vltable)->lids);
459 if ((*vltable)->vls)
460 free((*vltable)->vls);
461 free(*vltable);
462 *vltable = NULL;
463 }
464 }
465
vltable_alloc(vltable_t ** vltable,uint64_t size)466 static int vltable_alloc(vltable_t ** vltable, uint64_t size)
467 {
468 /* allocate VL table and indexing array */
469 *vltable = (vltable_t *) malloc(sizeof(vltable_t));
470 if (!(*vltable))
471 goto ERROR;
472 (*vltable)->num_lids = size;
473 (*vltable)->lids = (ib_net16_t *) malloc(size * sizeof(ib_net16_t));
474 if (!((*vltable)->lids))
475 goto ERROR;
476 (*vltable)->vls = (uint8_t *) malloc(size * size * sizeof(uint8_t));
477 if (!((*vltable)->vls))
478 goto ERROR;
479 memset((*vltable)->vls, OSM_DEFAULT_SL, size * size);
480
481 return 0;
482
483 ERROR:
484 vltable_dealloc(vltable);
485
486 return 1;
487 }
488
489 /**********************************************************************
490 **********************************************************************/
491
492 /************ helper functions to save/manage the channel dep. graph **
493 **********************************************************************/
494 /* update the srcdest array;
495 realloc array (double the size) if size is not large enough
496 */
set_next_srcdest_pair(cdg_link_t * link,uint32_t srcdest)497 static void set_next_srcdest_pair(cdg_link_t * link, uint32_t srcdest)
498 {
499 uint32_t new_size = 0, start_size = 2;
500 uint32_t *tmp = NULL, *tmp2 = NULL;
501
502 if (link->num_pairs == 0) {
503 link->srcdest_pairs =
504 (uint32_t *) malloc(start_size * sizeof(uint32_t));
505 link->srcdest_pairs[link->num_pairs] = srcdest;
506 link->max_len = start_size;
507 link->removed = 0;
508 } else if (link->num_pairs == link->max_len) {
509 new_size = link->max_len << 1;
510 tmp = (uint32_t *) malloc(new_size * sizeof(uint32_t));
511 tmp =
512 memcpy(tmp, link->srcdest_pairs,
513 link->max_len * sizeof(uint32_t));
514 tmp2 = link->srcdest_pairs;
515 link->srcdest_pairs = tmp;
516 link->srcdest_pairs[link->num_pairs] = srcdest;
517 free(tmp2);
518 link->max_len = new_size;
519 } else {
520 link->srcdest_pairs[link->num_pairs] = srcdest;
521 }
522 link->num_pairs++;
523 }
524
get_next_srcdest_pair(cdg_link_t * link,uint32_t index)525 static inline uint32_t get_next_srcdest_pair(cdg_link_t * link, uint32_t index)
526 {
527 return link->srcdest_pairs[index];
528 }
529
530 /* traverse binary tree to find a node */
cdg_search(cdg_node_t * root,uint64_t channelID)531 static cdg_node_t *cdg_search(cdg_node_t * root, uint64_t channelID)
532 {
533 while (root) {
534 if (channelID < root->channelID)
535 root = root->left;
536 else if (channelID > root->channelID)
537 root = root->right;
538 else if (channelID == root->channelID)
539 return root;
540 }
541 return NULL;
542 }
543
544 /* insert new node into the binary tree */
cdg_insert(cdg_node_t ** root,cdg_node_t * new_node)545 static void cdg_insert(cdg_node_t ** root, cdg_node_t * new_node)
546 {
547 cdg_node_t *current = *root;
548
549 if (!current) {
550 current = new_node;
551 *root = current;
552 return;
553 }
554
555 while (current) {
556 if (new_node->channelID < current->channelID) {
557 if (current->left) {
558 current = current->left;
559 } else {
560 current->left = new_node;
561 new_node->parent = current;
562 break;
563 }
564 } else if (new_node->channelID > current->channelID) {
565 if (current->right) {
566 current = current->right;
567 } else {
568 current->right = new_node;
569 new_node->parent = current;
570 break;
571 }
572 } else if (new_node->channelID == current->channelID) {
573 /* not really possible, maybe programming error */
574 break;
575 }
576 }
577 }
578
cdg_node_dealloc(cdg_node_t * node)579 static void cdg_node_dealloc(cdg_node_t * node)
580 {
581 cdg_link_t *link = node->linklist, *tmp = NULL;
582
583 /* dealloc linklist */
584 while (link) {
585 tmp = link;
586 link = link->next;
587
588 if (tmp->num_pairs)
589 free(tmp->srcdest_pairs);
590 free(tmp);
591 }
592 /* dealloc node */
593 free(node);
594 }
595
cdg_dealloc(cdg_node_t ** root)596 static void cdg_dealloc(cdg_node_t ** root)
597 {
598 cdg_node_t *current = *root;
599
600 while (current) {
601 if (current->left) {
602 current = current->left;
603 } else if (current->right) {
604 current = current->right;
605 } else {
606 if (current->parent == NULL) {
607 cdg_node_dealloc(current);
608 *root = NULL;
609 break;
610 }
611 if (current->parent->left == current) {
612 current = current->parent;
613 cdg_node_dealloc(current->left);
614 current->left = NULL;
615 } else if (current->parent->right == current) {
616 current = current->parent;
617 cdg_node_dealloc(current->right);
618 current->right = NULL;
619 }
620 }
621 }
622 }
623
624 /* search for a edge in the cdg which should be removed to break a cycle */
get_weakest_link_in_cycle(cdg_node_t * cycle)625 static cdg_link_t *get_weakest_link_in_cycle(cdg_node_t * cycle)
626 {
627 cdg_node_t *current = cycle, *node_with_weakest_link = NULL;
628 cdg_link_t *link = NULL, *weakest_link = NULL;
629
630 link = current->linklist;
631 while (link) {
632 if (link->node->status == GRAY) {
633 weakest_link = link;
634 node_with_weakest_link = current;
635 current = link->node;
636 break;
637 }
638 link = link->next;
639 }
640
641 while (1) {
642 current->status = UNKNOWN;
643 link = current->linklist;
644 while (link) {
645 if (link->node->status == GRAY) {
646 if ((link->num_pairs - link->removed) <
647 (weakest_link->num_pairs -
648 weakest_link->removed)) {
649 weakest_link = link;
650 node_with_weakest_link = current;
651 }
652 current = link->node;
653 break;
654 }
655 link = link->next;
656 }
657 /* if complete cycle is traversed */
658 if (current == cycle) {
659 current->status = UNKNOWN;
660 break;
661 }
662 }
663
664 if (node_with_weakest_link->linklist == weakest_link) {
665 node_with_weakest_link->linklist = weakest_link->next;
666 } else {
667 link = node_with_weakest_link->linklist;
668 while (link) {
669 if (link->next == weakest_link) {
670 link->next = weakest_link->next;
671 break;
672 }
673 link = link->next;
674 }
675 }
676
677 return weakest_link;
678 }
679
680 /* search for nodes in the cdg not yet reached in the cycle search process;
681 (some nodes are unreachable, e.g. a node is a source or the cdg has not connected parts)
682 */
get_next_cdg_node(cdg_node_t * root)683 static cdg_node_t *get_next_cdg_node(cdg_node_t * root)
684 {
685 cdg_node_t *current = root, *res = NULL;
686
687 while (current) {
688 current->visited = 1;
689 if (current->status == UNKNOWN) {
690 res = current;
691 break;
692 }
693 if (current->left && !current->left->visited) {
694 current = current->left;
695 } else if (current->right && !current->right->visited) {
696 current = current->right;
697 } else {
698 if (current->left)
699 current->left->visited = 0;
700 if (current->right)
701 current->right->visited = 0;
702 if (current->parent == NULL)
703 break;
704 else
705 current = current->parent;
706 }
707 }
708
709 /* Clean up */
710 while (current) {
711 current->visited = 0;
712 if (current->left)
713 current->left->visited = 0;
714 if (current->right)
715 current->right->visited = 0;
716 current = current->parent;
717 }
718
719 return res;
720 }
721
722 /* make a DFS on the cdg to check for a cycle */
search_cycle_in_channel_dep_graph(cdg_node_t * cdg,cdg_node_t * start_node)723 static cdg_node_t *search_cycle_in_channel_dep_graph(cdg_node_t * cdg,
724 cdg_node_t * start_node)
725 {
726 cdg_node_t *cycle = NULL;
727 cdg_node_t *current = start_node, *next_node = NULL, *tmp = NULL;
728 cdg_link_t *link = NULL;
729
730 while (current) {
731 current->status = GRAY;
732 link = current->linklist;
733 next_node = NULL;
734 while (link) {
735 if (link->node->status == UNKNOWN) {
736 next_node = link->node;
737 break;
738 }
739 if (link->node->status == GRAY) {
740 cycle = link->node;
741 goto Exit;
742 }
743 link = link->next;
744 }
745 if (next_node) {
746 next_node->pre = current;
747 current = next_node;
748 } else {
749 /* found a sink in the graph, go to last node */
750 current->status = BLACK;
751
752 /* srcdest_pairs of this node aren't relevant, free the allocated memory */
753 link = current->linklist;
754 while (link) {
755 if (link->num_pairs)
756 free(link->srcdest_pairs);
757 link->srcdest_pairs = NULL;
758 link->num_pairs = 0;
759 link->removed = 0;
760 link = link->next;
761 }
762
763 if (current->pre) {
764 tmp = current;
765 current = current->pre;
766 tmp->pre = NULL;
767 } else {
768 /* search for other subgraphs in cdg */
769 current = get_next_cdg_node(cdg);
770 if (!current)
771 break; /* all relevant nodes traversed, no more cycles found */
772 }
773 }
774 }
775
776 Exit:
777 return cycle;
778 }
779
780 /* calculate the path from source to destination port;
781 new channels are added directly to the cdg
782 */
update_channel_dep_graph(cdg_node_t ** cdg_root,osm_port_t * src_port,uint16_t slid,osm_port_t * dest_port,uint16_t dlid)783 static int update_channel_dep_graph(cdg_node_t ** cdg_root,
784 osm_port_t * src_port, uint16_t slid,
785 osm_port_t * dest_port, uint16_t dlid)
786 {
787 osm_node_t *local_node = NULL, *remote_node = NULL;
788 uint16_t local_lid = 0, remote_lid = 0;
789 uint32_t srcdest = 0;
790 uint8_t local_port = 0, remote_port = 0;
791 uint64_t channelID = 0;
792
793 cdg_node_t *channel_head = NULL, *channel = NULL, *last_channel = NULL;
794 cdg_link_t *linklist = NULL;
795
796 /* set the identifier for the src/dest pair to save this on each edge of the cdg */
797 srcdest = (((uint32_t) slid) << 16) + ((uint32_t) dlid);
798
799 channel_head = (cdg_node_t *) malloc(sizeof(cdg_node_t));
800 if (!channel_head)
801 goto ERROR;
802 set_default_cdg_node(channel_head);
803 last_channel = channel_head;
804
805 /* if src is a Hca, then the channel from Hca to switch would be a source in the graph
806 sources can't be part of a cycle -> skip this channel
807 */
808 remote_node =
809 osm_node_get_remote_node(src_port->p_node,
810 src_port->p_physp->port_num, &remote_port);
811
812 while (remote_node && remote_node->sw) {
813 local_node = remote_node;
814 local_port = local_node->sw->new_lft[dlid];
815 /* sanity check: local_port must be set or routing is broken */
816 if (local_port == OSM_NO_PATH)
817 goto ERROR;
818 local_lid = cl_ntoh16(osm_node_get_base_lid(local_node, 0));
819 /* each port belonging to a switch has lmc==0 -> get_base_lid is fine
820 (local/remote port in this function are always part of a switch)
821 */
822
823 remote_node =
824 osm_node_get_remote_node(local_node, local_port,
825 &remote_port);
826 /* if remote_node is a Hca, then the last channel from switch to Hca would be a sink in the cdg -> skip */
827 if (!remote_node || !remote_node->sw)
828 break;
829 remote_lid = cl_ntoh16(osm_node_get_base_lid(remote_node, 0));
830
831 channelID =
832 (((uint64_t) local_lid) << 48) +
833 (((uint64_t) local_port) << 32) +
834 (((uint64_t) remote_lid) << 16) + ((uint64_t) remote_port);
835 channel = cdg_search(*cdg_root, channelID);
836 if (channel) {
837 /* check whether last channel has connection to this channel, i.e. subpath already exists in cdg */
838 linklist = last_channel->linklist;
839 while (linklist && linklist->node != channel
840 && linklist->next)
841 linklist = linklist->next;
842 /* if there is no connection, add one */
843 if (linklist) {
844 if (linklist->node == channel) {
845 set_next_srcdest_pair(linklist,
846 srcdest);
847 } else {
848 linklist->next =
849 (cdg_link_t *)
850 malloc(sizeof(cdg_link_t));
851 if (!linklist->next)
852 goto ERROR;
853 linklist = linklist->next;
854 linklist->node = channel;
855 linklist->num_pairs = 0;
856 linklist->srcdest_pairs = NULL;
857 set_next_srcdest_pair(linklist,
858 srcdest);
859 linklist->next = NULL;
860 }
861 } else {
862 /* either this is the first channel of the path, or the last channel was a new channel, or last channel was a sink */
863 last_channel->linklist =
864 (cdg_link_t *) malloc(sizeof(cdg_link_t));
865 if (!last_channel->linklist)
866 goto ERROR;
867 last_channel->linklist->node = channel;
868 last_channel->linklist->num_pairs = 0;
869 last_channel->linklist->srcdest_pairs = NULL;
870 set_next_srcdest_pair(last_channel->linklist,
871 srcdest);
872 last_channel->linklist->next = NULL;
873 }
874 } else {
875 /* create new channel */
876 channel = (cdg_node_t *) malloc(sizeof(cdg_node_t));
877 if (!channel)
878 goto ERROR;
879 set_default_cdg_node(channel);
880 channel->channelID = channelID;
881 cdg_insert(cdg_root, channel);
882
883 /* go to end of link list of last channel */
884 linklist = last_channel->linklist;
885 while (linklist && linklist->next)
886 linklist = linklist->next;
887 if (linklist) {
888 /* update last link of an existing channel */
889 linklist->next =
890 (cdg_link_t *) malloc(sizeof(cdg_link_t));
891 if (!linklist->next)
892 goto ERROR;
893 linklist = linklist->next;
894 linklist->node = channel;
895 linklist->num_pairs = 0;
896 linklist->srcdest_pairs = NULL;
897 set_next_srcdest_pair(linklist, srcdest);
898 linklist->next = NULL;
899 } else {
900 /* either this is the first channel of the path, or the last channel was a new channel, or last channel was a sink */
901 last_channel->linklist =
902 (cdg_link_t *) malloc(sizeof(cdg_link_t));
903 if (!last_channel->linklist)
904 goto ERROR;
905 last_channel->linklist->node = channel;
906 last_channel->linklist->num_pairs = 0;
907 last_channel->linklist->srcdest_pairs = NULL;
908 set_next_srcdest_pair(last_channel->linklist,
909 srcdest);
910 last_channel->linklist->next = NULL;
911 }
912 }
913 last_channel = channel;
914 }
915
916 if (channel_head->linklist) {
917 if (channel_head->linklist->srcdest_pairs)
918 free(channel_head->linklist->srcdest_pairs);
919 free(channel_head->linklist);
920 }
921 free(channel_head);
922
923 return 0;
924
925 ERROR:
926 /* cleanup data and exit */
927 if (channel_head) {
928 if (channel_head->linklist)
929 free(channel_head->linklist);
930 free(channel_head);
931 }
932
933 return 1;
934 }
935
936 /* calculate the path from source to destination port;
937 the links in the cdg representing this path are decremented to simulate the removal
938 */
remove_path_from_cdg(cdg_node_t ** cdg_root,osm_port_t * src_port,uint16_t slid,osm_port_t * dest_port,uint16_t dlid)939 static int remove_path_from_cdg(cdg_node_t ** cdg_root, osm_port_t * src_port,
940 uint16_t slid, osm_port_t * dest_port,
941 uint16_t dlid)
942 {
943 osm_node_t *local_node = NULL, *remote_node = NULL;
944 uint16_t local_lid = 0, remote_lid = 0;
945 uint8_t local_port = 0, remote_port = 0;
946 uint64_t channelID = 0;
947
948 cdg_node_t *channel_head = NULL, *channel = NULL, *last_channel = NULL;
949 cdg_link_t *linklist = NULL;
950
951 channel_head = (cdg_node_t *) malloc(sizeof(cdg_node_t));
952 if (!channel_head)
953 goto ERROR;
954 set_default_cdg_node(channel_head);
955 last_channel = channel_head;
956
957 /* if src is a Hca, then the channel from Hca to switch would be a source in the graph
958 sources can't be part of a cycle -> skip this channel
959 */
960 remote_node =
961 osm_node_get_remote_node(src_port->p_node,
962 src_port->p_physp->port_num, &remote_port);
963
964 while (remote_node && remote_node->sw) {
965 local_node = remote_node;
966 local_port = local_node->sw->new_lft[dlid];
967 /* sanity check: local_port must be set or routing is broken */
968 if (local_port == OSM_NO_PATH)
969 goto ERROR;
970 local_lid = cl_ntoh16(osm_node_get_base_lid(local_node, 0));
971
972 remote_node =
973 osm_node_get_remote_node(local_node, local_port,
974 &remote_port);
975 /* if remote_node is a Hca, then the last channel from switch to Hca would be a sink in the cdg -> skip */
976 if (!remote_node || !remote_node->sw)
977 break;
978 remote_lid = cl_ntoh16(osm_node_get_base_lid(remote_node, 0));
979
980 channelID =
981 (((uint64_t) local_lid) << 48) +
982 (((uint64_t) local_port) << 32) +
983 (((uint64_t) remote_lid) << 16) + ((uint64_t) remote_port);
984 channel = cdg_search(*cdg_root, channelID);
985 if (channel) {
986 /* check whether last channel has connection to this channel, i.e. subpath already exists in cdg */
987 linklist = last_channel->linklist;
988 while (linklist && linklist->node != channel
989 && linklist->next)
990 linklist = linklist->next;
991 /* remove the srcdest from the link */
992 if (linklist) {
993 if (linklist->node == channel) {
994 linklist->removed++;
995 } else {
996 /* may happen if the link is missing (thru cycle detect algorithm) */
997 }
998 } else {
999 /* may happen if the link is missing (thru cycle detect algorithm or last_channel==channel_head (dummy channel)) */
1000 }
1001 } else {
1002 /* must be an error, channels for the path are added before, so a missing channel would be a corrupt data structure */
1003 goto ERROR;
1004 }
1005 last_channel = channel;
1006 }
1007
1008 if (channel_head->linklist)
1009 free(channel_head->linklist);
1010 free(channel_head);
1011
1012 return 0;
1013
1014 ERROR:
1015 /* cleanup data and exit */
1016 if (channel_head) {
1017 if (channel_head->linklist)
1018 free(channel_head->linklist);
1019 free(channel_head);
1020 }
1021
1022 return 1;
1023 }
1024
1025 /**********************************************************************
1026 **********************************************************************/
1027
1028 /************ helper functions to generate an ordered list of ports ***
1029 ************ (functions copied from osm_ucast_mgr.c and modified) ****
1030 **********************************************************************/
add_sw_endports_to_order_list(osm_switch_t * sw,osm_ucast_mgr_t * m,cl_qmap_t * guid_tbl,boolean_t add_guids)1031 static void add_sw_endports_to_order_list(osm_switch_t * sw,
1032 osm_ucast_mgr_t * m,
1033 cl_qmap_t * guid_tbl,
1034 boolean_t add_guids)
1035 {
1036 osm_port_t *port;
1037 ib_net64_t port_guid;
1038 uint64_t sw_guid;
1039 osm_physp_t *p;
1040 int i;
1041 boolean_t found;
1042
1043 for (i = 1; i < sw->num_ports; i++) {
1044 p = osm_node_get_physp_ptr(sw->p_node, i);
1045 if (p && p->p_remote_physp && !p->p_remote_physp->p_node->sw) {
1046 port_guid = p->p_remote_physp->port_guid;
1047 /* check if link is healthy, otherwise ignore CA */
1048 if (!osm_link_is_healthy(p)) {
1049 sw_guid =
1050 cl_ntoh64(osm_node_get_node_guid
1051 (sw->p_node));
1052 OSM_LOG(m->p_log, OSM_LOG_INFO,
1053 "WRN AD40: ignoring CA due to unhealthy"
1054 " link from switch 0x%016" PRIx64
1055 " port %" PRIu8 " to CA 0x%016" PRIx64
1056 "\n", sw_guid, i, cl_ntoh64(port_guid));
1057 }
1058 port = osm_get_port_by_guid(m->p_subn, port_guid);
1059 if (!port)
1060 continue;
1061 if (!cl_is_qmap_empty(guid_tbl)) {
1062 found = (cl_qmap_get(guid_tbl, port_guid)
1063 != cl_qmap_end(guid_tbl));
1064 if ((add_guids && !found)
1065 || (!add_guids && found))
1066 continue;
1067 }
1068 if (!cl_is_item_in_qlist(&m->port_order_list,
1069 &port->list_item))
1070 cl_qlist_insert_tail(&m->port_order_list,
1071 &port->list_item);
1072 else
1073 OSM_LOG(m->p_log, OSM_LOG_INFO,
1074 "WRN AD37: guid 0x%016" PRIx64
1075 " already in list\n", port_guid);
1076 }
1077 }
1078 }
1079
add_guid_to_order_list(uint64_t guid,osm_ucast_mgr_t * m)1080 static void add_guid_to_order_list(uint64_t guid, osm_ucast_mgr_t * m)
1081 {
1082 osm_port_t *port = osm_get_port_by_guid(m->p_subn, cl_hton64(guid));
1083
1084 if (!port) {
1085 OSM_LOG(m->p_log, OSM_LOG_DEBUG,
1086 "port guid not found: 0x%016" PRIx64 "\n", guid);
1087 }
1088
1089 if (!cl_is_item_in_qlist(&m->port_order_list, &port->list_item))
1090 cl_qlist_insert_tail(&m->port_order_list, &port->list_item);
1091 else
1092 OSM_LOG(m->p_log, OSM_LOG_INFO,
1093 "WRN AD38: guid 0x%016" PRIx64 " already in list\n",
1094 guid);
1095 }
1096
1097 /* compare function of #Hca attached to a switch for stdlib qsort */
cmp_num_hca(const void * l1,const void * l2)1098 static int cmp_num_hca(const void * l1, const void * l2)
1099 {
1100 vertex_t *sw1 = *((vertex_t **) l1);
1101 vertex_t *sw2 = *((vertex_t **) l2);
1102 uint32_t num_hca1 = 0, num_hca2 = 0;
1103
1104 if (sw1)
1105 num_hca1 = sw1->num_hca;
1106 if (sw2)
1107 num_hca2 = sw2->num_hca;
1108
1109 if (num_hca1 > num_hca2)
1110 return -1;
1111 else if (num_hca1 < num_hca2)
1112 return 1;
1113 else
1114 return 0;
1115 }
1116
1117 /* use stdlib to sort the switch array depending on num_hca */
sw_list_sort_by_num_hca(vertex_t ** sw_list,uint32_t sw_list_size)1118 static inline void sw_list_sort_by_num_hca(vertex_t ** sw_list,
1119 uint32_t sw_list_size)
1120 {
1121 qsort(sw_list, sw_list_size, sizeof(vertex_t *), cmp_num_hca);
1122 }
1123
1124 /**********************************************************************
1125 **********************************************************************/
1126
1127 /************ helper functions to manage a map of CN and I/O guids ****
1128 **********************************************************************/
add_guid_to_map(void * cxt,uint64_t guid,char * p)1129 static int add_guid_to_map(void * cxt, uint64_t guid, char * p)
1130 {
1131 cl_qmap_t *map = cxt;
1132 name_map_item_t *item;
1133 name_map_item_t *inserted_item;
1134
1135 item = malloc(sizeof(*item));
1136 if (!item)
1137 return -1;
1138
1139 item->guid = cl_hton64(guid); /* internal: network byte order */
1140 item->name = NULL; /* name isn't needed */
1141 inserted_item = (name_map_item_t *) cl_qmap_insert(map, item->guid, &item->item);
1142 if (inserted_item != item)
1143 free(item);
1144
1145 return 0;
1146 }
1147
destroy_guid_map(cl_qmap_t * guid_tbl)1148 static void destroy_guid_map(cl_qmap_t * guid_tbl)
1149 {
1150 name_map_item_t *p_guid = NULL, *p_next_guid = NULL;
1151
1152 p_next_guid = (name_map_item_t *) cl_qmap_head(guid_tbl);
1153 while (p_next_guid != (name_map_item_t *) cl_qmap_end(guid_tbl)) {
1154 p_guid = p_next_guid;
1155 p_next_guid = (name_map_item_t *) cl_qmap_next(&p_guid->item);
1156 free(p_guid);
1157 }
1158 cl_qmap_remove_all(guid_tbl);
1159 }
1160
1161 /**********************************************************************
1162 **********************************************************************/
1163
dfsssp_print_graph(osm_ucast_mgr_t * p_mgr,vertex_t * adj_list,uint32_t size)1164 static void dfsssp_print_graph(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1165 uint32_t size)
1166 {
1167 uint32_t i = 0, c = 0;
1168 link_t *link = NULL;
1169
1170 /* index 0 is for the source in dijkstra -> ignore */
1171 for (i = 1; i < size; i++) {
1172 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, "adj_list[%" PRIu32 "]:\n",
1173 i);
1174 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1175 " guid = 0x%" PRIx64 " lid = %" PRIu16 " (%s)\n",
1176 adj_list[i].guid, adj_list[i].lid,
1177 adj_list[i].sw->p_node->print_desc);
1178 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1179 " num_hca = %" PRIu32 "\n", adj_list[i].num_hca);
1180
1181 c = 1;
1182 for (link = adj_list[i].links; link != NULL;
1183 link = link->next, c++) {
1184 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1185 " link[%" PRIu32 "]:\n", c);
1186 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1187 " to guid = 0x%" PRIx64 " (%s) port %"
1188 PRIu8 "\n", link->guid,
1189 adj_list[link->to].sw->p_node->print_desc,
1190 link->to_port);
1191 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1192 " weight on this link = %" PRIu64 "\n",
1193 link->weight);
1194 }
1195 }
1196 }
1197
1198 /* predefine, to use this in next function */
1199 static void dfsssp_context_destroy(void *context);
1200 static int dijkstra(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1201 uint32_t adj_list_size, osm_port_t * port, uint16_t lid);
1202
1203 /* traverse subnet to gather information about the connected switches */
dfsssp_build_graph(void * context)1204 static int dfsssp_build_graph(void *context)
1205 {
1206 dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
1207 osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) (dfsssp_ctx->p_mgr);
1208
1209 cl_qmap_t *port_tbl = &p_mgr->p_subn->port_guid_tbl; /* 1 management port per switch + 1 or 2 ports for each Hca */
1210 osm_port_t *p_port = NULL;
1211 cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
1212 cl_map_item_t *item = NULL;
1213 osm_switch_t *sw = NULL;
1214 osm_node_t *remote_node = NULL;
1215 uint8_t port = 0, remote_port = 0;
1216 uint32_t i = 0, j = 0, err = 0, undiscov = 0, max_num_undiscov = 0;
1217 uint64_t total_num_hca = 0;
1218 vertex_t *adj_list = NULL;
1219 osm_physp_t *p_physp = NULL;
1220 link_t *link = NULL, *head = NULL;
1221 uint32_t num_sw = 0, adj_list_size = 0;
1222 uint8_t lmc = 0;
1223 uint16_t sm_lid = 0;
1224
1225 OSM_LOG_ENTER(p_mgr->p_log);
1226 OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
1227 "Building graph for df-/sssp routing\n");
1228
1229 /* if this pointer isn't NULL, this is a reroute step;
1230 old context will be destroyed (adj_list and srcdest2vl_table)
1231 */
1232 if (dfsssp_ctx->adj_list)
1233 dfsssp_context_destroy(context);
1234
1235 num_sw = cl_qmap_count(sw_tbl);
1236 adj_list_size = num_sw + 1;
1237 /* allocate an adjazenz list (array), 0. element is reserved for the source (Hca) in the routing algo, others are switches */
1238 adj_list = (vertex_t *) malloc(adj_list_size * sizeof(vertex_t));
1239 if (!adj_list) {
1240 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1241 "ERR AD02: cannot allocate memory for adj_list\n");
1242 goto ERROR;
1243 }
1244 for (i = 0; i < adj_list_size; i++)
1245 set_default_vertex(&adj_list[i]);
1246
1247 dfsssp_ctx->adj_list = adj_list;
1248 dfsssp_ctx->adj_list_size = adj_list_size;
1249
1250 /* count the total number of Hca / LIDs (for lmc>0) in the fabric;
1251 even include base/enhanced switch port 0; base SP0 will have lmc=0
1252 */
1253 for (item = cl_qmap_head(port_tbl); item != cl_qmap_end(port_tbl);
1254 item = cl_qmap_next(item)) {
1255 p_port = (osm_port_t *) item;
1256 if (osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_CA ||
1257 osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_SWITCH) {
1258 lmc = osm_port_get_lmc(p_port);
1259 total_num_hca += (1 << lmc);
1260 }
1261 }
1262
1263 i = 1; /* fill adj_list -> start with index 1 */
1264 for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
1265 item = cl_qmap_next(item), i++) {
1266 sw = (osm_switch_t *) item;
1267 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1268 "Processing switch with GUID 0x%" PRIx64 "\n",
1269 cl_ntoh64(osm_node_get_node_guid(sw->p_node)));
1270
1271 adj_list[i].guid =
1272 cl_ntoh64(osm_node_get_node_guid(sw->p_node));
1273 adj_list[i].lid =
1274 cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
1275 adj_list[i].sw = sw;
1276
1277 link = (link_t *) malloc(sizeof(link_t));
1278 if (!link) {
1279 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1280 "ERR AD03: cannot allocate memory for a link\n");
1281 goto ERROR;
1282 }
1283 head = link;
1284 head->next = NULL;
1285
1286 /* add SP0 to number of CA connected to a switch */
1287 lmc = osm_node_get_lmc(sw->p_node, 0);
1288 adj_list[i].num_hca += (1 << lmc);
1289
1290 /* iterate over all ports in the switch, start with port 1 (port 0 is a management port) */
1291 for (port = 1; port < sw->num_ports; port++) {
1292 /* get the node behind the port */
1293 remote_node =
1294 osm_node_get_remote_node(sw->p_node, port,
1295 &remote_port);
1296 /* if there is no remote node on this port or it's the same switch -> try next port */
1297 if (!remote_node || remote_node->sw == sw)
1298 continue;
1299 /* make sure the link is healthy */
1300 p_physp = osm_node_get_physp_ptr(sw->p_node, port);
1301 if (!p_physp || !osm_link_is_healthy(p_physp))
1302 continue;
1303 /* if there is a Hca connected -> count and cycle */
1304 if (!remote_node->sw) {
1305 lmc = osm_node_get_lmc(remote_node, (uint32_t)remote_port);
1306 adj_list[i].num_hca += (1 << lmc);
1307 continue;
1308 }
1309 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1310 "Node 0x%" PRIx64 ", remote node 0x%" PRIx64
1311 ", port %" PRIu8 ", remote port %" PRIu8 "\n",
1312 cl_ntoh64(osm_node_get_node_guid(sw->p_node)),
1313 cl_ntoh64(osm_node_get_node_guid(remote_node)),
1314 port, remote_port);
1315
1316 link->next = (link_t *) malloc(sizeof(link_t));
1317 if (!link->next) {
1318 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1319 "ERR AD08: cannot allocate memory for a link\n");
1320 while (head) {
1321 link = head;
1322 head = head->next;
1323 free(link);
1324 }
1325 goto ERROR;
1326 }
1327 link = link->next;
1328 set_default_link(link);
1329 link->guid =
1330 cl_ntoh64(osm_node_get_node_guid(remote_node));
1331 link->from = i;
1332 link->from_port = port;
1333 link->to_port = remote_port;
1334 link->weight = total_num_hca * total_num_hca; /* initialize with P^2 to force shortest paths */
1335 }
1336
1337 adj_list[i].links = head->next;
1338 free(head);
1339 }
1340 /* connect the links with it's second adjacent node in the list */
1341 for (i = 1; i < adj_list_size; i++) {
1342 link = adj_list[i].links;
1343 while (link) {
1344 for (j = 1; j < adj_list_size; j++) {
1345 if (link->guid == adj_list[j].guid) {
1346 link->to = j;
1347 break;
1348 }
1349 }
1350 link = link->next;
1351 }
1352 }
1353 /* do one dry run to determine connectivity issues */
1354 sm_lid = p_mgr->p_subn->master_sm_base_lid;
1355 p_port = osm_get_port_by_lid(p_mgr->p_subn, sm_lid);
1356 err = dijkstra(p_mgr, adj_list, adj_list_size, p_port, sm_lid);
1357 if (err) {
1358 goto ERROR;
1359 } else {
1360 /* if sm is running on a switch, then dijkstra doesn't
1361 initialize the used_link for this switch
1362 */
1363 if (osm_node_get_type(p_port->p_node) != IB_NODE_TYPE_CA)
1364 max_num_undiscov = 1;
1365 for (i = 1; i < adj_list_size; i++)
1366 undiscov += (adj_list[i].used_link) ? 0 : 1;
1367 if (max_num_undiscov < undiscov) {
1368 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1369 "ERR AD0C: unsupported network state (detached"
1370 " and inaccessible switches found; gracefully"
1371 " shutdown this routing engine)\n");
1372 goto ERROR;
1373 }
1374 }
1375 /* print the discovered graph */
1376 if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
1377 dfsssp_print_graph(p_mgr, adj_list, adj_list_size);
1378
1379 OSM_LOG_EXIT(p_mgr->p_log);
1380 return 0;
1381
1382 ERROR:
1383 dfsssp_context_destroy(context);
1384 return -1;
1385 }
1386
print_routes(osm_ucast_mgr_t * p_mgr,vertex_t * adj_list,uint32_t adj_list_size,osm_port_t * port)1387 static void print_routes(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1388 uint32_t adj_list_size, osm_port_t * port)
1389 {
1390 uint32_t i = 0, j = 0;
1391
1392 for (i = 1; i < adj_list_size; i++) {
1393 if (adj_list[i].state == DISCOVERED) {
1394 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1395 "Route from 0x%" PRIx64 " (%s) to 0x%" PRIx64
1396 " (%s):\n", adj_list[i].guid,
1397 adj_list[i].sw->p_node->print_desc,
1398 cl_ntoh64(osm_node_get_node_guid(port->p_node)),
1399 port->p_node->print_desc);
1400 j = i;
1401 while (adj_list[j].used_link) {
1402 if (j > 0) {
1403 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1404 " 0x%" PRIx64
1405 " (%s) routes thru port %" PRIu8
1406 "\n", adj_list[j].guid,
1407 adj_list[j].sw->p_node->
1408 print_desc,
1409 adj_list[j].used_link->to_port);
1410 } else {
1411 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1412 " 0x%" PRIx64
1413 " (%s) routes thru port %" PRIu8
1414 "\n", adj_list[j].guid,
1415 port->p_node->print_desc,
1416 adj_list[j].used_link->to_port);
1417 }
1418 j = adj_list[j].used_link->from;
1419 }
1420 }
1421 }
1422 }
1423
1424 /* dijkstra step from one source to all switches in the df-/sssp graph */
dijkstra(osm_ucast_mgr_t * p_mgr,vertex_t * adj_list,uint32_t adj_list_size,osm_port_t * port,uint16_t lid)1425 static int dijkstra(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1426 uint32_t adj_list_size, osm_port_t * port, uint16_t lid)
1427 {
1428 uint32_t i = 0, j = 0, index = 0;
1429 osm_node_t *remote_node = NULL;
1430 uint8_t remote_port = 0;
1431 vertex_t *current = NULL;
1432 link_t *link = NULL;
1433 uint64_t guid = 0;
1434 binary_heap_t *heap = NULL;
1435 int err = 0;
1436
1437 OSM_LOG_ENTER(p_mgr->p_log);
1438
1439 /* reset all switches for new round with a new source for dijkstra */
1440 for (i = 1; i < adj_list_size; i++) {
1441 adj_list[i].hops = 0;
1442 adj_list[i].used_link = NULL;
1443 adj_list[i].distance = INF;
1444 adj_list[i].state = UNDISCOVERED;
1445 }
1446
1447 /* if behind port is a Hca -> set adj_list[0] */
1448 if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA) {
1449 /* save old link to prevent many mallocs after set_default_... */
1450 link = adj_list[0].links;
1451 /* initialize adj_list[0] (the source for the routing, a Hca) */
1452 set_default_vertex(&adj_list[0]);
1453 adj_list[0].guid =
1454 cl_ntoh64(osm_node_get_node_guid(port->p_node));
1455 adj_list[0].lid = lid;
1456 index = 0;
1457 /* write saved link back to new adj_list[0] */
1458 adj_list[0].links = link;
1459
1460 /* initialize link to neighbor for adj_list[0];
1461 make sure the link is healthy
1462 */
1463 if (port->p_physp && osm_link_is_healthy(port->p_physp)) {
1464 remote_node =
1465 osm_node_get_remote_node(port->p_node,
1466 port->p_physp->port_num,
1467 &remote_port);
1468 /* if there is no remote node on this port or it's the same Hca -> ignore */
1469 if (remote_node
1470 && (osm_node_get_type(remote_node) ==
1471 IB_NODE_TYPE_SWITCH)) {
1472 if (!(adj_list[0].links)) {
1473 adj_list[0].links =
1474 (link_t *) malloc(sizeof(link_t));
1475 if (!(adj_list[0].links)) {
1476 OSM_LOG(p_mgr->p_log,
1477 OSM_LOG_ERROR,
1478 "ERR AD07: cannot allocate memory for a link\n");
1479 return 1;
1480 }
1481 }
1482 set_default_link(adj_list[0].links);
1483 adj_list[0].links->guid =
1484 cl_ntoh64(osm_node_get_node_guid
1485 (remote_node));
1486 adj_list[0].links->from_port =
1487 port->p_physp->port_num;
1488 adj_list[0].links->to_port = remote_port;
1489 adj_list[0].links->weight = 1;
1490 for (j = 1; j < adj_list_size; j++) {
1491 if (adj_list[0].links->guid ==
1492 adj_list[j].guid) {
1493 adj_list[0].links->to = j;
1494 break;
1495 }
1496 }
1497 }
1498 } else {
1499 /* if link is unhealthy then there's a severe issue */
1500 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1501 "ERR AD0B: unsupported network state (CA with"
1502 " unhealthy link state discovered; should have"
1503 " been filtered out before already; gracefully"
1504 " shutdown this routing engine)\n");
1505 return 1;
1506 }
1507 /* if behind port is a switch -> search switch in adj_list */
1508 } else {
1509 /* reset adj_list[0], if links=NULL reset was done before, then skip */
1510 if (adj_list[0].links) {
1511 free(adj_list[0].links);
1512 set_default_vertex(&adj_list[0]);
1513 }
1514 /* search for the switch which is the source in this round */
1515 guid = cl_ntoh64(osm_node_get_node_guid(port->p_node));
1516 for (i = 1; i < adj_list_size; i++) {
1517 if (guid == adj_list[i].guid) {
1518 index = i;
1519 break;
1520 }
1521 }
1522 }
1523
1524 /* source in dijkstra */
1525 adj_list[index].distance = 0;
1526 adj_list[index].state = DISCOVERED;
1527 adj_list[index].hops = 0; /* the source has hop count = 0 */
1528
1529 /* create a heap to find (efficient) the node with the smallest distance */
1530 if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA)
1531 err = heap_create(adj_list, adj_list_size, &heap);
1532 else
1533 err = heap_create(&adj_list[1], adj_list_size - 1, &heap);
1534 if (err) {
1535 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1536 "ERR AD09: cannot allocate memory for heap or heap->node in heap_create(...)\n");
1537 return err;
1538 }
1539
1540 current = heap_getmin(heap);
1541 while (current) {
1542 current->state = DISCOVERED;
1543 if (current->used_link) /* increment the number of hops to the source for each new node */
1544 current->hops =
1545 adj_list[current->used_link->from].hops + 1;
1546
1547 /* add/update nodes which aren't discovered but accessible */
1548 for (link = current->links; link != NULL; link = link->next) {
1549 if ((adj_list[link->to].state != DISCOVERED)
1550 && (current->distance + link->weight <
1551 adj_list[link->to].distance)) {
1552 adj_list[link->to].used_link = link;
1553 adj_list[link->to].distance =
1554 current->distance + link->weight;
1555 heap_heapify(heap, adj_list[link->to].heap_id);
1556 }
1557 }
1558
1559 current = heap_getmin(heap);
1560 }
1561
1562 /* destroy the heap */
1563 heap_free(heap);
1564 heap = NULL;
1565
1566 OSM_LOG_EXIT(p_mgr->p_log);
1567 return 0;
1568 }
1569
1570 /* update the linear forwarding tables of all switches with the informations
1571 from the last dijsktra step
1572 */
update_lft(osm_ucast_mgr_t * p_mgr,vertex_t * adj_list,uint32_t adj_list_size,osm_port_t * p_port,uint16_t lid)1573 static int update_lft(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1574 uint32_t adj_list_size, osm_port_t * p_port, uint16_t lid)
1575 {
1576 uint32_t i = 0;
1577 uint8_t port = 0;
1578 uint8_t hops = 0;
1579 osm_switch_t *p_sw = NULL;
1580 boolean_t is_ignored_by_port_prof = FALSE;
1581 osm_physp_t *p = NULL;
1582 cl_status_t ret;
1583
1584 OSM_LOG_ENTER(p_mgr->p_log);
1585
1586 for (i = 1; i < adj_list_size; i++) {
1587 /* if no route goes thru this switch -> cycle */
1588 if (!(adj_list[i].used_link))
1589 continue;
1590
1591 p_sw = adj_list[i].sw;
1592 hops = adj_list[i].hops;
1593 port = adj_list[i].used_link->to_port;
1594 /* the used_link is the link that was used in dijkstra to reach this node,
1595 so the to_port is the local port on this node
1596 */
1597
1598 if (port == OSM_NO_PATH) { /* if clause shouldn't be possible in this routing, but who cares */
1599 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1600 "ERR AD06: No path to get to LID %" PRIu16
1601 " from switch 0x%" PRIx64 "\n", lid,
1602 cl_ntoh64(osm_node_get_node_guid
1603 (p_sw->p_node)));
1604
1605 /* do not try to overwrite the ppro of non existing port ... */
1606 is_ignored_by_port_prof = TRUE;
1607 return 1;
1608 } else {
1609 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1610 "Routing LID %" PRIu16 " to port %" PRIu8
1611 " for switch 0x%" PRIx64 "\n", lid, port,
1612 cl_ntoh64(osm_node_get_node_guid
1613 (p_sw->p_node)));
1614
1615 p = osm_node_get_physp_ptr(p_sw->p_node, port);
1616 if (!p) {
1617 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1618 "ERR AD0A: Physical port %d of Node GUID 0x%"
1619 PRIx64 "not found\n", port,
1620 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
1621 return 1;
1622 }
1623
1624 /* we would like to optionally ignore this port in equalization
1625 as in the case of the Mellanox Anafa Internal PCI TCA port
1626 */
1627 is_ignored_by_port_prof = p->is_prof_ignored;
1628
1629 /* We also would ignore this route if the target lid is of
1630 a switch and the port_profile_switch_node is not TRUE
1631 */
1632 if (!p_mgr->p_subn->opt.port_profile_switch_nodes)
1633 is_ignored_by_port_prof |=
1634 (osm_node_get_type(p_port->p_node) ==
1635 IB_NODE_TYPE_SWITCH);
1636 }
1637
1638 /* to support lmc > 0 the functions alloc_ports_priv, free_ports_priv, find_and_add_remote_sys
1639 from minhop aren't needed cause osm_switch_recommend_path is implicitly calculated
1640 for each LID pair thru dijkstra;
1641 for each port the dijkstra algorithm calculates (max_lid_ho - min_lid_ho)-times maybe
1642 disjoint routes to spread the bandwidth -> diffent routes for one port and lmc>0
1643 */
1644
1645 /* set port in LFT */
1646 p_sw->new_lft[lid] = port;
1647 if (!is_ignored_by_port_prof) {
1648 /* update the number of path routing thru this port */
1649 osm_switch_count_path(p_sw, port);
1650 }
1651 /* set the hop count from this switch to the lid */
1652 ret = osm_switch_set_hops(p_sw, lid, port, hops);
1653 if (ret != CL_SUCCESS)
1654 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1655 "ERR AD05: cannot set hops for LID %" PRIu16
1656 " at switch 0x%" PRIx64 "\n", lid,
1657 cl_ntoh64(osm_node_get_node_guid
1658 (p_sw->p_node)));
1659 }
1660
1661 OSM_LOG_EXIT(p_mgr->p_log);
1662 return 0;
1663 }
1664
1665 /* the function updates the multicast group membership information
1666 similar to create_mgrp_switch_map (osm_mcast_mgr.c)
1667 => with it we can identify if a switch needs to be processed
1668 or not in update_mcft
1669 */
update_mgrp_membership(cl_qlist_t * port_list)1670 static void update_mgrp_membership(cl_qlist_t * port_list)
1671 {
1672 osm_mcast_work_obj_t *wobj = NULL;
1673 osm_port_t *port = NULL;
1674 osm_switch_t *sw = NULL;
1675 cl_list_item_t *i = NULL;
1676
1677 for (i = cl_qlist_head(port_list); i != cl_qlist_end(port_list);
1678 i = cl_qlist_next(i)) {
1679 wobj = cl_item_obj(i, wobj, list_item);
1680 port = wobj->p_port;
1681 if (port->p_node->sw) {
1682 sw = port->p_node->sw;
1683 sw->is_mc_member = 1;
1684 } else {
1685 sw = port->p_physp->p_remote_physp->p_node->sw;
1686 sw->num_of_mcm++;
1687 }
1688 }
1689 }
1690
1691 /* reset is_mc_member and num_of_mcm for future computations */
reset_mgrp_membership(vertex_t * adj_list,uint32_t adj_list_size)1692 static void reset_mgrp_membership(vertex_t * adj_list, uint32_t adj_list_size)
1693 {
1694 uint32_t i = 0;
1695
1696 for (i = 1; i < adj_list_size; i++) {
1697 if (adj_list[i].dropped)
1698 continue;
1699
1700 adj_list[i].sw->is_mc_member = 0;
1701 adj_list[i].sw->num_of_mcm = 0;
1702 }
1703 }
1704
1705 /* update the multicast forwarding tables of all switches with the informations
1706 from the previous dijsktra step for the current mlid
1707 */
update_mcft(osm_sm_t * p_sm,vertex_t * adj_list,uint32_t adj_list_size,uint16_t mlid_ho,cl_qmap_t * port_map,osm_switch_t * root_sw)1708 static int update_mcft(osm_sm_t * p_sm, vertex_t * adj_list,
1709 uint32_t adj_list_size, uint16_t mlid_ho,
1710 cl_qmap_t * port_map, osm_switch_t * root_sw)
1711 {
1712 uint32_t i = 0;
1713 uint8_t port = 0, remote_port = 0;
1714 uint8_t upstream_port = 0, downstream_port = 0;
1715 ib_net64_t guid = 0;
1716 osm_switch_t *p_sw = NULL;
1717 osm_node_t *remote_node = NULL;
1718 osm_physp_t *p_physp = NULL;
1719 osm_mcast_tbl_t *p_tbl = NULL;
1720 vertex_t *curr_adj = NULL;
1721
1722 OSM_LOG_ENTER(p_sm->p_log);
1723
1724 for (i = 1; i < adj_list_size; i++) {
1725 if (adj_list[i].dropped)
1726 continue;
1727
1728 p_sw = adj_list[i].sw;
1729 OSM_LOG(p_sm->p_log, OSM_LOG_VERBOSE,
1730 "Processing switch 0x%016" PRIx64
1731 " (%s) for MLID 0x%X\n", cl_ntoh64(adj_list[i].guid),
1732 p_sw->p_node->print_desc, mlid_ho);
1733
1734 /* if a) the switch does not support mcast or
1735 b) no ports of this switch are part or the mcast group
1736 then cycle
1737 */
1738 if (osm_switch_supports_mcast(p_sw) == FALSE ||
1739 (p_sw->num_of_mcm == 0 && !(p_sw->is_mc_member)))
1740 continue;
1741
1742 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
1743
1744 /* add all ports of this sw to the mcast table,
1745 if they are part of the mcast grp
1746 */
1747 if (p_sw->is_mc_member)
1748 osm_mcast_tbl_set(p_tbl, mlid_ho, 0);
1749 for (port = 1; port < p_sw->num_ports; port++) {
1750 /* get the node behind the port */
1751 remote_node =
1752 osm_node_get_remote_node(p_sw->p_node, port,
1753 &remote_port);
1754 /* check if connected and its not the same switch */
1755 if (!remote_node || remote_node->sw == p_sw)
1756 continue;
1757 /* make sure the link is healthy */
1758 p_physp = osm_node_get_physp_ptr(p_sw->p_node, port);
1759 if (!p_physp || !osm_link_is_healthy(p_physp))
1760 continue;
1761 /* we don't add upstream ports in this step */
1762 if (osm_node_get_type(remote_node) != IB_NODE_TYPE_CA)
1763 continue;
1764
1765 guid = osm_physp_get_port_guid(osm_node_get_physp_ptr(
1766 remote_node,
1767 remote_port));
1768 if (cl_qmap_get(port_map, guid)
1769 != cl_qmap_end(port_map))
1770 osm_mcast_tbl_set(p_tbl, mlid_ho, port);
1771 }
1772
1773 /* now we have to add the upstream port of 'this' switch and
1774 the downstream port of the next switch to the mcast table
1775 until we reach the root_sw
1776 */
1777 curr_adj = &adj_list[i];
1778 while (curr_adj->sw != root_sw) {
1779 /* the used_link is the link that was used in dijkstra to reach this node,
1780 so the to_port is the local (upstream) port on curr_adj->sw
1781 */
1782 upstream_port = curr_adj->used_link->to_port;
1783 osm_mcast_tbl_set(p_tbl, mlid_ho, upstream_port);
1784
1785 /* now we go one step in direction root_sw and add the
1786 downstream port for the spanning tree
1787 */
1788 downstream_port = curr_adj->used_link->from_port;
1789 p_tbl = osm_switch_get_mcast_tbl_ptr(
1790 adj_list[curr_adj->used_link->from].sw);
1791 osm_mcast_tbl_set(p_tbl, mlid_ho, downstream_port);
1792
1793 curr_adj = &adj_list[curr_adj->used_link->from];
1794 }
1795 }
1796
1797 OSM_LOG_EXIT(p_sm->p_log);
1798 return 0;
1799 }
1800
1801 /* increment the edge weights of the df-/sssp graph which represent the number
1802 of paths on this link
1803 */
update_weights(osm_ucast_mgr_t * p_mgr,vertex_t * adj_list,uint32_t adj_list_size)1804 static void update_weights(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1805 uint32_t adj_list_size)
1806 {
1807 uint32_t i = 0, j = 0;
1808 uint32_t additional_weight = 0;
1809
1810 OSM_LOG_ENTER(p_mgr->p_log);
1811
1812 for (i = 1; i < adj_list_size; i++) {
1813 /* if no route goes thru this switch -> cycle */
1814 if (!(adj_list[i].used_link))
1815 continue;
1816 additional_weight = adj_list[i].num_hca;
1817
1818 j = i;
1819 while (adj_list[j].used_link) {
1820 /* update the link from pre to this node */
1821 adj_list[j].used_link->weight += additional_weight;
1822
1823 j = adj_list[j].used_link->from;
1824 }
1825 }
1826
1827 OSM_LOG_EXIT(p_mgr->p_log);
1828 }
1829
1830 /* get the largest number of virtual lanes which is supported by all switches
1831 in the subnet
1832 */
get_avail_vl_in_subn(osm_ucast_mgr_t * p_mgr)1833 static uint8_t get_avail_vl_in_subn(osm_ucast_mgr_t * p_mgr)
1834 {
1835 uint32_t i = 0;
1836 uint8_t vls_avail = 0xFF, port_vls_avail = 0;
1837 cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
1838 cl_map_item_t *item = NULL;
1839 osm_switch_t *sw = NULL;
1840
1841 /* traverse all switches to get the number of available virtual lanes in the subnet */
1842 for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
1843 item = cl_qmap_next(item)) {
1844 sw = (osm_switch_t *) item;
1845
1846 /* ignore management port 0 */
1847 for (i = 1; i < osm_node_get_num_physp(sw->p_node); i++) {
1848 osm_physp_t *p_physp =
1849 osm_node_get_physp_ptr(sw->p_node, i);
1850
1851 if (p_physp && p_physp->p_remote_physp) {
1852 port_vls_avail =
1853 ib_port_info_get_op_vls(&p_physp->
1854 port_info);
1855 if (port_vls_avail
1856 && port_vls_avail < vls_avail)
1857 vls_avail = port_vls_avail;
1858 }
1859 }
1860 }
1861
1862 /* ib_port_info_get_op_vls gives values 1 ... 5 (s. IBAS 14.2.5.6) */
1863 vls_avail = 1 << (vls_avail - 1);
1864
1865 /* set boundaries (s. IBAS 3.5.7) */
1866 if (vls_avail > 15)
1867 vls_avail = 15;
1868 if (vls_avail < 1)
1869 vls_avail = 1;
1870
1871 return vls_avail;
1872 }
1873
1874 /* search for cycles in the channel dependency graph to identify possible
1875 deadlocks in the network;
1876 assign new virtual lanes to some paths to break the deadlocks
1877 */
dfsssp_remove_deadlocks(dfsssp_context_t * dfsssp_ctx)1878 static int dfsssp_remove_deadlocks(dfsssp_context_t * dfsssp_ctx)
1879 {
1880 osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
1881
1882 cl_qlist_t *port_tbl = &p_mgr->port_order_list; /* 1 management port per switch + 1 or 2 ports for each Hca */
1883 cl_list_item_t *item1 = NULL, *item2 = NULL;
1884 osm_port_t *src_port = NULL, *dest_port = NULL;
1885
1886 uint32_t i = 0, j = 0, err = 0;
1887 uint8_t vl = 0, test_vl = 0, vl_avail = 0, vl_needed = 1;
1888 double most_avg_paths = 0.0;
1889 cdg_node_t **cdg = NULL, *start_here = NULL, *cycle = NULL;
1890 cdg_link_t *weakest_link = NULL;
1891 uint32_t srcdest = 0;
1892
1893 vltable_t *srcdest2vl_table = NULL;
1894 uint8_t lmc = 0;
1895 uint16_t slid = 0, dlid = 0, min_lid_ho = 0, max_lid_ho =
1896 0, min_lid_ho2 = 0, max_lid_ho2 = 0;;
1897 uint64_t *paths_per_vl = NULL;
1898 uint64_t from = 0, to = 0, count = 0;
1899 uint8_t *split_count = NULL;
1900 uint8_t ntype = 0;
1901
1902 OSM_LOG_ENTER(p_mgr->p_log);
1903 OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
1904 "Assign each src/dest pair a Virtual Lanes, to remove deadlocks in the routing\n");
1905
1906 vl_avail = get_avail_vl_in_subn(p_mgr);
1907 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
1908 "Virtual Lanes available: %" PRIu8 "\n", vl_avail);
1909
1910 paths_per_vl = (uint64_t *) malloc(vl_avail * sizeof(uint64_t));
1911 if (!paths_per_vl) {
1912 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1913 "ERR AD22: cannot allocate memory for paths_per_vl\n");
1914 return 1;
1915 }
1916 memset(paths_per_vl, 0, vl_avail * sizeof(uint64_t));
1917
1918 cdg = (cdg_node_t **) malloc(vl_avail * sizeof(cdg_node_t *));
1919 if (!cdg) {
1920 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1921 "ERR AD23: cannot allocate memory for cdg\n");
1922 free(paths_per_vl);
1923 return 1;
1924 }
1925 for (i = 0; i < vl_avail; i++)
1926 cdg[i] = NULL;
1927
1928 count = 0;
1929 /* count all ports (also multiple LIDs) of type CA or SP0 for size of VL table */
1930 for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1931 item1 = cl_qlist_next(item1)) {
1932 dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1933 list_item);
1934 ntype = osm_node_get_type(dest_port->p_node);
1935 if (ntype == IB_NODE_TYPE_CA || ntype == IB_NODE_TYPE_SWITCH) {
1936 /* only SP0 with SLtoVLMapping support will be processed */
1937 if (ntype == IB_NODE_TYPE_SWITCH
1938 && !(dest_port->p_physp->port_info.capability_mask
1939 & IB_PORT_CAP_HAS_SL_MAP))
1940 continue;
1941
1942 lmc = osm_port_get_lmc(dest_port);
1943 count += (1 << lmc);
1944 }
1945 }
1946 /* allocate VL table and indexing array */
1947 err = vltable_alloc(&srcdest2vl_table, count);
1948 if (err) {
1949 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1950 "ERR AD26: cannot allocate memory for srcdest2vl_table\n");
1951 goto ERROR;
1952 }
1953
1954 i = 0;
1955 /* fill lids into indexing array */
1956 for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1957 item1 = cl_qlist_next(item1)) {
1958 dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1959 list_item);
1960 ntype = osm_node_get_type(dest_port->p_node);
1961 if (ntype == IB_NODE_TYPE_CA || ntype == IB_NODE_TYPE_SWITCH) {
1962 /* only SP0 with SLtoVLMapping support will be processed */
1963 if (ntype == IB_NODE_TYPE_SWITCH
1964 && !(dest_port->p_physp->port_info.capability_mask
1965 & IB_PORT_CAP_HAS_SL_MAP))
1966 continue;
1967
1968 osm_port_get_lid_range_ho(dest_port, &min_lid_ho,
1969 &max_lid_ho);
1970 for (dlid = min_lid_ho; dlid <= max_lid_ho; dlid++, i++)
1971 srcdest2vl_table->lids[i] = cl_hton16(dlid);
1972 }
1973 }
1974 /* sort lids */
1975 vltable_sort_lids(srcdest2vl_table);
1976
1977 test_vl = 0;
1978 /* fill cdg[0] with routes from each src/dest port combination for all Hca/SP0 in the subnet */
1979 for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1980 item1 = cl_qlist_next(item1)) {
1981 dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1982 list_item);
1983 ntype = osm_node_get_type(dest_port->p_node);
1984 if ((ntype != IB_NODE_TYPE_CA && ntype != IB_NODE_TYPE_SWITCH)
1985 || !(dest_port->p_physp->port_info.capability_mask
1986 & IB_PORT_CAP_HAS_SL_MAP))
1987 continue;
1988
1989 for (item2 = cl_qlist_head(port_tbl);
1990 item2 != cl_qlist_end(port_tbl);
1991 item2 = cl_qlist_next(item2)) {
1992 src_port = (osm_port_t *)cl_item_obj(item2, src_port,
1993 list_item);
1994 ntype = osm_node_get_type(src_port->p_node);
1995 if ((ntype != IB_NODE_TYPE_CA
1996 && ntype != IB_NODE_TYPE_SWITCH)
1997 || !(src_port->p_physp->port_info.capability_mask
1998 & IB_PORT_CAP_HAS_SL_MAP))
1999 continue;
2000
2001 if (src_port != dest_port) {
2002 /* iterate over LIDs of src and dest port */
2003 osm_port_get_lid_range_ho(src_port, &min_lid_ho,
2004 &max_lid_ho);
2005 for (slid = min_lid_ho; slid <= max_lid_ho;
2006 slid++) {
2007 osm_port_get_lid_range_ho
2008 (dest_port, &min_lid_ho2,
2009 &max_lid_ho2);
2010 for (dlid = min_lid_ho2;
2011 dlid <= max_lid_ho2;
2012 dlid++) {
2013
2014 /* try to add the path to cdg[0] */
2015 err =
2016 update_channel_dep_graph
2017 (&(cdg[test_vl]),
2018 src_port, slid,
2019 dest_port, dlid);
2020 if (err) {
2021 OSM_LOG(p_mgr->
2022 p_log,
2023 OSM_LOG_ERROR,
2024 "ERR AD14: cannot allocate memory for cdg node or link in update_channel_dep_graph(...)\n");
2025 goto ERROR;
2026 }
2027 /* add the <s,d> combination / corresponding virtual lane to the VL table */
2028 vltable_insert
2029 (srcdest2vl_table,
2030 cl_hton16(slid),
2031 cl_hton16(dlid),
2032 test_vl);
2033 paths_per_vl[test_vl]++;
2034
2035 }
2036
2037 }
2038 }
2039
2040 }
2041 }
2042 dfsssp_ctx->srcdest2vl_table = srcdest2vl_table;
2043
2044 /* test all cdg for cycles and break the cycles by moving paths on the weakest link to the next cdg */
2045 for (test_vl = 0; test_vl < vl_avail - 1; test_vl++) {
2046 start_here = cdg[test_vl];
2047 while (start_here) {
2048 cycle =
2049 search_cycle_in_channel_dep_graph(cdg[test_vl],
2050 start_here);
2051
2052 if (cycle) {
2053 vl_needed = test_vl + 2;
2054
2055 /* calc weakest link n cycle */
2056 weakest_link = get_weakest_link_in_cycle(cycle);
2057 if (!weakest_link) {
2058 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2059 "ERR AD27: something went wrong in get_weakest_link_in_cycle(...)\n");
2060 err = 1;
2061 goto ERROR;
2062 }
2063
2064 paths_per_vl[test_vl] -=
2065 weakest_link->num_pairs;
2066 paths_per_vl[test_vl + 1] +=
2067 weakest_link->num_pairs;
2068
2069 /* move all <s,d> paths on this link to the next cdg */
2070 for (i = 0; i < weakest_link->num_pairs; i++) {
2071 srcdest =
2072 get_next_srcdest_pair(weakest_link,
2073 i);
2074 slid = (uint16_t) (srcdest >> 16);
2075 dlid =
2076 (uint16_t) ((srcdest << 16) >> 16);
2077
2078 /* only move if not moved in a previous step */
2079 if (test_vl !=
2080 (uint8_t)
2081 vltable_get_vl(srcdest2vl_table,
2082 cl_hton16(slid),
2083 cl_hton16(dlid))) {
2084 /* this path has been moved
2085 before -> don't count
2086 */
2087 paths_per_vl[test_vl]++;
2088 paths_per_vl[test_vl + 1]--;
2089 continue;
2090 }
2091
2092 src_port =
2093 osm_get_port_by_lid(p_mgr->p_subn,
2094 cl_hton16
2095 (slid));
2096 dest_port =
2097 osm_get_port_by_lid(p_mgr->p_subn,
2098 cl_hton16
2099 (dlid));
2100
2101 /* remove path from current cdg / vl */
2102 err =
2103 remove_path_from_cdg(&
2104 (cdg[test_vl]),
2105 src_port, slid,
2106 dest_port,
2107 dlid);
2108 if (err) {
2109 OSM_LOG(p_mgr->p_log,
2110 OSM_LOG_ERROR,
2111 "ERR AD44: something went wrong in remove_path_from_cdg(...)\n");
2112 goto ERROR;
2113 }
2114
2115 /* add path to next cdg / vl */
2116 err =
2117 update_channel_dep_graph(&
2118 (cdg
2119 [test_vl +
2120 1]),
2121 src_port,
2122 slid,
2123 dest_port,
2124 dlid);
2125 if (err) {
2126 OSM_LOG(p_mgr->p_log,
2127 OSM_LOG_ERROR,
2128 "ERR AD14: cannot allocate memory for cdg node or link in update_channel_dep_graph(...)\n");
2129 goto ERROR;
2130 }
2131 vltable_insert(srcdest2vl_table,
2132 cl_hton16(slid),
2133 cl_hton16(dlid),
2134 test_vl + 1);
2135 }
2136
2137 if (weakest_link->num_pairs)
2138 free(weakest_link->srcdest_pairs);
2139 if (weakest_link)
2140 free(weakest_link);
2141 }
2142
2143 start_here = cycle;
2144 }
2145 }
2146
2147 /* test the last avail cdg for a cycle;
2148 if there is one, than vl_needed > vl_avail
2149 */
2150 start_here = cdg[vl_avail - 1];
2151 if (start_here) {
2152 cycle =
2153 search_cycle_in_channel_dep_graph(cdg[vl_avail - 1],
2154 start_here);
2155 if (cycle) {
2156 vl_needed = vl_avail + 1;
2157 }
2158 }
2159
2160 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2161 "Virtual Lanes needed: %" PRIu8 "\n", vl_needed);
2162 if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_INFO)) {
2163 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2164 "Paths per VL (before balancing):\n");
2165 for (i = 0; i < vl_avail; i++)
2166 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2167 " %" PRIu32 ". lane: %" PRIu64 "\n", i,
2168 paths_per_vl[i]);
2169 }
2170
2171 OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
2172 "Balancing the paths on the available Virtual Lanes\n");
2173
2174 /* optimal balancing virtual lanes, under condition: no additional cycle checks;
2175 sl/vl != 0 might be assigned to loopback packets (i.e. slid/dlid on the
2176 same port for lmc>0), but thats no problem, see IBAS 10.2.2.3
2177 */
2178 split_count = (uint8_t *) calloc(vl_avail, sizeof(uint8_t));
2179 if (!split_count) {
2180 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2181 "ERR AD24: cannot allocate memory for split_count, skip balancing\n");
2182 err = 1;
2183 goto ERROR;
2184 }
2185 /* initial state: paths for VLs won't be separated */
2186 for (i = 0; i < ((vl_needed < vl_avail) ? vl_needed : vl_avail); i++)
2187 split_count[i] = 1;
2188 dfsssp_ctx->vl_split_count = split_count;
2189 /* balancing is necessary if we have empty VLs */
2190 if (vl_needed < vl_avail) {
2191 /* split paths of VLs until we find an equal distribution */
2192 for (i = vl_needed; i < vl_avail; i++) {
2193 /* find VL with most paths in it */
2194 vl = 0;
2195 most_avg_paths = 0.0;
2196 for (test_vl = 0; test_vl < vl_needed; test_vl++) {
2197 if (most_avg_paths <
2198 ((double)paths_per_vl[test_vl] /
2199 split_count[test_vl])) {
2200 vl = test_vl;
2201 most_avg_paths =
2202 (double)paths_per_vl[test_vl] /
2203 split_count[test_vl];
2204 }
2205 }
2206 split_count[vl]++;
2207 }
2208 /* change the VL assignment depending on split_count for
2209 all VLs except VL 0
2210 */
2211 for (from = vl_needed - 1; from > 0; from--) {
2212 /* how much space needed for others? */
2213 to = 0;
2214 for (i = 0; i < from; i++)
2215 to += split_count[i];
2216 count = paths_per_vl[from];
2217 vltable_change_vl(srcdest2vl_table, from, to, count);
2218 /* change also the information within the split_count
2219 array; this is important for fast calculation later
2220 */
2221 split_count[to] = split_count[from];
2222 split_count[from] = 0;
2223 paths_per_vl[to] = paths_per_vl[from];
2224 paths_per_vl[from] = 0;
2225 }
2226 } else if (vl_needed > vl_avail) {
2227 /* routing not possible, a further development would be the LASH-TOR approach (update: LASH-TOR isn't possible, there is a mistake in the theory) */
2228 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2229 "ERR AD25: Not enough VLs available (avail=%d, needed=%d); Stopping dfsssp routing!\n",
2230 vl_avail, vl_needed);
2231 err = 1;
2232 goto ERROR;
2233 }
2234 /* else { no balancing } */
2235
2236 if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG)) {
2237 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2238 "Virtual Lanes per src/dest combination after balancing:\n");
2239 vltable_print(p_mgr, srcdest2vl_table);
2240 }
2241 if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_INFO)) {
2242 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2243 "Approx. #paths per VL (after balancing):\n");
2244 j = 0;
2245 count = 1; /* to prevent div. by 0 */
2246 for (i = 0; i < vl_avail; i++) {
2247 if (split_count[i] > 0) {
2248 j = i;
2249 count = split_count[i];
2250 }
2251 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2252 " %" PRIu32 ". lane: %" PRIu64 "\n", i,
2253 paths_per_vl[j] / count);
2254 }
2255 }
2256
2257 free(paths_per_vl);
2258
2259 /* deallocate channel dependency graphs */
2260 for (i = 0; i < vl_avail; i++)
2261 cdg_dealloc(&cdg[i]);
2262 free(cdg);
2263
2264 OSM_LOG_EXIT(p_mgr->p_log);
2265 return 0;
2266
2267 ERROR:
2268 free(paths_per_vl);
2269
2270 for (i = 0; i < vl_avail; i++)
2271 cdg_dealloc(&cdg[i]);
2272 free(cdg);
2273
2274 vltable_dealloc(&srcdest2vl_table);
2275 dfsssp_ctx->srcdest2vl_table = NULL;
2276
2277 return err;
2278 }
2279
2280 /* meta function which calls subfunctions for dijkstra, update lft and weights,
2281 (and remove deadlocks) to calculate the routing for the subnet
2282 */
dfsssp_do_dijkstra_routing(void * context)2283 static int dfsssp_do_dijkstra_routing(void *context)
2284 {
2285 dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2286 osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2287 vertex_t *adj_list = (vertex_t *) dfsssp_ctx->adj_list;
2288 uint32_t adj_list_size = dfsssp_ctx->adj_list_size;
2289
2290 vertex_t **sw_list = NULL;
2291 uint32_t sw_list_size = 0;
2292 uint64_t guid = 0;
2293 cl_qlist_t *qlist = NULL;
2294 cl_list_item_t *qlist_item = NULL;
2295
2296 cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
2297 cl_qmap_t cn_tbl, io_tbl, *p_mixed_tbl = NULL;
2298 cl_map_item_t *item = NULL;
2299 osm_switch_t *sw = NULL;
2300 osm_port_t *port = NULL;
2301 uint32_t i = 0, err = 0;
2302 uint16_t lid = 0, min_lid_ho = 0, max_lid_ho = 0;
2303 uint8_t lmc = 0;
2304 boolean_t cn_nodes_provided = FALSE, io_nodes_provided = FALSE;
2305
2306 OSM_LOG_ENTER(p_mgr->p_log);
2307 OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
2308 "Calculating shortest path from all Hca/switches to all\n");
2309
2310 cl_qmap_init(&cn_tbl);
2311 cl_qmap_init(&io_tbl);
2312 p_mixed_tbl = &cn_tbl;
2313
2314 cl_qlist_init(&p_mgr->port_order_list);
2315
2316 /* reset the new_lft for each switch */
2317 for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
2318 item = cl_qmap_next(item)) {
2319 sw = (osm_switch_t *) item;
2320 /* initialize LIDs in buffer to invalid port number */
2321 memset(sw->new_lft, OSM_NO_PATH, sw->max_lid_ho + 1);
2322 /* initialize LFT and hop count for bsp0/esp0 of the switch */
2323 min_lid_ho = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
2324 lmc = osm_node_get_lmc(sw->p_node, 0);
2325 for (i = min_lid_ho; i < min_lid_ho + (1 << lmc); i++) {
2326 /* for each switch the port to the 'self'lid is the management port 0 */
2327 sw->new_lft[i] = 0;
2328 /* the hop count to the 'self'lid is 0 for each switch */
2329 osm_switch_set_hops(sw, i, 0, 0);
2330 }
2331 }
2332
2333 /* we need an intermediate array of pointers to switches in adj_list;
2334 this array will be sorted in respect to num_hca (descending)
2335 */
2336 sw_list_size = adj_list_size - 1;
2337 sw_list = (vertex_t **)malloc(sw_list_size * sizeof(vertex_t *));
2338 if (!sw_list) {
2339 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2340 "ERR AD29: cannot allocate memory for sw_list in dfsssp_do_dijkstra_routing\n");
2341 goto ERROR;
2342 }
2343 memset(sw_list, 0, sw_list_size * sizeof(vertex_t *));
2344
2345 /* fill the array with references to the 'real' sw in adj_list */
2346 for (i = 0; i < sw_list_size; i++)
2347 sw_list[i] = &(adj_list[i + 1]);
2348
2349 /* sort the sw_list in descending order */
2350 sw_list_sort_by_num_hca(sw_list, sw_list_size);
2351
2352 /* parse compute node guid file, if provided by the user */
2353 if (p_mgr->p_subn->opt.cn_guid_file) {
2354 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2355 "Parsing compute nodes from file %s\n",
2356 p_mgr->p_subn->opt.cn_guid_file);
2357
2358 if (parse_node_map(p_mgr->p_subn->opt.cn_guid_file,
2359 add_guid_to_map, &cn_tbl)) {
2360 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2361 "ERR AD33: Problem parsing compute node guid file\n");
2362 goto ERROR;
2363 }
2364
2365 if (cl_is_qmap_empty(&cn_tbl))
2366 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2367 "WRN AD34: compute node guids file contains no valid guids\n");
2368 else
2369 cn_nodes_provided = TRUE;
2370 }
2371
2372 /* parse I/O guid file, if provided by the user */
2373 if (p_mgr->p_subn->opt.io_guid_file) {
2374 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2375 "Parsing I/O nodes from file %s\n",
2376 p_mgr->p_subn->opt.io_guid_file);
2377
2378 if (parse_node_map(p_mgr->p_subn->opt.io_guid_file,
2379 add_guid_to_map, &io_tbl)) {
2380 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2381 "ERR AD35: Problem parsing I/O guid file\n");
2382 goto ERROR;
2383 }
2384
2385 if (cl_is_qmap_empty(&io_tbl))
2386 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2387 "WRN AD36: I/O node guids file contains no valid guids\n");
2388 else
2389 io_nodes_provided = TRUE;
2390 }
2391
2392 /* if we mix Hca/Tca/SP0 during the dijkstra routing, we might end up
2393 in rare cases with a bad balancing for Hca<->Hca connections, i.e.
2394 some inter-switch links get oversubscribed with paths;
2395 therefore: add Hca ports first to ensure good Hca<->Hca balancing
2396 */
2397 if (cn_nodes_provided) {
2398 for (i = 0; i < adj_list_size - 1; i++) {
2399 if (sw_list[i] && sw_list[i]->sw) {
2400 sw = (osm_switch_t *)(sw_list[i]->sw);
2401 add_sw_endports_to_order_list(sw, p_mgr,
2402 &cn_tbl, TRUE);
2403 } else {
2404 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2405 "ERR AD30: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2406 goto ERROR;
2407 }
2408 }
2409 }
2410 /* then: add Tca ports to ensure good Hca->Tca balancing and separate
2411 paths towards I/O nodes on the same switch (if possible)
2412 */
2413 if (io_nodes_provided) {
2414 for (i = 0; i < adj_list_size - 1; i++) {
2415 if (sw_list[i] && sw_list[i]->sw) {
2416 sw = (osm_switch_t *)(sw_list[i]->sw);
2417 add_sw_endports_to_order_list(sw, p_mgr,
2418 &io_tbl, TRUE);
2419 } else {
2420 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2421 "ERR AD32: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2422 goto ERROR;
2423 }
2424 }
2425 }
2426 /* then: add anything else, such as administration nodes, ... */
2427 if (cn_nodes_provided && io_nodes_provided) {
2428 cl_qmap_merge(&cn_tbl, &io_tbl);
2429 } else if (io_nodes_provided) {
2430 p_mixed_tbl = &io_tbl;
2431 }
2432 for (i = 0; i < adj_list_size - 1; i++) {
2433 if (sw_list[i] && sw_list[i]->sw) {
2434 sw = (osm_switch_t *)(sw_list[i]->sw);
2435 add_sw_endports_to_order_list(sw, p_mgr, p_mixed_tbl,
2436 FALSE);
2437 } else {
2438 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2439 "ERR AD39: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2440 goto ERROR;
2441 }
2442 }
2443 /* last: add SP0 afterwards which have lower priority for balancing */
2444 for (i = 0; i < sw_list_size; i++) {
2445 if (sw_list[i] && sw_list[i]->sw) {
2446 sw = (osm_switch_t *)(sw_list[i]->sw);
2447 guid = cl_ntoh64(osm_node_get_node_guid(sw->p_node));
2448 add_guid_to_order_list(guid, p_mgr);
2449 } else {
2450 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2451 "ERR AD31: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2452 goto ERROR;
2453 }
2454 }
2455
2456 /* the intermediate array lived long enough */
2457 free(sw_list);
2458 sw_list = NULL;
2459 /* same is true for the compute node and I/O guid map */
2460 destroy_guid_map(&cn_tbl);
2461 cn_nodes_provided = FALSE;
2462 destroy_guid_map(&io_tbl);
2463 io_nodes_provided = FALSE;
2464
2465 /* do the routing for the each Hca in the subnet and each switch
2466 in the subnet (to add the routes to base/enhanced SP0)
2467 */
2468 qlist = &p_mgr->port_order_list;
2469 for (qlist_item = cl_qlist_head(qlist);
2470 qlist_item != cl_qlist_end(qlist);
2471 qlist_item = cl_qlist_next(qlist_item)) {
2472 port = (osm_port_t *)cl_item_obj(qlist_item, port, list_item);
2473
2474 /* calculate shortest path with dijkstra from node to all switches/Hca */
2475 if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA) {
2476 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2477 "Processing Hca with GUID 0x%" PRIx64 "\n",
2478 cl_ntoh64(osm_node_get_node_guid
2479 (port->p_node)));
2480 } else if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_SWITCH) {
2481 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2482 "Processing switch with GUID 0x%" PRIx64 "\n",
2483 cl_ntoh64(osm_node_get_node_guid
2484 (port->p_node)));
2485 } else {
2486 /* we don't handle routers, in case they show up */
2487 continue;
2488 }
2489
2490 /* distribute the LID range across the ports that can reach those LIDs
2491 to have disjoint paths for one destination port with lmc>0;
2492 for switches with bsp0: min=max; with esp0: max>min if lmc>0
2493 */
2494 osm_port_get_lid_range_ho(port, &min_lid_ho,
2495 &max_lid_ho);
2496 for (lid = min_lid_ho; lid <= max_lid_ho; lid++) {
2497 /* do dijkstra from this Hca/LID/SP0 to each switch */
2498 err =
2499 dijkstra(p_mgr, adj_list, adj_list_size, port, lid);
2500 if (err)
2501 goto ERROR;
2502 if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
2503 print_routes(p_mgr, adj_list, adj_list_size,
2504 port);
2505
2506 /* make an update for the linear forwarding tables of the switches */
2507 err =
2508 update_lft(p_mgr, adj_list, adj_list_size, port, lid);
2509 if (err)
2510 goto ERROR;
2511
2512 /* add weights for calculated routes to adjust the weights for the next cycle */
2513 update_weights(p_mgr, adj_list, adj_list_size);
2514
2515 if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
2516 dfsssp_print_graph(p_mgr, adj_list,
2517 adj_list_size);
2518 }
2519 }
2520
2521 /* try deadlock removal only for the dfsssp routing (not for the sssp case, which is a subset of the dfsssp algorithm) */
2522 if (dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_DFSSSP) {
2523 /* remove potential deadlocks by assigning different virtual lanes to src/dest paths and balance the lanes */
2524 err = dfsssp_remove_deadlocks(dfsssp_ctx);
2525 if (err)
2526 goto ERROR;
2527 } else if (dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_SSSP) {
2528 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2529 "SSSP routing specified -> skipping deadlock removal thru dfsssp_remove_deadlocks(...)\n");
2530 } else {
2531 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2532 "ERR AD28: wrong routing engine specified in dfsssp_ctx\n");
2533 goto ERROR;
2534 }
2535
2536 /* list not needed after the dijkstra steps and deadlock removal */
2537 cl_qlist_remove_all(&p_mgr->port_order_list);
2538
2539 /* print the new_lft for each switch after routing is done */
2540 if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG)) {
2541 for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
2542 item = cl_qmap_next(item)) {
2543 sw = (osm_switch_t *) item;
2544 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2545 "Summary of the (new) LFT for switch 0x%" PRIx64
2546 " (%s):\n",
2547 cl_ntoh64(osm_node_get_node_guid(sw->p_node)),
2548 sw->p_node->print_desc);
2549 for (i = 0; i < sw->max_lid_ho + 1; i++)
2550 if (sw->new_lft[i] != OSM_NO_PATH) {
2551 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2552 " for LID=%" PRIu32
2553 " use port=%" PRIu8 "\n", i,
2554 sw->new_lft[i]);
2555 }
2556 }
2557 }
2558
2559 OSM_LOG_EXIT(p_mgr->p_log);
2560 return 0;
2561
2562 ERROR:
2563 if (!cl_is_qlist_empty(&p_mgr->port_order_list))
2564 cl_qlist_remove_all(&p_mgr->port_order_list);
2565 if (cn_nodes_provided)
2566 destroy_guid_map(&cn_tbl);
2567 if (io_nodes_provided)
2568 destroy_guid_map(&io_tbl);
2569 if (sw_list)
2570 free(sw_list);
2571 return -1;
2572 }
2573
2574 /* meta function which calls subfunctions for finding the optimal switch
2575 for the spanning tree, performing a dijkstra step with this sw as root,
2576 and calculating the mcast table for MLID
2577 */
dfsssp_do_mcast_routing(void * context,osm_mgrp_box_t * mbox)2578 static ib_api_status_t dfsssp_do_mcast_routing(void * context,
2579 osm_mgrp_box_t * mbox)
2580 {
2581 dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2582 osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2583 osm_sm_t *sm = (osm_sm_t *) p_mgr->sm;
2584 vertex_t *adj_list = (vertex_t *) dfsssp_ctx->adj_list;
2585 uint32_t adj_list_size = dfsssp_ctx->adj_list_size;
2586 cl_qlist_t mcastgrp_port_list;
2587 cl_qmap_t mcastgrp_port_map;
2588 osm_switch_t *root_sw = NULL, *p_sw = NULL;
2589 osm_port_t *port = NULL;
2590 ib_net16_t lid = 0;
2591 uint32_t err = 0, num_ports = 0, i = 0;
2592 ib_net64_t guid = 0;
2593 ib_api_status_t status = IB_SUCCESS;
2594
2595 OSM_LOG_ENTER(sm->p_log);
2596
2597 /* using the ucast cache feature with dfsssp might mean that a leaf sw
2598 got removed (and got back) without calling dfsssp_build_graph
2599 and therefore the adj_list (and pointers to osm's internal switches)
2600 could be outdated (here we have no knowledge if it has happened, so
2601 unfortunately a check is necessary... still better than rebuilding
2602 adj_list every time we arrive here)
2603 */
2604 if (p_mgr->p_subn->opt.use_ucast_cache && p_mgr->cache_valid) {
2605 for (i = 1; i < adj_list_size; i++) {
2606 guid = cl_hton64(adj_list[i].guid);
2607 p_sw = osm_get_switch_by_guid(p_mgr->p_subn, guid);
2608 if (p_sw) {
2609 /* check if switch came back from the dead */
2610 if (adj_list[i].dropped)
2611 adj_list[i].dropped = FALSE;
2612
2613 /* verify that sw object has not been moved
2614 (this can happen for a leaf switch, if it
2615 was dropped and came back later without a
2616 rerouting), otherwise we have to update
2617 dfsssp's internal switch list with the new
2618 sw pointer
2619 */
2620 if (p_sw == adj_list[i].sw)
2621 continue;
2622 else
2623 adj_list[i].sw = p_sw;
2624 } else {
2625 /* if a switch from adj_list is not in the
2626 sw_guid_tbl anymore, then the only reason is
2627 that it was a leaf switch and opensm dropped
2628 it without calling a rerouting
2629 -> calling dijkstra is no problem, since it
2630 is a leaf and different from root_sw
2631 -> only update_mcft and reset_mgrp_membership
2632 need to be aware of these dropped switches
2633 */
2634 if (!adj_list[i].dropped)
2635 adj_list[i].dropped = TRUE;
2636 }
2637 }
2638 }
2639
2640 /* create a map and a list of all ports which are member in the mcast
2641 group; map for searching elements and list for iteration
2642 */
2643 if (osm_mcast_make_port_list_and_map(&mcastgrp_port_list,
2644 &mcastgrp_port_map, mbox)) {
2645 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD50: "
2646 "Insufficient memory to make port list\n");
2647 status = IB_ERROR;
2648 goto Exit;
2649 }
2650
2651 num_ports = cl_qlist_count(&mcastgrp_port_list);
2652 if (num_ports < 2) {
2653 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
2654 "MLID 0x%X has %u members - nothing to do\n",
2655 mbox->mlid, num_ports);
2656 goto Exit;
2657 }
2658
2659 /* find the root switch for the spanning tree, which has the smallest
2660 hops count to all LIDs in the mcast group
2661 */
2662 root_sw = osm_mcast_mgr_find_root_switch(sm, &mcastgrp_port_list);
2663 if (!root_sw) {
2664 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD51: "
2665 "Unable to locate a suitable switch for group 0x%X\n",
2666 mbox->mlid);
2667 status = IB_ERROR;
2668 goto Exit;
2669 }
2670
2671 /* a) start one dijkstra step from the root switch to generate a
2672 spanning tree
2673 b) this might be a bit of an overkill to span the whole
2674 network, if there are only a few ports in the mcast group, but
2675 its only one dijkstra step for each mcast group and we did many
2676 steps before in the ucast routing for each LID in the subnet;
2677 c) we can use the subnet structure from the ucast routing, and
2678 don't even have to reset the link weights (=> therefore the mcast
2679 spanning tree will use less 'growded' links in the network)
2680 d) the mcast dfsssp algorithm will not change the link weights
2681 */
2682 lid = osm_node_get_base_lid(root_sw->p_node, 0);
2683 port = osm_get_port_by_lid(sm->p_subn, lid);
2684 err = dijkstra(p_mgr, adj_list, adj_list_size, port, lid);
2685 if (err) {
2686 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD52: "
2687 "Dijkstra step for mcast failed for group 0x%X\n",
2688 mbox->mlid);
2689 status = IB_ERROR;
2690 goto Exit;
2691 }
2692
2693 /* set mcast group membership again for update_mcft
2694 (unfortunately: osm_mcast_mgr_find_root_switch resets it)
2695 */
2696 update_mgrp_membership(&mcastgrp_port_list);
2697
2698 /* update the mcast forwarding tables of the switches */
2699 err = update_mcft(sm, adj_list, adj_list_size, mbox->mlid,
2700 &mcastgrp_port_map, root_sw);
2701 if (err) {
2702 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD53: "
2703 "Update of mcast forwarding tables failed for group 0x%X\n",
2704 mbox->mlid);
2705 status = IB_ERROR;
2706 goto Exit;
2707 }
2708
2709 Exit:
2710 reset_mgrp_membership(adj_list, adj_list_size);
2711 osm_mcast_drop_port_list(&mcastgrp_port_list);
2712 OSM_LOG_EXIT(sm->p_log);
2713 return status;
2714 }
2715
2716 /* called from extern in QP creation process to gain the the service level and
2717 the virtual lane respectively for a <s,d> pair
2718 */
get_dfsssp_sl(void * context,uint8_t hint_for_default_sl,const ib_net16_t slid,const ib_net16_t dlid)2719 static uint8_t get_dfsssp_sl(void *context, uint8_t hint_for_default_sl,
2720 const ib_net16_t slid, const ib_net16_t dlid)
2721 {
2722 dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2723 osm_port_t *src_port, *dest_port;
2724 vltable_t *srcdest2vl_table = NULL;
2725 uint8_t *vl_split_count = NULL;
2726 osm_ucast_mgr_t *p_mgr = NULL;
2727 int32_t res = 0;
2728
2729 if (dfsssp_ctx
2730 && dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_DFSSSP) {
2731 p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2732 srcdest2vl_table = (vltable_t *) (dfsssp_ctx->srcdest2vl_table);
2733 vl_split_count = (uint8_t *) (dfsssp_ctx->vl_split_count);
2734 }
2735 else
2736 return hint_for_default_sl;
2737
2738 src_port = osm_get_port_by_lid(p_mgr->p_subn, slid);
2739 if (!src_port)
2740 return hint_for_default_sl;
2741
2742 dest_port = osm_get_port_by_lid(p_mgr->p_subn, dlid);
2743 if (!dest_port)
2744 return hint_for_default_sl;
2745
2746 if (!srcdest2vl_table)
2747 return hint_for_default_sl;
2748
2749 res = vltable_get_vl(srcdest2vl_table, slid, dlid);
2750
2751 /* we will randomly distribute the traffic over multiple VLs if
2752 necessary for good balancing; therefore vl_split_count provides
2753 the number of VLs to use for certain traffic
2754 */
2755 if (res > -1) {
2756 if (vl_split_count[res] > 1)
2757 return (uint8_t) (res + rand()%(vl_split_count[res]));
2758 else
2759 return (uint8_t) res;
2760 } else
2761 return hint_for_default_sl;
2762 }
2763
dfsssp_context_create(osm_opensm_t * p_osm,osm_routing_engine_type_t routing_type)2764 static dfsssp_context_t *dfsssp_context_create(osm_opensm_t * p_osm,
2765 osm_routing_engine_type_t
2766 routing_type)
2767 {
2768 dfsssp_context_t *dfsssp_ctx = NULL;
2769
2770 /* allocate memory */
2771 dfsssp_ctx = (dfsssp_context_t *) malloc(sizeof(dfsssp_context_t));
2772 if (dfsssp_ctx) {
2773 /* set initial values */
2774 dfsssp_ctx->routing_type = routing_type;
2775 dfsssp_ctx->p_mgr = (osm_ucast_mgr_t *) & (p_osm->sm.ucast_mgr);
2776 dfsssp_ctx->adj_list = NULL;
2777 dfsssp_ctx->adj_list_size = 0;
2778 dfsssp_ctx->srcdest2vl_table = NULL;
2779 dfsssp_ctx->vl_split_count = NULL;
2780 } else {
2781 OSM_LOG(p_osm->sm.ucast_mgr.p_log, OSM_LOG_ERROR,
2782 "ERR AD04: cannot allocate memory for dfsssp_ctx in dfsssp_context_create\n");
2783 return NULL;
2784 }
2785
2786 return dfsssp_ctx;
2787 }
2788
dfsssp_context_destroy(void * context)2789 static void dfsssp_context_destroy(void *context)
2790 {
2791 dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2792 vertex_t *adj_list = (vertex_t *) (dfsssp_ctx->adj_list);
2793 uint32_t i = 0;
2794 link_t *link = NULL, *tmp = NULL;
2795
2796 /* free adj_list */
2797 for (i = 0; i < dfsssp_ctx->adj_list_size; i++) {
2798 link = adj_list[i].links;
2799 while (link) {
2800 tmp = link;
2801 link = link->next;
2802 free(tmp);
2803 }
2804 }
2805 free(adj_list);
2806 dfsssp_ctx->adj_list = NULL;
2807 dfsssp_ctx->adj_list_size = 0;
2808
2809 /* free srcdest2vl table and the split count information table
2810 (can be done, because dfsssp_context_destroy is called after
2811 osm_get_dfsssp_sl)
2812 */
2813 vltable_dealloc(&(dfsssp_ctx->srcdest2vl_table));
2814 dfsssp_ctx->srcdest2vl_table = NULL;
2815
2816 if (dfsssp_ctx->vl_split_count) {
2817 free(dfsssp_ctx->vl_split_count);
2818 dfsssp_ctx->vl_split_count = NULL;
2819 }
2820 }
2821
delete(void * context)2822 static void delete(void *context)
2823 {
2824 if (!context)
2825 return;
2826 dfsssp_context_destroy(context);
2827
2828 free(context);
2829 }
2830
osm_ucast_dfsssp_setup(struct osm_routing_engine * r,osm_opensm_t * p_osm)2831 int osm_ucast_dfsssp_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
2832 {
2833 /* create context container and add ucast management object */
2834 dfsssp_context_t *dfsssp_context =
2835 dfsssp_context_create(p_osm, OSM_ROUTING_ENGINE_TYPE_DFSSSP);
2836 if (!dfsssp_context) {
2837 return 1; /* alloc failed -> skip this routing */
2838 }
2839
2840 /* reset function pointers to dfsssp routines */
2841 r->context = (void *)dfsssp_context;
2842 r->build_lid_matrices = dfsssp_build_graph;
2843 r->ucast_build_fwd_tables = dfsssp_do_dijkstra_routing;
2844 r->mcast_build_stree = dfsssp_do_mcast_routing;
2845 r->path_sl = get_dfsssp_sl;
2846 r->destroy = delete;
2847
2848 /* we initialize with the current time to achieve a 'good' randomized
2849 assignment in get_dfsssp_sl(...)
2850 */
2851 srand(time(NULL));
2852
2853 return 0;
2854 }
2855
osm_ucast_sssp_setup(struct osm_routing_engine * r,osm_opensm_t * p_osm)2856 int osm_ucast_sssp_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
2857 {
2858 /* create context container and add ucast management object */
2859 dfsssp_context_t *dfsssp_context =
2860 dfsssp_context_create(p_osm, OSM_ROUTING_ENGINE_TYPE_SSSP);
2861 if (!dfsssp_context) {
2862 return 1; /* alloc failed -> skip this routing */
2863 }
2864
2865 /* reset function pointers to sssp routines */
2866 r->context = (void *)dfsssp_context;
2867 r->build_lid_matrices = dfsssp_build_graph;
2868 r->ucast_build_fwd_tables = dfsssp_do_dijkstra_routing;
2869 r->mcast_build_stree = dfsssp_do_mcast_routing;
2870 r->destroy = delete;
2871
2872 return 0;
2873 }
2874