xref: /freebsd/contrib/ofed/opensm/opensm/osm_ucast_dnup.c (revision c1cdf6a42f0d951ba720688dfc6ce07608b02f6e)
1 /*
2  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2002-2007,2009 Mellanox Technologies LTD. All rights reserved.
4  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5  * Copyright (c) 2009 HNR Consulting. All rights reserved.
6  * Copyright (c) 2009 Battelle Memorial Institue. All rights reserved.
7  *
8  * This software is available to you under a choice of one of two
9  * licenses.  You may choose to be licensed under the terms of the GNU
10  * General Public License (GPL) Version 2, available from the file
11  * COPYING in the main directory of this source tree, or the
12  * OpenIB.org BSD license below:
13  *
14  *     Redistribution and use in source and binary forms, with or
15  *     without modification, are permitted provided that the following
16  *     conditions are met:
17  *
18  *      - Redistributions of source code must retain the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer.
21  *
22  *      - Redistributions in binary form must reproduce the above
23  *        copyright notice, this list of conditions and the following
24  *        disclaimer in the documentation and/or other materials
25  *        provided with the distribution.
26  *
27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34  * SOFTWARE.
35  *
36  */
37 
38 /*
39  * Abstract:
40  *      Implementation of Up Down Algorithm using ranking & Min Hop
41  *      Calculation functions
42  */
43 
44 #if HAVE_CONFIG_H
45 #  include <config.h>
46 #endif				/* HAVE_CONFIG_H */
47 
48 #include <stdlib.h>
49 #include <ctype.h>
50 #include <complib/cl_debug.h>
51 #include <complib/cl_qmap.h>
52 #include <opensm/osm_file_ids.h>
53 #define FILE_ID OSM_FILE_UCAST_DNUP_C
54 #include <opensm/osm_switch.h>
55 #include <opensm/osm_opensm.h>
56 #include <opensm/osm_ucast_mgr.h>
57 
58 /* //////////////////////////// */
59 /*  Local types                 */
60 /* //////////////////////////// */
61 
62 /* direction */
63 typedef enum dnup_switch_dir {
64 	UP = 0,
65 	DOWN,
66 	EQUAL
67 } dnup_switch_dir_t;
68 
69 /* dnup structure */
70 typedef struct dnup {
71 	osm_opensm_t *p_osm;
72 } dnup_t;
73 
74 struct dnup_node {
75 	cl_list_item_t list;
76 	osm_switch_t *sw;
77 	dnup_switch_dir_t dir;
78 	unsigned rank;
79 	unsigned visited;
80 };
81 
82 /* This function returns direction based on rank and guid info of current &
83    remote ports */
84 static dnup_switch_dir_t dnup_get_dir(unsigned cur_rank, unsigned rem_rank)
85 {
86 	/* HACK: comes to solve root nodes connection, in a classic subnet root nodes do not connect
87 	   directly, but in case they are we assign to root node an UP direction to allow DNUP to discover
88 	   the subnet correctly (and not from the point of view of the last root node).
89 	 */
90 	if (!cur_rank && !rem_rank)
91 		return EQUAL;
92 
93 	if (cur_rank < rem_rank)
94 		return DOWN;
95 	else if (cur_rank > rem_rank)
96 		return UP;
97 	else
98 		return EQUAL;
99 }
100 
101 /**********************************************************************
102  * This function does the bfs of min hop table calculation by guid index
103  * as a starting point.
104  **********************************************************************/
105 static int dnup_bfs_by_node(IN osm_log_t * p_log, IN osm_subn_t * p_subn,
106 			    IN osm_switch_t * p_sw, IN uint8_t prune_weight,
107 			    OUT uint8_t * max_hops)
108 {
109 	uint8_t pn, pn_rem;
110 	cl_qlist_t list;
111 	uint16_t lid;
112 	struct dnup_node *u;
113 	dnup_switch_dir_t next_dir, current_dir;
114 
115 	OSM_LOG_ENTER(p_log);
116 
117 	lid = osm_node_get_base_lid(p_sw->p_node, 0);
118 	lid = cl_ntoh16(lid);
119 	osm_switch_set_hops(p_sw, lid, 0, 0);
120 
121 	OSM_LOG(p_log, OSM_LOG_DEBUG,
122 		"Starting from switch - port GUID 0x%" PRIx64 " lid %u\n",
123 		cl_ntoh64(p_sw->p_node->node_info.port_guid), lid);
124 
125 	u = p_sw->priv;
126 	u->dir = DOWN;
127 
128 	/* Update list with the new element */
129 	cl_qlist_init(&list);
130 	cl_qlist_insert_tail(&list, &u->list);
131 
132 	/* BFS the list till no next element */
133 	while (!cl_is_qlist_empty(&list)) {
134 		u = (struct dnup_node *)cl_qlist_remove_head(&list);
135 		u->visited = 0;	/* cleanup */
136 		current_dir = u->dir;
137 		/* Go over all ports of the switch and find unvisited remote nodes */
138 		for (pn = 1; pn < u->sw->num_ports; pn++) {
139 			osm_node_t *p_remote_node;
140 			struct dnup_node *rem_u;
141 			uint8_t current_min_hop, remote_min_hop,
142 			    set_hop_return_value;
143 			osm_switch_t *p_remote_sw;
144 
145 			p_remote_node =
146 			    osm_node_get_remote_node(u->sw->p_node, pn,
147 						     &pn_rem);
148 			/* If no remote node OR remote node is not a SWITCH
149 			   continue to next pn */
150 			if (!p_remote_node || !p_remote_node->sw)
151 				continue;
152 			/* Fetch remote guid only after validation of remote node */
153 			p_remote_sw = p_remote_node->sw;
154 			rem_u = p_remote_sw->priv;
155 			/* Decide which direction to mark it (UP/DOWN) */
156 			next_dir = dnup_get_dir(u->rank, rem_u->rank);
157 
158 			/* Set MinHop value for the current lid */
159 			current_min_hop = osm_switch_get_least_hops(u->sw, lid);
160 			/* Check hop count if better insert into list && update
161 			   the remote node Min Hop Table */
162 			remote_min_hop =
163 			    osm_switch_get_hop_count(p_remote_sw, lid, pn_rem);
164 
165 			/* Check if this is a legal step : the only illegal step is going
166 			   from UP to DOWN */
167 			if ((current_dir == UP) && (next_dir == DOWN)) {
168 				OSM_LOG(p_log, OSM_LOG_DEBUG,
169 					"Avoiding move from 0x%016" PRIx64
170 					" to 0x%016" PRIx64 "\n",
171 					cl_ntoh64(osm_node_get_node_guid(u->sw->p_node)),
172 					cl_ntoh64(osm_node_get_node_guid(p_remote_node)));
173 				/* Illegal step. If prune_weight is set, allow it with an
174 				 * additional weight
175 				 */
176 				if(prune_weight) {
177 					current_min_hop+=prune_weight;
178 					if(current_min_hop >= 64) {
179 						OSM_LOG(p_log, OSM_LOG_ERROR,
180 							"ERR AE02: Too many hops on subnet,"
181 							" can't relax illegal Dn/Up transition.");
182 						osm_switch_set_hops(p_remote_sw, lid,
183 								    pn_rem, OSM_NO_PATH);
184 					}
185 				} else {
186 					continue;
187 				}
188 			}
189 			if (current_min_hop + 1 < remote_min_hop) {
190 				set_hop_return_value =
191 				    osm_switch_set_hops(p_remote_sw, lid,
192 							pn_rem,
193 							current_min_hop + 1);
194 				if(max_hops && current_min_hop + 1 > *max_hops) {
195 					*max_hops = current_min_hop + 1;
196 				}
197 				if (set_hop_return_value) {
198 					OSM_LOG(p_log, OSM_LOG_ERROR, "ERR AE01: "
199 						"Invalid value returned from set min hop is: %d\n",
200 						set_hop_return_value);
201 				}
202 				/* Check if remote port has already been visited */
203 				if (!rem_u->visited) {
204 					/* Insert dnup_switch item into the list */
205 					rem_u->dir = next_dir;
206 					rem_u->visited = 1;
207 					cl_qlist_insert_tail(&list,
208 							     &rem_u->list);
209 				}
210 			}
211 		}
212 	}
213 
214 	OSM_LOG_EXIT(p_log);
215 	return 0;
216 }
217 
218 /* NOTE : PLS check if we need to decide that the first */
219 /*        rank is a SWITCH for BFS purpose */
220 static int dnup_subn_rank(IN dnup_t * p_dnup)
221 {
222 	osm_switch_t *p_sw;
223 	osm_physp_t *p_physp, *p_remote_physp;
224 	cl_qlist_t list;
225 	cl_map_item_t *item;
226 	struct dnup_node *u, *remote_u;
227 	uint8_t num_ports, port_num;
228 	osm_log_t *p_log = &p_dnup->p_osm->log;
229 	unsigned max_rank = 0;
230 
231 	OSM_LOG_ENTER(p_log);
232 	cl_qlist_init(&list);
233 
234 	/* add all node level switches to the list */
235 	for (item = cl_qmap_head(&p_dnup->p_osm->subn.sw_guid_tbl);
236 	     item != cl_qmap_end(&p_dnup->p_osm->subn.sw_guid_tbl);
237 	     item = cl_qmap_next(item)) {
238 		p_sw = (osm_switch_t *)item;
239 		u = p_sw->priv;
240 		if (u->rank == 0)
241 			cl_qlist_insert_tail(&list, &u->list);
242 	}
243 
244 	/* BFS the list till it's empty */
245 	while (!cl_is_qlist_empty(&list)) {
246 		u = (struct dnup_node *)cl_qlist_remove_head(&list);
247 		/* Go over all remote nodes and rank them (if not already visited) */
248 		p_sw = u->sw;
249 		num_ports = p_sw->num_ports;
250 		OSM_LOG(p_log, OSM_LOG_DEBUG,
251 			"Handling switch GUID 0x%" PRIx64 "\n",
252 			cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
253 		for (port_num = 1; port_num < num_ports; port_num++) {
254 			ib_net64_t port_guid;
255 
256 			/* Current port fetched in order to get remote side */
257 			p_physp =
258 			    osm_node_get_physp_ptr(p_sw->p_node, port_num);
259 
260 			if (!p_physp)
261 				continue;
262 
263 			p_remote_physp = p_physp->p_remote_physp;
264 
265 			/*
266 			   make sure that all the following occur on p_remote_physp:
267 			   1. The port isn't NULL
268 			   2. It is a switch
269 			 */
270 			if (p_remote_physp && p_remote_physp->p_node->sw) {
271 				remote_u = p_remote_physp->p_node->sw->priv;
272 				port_guid = p_remote_physp->port_guid;
273 
274 				if (remote_u->rank > u->rank + 1) {
275 					remote_u->rank = u->rank + 1;
276 					max_rank = remote_u->rank;
277 					cl_qlist_insert_tail(&list,
278 							     &remote_u->list);
279 					OSM_LOG(p_log, OSM_LOG_DEBUG,
280 						"Rank of port GUID 0x%" PRIx64
281 						" = %u\n", cl_ntoh64(port_guid),
282 						remote_u->rank);
283 				}
284 			}
285 		}
286 	}
287 
288 	/* Print Summary of ranking */
289 	OSM_LOG(p_log, OSM_LOG_VERBOSE,
290 		"Subnet ranking completed. Max Node Rank = %d\n", max_rank);
291 	OSM_LOG_EXIT(p_log);
292 	return 0;
293 }
294 
295 static int dnup_set_min_hop_table(IN dnup_t * p_dnup)
296 {
297 	osm_subn_t *p_subn = &p_dnup->p_osm->subn;
298 	osm_log_t *p_log = &p_dnup->p_osm->log;
299 	osm_switch_t *p_sw;
300 	struct dnup_node *u;
301 	cl_map_item_t *item;
302 	uint8_t max_hops = 0;
303 
304 	OSM_LOG_ENTER(p_log);
305 
306 	/* Go over all the switches in the subnet - for each init their Min Hop
307 	   Table */
308 	OSM_LOG(p_log, OSM_LOG_VERBOSE,
309 		"Init Min Hop Table of all switches [\n");
310 
311 	for (item = cl_qmap_head(&p_dnup->p_osm->subn.sw_guid_tbl);
312 	     item != cl_qmap_end(&p_dnup->p_osm->subn.sw_guid_tbl);
313 	     item = cl_qmap_next(item)) {
314 		p_sw = (osm_switch_t *)item;
315 		/* Clear Min Hop Table */
316 		osm_switch_clear_hops(p_sw);
317 	}
318 
319 	OSM_LOG(p_log, OSM_LOG_VERBOSE,
320 		"Init Min Hop Table of all switches ]\n");
321 
322 	/* Now do the BFS for each port  in the subnet */
323 	OSM_LOG(p_log, OSM_LOG_VERBOSE,
324 		"BFS through all port guids in the subnet [\n");
325 
326 	for (item = cl_qmap_head(&p_dnup->p_osm->subn.sw_guid_tbl);
327 	     item != cl_qmap_end(&p_dnup->p_osm->subn.sw_guid_tbl);
328 	     item = cl_qmap_next(item)) {
329 		p_sw = (osm_switch_t *)item;
330 		dnup_bfs_by_node(p_log, p_subn, p_sw, 0, &max_hops);
331 	}
332 	if(p_subn->opt.connect_roots) {
333 		/*This is probably not necessary, by I am more comfortable
334 		 * clearing any possible side effects from the previous
335 		 * dnup routing pass
336 		 */
337 		for (item = cl_qmap_head(&p_dnup->p_osm->subn.sw_guid_tbl);
338 		     item != cl_qmap_end(&p_dnup->p_osm->subn.sw_guid_tbl);
339 		     item = cl_qmap_next(item)) {
340 			p_sw = (osm_switch_t *)item;
341 			osm_switch_clear_hops(p_sw);
342 			u = (struct dnup_node *) p_sw->priv;
343 			u->visited = 0;
344 		}
345 		for (item = cl_qmap_head(&p_dnup->p_osm->subn.sw_guid_tbl);
346 		     item != cl_qmap_end(&p_dnup->p_osm->subn.sw_guid_tbl);
347 		     item = cl_qmap_next(item)) {
348 			p_sw = (osm_switch_t *)item;
349 			dnup_bfs_by_node(p_log, p_subn, p_sw, max_hops + 1, NULL);
350 		}
351 	}
352 
353 	OSM_LOG(p_log, OSM_LOG_VERBOSE,
354 		"BFS through all port guids in the subnet ]\n");
355 	/* Cleanup */
356 	OSM_LOG_EXIT(p_log);
357 	return 0;
358 }
359 
360 static int dnup_build_lid_matrices(IN dnup_t * p_dnup)
361 {
362 	int status;
363 
364 	OSM_LOG_ENTER(&p_dnup->p_osm->log);
365 
366 	OSM_LOG(&p_dnup->p_osm->log, OSM_LOG_VERBOSE,
367 		"Ranking all port guids in the list\n");
368 	/* Check if it's not a switched subnet */
369 	if (cl_is_qmap_empty(&p_dnup->p_osm->subn.sw_guid_tbl)) {
370 		OSM_LOG(&p_dnup->p_osm->log, OSM_LOG_ERROR, "ERR AEOB: "
371 			"This is not a switched subnet, cannot perform DNUP algorithm\n");
372 		status = -1;
373 		goto _exit;
374 	}
375 
376 	/* Rank the subnet switches */
377 	dnup_subn_rank(p_dnup);
378 
379 	/* After multiple ranking need to set Min Hop Table by DnUp algorithm  */
380 	OSM_LOG(&p_dnup->p_osm->log, OSM_LOG_VERBOSE,
381 		"Setting all switches' Min Hop Table\n");
382 	status = dnup_set_min_hop_table(p_dnup);
383 
384 _exit:
385 	OSM_LOG_EXIT(&p_dnup->p_osm->log);
386 	return status;
387 }
388 
389 static struct dnup_node *create_dnup_node(osm_switch_t * sw)
390 {
391 	struct dnup_node *u;
392 
393 	u = malloc(sizeof(*u));
394 	if (!u)
395 		return NULL;
396 	memset(u, 0, sizeof(*u));
397 	u->sw = sw;
398 	u->rank = 0xffffffff;
399 	return u;
400 }
401 
402 static void delete_dnup_node(struct dnup_node *u)
403 {
404 	u->sw->priv = NULL;
405 	free(u);
406 }
407 
408 /* DNUP callback function */
409 static int dnup_lid_matrices(void *ctx)
410 {
411 	dnup_t *p_dnup = ctx;
412 	cl_map_item_t *item;
413 	osm_switch_t *p_sw;
414 	int ret = 0;
415 	int num_leafs = 0;
416 	uint8_t pn, pn_rem;
417 
418 	OSM_LOG_ENTER(&p_dnup->p_osm->log);
419 
420 	for (item = cl_qmap_head(&p_dnup->p_osm->subn.sw_guid_tbl);
421 	     item != cl_qmap_end(&p_dnup->p_osm->subn.sw_guid_tbl);
422 	     item = cl_qmap_next(item)) {
423 		p_sw = (osm_switch_t *)item;
424 		p_sw->priv = create_dnup_node(p_sw);
425 		if (!p_sw->priv) {
426 			OSM_LOG(&(p_dnup->p_osm->log), OSM_LOG_ERROR, "ERR AE0C: "
427 				"cannot create dnup node\n");
428 			OSM_LOG_EXIT(&p_dnup->p_osm->log);
429 			return -1;
430 		}
431 	}
432 
433 
434 	/* First setup node level nodes */
435 	for (item = cl_qmap_head(&p_dnup->p_osm->subn.sw_guid_tbl);
436 	     item != cl_qmap_end(&p_dnup->p_osm->subn.sw_guid_tbl);
437 	     item = cl_qmap_next(item)) {
438 		p_sw = (osm_switch_t *)item;
439 
440 		for (pn = 0; pn < p_sw->num_ports; pn++) {
441 			osm_node_t *p_remote_node;
442 			p_remote_node = osm_node_get_remote_node(p_sw->p_node, pn, &pn_rem);
443 			if(p_remote_node && !p_remote_node->sw) {
444 				struct dnup_node *u = p_sw->priv;
445 				u->rank = 0;
446 				OSM_LOG(&(p_dnup->p_osm->log),
447 					OSM_LOG_VERBOSE, "(%s) rank 0 leaf switch\n",
448 					p_sw->p_node->print_desc);
449 				num_leafs++;
450 				break;
451 			}
452 		}
453 	}
454 
455 	if(num_leafs == 0) {
456 		OSM_LOG(&(p_dnup->p_osm->log),
457 			OSM_LOG_ERROR, "ERR AE0D: No leaf switches found, DnUp routing failed\n");
458 		OSM_LOG_EXIT(&p_dnup->p_osm->log);
459 		return -1;
460 	}
461 
462 	ret = dnup_build_lid_matrices(p_dnup);
463 
464 	for (item = cl_qmap_head(&p_dnup->p_osm->subn.sw_guid_tbl);
465 	     item != cl_qmap_end(&p_dnup->p_osm->subn.sw_guid_tbl);
466 	     item = cl_qmap_next(item)) {
467 		p_sw = (osm_switch_t *) item;
468 		delete_dnup_node(p_sw->priv);
469 	}
470 
471 	OSM_LOG_EXIT(&p_dnup->p_osm->log);
472 	return ret;
473 }
474 
475 static void dnup_delete(void *context)
476 {
477 	free(context);
478 }
479 
480 int osm_ucast_dnup_setup(struct osm_routing_engine *r, osm_opensm_t *osm)
481 {
482 	dnup_t *dnup;
483 
484 	OSM_LOG_ENTER(&osm->log);
485 
486 	dnup = malloc(sizeof(dnup_t));
487 	if (!dnup)
488 		return -1;
489 	memset(dnup, 0, sizeof(dnup_t));
490 
491 	dnup->p_osm = osm;
492 
493 	r->context = dnup;
494 	r->destroy = dnup_delete;
495 	r->build_lid_matrices = dnup_lid_matrices;
496 
497 	OSM_LOG_EXIT(&osm->log);
498 	return 0;
499 }
500