1 /*
2 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 * Copyright (c) 2009 HNR Consulting. All rights reserved.
6 *
7 * This software is available to you under a choice of one of two
8 * licenses. You may choose to be licensed under the terms of the GNU
9 * General Public License (GPL) Version 2, available from the file
10 * COPYING in the main directory of this source tree, or the
11 * OpenIB.org BSD license below:
12 *
13 * Redistribution and use in source and binary forms, with or
14 * without modification, are permitted provided that the following
15 * conditions are met:
16 *
17 * - Redistributions of source code must retain the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer.
20 *
21 * - Redistributions in binary form must reproduce the above
22 * copyright notice, this list of conditions and the following
23 * disclaimer in the documentation and/or other materials
24 * provided with the distribution.
25 *
26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 * SOFTWARE.
34 *
35 */
36
37 /*
38 * Abstract:
39 * Implementation of osm_ni_rcv_t.
40 * This object represents the NodeInfo Receiver object.
41 * This object is part of the opensm family of objects.
42 */
43
44 #if HAVE_CONFIG_H
45 # include <config.h>
46 #endif /* HAVE_CONFIG_H */
47
48 #include <stdlib.h>
49 #include <string.h>
50 #include <iba/ib_types.h>
51 #include <complib/cl_qmap.h>
52 #include <complib/cl_passivelock.h>
53 #include <complib/cl_debug.h>
54 #include <opensm/osm_file_ids.h>
55 #define FILE_ID OSM_FILE_NODE_INFO_RCV_C
56 #include <opensm/osm_madw.h>
57 #include <opensm/osm_log.h>
58 #include <opensm/osm_node.h>
59 #include <opensm/osm_subnet.h>
60 #include <opensm/osm_router.h>
61 #include <opensm/osm_mad_pool.h>
62 #include <opensm/osm_helper.h>
63 #include <opensm/osm_msgdef.h>
64 #include <opensm/osm_opensm.h>
65 #include <opensm/osm_ucast_mgr.h>
66 #include <opensm/osm_db_pack.h>
67
/*
 * Log a duplicated node GUID condition (ERR 0D01) and dump the two
 * directed-route paths over which the node was reached.
 *
 * sm              - SM object whose log receives the report.
 * p_physp         - port of the node that appears behind two links.
 * p_neighbor_node - node at the far end of the newly seen link.
 * port_num        - port on p_neighbor_node that leads to p_physp.
 */
static void report_duplicated_guid(IN osm_sm_t * sm, osm_physp_t * p_physp,
				   osm_node_t * p_neighbor_node,
				   const uint8_t port_num)
{
	osm_physp_t *p_old, *p_new;
	osm_dr_path_t path;

	p_old = p_physp->p_remote_physp;
	p_new = osm_node_get_physp_ptr(p_neighbor_node, port_num);

	OSM_LOG(sm->p_log, OSM_LOG_SYS | OSM_LOG_ERROR, "ERR 0D01: "
		"Found duplicated node GUID.\n"
		"Node 0x%" PRIx64 " port %u is reachable from remote node "
		"0x%" PRIx64 " port %u and remote node 0x%" PRIx64 " port %u.\n"
		"Paths are:\n",
		cl_ntoh64(p_physp->p_node->node_info.node_guid),
		p_physp->port_num,
		p_old ? cl_ntoh64(p_old->p_node->node_info.node_guid) : 0,
		p_old ? p_old->port_num : 0,
		p_new ? cl_ntoh64(p_new->p_node->node_info.node_guid) : 0,
		p_new ? p_new->port_num : 0);

	/* First path: the one already stored on the port itself. */
	osm_dump_dr_path_v2(sm->p_log, osm_physp_get_dr_path_ptr(p_physp),
			    FILE_ID, OSM_LOG_ERROR);

	/*
	 * The physp lookup can fail (the log arguments above already allow
	 * for that), in which case there is no DR path to copy and extend.
	 */
	if (!p_new) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
			"No physp for port %u of the new remote node; "
			"second path is not available\n", port_num);
		return;
	}

	/* Second path: the neighbor's path extended through port_num. */
	path = *osm_physp_get_dr_path_ptr(p_new);
	if (osm_dr_path_extend(&path, port_num))
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D05: "
			"DR path with hop count %d couldn't be extended\n",
			path.hop_count);
	osm_dump_dr_path_v2(sm->p_log, &path, FILE_ID, OSM_LOG_ERROR);
}
100
/*
 * Re-issue a NodeInfo Get towards p_physp through its remote port, tagging
 * the request with the suspected duplicate (node GUID / port of p_physp)
 * and the retry counter.
 *
 * sm      - SM object used to send the request and to log.
 * p_physp - port whose NodeInfo is queried again; it must have a remote
 *           physp, otherwise ERR 0D0D is logged and nothing is sent.
 * count   - value stored in the request context as dup_count.
 */
static void requery_dup_node_info(IN osm_sm_t * sm, osm_physp_t * p_physp,
				  unsigned count)
{
	osm_madw_context_t context;
	osm_dr_path_t path;
	/* osm_req_get() result; it is compared with IB_SUCCESS below */
	ib_api_status_t status;

	if (!p_physp->p_remote_physp) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0D: "
			"DR path couldn't be extended due to NULL remote physp\n");
		return;
	}

	/* Route: path to the remote port, plus one hop out of that port. */
	path = *osm_physp_get_dr_path_ptr(p_physp->p_remote_physp);
	if (osm_dr_path_extend(&path, p_physp->p_remote_physp->port_num)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D08: "
			"DR path with hop count %d couldn't be extended\n",
			path.hop_count);
		return;
	}

	/*
	 * ni_context.node_guid is later used by ni_rcv_set_links() as the key
	 * for a node-GUID lookup, so it must carry the neighbor's node GUID
	 * (not its port GUID, which may differ).
	 */
	context.ni_context.node_guid =
	    p_physp->p_remote_physp->p_node->node_info.node_guid;
	context.ni_context.port_num = p_physp->p_remote_physp->port_num;
	context.ni_context.dup_node_guid = p_physp->p_node->node_info.node_guid;
	context.ni_context.dup_port_num = p_physp->port_num;
	context.ni_context.dup_count = count;

	status = osm_req_get(sm, &path, IB_MAD_ATTR_NODE_INFO, 0,
			     TRUE, 0, CL_DISP_MSGID_NONE, &context);

	if (status != IB_SUCCESS)
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D02: "
			"Failure initiating NodeInfo request (%s)\n",
			ib_get_err_str(status));
}
137
138 /**********************************************************************
139 The plock must be held before calling this function.
140 **********************************************************************/
/*
 * Create the link between port port_num of p_node and the neighbor port
 * named in p_ni_context (the node/port the NodeInfo query was sent from),
 * and record it in the neighbor database.
 *
 * Along the way this detects an already linked port (possible duplicated
 * GUID or re-cabling) and a port that appears linked to itself.
 *
 * sm           - SM object (subnet, log, ucast manager).
 * p_node       - node that answered the NodeInfo query.
 * port_num     - port of p_node the answer came in on.
 * p_ni_context - context of the request; node_guid == 0 means the query
 *                targeted our own node and there is nothing to link.
 */
static void ni_rcv_set_links(IN osm_sm_t * sm, osm_node_t * p_node,
			     const uint8_t port_num,
			     const osm_ni_context_t * p_ni_context)
{
	osm_node_t *p_neighbor_node;
	osm_physp_t *p_physp, *p_remote_physp;

	OSM_LOG_ENTER(sm->p_log);

	/*
	   A special case exists in which the node we're trying to
	   link is our own node.  In this case, the guid value in
	   the ni_context will be zero.
	 */
	if (p_ni_context->node_guid == 0) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Nothing to link for our own node 0x%" PRIx64 "\n",
			cl_ntoh64(osm_node_get_node_guid(p_node)));
		goto _exit;
	}

	p_neighbor_node = osm_get_node_by_guid(sm->p_subn,
					       p_ni_context->node_guid);
	if (PF(!p_neighbor_node)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D10: "
			"Unexpected removal of neighbor node 0x%" PRIx64 "\n",
			cl_ntoh64(p_ni_context->node_guid));
		goto _exit;
	}

	/* When setting the link, ports on both
	   sides of the link should be initialized */
	CL_ASSERT(osm_node_link_has_valid_ports(p_node, port_num,
						p_neighbor_node,
						p_ni_context->port_num));

	if (osm_node_link_exists(p_node, port_num,
				 p_neighbor_node, p_ni_context->port_num)) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Link already exists\n");
		goto _exit;
	}

	p_physp = osm_node_get_physp_ptr(p_node, port_num);
	if (!p_physp) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0E: "
			"Failed to find physp for port %d of Node GUID 0x%"
			PRIx64 "\n", port_num,
			cl_ntoh64(osm_node_get_node_guid(p_node)));
		goto _exit;
	}

	/*
	 * If the port reports LINK_DOWN although it is already marked as
	 * discovered, do not set the link: clear the discovered mark and
	 * force a heavy sweep instead.
	 */
	if (osm_physp_get_port_state(p_physp) == IB_LINK_DOWN &&
	    p_node->physp_discovered[port_num]) {
		/* Link down on another side. Don't create a link */
		p_node->physp_discovered[port_num] = 0;
		sm->p_subn->force_heavy_sweep = TRUE;
		goto _exit;
	}

	if (osm_node_has_any_link(p_node, port_num) &&
	    sm->p_subn->force_heavy_sweep == FALSE &&
	    (!p_ni_context->dup_count ||
	     (p_ni_context->dup_node_guid == osm_node_get_node_guid(p_node) &&
	      p_ni_context->dup_port_num == port_num))) {
		/*
		   Uh oh...
		   This could be reconnected ports, but also duplicated GUID
		   (2 nodes have the same guid) or a 12x link with lane reversal
		   that is not configured correctly.
		   We will try to recover by querying NodeInfo again.
		   In order to catch even fast port moving to new location(s)
		   and back we will count up to 5.
		   Some crazy reconnections (newly created switch loop right
		   before targeted CA) will not be caught this way. So in worst
		   case - report GUID duplication and request new discovery.
		   When switch node is targeted NodeInfo querying will be done
		   in opposite order, this is much stronger check, unfortunately
		   it is impossible with CAs.
		 */
		p_physp = osm_node_get_physp_ptr(p_node, port_num);
		if (!p_physp) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0F: "
				"Failed to find physp for port %d of Node GUID 0x%"
				PRIx64 "\n", port_num,
				cl_ntoh64(osm_node_get_node_guid(p_node)));
			goto _exit;
		}

		if (p_ni_context->dup_count > 5) {
			report_duplicated_guid(sm, p_physp, p_neighbor_node,
					       p_ni_context->port_num);
			sm->p_subn->force_heavy_sweep = TRUE;
		} else if (p_node->sw)
			requery_dup_node_info(sm, p_physp->p_remote_physp,
					      p_ni_context->dup_count + 1);
		else
			requery_dup_node_info(sm, p_physp,
					      p_ni_context->dup_count + 1);
	}

	/*
	   When there are only two nodes with exact same guids (connected back
	   to back) - the previous check for duplicated guid will not catch
	   them. But the link will be from the port to itself...
	   Enhanced Port 0 is an exception to this
	 */
	if (osm_node_get_node_guid(p_node) == p_ni_context->node_guid &&
	    port_num == p_ni_context->port_num &&
	    port_num != 0 && cl_qmap_count(&sm->p_subn->sw_guid_tbl) == 0) {
		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
			"Duplicate GUID found by link from a port to itself:"
			"node 0x%" PRIx64 ", port number %u\n",
			cl_ntoh64(osm_node_get_node_guid(p_node)), port_num);
		p_physp = osm_node_get_physp_ptr(p_node, port_num);
		if (!p_physp) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1D: "
				"Failed to find physp for port %d of Node GUID 0x%"
				PRIx64 "\n", port_num,
				cl_ntoh64(osm_node_get_node_guid(p_node)));
			goto _exit;
		}

		osm_dump_dr_path_v2(sm->p_log, osm_physp_get_dr_path_ptr(p_physp),
				    FILE_ID, OSM_LOG_VERBOSE);

		/* Configured to treat this as fatal: report and terminate. */
		if (sm->p_subn->opt.exit_on_fatal == TRUE) {
			osm_log_v2(sm->p_log, OSM_LOG_SYS, FILE_ID,
				   "Errors on subnet. Duplicate GUID found "
				   "by link from a port to itself. "
				   "See verbose opensm.log for more details\n");
			exit(1);
		}
	}

	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
		"Creating new link between:\n\t\t\t\tnode 0x%" PRIx64
		", port number %u and\n\t\t\t\tnode 0x%" PRIx64
		", port number %u\n",
		cl_ntoh64(osm_node_get_node_guid(p_node)), port_num,
		cl_ntoh64(p_ni_context->node_guid), p_ni_context->port_num);

	/* Let the unicast cache see the new link before it is created. */
	if (sm->ucast_mgr.cache_valid)
		osm_ucast_cache_check_new_link(&sm->ucast_mgr, p_node, port_num,
					       p_neighbor_node,
					       p_ni_context->port_num);

	p_physp = osm_node_get_physp_ptr(p_node, port_num);
	p_remote_physp = osm_node_get_physp_ptr(p_neighbor_node,
						p_ni_context->port_num);
	if (!p_physp || !p_remote_physp)
		goto _exit;

	osm_node_link(p_node, port_num, p_neighbor_node, p_ni_context->port_num);

	/* Store the link in the neighbor database, once per direction. */
	osm_db_neighbor_set(sm->p_subn->p_neighbor,
			    cl_ntoh64(osm_physp_get_port_guid(p_physp)),
			    port_num,
			    cl_ntoh64(osm_physp_get_port_guid(p_remote_physp)),
			    p_ni_context->port_num);
	osm_db_neighbor_set(sm->p_subn->p_neighbor,
			    cl_ntoh64(osm_physp_get_port_guid(p_remote_physp)),
			    p_ni_context->port_num,
			    cl_ntoh64(osm_physp_get_port_guid(p_physp)),
			    port_num);

_exit:
	OSM_LOG_EXIT(sm->p_log);
}
313
/*
 * Request PortInfo for the port that answered the NodeInfo query carried
 * by madw and, when the fdr10 option is set and the device is reported as
 * supporting it, the Mellanox ExtendedPortInfo as well.
 *
 * sm   - SM object used to send the requests and to log.
 * node - node that owns the responding port.
 * madw - received NodeInfo MAD; supplies the local port number, vendor id
 *        and device id.
 */
static void ni_rcv_get_port_info(IN osm_sm_t * sm, IN osm_node_t * node,
				 IN const osm_madw_t * madw)
{
	osm_madw_context_t context;
	osm_physp_t *physp;
	ib_node_info_t *ni;
	unsigned port;
	ib_api_status_t status;
	int mlnx_epi_supported = 0;

	ni = ib_smp_get_payload_ptr(osm_madw_get_smp_ptr(madw));

	port = ib_node_info_get_local_port_num(ni);

	if (sm->p_subn->opt.fdr10)
		mlnx_epi_supported = is_mlnx_ext_port_info_supported(
						ib_node_info_get_vendor_id(ni),
						ni->device_id);

	physp = osm_node_get_physp_ptr(node, port);
	if (!physp) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1E: "
			"Failed to find physp for port %u of Node GUID 0x%"
			PRIx64 "\n", port,
			cl_ntoh64(osm_node_get_node_guid(node)));
		return;
	}

	context.pi_context.node_guid = osm_node_get_node_guid(node);
	context.pi_context.port_guid = osm_physp_get_port_guid(physp);
	context.pi_context.set_method = FALSE;
	context.pi_context.light_sweep = FALSE;
	context.pi_context.active_transition = FALSE;
	context.pi_context.client_rereg = FALSE;

	status = osm_req_get(sm, osm_physp_get_dr_path_ptr(physp),
			     IB_MAD_ATTR_PORT_INFO, cl_hton32(port),
			     TRUE, 0, CL_DISP_MSGID_NONE, &context);
	if (status != IB_SUCCESS)
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D02: "
			"Failure initiating PortInfo request (%s)\n",
			ib_get_err_str(status));

	/* The same PortInfo context is reused for the extended query. */
	if (mlnx_epi_supported) {
		status = osm_req_get(sm,
				     osm_physp_get_dr_path_ptr(physp),
				     IB_MAD_ATTR_MLNX_EXTENDED_PORT_INFO,
				     cl_hton32(port),
				     TRUE, 0, CL_DISP_MSGID_NONE, &context);
		if (status != IB_SUCCESS)
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0B: "
				"Failure initiating MLNX ExtPortInfo request (%s)\n",
				ib_get_err_str(status));
	}
}
368
369 /**********************************************************************
370 The plock must be held before calling this function.
371 **********************************************************************/
/*
 * Send a NodeDescription Get along the directed route stored on p_physp.
 * The request context carries the GUID of the node owning p_physp.
 * A send failure is logged (ERR 0D03) and otherwise ignored.
 */
void osm_req_get_node_desc(IN osm_sm_t * sm, osm_physp_t * p_physp)
{
	osm_madw_context_t nd_ctx;
	ib_api_status_t rc = IB_SUCCESS;

	OSM_LOG_ENTER(sm->p_log);

	/* Tell the response handler which node the description is for. */
	nd_ctx.nd_context.node_guid =
	    osm_node_get_node_guid(osm_physp_get_node_ptr(p_physp));

	rc = osm_req_get(sm, osm_physp_get_dr_path_ptr(p_physp),
			 IB_MAD_ATTR_NODE_DESC, 0, TRUE, 0,
			 CL_DISP_MSGID_NONE, &nd_ctx);
	if (rc != IB_SUCCESS) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D03: "
			"Failure initiating NodeDescription request (%s)\n",
			ib_get_err_str(rc));
	}

	OSM_LOG_EXIT(sm->p_log);
}
392
393 /**********************************************************************
394 The plock must be held before calling this function.
395 **********************************************************************/
/*
 * Request the NodeDescription of p_node through the port that answered
 * the NodeInfo query carried by p_madw.
 *
 * sm     - SM object used to send the request and to log.
 * p_node - node whose description is requested.
 * p_madw - received NodeInfo MAD; supplies the local port number.
 */
static void ni_rcv_get_node_desc(IN osm_sm_t * sm, IN osm_node_t * p_node,
				 IN const osm_madw_t * p_madw)
{
	ib_node_info_t *p_ni;
	ib_smp_t *p_smp;
	uint8_t port_num;
	osm_physp_t *p_physp = NULL;

	OSM_LOG_ENTER(sm->p_log);

	p_smp = osm_madw_get_smp_ptr(p_madw);
	p_ni = ib_smp_get_payload_ptr(p_smp);
	port_num = ib_node_info_get_local_port_num(p_ni);

	/*
	   Request PortInfo & NodeDescription attributes for the port
	   that responded to the NodeInfo attribute.
	   Because this is a channel adapter or router, we are
	   not allowed to request PortInfo for the other ports.
	   Set the context union properly, so the recipient
	   knows which node & port are relevant.
	 */
	p_physp = osm_node_get_physp_ptr(p_node, port_num);
	if (!p_physp) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1F: "
			"Failed to find physp for port %d of Node GUID 0x%"
			PRIx64 "\n", port_num,
			cl_ntoh64(osm_node_get_node_guid(p_node)));
		/* leave through Exit so the function-exit log stays paired */
		goto Exit;
	}

	osm_req_get_node_desc(sm, p_physp);

Exit:
	OSM_LOG_EXIT(sm->p_log);
}
431
432 /**********************************************************************
433 The plock must be held before calling this function.
434 **********************************************************************/
/*
 * Handle a newly created CA or router node: request PortInfo for the
 * responding port and, if the NodeInfo request context carries a zero
 * node GUID (our own node), remember the node's port GUID as the SM
 * port GUID of the subnet.
 */
static void ni_rcv_process_new_ca_or_router(IN osm_sm_t * sm,
					    IN osm_node_t * p_node,
					    IN const osm_madw_t * p_madw)
{
	const osm_ni_context_t *p_req_ctx;

	OSM_LOG_ENTER(sm->p_log);

	ni_rcv_get_port_info(sm, p_node, p_madw);

	/*
	   A zero node GUID in the request context is the corner case
	   meaning the discovered node is the SM's own node, so its
	   port GUID becomes the subnet's SM port GUID.
	 */
	p_req_ctx = osm_madw_get_ni_context_ptr(p_madw);
	if (p_req_ctx->node_guid == 0) {
		sm->p_subn->sm_port_guid = p_node->node_info.port_guid;
	}

	OSM_LOG_EXIT(sm->p_log);
}
453
454 /**********************************************************************
455 The plock must be held before calling this function.
456 **********************************************************************/
/*
 * Handle a NodeInfo response from an already known CA or router.
 *
 * If the responding port GUID is not yet in the port table, a port object
 * (and its alias GUID entry) is created and inserted; otherwise the stored
 * directed route of the port is refreshed from the received SMP.
 * In both cases PortInfo is then requested for the port, unless an error
 * path jumps to Exit first.
 *
 * sm     - SM object (subnet tables, log).
 * p_node - existing node the response belongs to.
 * p_madw - received NodeInfo MAD.
 */
static void ni_rcv_process_existing_ca_or_router(IN osm_sm_t * sm,
						 IN osm_node_t * p_node,
						 IN const osm_madw_t * p_madw)
{
	ib_node_info_t *p_ni;
	ib_smp_t *p_smp;
	osm_port_t *p_port;
	osm_port_t *p_port_check;
	uint8_t port_num;
	osm_dr_path_t *p_dr_path;
	osm_alias_guid_t *p_alias_guid, *p_alias_guid_check;

	OSM_LOG_ENTER(sm->p_log);

	p_smp = osm_madw_get_smp_ptr(p_madw);
	p_ni = ib_smp_get_payload_ptr(p_smp);
	port_num = ib_node_info_get_local_port_num(p_ni);

	/*
	   Determine if we have encountered this node through a
	   previously undiscovered port.  If so, build the new
	   port object.
	 */
	p_port = osm_get_port_by_guid(sm->p_subn, p_ni->port_guid);
	if (!p_port) {
		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
			"Creating new port object with GUID 0x%" PRIx64 "\n",
			cl_ntoh64(p_ni->port_guid));

		osm_node_init_physp(p_node, port_num, p_madw);

		p_port = osm_port_new(p_ni, p_node);
		if (PF(p_port == NULL)) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D04: "
				"Unable to create new port object\n");
			goto Exit;
		}

		/*
		   Add the new port object to the database.
		 */
		p_port_check =
		    (osm_port_t *) cl_qmap_insert(&sm->p_subn->port_guid_tbl,
						  p_ni->port_guid,
						  &p_port->map_item);
		if (PF(p_port_check != p_port)) {
			/*
			   We should never be here!
			   Somehow, this port GUID already exists in the table.
			 */
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D12: "
				"Port 0x%" PRIx64 " already in the database!\n",
				cl_ntoh64(p_ni->port_guid));

			osm_port_delete(&p_port);
			goto Exit;
		}

		p_alias_guid = osm_alias_guid_new(p_ni->port_guid,
						  p_port);
		if (PF(!p_alias_guid)) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D11: "
				"alias guid memory allocation failed"
				" for port GUID 0x%" PRIx64 "\n",
				cl_ntoh64(p_ni->port_guid));
			goto alias_done;
		}

		/* insert into alias guid table */
		p_alias_guid_check =
		    (osm_alias_guid_t *) cl_qmap_insert(&sm->p_subn->alias_port_guid_tbl,
							p_alias_guid->alias_guid,
							&p_alias_guid->map_item);
		if (p_alias_guid_check != p_alias_guid) {
			/* alias GUID is a duplicate */
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D13: "
				"Duplicate alias port GUID 0x%" PRIx64 "\n",
				cl_ntoh64(p_ni->port_guid));
			osm_alias_guid_delete(&p_alias_guid);
			/*
			   The port was inserted into port_guid_tbl above;
			   unlink it before freeing so the table does not
			   keep a pointer to released memory.
			 */
			cl_qmap_remove_item(&sm->p_subn->port_guid_tbl,
					    &p_port->map_item);
			osm_port_delete(&p_port);
			goto Exit;
		}

alias_done:
		/* If we are a master, then this means the port is new on the subnet.
		   Mark it as new - need to send trap 64 for these ports.
		   The condition that we are master is true, since if we are in discovering
		   state (meaning we woke up from standby or we are just initializing),
		   then these ports may be new to us, but are not new on the subnet.
		   If we are master, then the subnet as we know it is the updated one,
		   and any new ports we encounter should cause trap 64. C14-72.1.1 */
		if (sm->p_subn->sm_state == IB_SMINFO_STATE_MASTER)
			p_port->is_new = 1;

	} else {
		osm_physp_t *p_physp = osm_node_get_physp_ptr(p_node, port_num);

		if (PF(p_physp == NULL)) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1C: "
				"No physical port found for node GUID 0x%"
				PRIx64 " port %u. Might be duplicate port GUID\n",
				cl_ntoh64(p_node->node_info.node_guid),
				port_num);
			goto Exit;
		}

		/*
		   Update the DR Path to the port,
		   in case the old one is no longer available.
		 */
		p_dr_path = osm_physp_get_dr_path_ptr(p_physp);

		osm_dr_path_init(p_dr_path, p_smp->hop_count,
				 p_smp->initial_path);
	}

	ni_rcv_get_port_info(sm, p_node, p_madw);

Exit:
	OSM_LOG_EXIT(sm->p_log);
}
578
/*
 * Common switch handling: refresh the directed route stored on switch
 * port 0 from the received SMP, then request SwitchInfo over that route.
 * A failure to send the request is logged (ERR 0D06) and otherwise ignored.
 */
static void ni_rcv_process_switch(IN osm_sm_t * sm, IN osm_node_t * p_node,
				  IN const osm_madw_t * p_madw)
{
	osm_madw_context_t si_ctx;
	osm_dr_path_t *p_route;
	osm_physp_t *p_port0;
	ib_smp_t *p_rcvd_smp;
	ib_api_status_t rc = IB_SUCCESS;

	OSM_LOG_ENTER(sm->p_log);

	p_rcvd_smp = osm_madw_get_smp_ptr(p_madw);

	/* port 0 of the switch is expected to be initialized already */
	p_port0 = osm_node_get_physp_ptr(p_node, 0);
	p_route = osm_physp_get_dr_path_ptr(p_port0);
	osm_dr_path_init(p_route, p_rcvd_smp->hop_count,
			 p_rcvd_smp->initial_path);

	si_ctx.si_context.node_guid = osm_node_get_node_guid(p_node);
	si_ctx.si_context.set_method = FALSE;
	si_ctx.si_context.light_sweep = FALSE;
	si_ctx.si_context.lft_top_change = FALSE;

	/* Ask the switch for its SwitchInfo; carry on even if that fails */
	rc = osm_req_get(sm, p_route, IB_MAD_ATTR_SWITCH_INFO, 0, TRUE, 0,
			 CL_DISP_MSGID_NONE, &si_ctx);
	if (rc != IB_SUCCESS) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D06: "
			"Failure initiating SwitchInfo request (%s)\n",
			ib_get_err_str(rc));
	}

	OSM_LOG_EXIT(sm->p_log);
}
613
614 /**********************************************************************
615 The plock must be held before calling this function.
616 **********************************************************************/
/*
 * Handle a NodeInfo response from an already known switch.
 * The switch is probed only when its discovery_count equals 1; for any
 * other count nothing is sent.
 */
static void ni_rcv_process_existing_switch(IN osm_sm_t * sm,
					   IN osm_node_t * p_node,
					   IN const osm_madw_t * p_madw)
{
	OSM_LOG_ENTER(sm->p_log);

	/*
	   A switch that was already probed during this sweep is not
	   probed again.
	 */
	if (p_node->discovery_count != 1)
		goto Exit;

	ni_rcv_process_switch(sm, p_node, p_madw);

Exit:
	OSM_LOG_EXIT(sm->p_log);
}
632
633 /**********************************************************************
634 The plock must be held before calling this function.
635 **********************************************************************/
/*
 * Handle a newly created switch node: run the common switch processing
 * and, if the NodeInfo request context carries a zero node GUID (our own
 * node), remember the node's port GUID as the SM port GUID of the subnet.
 */
static void ni_rcv_process_new_switch(IN osm_sm_t * sm, IN osm_node_t * p_node,
				      IN const osm_madw_t * p_madw)
{
	const osm_ni_context_t *p_req_ctx;

	OSM_LOG_ENTER(sm->p_log);

	ni_rcv_process_switch(sm, p_node, p_madw);

	/*
	   A zero node GUID in the request context is the corner case
	   meaning the discovered node is the SM's own node, so its
	   port GUID becomes the subnet's SM port GUID.
	 */
	p_req_ctx = osm_madw_get_ni_context_ptr(p_madw);
	if (p_req_ctx->node_guid == 0) {
		sm->p_subn->sm_port_guid = p_node->node_info.port_guid;
	}

	OSM_LOG_EXIT(sm->p_log);
}
653
654 /**********************************************************************
 The plock must be held before calling this function.
656 **********************************************************************/
/*
 * Handle a NodeInfo response from a node GUID not yet in the node table.
 *
 * Creates the node object and the port object for the responding port,
 * inserts the port, its alias GUID, the router object (router nodes only)
 * and finally the node into the subnet tables, sets the link to the
 * neighbor the query came through, and issues the follow-up requests
 * (NodeDescription, then PortInfo or SwitchInfo depending on node type).
 *
 * sm     - SM object (subnet tables, log).
 * p_madw - received NodeInfo MAD with its request context.
 */
static void ni_rcv_process_new(IN osm_sm_t * sm, IN const osm_madw_t * p_madw)
{
	osm_node_t *p_node;
	osm_node_t *p_node_check;
	osm_port_t *p_port;
	osm_port_t *p_port_check;
	osm_router_t *p_rtr = NULL;
	osm_router_t *p_rtr_check;
	cl_qmap_t *p_rtr_guid_tbl;
	ib_node_info_t *p_ni;
	ib_smp_t *p_smp;
	osm_ni_context_t *p_ni_context;
	osm_alias_guid_t *p_alias_guid, *p_alias_guid_check;
	uint8_t port_num;

	OSM_LOG_ENTER(sm->p_log);

	p_smp = osm_madw_get_smp_ptr(p_madw);
	p_ni = ib_smp_get_payload_ptr(p_smp);
	p_ni_context = osm_madw_get_ni_context_ptr(p_madw);
	port_num = ib_node_info_get_local_port_num(p_ni);

	osm_dump_smp_dr_path_v2(sm->p_log, p_smp, FILE_ID, OSM_LOG_VERBOSE);

	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
		"Discovered new %s node,"
		"\n\t\t\t\tGUID 0x%" PRIx64 ", TID 0x%" PRIx64 "\n",
		ib_get_node_type_str(p_ni->node_type),
		cl_ntoh64(p_ni->node_guid), cl_ntoh64(p_smp->trans_id));

	/* Reject a response whose local port number exceeds the port count. */
	if (PF(port_num > p_ni->num_ports)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0A: "
			"New %s node GUID 0x%" PRIx64 " is non-compliant and "
			"is being ignored since the "
			"local port num %u > num ports %u\n",
			ib_get_node_type_str(p_ni->node_type),
			cl_ntoh64(p_ni->node_guid), port_num,
			p_ni->num_ports);
		goto Exit;
	}

	p_node = osm_node_new(p_madw);
	if (PF(p_node == NULL)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D07: "
			"Unable to create new node object\n");
		goto Exit;
	}

	/*
	   Create a new port object to represent this node's physical
	   ports in the port table.
	 */
	p_port = osm_port_new(p_ni, p_node);
	if (PF(p_port == NULL)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D14: "
			"Unable to create new port object\n");
		osm_node_delete(&p_node);
		goto Exit;
	}

	/*
	   Add the new port object to the database.
	 */
	p_port_check =
	    (osm_port_t *) cl_qmap_insert(&sm->p_subn->port_guid_tbl,
					  p_ni->port_guid, &p_port->map_item);
	if (PF(p_port_check != p_port)) {
		/*
		   We should never be here!
		   Somehow, this port GUID already exists in the table.
		 */
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D15: "
			"Duplicate Port GUID 0x%" PRIx64
			"! Found by the two directed routes:\n",
			cl_ntoh64(p_ni->port_guid));
		osm_dump_dr_path_v2(sm->p_log,
				    osm_physp_get_dr_path_ptr(p_port->p_physp),
				    FILE_ID, OSM_LOG_ERROR);
		osm_dump_dr_path_v2(sm->p_log,
				    osm_physp_get_dr_path_ptr(p_port_check->
							      p_physp),
				    FILE_ID, OSM_LOG_ERROR);
		osm_port_delete(&p_port);
		osm_node_delete(&p_node);
		goto Exit;
	}

	p_alias_guid = osm_alias_guid_new(p_ni->port_guid,
					  p_port);
	if (PF(!p_alias_guid)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D18: "
			"alias guid memory allocation failed"
			" for port GUID 0x%" PRIx64 "\n",
			cl_ntoh64(p_ni->port_guid));
		goto alias_done2;
	}

	/* insert into alias guid table */
	p_alias_guid_check =
	    (osm_alias_guid_t *) cl_qmap_insert(&sm->p_subn->alias_port_guid_tbl,
						p_alias_guid->alias_guid,
						&p_alias_guid->map_item);
	if (p_alias_guid_check != p_alias_guid) {
		/* alias GUID is a duplicate */
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D19: "
			"Duplicate alias port GUID 0x%" PRIx64 "\n",
			cl_ntoh64(p_ni->port_guid));
		osm_alias_guid_delete(&p_alias_guid);
	}

alias_done2:
	/* If we are a master, then this means the port is new on the subnet.
	   Mark it as new - need to send trap 64 on these ports.
	   The condition that we are master is true, since if we are in discovering
	   state (meaning we woke up from standby or we are just initializing),
	   then these ports may be new to us, but are not new on the subnet.
	   If we are master, then the subnet as we know it is the updated one,
	   and any new ports we encounter should cause trap 64. C14-72.1.1 */
	if (sm->p_subn->sm_state == IB_SMINFO_STATE_MASTER)
		p_port->is_new = 1;

	/* If there were RouterInfo or other router attribute,
	   this would be elsewhere */
	if (p_ni->node_type == IB_NODE_TYPE_ROUTER) {
		if (PF((p_rtr = osm_router_new(p_port)) == NULL))
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1A: "
				"Unable to create new router object\n");
		else {
			p_rtr_guid_tbl = &sm->p_subn->rtr_guid_tbl;
			p_rtr_check =
			    (osm_router_t *) cl_qmap_insert(p_rtr_guid_tbl,
							    p_ni->port_guid,
							    &p_rtr->map_item);
			if (PF(p_rtr_check != p_rtr)) {
				OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1B: "
					"Unable to add port GUID:0x%016" PRIx64
					" to router table\n",
					cl_ntoh64(p_ni->port_guid));
				/* the table did not take the object:
				   release it instead of leaking it */
				osm_router_delete(&p_rtr);
			}
		}
	}

	p_node_check =
	    (osm_node_t *) cl_qmap_insert(&sm->p_subn->node_guid_tbl,
					  p_ni->node_guid, &p_node->map_item);
	if (PF(p_node_check != p_node)) {
		/*
		   This node must have been inserted by another thread.
		   This is unexpected, but is not an error.
		   We can simply clean-up, since the other thread will
		   see this processing through to completion.
		 */
		/* NOTE(review): p_port was created from p_node and is already
		   in port_guid_tbl at this point - confirm it cannot be left
		   referring to the node deleted below. */
		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
			"Discovery race detected at node 0x%" PRIx64 "\n",
			cl_ntoh64(p_ni->node_guid));
		osm_node_delete(&p_node);
		p_node = p_node_check;
		ni_rcv_set_links(sm, p_node, port_num, p_ni_context);
		goto Exit;
	} else
		ni_rcv_set_links(sm, p_node, port_num, p_ni_context);

	p_node->discovery_count++;
	ni_rcv_get_node_desc(sm, p_node, p_madw);

	switch (p_ni->node_type) {
	case IB_NODE_TYPE_CA:
	case IB_NODE_TYPE_ROUTER:
		ni_rcv_process_new_ca_or_router(sm, p_node, p_madw);
		break;
	case IB_NODE_TYPE_SWITCH:
		ni_rcv_process_new_switch(sm, p_node, p_madw);
		break;
	default:
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D16: "
			"Unknown node type %u with GUID 0x%" PRIx64 "\n",
			p_ni->node_type, cl_ntoh64(p_ni->node_guid));
		break;
	}

Exit:
	OSM_LOG_EXIT(sm->p_log);
}
839
840 /**********************************************************************
841 The plock must be held before calling this function.
842 **********************************************************************/
/*
 * Handle a NodeInfo response from a node that is already in the node
 * table: bump its discovery count, run the node-type specific handling,
 * set the link to the neighbor the query came through, and finally
 * overwrite the stored NodeInfo with the received one.
 *
 * sm     - SM object (subnet, log).
 * p_node - existing node object matching the response's node GUID.
 * p_madw - received NodeInfo MAD with its request context.
 */
static void ni_rcv_process_existing(IN osm_sm_t * sm, IN osm_node_t * p_node,
				    IN const osm_madw_t * p_madw)
{
	ib_node_info_t *p_ni;
	ib_smp_t *p_smp;
	osm_ni_context_t *p_ni_context;
	uint8_t port_num;

	OSM_LOG_ENTER(sm->p_log);

	p_smp = osm_madw_get_smp_ptr(p_madw);
	p_ni = ib_smp_get_payload_ptr(p_smp);
	p_ni_context = osm_madw_get_ni_context_ptr(p_madw);
	port_num = ib_node_info_get_local_port_num(p_ni);

	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
		"Rediscovered %s node 0x%" PRIx64 " TID 0x%" PRIx64
		", discovered %u times already\n",
		ib_get_node_type_str(p_ni->node_type),
		cl_ntoh64(p_ni->node_guid),
		cl_ntoh64(p_smp->trans_id), p_node->discovery_count);

	/* Reject a response whose local port number exceeds the port count. */
	if (PF(port_num > p_ni->num_ports)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0C: "
			"Existing %s node GUID 0x%" PRIx64 " is non-compliant "
			"and is being ignored since the "
			"local port num %u > num ports %u\n",
			ib_get_node_type_str(p_ni->node_type),
			cl_ntoh64(p_ni->node_guid), port_num,
			p_ni->num_ports);
		goto Exit;
	}

	/*
	   If we haven't already encountered this existing node
	   on this particular sweep, then process further.
	 */
	p_node->discovery_count++;

	switch (p_ni->node_type) {
	case IB_NODE_TYPE_CA:
	case IB_NODE_TYPE_ROUTER:
		ni_rcv_process_existing_ca_or_router(sm, p_node, p_madw);
		break;

	case IB_NODE_TYPE_SWITCH:
		ni_rcv_process_existing_switch(sm, p_node, p_madw);
		break;

	default:
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D09: "
			"Unknown node type %u with GUID 0x%" PRIx64 "\n",
			p_ni->node_type, cl_ntoh64(p_ni->node_guid));
		break;
	}

	/* Only logged here; the stored value changes with the copy below. */
	if ( p_ni->sys_guid != p_node->node_info.sys_guid) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Updated SysImageGUID: 0x%"
			PRIx64 " for node 0x%" PRIx64 "\n",
			cl_ntoh64(p_ni->sys_guid),
			cl_ntoh64(p_ni->node_guid));
	}
	ni_rcv_set_links(sm, p_node, port_num, p_ni_context);
	p_node->node_info = *p_ni;

Exit:
	OSM_LOG_EXIT(sm->p_log);
}
911
/*
 * Entry point for a received NodeInfo response.
 *
 * context - the SM object (osm_sm_t *).
 * data    - the received MAD wrapper (osm_madw_t *).
 *
 * Responses with a zero node GUID, a zero port GUID or a non-zero MAD
 * status are dropped after logging.  Otherwise the node is looked up by
 * GUID under the exclusive plock and handed to the "existing" or "new"
 * node handler.
 */
void osm_ni_rcv_process(IN void *context, IN void *data)
{
	osm_sm_t *sm = context;
	osm_madw_t *p_madw = data;
	ib_smp_t *p_smp;
	ib_node_info_t *p_info;
	osm_node_t *p_known;

	CL_ASSERT(sm);

	OSM_LOG_ENTER(sm->p_log);

	CL_ASSERT(p_madw);

	p_smp = osm_madw_get_smp_ptr(p_madw);
	p_info = ib_smp_get_payload_ptr(p_smp);

	CL_ASSERT(p_smp->attr_id == IB_MAD_ATTR_NODE_INFO);

	if (PF(p_info->node_guid == 0)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D16: "
			"Got Zero Node GUID! Found on the directed route:\n");
		osm_dump_smp_dr_path_v2(sm->p_log, p_smp, FILE_ID, OSM_LOG_ERROR);
		goto Exit;
	}

	if (PF(p_info->port_guid == 0)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D17: "
			"Got Zero Port GUID! Found on the directed route:\n");
		osm_dump_smp_dr_path_v2(sm->p_log, p_smp, FILE_ID, OSM_LOG_ERROR);
		goto Exit;
	}

	if (ib_smp_get_status(p_smp)) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"MAD status 0x%x received\n",
			cl_ntoh16(ib_smp_get_status(p_smp)));
		goto Exit;
	}

	/*
	   Find out whether this node is already known and dispatch
	   accordingly.  The exclusive lock is held for the whole
	   time the node is being processed.
	 */

	CL_PLOCK_EXCL_ACQUIRE(sm->p_lock);
	p_known = osm_get_node_by_guid(sm->p_subn, p_info->node_guid);

	osm_dump_node_info_v2(sm->p_log, p_info, FILE_ID, OSM_LOG_DEBUG);

	if (p_known)
		ni_rcv_process_existing(sm, p_known, p_madw);
	else
		ni_rcv_process_new(sm, p_madw);

	CL_PLOCK_RELEASE(sm->p_lock);

Exit:
	OSM_LOG_EXIT(sm->p_log);
}
973