1 /* 2 * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. 3 * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. 4 * Copyright (c) 2008 Lawrence Livermore National Lab. All rights reserved. 5 * Copyright (c) 2009 HNR Consulting. All rights reserved. 6 * Copyright (c) 2010,2011 Mellanox Technologies LTD. All rights reserved. 7 * 8 * This software is available to you under a choice of one of two 9 * licenses. You may choose to be licensed under the terms of the GNU 10 * General Public License (GPL) Version 2, available from the file 11 * COPYING in the main directory of this source tree, or the 12 * OpenIB.org BSD license below: 13 * 14 * Redistribution and use in source and binary forms, with or 15 * without modification, are permitted provided that the following 16 * conditions are met: 17 * 18 * - Redistributions of source code must retain the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer. 21 * 22 * - Redistributions in binary form must reproduce the above 23 * copyright notice, this list of conditions and the following 24 * disclaimer in the documentation and/or other materials 25 * provided with the distribution. 26 * 27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 28 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 29 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 30 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 31 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 32 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 33 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 34 * SOFTWARE. 35 * 36 */ 37 38 #if HAVE_CONFIG_H 39 # include <config.h> 40 #endif /* HAVE_CONFIG_H */ 41 42 #define _GNU_SOURCE 43 #include <stdio.h> 44 #include <stdlib.h> 45 #include <unistd.h> 46 #include <stdarg.h> 47 #include <time.h> 48 #include <string.h> 49 #include <getopt.h> 50 #include <errno.h> 51 #include <inttypes.h> 52 53 #include <complib/cl_nodenamemap.h> 54 #include <infiniband/ibnetdisc.h> 55 #include <infiniband/mad.h> 56 57 #include "ibdiag_common.h" 58 #include "ibdiag_sa.h" 59 60 struct ibmad_port *ibmad_port; 61 static char *node_name_map_file = NULL; 62 static nn_map_t *node_name_map = NULL; 63 static char *load_cache_file = NULL; 64 static uint16_t lid2sl_table[sizeof(uint8_t) * 1024 * 48] = { 0 }; 65 static int obtain_sl = 1; 66 67 int data_counters = 0; 68 int data_counters_only = 0; 69 int port_config = 0; 70 uint64_t port_guid = 0; 71 char *port_guid_str = NULL; 72 #define SUP_MAX 64 73 int sup_total = 0; 74 enum MAD_FIELDS suppressed_fields[SUP_MAX]; 75 char *dr_path = NULL; 76 uint8_t node_type_to_print = 0; 77 unsigned clear_errors = 0, clear_counts = 0, details = 0; 78 79 #define PRINT_SWITCH 0x1 80 #define PRINT_CA 0x2 81 #define PRINT_ROUTER 0x4 82 #define PRINT_ALL 0xFF /* all nodes default flag */ 83 84 #define DEFAULT_HALF_WORLD_PR_TIMEOUT (3000) 85 86 struct { 87 int nodes_checked; 88 int bad_nodes; 89 int ports_checked; 90 int bad_ports; 91 int pma_query_failures; 92 } summary = { 0 }; 93 94 #define DEF_THRES_FILE IBDIAG_CONFIG_PATH"/error_thresholds" 95 static char *threshold_file = DEF_THRES_FILE; 96 97 /* define a "packet" with threshold values in it */ 98 uint8_t thresholds[1204] = { 0 }; 99 char * threshold_str = ""; 100 101 static unsigned valid_gid(ib_gid_t * gid) 102 { 103 ib_gid_t zero_gid; 104 memset(&zero_gid, 0, sizeof zero_gid); 105 return memcmp(&zero_gid, gid, sizeof(*gid)); 106 } 107 108 static void set_thres(char *name, uint32_t val) 109 { 110 int f; 111 int n; 112 char tmp[256]; 113 for (f = IB_PC_FIRST_F; f <= IB_PC_LAST_F; f++) { 114 if (strcmp(name, mad_field_name(f)) == 0) { 115 mad_encode_field(thresholds, f, &val); 116 snprintf(tmp, 255, "[%s = %u]", name, val); 117 threshold_str = realloc(threshold_str, 118 strlen(threshold_str)+strlen(tmp)+1); 119 if (!threshold_str) { 120 fprintf(stderr, "Failed to allocate memory: " 121 "%s\n", strerror(errno)); 122 exit(1); 123 } 124 n = strlen(threshold_str); 125 strcpy(threshold_str+n, tmp); 126 } 127 } 128 } 129 130 static void set_thresholds(char *threshold_file) 131 { 132 char buf[1024]; 133 char orig_buf[1024]; 134 int val = 0; 135 FILE *thresf = fopen(threshold_file, "r"); 136 char *p_prefix, *p_last; 137 char *name; 138 char *val_str; 139 char str[64]; 140 141 if (!thresf) 142 return; 143 144 snprintf(str, 63, "Thresholds: "); 145 threshold_str = malloc(strlen(str)+1); 146 if (!threshold_str) { 147 fprintf(stderr, "Failed to allocate memory: %s\n", 148 strerror(errno)); 149 exit(1); 150 } 151 strcpy(threshold_str, str); 152 while (fgets(buf, sizeof buf, thresf) != NULL) { 153 p_prefix = strtok_r(buf, "\n", &p_last); 154 if (!p_prefix) 155 continue; /* ignore blank lines */ 156 157 if (*p_prefix == '#') 158 continue; /* ignore comment lines */ 159 160 strlcpy(orig_buf, buf, sizeof(orig_buf)); 161 name = strtok_r(p_prefix, "=", &p_last); 162 val_str = strtok_r(NULL, "\n", &p_last); 163 if (!name || !val_str) { 164 fprintf(stderr, "malformed line in \"%s\":\n%s\n", 165 threshold_file, orig_buf); 166 continue; 167 } 168 169 val = strtoul(val_str, NULL, 0); 170 set_thres(name, val); 171 } 172 173 fclose(thresf); 174 } 175 176 static int exceeds_threshold(int field, unsigned val) 177 { 178 uint32_t thres = 0; 179 mad_decode_field(thresholds, field, &thres); 180 return (val > thres); 181 } 182 183 static void print_port_config(ibnd_node_t * node, int portnum) 184 { 185 char width[64], speed[64], state[64], physstate[64]; 186 char remote_str[256]; 187 char link_str[256]; 188 char width_msg[256]; 189 char speed_msg[256]; 190 char ext_port_str[256]; 191 int iwidth, ispeed, fdr10, espeed, istate, iphystate, cap_mask; 192 uint8_t *info; 193 194 ibnd_port_t *port = node->ports[portnum]; 195 196 if (!port) 197 return; 198 199 iwidth = mad_get_field(port->info, 0, IB_PORT_LINK_WIDTH_ACTIVE_F); 200 ispeed = mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_ACTIVE_F); 201 fdr10 = mad_get_field(port->ext_info, 0, 202 IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F) & FDR10; 203 204 if (port->node->type == IB_NODE_SWITCH) 205 info = (uint8_t *)&port->node->ports[0]->info; 206 else 207 info = (uint8_t *)&port->info; 208 cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F); 209 if (cap_mask & CL_NTOH32(IB_PORT_CAP_HAS_EXT_SPEEDS)) 210 espeed = mad_get_field(port->info, 0, 211 IB_PORT_LINK_SPEED_EXT_ACTIVE_F); 212 else 213 espeed = 0; 214 istate = mad_get_field(port->info, 0, IB_PORT_STATE_F); 215 iphystate = mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F); 216 217 remote_str[0] = '\0'; 218 link_str[0] = '\0'; 219 width_msg[0] = '\0'; 220 speed_msg[0] = '\0'; 221 222 /* C14-24.2.1 states that a down port allows for invalid data to be 223 * returned for all PortInfo components except PortState and 224 * PortPhysicalState */ 225 if (istate != IB_LINK_DOWN) { 226 if (!espeed) { 227 if (fdr10) 228 sprintf(speed, "10.0 Gbps (FDR10)"); 229 else 230 mad_dump_val(IB_PORT_LINK_SPEED_ACTIVE_F, speed, 231 64, &ispeed); 232 } else 233 mad_dump_val(IB_PORT_LINK_SPEED_EXT_ACTIVE_F, speed, 234 64, &espeed); 235 236 snprintf(link_str, 256, "(%3s %18s %6s/%8s)", 237 mad_dump_val(IB_PORT_LINK_WIDTH_ACTIVE_F, width, 64, &iwidth), 238 speed, 239 mad_dump_val(IB_PORT_STATE_F, state, 64, &istate), 240 mad_dump_val(IB_PORT_PHYS_STATE_F, physstate, 64, &iphystate)); 241 } else { 242 snprintf(link_str, 256, "( %6s/%8s)", 243 mad_dump_val(IB_PORT_STATE_F, state, 64, &istate), 244 mad_dump_val(IB_PORT_PHYS_STATE_F, physstate, 64, &iphystate)); 245 } 246 247 if (port->remoteport) { 248 char *rem_node_name = NULL; 249 250 if (port->remoteport->ext_portnum) 251 snprintf(ext_port_str, 256, "%d", 252 port->remoteport->ext_portnum); 253 else 254 ext_port_str[0] = '\0'; 255 256 get_max_msg(width_msg, speed_msg, 256, port); 257 258 rem_node_name = remap_node_name(node_name_map, 259 port->remoteport->node->guid, 260 port->remoteport->node-> 261 nodedesc); 262 263 snprintf(remote_str, 256, 264 "0x%016" PRIx64 " %6d %4d[%2s] \"%s\" (%s %s)\n", 265 port->remoteport->guid, 266 port->remoteport->base_lid ? port->remoteport-> 267 base_lid : port->remoteport->node->smalid, 268 port->remoteport->portnum, ext_port_str, rem_node_name, 269 width_msg, speed_msg); 270 271 free(rem_node_name); 272 } else 273 snprintf(remote_str, 256, " [ ] \"\" ( )\n"); 274 275 if (port->ext_portnum) 276 snprintf(ext_port_str, 256, "%d", port->ext_portnum); 277 else 278 ext_port_str[0] = '\0'; 279 280 if (node->type == IB_NODE_SWITCH) 281 printf(" Link info: %6d", node->smalid); 282 else 283 printf(" Link info: %6d", port->base_lid); 284 285 printf("%4d[%2s] ==%s==> %s", 286 port->portnum, ext_port_str, link_str, remote_str); 287 } 288 289 static int suppress(enum MAD_FIELDS field) 290 { 291 int i = 0; 292 for (i = 0; i < sup_total; i++) 293 if (field == suppressed_fields[i]) 294 return 1; 295 return 0; 296 } 297 298 static void report_suppressed(void) 299 { 300 int i = 0; 301 printf("## Suppressed:"); 302 for (i = 0; i < sup_total; i++) 303 printf(" %s", mad_field_name(suppressed_fields[i])); 304 printf("\n"); 305 } 306 307 static int print_summary(void) 308 { 309 printf("\n## Summary: %d nodes checked, %d bad nodes found\n", 310 summary.nodes_checked, summary.bad_nodes); 311 printf("## %d ports checked, %d ports have errors beyond threshold\n", 312 summary.ports_checked, summary.bad_ports); 313 printf("## %s\n", threshold_str); 314 if (summary.pma_query_failures) 315 printf("## %d PMA query failures\n", summary.pma_query_failures); 316 report_suppressed(); 317 return (summary.bad_ports); 318 } 319 320 static void insert_lid2sl_table(struct sa_query_result *r) 321 { 322 unsigned int i; 323 for (i = 0; i < r->result_cnt; i++) { 324 ib_path_rec_t *p_pr = (ib_path_rec_t *)sa_get_query_rec(r->p_result_madw, i); 325 lid2sl_table[cl_ntoh16(p_pr->dlid)] = ib_path_rec_sl(p_pr); 326 } 327 } 328 329 static int path_record_query(ib_gid_t sgid,uint64_t dguid) 330 { 331 ib_path_rec_t pr; 332 ib_net64_t comp_mask = 0; 333 uint8_t reversible = 0; 334 struct sa_handle * h; 335 336 if (!(h = sa_get_handle())) 337 return -1; 338 339 ibd_timeout = DEFAULT_HALF_WORLD_PR_TIMEOUT; 340 memset(&pr, 0, sizeof(pr)); 341 342 CHECK_AND_SET_GID(sgid, pr.sgid, PR, SGID); 343 if(dguid) { 344 mad_encode_field(sgid.raw, IB_GID_GUID_F, &dguid); 345 CHECK_AND_SET_GID(sgid, pr.dgid, PR, DGID); 346 } 347 348 CHECK_AND_SET_VAL(1, 8, -1, pr.num_path, PR, NUMBPATH);/*to get only one PathRecord for each source and destination pair*/ 349 CHECK_AND_SET_VAL(1, 8, -1, reversible, PR, REVERSIBLE);/*for a reversible path*/ 350 pr.num_path |= reversible << 7; 351 struct sa_query_result result; 352 int ret = sa_query(h, IB_MAD_METHOD_GET_TABLE, 353 (uint16_t)IB_SA_ATTR_PATHRECORD,0,cl_ntoh64(comp_mask),ibd_sakey, 354 &pr, sizeof(pr), &result); 355 if (ret) { 356 sa_free_handle(h); 357 fprintf(stderr, "Query SA failed: %s; sa call path_query failed\n", strerror(ret)); 358 return ret; 359 } 360 if (result.status != IB_SA_MAD_STATUS_SUCCESS) { 361 sa_report_err(result.status); 362 ret = EIO; 363 goto Exit; 364 } 365 366 insert_lid2sl_table(&result); 367 Exit: 368 sa_free_handle(h); 369 sa_free_result_mad(&result); 370 return ret; 371 } 372 373 static int query_and_dump(char *buf, size_t size, ib_portid_t * portid, 374 char *node_name, int portnum, 375 const char *attr_name, uint16_t attr_id, 376 int start_field, int end_field) 377 { 378 uint8_t pc[1024]; 379 uint32_t val = 0; 380 int i, n; 381 382 memset(pc, 0, sizeof(pc)); 383 384 if (!pma_query_via(pc, portid, portnum, ibd_timeout, attr_id, 385 ibmad_port)) { 386 IBWARN("%s query failed on %s, %s port %d", attr_name, 387 node_name, portid2str(portid), portnum); 388 summary.pma_query_failures++; 389 return 0; 390 } 391 392 for (n = 0, i = start_field; i < end_field; i++) { 393 mad_decode_field(pc, i, (void *)&val); 394 if (val) 395 n += snprintf(buf + n, size - n, " [%s == %u]", 396 mad_field_name(i), val); 397 } 398 399 return n; 400 } 401 402 403 static int print_results(ib_portid_t * portid, char *node_name, 404 ibnd_node_t * node, uint8_t * pc, int portnum, 405 int *header_printed, uint8_t *pce, uint16_t cap_mask) 406 { 407 char buf[1024]; 408 char *str = buf; 409 uint32_t val = 0; 410 int i, n; 411 412 for (n = 0, i = IB_PC_ERR_SYM_F; i <= IB_PC_VL15_DROPPED_F; i++) { 413 if (suppress(i)) 414 continue; 415 416 /* this is not a counter, skip it */ 417 if (i == IB_PC_COUNTER_SELECT2_F) 418 continue; 419 420 mad_decode_field(pc, i, (void *)&val); 421 if (exceeds_threshold(i, val)) { 422 n += snprintf(str + n, 1024 - n, " [%s == %u]", 423 mad_field_name(i), val); 424 425 /* If there are PortXmitDiscards, get details (if supported) */ 426 if (i == IB_PC_XMT_DISCARDS_F && details) { 427 n += query_and_dump(str + n, sizeof(buf) - n, portid, 428 node_name, portnum, 429 "PortXmitDiscardDetails", 430 IB_GSI_PORT_XMIT_DISCARD_DETAILS, 431 IB_PC_RCV_LOCAL_PHY_ERR_F, 432 IB_PC_RCV_ERR_LAST_F); 433 /* If there are PortRcvErrors, get details (if supported) */ 434 } else if (i == IB_PC_ERR_RCV_F && details) { 435 n += query_and_dump(str + n, sizeof(buf) - n, portid, 436 node_name, portnum, 437 "PortRcvErrorDetails", 438 IB_GSI_PORT_RCV_ERROR_DETAILS, 439 IB_PC_XMT_INACT_DISC_F, 440 IB_PC_XMT_DISC_LAST_F); 441 } 442 } 443 } 444 445 if (!suppress(IB_PC_XMT_WAIT_F)) { 446 mad_decode_field(pc, IB_PC_XMT_WAIT_F, (void *)&val); 447 if (exceeds_threshold(IB_PC_XMT_WAIT_F, val)) 448 n += snprintf(str + n, 1024 - n, " [%s == %u]", 449 mad_field_name(IB_PC_XMT_WAIT_F), val); 450 } 451 452 /* if we found errors. */ 453 if (n != 0) { 454 if (data_counters) { 455 uint8_t *pkt = pc; 456 int start_field = IB_PC_XMT_BYTES_F; 457 int end_field = IB_PC_RCV_PKTS_F; 458 459 if (pce) { 460 pkt = pce; 461 start_field = IB_PC_EXT_XMT_BYTES_F; 462 if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) 463 end_field = IB_PC_EXT_RCV_MPKTS_F; 464 else 465 end_field = IB_PC_EXT_RCV_PKTS_F; 466 } 467 468 for (i = start_field; i <= end_field; i++) { 469 uint64_t val64 = 0; 470 float val = 0; 471 char *unit = ""; 472 mad_decode_field(pkt, i, (void *)&val64); 473 if (val64) { 474 int data = 0; 475 if (i == IB_PC_EXT_XMT_BYTES_F || 476 i == IB_PC_EXT_RCV_BYTES_F || 477 i == IB_PC_XMT_BYTES_F || 478 i == IB_PC_RCV_BYTES_F) 479 data = 1; 480 unit = conv_cnt_human_readable(val64, 481 &val, data); 482 n += snprintf(str + n, 1024 - n, 483 " [%s == %" PRIu64 484 " (%5.3f%s)]", 485 mad_field_name(i), val64, val, 486 unit); 487 } 488 } 489 } 490 491 if (!*header_printed) { 492 if (node->type == IB_NODE_SWITCH) 493 printf("Errors for 0x%" PRIx64 " \"%s\"\n", 494 node->ports[0]->guid, node_name); 495 else 496 printf("Errors for \"%s\"\n", node_name); 497 *header_printed = 1; 498 summary.bad_nodes++; 499 } 500 501 if (portnum == 0xFF) { 502 if (node->type == IB_NODE_SWITCH) 503 printf(" GUID 0x%" PRIx64 " port ALL:%s\n", 504 node->ports[0]->guid, str); 505 } else { 506 printf(" GUID 0x%" PRIx64 " port %d:%s\n", 507 node->ports[portnum]->guid, portnum, str); 508 if (port_config) 509 print_port_config(node, portnum); 510 summary.bad_ports++; 511 } 512 } 513 return (n); 514 } 515 516 static int query_cap_mask(ib_portid_t * portid, char *node_name, int portnum, 517 uint16_t * cap_mask) 518 { 519 uint8_t pc[1024] = { 0 }; 520 uint16_t rc_cap_mask; 521 522 portid->sl = lid2sl_table[portid->lid]; 523 524 /* PerfMgt ClassPortInfo is a required attribute */ 525 if (!pma_query_via(pc, portid, portnum, ibd_timeout, CLASS_PORT_INFO, 526 ibmad_port)) { 527 IBWARN("classportinfo query failed on %s, %s port %d", 528 node_name, portid2str(portid), portnum); 529 summary.pma_query_failures++; 530 return -1; 531 } 532 533 /* ClassPortInfo should be supported as part of libibmad */ 534 memcpy(&rc_cap_mask, pc + 2, sizeof(rc_cap_mask)); /* CapabilityMask */ 535 536 *cap_mask = rc_cap_mask; 537 return 0; 538 } 539 540 static int print_data_cnts(ib_portid_t * portid, uint16_t cap_mask, 541 char *node_name, ibnd_node_t * node, int portnum, 542 int *header_printed) 543 { 544 uint8_t pc[1024]; 545 int i; 546 int start_field = IB_PC_XMT_BYTES_F; 547 int end_field = IB_PC_RCV_PKTS_F; 548 549 memset(pc, 0, 1024); 550 551 portid->sl = lid2sl_table[portid->lid]; 552 553 if (cap_mask & (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP)) { 554 if (!pma_query_via(pc, portid, portnum, ibd_timeout, 555 IB_GSI_PORT_COUNTERS_EXT, ibmad_port)) { 556 IBWARN("IB_GSI_PORT_COUNTERS_EXT query failed on %s, %s port %d", 557 node_name, portid2str(portid), portnum); 558 summary.pma_query_failures++; 559 return (1); 560 } 561 start_field = IB_PC_EXT_XMT_BYTES_F; 562 if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) 563 end_field = IB_PC_EXT_RCV_MPKTS_F; 564 else 565 end_field = IB_PC_EXT_RCV_PKTS_F; 566 } else { 567 if (!pma_query_via(pc, portid, portnum, ibd_timeout, 568 IB_GSI_PORT_COUNTERS, ibmad_port)) { 569 IBWARN("IB_GSI_PORT_COUNTERS query failed on %s, %s port %d", 570 node_name, portid2str(portid), portnum); 571 summary.pma_query_failures++; 572 return (1); 573 } 574 start_field = IB_PC_XMT_BYTES_F; 575 end_field = IB_PC_RCV_PKTS_F; 576 } 577 578 if (!*header_printed) { 579 printf("Data Counters for 0x%" PRIx64 " \"%s\"\n", node->guid, 580 node_name); 581 *header_printed = 1; 582 } 583 584 if (portnum == 0xFF) 585 printf(" GUID 0x%" PRIx64 " port ALL:", node->guid); 586 else 587 printf(" GUID 0x%" PRIx64 " port %d:", 588 node->guid, portnum); 589 590 for (i = start_field; i <= end_field; i++) { 591 uint64_t val64 = 0; 592 float val = 0; 593 char *unit = ""; 594 int data = 0; 595 mad_decode_field(pc, i, (void *)&val64); 596 if (i == IB_PC_EXT_XMT_BYTES_F || i == IB_PC_EXT_RCV_BYTES_F || 597 i == IB_PC_XMT_BYTES_F || i == IB_PC_RCV_BYTES_F) 598 data = 1; 599 unit = conv_cnt_human_readable(val64, &val, data); 600 printf(" [%s == %" PRIu64 " (%5.3f%s)]", mad_field_name(i), 601 val64, val, unit); 602 } 603 printf("\n"); 604 605 if (portnum != 0xFF && port_config) 606 print_port_config(node, portnum); 607 608 return (0); 609 } 610 611 static int print_errors(ib_portid_t * portid, uint16_t cap_mask, 612 char *node_name, ibnd_node_t * node, int portnum, 613 int *header_printed) 614 { 615 uint8_t pc[1024]; 616 uint8_t pce[1024]; 617 uint8_t *pc_ext = NULL; 618 619 memset(pc, 0, 1024); 620 memset(pce, 0, 1024); 621 622 portid->sl = lid2sl_table[portid->lid]; 623 624 if (!pma_query_via(pc, portid, portnum, ibd_timeout, 625 IB_GSI_PORT_COUNTERS, ibmad_port)) { 626 IBWARN("IB_GSI_PORT_COUNTERS query failed on %s, %s port %d", 627 node_name, portid2str(portid), portnum); 628 summary.pma_query_failures++; 629 return (0); 630 } 631 632 if (cap_mask & (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP)) { 633 if (!pma_query_via(pce, portid, portnum, ibd_timeout, 634 IB_GSI_PORT_COUNTERS_EXT, ibmad_port)) { 635 IBWARN("IB_GSI_PORT_COUNTERS_EXT query failed on %s, %s port %d", 636 node_name, portid2str(portid), portnum); 637 summary.pma_query_failures++; 638 return (0); 639 } 640 pc_ext = pce; 641 } 642 643 if (!(cap_mask & IB_PM_PC_XMIT_WAIT_SUP)) { 644 /* if PortCounters:PortXmitWait not supported clear this counter */ 645 uint32_t foo = 0; 646 mad_encode_field(pc, IB_PC_XMT_WAIT_F, &foo); 647 } 648 return (print_results(portid, node_name, node, pc, portnum, 649 header_printed, pc_ext, cap_mask)); 650 } 651 652 uint8_t *reset_pc_ext(void *rcvbuf, ib_portid_t * dest, 653 int port, unsigned mask, unsigned timeout, 654 const struct ibmad_port * srcport) 655 { 656 ib_rpc_t rpc = { 0 }; 657 int lid = dest->lid; 658 659 DEBUG("lid %u port %d mask 0x%x", lid, port, mask); 660 661 if (lid == -1) { 662 IBWARN("only lid routed is supported"); 663 return NULL; 664 } 665 666 if (!mask) 667 mask = ~0; 668 669 rpc.mgtclass = IB_PERFORMANCE_CLASS; 670 rpc.method = IB_MAD_METHOD_SET; 671 rpc.attr.id = IB_GSI_PORT_COUNTERS_EXT; 672 673 memset(rcvbuf, 0, IB_MAD_SIZE); 674 675 /* Same for attribute IDs */ 676 mad_set_field(rcvbuf, 0, IB_PC_EXT_PORT_SELECT_F, port); 677 mad_set_field(rcvbuf, 0, IB_PC_EXT_COUNTER_SELECT_F, mask); 678 rpc.attr.mod = 0; 679 rpc.timeout = timeout; 680 rpc.datasz = IB_PC_DATA_SZ; 681 rpc.dataoffs = IB_PC_DATA_OFFS; 682 if (!dest->qp) 683 dest->qp = 1; 684 if (!dest->qkey) 685 dest->qkey = IB_DEFAULT_QP1_QKEY; 686 687 return mad_rpc(srcport, &rpc, dest, rcvbuf, rcvbuf); 688 } 689 690 static void clear_port(ib_portid_t * portid, uint16_t cap_mask, 691 char *node_name, int port) 692 { 693 uint8_t pc[1024] = { 0 }; 694 /* bits defined in Table 228 PortCounters CounterSelect and 695 * CounterSelect2 696 */ 697 uint32_t mask = 0; 698 699 if (clear_errors) { 700 mask |= 0xFFF; 701 if (cap_mask & IB_PM_PC_XMIT_WAIT_SUP) 702 mask |= 0x10000; 703 } 704 if (clear_counts) 705 mask |= 0xF000; 706 707 if (mask) 708 if (!performance_reset_via(pc, portid, port, mask, ibd_timeout, 709 IB_GSI_PORT_COUNTERS, ibmad_port)) 710 fprintf(stderr, "Failed to reset errors %s port %d\n", node_name, 711 port); 712 713 if (clear_errors && details) { 714 memset(pc, 0, 1024); 715 performance_reset_via(pc, portid, port, 0xf, ibd_timeout, 716 IB_GSI_PORT_XMIT_DISCARD_DETAILS, 717 ibmad_port); 718 memset(pc, 0, 1024); 719 performance_reset_via(pc, portid, port, 0x3f, ibd_timeout, 720 IB_GSI_PORT_RCV_ERROR_DETAILS, 721 ibmad_port); 722 } 723 724 if (clear_counts && 725 (cap_mask & 726 (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP))) { 727 if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) 728 mask = 0xFF; 729 else 730 mask = 0x0F; 731 732 if (!reset_pc_ext(pc, portid, port, mask, ibd_timeout, 733 ibmad_port)) 734 fprintf(stderr, "Failed to reset extended data counters %s, " 735 "%s port %d\n", node_name, portid2str(portid), 736 port); 737 } 738 } 739 740 void print_node(ibnd_node_t * node, void *user_data) 741 { 742 int header_printed = 0; 743 int p = 0; 744 int startport = 1; 745 int type = 0; 746 int all_port_sup = 0; 747 ib_portid_t portid = { 0 }; 748 uint16_t cap_mask = 0; 749 char *node_name = NULL; 750 751 switch (node->type) { 752 case IB_NODE_SWITCH: 753 type = PRINT_SWITCH; 754 break; 755 case IB_NODE_CA: 756 type = PRINT_CA; 757 break; 758 case IB_NODE_ROUTER: 759 type = PRINT_ROUTER; 760 break; 761 } 762 763 if ((type & node_type_to_print) == 0) 764 return; 765 766 if (node->type == IB_NODE_SWITCH && node->smaenhsp0) 767 startport = 0; 768 769 node_name = remap_node_name(node_name_map, node->guid, node->nodedesc); 770 771 if (node->type == IB_NODE_SWITCH) { 772 ib_portid_set(&portid, node->smalid, 0, 0); 773 p = 0; 774 } else { 775 for (p = 1; p <= node->numports; p++) { 776 if (node->ports[p]) { 777 ib_portid_set(&portid, 778 node->ports[p]->base_lid, 779 0, 0); 780 break; 781 } 782 } 783 } 784 785 if ((query_cap_mask(&portid, node_name, p, &cap_mask) == 0) && 786 (cap_mask & IB_PM_ALL_PORT_SELECT)) 787 all_port_sup = 1; 788 789 if (data_counters_only) { 790 for (p = startport; p <= node->numports; p++) { 791 if (node->ports[p]) { 792 if (node->type == IB_NODE_SWITCH) 793 ib_portid_set(&portid, node->smalid, 0, 0); 794 else 795 ib_portid_set(&portid, node->ports[p]->base_lid, 796 0, 0); 797 798 print_data_cnts(&portid, cap_mask, node_name, node, p, 799 &header_printed); 800 summary.ports_checked++; 801 if (!all_port_sup) 802 clear_port(&portid, cap_mask, node_name, p); 803 } 804 } 805 } else { 806 if (all_port_sup) 807 if (!print_errors(&portid, cap_mask, node_name, node, 808 0xFF, &header_printed)) { 809 summary.ports_checked += node->numports; 810 goto clear; 811 } 812 813 for (p = startport; p <= node->numports; p++) { 814 if (node->ports[p]) { 815 if (node->type == IB_NODE_SWITCH) 816 ib_portid_set(&portid, node->smalid, 0, 0); 817 else 818 ib_portid_set(&portid, node->ports[p]->base_lid, 819 0, 0); 820 821 print_errors(&portid, cap_mask, node_name, node, p, 822 &header_printed); 823 summary.ports_checked++; 824 if (!all_port_sup) 825 clear_port(&portid, cap_mask, node_name, p); 826 } 827 } 828 } 829 830 clear: 831 summary.nodes_checked++; 832 if (all_port_sup) 833 clear_port(&portid, cap_mask, node_name, 0xFF); 834 835 free(node_name); 836 } 837 838 static void add_suppressed(enum MAD_FIELDS field) 839 { 840 if (sup_total >= SUP_MAX) { 841 IBWARN("Maximum (%d) fields have been suppressed; skipping %s", 842 sup_total, mad_field_name(field)); 843 return; 844 } 845 suppressed_fields[sup_total++] = field; 846 } 847 848 static void calculate_suppressed_fields(char *str) 849 { 850 enum MAD_FIELDS f; 851 char *val, *lasts = NULL; 852 char *tmp = strdup(str); 853 854 val = strtok_r(tmp, ",", &lasts); 855 while (val) { 856 for (f = IB_PC_FIRST_F; f <= IB_PC_LAST_F; f++) 857 if (strcmp(val, mad_field_name(f)) == 0) 858 add_suppressed(f); 859 val = strtok_r(NULL, ",", &lasts); 860 } 861 862 free(tmp); 863 } 864 865 static int process_opt(void *context, int ch, char *optarg) 866 { 867 struct ibnd_config *cfg = context; 868 switch (ch) { 869 case 's': 870 calculate_suppressed_fields(optarg); 871 break; 872 case 'c': 873 /* Right now this is the only "common" error */ 874 add_suppressed(IB_PC_ERR_SWITCH_REL_F); 875 break; 876 case 1: 877 node_name_map_file = strdup(optarg); 878 break; 879 case 2: 880 data_counters++; 881 break; 882 case 3: 883 node_type_to_print |= PRINT_SWITCH; 884 break; 885 case 4: 886 node_type_to_print |= PRINT_CA; 887 break; 888 case 5: 889 node_type_to_print |= PRINT_ROUTER; 890 break; 891 case 6: 892 details = 1; 893 break; 894 case 7: 895 load_cache_file = strdup(optarg); 896 break; 897 case 8: 898 threshold_file = strdup(optarg); 899 break; 900 case 9: 901 data_counters_only = 1; 902 break; 903 case 10: 904 obtain_sl = 0; 905 break; 906 case 'G': 907 case 'S': 908 port_guid_str = optarg; 909 port_guid = strtoull(optarg, 0, 0); 910 break; 911 case 'D': 912 dr_path = strdup(optarg); 913 break; 914 case 'r': 915 port_config++; 916 break; 917 case 'R': /* nop */ 918 break; 919 case 'k': 920 clear_errors = 1; 921 break; 922 case 'K': 923 clear_counts = 1; 924 break; 925 case 'o': 926 cfg->max_smps = strtoul(optarg, NULL, 0); 927 break; 928 default: 929 return -1; 930 } 931 932 return 0; 933 } 934 935 int main(int argc, char **argv) 936 { 937 struct ibnd_config config = { 0 }; 938 int resolved = -1; 939 ib_portid_t portid = { 0 }; 940 ib_portid_t self_portid = { 0 }; 941 int rc = 0; 942 ibnd_fabric_t *fabric = NULL; 943 ib_gid_t self_gid; 944 int port = 0; 945 946 int mgmt_classes[4] = { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS, 947 IB_PERFORMANCE_CLASS 948 }; 949 950 const struct ibdiag_opt opts[] = { 951 {"suppress", 's', 1, "<err1,err2,...>", 952 "suppress errors listed"}, 953 {"suppress-common", 'c', 0, NULL, 954 "suppress some of the common counters"}, 955 {"node-name-map", 1, 1, "<file>", "node name map file"}, 956 {"port-guid", 'G', 1, "<port_guid>", 957 "report the node containing the port specified by <port_guid>"}, 958 {"", 'S', 1, "<port_guid>", 959 "Same as \"-G\" for backward compatibility"}, 960 {"Direct", 'D', 1, "<dr_path>", 961 "report the node containing the port specified by <dr_path>"}, 962 {"skip-sl", 10, 0, NULL,"don't obtain SL to all destinations"}, 963 {"report-port", 'r', 0, NULL, 964 "report port link information"}, 965 {"threshold-file", 8, 1, NULL, 966 "specify an alternate threshold file, default: " DEF_THRES_FILE}, 967 {"GNDN", 'R', 0, NULL, 968 "(This option is obsolete and does nothing)"}, 969 {"data", 2, 0, NULL, "include data counters for ports with errors"}, 970 {"switch", 3, 0, NULL, "print data for switches only"}, 971 {"ca", 4, 0, NULL, "print data for CA's only"}, 972 {"router", 5, 0, NULL, "print data for routers only"}, 973 {"details", 6, 0, NULL, "include transmit discard details"}, 974 {"counters", 9, 0, NULL, "print data counters only"}, 975 {"clear-errors", 'k', 0, NULL, 976 "Clear error counters after read"}, 977 {"clear-counts", 'K', 0, NULL, 978 "Clear data counters after read"}, 979 {"load-cache", 7, 1, "<file>", 980 "filename of ibnetdiscover cache to load"}, 981 {"outstanding_smps", 'o', 1, NULL, 982 "specify the number of outstanding SMP's which should be " 983 "issued during the scan"}, 984 {0} 985 }; 986 char usage_args[] = ""; 987 988 memset(suppressed_fields, 0, sizeof suppressed_fields); 989 ibdiag_process_opts(argc, argv, &config, "cDGKLnRrSs", opts, process_opt, 990 usage_args, NULL); 991 992 argc -= optind; 993 argv += optind; 994 995 if (!node_type_to_print) 996 node_type_to_print = PRINT_ALL; 997 998 ibmad_port = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 4); 999 if (!ibmad_port) 1000 IBEXIT("Failed to open port; %s:%d\n", ibd_ca, ibd_ca_port); 1001 1002 smp_mkey_set(ibmad_port, ibd_mkey); 1003 1004 if (ibd_timeout) { 1005 mad_rpc_set_timeout(ibmad_port, ibd_timeout); 1006 config.timeout_ms = ibd_timeout; 1007 } 1008 1009 config.flags = ibd_ibnetdisc_flags; 1010 config.mkey = ibd_mkey; 1011 1012 if (dr_path && load_cache_file) { 1013 mad_rpc_close_port(ibmad_port); 1014 fprintf(stderr, "Cannot specify cache and direct route path\n"); 1015 exit(-1); 1016 } 1017 1018 if (resolve_self(ibd_ca, ibd_ca_port, &self_portid, &port, &self_gid.raw) < 0) { 1019 mad_rpc_close_port(ibmad_port); 1020 IBEXIT("can't resolve self port %s", argv[0]); 1021 } 1022 1023 node_name_map = open_node_name_map(node_name_map_file); 1024 1025 /* limit the scan the fabric around the target */ 1026 if (dr_path) { 1027 if ((resolved = 1028 resolve_portid_str(ibd_ca, ibd_ca_port, &portid, dr_path, 1029 IB_DEST_DRPATH, NULL, ibmad_port)) < 0) 1030 IBWARN("Failed to resolve %s; attempting full scan", 1031 dr_path); 1032 } else if (port_guid_str) { 1033 if ((resolved = 1034 resolve_portid_str(ibd_ca, ibd_ca_port, &portid, 1035 port_guid_str, IB_DEST_GUID, ibd_sm_id, 1036 ibmad_port)) < 0) 1037 IBWARN("Failed to resolve %s; attempting full scan", 1038 port_guid_str); 1039 if(obtain_sl) 1040 lid2sl_table[portid.lid] = portid.sl; 1041 } 1042 1043 mad_rpc_close_port(ibmad_port); 1044 1045 if (load_cache_file) { 1046 if ((fabric = ibnd_load_fabric(load_cache_file, 0)) == NULL) { 1047 fprintf(stderr, "loading cached fabric failed\n"); 1048 rc = -1; 1049 goto close_port; 1050 } 1051 } else { 1052 if (resolved >= 0) { 1053 if (!config.max_hops) 1054 config.max_hops = 1; 1055 if (!(fabric = ibnd_discover_fabric(ibd_ca, ibd_ca_port, 1056 &portid, &config))) 1057 IBWARN("Single node discover failed;" 1058 " attempting full scan"); 1059 } 1060 1061 if (!fabric && !(fabric = ibnd_discover_fabric(ibd_ca, 1062 ibd_ca_port, 1063 NULL, 1064 &config))) { 1065 fprintf(stderr, "discover failed\n"); 1066 rc = -1; 1067 goto close_port; 1068 } 1069 } 1070 1071 set_thresholds(threshold_file); 1072 1073 /* reopen the global ibmad_port */ 1074 ibmad_port = mad_rpc_open_port(ibd_ca, ibd_ca_port, 1075 mgmt_classes, 4); 1076 if (!ibmad_port) { 1077 ibnd_destroy_fabric(fabric); 1078 close_node_name_map(node_name_map); 1079 IBEXIT("Failed to reopen port: %s:%d\n", 1080 ibd_ca, ibd_ca_port); 1081 } 1082 1083 smp_mkey_set(ibmad_port, ibd_mkey); 1084 1085 if (ibd_timeout) 1086 mad_rpc_set_timeout(ibmad_port, ibd_timeout); 1087 1088 if (port_guid_str) { 1089 ibnd_port_t *port = ibnd_find_port_guid(fabric, port_guid); 1090 if (port) 1091 print_node(port->node, NULL); 1092 else 1093 fprintf(stderr, "Failed to find node: %s\n", 1094 port_guid_str); 1095 } else if (dr_path) { 1096 ibnd_port_t *port; 1097 uint8_t ni[IB_SMP_DATA_SIZE] = { 0 }; 1098 if (!smp_query_via(ni, &portid, IB_ATTR_NODE_INFO, 0, 1099 ibd_timeout, ibmad_port)) { 1100 fprintf(stderr, "Failed to query local Node Info\n"); 1101 goto destroy_fabric; 1102 } 1103 1104 mad_decode_field(ni, IB_NODE_PORT_GUID_F, &(port_guid)); 1105 1106 port = ibnd_find_port_guid(fabric, port_guid); 1107 if (port) { 1108 if(obtain_sl) 1109 if(path_record_query(self_gid,port->guid)) 1110 goto destroy_fabric; 1111 print_node(port->node, NULL); 1112 } else 1113 fprintf(stderr, "Failed to find node: %s\n", dr_path); 1114 } else { 1115 if(obtain_sl) 1116 if(path_record_query(self_gid,0)) 1117 goto destroy_fabric; 1118 1119 ibnd_iter_nodes(fabric, print_node, NULL); 1120 } 1121 1122 rc = print_summary(); 1123 if (rc) 1124 rc = 1; 1125 1126 destroy_fabric: 1127 mad_rpc_close_port(ibmad_port); 1128 ibnd_destroy_fabric(fabric); 1129 1130 close_port: 1131 close_node_name_map(node_name_map); 1132 exit(rc); 1133 } 1134