1 /* 2 * Copyright (c) 2010-2012 Intel Corporation. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 33 #include <config.h> 34 35 #include <stdio.h> 36 #include <inttypes.h> 37 #include <sys/types.h> 38 #include <sys/socket.h> 39 #include <netdb.h> 40 #include <unistd.h> 41 42 #include "cma.h" 43 #include <rdma/rdma_cma.h> 44 #include <infiniband/ib.h> 45 #include <infiniband/sa.h> 46 47 #define ACM_VERSION 1 48 49 #define ACM_OP_RESOLVE 0x01 50 #define ACM_OP_ACK 0x80 51 52 #define ACM_STATUS_SUCCESS 0 53 #define ACM_STATUS_ENOMEM 1 54 #define ACM_STATUS_EINVAL 2 55 #define ACM_STATUS_ENODATA 3 56 #define ACM_STATUS_ENOTCONN 5 57 #define ACM_STATUS_ETIMEDOUT 6 58 #define ACM_STATUS_ESRCADDR 7 59 #define ACM_STATUS_ESRCTYPE 8 60 #define ACM_STATUS_EDESTADDR 9 61 #define ACM_STATUS_EDESTTYPE 10 62 63 #define ACM_FLAGS_NODELAY (1<<30) 64 65 #define ACM_MSG_HDR_LENGTH 16 66 #define ACM_MAX_ADDRESS 64 67 #define ACM_MSG_EP_LENGTH 72 68 #define ACM_MSG_DATA_LENGTH (ACM_MSG_EP_LENGTH * 8) 69 70 struct acm_hdr { 71 uint8_t version; 72 uint8_t opcode; 73 uint8_t status; 74 uint8_t data[3]; 75 uint16_t length; 76 uint64_t tid; 77 }; 78 79 #define ACM_EP_INFO_NAME 0x0001 80 #define ACM_EP_INFO_ADDRESS_IP 0x0002 81 #define ACM_EP_INFO_ADDRESS_IP6 0x0003 82 #define ACM_EP_INFO_PATH 0x0010 83 84 union acm_ep_info { 85 uint8_t addr[ACM_MAX_ADDRESS]; 86 uint8_t name[ACM_MAX_ADDRESS]; 87 struct ibv_path_record path; 88 }; 89 90 #define ACM_EP_FLAG_SOURCE (1<<0) 91 #define ACM_EP_FLAG_DEST (1<<1) 92 93 struct acm_ep_addr_data { 94 uint32_t flags; 95 uint16_t type; 96 uint16_t reserved; 97 union acm_ep_info info; 98 }; 99 100 struct acm_resolve_msg { 101 struct acm_hdr hdr; 102 struct acm_ep_addr_data data[0]; 103 }; 104 105 struct acm_msg { 106 struct acm_hdr hdr; 107 union{ 108 uint8_t data[ACM_MSG_DATA_LENGTH]; 109 struct acm_ep_addr_data resolve_data[0]; 110 }; 111 }; 112 113 static pthread_mutex_t acm_lock = PTHREAD_MUTEX_INITIALIZER; 114 static int sock = -1; 115 static uint16_t server_port; 116 117 static int ucma_set_server_port(void) 118 { 119 FILE *f; 120 121 if ((f = fopen(IBACM_PORT_FILE, "r" STREAM_CLOEXEC))) { 122 if (fscanf(f, "%" SCNu16, &server_port) != 1) 123 server_port = 0; 124 fclose(f); 125 } 126 return server_port; 127 } 128 129 void ucma_ib_init(void) 130 { 131 struct sockaddr_in addr; 132 static int init; 133 int ret; 134 135 if (init) 136 return; 137 138 pthread_mutex_lock(&acm_lock); 139 if (init) 140 goto unlock; 141 142 if (!ucma_set_server_port()) 143 goto out; 144 145 sock = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); 146 if (sock < 0) 147 goto out; 148 149 memset(&addr, 0, sizeof addr); 150 addr.sin_family = AF_INET; 151 addr.sin_addr.s_addr = htobe32(INADDR_LOOPBACK); 152 addr.sin_port = htobe16(server_port); 153 ret = connect(sock, (struct sockaddr *) &addr, sizeof(addr)); 154 if (ret) { 155 close(sock); 156 sock = -1; 157 } 158 out: 159 init = 1; 160 unlock: 161 pthread_mutex_unlock(&acm_lock); 162 } 163 164 void ucma_ib_cleanup(void) 165 { 166 if (sock >= 0) { 167 shutdown(sock, SHUT_RDWR); 168 close(sock); 169 } 170 } 171 172 static int ucma_ib_set_addr(struct rdma_addrinfo *ib_rai, 173 struct rdma_addrinfo *rai) 174 { 175 struct sockaddr_ib *src, *dst; 176 struct ibv_path_record *path; 177 178 src = calloc(1, sizeof(*src)); 179 if (!src) 180 return ERR(ENOMEM); 181 182 dst = calloc(1, sizeof(*dst)); 183 if (!dst) { 184 free(src); 185 return ERR(ENOMEM); 186 } 187 188 path = &((struct ibv_path_data *) ib_rai->ai_route)->path; 189 190 src->sib_family = AF_IB; 191 src->sib_pkey = path->pkey; 192 src->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8); 193 memcpy(&src->sib_addr, &path->sgid, 16); 194 ucma_set_sid(ib_rai->ai_port_space, rai->ai_src_addr, src); 195 196 dst->sib_family = AF_IB; 197 dst->sib_pkey = path->pkey; 198 dst->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8); 199 memcpy(&dst->sib_addr, &path->dgid, 16); 200 ucma_set_sid(ib_rai->ai_port_space, rai->ai_dst_addr, dst); 201 202 ib_rai->ai_src_addr = (struct sockaddr *) src; 203 ib_rai->ai_src_len = sizeof(*src); 204 205 ib_rai->ai_dst_addr = (struct sockaddr *) dst; 206 ib_rai->ai_dst_len = sizeof(*dst); 207 208 return 0; 209 } 210 211 static int ucma_ib_set_connect(struct rdma_addrinfo *ib_rai, 212 struct rdma_addrinfo *rai) 213 { 214 struct ib_connect_hdr *hdr; 215 216 if (rai->ai_family == AF_IB) 217 return 0; 218 219 hdr = calloc(1, sizeof(*hdr)); 220 if (!hdr) 221 return ERR(ENOMEM); 222 223 if (rai->ai_family == AF_INET) { 224 hdr->ip_version = 4 << 4; 225 memcpy(&hdr->cma_src_ip4, 226 &((struct sockaddr_in *) rai->ai_src_addr)->sin_addr, 4); 227 memcpy(&hdr->cma_dst_ip4, 228 &((struct sockaddr_in *) rai->ai_dst_addr)->sin_addr, 4); 229 } else { 230 hdr->ip_version = 6 << 4; 231 memcpy(&hdr->cma_src_ip6, 232 &((struct sockaddr_in6 *) rai->ai_src_addr)->sin6_addr, 16); 233 memcpy(&hdr->cma_dst_ip6, 234 &((struct sockaddr_in6 *) rai->ai_dst_addr)->sin6_addr, 16); 235 } 236 237 ib_rai->ai_connect = hdr; 238 ib_rai->ai_connect_len = sizeof(*hdr); 239 return 0; 240 } 241 242 static void ucma_resolve_af_ib(struct rdma_addrinfo **rai) 243 { 244 struct rdma_addrinfo *ib_rai; 245 246 ib_rai = calloc(1, sizeof(*ib_rai)); 247 if (!ib_rai) 248 return; 249 250 ib_rai->ai_flags = (*rai)->ai_flags; 251 ib_rai->ai_family = AF_IB; 252 ib_rai->ai_qp_type = (*rai)->ai_qp_type; 253 ib_rai->ai_port_space = (*rai)->ai_port_space; 254 255 ib_rai->ai_route = calloc(1, (*rai)->ai_route_len); 256 if (!ib_rai->ai_route) 257 goto err; 258 259 memcpy(ib_rai->ai_route, (*rai)->ai_route, (*rai)->ai_route_len); 260 ib_rai->ai_route_len = (*rai)->ai_route_len; 261 262 if ((*rai)->ai_src_canonname) { 263 ib_rai->ai_src_canonname = strdup((*rai)->ai_src_canonname); 264 if (!ib_rai->ai_src_canonname) 265 goto err; 266 } 267 268 if ((*rai)->ai_dst_canonname) { 269 ib_rai->ai_dst_canonname = strdup((*rai)->ai_dst_canonname); 270 if (!ib_rai->ai_dst_canonname) 271 goto err; 272 } 273 274 if (ucma_ib_set_connect(ib_rai, *rai)) 275 goto err; 276 277 if (ucma_ib_set_addr(ib_rai, *rai)) 278 goto err; 279 280 ib_rai->ai_next = *rai; 281 *rai = ib_rai; 282 return; 283 284 err: 285 rdma_freeaddrinfo(ib_rai); 286 } 287 288 static void ucma_ib_save_resp(struct rdma_addrinfo *rai, struct acm_msg *msg) 289 { 290 struct acm_ep_addr_data *ep_data; 291 struct ibv_path_data *path_data = NULL; 292 struct sockaddr_in *sin; 293 struct sockaddr_in6 *sin6; 294 int i, cnt, path_cnt = 0; 295 296 cnt = (msg->hdr.length - ACM_MSG_HDR_LENGTH) / ACM_MSG_EP_LENGTH; 297 for (i = 0; i < cnt; i++) { 298 ep_data = &msg->resolve_data[i]; 299 switch (ep_data->type) { 300 case ACM_EP_INFO_PATH: 301 ep_data->type = 0; 302 if (!path_data) 303 path_data = (struct ibv_path_data *) ep_data; 304 path_cnt++; 305 break; 306 case ACM_EP_INFO_ADDRESS_IP: 307 if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len) 308 break; 309 310 sin = calloc(1, sizeof(*sin)); 311 if (!sin) 312 break; 313 314 sin->sin_family = AF_INET; 315 memcpy(&sin->sin_addr, &ep_data->info.addr, 4); 316 rai->ai_src_len = sizeof(*sin); 317 rai->ai_src_addr = (struct sockaddr *) sin; 318 break; 319 case ACM_EP_INFO_ADDRESS_IP6: 320 if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len) 321 break; 322 323 sin6 = calloc(1, sizeof(*sin6)); 324 if (!sin6) 325 break; 326 327 sin6->sin6_family = AF_INET6; 328 memcpy(&sin6->sin6_addr, &ep_data->info.addr, 16); 329 rai->ai_src_len = sizeof(*sin6); 330 rai->ai_src_addr = (struct sockaddr *) sin6; 331 break; 332 default: 333 break; 334 } 335 } 336 337 rai->ai_route = calloc(path_cnt, sizeof(*path_data)); 338 if (rai->ai_route) { 339 memcpy(rai->ai_route, path_data, path_cnt * sizeof(*path_data)); 340 rai->ai_route_len = path_cnt * sizeof(*path_data); 341 } 342 } 343 344 static void ucma_set_ep_addr(struct acm_ep_addr_data *data, struct sockaddr *addr) 345 { 346 if (addr->sa_family == AF_INET) { 347 data->type = ACM_EP_INFO_ADDRESS_IP; 348 memcpy(data->info.addr, &((struct sockaddr_in *) addr)->sin_addr, 4); 349 } else { 350 data->type = ACM_EP_INFO_ADDRESS_IP6; 351 memcpy(data->info.addr, &((struct sockaddr_in6 *) addr)->sin6_addr, 16); 352 } 353 } 354 355 static int ucma_inet_addr(struct sockaddr *addr, socklen_t len) 356 { 357 return len && addr && (addr->sa_family == AF_INET || 358 addr->sa_family == AF_INET6); 359 } 360 361 static int ucma_ib_addr(struct sockaddr *addr, socklen_t len) 362 { 363 return len && addr && (addr->sa_family == AF_IB); 364 } 365 366 void ucma_ib_resolve(struct rdma_addrinfo **rai, 367 const struct rdma_addrinfo *hints) 368 { 369 struct acm_msg msg; 370 struct acm_ep_addr_data *data; 371 int ret; 372 373 ucma_ib_init(); 374 if (sock < 0) 375 return; 376 377 memset(&msg, 0, sizeof msg); 378 msg.hdr.version = ACM_VERSION; 379 msg.hdr.opcode = ACM_OP_RESOLVE; 380 msg.hdr.length = ACM_MSG_HDR_LENGTH; 381 382 data = &msg.resolve_data[0]; 383 if (ucma_inet_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) { 384 data->flags = ACM_EP_FLAG_SOURCE; 385 ucma_set_ep_addr(data, (*rai)->ai_src_addr); 386 data++; 387 msg.hdr.length += ACM_MSG_EP_LENGTH; 388 } 389 390 if (ucma_inet_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { 391 data->flags = ACM_EP_FLAG_DEST; 392 if (hints->ai_flags & (RAI_NUMERICHOST | RAI_NOROUTE)) 393 data->flags |= ACM_FLAGS_NODELAY; 394 ucma_set_ep_addr(data, (*rai)->ai_dst_addr); 395 data++; 396 msg.hdr.length += ACM_MSG_EP_LENGTH; 397 } 398 399 if (hints->ai_route_len || 400 ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len) || 401 ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { 402 struct ibv_path_record *path; 403 404 if (hints->ai_route_len == sizeof(struct ibv_path_record)) 405 path = (struct ibv_path_record *) hints->ai_route; 406 else if (hints->ai_route_len == sizeof(struct ibv_path_data)) 407 path = &((struct ibv_path_data *) hints->ai_route)->path; 408 else 409 path = NULL; 410 411 if (path) 412 memcpy(&data->info.path, path, sizeof(*path)); 413 414 if (ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) { 415 memcpy(&data->info.path.sgid, 416 &((struct sockaddr_ib *) (*rai)->ai_src_addr)->sib_addr, 16); 417 } 418 if (ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { 419 memcpy(&data->info.path.dgid, 420 &((struct sockaddr_ib *) (*rai)->ai_dst_addr)->sib_addr, 16); 421 } 422 data->type = ACM_EP_INFO_PATH; 423 data++; 424 msg.hdr.length += ACM_MSG_EP_LENGTH; 425 } 426 427 pthread_mutex_lock(&acm_lock); 428 ret = send(sock, (char *) &msg, msg.hdr.length, 0); 429 if (ret != msg.hdr.length) { 430 pthread_mutex_unlock(&acm_lock); 431 return; 432 } 433 434 ret = recv(sock, (char *) &msg, sizeof msg, 0); 435 pthread_mutex_unlock(&acm_lock); 436 if (ret < ACM_MSG_HDR_LENGTH || ret != msg.hdr.length || msg.hdr.status) 437 return; 438 439 ucma_ib_save_resp(*rai, &msg); 440 441 if (af_ib_support && !(hints->ai_flags & RAI_ROUTEONLY) && (*rai)->ai_route_len) 442 ucma_resolve_af_ib(rai); 443 } 444