1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2018, Joyent, Inc. 28 */ 29 30 #include <sys/types.h> 31 #include <sys/conf.h> 32 #include <sys/modctl.h> 33 #include <sys/stat.h> 34 #include <sys/stream.h> 35 #include <sys/strsun.h> 36 #include <sys/ddi.h> 37 #include <sys/sunddi.h> 38 #include <sys/priv_names.h> 39 #include <inet/common.h> 40 41 #define _SUN_TPI_VERSION 2 42 #include <sys/tihdr.h> 43 #include <sys/timod.h> 44 #include <sys/tiuser.h> 45 #include <sys/suntpi.h> 46 #include <inet/common.h> 47 #include <inet/ip.h> 48 #include <inet/mi.h> 49 #include <inet/proto_set.h> 50 #include <sys/ib/clients/rds/rds.h> 51 #include <sys/policy.h> 52 #include <inet/ipclassifier.h> 53 #include <sys/ib/clients/rds/rds_kstat.h> 54 #include "sys/random.h" 55 #include <sys/ib/clients/rds/rds_transport.h> 56 #include <sys/ib/ibtl/ibti.h> 57 58 59 #define RDS_NAME "rds" 60 #define RDS_STRTAB rdsinfo 61 #define RDS_DEVDESC "RDS STREAMS driver" 62 #define RDS_DEVMINOR 0 63 #define RDS_DEVMTFLAGS D_MP | D_SYNCSTR 64 #define RDS_DEFAULT_PRIV_MODE 0666 65 66 #define rds_smallest_port 1 67 #define rds_largest_port 65535 68 69 #define RDS_RECV_HIWATER (56 * 1024) 70 #define RDS_RECV_LOWATER 128 71 #define RDS_XMIT_HIWATER (56 * 1024) 72 #define RDS_XMIT_LOWATER 1024 73 74 #define RDS_DPRINTF2 0 && 75 #define LABEL "RDS" 76 77 typedef struct rdsahdr_s { 78 in_port_t uha_src_port; /* Source port */ 79 in_port_t uha_dst_port; /* Destination port */ 80 } rdsha_t; 81 82 #define RDSH_SIZE 4 83 84 int rds_recv_hiwat = RDS_RECV_HIWATER; 85 int rds_recv_lowat = RDS_RECV_LOWATER; 86 int rds_xmit_hiwat = RDS_XMIT_HIWATER; 87 int rds_xmit_lowat = RDS_XMIT_LOWATER; 88 89 int rdsdebug; 90 91 static dev_info_t *rds_dev_info; 92 93 /* Hint not protected by any lock */ 94 static in_port_t rds_next_port_to_try; 95 96 ldi_ident_t rds_li; 97 static int loopmax = rds_largest_port - rds_smallest_port + 1; 98 99 /* global configuration variables */ 100 uint_t UserBufferSize; 101 uint_t rds_rx_pkts_pending_hwm; 102 103 extern void rds_ioctl(queue_t *, mblk_t *); 104 extern void rds_ioctl_copyin_done(queue_t *q, mblk_t *mp); 105 106 int rds_open_transport_driver(); 107 int rds_close_transport_driver(); 108 109 #define RDS_CURRENT_PORT_QUOTA() \ 110 (rds_rx_pkts_pending_hwm/RDS_GET_NPORT()) 111 112 krwlock_t rds_transport_lock; 113 ldi_handle_t rds_transport_handle = NULL; 114 rds_transport_ops_t *rds_transport_ops = NULL; 115 116 static int 117 rds_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) 118 { 119 int ret; 120 121 if (cmd != DDI_ATTACH) 122 return (DDI_FAILURE); 123 124 rds_dev_info = devi; 125 126 ret = ddi_create_minor_node(devi, RDS_NAME, S_IFCHR, 127 RDS_DEVMINOR, DDI_PSEUDO, 0); 128 if (ret != DDI_SUCCESS) { 129 return (ret); 130 } 131 132 return (DDI_SUCCESS); 133 } 134 135 static int 136 rds_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) 137 { 138 if (cmd != DDI_DETACH) 139 return (DDI_FAILURE); 140 141 ASSERT(devi == rds_dev_info); 142 143 ddi_remove_minor_node(devi, NULL); 144 145 return (DDI_SUCCESS); 146 } 147 148 /* ARGSUSED */ 149 static int 150 rds_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 151 { 152 int error = DDI_FAILURE; 153 154 switch (cmd) { 155 case DDI_INFO_DEVT2DEVINFO: 156 if (rds_dev_info != NULL) { 157 *result = (void *)rds_dev_info; 158 error = DDI_SUCCESS; 159 } 160 break; 161 162 case DDI_INFO_DEVT2INSTANCE: 163 *result = NULL; 164 error = DDI_SUCCESS; 165 break; 166 167 default: 168 break; 169 } 170 171 return (error); 172 } 173 174 175 /*ARGSUSED*/ 176 static int 177 rds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 178 { 179 rds_t *rds; 180 int ret; 181 182 if (is_system_labeled()) { 183 /* 184 * RDS socket is not supported on labeled systems 185 */ 186 return (ESOCKTNOSUPPORT); 187 } 188 189 /* Open the transport driver if IB HW is present */ 190 rw_enter(&rds_transport_lock, RW_READER); 191 if (rds_transport_handle == NULL) { 192 rw_exit(&rds_transport_lock); 193 ret = rds_open_transport_driver(); 194 rw_enter(&rds_transport_lock, RW_READER); 195 196 if (ret != 0) { 197 /* Transport driver failed to load */ 198 rw_exit(&rds_transport_lock); 199 return (ret); 200 } 201 } 202 rw_exit(&rds_transport_lock); 203 204 if (sflag == MODOPEN) { 205 return (EINVAL); 206 } 207 208 /* Reopen not supported */ 209 if (q->q_ptr != NULL) { 210 dprint(2, ("%s: Reopen is not supported: %p", LABEL, q->q_ptr)); 211 return (0); 212 } 213 214 rds = rds_create(q, credp); 215 if (rds == NULL) { 216 dprint(2, ("%s: rds_create failed", LABEL)); 217 return (0); 218 } 219 220 q->q_ptr = WR(q)->q_ptr = rds; 221 rds->rds_state = TS_UNBND; 222 rds->rds_family = AF_INET_OFFLOAD; 223 224 q->q_hiwat = rds_recv_hiwat; 225 q->q_lowat = rds_recv_lowat; 226 227 qprocson(q); 228 229 WR(q)->q_hiwat = rds_xmit_hiwat; 230 WR(q)->q_lowat = rds_xmit_lowat; 231 232 /* Set the Stream head watermarks */ 233 (void) proto_set_rx_hiwat(q, NULL, rds_recv_hiwat); 234 (void) proto_set_rx_lowat(q, NULL, rds_recv_lowat); 235 236 return (0); 237 } 238 239 /* ARGSUSED */ 240 static int 241 rds_close(queue_t *q, int flags __unused, cred_t *credp __unused) 242 { 243 rds_t *rdsp = (rds_t *)q->q_ptr; 244 245 qprocsoff(q); 246 247 /* 248 * NPORT should be decremented only if this socket was previously 249 * bound to an RDS port. 250 */ 251 if (rdsp->rds_state >= TS_IDLE) { 252 RDS_DECR_NPORT(); 253 RDS_SET_PORT_QUOTA(RDS_CURRENT_PORT_QUOTA()); 254 rds_transport_ops-> 255 rds_transport_resume_port(ntohs(rdsp->rds_port)); 256 } 257 258 /* close the transport driver if this is the last socket */ 259 if (RDS_GET_NPORT() == 1) { 260 (void) rds_close_transport_driver(); 261 } 262 263 /* 264 * We set the flags without holding a lock as this is 265 * just a hint for the fanout lookup to skip this rds. 266 * We dont free the struct until it's out of the hash and 267 * the ref count goes down. 268 */ 269 rdsp->rds_flags |= RDS_CLOSING; 270 rds_bind_hash_remove(rdsp, B_FALSE); 271 mutex_enter(&rdsp->rds_lock); 272 ASSERT(rdsp->rds_refcnt > 0); 273 if (rdsp->rds_refcnt != 1) { 274 cv_wait(&rdsp->rds_refcv, &rdsp->rds_lock); 275 } 276 mutex_exit(&rdsp->rds_lock); 277 RDS_DEC_REF_CNT(rdsp); 278 RD(q)->q_ptr = NULL; 279 WR(q)->q_ptr = NULL; 280 return (0); 281 } 282 283 /* 284 * Add a new message to the socket 285 */ 286 int 287 rds_deliver_new_msg(mblk_t *mp, ipaddr_t local_addr, ipaddr_t rem_addr, 288 in_port_t local_port, in_port_t rem_port, zoneid_t zoneid) 289 { 290 rds_t *rds; 291 struct T_unitdata_ind *tudi; 292 int udi_size; /* Size of T_unitdata_ind */ 293 mblk_t *mp1; 294 sin_t *sin; 295 int error = 0; 296 297 local_port = htons(local_port); 298 rem_port = htons(rem_port); 299 300 ASSERT(mp->b_datap->db_type == M_DATA); 301 rds = rds_fanout(local_addr, rem_addr, local_port, rem_port, zoneid); 302 if (rds == NULL) { 303 dprint(2, ("%s: rds_fanout failed: (0x%x 0x%x %d %d)", LABEL, 304 local_addr, rem_addr, ntohs(local_port), ntohs(rem_port))); 305 freemsg(mp); 306 return (error); 307 } 308 309 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 310 311 /* Allocate a message block for the T_UNITDATA_IND structure. */ 312 mp1 = allocb(udi_size, BPRI_MED); 313 if (mp1 == NULL) { 314 dprint(2, ("%s: allocb failed", LABEL)); 315 freemsg(mp); 316 return (ENOMEM); 317 } 318 319 mp1->b_cont = mp; 320 mp = mp1; 321 mp->b_datap->db_type = M_PROTO; 322 tudi = (struct T_unitdata_ind *)(uintptr_t)mp->b_rptr; 323 mp->b_wptr = (uchar_t *)tudi + udi_size; 324 tudi->PRIM_type = T_UNITDATA_IND; 325 tudi->SRC_length = sizeof (sin_t); 326 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 327 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 328 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); 329 tudi->OPT_length = udi_size; 330 sin = (sin_t *)&tudi[1]; 331 sin->sin_addr.s_addr = rem_addr; 332 sin->sin_port = ntohs(rem_port); 333 sin->sin_family = rds->rds_family; 334 *(uint32_t *)(uintptr_t)&sin->sin_zero[0] = 0; 335 *(uint32_t *)(uintptr_t)&sin->sin_zero[4] = 0; 336 337 putnext(rds->rds_ulpd, mp); 338 339 /* check port quota */ 340 if (RDS_GET_RXPKTS_PEND() > rds_rx_pkts_pending_hwm) { 341 ulong_t current_port_quota = RDS_GET_PORT_QUOTA(); 342 if (rds->rds_port_quota > current_port_quota) { 343 /* this may result in stalling the port */ 344 rds->rds_port_quota = current_port_quota; 345 (void) proto_set_rx_hiwat(rds->rds_ulpd, NULL, 346 rds->rds_port_quota * UserBufferSize); 347 RDS_INCR_PORT_QUOTA_ADJUSTED(); 348 } 349 } 350 351 /* 352 * canputnext() check is done after putnext as the protocol does 353 * not allow dropping any received packet. 354 */ 355 if (!canputnext(rds->rds_ulpd)) { 356 error = ENOSPC; 357 } 358 359 RDS_DEC_REF_CNT(rds); 360 return (error); 361 } 362 363 364 /* Default structure copied into T_INFO_ACK messages */ 365 static struct T_info_ack rds_g_t_info_ack_ipv4 = { 366 T_INFO_ACK, 367 65535, /* TSDU_size. Excl. headers */ 368 T_INVALID, /* ETSU_size. rds does not support expedited data. */ 369 T_INVALID, /* CDATA_size. rds does not support connect data. */ 370 T_INVALID, /* DDATA_size. rds does not support disconnect data. */ 371 sizeof (sin_t), /* ADDR_size. */ 372 0, /* OPT_size - not initialized here */ 373 65535, /* TIDU_size. Excl. headers */ 374 T_CLTS, /* SERV_type. rds supports connection-less. */ 375 TS_UNBND, /* CURRENT_state. This is set from rds_state. */ 376 (XPG4_1|SENDZERO) /* PROVIDER_flag */ 377 }; 378 379 static in_port_t 380 rds_update_next_port(in_port_t port) 381 { 382 (void) random_get_pseudo_bytes((uint8_t *)&port, sizeof (in_port_t)); 383 if (port < rds_smallest_port) 384 port = rds_smallest_port; 385 return (port); 386 } 387 388 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 389 static void 390 rds_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 391 { 392 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 393 qreply(q, mp); 394 } 395 396 static void 397 rds_capability_req(queue_t *q, mblk_t *mp) 398 { 399 t_uscalar_t cap_bits1; 400 struct T_capability_ack *tcap; 401 402 cap_bits1 = 403 ((struct T_capability_req *)(uintptr_t)mp->b_rptr)->CAP_bits1; 404 405 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 406 mp->b_datap->db_type, T_CAPABILITY_ACK); 407 if (mp == NULL) 408 return; 409 tcap = (struct T_capability_ack *)(uintptr_t)mp->b_rptr; 410 tcap->CAP_bits1 = 0; 411 412 if (cap_bits1 & TC1_INFO) { 413 tcap->CAP_bits1 |= TC1_INFO; 414 *(&tcap->INFO_ack) = rds_g_t_info_ack_ipv4; 415 } 416 417 qreply(q, mp); 418 } 419 420 static void 421 rds_info_req(queue_t *q, mblk_t *omp) 422 { 423 rds_t *rds = (rds_t *)q->q_ptr; 424 struct T_info_ack *tap; 425 mblk_t *mp; 426 427 /* Create a T_INFO_ACK message. */ 428 mp = tpi_ack_alloc(omp, sizeof (struct T_info_ack), M_PCPROTO, 429 T_INFO_ACK); 430 if (mp == NULL) 431 return; 432 tap = (struct T_info_ack *)(uintptr_t)mp->b_rptr; 433 *tap = rds_g_t_info_ack_ipv4; 434 tap->CURRENT_state = rds->rds_state; 435 tap->OPT_size = 128; 436 qreply(q, mp); 437 } 438 439 /* 440 * NO locking protection here as sockfs will only send down 441 * one bind operation at a time. 442 */ 443 static void 444 rds_bind(queue_t *q, mblk_t *mp) 445 { 446 sin_t *sin; 447 rds_t *rds; 448 struct T_bind_req *tbr; 449 in_port_t port; /* Host byte order */ 450 in_port_t requested_port; /* Host byte order */ 451 struct T_bind_ack *tba; 452 int count; 453 rds_bf_t *rdsbf; 454 in_port_t lport; /* Network byte order */ 455 456 rds = (rds_t *)q->q_ptr; 457 if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) < sizeof (*tbr)) { 458 rds_err_ack(q, mp, TPROTO, 0); 459 return; 460 } 461 462 /* 463 * We don't allow multiple binds 464 */ 465 if (rds->rds_state != TS_UNBND) { 466 rds_err_ack(q, mp, TOUTSTATE, 0); 467 return; 468 } 469 470 tbr = (struct T_bind_req *)(uintptr_t)mp->b_rptr; 471 switch (tbr->ADDR_length) { 472 case sizeof (sin_t): /* Complete IPv4 address */ 473 sin = (sin_t *)(uintptr_t)mi_offset_param(mp, tbr->ADDR_offset, 474 sizeof (sin_t)); 475 if (sin == NULL || !OK_32PTR((char *)sin)) { 476 rds_err_ack(q, mp, TSYSERR, EINVAL); 477 return; 478 } 479 if (rds->rds_family != AF_INET_OFFLOAD || 480 sin->sin_family != AF_INET_OFFLOAD) { 481 rds_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); 482 return; 483 } 484 if (sin->sin_addr.s_addr == INADDR_ANY) { 485 rds_err_ack(q, mp, TBADADDR, 0); 486 return; 487 } 488 489 /* 490 * verify that the address is hosted on IB 491 * only exception is the loopback address. 492 */ 493 if ((sin->sin_addr.s_addr != INADDR_LOOPBACK) && 494 !rds_verify_bind_address(sin->sin_addr.s_addr)) { 495 rds_err_ack(q, mp, TBADADDR, 0); 496 return; 497 } 498 499 port = ntohs(sin->sin_port); 500 break; 501 default: /* Invalid request */ 502 rds_err_ack(q, mp, TBADADDR, 0); 503 return; 504 } 505 506 requested_port = port; 507 508 /* 509 * TPI only sends down T_BIND_REQ for AF_INET and AF_INET6 510 * since RDS socket is of type AF_INET_OFFLOAD a O_T_BIND_REQ 511 * will be sent down. Treat O_T_BIND_REQ as T_BIND_REQ 512 */ 513 514 if (requested_port == 0) { 515 /* 516 * If the application passed in zero for the port number, it 517 * doesn't care which port number we bind to. Get one in the 518 * valid range. 519 */ 520 port = rds_update_next_port(rds_next_port_to_try); 521 } 522 523 ASSERT(port != 0); 524 count = 0; 525 for (;;) { 526 rds_t *rds1; 527 ASSERT(sin->sin_addr.s_addr != INADDR_ANY); 528 /* 529 * Walk through the list of rds streams bound to 530 * requested port with the same IP address. 531 */ 532 lport = htons(port); 533 rdsbf = &rds_bind_fanout[RDS_BIND_HASH(lport)]; 534 mutex_enter(&rdsbf->rds_bf_lock); 535 for (rds1 = rdsbf->rds_bf_rds; rds1 != NULL; 536 rds1 = rds1->rds_bind_hash) { 537 if (lport != rds1->rds_port || 538 rds1->rds_src != sin->sin_addr.s_addr || 539 rds1->rds_zoneid != rds->rds_zoneid) 540 541 continue; 542 break; 543 } 544 545 if (rds1 == NULL) { 546 /* 547 * No other stream has this IP address 548 * and port number. We can use it. 549 */ 550 break; 551 } 552 mutex_exit(&rdsbf->rds_bf_lock); 553 if (requested_port != 0) { 554 /* 555 * We get here only when requested port 556 * is bound (and only first of the for() 557 * loop iteration). 558 * 559 * The semantics of this bind request 560 * require it to fail so we return from 561 * the routine (and exit the loop). 562 * 563 */ 564 rds_err_ack(q, mp, TADDRBUSY, 0); 565 return; 566 } 567 568 port = rds_update_next_port(port + 1); 569 570 if (++count >= loopmax) { 571 /* 572 * We've tried every possible port number and 573 * there are none available, so send an error 574 * to the user. 575 */ 576 rds_err_ack(q, mp, TNOADDR, 0); 577 return; 578 } 579 } 580 581 /* 582 * Copy the source address into our rds structure. 583 */ 584 rds->rds_src = sin->sin_addr.s_addr; 585 rds->rds_port = lport; 586 587 /* 588 * reset the next port if we choose the port 589 */ 590 if (requested_port == 0) { 591 rds_next_port_to_try = port + 1; 592 } 593 594 rds->rds_state = TS_IDLE; 595 rds_bind_hash_insert(rdsbf, rds); 596 mutex_exit(&rdsbf->rds_bf_lock); 597 598 /* Reset the message type in preparation for shipping it back. */ 599 mp->b_datap->db_type = M_PCPROTO; 600 tba = (struct T_bind_ack *)(uintptr_t)mp->b_rptr; 601 tba->PRIM_type = T_BIND_ACK; 602 603 /* Increment the number of ports and set the port quota */ 604 RDS_INCR_NPORT(); 605 rds->rds_port_quota = RDS_CURRENT_PORT_QUOTA(); 606 RDS_SET_PORT_QUOTA(rds->rds_port_quota); 607 (void) proto_set_rx_hiwat(RD(q), NULL, 608 rds->rds_port_quota * UserBufferSize); 609 610 qreply(q, mp); 611 } 612 613 static void 614 rds_wput_other(queue_t *q, mblk_t *mp) 615 { 616 uchar_t *rptr = mp->b_rptr; 617 struct datab *db; 618 cred_t *cr; 619 620 db = mp->b_datap; 621 switch (db->db_type) { 622 case M_DATA: 623 /* Not connected */ 624 freemsg(mp); 625 return; 626 case M_PROTO: 627 case M_PCPROTO: 628 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr < 629 sizeof (t_scalar_t)) { 630 freemsg(mp); 631 return; 632 } 633 switch (((union T_primitives *)(uintptr_t)rptr)->type) { 634 case T_CAPABILITY_REQ: 635 rds_capability_req(q, mp); 636 return; 637 638 case T_INFO_REQ: 639 rds_info_req(q, mp); 640 return; 641 case O_T_BIND_REQ: 642 case T_BIND_REQ: 643 rds_bind(q, mp); 644 return; 645 case T_SVR4_OPTMGMT_REQ: 646 case T_OPTMGMT_REQ: 647 /* 648 * All Solaris components should pass a db_credp 649 * for this TPI message, hence we ASSERT. 650 * But in case there is some other M_PROTO that looks 651 * like a TPI message sent by some other kernel 652 * component, we check and return an error. 653 */ 654 cr = msg_getcred(mp, NULL); 655 ASSERT(cr != NULL); 656 if (cr == NULL) { 657 rds_err_ack(q, mp, TSYSERR, EINVAL); 658 return; 659 } 660 if (((union T_primitives *)(uintptr_t)rptr)->type == 661 T_SVR4_OPTMGMT_REQ) { 662 svr4_optcom_req(q, mp, cr, &rds_opt_obj); 663 } else { 664 tpi_optcom_req(q, mp, cr, &rds_opt_obj); 665 } 666 return; 667 case T_CONN_REQ: 668 /* 669 * We should not receive T_CONN_REQ as sockfs only 670 * sends down T_CONN_REQ if family == AF_INET/AF_INET6 671 * and type == SOCK_DGRAM/SOCK_RAW. For all others 672 * it simply calls soisconnected. see sotpi_connect() 673 * for details. 674 */ 675 /* FALLTHRU */ 676 default: 677 cmn_err(CE_PANIC, "type %d \n", 678 ((union T_primitives *)(uintptr_t)rptr)->type); 679 } 680 break; 681 case M_FLUSH: 682 if (*rptr & FLUSHW) 683 flushq(q, FLUSHDATA); 684 break; 685 case M_IOCTL: 686 rds_ioctl(q, mp); 687 break; 688 case M_IOCDATA: 689 /* IOCTL continuation following copyin or copyout. */ 690 if (mi_copy_state(q, mp, NULL) == -1) { 691 /* 692 * The copy operation failed. mi_copy_state already 693 * cleaned up, so we're out of here. 694 */ 695 return; 696 } 697 /* 698 * If we just completed a copy in, continue processing 699 * in rds_ioctl_copyin_done. If it was a copy out, we call 700 * mi_copyout again. If there is nothing more to copy out, 701 * it will complete the IOCTL. 702 */ 703 704 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) 705 rds_ioctl_copyin_done(q, mp); 706 else 707 mi_copyout(q, mp); 708 return; 709 710 default: 711 cmn_err(CE_PANIC, "types %d \n", db->db_type); 712 } 713 } 714 715 static int 716 rds_wput(queue_t *q, mblk_t *mp) 717 { 718 struct datab *db; 719 uchar_t *rptr = mp->b_rptr; 720 721 db = mp->b_datap; 722 switch (db->db_type) { 723 case M_PROTO: 724 case M_PCPROTO: 725 ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <= 726 (uintptr_t)INT_MAX); 727 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >= 728 sizeof (struct T_unitdata_req)) { 729 if (((union T_primitives *)(uintptr_t)rptr)->type 730 == T_UNITDATA_REQ) { 731 /* 732 * We should never come here for T_UNITDATA_REQ 733 */ 734 cmn_err(CE_PANIC, "rds_wput T_UNITDATA_REQ \n"); 735 } 736 } 737 /* FALLTHRU */ 738 default: 739 rds_wput_other(q, mp); 740 return (0); 741 } 742 } 743 744 static int 745 rds_wput_data(queue_t *q, mblk_t *mp, uio_t *uiop) 746 { 747 uchar_t *rptr = mp->b_rptr; 748 rds_t *rds; 749 mblk_t *mp1; 750 sin_t *sin; 751 ipaddr_t dst; 752 uint16_t port; 753 int ret = 0; 754 755 #define tudr ((struct T_unitdata_req *)(uintptr_t)rptr) 756 757 rds = (rds_t *)q->q_ptr; 758 /* Handle UNITDATA_REQ messages here */ 759 if (rds->rds_state == TS_UNBND) { 760 /* If a port has not been bound to the stream, fail. */ 761 dprint(2, ("%s: socket is not bound to a port", LABEL)); 762 freemsg(mp); 763 return (EPROTO); 764 } 765 766 mp1 = mp->b_cont; 767 mp->b_cont = NULL; 768 if (mp1 == NULL) { 769 dprint(2, ("%s: No message to send", LABEL)); 770 freemsg(mp); 771 return (EPROTO); 772 } 773 774 /* 775 * No options allowed 776 */ 777 if (tudr->OPT_length != 0) { 778 ret = EINVAL; 779 goto done; 780 } 781 782 ASSERT(mp1->b_datap->db_ref == 1); 783 784 if ((rptr + tudr->DEST_offset + tudr->DEST_length) > 785 mp->b_wptr) { 786 ret = EDESTADDRREQ; 787 goto done; 788 } 789 790 sin = (sin_t *)(uintptr_t)&rptr[tudr->DEST_offset]; 791 if (!OK_32PTR((char *)sin) || tudr->DEST_length != 792 sizeof (sin_t) || sin->sin_family != AF_INET_OFFLOAD) { 793 ret = EDESTADDRREQ; 794 goto done; 795 } 796 /* Extract port and ipaddr */ 797 port = sin->sin_port; 798 dst = sin->sin_addr.s_addr; 799 800 if (port == 0 || dst == INADDR_ANY) { 801 ret = EDESTADDRREQ; 802 goto done; 803 } 804 805 ASSERT(rds_transport_ops != NULL); 806 ret = rds_transport_ops->rds_transport_sendmsg(uiop, rds->rds_src, dst, 807 ntohs(rds->rds_port), ntohs(port), rds->rds_zoneid); 808 if (ret != 0) { 809 if ((ret != ENOBUFS) && (ret != ENOMEM)) { 810 /* ENOMEM is actually EWOULDBLOCK */ 811 dprint(2, ("%s: rds_sendmsg returned %d", LABEL, ret)); 812 goto done; 813 } 814 } 815 done: 816 freemsg(mp1); 817 freemsg(mp); 818 return (ret); 819 } 820 821 /* 822 * Make sure we dont return EINVAL and EWOULDBLOCK as it has 823 * special meanings for the synchronous streams (rwnext()). 824 * We should return ENOMEM which is changed to EWOULDBLOCK by kstrputmsg() 825 */ 826 static int 827 rds_wrw(queue_t *q, struiod_t *dp) 828 { 829 mblk_t *mp = dp->d_mp; 830 int error = 0; 831 struct datab *db; 832 uchar_t *rptr; 833 834 db = mp->b_datap; 835 rptr = mp->b_rptr; 836 switch (db->db_type) { 837 case M_PROTO: 838 case M_PCPROTO: 839 ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <= 840 (uintptr_t)INT_MAX); 841 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >= 842 sizeof (struct T_unitdata_req)) { 843 /* Detect valid T_UNITDATA_REQ here */ 844 if (((union T_primitives *)(uintptr_t)rptr)->type 845 == T_UNITDATA_REQ) 846 break; 847 } 848 /* FALLTHRU */ 849 default: 850 851 if (isuioq(q) && (error = struioget(q, mp, dp, 0))) { 852 /* 853 * Uio error of some sort, so just return the error. 854 */ 855 goto done; 856 } 857 dp->d_mp = 0; 858 rds_wput_other(q, mp); 859 return (0); 860 } 861 862 dp->d_mp = 0; 863 error = rds_wput_data(q, mp, &dp->d_uio); 864 done: 865 if (error == EWOULDBLOCK || error == EINVAL) 866 error = EIO; 867 868 return (error); 869 } 870 871 static void 872 rds_rsrv(queue_t *q) 873 { 874 rds_t *rds = (rds_t *)q->q_ptr; 875 ulong_t current_port_quota; 876 877 /* update the port quota to the current level */ 878 current_port_quota = RDS_GET_PORT_QUOTA(); 879 if (rds->rds_port_quota != current_port_quota) { 880 rds->rds_port_quota = current_port_quota; 881 (void) proto_set_rx_hiwat(q, NULL, 882 rds->rds_port_quota * UserBufferSize); 883 } 884 885 /* No more messages in the q, unstall the socket */ 886 rds_transport_ops->rds_transport_resume_port(ntohs(rds->rds_port)); 887 } 888 889 int 890 rds_close_transport_driver() 891 { 892 ASSERT(rds_transport_ops != NULL); 893 894 rw_enter(&rds_transport_lock, RW_WRITER); 895 if (rds_transport_handle != NULL) { 896 rds_transport_ops->rds_transport_close_ib(); 897 (void) ldi_close(rds_transport_handle, FNDELAY, kcred); 898 rds_transport_handle = NULL; 899 } 900 rw_exit(&rds_transport_lock); 901 902 return (0); 903 } 904 905 906 int 907 rds_open_transport_driver() 908 { 909 int ret = 0; 910 911 rw_enter(&rds_transport_lock, RW_WRITER); 912 if (rds_transport_handle != NULL) { 913 /* 914 * Someone beat us to it. 915 */ 916 goto done; 917 } 918 919 if (ibt_hw_is_present() == 0) { 920 ret = ENODEV; 921 goto done; 922 } 923 924 if (rds_li == NULL) { 925 ret = EPROTONOSUPPORT; 926 goto done; 927 } 928 929 ret = ldi_open_by_name("/devices/ib/rdsib@0:rdsib", 930 FREAD | FWRITE, kcred, &rds_transport_handle, rds_li); 931 if (ret != 0) { 932 ret = EPROTONOSUPPORT; 933 rds_transport_handle = NULL; 934 goto done; 935 } 936 937 ret = rds_transport_ops->rds_transport_open_ib(); 938 if (ret != 0) { 939 (void) ldi_close(rds_transport_handle, FNDELAY, kcred); 940 rds_transport_handle = NULL; 941 } 942 done: 943 rw_exit(&rds_transport_lock); 944 return (ret); 945 } 946 947 static struct module_info info = { 948 0, "rds", 1, INFPSZ, 65536, 1024 949 }; 950 951 static struct qinit rinit = { 952 NULL, (pfi_t)rds_rsrv, rds_open, rds_close, NULL, &info 953 }; 954 955 static struct qinit winit = { 956 (pfi_t)rds_wput, NULL, rds_open, rds_close, NULL, &info, 957 NULL, rds_wrw, NULL, STRUIOT_STANDARD 958 }; 959 960 struct streamtab rdsinfo = { 961 &rinit, &winit, NULL, NULL 962 }; 963 964 DDI_DEFINE_STREAM_OPS(rds_devops, nulldev, nulldev, rds_attach, rds_detach, 965 nulldev, rds_info, RDS_DEVMTFLAGS, &RDS_STRTAB, ddi_quiesce_not_supported); 966 967 /* 968 * Module linkage information for the kernel. 969 */ 970 static struct modldrv modldrv = { 971 &mod_driverops, 972 RDS_DEVDESC, 973 &rds_devops 974 }; 975 976 static struct modlinkage modlinkage = { 977 MODREV_1, 978 &modldrv, 979 NULL 980 }; 981 982 int 983 _init(void) 984 { 985 int ret; 986 987 rds_init(); 988 989 ret = mod_install(&modlinkage); 990 if (ret != 0) 991 goto done; 992 ret = ldi_ident_from_mod(&modlinkage, &rds_li); 993 if (ret != 0) 994 rds_li = NULL; 995 done: 996 return (ret); 997 } 998 999 int 1000 _fini(void) 1001 { 1002 int ret; 1003 1004 ret = mod_remove(&modlinkage); 1005 if (ret != 0) { 1006 return (ret); 1007 } 1008 1009 rds_fini(); 1010 1011 ldi_ident_release(rds_li); 1012 return (0); 1013 } 1014 1015 int 1016 _info(struct modinfo *modinfop) 1017 { 1018 return (mod_info(&modlinkage, modinfop)); 1019 } 1020