1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/conf.h> 28 #include <sys/modctl.h> 29 #include <sys/stat.h> 30 #include <sys/stream.h> 31 #include <sys/strsun.h> 32 #include <sys/ddi.h> 33 #include <sys/sunddi.h> 34 #include <sys/priv_names.h> 35 #include <inet/common.h> 36 37 #define _SUN_TPI_VERSION 2 38 #include <sys/tihdr.h> 39 #include <sys/timod.h> 40 #include <sys/tiuser.h> 41 #include <sys/suntpi.h> 42 #include <inet/common.h> 43 #include <inet/ip.h> 44 #include <inet/mi.h> 45 #include <inet/proto_set.h> 46 #include <sys/ib/clients/rds/rds.h> 47 #include <sys/policy.h> 48 #include <inet/ipclassifier.h> 49 #include <sys/ib/clients/rds/rds_kstat.h> 50 #include "sys/random.h" 51 #include <sys/ib/clients/rds/rds_transport.h> 52 #include <sys/ib/ibtl/ibti.h> 53 54 55 #define RDS_NAME "rds" 56 #define RDS_STRTAB rdsinfo 57 #define RDS_DEVDESC "RDS STREAMS driver" 58 #define RDS_DEVMINOR 0 59 #define RDS_DEVMTFLAGS D_MP | D_SYNCSTR 60 #define RDS_DEFAULT_PRIV_MODE 0666 61 62 #define rds_smallest_port 1 63 #define rds_largest_port 65535 64 65 #define RDS_RECV_HIWATER (56 * 1024) 66 #define RDS_RECV_LOWATER 128 67 #define RDS_XMIT_HIWATER (56 * 1024) 68 #define RDS_XMIT_LOWATER 1024 69 70 #define RDS_DPRINTF2 0 && 71 #define LABEL "RDS" 72 73 typedef struct rdsahdr_s { 74 in_port_t uha_src_port; /* Source port */ 75 in_port_t uha_dst_port; /* Destination port */ 76 } rdsha_t; 77 78 #define RDSH_SIZE 4 79 80 int rds_recv_hiwat = RDS_RECV_HIWATER; 81 int rds_recv_lowat = RDS_RECV_LOWATER; 82 int rds_xmit_hiwat = RDS_XMIT_HIWATER; 83 int rds_xmit_lowat = RDS_XMIT_LOWATER; 84 85 int rdsdebug; 86 87 static dev_info_t *rds_dev_info; 88 89 /* Hint not protected by any lock */ 90 static in_port_t rds_next_port_to_try; 91 92 ldi_ident_t rds_li; 93 static int loopmax = rds_largest_port - rds_smallest_port + 1; 94 95 /* global configuration variables */ 96 uint_t UserBufferSize; 97 uint_t rds_rx_pkts_pending_hwm; 98 99 extern void rds_ioctl(queue_t *, mblk_t *); 100 extern void rds_ioctl_copyin_done(queue_t *q, mblk_t *mp); 101 102 int rds_open_transport_driver(); 103 int rds_close_transport_driver(); 104 105 #define RDS_CURRENT_PORT_QUOTA() \ 106 (rds_rx_pkts_pending_hwm/RDS_GET_NPORT()) 107 108 krwlock_t rds_transport_lock; 109 ldi_handle_t rds_transport_handle = NULL; 110 rds_transport_ops_t *rds_transport_ops = NULL; 111 112 static int 113 rds_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) 114 { 115 int ret; 116 117 if (cmd != DDI_ATTACH) 118 return (DDI_FAILURE); 119 120 rds_dev_info = devi; 121 122 ret = ddi_create_minor_node(devi, RDS_NAME, S_IFCHR, 123 RDS_DEVMINOR, DDI_PSEUDO, 0); 124 if (ret != DDI_SUCCESS) { 125 return (ret); 126 } 127 128 return (DDI_SUCCESS); 129 } 130 131 static int 132 rds_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) 133 { 134 if (cmd != DDI_DETACH) 135 return (DDI_FAILURE); 136 137 ASSERT(devi == rds_dev_info); 138 139 ddi_remove_minor_node(devi, NULL); 140 141 return (DDI_SUCCESS); 142 } 143 144 /* ARGSUSED */ 145 static int 146 rds_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 147 { 148 int error = DDI_FAILURE; 149 150 switch (cmd) { 151 case DDI_INFO_DEVT2DEVINFO: 152 if (rds_dev_info != NULL) { 153 *result = (void *)rds_dev_info; 154 error = DDI_SUCCESS; 155 } 156 break; 157 158 case DDI_INFO_DEVT2INSTANCE: 159 *result = NULL; 160 error = DDI_SUCCESS; 161 break; 162 163 default: 164 break; 165 } 166 167 return (error); 168 } 169 170 171 /*ARGSUSED*/ 172 static int 173 rds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 174 { 175 rds_t *rds; 176 int ret; 177 178 if (is_system_labeled()) { 179 /* 180 * RDS socket is not supported on labeled systems 181 */ 182 return (ESOCKTNOSUPPORT); 183 } 184 185 /* Open the transport driver if IB HW is present */ 186 rw_enter(&rds_transport_lock, RW_READER); 187 if (rds_transport_handle == NULL) { 188 rw_exit(&rds_transport_lock); 189 ret = rds_open_transport_driver(); 190 rw_enter(&rds_transport_lock, RW_READER); 191 192 if (ret != 0) { 193 /* Transport driver failed to load */ 194 rw_exit(&rds_transport_lock); 195 return (ret); 196 } 197 } 198 rw_exit(&rds_transport_lock); 199 200 if (sflag == MODOPEN) { 201 return (EINVAL); 202 } 203 204 /* Reopen not supported */ 205 if (q->q_ptr != NULL) { 206 dprint(2, ("%s: Reopen is not supported: %p", LABEL, q->q_ptr)); 207 return (0); 208 } 209 210 rds = rds_create(q, credp); 211 if (rds == NULL) { 212 dprint(2, ("%s: rds_create failed", LABEL)); 213 return (0); 214 } 215 216 q->q_ptr = WR(q)->q_ptr = rds; 217 rds->rds_state = TS_UNBND; 218 rds->rds_family = AF_INET_OFFLOAD; 219 220 q->q_hiwat = rds_recv_hiwat; 221 q->q_lowat = rds_recv_lowat; 222 223 qprocson(q); 224 225 WR(q)->q_hiwat = rds_xmit_hiwat; 226 WR(q)->q_lowat = rds_xmit_lowat; 227 228 /* Set the Stream head watermarks */ 229 (void) proto_set_rx_hiwat(q, NULL, rds_recv_hiwat); 230 (void) proto_set_rx_lowat(q, NULL, rds_recv_lowat); 231 232 return (0); 233 } 234 235 static int 236 rds_close(queue_t *q) 237 { 238 rds_t *rdsp = (rds_t *)q->q_ptr; 239 240 qprocsoff(q); 241 242 /* 243 * NPORT should be decremented only if this socket was previously 244 * bound to an RDS port. 245 */ 246 if (rdsp->rds_state >= TS_IDLE) { 247 RDS_DECR_NPORT(); 248 RDS_SET_PORT_QUOTA(RDS_CURRENT_PORT_QUOTA()); 249 rds_transport_ops-> 250 rds_transport_resume_port(ntohs(rdsp->rds_port)); 251 } 252 253 /* close the transport driver if this is the last socket */ 254 if (RDS_GET_NPORT() == 1) { 255 (void) rds_close_transport_driver(); 256 } 257 258 /* 259 * We set the flags without holding a lock as this is 260 * just a hint for the fanout lookup to skip this rds. 261 * We dont free the struct until it's out of the hash and 262 * the ref count goes down. 263 */ 264 rdsp->rds_flags |= RDS_CLOSING; 265 rds_bind_hash_remove(rdsp, B_FALSE); 266 mutex_enter(&rdsp->rds_lock); 267 ASSERT(rdsp->rds_refcnt > 0); 268 if (rdsp->rds_refcnt != 1) { 269 cv_wait(&rdsp->rds_refcv, &rdsp->rds_lock); 270 } 271 mutex_exit(&rdsp->rds_lock); 272 RDS_DEC_REF_CNT(rdsp); 273 RD(q)->q_ptr = NULL; 274 WR(q)->q_ptr = NULL; 275 return (0); 276 } 277 278 /* 279 * Add a new message to the socket 280 */ 281 int 282 rds_deliver_new_msg(mblk_t *mp, ipaddr_t local_addr, ipaddr_t rem_addr, 283 in_port_t local_port, in_port_t rem_port, zoneid_t zoneid) 284 { 285 rds_t *rds; 286 struct T_unitdata_ind *tudi; 287 int udi_size; /* Size of T_unitdata_ind */ 288 mblk_t *mp1; 289 sin_t *sin; 290 int error = 0; 291 292 local_port = htons(local_port); 293 rem_port = htons(rem_port); 294 295 ASSERT(mp->b_datap->db_type == M_DATA); 296 rds = rds_fanout(local_addr, rem_addr, local_port, rem_port, zoneid); 297 if (rds == NULL) { 298 dprint(2, ("%s: rds_fanout failed: (0x%x 0x%x %d %d)", LABEL, 299 local_addr, rem_addr, ntohs(local_port), ntohs(rem_port))); 300 freemsg(mp); 301 return (error); 302 } 303 304 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 305 306 /* Allocate a message block for the T_UNITDATA_IND structure. */ 307 mp1 = allocb(udi_size, BPRI_MED); 308 if (mp1 == NULL) { 309 dprint(2, ("%s: allocb failed", LABEL)); 310 freemsg(mp); 311 return (ENOMEM); 312 } 313 314 mp1->b_cont = mp; 315 mp = mp1; 316 mp->b_datap->db_type = M_PROTO; 317 tudi = (struct T_unitdata_ind *)(uintptr_t)mp->b_rptr; 318 mp->b_wptr = (uchar_t *)tudi + udi_size; 319 tudi->PRIM_type = T_UNITDATA_IND; 320 tudi->SRC_length = sizeof (sin_t); 321 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 322 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 323 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); 324 tudi->OPT_length = udi_size; 325 sin = (sin_t *)&tudi[1]; 326 sin->sin_addr.s_addr = rem_addr; 327 sin->sin_port = ntohs(rem_port); 328 sin->sin_family = rds->rds_family; 329 *(uint32_t *)(uintptr_t)&sin->sin_zero[0] = 0; 330 *(uint32_t *)(uintptr_t)&sin->sin_zero[4] = 0; 331 332 putnext(rds->rds_ulpd, mp); 333 334 /* check port quota */ 335 if (RDS_GET_RXPKTS_PEND() > rds_rx_pkts_pending_hwm) { 336 ulong_t current_port_quota = RDS_GET_PORT_QUOTA(); 337 if (rds->rds_port_quota > current_port_quota) { 338 /* this may result in stalling the port */ 339 rds->rds_port_quota = current_port_quota; 340 (void) proto_set_rx_hiwat(rds->rds_ulpd, NULL, 341 rds->rds_port_quota * UserBufferSize); 342 RDS_INCR_PORT_QUOTA_ADJUSTED(); 343 } 344 } 345 346 /* 347 * canputnext() check is done after putnext as the protocol does 348 * not allow dropping any received packet. 349 */ 350 if (!canputnext(rds->rds_ulpd)) { 351 error = ENOSPC; 352 } 353 354 RDS_DEC_REF_CNT(rds); 355 return (error); 356 } 357 358 359 /* Default structure copied into T_INFO_ACK messages */ 360 static struct T_info_ack rds_g_t_info_ack_ipv4 = { 361 T_INFO_ACK, 362 65535, /* TSDU_size. Excl. headers */ 363 T_INVALID, /* ETSU_size. rds does not support expedited data. */ 364 T_INVALID, /* CDATA_size. rds does not support connect data. */ 365 T_INVALID, /* DDATA_size. rds does not support disconnect data. */ 366 sizeof (sin_t), /* ADDR_size. */ 367 0, /* OPT_size - not initialized here */ 368 65535, /* TIDU_size. Excl. headers */ 369 T_CLTS, /* SERV_type. rds supports connection-less. */ 370 TS_UNBND, /* CURRENT_state. This is set from rds_state. */ 371 (XPG4_1|SENDZERO) /* PROVIDER_flag */ 372 }; 373 374 static in_port_t 375 rds_update_next_port(in_port_t port) 376 { 377 (void) random_get_pseudo_bytes((uint8_t *)&port, sizeof (in_port_t)); 378 if (port < rds_smallest_port) 379 port = rds_smallest_port; 380 return (port); 381 } 382 383 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 384 static void 385 rds_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 386 { 387 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 388 qreply(q, mp); 389 } 390 391 static void 392 rds_capability_req(queue_t *q, mblk_t *mp) 393 { 394 t_uscalar_t cap_bits1; 395 struct T_capability_ack *tcap; 396 397 cap_bits1 = 398 ((struct T_capability_req *)(uintptr_t)mp->b_rptr)->CAP_bits1; 399 400 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 401 mp->b_datap->db_type, T_CAPABILITY_ACK); 402 if (mp == NULL) 403 return; 404 tcap = (struct T_capability_ack *)(uintptr_t)mp->b_rptr; 405 tcap->CAP_bits1 = 0; 406 407 if (cap_bits1 & TC1_INFO) { 408 tcap->CAP_bits1 |= TC1_INFO; 409 *(&tcap->INFO_ack) = rds_g_t_info_ack_ipv4; 410 } 411 412 qreply(q, mp); 413 } 414 415 static void 416 rds_info_req(queue_t *q, mblk_t *omp) 417 { 418 rds_t *rds = (rds_t *)q->q_ptr; 419 struct T_info_ack *tap; 420 mblk_t *mp; 421 422 /* Create a T_INFO_ACK message. */ 423 mp = tpi_ack_alloc(omp, sizeof (struct T_info_ack), M_PCPROTO, 424 T_INFO_ACK); 425 if (mp == NULL) 426 return; 427 tap = (struct T_info_ack *)(uintptr_t)mp->b_rptr; 428 *tap = rds_g_t_info_ack_ipv4; 429 tap->CURRENT_state = rds->rds_state; 430 tap->OPT_size = 128; 431 qreply(q, mp); 432 } 433 434 /* 435 * NO locking protection here as sockfs will only send down 436 * one bind operation at a time. 437 */ 438 static void 439 rds_bind(queue_t *q, mblk_t *mp) 440 { 441 sin_t *sin; 442 rds_t *rds; 443 struct T_bind_req *tbr; 444 in_port_t port; /* Host byte order */ 445 in_port_t requested_port; /* Host byte order */ 446 struct T_bind_ack *tba; 447 int count; 448 rds_bf_t *rdsbf; 449 in_port_t lport; /* Network byte order */ 450 451 rds = (rds_t *)q->q_ptr; 452 if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) < sizeof (*tbr)) { 453 rds_err_ack(q, mp, TPROTO, 0); 454 return; 455 } 456 457 /* 458 * We don't allow multiple binds 459 */ 460 if (rds->rds_state != TS_UNBND) { 461 rds_err_ack(q, mp, TOUTSTATE, 0); 462 return; 463 } 464 465 tbr = (struct T_bind_req *)(uintptr_t)mp->b_rptr; 466 switch (tbr->ADDR_length) { 467 case sizeof (sin_t): /* Complete IPv4 address */ 468 sin = (sin_t *)(uintptr_t)mi_offset_param(mp, tbr->ADDR_offset, 469 sizeof (sin_t)); 470 if (sin == NULL || !OK_32PTR((char *)sin)) { 471 rds_err_ack(q, mp, TSYSERR, EINVAL); 472 return; 473 } 474 if (rds->rds_family != AF_INET_OFFLOAD || 475 sin->sin_family != AF_INET_OFFLOAD) { 476 rds_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); 477 return; 478 } 479 if (sin->sin_addr.s_addr == INADDR_ANY) { 480 rds_err_ack(q, mp, TBADADDR, 0); 481 return; 482 } 483 484 /* 485 * verify that the address is hosted on IB 486 * only exception is the loopback address. 487 */ 488 if ((sin->sin_addr.s_addr != INADDR_LOOPBACK) && 489 !rds_verify_bind_address(sin->sin_addr.s_addr)) { 490 rds_err_ack(q, mp, TBADADDR, 0); 491 return; 492 } 493 494 port = ntohs(sin->sin_port); 495 break; 496 default: /* Invalid request */ 497 rds_err_ack(q, mp, TBADADDR, 0); 498 return; 499 } 500 501 requested_port = port; 502 503 /* 504 * TPI only sends down T_BIND_REQ for AF_INET and AF_INET6 505 * since RDS socket is of type AF_INET_OFFLOAD a O_T_BIND_REQ 506 * will be sent down. Treat O_T_BIND_REQ as T_BIND_REQ 507 */ 508 509 if (requested_port == 0) { 510 /* 511 * If the application passed in zero for the port number, it 512 * doesn't care which port number we bind to. Get one in the 513 * valid range. 514 */ 515 port = rds_update_next_port(rds_next_port_to_try); 516 } 517 518 ASSERT(port != 0); 519 count = 0; 520 for (;;) { 521 rds_t *rds1; 522 ASSERT(sin->sin_addr.s_addr != INADDR_ANY); 523 /* 524 * Walk through the list of rds streams bound to 525 * requested port with the same IP address. 526 */ 527 lport = htons(port); 528 rdsbf = &rds_bind_fanout[RDS_BIND_HASH(lport)]; 529 mutex_enter(&rdsbf->rds_bf_lock); 530 for (rds1 = rdsbf->rds_bf_rds; rds1 != NULL; 531 rds1 = rds1->rds_bind_hash) { 532 if (lport != rds1->rds_port || 533 rds1->rds_src != sin->sin_addr.s_addr || 534 rds1->rds_zoneid != rds->rds_zoneid) 535 536 continue; 537 break; 538 } 539 540 if (rds1 == NULL) { 541 /* 542 * No other stream has this IP address 543 * and port number. We can use it. 544 */ 545 break; 546 } 547 mutex_exit(&rdsbf->rds_bf_lock); 548 if (requested_port != 0) { 549 /* 550 * We get here only when requested port 551 * is bound (and only first of the for() 552 * loop iteration). 553 * 554 * The semantics of this bind request 555 * require it to fail so we return from 556 * the routine (and exit the loop). 557 * 558 */ 559 rds_err_ack(q, mp, TADDRBUSY, 0); 560 return; 561 } 562 563 port = rds_update_next_port(port + 1); 564 565 if (++count >= loopmax) { 566 /* 567 * We've tried every possible port number and 568 * there are none available, so send an error 569 * to the user. 570 */ 571 rds_err_ack(q, mp, TNOADDR, 0); 572 return; 573 } 574 } 575 576 /* 577 * Copy the source address into our rds structure. 578 */ 579 rds->rds_src = sin->sin_addr.s_addr; 580 rds->rds_port = lport; 581 582 /* 583 * reset the next port if we choose the port 584 */ 585 if (requested_port == 0) { 586 rds_next_port_to_try = port + 1; 587 } 588 589 rds->rds_state = TS_IDLE; 590 rds_bind_hash_insert(rdsbf, rds); 591 mutex_exit(&rdsbf->rds_bf_lock); 592 593 /* Reset the message type in preparation for shipping it back. */ 594 mp->b_datap->db_type = M_PCPROTO; 595 tba = (struct T_bind_ack *)(uintptr_t)mp->b_rptr; 596 tba->PRIM_type = T_BIND_ACK; 597 598 /* Increment the number of ports and set the port quota */ 599 RDS_INCR_NPORT(); 600 rds->rds_port_quota = RDS_CURRENT_PORT_QUOTA(); 601 RDS_SET_PORT_QUOTA(rds->rds_port_quota); 602 (void) proto_set_rx_hiwat(RD(q), NULL, 603 rds->rds_port_quota * UserBufferSize); 604 605 qreply(q, mp); 606 } 607 608 static void 609 rds_wput_other(queue_t *q, mblk_t *mp) 610 { 611 uchar_t *rptr = mp->b_rptr; 612 struct datab *db; 613 cred_t *cr; 614 615 db = mp->b_datap; 616 switch (db->db_type) { 617 case M_DATA: 618 /* Not connected */ 619 freemsg(mp); 620 return; 621 case M_PROTO: 622 case M_PCPROTO: 623 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr < 624 sizeof (t_scalar_t)) { 625 freemsg(mp); 626 return; 627 } 628 switch (((union T_primitives *)(uintptr_t)rptr)->type) { 629 case T_CAPABILITY_REQ: 630 rds_capability_req(q, mp); 631 return; 632 633 case T_INFO_REQ: 634 rds_info_req(q, mp); 635 return; 636 case O_T_BIND_REQ: 637 case T_BIND_REQ: 638 rds_bind(q, mp); 639 return; 640 case T_SVR4_OPTMGMT_REQ: 641 case T_OPTMGMT_REQ: 642 /* 643 * All Solaris components should pass a db_credp 644 * for this TPI message, hence we ASSERT. 645 * But in case there is some other M_PROTO that looks 646 * like a TPI message sent by some other kernel 647 * component, we check and return an error. 648 */ 649 cr = msg_getcred(mp, NULL); 650 ASSERT(cr != NULL); 651 if (cr == NULL) { 652 rds_err_ack(q, mp, TSYSERR, EINVAL); 653 return; 654 } 655 if (((union T_primitives *)(uintptr_t)rptr)->type == 656 T_SVR4_OPTMGMT_REQ) { 657 (void) svr4_optcom_req(q, mp, cr, &rds_opt_obj, 658 B_FALSE); 659 } else { 660 (void) tpi_optcom_req(q, mp, cr, &rds_opt_obj, 661 B_FALSE); 662 } 663 return; 664 case T_CONN_REQ: 665 /* 666 * We should not receive T_CONN_REQ as sockfs only 667 * sends down T_CONN_REQ if family == AF_INET/AF_INET6 668 * and type == SOCK_DGRAM/SOCK_RAW. For all others 669 * it simply calls soisconnected. see sotpi_connect() 670 * for details. 671 */ 672 /* FALLTHRU */ 673 default: 674 cmn_err(CE_PANIC, "type %d \n", 675 ((union T_primitives *)(uintptr_t)rptr)->type); 676 } 677 break; 678 case M_FLUSH: 679 if (*rptr & FLUSHW) 680 flushq(q, FLUSHDATA); 681 break; 682 case M_IOCTL: 683 rds_ioctl(q, mp); 684 break; 685 case M_IOCDATA: 686 /* IOCTL continuation following copyin or copyout. */ 687 if (mi_copy_state(q, mp, NULL) == -1) { 688 /* 689 * The copy operation failed. mi_copy_state already 690 * cleaned up, so we're out of here. 691 */ 692 return; 693 } 694 /* 695 * If we just completed a copy in, continue processing 696 * in rds_ioctl_copyin_done. If it was a copy out, we call 697 * mi_copyout again. If there is nothing more to copy out, 698 * it will complete the IOCTL. 699 */ 700 701 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) 702 rds_ioctl_copyin_done(q, mp); 703 else 704 mi_copyout(q, mp); 705 return; 706 707 default: 708 cmn_err(CE_PANIC, "types %d \n", db->db_type); 709 } 710 } 711 712 static int 713 rds_wput(queue_t *q, mblk_t *mp) 714 { 715 struct datab *db; 716 uchar_t *rptr = mp->b_rptr; 717 718 db = mp->b_datap; 719 switch (db->db_type) { 720 case M_PROTO: 721 case M_PCPROTO: 722 ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <= 723 (uintptr_t)INT_MAX); 724 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >= 725 sizeof (struct T_unitdata_req)) { 726 if (((union T_primitives *)(uintptr_t)rptr)->type 727 == T_UNITDATA_REQ) { 728 /* 729 * We should never come here for T_UNITDATA_REQ 730 */ 731 cmn_err(CE_PANIC, "rds_wput T_UNITDATA_REQ \n"); 732 } 733 } 734 /* FALLTHRU */ 735 default: 736 rds_wput_other(q, mp); 737 return (0); 738 } 739 } 740 741 static int 742 rds_wput_data(queue_t *q, mblk_t *mp, uio_t *uiop) 743 { 744 uchar_t *rptr = mp->b_rptr; 745 rds_t *rds; 746 mblk_t *mp1; 747 sin_t *sin; 748 ipaddr_t dst; 749 uint16_t port; 750 int ret = 0; 751 752 #define tudr ((struct T_unitdata_req *)(uintptr_t)rptr) 753 754 rds = (rds_t *)q->q_ptr; 755 /* Handle UNITDATA_REQ messages here */ 756 if (rds->rds_state == TS_UNBND) { 757 /* If a port has not been bound to the stream, fail. */ 758 dprint(2, ("%s: socket is not bound to a port", LABEL)); 759 freemsg(mp); 760 return (EPROTO); 761 } 762 763 mp1 = mp->b_cont; 764 mp->b_cont = NULL; 765 if (mp1 == NULL) { 766 dprint(2, ("%s: No message to send", LABEL)); 767 freemsg(mp); 768 return (EPROTO); 769 } 770 771 /* 772 * No options allowed 773 */ 774 if (tudr->OPT_length != 0) { 775 ret = EINVAL; 776 goto done; 777 } 778 779 ASSERT(mp1->b_datap->db_ref == 1); 780 781 if ((rptr + tudr->DEST_offset + tudr->DEST_length) > 782 mp->b_wptr) { 783 ret = EDESTADDRREQ; 784 goto done; 785 } 786 787 sin = (sin_t *)(uintptr_t)&rptr[tudr->DEST_offset]; 788 if (!OK_32PTR((char *)sin) || tudr->DEST_length != 789 sizeof (sin_t) || sin->sin_family != AF_INET_OFFLOAD) { 790 ret = EDESTADDRREQ; 791 goto done; 792 } 793 /* Extract port and ipaddr */ 794 port = sin->sin_port; 795 dst = sin->sin_addr.s_addr; 796 797 if (port == 0 || dst == INADDR_ANY) { 798 ret = EDESTADDRREQ; 799 goto done; 800 } 801 802 ASSERT(rds_transport_ops != NULL); 803 ret = rds_transport_ops->rds_transport_sendmsg(uiop, rds->rds_src, dst, 804 ntohs(rds->rds_port), ntohs(port), rds->rds_zoneid); 805 if (ret != 0) { 806 if ((ret != ENOBUFS) && (ret != ENOMEM)) { 807 /* ENOMEM is actually EWOULDBLOCK */ 808 dprint(2, ("%s: rds_sendmsg returned %d", LABEL, ret)); 809 goto done; 810 } 811 } 812 done: 813 freemsg(mp1); 814 freemsg(mp); 815 return (ret); 816 } 817 818 /* 819 * Make sure we dont return EINVAL and EWOULDBLOCK as it has 820 * special meanings for the synchronous streams (rwnext()). 821 * We should return ENOMEM which is changed to EWOULDBLOCK by kstrputmsg() 822 */ 823 static int 824 rds_wrw(queue_t *q, struiod_t *dp) 825 { 826 mblk_t *mp = dp->d_mp; 827 int error = 0; 828 struct datab *db; 829 uchar_t *rptr; 830 831 db = mp->b_datap; 832 rptr = mp->b_rptr; 833 switch (db->db_type) { 834 case M_PROTO: 835 case M_PCPROTO: 836 ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <= 837 (uintptr_t)INT_MAX); 838 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >= 839 sizeof (struct T_unitdata_req)) { 840 /* Detect valid T_UNITDATA_REQ here */ 841 if (((union T_primitives *)(uintptr_t)rptr)->type 842 == T_UNITDATA_REQ) 843 break; 844 } 845 /* FALLTHRU */ 846 default: 847 848 if (isuioq(q) && (error = struioget(q, mp, dp, 0))) { 849 /* 850 * Uio error of some sort, so just return the error. 851 */ 852 goto done; 853 } 854 dp->d_mp = 0; 855 rds_wput_other(q, mp); 856 return (0); 857 } 858 859 dp->d_mp = 0; 860 error = rds_wput_data(q, mp, &dp->d_uio); 861 done: 862 if (error == EWOULDBLOCK || error == EINVAL) 863 error = EIO; 864 865 return (error); 866 } 867 868 static void 869 rds_rsrv(queue_t *q) 870 { 871 rds_t *rds = (rds_t *)q->q_ptr; 872 ulong_t current_port_quota; 873 874 /* update the port quota to the current level */ 875 current_port_quota = RDS_GET_PORT_QUOTA(); 876 if (rds->rds_port_quota != current_port_quota) { 877 rds->rds_port_quota = current_port_quota; 878 (void) proto_set_rx_hiwat(q, NULL, 879 rds->rds_port_quota * UserBufferSize); 880 } 881 882 /* No more messages in the q, unstall the socket */ 883 rds_transport_ops->rds_transport_resume_port(ntohs(rds->rds_port)); 884 } 885 886 int 887 rds_close_transport_driver() 888 { 889 ASSERT(rds_transport_ops != NULL); 890 891 rw_enter(&rds_transport_lock, RW_WRITER); 892 if (rds_transport_handle != NULL) { 893 rds_transport_ops->rds_transport_close_ib(); 894 (void) ldi_close(rds_transport_handle, FNDELAY, kcred); 895 rds_transport_handle = NULL; 896 } 897 rw_exit(&rds_transport_lock); 898 899 return (0); 900 } 901 902 903 int 904 rds_open_transport_driver() 905 { 906 int ret = 0; 907 908 rw_enter(&rds_transport_lock, RW_WRITER); 909 if (rds_transport_handle != NULL) { 910 /* 911 * Someone beat us to it. 912 */ 913 goto done; 914 } 915 916 if (ibt_hw_is_present() == 0) { 917 ret = ENODEV; 918 goto done; 919 } 920 921 if (rds_li == NULL) { 922 ret = EPROTONOSUPPORT; 923 goto done; 924 } 925 926 ret = ldi_open_by_name("/devices/ib/rdsib@0:rdsib", 927 FREAD | FWRITE, kcred, &rds_transport_handle, rds_li); 928 if (ret != 0) { 929 ret = EPROTONOSUPPORT; 930 rds_transport_handle = NULL; 931 goto done; 932 } 933 934 ret = rds_transport_ops->rds_transport_open_ib(); 935 if (ret != 0) { 936 (void) ldi_close(rds_transport_handle, FNDELAY, kcred); 937 rds_transport_handle = NULL; 938 } 939 done: 940 rw_exit(&rds_transport_lock); 941 return (ret); 942 } 943 944 static struct module_info info = { 945 0, "rds", 1, INFPSZ, 65536, 1024 946 }; 947 948 static struct qinit rinit = { 949 NULL, (pfi_t)rds_rsrv, rds_open, rds_close, NULL, &info 950 }; 951 952 static struct qinit winit = { 953 (pfi_t)rds_wput, NULL, rds_open, rds_close, NULL, &info, 954 NULL, rds_wrw, NULL, STRUIOT_STANDARD 955 }; 956 957 struct streamtab rdsinfo = { 958 &rinit, &winit, NULL, NULL 959 }; 960 961 DDI_DEFINE_STREAM_OPS(rds_devops, nulldev, nulldev, rds_attach, rds_detach, 962 nulldev, rds_info, RDS_DEVMTFLAGS, &RDS_STRTAB, ddi_quiesce_not_supported); 963 964 /* 965 * Module linkage information for the kernel. 966 */ 967 static struct modldrv modldrv = { 968 &mod_driverops, 969 RDS_DEVDESC, 970 &rds_devops 971 }; 972 973 static struct modlinkage modlinkage = { 974 MODREV_1, 975 &modldrv, 976 NULL 977 }; 978 979 int 980 _init(void) 981 { 982 int ret; 983 984 rds_init(); 985 986 ret = mod_install(&modlinkage); 987 if (ret != 0) 988 goto done; 989 ret = ldi_ident_from_mod(&modlinkage, &rds_li); 990 if (ret != 0) 991 rds_li = NULL; 992 done: 993 return (ret); 994 } 995 996 int 997 _fini(void) 998 { 999 int ret; 1000 1001 ret = mod_remove(&modlinkage); 1002 if (ret != 0) { 1003 return (ret); 1004 } 1005 1006 rds_fini(); 1007 1008 ldi_ident_release(rds_li); 1009 return (0); 1010 } 1011 1012 int 1013 _info(struct modinfo *modinfop) 1014 { 1015 return (mod_info(&modlinkage, modinfop)); 1016 } 1017