1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 30 * $FreeBSD$ 31 */ 32 33 #include "opt_ipfw.h" 34 #include "opt_ipsec.h" 35 #include "opt_mac.h" 36 #include "opt_mbuf_stress_test.h" 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/kernel.h> 41 #include <sys/mac.h> 42 #include <sys/malloc.h> 43 #include <sys/mbuf.h> 44 #include <sys/protosw.h> 45 #include <sys/socket.h> 46 #include <sys/socketvar.h> 47 #include <sys/sysctl.h> 48 49 #include <net/if.h> 50 #include <net/netisr.h> 51 #include <net/pfil.h> 52 #include <net/route.h> 53 54 #include <netinet/in.h> 55 #include <netinet/in_systm.h> 56 #include <netinet/ip.h> 57 #include <netinet/in_pcb.h> 58 #include <netinet/in_var.h> 59 #include <netinet/ip_var.h> 60 #include <netinet/ip_options.h> 61 62 #include <machine/in_cksum.h> 63 64 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); 65 66 #ifdef IPSEC 67 #include <netinet6/ipsec.h> 68 #include <netkey/key.h> 69 #ifdef IPSEC_DEBUG 70 #include <netkey/key_debug.h> 71 #else 72 #define KEYDEBUG(lev,arg) 73 #endif 74 #endif /*IPSEC*/ 75 76 #ifdef FAST_IPSEC 77 #include <netipsec/ipsec.h> 78 #include <netipsec/xform.h> 79 #include <netipsec/key.h> 80 #endif /*FAST_IPSEC*/ 81 82 #define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ 83 x, (ntohl(a.s_addr)>>24)&0xFF,\ 84 (ntohl(a.s_addr)>>16)&0xFF,\ 85 (ntohl(a.s_addr)>>8)&0xFF,\ 86 (ntohl(a.s_addr))&0xFF, y); 87 88 u_short ip_id; 89 90 #ifdef MBUF_STRESS_TEST 91 int mbuf_frag_size = 0; 92 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, 93 &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); 94 #endif 95 96 static struct ifnet *ip_multicast_if(struct in_addr *, int *); 97 static void ip_mloopback 98 (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); 99 static int ip_getmoptions(struct inpcb *, struct sockopt *); 100 static int ip_setmoptions(struct inpcb *, struct sockopt *); 101 102 103 extern struct protosw inetsw[]; 104 105 /* 106 * IP output. The packet in mbuf chain m contains a skeletal IP 107 * header (with len, off, ttl, proto, tos, src, dst). 108 * The mbuf chain containing the packet will be freed. 109 * The mbuf opt, if present, will not be freed. 110 * In the IP forwarding case, the packet will arrive with options already 111 * inserted, so must have a NULL opt pointer. 112 */ 113 int 114 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, 115 int flags, struct ip_moptions *imo, struct inpcb *inp) 116 { 117 struct ip *ip; 118 struct ifnet *ifp = NULL; /* keep compiler happy */ 119 struct mbuf *m0; 120 int hlen = sizeof (struct ip); 121 int len, error = 0; 122 struct sockaddr_in *dst = NULL; /* keep compiler happy */ 123 struct in_ifaddr *ia = NULL; 124 int isbroadcast, sw_csum; 125 struct route iproute; 126 struct in_addr odst; 127 #ifdef IPFIREWALL_FORWARD 128 struct m_tag *fwd_tag = NULL; 129 #endif 130 #ifdef IPSEC 131 struct secpolicy *sp = NULL; 132 #endif 133 #ifdef FAST_IPSEC 134 struct secpolicy *sp = NULL; 135 struct tdb_ident *tdbi; 136 struct m_tag *mtag; 137 int s; 138 #endif /* FAST_IPSEC */ 139 140 M_ASSERTPKTHDR(m); 141 142 if (ro == NULL) { 143 ro = &iproute; 144 bzero(ro, sizeof (*ro)); 145 } 146 147 if (inp != NULL) 148 INP_LOCK_ASSERT(inp); 149 150 if (opt) { 151 len = 0; 152 m = ip_insertoptions(m, opt, &len); 153 if (len != 0) 154 hlen = len; 155 } 156 ip = mtod(m, struct ip *); 157 158 /* 159 * Fill in IP header. If we are not allowing fragmentation, 160 * then the ip_id field is meaningless, but we don't set it 161 * to zero. Doing so causes various problems when devices along 162 * the path (routers, load balancers, firewalls, etc.) illegally 163 * disable DF on our packet. Note that a 16-bit counter 164 * will wrap around in less than 10 seconds at 100 Mbit/s on a 165 * medium with MTU 1500. See Steven M. Bellovin, "A Technique 166 * for Counting NATted Hosts", Proc. IMW'02, available at 167 * <http://www.research.att.com/~smb/papers/fnat.pdf>. 168 */ 169 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 170 ip->ip_v = IPVERSION; 171 ip->ip_hl = hlen >> 2; 172 ip->ip_id = ip_newid(); 173 ipstat.ips_localout++; 174 } else { 175 hlen = ip->ip_hl << 2; 176 } 177 178 dst = (struct sockaddr_in *)&ro->ro_dst; 179 again: 180 /* 181 * If there is a cached route, 182 * check that it is to the same destination 183 * and is still up. If not, free it and try again. 184 * The address family should also be checked in case of sharing the 185 * cache with IPv6. 186 */ 187 if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || 188 dst->sin_family != AF_INET || 189 dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { 190 RTFREE(ro->ro_rt); 191 ro->ro_rt = (struct rtentry *)0; 192 } 193 #ifdef IPFIREWALL_FORWARD 194 if (ro->ro_rt == NULL && fwd_tag == NULL) { 195 #else 196 if (ro->ro_rt == NULL) { 197 #endif 198 bzero(dst, sizeof(*dst)); 199 dst->sin_family = AF_INET; 200 dst->sin_len = sizeof(*dst); 201 dst->sin_addr = ip->ip_dst; 202 } 203 /* 204 * If routing to interface only, 205 * short circuit routing lookup. 206 */ 207 if (flags & IP_ROUTETOIF) { 208 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && 209 (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { 210 ipstat.ips_noroute++; 211 error = ENETUNREACH; 212 goto bad; 213 } 214 ifp = ia->ia_ifp; 215 ip->ip_ttl = 1; 216 isbroadcast = in_broadcast(dst->sin_addr, ifp); 217 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && 218 imo != NULL && imo->imo_multicast_ifp != NULL) { 219 /* 220 * Bypass the normal routing lookup for multicast 221 * packets if the interface is specified. 222 */ 223 ifp = imo->imo_multicast_ifp; 224 IFP_TO_IA(ifp, ia); 225 isbroadcast = 0; /* fool gcc */ 226 } else { 227 /* 228 * We want to do any cloning requested by the link layer, 229 * as this is probably required in all cases for correct 230 * operation (as it is for ARP). 231 */ 232 if (ro->ro_rt == NULL) 233 rtalloc_ign(ro, 0); 234 if (ro->ro_rt == NULL) { 235 ipstat.ips_noroute++; 236 error = EHOSTUNREACH; 237 goto bad; 238 } 239 ia = ifatoia(ro->ro_rt->rt_ifa); 240 ifp = ro->ro_rt->rt_ifp; 241 ro->ro_rt->rt_rmx.rmx_pksent++; 242 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 243 dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; 244 if (ro->ro_rt->rt_flags & RTF_HOST) 245 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); 246 else 247 isbroadcast = in_broadcast(dst->sin_addr, ifp); 248 } 249 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 250 struct in_multi *inm; 251 252 m->m_flags |= M_MCAST; 253 /* 254 * IP destination address is multicast. Make sure "dst" 255 * still points to the address in "ro". (It may have been 256 * changed to point to a gateway address, above.) 257 */ 258 dst = (struct sockaddr_in *)&ro->ro_dst; 259 /* 260 * See if the caller provided any multicast options 261 */ 262 if (imo != NULL) { 263 ip->ip_ttl = imo->imo_multicast_ttl; 264 if (imo->imo_multicast_vif != -1) 265 ip->ip_src.s_addr = 266 ip_mcast_src ? 267 ip_mcast_src(imo->imo_multicast_vif) : 268 INADDR_ANY; 269 } else 270 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 271 /* 272 * Confirm that the outgoing interface supports multicast. 273 */ 274 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { 275 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 276 ipstat.ips_noroute++; 277 error = ENETUNREACH; 278 goto bad; 279 } 280 } 281 /* 282 * If source address not specified yet, use address 283 * of outgoing interface. 284 */ 285 if (ip->ip_src.s_addr == INADDR_ANY) { 286 /* Interface may have no addresses. */ 287 if (ia != NULL) 288 ip->ip_src = IA_SIN(ia)->sin_addr; 289 } 290 291 IN_MULTI_LOCK(); 292 IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); 293 if (inm != NULL && 294 (imo == NULL || imo->imo_multicast_loop)) { 295 IN_MULTI_UNLOCK(); 296 /* 297 * If we belong to the destination multicast group 298 * on the outgoing interface, and the caller did not 299 * forbid loopback, loop back a copy. 300 */ 301 ip_mloopback(ifp, m, dst, hlen); 302 } 303 else { 304 IN_MULTI_UNLOCK(); 305 /* 306 * If we are acting as a multicast router, perform 307 * multicast forwarding as if the packet had just 308 * arrived on the interface to which we are about 309 * to send. The multicast forwarding function 310 * recursively calls this function, using the 311 * IP_FORWARDING flag to prevent infinite recursion. 312 * 313 * Multicasts that are looped back by ip_mloopback(), 314 * above, will be forwarded by the ip_input() routine, 315 * if necessary. 316 */ 317 if (ip_mrouter && (flags & IP_FORWARDING) == 0) { 318 /* 319 * If rsvp daemon is not running, do not 320 * set ip_moptions. This ensures that the packet 321 * is multicast and not just sent down one link 322 * as prescribed by rsvpd. 323 */ 324 if (!rsvp_on) 325 imo = NULL; 326 if (ip_mforward && 327 ip_mforward(ip, ifp, m, imo) != 0) { 328 m_freem(m); 329 goto done; 330 } 331 } 332 } 333 334 /* 335 * Multicasts with a time-to-live of zero may be looped- 336 * back, above, but must not be transmitted on a network. 337 * Also, multicasts addressed to the loopback interface 338 * are not sent -- the above call to ip_mloopback() will 339 * loop back a copy if this host actually belongs to the 340 * destination group on the loopback interface. 341 */ 342 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 343 m_freem(m); 344 goto done; 345 } 346 347 goto sendit; 348 } 349 #ifndef notdef 350 /* 351 * If the source address is not specified yet, use the address 352 * of the outoing interface. 353 */ 354 if (ip->ip_src.s_addr == INADDR_ANY) { 355 /* Interface may have no addresses. */ 356 if (ia != NULL) { 357 ip->ip_src = IA_SIN(ia)->sin_addr; 358 } 359 } 360 #endif /* notdef */ 361 /* 362 * Verify that we have any chance at all of being able to queue the 363 * packet or packet fragments, unless ALTQ is enabled on the given 364 * interface in which case packetdrop should be done by queueing. 365 */ 366 #ifdef ALTQ 367 if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && 368 ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= 369 ifp->if_snd.ifq_maxlen)) 370 #else 371 if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= 372 ifp->if_snd.ifq_maxlen) 373 #endif /* ALTQ */ 374 { 375 error = ENOBUFS; 376 ipstat.ips_odropped++; 377 ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); 378 goto bad; 379 } 380 381 /* 382 * Look for broadcast address and 383 * verify user is allowed to send 384 * such a packet. 385 */ 386 if (isbroadcast) { 387 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 388 error = EADDRNOTAVAIL; 389 goto bad; 390 } 391 if ((flags & IP_ALLOWBROADCAST) == 0) { 392 error = EACCES; 393 goto bad; 394 } 395 /* don't allow broadcast messages to be fragmented */ 396 if (ip->ip_len > ifp->if_mtu) { 397 error = EMSGSIZE; 398 goto bad; 399 } 400 if (flags & IP_SENDONES) 401 ip->ip_dst.s_addr = INADDR_BROADCAST; 402 m->m_flags |= M_BCAST; 403 } else { 404 m->m_flags &= ~M_BCAST; 405 } 406 407 sendit: 408 #ifdef IPSEC 409 /* get SP for this packet */ 410 if (inp == NULL) 411 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, 412 flags, &error); 413 else 414 sp = ipsec4_getpolicybypcb(m, IPSEC_DIR_OUTBOUND, inp, &error); 415 416 if (sp == NULL) { 417 ipsecstat.out_inval++; 418 goto bad; 419 } 420 421 error = 0; 422 423 /* check policy */ 424 switch (sp->policy) { 425 case IPSEC_POLICY_DISCARD: 426 /* 427 * This packet is just discarded. 428 */ 429 ipsecstat.out_polvio++; 430 goto bad; 431 432 case IPSEC_POLICY_BYPASS: 433 case IPSEC_POLICY_NONE: 434 case IPSEC_POLICY_TCP: 435 /* no need to do IPsec. */ 436 goto skip_ipsec; 437 438 case IPSEC_POLICY_IPSEC: 439 if (sp->req == NULL) { 440 /* acquire a policy */ 441 error = key_spdacquire(sp); 442 goto bad; 443 } 444 break; 445 446 case IPSEC_POLICY_ENTRUST: 447 default: 448 printf("ip_output: Invalid policy found. %d\n", sp->policy); 449 } 450 { 451 struct ipsec_output_state state; 452 bzero(&state, sizeof(state)); 453 state.m = m; 454 if (flags & IP_ROUTETOIF) { 455 state.ro = &iproute; 456 bzero(&iproute, sizeof(iproute)); 457 } else 458 state.ro = ro; 459 state.dst = (struct sockaddr *)dst; 460 461 ip->ip_sum = 0; 462 463 /* 464 * XXX 465 * delayed checksums are not currently compatible with IPsec 466 */ 467 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 468 in_delayed_cksum(m); 469 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 470 } 471 472 ip->ip_len = htons(ip->ip_len); 473 ip->ip_off = htons(ip->ip_off); 474 475 error = ipsec4_output(&state, sp, flags); 476 477 m = state.m; 478 if (flags & IP_ROUTETOIF) { 479 /* 480 * if we have tunnel mode SA, we may need to ignore 481 * IP_ROUTETOIF. 482 */ 483 if (state.ro != &iproute || state.ro->ro_rt != NULL) { 484 flags &= ~IP_ROUTETOIF; 485 ro = state.ro; 486 } 487 } else 488 ro = state.ro; 489 dst = (struct sockaddr_in *)state.dst; 490 if (error) { 491 /* mbuf is already reclaimed in ipsec4_output. */ 492 m = NULL; 493 switch (error) { 494 case EHOSTUNREACH: 495 case ENETUNREACH: 496 case EMSGSIZE: 497 case ENOBUFS: 498 case ENOMEM: 499 break; 500 default: 501 printf("ip4_output (ipsec): error code %d\n", error); 502 /*fall through*/ 503 case ENOENT: 504 /* don't show these error codes to the user */ 505 error = 0; 506 break; 507 } 508 goto bad; 509 } 510 511 /* be sure to update variables that are affected by ipsec4_output() */ 512 ip = mtod(m, struct ip *); 513 hlen = ip->ip_hl << 2; 514 if (ro->ro_rt == NULL) { 515 if ((flags & IP_ROUTETOIF) == 0) { 516 printf("ip_output: " 517 "can't update route after IPsec processing\n"); 518 error = EHOSTUNREACH; /*XXX*/ 519 goto bad; 520 } 521 } else { 522 if (state.encap) { 523 ia = ifatoia(ro->ro_rt->rt_ifa); 524 ifp = ro->ro_rt->rt_ifp; 525 } 526 } 527 } 528 529 /* make it flipped, again. */ 530 ip->ip_len = ntohs(ip->ip_len); 531 ip->ip_off = ntohs(ip->ip_off); 532 skip_ipsec: 533 #endif /*IPSEC*/ 534 #ifdef FAST_IPSEC 535 /* 536 * Check the security policy (SP) for the packet and, if 537 * required, do IPsec-related processing. There are two 538 * cases here; the first time a packet is sent through 539 * it will be untagged and handled by ipsec4_checkpolicy. 540 * If the packet is resubmitted to ip_output (e.g. after 541 * AH, ESP, etc. processing), there will be a tag to bypass 542 * the lookup and related policy checking. 543 */ 544 mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL); 545 s = splnet(); 546 if (mtag != NULL) { 547 tdbi = (struct tdb_ident *)(mtag + 1); 548 sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND); 549 if (sp == NULL) 550 error = -EINVAL; /* force silent drop */ 551 m_tag_delete(m, mtag); 552 } else { 553 sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags, 554 &error, inp); 555 } 556 /* 557 * There are four return cases: 558 * sp != NULL apply IPsec policy 559 * sp == NULL, error == 0 no IPsec handling needed 560 * sp == NULL, error == -EINVAL discard packet w/o error 561 * sp == NULL, error != 0 discard packet, report error 562 */ 563 if (sp != NULL) { 564 /* Loop detection, check if ipsec processing already done */ 565 KASSERT(sp->req != NULL, ("ip_output: no ipsec request")); 566 for (mtag = m_tag_first(m); mtag != NULL; 567 mtag = m_tag_next(m, mtag)) { 568 if (mtag->m_tag_cookie != MTAG_ABI_COMPAT) 569 continue; 570 if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE && 571 mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED) 572 continue; 573 /* 574 * Check if policy has an SA associated with it. 575 * This can happen when an SP has yet to acquire 576 * an SA; e.g. on first reference. If it occurs, 577 * then we let ipsec4_process_packet do its thing. 578 */ 579 if (sp->req->sav == NULL) 580 break; 581 tdbi = (struct tdb_ident *)(mtag + 1); 582 if (tdbi->spi == sp->req->sav->spi && 583 tdbi->proto == sp->req->sav->sah->saidx.proto && 584 bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst, 585 sizeof (union sockaddr_union)) == 0) { 586 /* 587 * No IPsec processing is needed, free 588 * reference to SP. 589 * 590 * NB: null pointer to avoid free at 591 * done: below. 592 */ 593 KEY_FREESP(&sp), sp = NULL; 594 splx(s); 595 goto spd_done; 596 } 597 } 598 599 /* 600 * Do delayed checksums now because we send before 601 * this is done in the normal processing path. 602 */ 603 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 604 in_delayed_cksum(m); 605 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 606 } 607 608 ip->ip_len = htons(ip->ip_len); 609 ip->ip_off = htons(ip->ip_off); 610 611 /* NB: callee frees mbuf */ 612 error = ipsec4_process_packet(m, sp->req, flags, 0); 613 /* 614 * Preserve KAME behaviour: ENOENT can be returned 615 * when an SA acquire is in progress. Don't propagate 616 * this to user-level; it confuses applications. 617 * 618 * XXX this will go away when the SADB is redone. 619 */ 620 if (error == ENOENT) 621 error = 0; 622 splx(s); 623 goto done; 624 } else { 625 splx(s); 626 627 if (error != 0) { 628 /* 629 * Hack: -EINVAL is used to signal that a packet 630 * should be silently discarded. This is typically 631 * because we asked key management for an SA and 632 * it was delayed (e.g. kicked up to IKE). 633 */ 634 if (error == -EINVAL) 635 error = 0; 636 goto bad; 637 } else { 638 /* No IPsec processing for this packet. */ 639 } 640 #ifdef notyet 641 /* 642 * If deferred crypto processing is needed, check that 643 * the interface supports it. 644 */ 645 mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL); 646 if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) { 647 /* notify IPsec to do its own crypto */ 648 ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1)); 649 error = EHOSTUNREACH; 650 goto bad; 651 } 652 #endif 653 } 654 spd_done: 655 #endif /* FAST_IPSEC */ 656 657 /* Jump over all PFIL processing if hooks are not active. */ 658 if (inet_pfil_hook.ph_busy_count == -1) 659 goto passout; 660 661 /* Run through list of hooks for output packets. */ 662 odst.s_addr = ip->ip_dst.s_addr; 663 error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); 664 if (error != 0 || m == NULL) 665 goto done; 666 667 ip = mtod(m, struct ip *); 668 669 /* See if destination IP address was changed by packet filter. */ 670 if (odst.s_addr != ip->ip_dst.s_addr) { 671 m->m_flags |= M_SKIP_FIREWALL; 672 /* If destination is now ourself drop to ip_input(). */ 673 if (in_localip(ip->ip_dst)) { 674 m->m_flags |= M_FASTFWD_OURS; 675 if (m->m_pkthdr.rcvif == NULL) 676 m->m_pkthdr.rcvif = loif; 677 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 678 m->m_pkthdr.csum_flags |= 679 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 680 m->m_pkthdr.csum_data = 0xffff; 681 } 682 m->m_pkthdr.csum_flags |= 683 CSUM_IP_CHECKED | CSUM_IP_VALID; 684 685 error = netisr_queue(NETISR_IP, m); 686 goto done; 687 } else 688 goto again; /* Redo the routing table lookup. */ 689 } 690 691 #ifdef IPFIREWALL_FORWARD 692 /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ 693 if (m->m_flags & M_FASTFWD_OURS) { 694 if (m->m_pkthdr.rcvif == NULL) 695 m->m_pkthdr.rcvif = loif; 696 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 697 m->m_pkthdr.csum_flags |= 698 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 699 m->m_pkthdr.csum_data = 0xffff; 700 } 701 m->m_pkthdr.csum_flags |= 702 CSUM_IP_CHECKED | CSUM_IP_VALID; 703 704 error = netisr_queue(NETISR_IP, m); 705 goto done; 706 } 707 /* Or forward to some other address? */ 708 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 709 if (fwd_tag) { 710 #ifndef IPFIREWALL_FORWARD_EXTENDED 711 if (!in_localip(ip->ip_src) && !in_localaddr(ip->ip_dst)) { 712 #endif 713 dst = (struct sockaddr_in *)&ro->ro_dst; 714 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); 715 m->m_flags |= M_SKIP_FIREWALL; 716 m_tag_delete(m, fwd_tag); 717 goto again; 718 #ifndef IPFIREWALL_FORWARD_EXTENDED 719 } else { 720 m_tag_delete(m, fwd_tag); 721 /* Continue. */ 722 } 723 #endif 724 } 725 #endif /* IPFIREWALL_FORWARD */ 726 727 passout: 728 /* 127/8 must not appear on wire - RFC1122. */ 729 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 730 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 731 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 732 ipstat.ips_badaddr++; 733 error = EADDRNOTAVAIL; 734 goto bad; 735 } 736 } 737 738 m->m_pkthdr.csum_flags |= CSUM_IP; 739 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; 740 if (sw_csum & CSUM_DELAY_DATA) { 741 in_delayed_cksum(m); 742 sw_csum &= ~CSUM_DELAY_DATA; 743 } 744 m->m_pkthdr.csum_flags &= ifp->if_hwassist; 745 746 /* 747 * If small enough for interface, or the interface will take 748 * care of the fragmentation for us, can just send directly. 749 */ 750 if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT && 751 ((ip->ip_off & IP_DF) == 0))) { 752 ip->ip_len = htons(ip->ip_len); 753 ip->ip_off = htons(ip->ip_off); 754 ip->ip_sum = 0; 755 if (sw_csum & CSUM_DELAY_IP) 756 ip->ip_sum = in_cksum(m, hlen); 757 758 /* Record statistics for this interface address. */ 759 if (!(flags & IP_FORWARDING) && ia) { 760 ia->ia_ifa.if_opackets++; 761 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 762 } 763 764 #ifdef IPSEC 765 /* clean ipsec history once it goes out of the node */ 766 ipsec_delaux(m); 767 #endif 768 769 #ifdef MBUF_STRESS_TEST 770 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) 771 m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); 772 #endif 773 /* 774 * Reset layer specific mbuf flags 775 * to avoid confusing lower layers. 776 */ 777 m->m_flags &= ~(M_PROTOFLAGS); 778 779 error = (*ifp->if_output)(ifp, m, 780 (struct sockaddr *)dst, ro->ro_rt); 781 goto done; 782 } 783 784 if (ip->ip_off & IP_DF) { 785 error = EMSGSIZE; 786 /* 787 * This case can happen if the user changed the MTU 788 * of an interface after enabling IP on it. Because 789 * most netifs don't keep track of routes pointing to 790 * them, there is no way for one to update all its 791 * routes when the MTU is changed. 792 */ 793 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && 794 (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { 795 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 796 } 797 ipstat.ips_cantfrag++; 798 goto bad; 799 } 800 801 /* 802 * Too large for interface; fragment if possible. If successful, 803 * on return, m will point to a list of packets to be sent. 804 */ 805 error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum); 806 if (error) 807 goto bad; 808 for (; m; m = m0) { 809 m0 = m->m_nextpkt; 810 m->m_nextpkt = 0; 811 #ifdef IPSEC 812 /* clean ipsec history once it goes out of the node */ 813 ipsec_delaux(m); 814 #endif 815 if (error == 0) { 816 /* Record statistics for this interface address. */ 817 if (ia != NULL) { 818 ia->ia_ifa.if_opackets++; 819 ia->ia_ifa.if_obytes += m->m_pkthdr.len; 820 } 821 /* 822 * Reset layer specific mbuf flags 823 * to avoid confusing upper layers. 824 */ 825 m->m_flags &= ~(M_PROTOFLAGS); 826 827 error = (*ifp->if_output)(ifp, m, 828 (struct sockaddr *)dst, ro->ro_rt); 829 } else 830 m_freem(m); 831 } 832 833 if (error == 0) 834 ipstat.ips_fragmented++; 835 836 done: 837 if (ro == &iproute && ro->ro_rt) { 838 RTFREE(ro->ro_rt); 839 } 840 #ifdef IPSEC 841 if (sp != NULL) { 842 KEYDEBUG(KEYDEBUG_IPSEC_STAMP, 843 printf("DP ip_output call free SP:%p\n", sp)); 844 key_freesp(sp); 845 } 846 #endif 847 #ifdef FAST_IPSEC 848 if (sp != NULL) 849 KEY_FREESP(&sp); 850 #endif 851 return (error); 852 bad: 853 m_freem(m); 854 goto done; 855 } 856 857 /* 858 * Create a chain of fragments which fit the given mtu. m_frag points to the 859 * mbuf to be fragmented; on return it points to the chain with the fragments. 860 * Return 0 if no error. If error, m_frag may contain a partially built 861 * chain of fragments that should be freed by the caller. 862 * 863 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) 864 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 865 */ 866 int 867 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, 868 u_long if_hwassist_flags, int sw_csum) 869 { 870 int error = 0; 871 int hlen = ip->ip_hl << 2; 872 int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ 873 int off; 874 struct mbuf *m0 = *m_frag; /* the original packet */ 875 int firstlen; 876 struct mbuf **mnext; 877 int nfrags; 878 879 if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ 880 ipstat.ips_cantfrag++; 881 return EMSGSIZE; 882 } 883 884 /* 885 * Must be able to put at least 8 bytes per fragment. 886 */ 887 if (len < 8) 888 return EMSGSIZE; 889 890 /* 891 * If the interface will not calculate checksums on 892 * fragmented packets, then do it here. 893 */ 894 if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && 895 (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { 896 in_delayed_cksum(m0); 897 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 898 } 899 900 if (len > PAGE_SIZE) { 901 /* 902 * Fragment large datagrams such that each segment 903 * contains a multiple of PAGE_SIZE amount of data, 904 * plus headers. This enables a receiver to perform 905 * page-flipping zero-copy optimizations. 906 * 907 * XXX When does this help given that sender and receiver 908 * could have different page sizes, and also mtu could 909 * be less than the receiver's page size ? 910 */ 911 int newlen; 912 struct mbuf *m; 913 914 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) 915 off += m->m_len; 916 917 /* 918 * firstlen (off - hlen) must be aligned on an 919 * 8-byte boundary 920 */ 921 if (off < hlen) 922 goto smart_frag_failure; 923 off = ((off - hlen) & ~7) + hlen; 924 newlen = (~PAGE_MASK) & mtu; 925 if ((newlen + sizeof (struct ip)) > mtu) { 926 /* we failed, go back the default */ 927 smart_frag_failure: 928 newlen = len; 929 off = hlen + len; 930 } 931 len = newlen; 932 933 } else { 934 off = hlen + len; 935 } 936 937 firstlen = off - hlen; 938 mnext = &m0->m_nextpkt; /* pointer to next packet */ 939 940 /* 941 * Loop through length of segment after first fragment, 942 * make new header and copy data of each part and link onto chain. 943 * Here, m0 is the original packet, m is the fragment being created. 944 * The fragments are linked off the m_nextpkt of the original 945 * packet, which after processing serves as the first fragment. 946 */ 947 for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { 948 struct ip *mhip; /* ip header on the fragment */ 949 struct mbuf *m; 950 int mhlen = sizeof (struct ip); 951 952 MGETHDR(m, M_DONTWAIT, MT_DATA); 953 if (m == NULL) { 954 error = ENOBUFS; 955 ipstat.ips_odropped++; 956 goto done; 957 } 958 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 959 /* 960 * In the first mbuf, leave room for the link header, then 961 * copy the original IP header including options. The payload 962 * goes into an additional mbuf chain returned by m_copy(). 963 */ 964 m->m_data += max_linkhdr; 965 mhip = mtod(m, struct ip *); 966 *mhip = *ip; 967 if (hlen > sizeof (struct ip)) { 968 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 969 mhip->ip_v = IPVERSION; 970 mhip->ip_hl = mhlen >> 2; 971 } 972 m->m_len = mhlen; 973 /* XXX do we need to add ip->ip_off below ? */ 974 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; 975 if (off + len >= ip->ip_len) { /* last fragment */ 976 len = ip->ip_len - off; 977 m->m_flags |= M_LASTFRAG; 978 } else 979 mhip->ip_off |= IP_MF; 980 mhip->ip_len = htons((u_short)(len + mhlen)); 981 m->m_next = m_copy(m0, off, len); 982 if (m->m_next == NULL) { /* copy failed */ 983 m_free(m); 984 error = ENOBUFS; /* ??? */ 985 ipstat.ips_odropped++; 986 goto done; 987 } 988 m->m_pkthdr.len = mhlen + len; 989 m->m_pkthdr.rcvif = NULL; 990 #ifdef MAC 991 mac_create_fragment(m0, m); 992 #endif 993 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 994 mhip->ip_off = htons(mhip->ip_off); 995 mhip->ip_sum = 0; 996 if (sw_csum & CSUM_DELAY_IP) 997 mhip->ip_sum = in_cksum(m, mhlen); 998 *mnext = m; 999 mnext = &m->m_nextpkt; 1000 } 1001 ipstat.ips_ofragments += nfrags; 1002 1003 /* set first marker for fragment chain */ 1004 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 1005 m0->m_pkthdr.csum_data = nfrags; 1006 1007 /* 1008 * Update first fragment by trimming what's been copied out 1009 * and updating header. 1010 */ 1011 m_adj(m0, hlen + firstlen - ip->ip_len); 1012 m0->m_pkthdr.len = hlen + firstlen; 1013 ip->ip_len = htons((u_short)m0->m_pkthdr.len); 1014 ip->ip_off |= IP_MF; 1015 ip->ip_off = htons(ip->ip_off); 1016 ip->ip_sum = 0; 1017 if (sw_csum & CSUM_DELAY_IP) 1018 ip->ip_sum = in_cksum(m0, hlen); 1019 1020 done: 1021 *m_frag = m0; 1022 return error; 1023 } 1024 1025 void 1026 in_delayed_cksum(struct mbuf *m) 1027 { 1028 struct ip *ip; 1029 u_short csum, offset; 1030 1031 ip = mtod(m, struct ip *); 1032 offset = ip->ip_hl << 2 ; 1033 csum = in_cksum_skip(m, ip->ip_len, offset); 1034 if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) 1035 csum = 0xffff; 1036 offset += m->m_pkthdr.csum_data; /* checksum offset */ 1037 1038 if (offset + sizeof(u_short) > m->m_len) { 1039 printf("delayed m_pullup, m->len: %d off: %d p: %d\n", 1040 m->m_len, offset, ip->ip_p); 1041 /* 1042 * XXX 1043 * this shouldn't happen, but if it does, the 1044 * correct behavior may be to insert the checksum 1045 * in the existing chain instead of rearranging it. 1046 */ 1047 m = m_pullup(m, offset + sizeof(u_short)); 1048 } 1049 *(u_short *)(m->m_data + offset) = csum; 1050 } 1051 1052 /* 1053 * IP socket option processing. 1054 */ 1055 int 1056 ip_ctloutput(so, sopt) 1057 struct socket *so; 1058 struct sockopt *sopt; 1059 { 1060 struct inpcb *inp = sotoinpcb(so); 1061 int error, optval; 1062 1063 error = optval = 0; 1064 if (sopt->sopt_level != IPPROTO_IP) { 1065 return (EINVAL); 1066 } 1067 1068 switch (sopt->sopt_dir) { 1069 case SOPT_SET: 1070 switch (sopt->sopt_name) { 1071 case IP_OPTIONS: 1072 #ifdef notyet 1073 case IP_RETOPTS: 1074 #endif 1075 { 1076 struct mbuf *m; 1077 if (sopt->sopt_valsize > MLEN) { 1078 error = EMSGSIZE; 1079 break; 1080 } 1081 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 1082 if (m == NULL) { 1083 error = ENOBUFS; 1084 break; 1085 } 1086 m->m_len = sopt->sopt_valsize; 1087 error = sooptcopyin(sopt, mtod(m, char *), m->m_len, 1088 m->m_len); 1089 INP_LOCK(inp); 1090 error = ip_pcbopts(inp, sopt->sopt_name, m); 1091 INP_UNLOCK(inp); 1092 return (error); 1093 } 1094 1095 case IP_TOS: 1096 case IP_TTL: 1097 case IP_MINTTL: 1098 case IP_RECVOPTS: 1099 case IP_RECVRETOPTS: 1100 case IP_RECVDSTADDR: 1101 case IP_RECVTTL: 1102 case IP_RECVIF: 1103 case IP_FAITH: 1104 case IP_ONESBCAST: 1105 case IP_DONTFRAG: 1106 error = sooptcopyin(sopt, &optval, sizeof optval, 1107 sizeof optval); 1108 if (error) 1109 break; 1110 1111 switch (sopt->sopt_name) { 1112 case IP_TOS: 1113 inp->inp_ip_tos = optval; 1114 break; 1115 1116 case IP_TTL: 1117 inp->inp_ip_ttl = optval; 1118 break; 1119 1120 case IP_MINTTL: 1121 if (optval > 0 && optval <= MAXTTL) 1122 inp->inp_ip_minttl = optval; 1123 else 1124 error = EINVAL; 1125 break; 1126 1127 #define OPTSET(bit) do { \ 1128 INP_LOCK(inp); \ 1129 if (optval) \ 1130 inp->inp_flags |= bit; \ 1131 else \ 1132 inp->inp_flags &= ~bit; \ 1133 INP_UNLOCK(inp); \ 1134 } while (0) 1135 1136 case IP_RECVOPTS: 1137 OPTSET(INP_RECVOPTS); 1138 break; 1139 1140 case IP_RECVRETOPTS: 1141 OPTSET(INP_RECVRETOPTS); 1142 break; 1143 1144 case IP_RECVDSTADDR: 1145 OPTSET(INP_RECVDSTADDR); 1146 break; 1147 1148 case IP_RECVTTL: 1149 OPTSET(INP_RECVTTL); 1150 break; 1151 1152 case IP_RECVIF: 1153 OPTSET(INP_RECVIF); 1154 break; 1155 1156 case IP_FAITH: 1157 OPTSET(INP_FAITH); 1158 break; 1159 1160 case IP_ONESBCAST: 1161 OPTSET(INP_ONESBCAST); 1162 break; 1163 case IP_DONTFRAG: 1164 OPTSET(INP_DONTFRAG); 1165 break; 1166 } 1167 break; 1168 #undef OPTSET 1169 1170 case IP_MULTICAST_IF: 1171 case IP_MULTICAST_VIF: 1172 case IP_MULTICAST_TTL: 1173 case IP_MULTICAST_LOOP: 1174 case IP_ADD_MEMBERSHIP: 1175 case IP_DROP_MEMBERSHIP: 1176 error = ip_setmoptions(inp, sopt); 1177 break; 1178 1179 case IP_PORTRANGE: 1180 error = sooptcopyin(sopt, &optval, sizeof optval, 1181 sizeof optval); 1182 if (error) 1183 break; 1184 1185 INP_LOCK(inp); 1186 switch (optval) { 1187 case IP_PORTRANGE_DEFAULT: 1188 inp->inp_flags &= ~(INP_LOWPORT); 1189 inp->inp_flags &= ~(INP_HIGHPORT); 1190 break; 1191 1192 case IP_PORTRANGE_HIGH: 1193 inp->inp_flags &= ~(INP_LOWPORT); 1194 inp->inp_flags |= INP_HIGHPORT; 1195 break; 1196 1197 case IP_PORTRANGE_LOW: 1198 inp->inp_flags &= ~(INP_HIGHPORT); 1199 inp->inp_flags |= INP_LOWPORT; 1200 break; 1201 1202 default: 1203 error = EINVAL; 1204 break; 1205 } 1206 INP_UNLOCK(inp); 1207 break; 1208 1209 #if defined(IPSEC) || defined(FAST_IPSEC) 1210 case IP_IPSEC_POLICY: 1211 { 1212 caddr_t req; 1213 size_t len = 0; 1214 int priv; 1215 struct mbuf *m; 1216 int optname; 1217 1218 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 1219 break; 1220 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 1221 break; 1222 priv = (sopt->sopt_td != NULL && 1223 suser(sopt->sopt_td) != 0) ? 0 : 1; 1224 req = mtod(m, caddr_t); 1225 len = m->m_len; 1226 optname = sopt->sopt_name; 1227 error = ipsec4_set_policy(inp, optname, req, len, priv); 1228 m_freem(m); 1229 break; 1230 } 1231 #endif /*IPSEC*/ 1232 1233 default: 1234 error = ENOPROTOOPT; 1235 break; 1236 } 1237 break; 1238 1239 case SOPT_GET: 1240 switch (sopt->sopt_name) { 1241 case IP_OPTIONS: 1242 case IP_RETOPTS: 1243 if (inp->inp_options) 1244 error = sooptcopyout(sopt, 1245 mtod(inp->inp_options, 1246 char *), 1247 inp->inp_options->m_len); 1248 else 1249 sopt->sopt_valsize = 0; 1250 break; 1251 1252 case IP_TOS: 1253 case IP_TTL: 1254 case IP_MINTTL: 1255 case IP_RECVOPTS: 1256 case IP_RECVRETOPTS: 1257 case IP_RECVDSTADDR: 1258 case IP_RECVTTL: 1259 case IP_RECVIF: 1260 case IP_PORTRANGE: 1261 case IP_FAITH: 1262 case IP_ONESBCAST: 1263 case IP_DONTFRAG: 1264 switch (sopt->sopt_name) { 1265 1266 case IP_TOS: 1267 optval = inp->inp_ip_tos; 1268 break; 1269 1270 case IP_TTL: 1271 optval = inp->inp_ip_ttl; 1272 break; 1273 1274 case IP_MINTTL: 1275 optval = inp->inp_ip_minttl; 1276 break; 1277 1278 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1279 1280 case IP_RECVOPTS: 1281 optval = OPTBIT(INP_RECVOPTS); 1282 break; 1283 1284 case IP_RECVRETOPTS: 1285 optval = OPTBIT(INP_RECVRETOPTS); 1286 break; 1287 1288 case IP_RECVDSTADDR: 1289 optval = OPTBIT(INP_RECVDSTADDR); 1290 break; 1291 1292 case IP_RECVTTL: 1293 optval = OPTBIT(INP_RECVTTL); 1294 break; 1295 1296 case IP_RECVIF: 1297 optval = OPTBIT(INP_RECVIF); 1298 break; 1299 1300 case IP_PORTRANGE: 1301 if (inp->inp_flags & INP_HIGHPORT) 1302 optval = IP_PORTRANGE_HIGH; 1303 else if (inp->inp_flags & INP_LOWPORT) 1304 optval = IP_PORTRANGE_LOW; 1305 else 1306 optval = 0; 1307 break; 1308 1309 case IP_FAITH: 1310 optval = OPTBIT(INP_FAITH); 1311 break; 1312 1313 case IP_ONESBCAST: 1314 optval = OPTBIT(INP_ONESBCAST); 1315 break; 1316 case IP_DONTFRAG: 1317 optval = OPTBIT(INP_DONTFRAG); 1318 break; 1319 } 1320 error = sooptcopyout(sopt, &optval, sizeof optval); 1321 break; 1322 1323 case IP_MULTICAST_IF: 1324 case IP_MULTICAST_VIF: 1325 case IP_MULTICAST_TTL: 1326 case IP_MULTICAST_LOOP: 1327 case IP_ADD_MEMBERSHIP: 1328 case IP_DROP_MEMBERSHIP: 1329 error = ip_getmoptions(inp, sopt); 1330 break; 1331 1332 #if defined(IPSEC) || defined(FAST_IPSEC) 1333 case IP_IPSEC_POLICY: 1334 { 1335 struct mbuf *m = NULL; 1336 caddr_t req = NULL; 1337 size_t len = 0; 1338 1339 if (m != 0) { 1340 req = mtod(m, caddr_t); 1341 len = m->m_len; 1342 } 1343 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); 1344 if (error == 0) 1345 error = soopt_mcopyout(sopt, m); /* XXX */ 1346 if (error == 0) 1347 m_freem(m); 1348 break; 1349 } 1350 #endif /*IPSEC*/ 1351 1352 default: 1353 error = ENOPROTOOPT; 1354 break; 1355 } 1356 break; 1357 } 1358 return (error); 1359 } 1360 1361 /* 1362 * XXX 1363 * The whole multicast option thing needs to be re-thought. 1364 * Several of these options are equally applicable to non-multicast 1365 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a 1366 * standard option (IP_TTL). 1367 */ 1368 1369 /* 1370 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 1371 */ 1372 static struct ifnet * 1373 ip_multicast_if(a, ifindexp) 1374 struct in_addr *a; 1375 int *ifindexp; 1376 { 1377 int ifindex; 1378 struct ifnet *ifp; 1379 1380 if (ifindexp) 1381 *ifindexp = 0; 1382 if (ntohl(a->s_addr) >> 24 == 0) { 1383 ifindex = ntohl(a->s_addr) & 0xffffff; 1384 if (ifindex < 0 || if_index < ifindex) 1385 return NULL; 1386 ifp = ifnet_byindex(ifindex); 1387 if (ifindexp) 1388 *ifindexp = ifindex; 1389 } else { 1390 INADDR_TO_IFP(*a, ifp); 1391 } 1392 return ifp; 1393 } 1394 1395 /* 1396 * Given an inpcb, return its multicast options structure pointer. Accepts 1397 * an unlocked inpcb pointer, but will return it locked. May sleep. 1398 */ 1399 static struct ip_moptions * 1400 ip_findmoptions(struct inpcb *inp) 1401 { 1402 struct ip_moptions *imo; 1403 1404 INP_LOCK(inp); 1405 if (inp->inp_moptions != NULL) 1406 return (inp->inp_moptions); 1407 1408 INP_UNLOCK(inp); 1409 1410 imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); 1411 1412 imo->imo_multicast_ifp = NULL; 1413 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1414 imo->imo_multicast_vif = -1; 1415 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1416 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 1417 imo->imo_num_memberships = 0; 1418 1419 INP_LOCK(inp); 1420 if (inp->inp_moptions != NULL) { 1421 free(imo, M_IPMOPTS); 1422 return (inp->inp_moptions); 1423 } 1424 inp->inp_moptions = imo; 1425 return (imo); 1426 } 1427 1428 /* 1429 * Set the IP multicast options in response to user setsockopt(). 1430 */ 1431 static int 1432 ip_setmoptions(struct inpcb *inp, struct sockopt *sopt) 1433 { 1434 int error = 0; 1435 int i; 1436 struct in_addr addr; 1437 struct ip_mreq mreq; 1438 struct ifnet *ifp; 1439 struct ip_moptions *imo; 1440 struct route ro; 1441 struct sockaddr_in *dst; 1442 int ifindex; 1443 int s; 1444 1445 switch (sopt->sopt_name) { 1446 /* store an index number for the vif you wanna use in the send */ 1447 case IP_MULTICAST_VIF: 1448 if (legal_vif_num == 0) { 1449 error = EOPNOTSUPP; 1450 break; 1451 } 1452 error = sooptcopyin(sopt, &i, sizeof i, sizeof i); 1453 if (error) 1454 break; 1455 if (!legal_vif_num(i) && (i != -1)) { 1456 error = EINVAL; 1457 break; 1458 } 1459 imo = ip_findmoptions(inp); 1460 imo->imo_multicast_vif = i; 1461 INP_UNLOCK(inp); 1462 break; 1463 1464 case IP_MULTICAST_IF: 1465 /* 1466 * Select the interface for outgoing multicast packets. 1467 */ 1468 error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); 1469 if (error) 1470 break; 1471 /* 1472 * INADDR_ANY is used to remove a previous selection. 1473 * When no interface is selected, a default one is 1474 * chosen every time a multicast packet is sent. 1475 */ 1476 imo = ip_findmoptions(inp); 1477 if (addr.s_addr == INADDR_ANY) { 1478 imo->imo_multicast_ifp = NULL; 1479 INP_UNLOCK(inp); 1480 break; 1481 } 1482 /* 1483 * The selected interface is identified by its local 1484 * IP address. Find the interface and confirm that 1485 * it supports multicasting. 1486 */ 1487 s = splimp(); 1488 ifp = ip_multicast_if(&addr, &ifindex); 1489 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1490 INP_UNLOCK(inp); 1491 splx(s); 1492 error = EADDRNOTAVAIL; 1493 break; 1494 } 1495 imo->imo_multicast_ifp = ifp; 1496 if (ifindex) 1497 imo->imo_multicast_addr = addr; 1498 else 1499 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1500 INP_UNLOCK(inp); 1501 splx(s); 1502 break; 1503 1504 case IP_MULTICAST_TTL: 1505 /* 1506 * Set the IP time-to-live for outgoing multicast packets. 1507 * The original multicast API required a char argument, 1508 * which is inconsistent with the rest of the socket API. 1509 * We allow either a char or an int. 1510 */ 1511 if (sopt->sopt_valsize == 1) { 1512 u_char ttl; 1513 error = sooptcopyin(sopt, &ttl, 1, 1); 1514 if (error) 1515 break; 1516 imo = ip_findmoptions(inp); 1517 imo->imo_multicast_ttl = ttl; 1518 INP_UNLOCK(inp); 1519 } else { 1520 u_int ttl; 1521 error = sooptcopyin(sopt, &ttl, sizeof ttl, 1522 sizeof ttl); 1523 if (error) 1524 break; 1525 if (ttl > 255) 1526 error = EINVAL; 1527 else { 1528 imo = ip_findmoptions(inp); 1529 imo->imo_multicast_ttl = ttl; 1530 INP_UNLOCK(inp); 1531 } 1532 } 1533 break; 1534 1535 case IP_MULTICAST_LOOP: 1536 /* 1537 * Set the loopback flag for outgoing multicast packets. 1538 * Must be zero or one. The original multicast API required a 1539 * char argument, which is inconsistent with the rest 1540 * of the socket API. We allow either a char or an int. 1541 */ 1542 if (sopt->sopt_valsize == 1) { 1543 u_char loop; 1544 error = sooptcopyin(sopt, &loop, 1, 1); 1545 if (error) 1546 break; 1547 imo = ip_findmoptions(inp); 1548 imo->imo_multicast_loop = !!loop; 1549 INP_UNLOCK(inp); 1550 } else { 1551 u_int loop; 1552 error = sooptcopyin(sopt, &loop, sizeof loop, 1553 sizeof loop); 1554 if (error) 1555 break; 1556 imo = ip_findmoptions(inp); 1557 imo->imo_multicast_loop = !!loop; 1558 INP_UNLOCK(inp); 1559 } 1560 break; 1561 1562 case IP_ADD_MEMBERSHIP: 1563 /* 1564 * Add a multicast group membership. 1565 * Group must be a valid IP multicast address. 1566 */ 1567 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); 1568 if (error) 1569 break; 1570 1571 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { 1572 error = EINVAL; 1573 break; 1574 } 1575 s = splimp(); 1576 /* 1577 * If no interface address was provided, use the interface of 1578 * the route to the given multicast address. 1579 */ 1580 if (mreq.imr_interface.s_addr == INADDR_ANY) { 1581 bzero((caddr_t)&ro, sizeof(ro)); 1582 dst = (struct sockaddr_in *)&ro.ro_dst; 1583 dst->sin_len = sizeof(*dst); 1584 dst->sin_family = AF_INET; 1585 dst->sin_addr = mreq.imr_multiaddr; 1586 rtalloc_ign(&ro, RTF_CLONING); 1587 if (ro.ro_rt == NULL) { 1588 error = EADDRNOTAVAIL; 1589 splx(s); 1590 break; 1591 } 1592 ifp = ro.ro_rt->rt_ifp; 1593 RTFREE(ro.ro_rt); 1594 } 1595 else { 1596 ifp = ip_multicast_if(&mreq.imr_interface, NULL); 1597 } 1598 1599 /* 1600 * See if we found an interface, and confirm that it 1601 * supports multicast. 1602 */ 1603 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1604 error = EADDRNOTAVAIL; 1605 splx(s); 1606 break; 1607 } 1608 /* 1609 * See if the membership already exists or if all the 1610 * membership slots are full. 1611 */ 1612 imo = ip_findmoptions(inp); 1613 for (i = 0; i < imo->imo_num_memberships; ++i) { 1614 if (imo->imo_membership[i]->inm_ifp == ifp && 1615 imo->imo_membership[i]->inm_addr.s_addr 1616 == mreq.imr_multiaddr.s_addr) 1617 break; 1618 } 1619 if (i < imo->imo_num_memberships) { 1620 INP_UNLOCK(inp); 1621 error = EADDRINUSE; 1622 splx(s); 1623 break; 1624 } 1625 if (i == IP_MAX_MEMBERSHIPS) { 1626 INP_UNLOCK(inp); 1627 error = ETOOMANYREFS; 1628 splx(s); 1629 break; 1630 } 1631 /* 1632 * Everything looks good; add a new record to the multicast 1633 * address list for the given interface. 1634 */ 1635 if ((imo->imo_membership[i] = 1636 in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { 1637 INP_UNLOCK(inp); 1638 error = ENOBUFS; 1639 splx(s); 1640 break; 1641 } 1642 ++imo->imo_num_memberships; 1643 INP_UNLOCK(inp); 1644 splx(s); 1645 break; 1646 1647 case IP_DROP_MEMBERSHIP: 1648 /* 1649 * Drop a multicast group membership. 1650 * Group must be a valid IP multicast address. 1651 */ 1652 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); 1653 if (error) 1654 break; 1655 1656 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { 1657 error = EINVAL; 1658 break; 1659 } 1660 1661 s = splimp(); 1662 /* 1663 * If an interface address was specified, get a pointer 1664 * to its ifnet structure. 1665 */ 1666 if (mreq.imr_interface.s_addr == INADDR_ANY) 1667 ifp = NULL; 1668 else { 1669 ifp = ip_multicast_if(&mreq.imr_interface, NULL); 1670 if (ifp == NULL) { 1671 error = EADDRNOTAVAIL; 1672 splx(s); 1673 break; 1674 } 1675 } 1676 /* 1677 * Find the membership in the membership array. 1678 */ 1679 imo = ip_findmoptions(inp); 1680 for (i = 0; i < imo->imo_num_memberships; ++i) { 1681 if ((ifp == NULL || 1682 imo->imo_membership[i]->inm_ifp == ifp) && 1683 imo->imo_membership[i]->inm_addr.s_addr == 1684 mreq.imr_multiaddr.s_addr) 1685 break; 1686 } 1687 if (i == imo->imo_num_memberships) { 1688 INP_UNLOCK(inp); 1689 error = EADDRNOTAVAIL; 1690 splx(s); 1691 break; 1692 } 1693 /* 1694 * Give up the multicast address record to which the 1695 * membership points. 1696 */ 1697 in_delmulti(imo->imo_membership[i]); 1698 /* 1699 * Remove the gap in the membership array. 1700 */ 1701 for (++i; i < imo->imo_num_memberships; ++i) 1702 imo->imo_membership[i-1] = imo->imo_membership[i]; 1703 --imo->imo_num_memberships; 1704 INP_UNLOCK(inp); 1705 splx(s); 1706 break; 1707 1708 default: 1709 error = EOPNOTSUPP; 1710 break; 1711 } 1712 1713 return (error); 1714 } 1715 1716 /* 1717 * Return the IP multicast options in response to user getsockopt(). 1718 */ 1719 static int 1720 ip_getmoptions(struct inpcb *inp, struct sockopt *sopt) 1721 { 1722 struct ip_moptions *imo; 1723 struct in_addr addr; 1724 struct in_ifaddr *ia; 1725 int error, optval; 1726 u_char coptval; 1727 1728 INP_LOCK(inp); 1729 imo = inp->inp_moptions; 1730 1731 error = 0; 1732 switch (sopt->sopt_name) { 1733 case IP_MULTICAST_VIF: 1734 if (imo != NULL) 1735 optval = imo->imo_multicast_vif; 1736 else 1737 optval = -1; 1738 INP_UNLOCK(inp); 1739 error = sooptcopyout(sopt, &optval, sizeof optval); 1740 break; 1741 1742 case IP_MULTICAST_IF: 1743 if (imo == NULL || imo->imo_multicast_ifp == NULL) 1744 addr.s_addr = INADDR_ANY; 1745 else if (imo->imo_multicast_addr.s_addr) { 1746 /* return the value user has set */ 1747 addr = imo->imo_multicast_addr; 1748 } else { 1749 IFP_TO_IA(imo->imo_multicast_ifp, ia); 1750 addr.s_addr = (ia == NULL) ? INADDR_ANY 1751 : IA_SIN(ia)->sin_addr.s_addr; 1752 } 1753 INP_UNLOCK(inp); 1754 error = sooptcopyout(sopt, &addr, sizeof addr); 1755 break; 1756 1757 case IP_MULTICAST_TTL: 1758 if (imo == 0) 1759 optval = coptval = IP_DEFAULT_MULTICAST_TTL; 1760 else 1761 optval = coptval = imo->imo_multicast_ttl; 1762 INP_UNLOCK(inp); 1763 if (sopt->sopt_valsize == 1) 1764 error = sooptcopyout(sopt, &coptval, 1); 1765 else 1766 error = sooptcopyout(sopt, &optval, sizeof optval); 1767 break; 1768 1769 case IP_MULTICAST_LOOP: 1770 if (imo == 0) 1771 optval = coptval = IP_DEFAULT_MULTICAST_LOOP; 1772 else 1773 optval = coptval = imo->imo_multicast_loop; 1774 INP_UNLOCK(inp); 1775 if (sopt->sopt_valsize == 1) 1776 error = sooptcopyout(sopt, &coptval, 1); 1777 else 1778 error = sooptcopyout(sopt, &optval, sizeof optval); 1779 break; 1780 1781 default: 1782 INP_UNLOCK(inp); 1783 error = ENOPROTOOPT; 1784 break; 1785 } 1786 INP_UNLOCK_ASSERT(inp); 1787 1788 return (error); 1789 } 1790 1791 /* 1792 * Discard the IP multicast options. 1793 */ 1794 void 1795 ip_freemoptions(imo) 1796 register struct ip_moptions *imo; 1797 { 1798 register int i; 1799 1800 if (imo != NULL) { 1801 for (i = 0; i < imo->imo_num_memberships; ++i) 1802 in_delmulti(imo->imo_membership[i]); 1803 free(imo, M_IPMOPTS); 1804 } 1805 } 1806 1807 /* 1808 * Routine called from ip_output() to loop back a copy of an IP multicast 1809 * packet to the input queue of a specified interface. Note that this 1810 * calls the output routine of the loopback "driver", but with an interface 1811 * pointer that might NOT be a loopback interface -- evil, but easier than 1812 * replicating that code here. 1813 */ 1814 static void 1815 ip_mloopback(ifp, m, dst, hlen) 1816 struct ifnet *ifp; 1817 register struct mbuf *m; 1818 register struct sockaddr_in *dst; 1819 int hlen; 1820 { 1821 register struct ip *ip; 1822 struct mbuf *copym; 1823 1824 copym = m_copy(m, 0, M_COPYALL); 1825 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) 1826 copym = m_pullup(copym, hlen); 1827 if (copym != NULL) { 1828 /* If needed, compute the checksum and mark it as valid. */ 1829 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1830 in_delayed_cksum(copym); 1831 copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1832 copym->m_pkthdr.csum_flags |= 1833 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 1834 copym->m_pkthdr.csum_data = 0xffff; 1835 } 1836 /* 1837 * We don't bother to fragment if the IP length is greater 1838 * than the interface's MTU. Can this possibly matter? 1839 */ 1840 ip = mtod(copym, struct ip *); 1841 ip->ip_len = htons(ip->ip_len); 1842 ip->ip_off = htons(ip->ip_off); 1843 ip->ip_sum = 0; 1844 ip->ip_sum = in_cksum(copym, hlen); 1845 /* 1846 * NB: 1847 * It's not clear whether there are any lingering 1848 * reentrancy problems in other areas which might 1849 * be exposed by using ip_input directly (in 1850 * particular, everything which modifies the packet 1851 * in-place). Yet another option is using the 1852 * protosw directly to deliver the looped back 1853 * packet. For the moment, we'll err on the side 1854 * of safety by using if_simloop(). 1855 */ 1856 #if 1 /* XXX */ 1857 if (dst->sin_family != AF_INET) { 1858 printf("ip_mloopback: bad address family %d\n", 1859 dst->sin_family); 1860 dst->sin_family = AF_INET; 1861 } 1862 #endif 1863 1864 #ifdef notdef 1865 copym->m_pkthdr.rcvif = ifp; 1866 ip_input(copym); 1867 #else 1868 if_simloop(ifp, copym, dst->sin_family, 0); 1869 #endif 1870 } 1871 } 1872