1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Alexander V. Chernikov 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_route.h" 33 34 #include <sys/param.h> 35 #include <sys/jail.h> 36 #include <sys/systm.h> 37 #include <sys/malloc.h> 38 #include <sys/mbuf.h> 39 #include <sys/socket.h> 40 #include <sys/sysctl.h> 41 #include <sys/syslog.h> 42 #include <sys/sysproto.h> 43 #include <sys/proc.h> 44 #include <sys/domain.h> 45 #include <sys/kernel.h> 46 #include <sys/lock.h> 47 #include <sys/rmlock.h> 48 49 #include <net/if.h> 50 #include <net/if_var.h> 51 #include <net/if_dl.h> 52 #include <net/route.h> 53 #include <net/route/route_ctl.h> 54 #include <net/route/route_var.h> 55 #include <net/route/nhop_utils.h> 56 #include <net/route/nhop.h> 57 #include <net/route/nhop_var.h> 58 #ifdef INET 59 #include <netinet/in_fib.h> 60 #endif 61 #ifdef INET6 62 #include <netinet6/in6_fib.h> 63 #endif 64 #include <net/vnet.h> 65 66 /* 67 * RIB helper functions. 68 */ 69 70 void 71 rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f, 72 rib_walk_hook_f_t *hook_f, void *arg) 73 { 74 if (hook_f != NULL) 75 hook_f(rnh, RIB_WALK_HOOK_PRE, arg); 76 rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg); 77 if (hook_f != NULL) 78 hook_f(rnh, RIB_WALK_HOOK_POST, arg); 79 } 80 81 /* 82 * Calls @wa_f with @arg for each entry in the table specified by 83 * @af and @fibnum. 84 * 85 * @ss_t callback is called before and after the tree traversal 86 * while holding table lock. 87 * 88 * Table is traversed under read lock unless @wlock is set. 89 */ 90 void 91 rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f, 92 rib_walk_hook_f_t *hook_f, void *arg) 93 { 94 RIB_RLOCK_TRACKER; 95 96 if (wlock) 97 RIB_WLOCK(rnh); 98 else 99 RIB_RLOCK(rnh); 100 rib_walk_ext_locked(rnh, wa_f, hook_f, arg); 101 if (wlock) 102 RIB_WUNLOCK(rnh); 103 else 104 RIB_RUNLOCK(rnh); 105 } 106 107 void 108 rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, 109 rib_walk_hook_f_t *hook_f, void *arg) 110 { 111 struct rib_head *rnh; 112 113 if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL) 114 rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg); 115 } 116 117 /* 118 * Calls @wa_f with @arg for each entry in the table specified by 119 * @af and @fibnum. 120 * 121 * Table is traversed under read lock unless @wlock is set. 122 */ 123 void 124 rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, 125 void *arg) 126 { 127 128 rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg); 129 } 130 131 /* 132 * Calls @wa_f with @arg for each entry in the table matching @prefix/@mask. 133 * 134 * The following flags are supported: 135 * RIB_FLAG_WLOCK: acquire exclusive lock 136 * RIB_FLAG_LOCKED: Assumes the table is already locked & skip locking 137 * 138 * By default, table is traversed under read lock. 139 */ 140 void 141 rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix, 142 struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg) 143 { 144 RIB_RLOCK_TRACKER; 145 struct rib_head *rnh = rt_tables_get_rnh(fibnum, family); 146 147 if (rnh == NULL) 148 return; 149 150 if (flags & RIB_FLAG_WLOCK) 151 RIB_WLOCK(rnh); 152 else if (!(flags & RIB_FLAG_LOCKED)) 153 RIB_RLOCK(rnh); 154 155 rnh->rnh_walktree_from(&rnh->head, prefix, mask, (walktree_f_t *)wa_f, arg); 156 157 if (flags & RIB_FLAG_WLOCK) 158 RIB_WUNLOCK(rnh); 159 else if (!(flags & RIB_FLAG_LOCKED)) 160 RIB_RUNLOCK(rnh); 161 } 162 163 /* 164 * Iterates over all existing fibs in system calling 165 * @hook_f function before/after traversing each fib. 166 * Calls @wa_f function for each element in current fib. 167 * If af is not AF_UNSPEC, iterates over fibs in particular 168 * address family. 169 */ 170 void 171 rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f, 172 rib_walk_hook_f_t *hook_f, void *arg) 173 { 174 175 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { 176 /* Do we want some specific family? */ 177 if (family != AF_UNSPEC) { 178 rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg); 179 continue; 180 } 181 182 for (int i = 1; i <= AF_MAX; i++) 183 rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg); 184 } 185 } 186 187 /* 188 * Iterates over all existing fibs in system and deletes each element 189 * for which @filter_f function returns non-zero value. 190 * If @family is not AF_UNSPEC, iterates over fibs in particular 191 * address family. 192 */ 193 void 194 rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg) 195 { 196 197 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { 198 /* Do we want some specific family? */ 199 if (family != AF_UNSPEC) { 200 rib_walk_del(fibnum, family, filter_f, arg, 0); 201 continue; 202 } 203 204 for (int i = 1; i <= AF_MAX; i++) 205 rib_walk_del(fibnum, i, filter_f, arg, 0); 206 } 207 } 208 209 210 /* 211 * Wrapper for the control plane functions for performing af-agnostic 212 * lookups. 213 * @fibnum: fib to perform the lookup. 214 * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in 215 * deembedded from. 216 * @flags: fib(9) flags. 217 * @flowid: flow id for path selection in multipath use case. 218 * 219 * Returns nhop_object or NULL. 220 * 221 * Requires NET_EPOCH. 222 * 223 */ 224 struct nhop_object * 225 rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, 226 uint32_t flowid) 227 { 228 struct nhop_object *nh; 229 230 nh = NULL; 231 232 switch (dst->sa_family) { 233 #ifdef INET 234 case AF_INET: 235 { 236 const struct sockaddr_in *a = (const struct sockaddr_in *)dst; 237 nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid); 238 break; 239 } 240 #endif 241 #ifdef INET6 242 case AF_INET6: 243 { 244 const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst; 245 nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id, 246 flags, flowid); 247 break; 248 } 249 #endif 250 } 251 252 return (nh); 253 } 254 255 #ifdef ROUTE_MPATH 256 static void 257 decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb, 258 void *cbdata) 259 { 260 uint32_t num_old, num_new; 261 uint32_t nh_idx_old, nh_idx_new; 262 struct weightened_nhop *wn_old, *wn_new; 263 struct weightened_nhop tmp = { NULL, 0 }; 264 uint32_t idx_old = 0, idx_new = 0; 265 266 struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt }; 267 struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt }; 268 269 if (NH_IS_NHGRP(rc->rc_nh_old)) { 270 wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old); 271 } else { 272 tmp.nh = rc->rc_nh_old; 273 tmp.weight = rc->rc_nh_weight; 274 wn_old = &tmp; 275 num_old = 1; 276 } 277 if (NH_IS_NHGRP(rc->rc_nh_new)) { 278 wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new); 279 } else { 280 tmp.nh = rc->rc_nh_new; 281 tmp.weight = rc->rc_nh_weight; 282 wn_new = &tmp; 283 num_new = 1; 284 } 285 286 /* Use the fact that each @wn array is sorted */ 287 /* 288 * Want to convert into set of add and delete operations 289 * [1] -> [1, 2] = A{2} 290 * [2] -> [1, 2] = A{1} 291 * [1, 2, 4]->[1, 3, 4] = A{2}, D{3} 292 * [1, 2, 4]->[1, 4] = D{2} 293 * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3} 294 * [1, 2] -> [3, 4] = 295 * 296 */ 297 idx_old = 0; 298 while ((idx_old < num_old) && (idx_new < num_new)) { 299 nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx; 300 nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx; 301 302 if (nh_idx_old == nh_idx_new) { 303 if (wn_old[idx_old].weight != wn_new[idx_new].weight) { 304 /* Update weight by providing del/add notifications */ 305 rc_del.rc_nh_old = wn_old[idx_old].nh; 306 rc_del.rc_nh_weight = wn_old[idx_old].weight; 307 cb(&rc_del, cbdata); 308 309 rc_add.rc_nh_new = wn_new[idx_new].nh; 310 rc_add.rc_nh_weight = wn_new[idx_new].weight; 311 cb(&rc_add, cbdata); 312 } 313 idx_old++; 314 idx_new++; 315 } else if (nh_idx_old < nh_idx_new) { 316 /* 317 * [1, ~2~, 4], [1, ~3~, 4] 318 * [1, ~2~, 5], [1, ~3~, 4] 319 * [1, ~2~], [1, ~3~, 4] 320 */ 321 if ((idx_old + 1 >= num_old) || 322 (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) { 323 /* Add new unless the next old item is still <= new */ 324 rc_add.rc_nh_new = wn_new[idx_new].nh; 325 rc_add.rc_nh_weight = wn_new[idx_new].weight; 326 cb(&rc_add, cbdata); 327 idx_new++; 328 } 329 /* In any case, delete current old */ 330 rc_del.rc_nh_old = wn_old[idx_old].nh; 331 rc_del.rc_nh_weight = wn_old[idx_old].weight; 332 cb(&rc_del, cbdata); 333 idx_old++; 334 } else { 335 /* 336 * nh_idx_old > nh_idx_new 337 * 338 * [1, ~3~, 4], [1, ~2~, 4] 339 * [1, ~3~, 5], [1, ~2~, 4] 340 * [1, ~3~, 4], [1, ~2~] 341 */ 342 if ((idx_new + 1 >= num_new) || 343 (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) { 344 /* No next item or next item is > current one */ 345 rc_add.rc_nh_new = wn_new[idx_new].nh; 346 rc_add.rc_nh_weight = wn_new[idx_new].weight; 347 cb(&rc_add, cbdata); 348 idx_new++; 349 } 350 /* In any case, delete current old */ 351 rc_del.rc_nh_old = wn_old[idx_old].nh; 352 rc_del.rc_nh_weight = wn_old[idx_old].weight; 353 cb(&rc_del, cbdata); 354 idx_old++; 355 } 356 } 357 358 while (idx_old < num_old) { 359 rc_del.rc_nh_old = wn_old[idx_old].nh; 360 rc_del.rc_nh_weight = wn_old[idx_old].weight; 361 cb(&rc_del, cbdata); 362 idx_old++; 363 } 364 365 while (idx_new < num_new) { 366 rc_add.rc_nh_new = wn_new[idx_new].nh; 367 rc_add.rc_nh_weight = wn_new[idx_new].weight; 368 cb(&rc_add, cbdata); 369 idx_new++; 370 } 371 } 372 373 /* 374 * Decompose multipath cmd info @rc into a list of add/del/change 375 * single-path operations, calling @cb callback for each operation. 376 * Assumes at least one of the nexthops in @rc is multipath. 377 */ 378 void 379 rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb, 380 void *cbdata) 381 { 382 struct weightened_nhop *wn; 383 uint32_t num_nhops; 384 struct rib_cmd_info rc_new; 385 386 rc_new = *rc; 387 DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p", 388 cb, rc->cmd, rc->nh_old, rc->nh_new); 389 switch (rc->rc_cmd) { 390 case RTM_ADD: 391 if (!NH_IS_NHGRP(rc->rc_nh_new)) 392 return; 393 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops); 394 for (uint32_t i = 0; i < num_nhops; i++) { 395 rc_new.rc_nh_new = wn[i].nh; 396 rc_new.rc_nh_weight = wn[i].weight; 397 cb(&rc_new, cbdata); 398 } 399 break; 400 case RTM_DELETE: 401 if (!NH_IS_NHGRP(rc->rc_nh_old)) 402 return; 403 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops); 404 for (uint32_t i = 0; i < num_nhops; i++) { 405 rc_new.rc_nh_old = wn[i].nh; 406 rc_new.rc_nh_weight = wn[i].weight; 407 cb(&rc_new, cbdata); 408 } 409 break; 410 case RTM_CHANGE: 411 if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new)) 412 return; 413 decompose_change_notification(rc, cb, cbdata); 414 break; 415 } 416 } 417 #endif 418