// SPDX-License-Identifier: GPL-2.0
/* Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#include <linux/nexthop.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/arp.h>
#include <net/ipv6_stubs.h>
#include <net/lwtunnel.h>
#include <net/ndisc.h>
#include <net/nexthop.h>
#include <net/route.h>
#include <net/sock.h>

/* Default timers for resilient nexthop groups (see nexthop_res_table_alloc):
 * a bucket is considered idle after 120s without traffic, and unbalanced
 * groups are never forcibly rebalanced by default.
 */
#define NH_RES_DEFAULT_IDLE_TIMER	(120 * HZ)
#define NH_RES_DEFAULT_UNBALANCED_TIMER	0	/* No forced rebalancing. */

static void remove_nexthop(struct net *net, struct nexthop *nh,
			   struct nl_info *nlinfo);

/* Size of the per-netns device -> nh_info hash (see nexthop_devhash_add). */
#define NH_DEV_HASHBITS  8
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)

/* All NHA_OP_FLAG_* bits that user space may set on dump/get requests. */
#define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS |		\
			       NHA_OP_FLAG_DUMP_HW_STATS)

/* Netlink attribute policy for RTM_NEWNEXTHOP requests. */
static const struct nla_policy rtm_nh_policy_new[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_GROUP]		= { .type = NLA_BINARY },
	[NHA_GROUP_TYPE]	= { .type = NLA_U16 },
	[NHA_BLACKHOLE]		= { .type = NLA_FLAG },
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_GATEWAY]		= { .type = NLA_BINARY },
	[NHA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[NHA_ENCAP]		= { .type = NLA_NESTED },
	[NHA_FDB]		= { .type = NLA_FLAG },
	[NHA_RES_GROUP]		= { .type = NLA_NESTED },
	[NHA_HW_STATS_ENABLE]	= NLA_POLICY_MAX(NLA_U32, true),
};

/* Policy for RTM_GETNEXTHOP get requests. */
static const struct nla_policy rtm_nh_policy_get[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32,
						  NHA_OP_FLAGS_DUMP_ALL),
};

/* Policy for RTM_DELNEXTHOP requests. */
static const struct nla_policy rtm_nh_policy_del[] = {
	[NHA_ID]		= { .type = NLA_U32 },
};

/* Policy for RTM_GETNEXTHOP dump requests. */
static const struct nla_policy rtm_nh_policy_dump[] = {
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_GROUPS]		= { .type = NLA_FLAG },
	[NHA_MASTER]		= { .type = NLA_U32 },
	[NHA_FDB]		= { .type = NLA_FLAG },
	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32,
						  NHA_OP_FLAGS_DUMP_ALL),
};

/* Policy for the NHA_RES_GROUP nested attribute of RTM_NEWNEXTHOP. */
static const struct nla_policy rtm_nh_res_policy_new[] = {
	[NHA_RES_GROUP_BUCKETS]			= { .type = NLA_U16 },
	[NHA_RES_GROUP_IDLE_TIMER]		= { .type = NLA_U32 },
	[NHA_RES_GROUP_UNBALANCED_TIMER]	= { .type = NLA_U32 },
};

/* Policy for RTM_GETNEXTHOPBUCKET dump requests. */
static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_MASTER]		= { .type = NLA_U32 },
	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
};

/* Policy for the NHA_RES_BUCKET nest inside a bucket dump request. */
static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
	[NHA_RES_BUCKET_NH_ID]	= { .type = NLA_U32 },
};

/* Policy for RTM_GETNEXTHOPBUCKET get (single bucket) requests. */
static const struct nla_policy rtm_nh_policy_get_bucket[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
};

/* Policy for the NHA_RES_BUCKET nest inside a single-bucket get request. */
static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
	[NHA_RES_BUCKET_INDEX]	= { .type = NLA_U16 },
};

/* True if no nexthop notifier is registered in this netns; callers use
 * this to skip building notifier info entirely.
 */
static bool nexthop_notifiers_is_empty(struct net *net)
{
	return !net->nexthop.notifier_chain.head;
}

/* Copy the fields of a single (non-group) nexthop into the flat
 * representation handed to notifier listeners.
 */
static void
__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
			       const struct nh_info *nhi)
{
	nh_info->dev = nhi->fib_nhc.nhc_dev;
	nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
	if (nh_info->gw_family == AF_INET)
		nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
	else if (nh_info->gw_family == AF_INET6)
		nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;

	nh_info->id = nhi->nh_parent->id;
	nh_info->is_reject = nhi->reject_nh;
	nh_info->is_fdb = nhi->fdb_nh;
	nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
}

/* Allocate and fill notifier info for a single nexthop.
 * Returns 0 or -ENOMEM; on success the caller must eventually free
 * info->nh (nh_notifier_single_info_fini).
 */
static int nh_notifier_single_info_init(struct nh_notifier_info *info,
					const struct nexthop *nh)
{
	struct nh_info *nhi = rtnl_dereference(nh->nh_info);

	info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
	/* NOTE(review): kzalloc_obj()/kzalloc_flex() are allocation helpers
	 * not visible in this file — confirm they zero-allocate and that
	 * this single-argument form implies GFP_KERNEL.
	 */
	info->nh = kzalloc_obj(*info->nh);
	if (!info->nh)
		return -ENOMEM;

	__nh_notifier_single_info_init(info->nh, nhi);

	return 0;
}

static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
{
	kfree(info->nh);
}

/* Allocate and fill notifier info for a hash-threshold (mpath) group.
 * On success the caller owns info->nh_grp (kfree()d by the *_fini path).
 */
static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
				       struct nh_group *nhg)
{
	u16 num_nh = nhg->num_nh;
	int i;

	info->type = NH_NOTIFIER_INFO_TYPE_GRP;
	info->nh_grp = kzalloc_flex(*info->nh_grp, nh_entries, num_nh,
				    GFP_KERNEL);
	if (!info->nh_grp)
		return -ENOMEM;

	info->nh_grp->num_nh = num_nh;
	info->nh_grp->is_fdb = nhg->fdb_nh;
	info->nh_grp->hw_stats = nhg->hw_stats;

	for (i = 0; i < num_nh; i++) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		struct nh_info *nhi;

		nhi = rtnl_dereference(nhge->nh->nh_info);
		info->nh_grp->nh_entries[i].weight = nhge->weight;
		__nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
					       nhi);
	}

	return 0;
}

/* Allocate and fill notifier info for a resilient group's bucket table.
 * Uses __vmalloc because the table can be large (one entry per bucket);
 * on success the caller owns info->nh_res_table (vfree()d by *_fini).
 */
static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
					   struct nh_group *nhg)
{
	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
	u16 num_nh_buckets = res_table->num_nh_buckets;
	unsigned long size;
	u16 i;

	info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
	size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
	info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
				       __GFP_NOWARN);
	if (!info->nh_res_table)
		return -ENOMEM;

	info->nh_res_table->num_nh_buckets = num_nh_buckets;
	info->nh_res_table->hw_stats = nhg->hw_stats;

	for (i = 0; i < num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		struct nh_grp_entry *nhge;
		struct nh_info *nhi;

		nhge = rtnl_dereference(bucket->nh_entry);
		nhi = rtnl_dereference(nhge->nh->nh_info);
		__nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
					       nhi);
	}

	return 0;
}

/* Dispatch group notifier-info init by group type. -EINVAL if the group
 * is neither hash-threshold nor resilient.
 */
static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
				     const struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

	if (nhg->hash_threshold)
		return nh_notifier_mpath_info_init(info, nhg);
	else if (nhg->resilient)
		return nh_notifier_res_table_info_init(info, nhg);
	return -EINVAL;
}

/* Free whichever group payload nh_notifier_grp_info_init() allocated;
 * must match the allocator (kfree vs vfree) per group type.
 */
static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
				      const struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

	if (nhg->hash_threshold)
		kfree(info->nh_grp);
	else if (nhg->resilient)
		vfree(info->nh_res_table);
}

/* Fill notifier info for any nexthop (single or group). */
static int nh_notifier_info_init(struct nh_notifier_info *info,
				 const struct nexthop *nh)
{
	info->id = nh->id;

	if (nh->is_group)
		return nh_notifier_grp_info_init(info, nh);
	else
		return nh_notifier_single_info_init(info, nh);
}

static void nh_notifier_info_fini(struct nh_notifier_info *info,
				  const struct nexthop *nh)
{
	if (nh->is_group)
		nh_notifier_grp_info_fini(info, nh);
	else
		nh_notifier_single_info_fini(info);
}

/* Notify all registered listeners about a nexthop event. Runs under RTNL;
 * returns the (errno-converted) verdict of the notifier chain.
 */
static int call_nexthop_notifiers(struct net *net,
				  enum nexthop_event_type event_type,
				  struct nexthop *nh,
				  struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	int err;

	ASSERT_RTNL();

	if (nexthop_notifiers_is_empty(net))
		return 0;

	err = nh_notifier_info_init(&info, nh);
	if (err) {
		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
		return err;
	}

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   event_type, &info);
	nh_notifier_info_fini(&info, nh);

	return notifier_to_errno(err);
}

/* Look up the idle timer (in ms) of the resilient group being notified
 * about, so listeners can decide between atomic and forced replacement.
 */
static int
nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
				      bool force, unsigned int *p_idle_timer_ms)
{
	struct nh_res_table *res_table;
	struct nh_group *nhg;
	struct nexthop *nh;
	int err = 0;

	/* When 'force' is false, nexthop bucket replacement is performed
	 * because the bucket was deemed to be idle. In this case, capable
	 * listeners can choose to perform an atomic replacement: The bucket is
	 * only replaced if it is inactive. However, if the idle timer interval
	 * is smaller than the interval in which a listener is querying
	 * buckets' activity from the device, then atomic replacement should
	 * not be tried. Pass the idle timer value to listeners, so that they
	 * could determine which type of replacement to perform.
	 */
	if (force) {
		*p_idle_timer_ms = 0;
		return 0;
	}

	rcu_read_lock();

	nh = nexthop_find_by_id(info->net, info->id);
	if (!nh) {
		err = -EINVAL;
		goto out;
	}

	nhg = rcu_dereference(nh->nh_grp);
	res_table = rcu_dereference(nhg->res_table);
	*p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);

out:
	rcu_read_unlock();

	return err;
}

/* Build the bucket-replacement payload (old/new nexthop, index, timers).
 * On success the caller owns info->nh_res_bucket.
 */
static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
					    u16 bucket_index, bool force,
					    struct nh_info *oldi,
					    struct nh_info *newi)
{
	unsigned int idle_timer_ms;
	int err;

	err = nh_notifier_res_bucket_idle_timer_get(info, force,
						    &idle_timer_ms);
	if (err)
		return err;

	info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
	info->nh_res_bucket = kzalloc_obj(*info->nh_res_bucket);
	if (!info->nh_res_bucket)
		return -ENOMEM;

	info->nh_res_bucket->bucket_index = bucket_index;
	info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
	info->nh_res_bucket->force = force;
	__nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
	__nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
	return 0;
}

static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
{
	kfree(info->nh_res_bucket);
}

/* Notify listeners that a resilient-group bucket is being replaced.
 * Note: no ASSERT_RTNL here — see the locking comment below; this can be
 * reached from the delayed work as well.
 */
static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
					       u16 bucket_index, bool force,
					       struct nh_info *oldi,
					       struct nh_info *newi,
					       struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
		.id = nhg_id,
	};
	int err;

	if (nexthop_notifiers_is_empty(net))
		return 0;

	err = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
					       oldi, newi);
	if (err)
		return err;

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   NEXTHOP_EVENT_BUCKET_REPLACE, &info);
	nh_notifier_res_bucket_info_fini(&info);

	return notifier_to_errno(err);
}

/* There are three users of RES_TABLE, and NHs etc. referenced from there:
 *
 * 1) a collection of callbacks for NH maintenance. This operates under
 *    RTNL,
 * 2) the delayed work that gradually balances the resilient table,
 * 3) and nexthop_select_path(), operating under RCU.
 *
 * Both the delayed work and the RTNL block are writers, and need to
 * maintain mutual exclusion. Since there are only two and well-known
 * writers for each table, the RTNL code can make sure it has exclusive
 * access thus:
 *
 * - Have the DW operate without locking;
 * - synchronously cancel the DW;
 * - do the writing;
 * - if the write was not actually a delete, call upkeep, which schedules
 *   DW again if necessary.
 *
 * The functions that are always called from the RTNL context use
 * rtnl_dereference(). The functions that can also be called from the DW do
 * a raw dereference and rely on the above mutual exclusion scheme.
 */
#define nh_res_dereference(p) (rcu_dereference_raw(p))

/* RTNL-or-DW context wrapper: resolve both nexthops' nh_info and emit the
 * bucket-replacement notification.
 */
static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
					     u16 bucket_index, bool force,
					     struct nexthop *old_nh,
					     struct nexthop *new_nh,
					     struct netlink_ext_ack *extack)
{
	struct nh_info *oldi = nh_res_dereference(old_nh->nh_info);
	struct nh_info *newi = nh_res_dereference(new_nh->nh_info);

	return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
						   force, oldi, newi, extack);
}

/* Pre-replace notification for a resilient group, sent before its buckets
 * are populated so listeners can veto unsupported configurations.
 */
static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
					    struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
		.id = nh->id,
	};
	struct nh_group *nhg;
	int err;

	ASSERT_RTNL();

	if (nexthop_notifiers_is_empty(net))
		return 0;

	/* At this point, the nexthop buckets are still not populated. Only
	 * emit a notification with the logical nexthops, so that a listener
	 * could potentially veto it in case of unsupported configuration.
	 */
	nhg = rtnl_dereference(nh->nh_grp);
	err = nh_notifier_mpath_info_init(&info, nhg);
	if (err) {
		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
		return err;
	}

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
					   &info);
	kfree(info.nh_grp);

	return notifier_to_errno(err);
}

/* Replay one event to a single (newly registering) notifier block. */
static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
				 enum nexthop_event_type event_type,
				 struct nexthop *nh,
				 struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	int err;

	err = nh_notifier_info_init(&info, nh);
	if (err)
		return err;

	err = nb->notifier_call(nb, event_type, &info);
	nh_notifier_info_fini(&info, nh);

	return notifier_to_errno(err);
}

/* Fold an ifindex into the NH_DEV_HASHSIZE-entry device hash. */
static unsigned int nh_dev_hashfn(unsigned int val)
{
	unsigned int mask = NH_DEV_HASHSIZE - 1;

	return (val ^
		(val >> NH_DEV_HASHBITS) ^
		(val >> (NH_DEV_HASHBITS * 2))) & mask;
}

/* Link an nh_info into the per-netns hash keyed by its device's ifindex. */
static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
{
	struct net_device *dev = nhi->fib_nhc.nhc_dev;
	struct hlist_head *head;
	unsigned int hash;

	WARN_ON(!dev);

	hash = nh_dev_hashfn(dev->ifindex);
	head = &net->nexthop.devhash[hash];
	hlist_add_head(&nhi->dev_hash, head);
}

/* Release a nexthop group: drop each member's reference and per-CPU stats,
 * then the resilient table (if any), the spare group, and the group itself.
 * Called from RCU callback context (see nexthop_free_rcu).
 */
static void nexthop_free_group(struct nexthop *nh)
{
	struct nh_group *nhg;
	int i;

	nhg = rcu_dereference_raw(nh->nh_grp);
	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		WARN_ON(!list_empty(&nhge->nh_list));
		free_percpu(nhge->stats);
		nexthop_put(nhge->nh);
	}

	WARN_ON(nhg->spare == nhg);

	if (nhg->resilient)
		vfree(rcu_dereference_raw(nhg->res_table));

	kfree(nhg->spare);
	kfree(nhg);
}

static void
nexthop_free_single(struct nexthop *nh)
{
	struct nh_info *nhi;

	/* Raw dereference: we are in the RCU free path, no readers remain. */
	nhi = rcu_dereference_raw(nh->nh_info);
	switch (nhi->family) {
	case AF_INET:
		fib_nh_release(nh->net, &nhi->fib_nh);
		break;
	case AF_INET6:
		ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
		break;
	}
	kfree(nhi);
}

/* RCU callback that finally frees a nexthop (single or group). */
void nexthop_free_rcu(struct rcu_head *head)
{
	struct nexthop *nh = container_of(head, struct nexthop, rcu);

	if (nh->is_group)
		nexthop_free_group(nh);
	else
		nexthop_free_single(nh);

	kfree(nh);
}
EXPORT_SYMBOL_GPL(nexthop_free_rcu);

/* Allocate a zeroed nexthop with its list heads and lock initialized.
 * Returns NULL on allocation failure.
 */
static struct nexthop *nexthop_alloc(void)
{
	struct nexthop *nh;

	nh = kzalloc_obj(struct nexthop);
	if (nh) {
		INIT_LIST_HEAD(&nh->fi_list);
		INIT_LIST_HEAD(&nh->f6i_list);
		INIT_LIST_HEAD(&nh->grp_list);
		INIT_LIST_HEAD(&nh->fdb_list);
		spin_lock_init(&nh->lock);
	}
	return nh;
}

/* Allocate a zeroed group with room for num_nh entries. */
static struct nh_group *nexthop_grp_alloc(u16 num_nh)
{
	struct nh_group *nhg;

	nhg = kzalloc_flex(*nhg, nh_entries, num_nh, GFP_KERNEL);
	if (nhg)
		nhg->num_nh = num_nh;

	return nhg;
}

static void nh_res_table_upkeep_dw(struct work_struct *work);

/* Allocate and initialize a resilient group's bucket table from the
 * user-supplied config. vmalloc'ed: bucket count is user-controlled.
 */
static struct nh_res_table *
nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
{
	const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets;
	struct nh_res_table *res_table;
	unsigned long size;

	size = struct_size(res_table, nh_buckets, num_nh_buckets);
	res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
	if (!res_table)
		return NULL;

	res_table->net = net;
	res_table->nhg_id = nhg_id;
	INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw);
	INIT_LIST_HEAD(&res_table->uw_nh_entries);
	res_table->idle_timer = cfg->nh_grp_res_idle_timer;
	res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
	res_table->num_nh_buckets = num_nh_buckets;
	return res_table;
}

/* Bump the netns nexthop generation counter, skipping 0. */
static void nh_base_seq_inc(struct net *net)
{
	while (++net->nexthop.seq == 0)
		;
}

/* no reference taken; rcu lock or rtnl must be held */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
{
	struct rb_node **pp, *parent = NULL, *next;

	pp = &net->nexthop.rb_root.rb_node;
	while (1) {
		struct nexthop *nh;

		next = rcu_dereference_raw(*pp);
		if (!next)
			break;
		parent = next;

		nh = rb_entry(parent, struct nexthop, rb_node);
		if (id < nh->id)
			pp = &next->rb_left;
		else if (id > nh->id)
			pp = &next->rb_right;
		else
			return nh;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_find_by_id);

/* used for auto id allocation; called with rtnl held.
 * Returns 0 if the 32-bit id space wrapped without finding a free id.
 */
static u32 nh_find_unused_id(struct net *net)
{
	u32 id_start = net->nexthop.last_id_allocated;

	while (1) {
		net->nexthop.last_id_allocated++;
		if (net->nexthop.last_id_allocated == id_start)
			break;

		if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
			return net->nexthop.last_id_allocated;
	}
	return 0;
}

/* Pull *deadline earlier if next_time comes first. */
static void nh_res_time_set_deadline(unsigned long next_time,
				     unsigned long *deadline)
{
	if (time_before(next_time, *deadline))
		*deadline = next_time;
}

/* How long the table has been unbalanced, in clock_t; 0 if balanced. */
static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
{
	if (list_empty(&res_table->uw_nh_entries))
		return 0;
	return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since);
}

/* Emit the NHA_RES_GROUP nest (bucket count, timers, unbalanced time). */
static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
{
	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
	struct nlattr *nest;

	nest = nla_nest_start(skb, NHA_RES_GROUP);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
			res_table->num_nh_buckets) ||
	    nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
			jiffies_to_clock_t(res_table->idle_timer)) ||
	    nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
			jiffies_to_clock_t(res_table->unbalanced_timer)) ||
	    nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
			      nh_res_table_unbalanced_time(res_table),
			      NHA_RES_GROUP_PAD))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}

/* Count one packet on this group entry's per-CPU stats. */
static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge)
{
	struct nh_grp_entry_stats *cpu_stats;

	cpu_stats = get_cpu_ptr(nhge->stats);
	u64_stats_update_begin(&cpu_stats->syncp);
	u64_stats_inc(&cpu_stats->packets);
	u64_stats_update_end(&cpu_stats->syncp);
	put_cpu_ptr(cpu_stats);
}

/* Sum the per-CPU packet counters of a group entry into *ret_packets,
 * using the u64_stats retry loop for a consistent read on 32-bit.
 */
static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge,
				    u64 *ret_packets)
{
	int i;

	*ret_packets = 0;

	for_each_possible_cpu(i) {
		struct nh_grp_entry_stats *cpu_stats;
		unsigned int start;
		u64 packets;

		cpu_stats = per_cpu_ptr(nhge->stats, i);
		do {
			start = u64_stats_fetch_begin(&cpu_stats->syncp);
			packets = u64_stats_read(&cpu_stats->packets);
		} while (u64_stats_fetch_retry(&cpu_stats->syncp, start));

		*ret_packets += packets;
	}
}

/* Build the HW-stats request payload listing each member's nexthop id.
 * On success the caller owns info->nh_grp_hw_stats.
 */
static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info,
					 const struct nexthop *nh)
{
	struct nh_group *nhg;
	int i;

	ASSERT_RTNL();
	nhg = rtnl_dereference(nh->nh_grp);

	info->id = nh->id;
	info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS;
	info->nh_grp_hw_stats = kzalloc_flex(*info->nh_grp_hw_stats, stats,
					     nhg->num_nh, GFP_KERNEL);
	if (!info->nh_grp_hw_stats)
		return -ENOMEM;

	info->nh_grp_hw_stats->num_nh = nhg->num_nh;
	for (i = 0; i < nhg->num_nh; i++) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		info->nh_grp_hw_stats->stats[i].id = nhge->nh->id;
	}

	return 0;
}

static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info)
{
	kfree(info->nh_grp_hw_stats);
}

/* Driver-facing helper: accumulate a HW packet delta for member nh_idx
 * and mark that hardware stats were actually reported.
 */
void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info,
				  unsigned int nh_idx,
				  u64 delta_packets)
{
	info->hw_stats_used = true;
	info->stats[nh_idx].packets += delta_packets;
}
EXPORT_SYMBOL(nh_grp_hw_stats_report_delta);

/* Fold the deltas reported by drivers into each entry's packets_hw. */
static void nh_grp_hw_stats_apply_update(struct nexthop *nh,
					 struct nh_notifier_info *info)
{
	struct nh_group *nhg;
	int i;

	ASSERT_RTNL();
	nhg = rtnl_dereference(nh->nh_grp);

	for (i = 0; i < nhg->num_nh; i++) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets;
	}
}

/* Ask listeners for HW stats deltas and apply them. *hw_stats_used tells
 * the caller whether any listener actually supplied hardware counters.
 */
static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used)
{
	struct nh_notifier_info info = {
		.net = nh->net,
	};
	struct net *net = nh->net;
	int err;

	if (nexthop_notifiers_is_empty(net)) {
		*hw_stats_used = false;
		return 0;
	}

	err = nh_notifier_grp_hw_stats_init(&info, nh);
	if (err)
		return err;

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   NEXTHOP_EVENT_HW_STATS_REPORT_DELTA,
					   &info);

	/* Cache whatever we got, even if there was an error, otherwise the
	 * successful stats retrievals would get lost.
	 */
	nh_grp_hw_stats_apply_update(nh, &info);
	*hw_stats_used = info.nh_grp_hw_stats->hw_stats_used;

	nh_notifier_grp_hw_stats_fini(&info);
	return notifier_to_errno(err);
}

/* Emit one NHA_GROUP_STATS_ENTRY nest (member id + packet counts). */
static int nla_put_nh_group_stats_entry(struct sk_buff *skb,
					struct nh_grp_entry *nhge,
					u32 op_flags)
{
	struct nlattr *nest;
	u64 packets;

	nh_grp_entry_stats_read(nhge, &packets);

	nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) ||
	    nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS,
			 packets + nhge->packets_hw))
		goto nla_put_failure;

	if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS &&
	    nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW,
			 nhge->packets_hw))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}

/* Emit group statistics: HW-stats enable state, optional refresh from
 * hardware, and a NHA_GROUP_STATS nest with one entry per member.
 */
static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh,
				  u32 op_flags)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
	struct nlattr *nest;
	bool hw_stats_used;
	int err;
	int i;

	/* NOTE(review): NHA_HW_STATS_ENABLE is also emitted by
	 * nla_put_nh_group() before calling here — confirm the duplicate
	 * attribute is intended.
	 */
	if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats))
		goto err_out;

	if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS &&
	    nhg->hw_stats) {
		err = nh_grp_hw_stats_update(nh, &hw_stats_used);
		if (err)
			goto out;

		if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used))
			goto err_out;
	}

	nest = nla_nest_start(skb, NHA_GROUP_STATS);
	if (!nest)
		goto err_out;

	for (i = 0; i < nhg->num_nh; i++)
		if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i],
						 op_flags))
			goto cancel_out;

	nla_nest_end(skb, nest);
	return 0;

cancel_out:
	nla_nest_cancel(skb, nest);
err_out:
	err = -EMSGSIZE;
out:
	return err;
}

/* Emit the group-specific attributes of a nexthop group: type, the
 * NHA_GROUP member array (weights are stored as value-1, split into
 * low/high bytes), resilient parameters, and optional stats.
 */
static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
			    u32 op_flags, u32 *resp_op_flags)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
	struct nexthop_grp *p;
	size_t len = nhg->num_nh * sizeof(*p);
	struct nlattr *nla;
	u16 group_type = 0;
	u16 weight;
	int i;

	*resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0;

	if (nhg->hash_threshold)
		group_type = NEXTHOP_GRP_TYPE_MPATH;
	else if (nhg->resilient)
		group_type = NEXTHOP_GRP_TYPE_RES;

	if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
		goto nla_put_failure;

	nla = nla_reserve(skb, NHA_GROUP, len);
	if (!nla)
		goto nla_put_failure;

	p = nla_data(nla);
	for (i = 0; i < nhg->num_nh; ++i) {
		weight = nhg->nh_entries[i].weight - 1;

		*p++ = (struct nexthop_grp) {
			.id = nhg->nh_entries[i].nh->id,
			.weight = weight,
			.weight_high = weight >> 8,
		};
	}

	if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
		goto nla_put_failure;

	if (op_flags & NHA_OP_FLAG_DUMP_STATS &&
	    (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) ||
	     nla_put_nh_group_stats(skb, nh, op_flags)))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* Fill one RTM_*NEXTHOP message for a nexthop (single or group). */
static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
			int event, u32 portid, u32 seq, unsigned int nlflags,
			u32 op_flags)
{
	struct fib6_nh *fib6_nh;
	struct fib_nh *fib_nh;
	struct nlmsghdr *nlh;
	struct nh_info *nhi;
	struct nhmsg *nhm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
	if (!nlh)
		return -EMSGSIZE;

	nhm = nlmsg_data(nlh);
	nhm->nh_family = AF_UNSPEC;
	nhm->nh_flags = nh->nh_flags;
	nhm->nh_protocol = nh->protocol;
	nhm->nh_scope = 0;
	nhm->resvd = 0;

	if (nla_put_u32(skb, NHA_ID, nh->id))
		goto nla_put_failure;

	if (nh->is_group) {
		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
		u32 resp_op_flags = 0;

		if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
			goto nla_put_failure;
		if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) ||
		    nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags))
			goto nla_put_failure;
		goto out;
	}

	nhi = rtnl_dereference(nh->nh_info);
	nhm->nh_family = nhi->family;
	if (nhi->reject_nh) {
		/* Blackhole: no device/gateway attributes apply. */
		if (nla_put_flag(skb, NHA_BLACKHOLE))
			goto nla_put_failure;
		goto out;
	} else if (nhi->fdb_nh) {
		if (nla_put_flag(skb, NHA_FDB))
			goto nla_put_failure;
	} else {
		const struct net_device *dev;

		dev = nhi->fib_nhc.nhc_dev;
		if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
			goto nla_put_failure;
	}

	nhm->nh_scope = nhi->fib_nhc.nhc_scope;
	switch (nhi->family) {
	case AF_INET:
		fib_nh = &nhi->fib_nh;
		if (fib_nh->fib_nh_gw_family &&
		    nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
			goto nla_put_failure;
		break;

	case AF_INET6:
		fib6_nh = &nhi->fib6_nh;
		if (fib6_nh->fib_nh_gw_family &&
		    nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
			goto nla_put_failure;
		break;
	}

	if (lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
				NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
		goto nla_put_failure;

out:
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/* Size estimate for the NHA_RES_GROUP nest. */
static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
{
	return nla_total_size(0) +	/* NHA_RES_GROUP */
		nla_total_size(2) +	/* NHA_RES_GROUP_BUCKETS */
		nla_total_size(4) +	/* NHA_RES_GROUP_IDLE_TIMER */
		nla_total_size(4) +	/* NHA_RES_GROUP_UNBALANCED_TIMER */
		nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */
}

/* Size estimate for the group-specific attributes of a notification. */
static size_t nh_nlmsg_size_grp(struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
	size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
	size_t tot = nla_total_size(sz) +
		nla_total_size(2);	/* NHA_GROUP_TYPE */

	if (nhg->resilient)
		tot += nh_nlmsg_size_grp_res(nhg);

	return tot;
}

/* Size estimate for the attributes of a single (non-group) nexthop. */
static size_t nh_nlmsg_size_single(struct nexthop *nh)
{
	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
	size_t sz;

	/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
	 * are mutually exclusive
	 */
	sz = nla_total_size(4);  /* NHA_OIF */

	switch (nhi->family) {
	case AF_INET:
		if (nhi->fib_nh.fib_nh_gw_family)
			sz += nla_total_size(4);  /* NHA_GATEWAY */
		break;

	case AF_INET6:
		/* NHA_GATEWAY */
		if (nhi->fib6_nh.fib_nh_gw_family)
			sz += nla_total_size(sizeof(const struct in6_addr));
		break;
	}

	if (nhi->fib_nhc.nhc_lwtstate) {
		sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
		sz += nla_total_size(2);  /* NHA_ENCAP_TYPE */
	}

	return sz;
}

/* Total message size estimate used when allocating notification skbs. */
static size_t nh_nlmsg_size(struct nexthop *nh)
{
	size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));

	sz += nla_total_size(4); /* NHA_ID */

	if (nh->is_group)
		sz += nh_nlmsg_size_grp(nh) +
		      nla_total_size(4) +	/* NHA_OP_FLAGS */
		      0;
	else
		sz += nh_nlmsg_size_single(nh);

	return sz;
}

/* Broadcast an RTM_*NEXTHOP notification to RTNLGRP_NEXTHOP listeners. */
static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
{
	unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
	u32 seq = info->nlh ?
		  info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
	if (!skb)
		goto errout;

	err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in nh_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
		    info->nlh, gfp_any());
	return;
errout:
	rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
}

/* Last-use timestamp (jiffies) of a bucket; stored as atomic_long so the
 * RCU fast path can update it without locking.
 */
static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
{
	return (unsigned long)atomic_long_read(&bucket->used_time);
}

/* The jiffies value at which this bucket counts as idle. */
static unsigned long
nh_res_bucket_idle_point(const struct nh_res_table *res_table,
			 const struct nh_res_bucket *bucket,
			 unsigned long now)
{
	unsigned long time = nh_res_bucket_used_time(bucket);

	/* Bucket was not used since it was migrated. The idle time is now.
	 */
	if (time == bucket->migrated_time)
		return now;

	return time + res_table->idle_timer;
}

/* The jiffies value at which an unbalanced table may be force-rebalanced. */
static unsigned long
nh_res_table_unb_point(const struct nh_res_table *res_table)
{
	return res_table->unbalanced_since + res_table->unbalanced_timer;
}

/* Mark a bucket idle: used_time == migrated_time is the "unused since
 * migration" marker tested in nh_res_bucket_idle_point().
 */
static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
				   struct nh_res_bucket *bucket)
{
	unsigned long now = jiffies;

	atomic_long_set(&bucket->used_time, (long)now);
	bucket->migrated_time = now;
}

static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
{
	atomic_long_set(&bucket->used_time, (long)jiffies);
}

/* Time since the bucket was last used, in clock_t for netlink export. */
static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
{
	unsigned long used_time = nh_res_bucket_used_time(bucket);

	return jiffies_delta_to_clock_t(jiffies - used_time);
}

/* Fill one RTM_*NEXTHOPBUCKET message for a resilient-group bucket. */
static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
			      struct nh_res_bucket *bucket, u16 bucket_index,
			      int event, u32 portid, u32 seq,
			      unsigned int nlflags,
			      struct netlink_ext_ack *extack)
{
	struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
	struct nlmsghdr *nlh;
	struct nlattr *nest;
	struct nhmsg *nhm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
	if (!nlh)
		return -EMSGSIZE;

	nhm = nlmsg_data(nlh);
	nhm->nh_family = AF_UNSPEC;
	nhm->nh_flags = bucket->nh_flags;
	nhm->nh_protocol = nh->protocol;
	nhm->nh_scope = 0;
	nhm->resvd = 0;

	if (nla_put_u32(skb, NHA_ID, nh->id))
		goto nla_put_failure;

	nest = nla_nest_start(skb, NHA_RES_BUCKET);
	if (!nest)
		goto nla_put_failure;

	if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) ||
	    nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) ||
	    nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
			      nh_res_bucket_idle_time(bucket),
			      NHA_RES_BUCKET_PAD))
		goto nla_put_failure_nest;

	nla_nest_end(skb, nest);
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure_nest:
	nla_nest_cancel(skb, nest);
nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/* Broadcast a bucket-replacement notification (RTM_NEWNEXTHOPBUCKET).
 * May be called from the delayed work, hence GFP_KERNEL and no nl_info.
 */
static void nexthop_bucket_notify(struct nh_res_table *res_table,
				  u16 bucket_index)
{
	struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
	struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
	struct nexthop *nh = nhge->nh_parent;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto errout;

	err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
				 RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
				 NULL);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
	return;
errout:
	rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
}

/* Validate that nexthop nh may become a member of an npaths-wide group;
 * reports whether it is an FDB nexthop via *is_fdb.
 */
static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
			   bool *is_fdb, struct netlink_ext_ack *extack)
{
	if (nh->is_group) {
		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

		/* Nesting groups within groups is not supported.
*/ 1221 if (nhg->hash_threshold) { 1222 NL_SET_ERR_MSG(extack, 1223 "Hash-threshold group can not be a nexthop within a group"); 1224 return false; 1225 } 1226 if (nhg->resilient) { 1227 NL_SET_ERR_MSG(extack, 1228 "Resilient group can not be a nexthop within a group"); 1229 return false; 1230 } 1231 *is_fdb = nhg->fdb_nh; 1232 } else { 1233 struct nh_info *nhi = rtnl_dereference(nh->nh_info); 1234 1235 if (nhi->reject_nh && npaths > 1) { 1236 NL_SET_ERR_MSG(extack, 1237 "Blackhole nexthop can not be used in a group with more than 1 path"); 1238 return false; 1239 } 1240 *is_fdb = nhi->fdb_nh; 1241 } 1242 1243 return true; 1244 } 1245 1246 static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, 1247 struct netlink_ext_ack *extack) 1248 { 1249 struct nh_info *nhi; 1250 1251 nhi = rtnl_dereference(nh->nh_info); 1252 1253 if (!nhi->fdb_nh) { 1254 NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); 1255 return -EINVAL; 1256 } 1257 1258 if (*nh_family == AF_UNSPEC) { 1259 *nh_family = nhi->family; 1260 } else if (*nh_family != nhi->family) { 1261 NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); 1262 return -EINVAL; 1263 } 1264 1265 return 0; 1266 } 1267 1268 static int nh_check_attr_group(struct net *net, 1269 struct nlattr *tb[], size_t tb_size, 1270 u16 nh_grp_type, struct netlink_ext_ack *extack) 1271 { 1272 unsigned int len = nla_len(tb[NHA_GROUP]); 1273 struct nexthop_grp *nhg; 1274 unsigned int i, j; 1275 1276 if (!len || len & (sizeof(struct nexthop_grp) - 1)) { 1277 NL_SET_ERR_MSG(extack, 1278 "Invalid length for nexthop group attribute"); 1279 return -EINVAL; 1280 } 1281 1282 /* convert len to number of nexthop ids */ 1283 len /= sizeof(*nhg); 1284 1285 nhg = nla_data(tb[NHA_GROUP]); 1286 for (i = 0; i < len; ++i) { 1287 if (nhg[i].resvd2) { 1288 NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0"); 1289 return -EINVAL; 1290 } 1291 if (nexthop_grp_weight(&nhg[i]) == 0) { 1292 /* 0xffff 
got passed in, representing weight of 0x10000, 1293 * which is too heavy. 1294 */ 1295 NL_SET_ERR_MSG(extack, "Invalid value for weight"); 1296 return -EINVAL; 1297 } 1298 for (j = i + 1; j < len; ++j) { 1299 if (nhg[i].id == nhg[j].id) { 1300 NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); 1301 return -EINVAL; 1302 } 1303 } 1304 } 1305 1306 nhg = nla_data(tb[NHA_GROUP]); 1307 for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { 1308 if (!tb[i]) 1309 continue; 1310 switch (i) { 1311 case NHA_HW_STATS_ENABLE: 1312 case NHA_FDB: 1313 continue; 1314 case NHA_RES_GROUP: 1315 if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) 1316 continue; 1317 break; 1318 } 1319 NL_SET_ERR_MSG(extack, 1320 "No other attributes can be set in nexthop groups"); 1321 return -EINVAL; 1322 } 1323 1324 return 0; 1325 } 1326 1327 static int nh_check_attr_group_rtnl(struct net *net, struct nlattr *tb[], 1328 struct netlink_ext_ack *extack) 1329 { 1330 u8 nh_family = AF_UNSPEC; 1331 struct nexthop_grp *nhg; 1332 unsigned int len; 1333 unsigned int i; 1334 u8 nhg_fdb; 1335 1336 len = nla_len(tb[NHA_GROUP]) / sizeof(*nhg); 1337 nhg = nla_data(tb[NHA_GROUP]); 1338 nhg_fdb = !!tb[NHA_FDB]; 1339 1340 for (i = 0; i < len; i++) { 1341 struct nexthop *nh; 1342 bool is_fdb_nh; 1343 1344 nh = nexthop_find_by_id(net, nhg[i].id); 1345 if (!nh) { 1346 NL_SET_ERR_MSG(extack, "Invalid nexthop id"); 1347 return -EINVAL; 1348 } 1349 if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) 1350 return -EINVAL; 1351 1352 if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) 1353 return -EINVAL; 1354 1355 if (!nhg_fdb && is_fdb_nh) { 1356 NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); 1357 return -EINVAL; 1358 } 1359 } 1360 1361 return 0; 1362 } 1363 1364 static bool ipv6_good_nh(const struct fib6_nh *nh) 1365 { 1366 int state = NUD_REACHABLE; 1367 struct neighbour *n; 1368 1369 rcu_read_lock(); 1370 1371 n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); 
	if (n)
		state = READ_ONCE(n->nud_state);

	rcu_read_unlock();

	return !!(state & NUD_VALID);
}

/* IPv4 analogue of ipv6_good_nh(): the nexthop is usable when its ARP
 * neighbour entry is NUD_VALID; a missing entry defaults to reachable.
 */
static bool ipv4_good_nh(const struct fib_nh *nh)
{
	int state = NUD_REACHABLE;
	struct neighbour *n;

	rcu_read_lock();

	n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
				      (__force u32)nh->fib_nh_gw4);
	if (n)
		state = READ_ONCE(n->nud_state);

	rcu_read_unlock();

	return !!(state & NUD_VALID);
}

/* Dispatch the per-family liveness check; unknown families are "not good". */
static bool nexthop_is_good_nh(const struct nexthop *nh)
{
	struct nh_info *nhi = rcu_dereference(nh->nh_info);

	switch (nhi->family) {
	case AF_INET:
		return ipv4_good_nh(&nhi->fib_nh);
	case AF_INET6:
		return ipv6_good_nh(&nhi->fib6_nh);
	}

	return false;
}

/* Select a member of an FDB group by hash: first entry whose upper bound
 * covers @hash wins. No neighbour liveness check is done for FDB groups.
 */
static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash)
{
	int i;

	for (i = 0; i < nhg->num_nh; i++) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		if (hash > atomic_read(&nhge->hthr.upper_bound))
			continue;

		nh_grp_entry_stats_inc(nhge);
		return nhge->nh;
	}

	WARN_ON_ONCE(1);
	return NULL;
}

/* Hash-threshold selection: pick the hash-designated member among those
 * whose neighbour is good; fall back to the first good member, and as a
 * last resort to entry 0.
 */
static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
{
	struct nh_grp_entry *nhge0 = NULL;
	int i;

	if (nhg->fdb_nh)
		return nexthop_select_path_fdb(nhg, hash);

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		/* nexthops always check if it is good and does
		 * not rely on a sysctl for this behavior
		 */
		if (!nexthop_is_good_nh(nhge->nh))
			continue;

		/* Remember the first good entry as the fallback. */
		if (!nhge0)
			nhge0 = nhge;

		if (hash > atomic_read(&nhge->hthr.upper_bound))
			continue;

		nh_grp_entry_stats_inc(nhge);
		return nhge->nh;
	}

	if (!nhge0)
		nhge0 = &nhg->nh_entries[0];
	nh_grp_entry_stats_inc(nhge0);
	return nhge0->nh;
}

/* Resilient selection: the hash indexes a fixed bucket table; the bucket
 * is marked busy so the upkeep logic sees datapath activity.
 */
static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
{
	struct nh_res_table *res_table = rcu_dereference(nhg->res_table);
	u16 bucket_index = hash % res_table->num_nh_buckets;
	struct nh_res_bucket *bucket;
	struct nh_grp_entry *nhge;

	/* nexthop_select_path() is expected to return a non-NULL value, so
	 * skip protocol validation and just hand out whatever there is.
	 */
	bucket = &res_table->nh_buckets[bucket_index];
	nh_res_bucket_set_busy(bucket);
	nhge = rcu_dereference(bucket->nh_entry);
	nh_grp_entry_stats_inc(nhge);
	return nhge->nh;
}

/* Datapath entry point: resolve @nh (possibly a group) to a concrete
 * nexthop for @hash. Called under RCU.
 */
struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
{
	struct nh_group *nhg;

	if (!nh->is_group)
		return nh;

	nhg = rcu_dereference(nh->nh_grp);
	if (nhg->hash_threshold)
		return nexthop_select_path_hthr(nhg, hash);
	else if (nhg->resilient)
		return nexthop_select_path_res(nhg, hash);

	/* Unreachable. */
	return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_select_path);

/* Invoke @cb on the fib6_nh of @nh, or of every member if @nh is a group.
 * Stops and returns the first non-zero value from @cb.
 */
int nexthop_for_each_fib6_nh(struct nexthop *nh,
			     int (*cb)(struct fib6_nh *nh, void *arg),
			     void *arg)
{
	struct nh_info *nhi;
	int err;

	if (nh->is_group) {
		struct nh_group *nhg;
		int i;

		nhg = rcu_dereference_rtnl(nh->nh_grp);
		for (i = 0; i < nhg->num_nh; i++) {
			struct nh_grp_entry *nhge = &nhg->nh_entries[i];

			nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
			err = cb(&nhi->fib6_nh, arg);
			if (err)
				return err;
		}
	} else {
		nhi = rcu_dereference_rtnl(nh->nh_info);
		err = cb(&nhi->fib6_nh, arg);
		if (err)
			return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);

/* Reject IPv6 routes that carry a source address; they are incompatible
 * with shared nexthop objects (see comment in fib6_check_nexthop()).
 */
static int check_src_addr(const struct in6_addr *saddr,
			  struct netlink_ext_ack *extack)
{
	if (!ipv6_addr_any(saddr)) {
		NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
		return -EINVAL;
	}
	return 0;
}

/* Verify that an IPv6 route (@cfg, may be NULL) may use nexthop @nh:
 * no source routing, no IPv4 members, no FDB nexthops.
 */
int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
		       struct netlink_ext_ack *extack)
{
	struct nh_info *nhi;
	bool is_fdb_nh;

	/* fib6_src is unique to a fib6_info and limits the ability to cache
	 * routes in fib6_nh within a nexthop that is potentially shared
	 * across multiple fib entries. If the config wants to use source
	 * routing it can not use nexthop objects. mlxsw also does not allow
	 * fib6_src on routes.
	 */
	if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
		return -EINVAL;

	if (nh->is_group) {
		struct nh_group *nhg;

		nhg = rcu_dereference_rtnl(nh->nh_grp);
		if (nhg->has_v4)
			goto no_v4_nh;
		is_fdb_nh = nhg->fdb_nh;
	} else {
		nhi = rcu_dereference_rtnl(nh->nh_info);
		if (nhi->family == AF_INET)
			goto no_v4_nh;
		is_fdb_nh = nhi->fdb_nh;
	}

	if (is_fdb_nh) {
		NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
		return -EINVAL;
	}

	return 0;
no_v4_nh:
	NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(fib6_check_nexthop);

/* if existing nexthop has ipv6 routes linked to it, need
 * to verify this new spec works with ipv6
 */
static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
			      struct netlink_ext_ack *extack)
{
	struct fib6_info *f6i;

	if (list_empty(&old->f6i_list))
		return 0;

	list_for_each_entry(f6i, &old->f6i_list, nh_list) {
		if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
			return -EINVAL;
	}

	/* new nexthop must be usable by all IPv6 routes on the old one */
	return fib6_check_nexthop(new, NULL, extack);
}

/* Check that nexthop @nhi is compatible with route scope @scope:
 * host-scope routes may not have a gateway, and onlink nexthops are
 * incompatible with link-or-narrower scope.
 */
static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
			       struct netlink_ext_ack *extack)
{
	if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
		NL_SET_ERR_MSG(extack,
			       "Route with host scope can not have a gateway");
		return -EINVAL;
	}

	if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
		NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
		return -EINVAL;
	}

	return 0;
}

/* Invoked by fib add code to verify nexthop by id is ok with
 * config for prefix; parts of fib_check_nh not done when nexthop
 * object is used.
 */
int fib_check_nexthop(struct nexthop *nh, u8 scope,
		      struct netlink_ext_ack *extack)
{
	struct nh_info *nhi;
	int err = 0;

	if (nh->is_group) {
		struct nh_group *nhg;

		nhg = rtnl_dereference(nh->nh_grp);
		if (nhg->fdb_nh) {
			NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
			err = -EINVAL;
			goto out;
		}

		if (scope == RT_SCOPE_HOST) {
			NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
			err = -EINVAL;
			goto out;
		}

		/* all nexthops in a group have the same scope */
		nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
		err = nexthop_check_scope(nhi, scope, extack);
	} else {
		nhi = rtnl_dereference(nh->nh_info);
		if (nhi->fdb_nh) {
			NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
			err = -EINVAL;
			goto out;
		}
		err = nexthop_check_scope(nhi, scope, extack);
	}

out:
	return err;
}

/* Verify that replacement nexthop @new is acceptable to every IPv4 fib
 * entry currently using @old.
 */
static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
			     struct netlink_ext_ack *extack)
{
	struct fib_info *fi;

	list_for_each_entry(fi, &old->fi_list, nh_list) {
		int err;

		err = fib_check_nexthop(new, fi->fib_scope, extack);
		if (err)
			return err;
	}
	return 0;
}

/* A group entry is balanced when it holds exactly as many buckets as the
 * rebalance pass said it wants.
 */
static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
{
	return nhge->res.count_buckets == nhge->res.wants_buckets;
}

/* Overweight: the entry holds more buckets than it wants. */
static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
{
	return nhge->res.count_buckets > nhge->res.wants_buckets;
}

/* Underweight: the entry holds fewer buckets than it wants. */
static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
{
	return nhge->res.count_buckets < nhge->res.wants_buckets;
}

/* The table is balanced when no entry is underweight. */
static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
{
	return list_empty(&res_table->uw_nh_entries);
}

static void
nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
{
	struct nh_grp_entry *nhge;

	/* Detach the bucket from its current entry and drop that entry's
	 * bucket count; a no-op for already-vacant buckets.
	 */
	if (bucket->occupied) {
		nhge = nh_res_dereference(bucket->nh_entry);
		nhge->res.count_buckets--;
		bucket->occupied = false;
	}
}

/* Point @bucket at @nhge, releasing any previous owner first and keeping
 * the per-entry bucket counters in sync.
 */
static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
				 struct nh_grp_entry *nhge)
{
	nh_res_bucket_unset_nh(bucket);

	bucket->occupied = true;
	rcu_assign_pointer(bucket->nh_entry, nhge);
	nhge->res.count_buckets++;
}

/* Decide whether @bucket should be migrated to an underweight entry.
 * On a "no" answer, *deadline may be tightened to the next time the
 * decision could change; on "yes", *force reports whether the driver may
 * veto the migration (false) or must accept it (true).
 */
static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
					 struct nh_res_bucket *bucket,
					 unsigned long *deadline, bool *force)
{
	unsigned long now = jiffies;
	struct nh_grp_entry *nhge;
	unsigned long idle_point;

	if (!bucket->occupied) {
		/* The bucket is not occupied, its NHGE pointer is either
		 * NULL or obsolete. We _have to_ migrate: set force.
		 */
		*force = true;
		return true;
	}

	nhge = nh_res_dereference(bucket->nh_entry);

	/* If the bucket is populated by an underweight or balanced
	 * nexthop, do not migrate.
	 */
	if (!nh_res_nhge_is_ow(nhge))
		return false;

	/* At this point we know that the bucket is populated with an
	 * overweight nexthop. It needs to be migrated to a new nexthop if
	 * the idle timer or unbalanced timer expired.
	 */

	idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
	if (time_after_eq(now, idle_point)) {
		/* The bucket is idle. We _can_ migrate: unset force. */
		*force = false;
		return true;
	}

	/* Unbalanced timer of 0 means "never force". */
	if (res_table->unbalanced_timer) {
		unsigned long unb_point;

		unb_point = nh_res_table_unb_point(res_table);
		if (time_after(now, unb_point)) {
			/* The bucket is not idle, but the unbalanced timer
			 * expired. We _can_ migrate, but set force anyway,
			 * so that drivers know to ignore activity reports
			 * from the HW.
			 */
			*force = true;
			return true;
		}

		nh_res_time_set_deadline(unb_point, deadline);
	}

	nh_res_time_set_deadline(idle_point, deadline);
	return false;
}

/* Migrate one bucket to the first underweight entry. @notify drives the
 * in-kernel (driver) notifier chain, @notify_nl the netlink notification.
 * Returns false if no migration happened (no underweight entry, or a
 * driver vetoed a non-forced replacement).
 */
static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
				  u16 bucket_index, bool notify,
				  bool notify_nl, bool force)
{
	struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
	struct nh_grp_entry *new_nhge;
	struct netlink_ext_ack extack;
	int err;

	new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
					    struct nh_grp_entry,
					    res.uw_nh_entry);
	if (WARN_ON_ONCE(!new_nhge))
		/* If this function is called, "bucket" is either not
		 * occupied, or it belongs to a next hop that is
		 * overweight. In either case, there ought to be a
		 * corresponding underweight next hop.
		 */
		return false;

	if (notify) {
		struct nh_grp_entry *old_nhge;

		old_nhge = nh_res_dereference(bucket->nh_entry);
		err = call_nexthop_res_bucket_notifiers(res_table->net,
							res_table->nhg_id,
							bucket_index, force,
							old_nhge->nh,
							new_nhge->nh, &extack);
		if (err) {
			pr_err_ratelimited("%s\n", extack._msg);
			if (!force)
				return false;
			/* It is not possible to veto a forced replacement, so
			 * just clear the hardware flags from the nexthop
			 * bucket to indicate to user space that this bucket is
			 * not correctly populated in hardware.
			 */
			bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
		}
	}

	nh_res_bucket_set_nh(bucket, new_nhge);
	nh_res_bucket_set_idle(res_table, bucket);

	if (notify_nl)
		nexthop_bucket_notify(res_table, bucket_index);

	/* Once the entry reached its wanted bucket count, it leaves the
	 * underweight list.
	 */
	if (nh_res_nhge_is_balanced(new_nhge))
		list_del(&new_nhge->res.uw_nh_entry);
	return true;
}

#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL	(HZ / 2)

/* One pass of resilient-table maintenance: migrate every bucket that is
 * allowed to move, then reschedule the delayed work if still unbalanced.
 */
static void nh_res_table_upkeep(struct nh_res_table *res_table,
				bool notify, bool notify_nl)
{
	unsigned long now = jiffies;
	unsigned long deadline;
	u16 i;

	/* Deadline is the next time that upkeep should be run. It is the
	 * earliest time at which one of the buckets might be migrated.
	 * Start at the most pessimistic estimate: either unbalanced_timer
	 * from now, or if there is none, idle_timer from now. For each
	 * encountered time point, call nh_res_time_set_deadline() to
	 * refine the estimate.
	 */
	if (res_table->unbalanced_timer)
		deadline = now + res_table->unbalanced_timer;
	else
		deadline = now + res_table->idle_timer;

	for (i = 0; i < res_table->num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		bool force;

		if (nh_res_bucket_should_migrate(res_table, bucket,
						 &deadline, &force)) {
			if (!nh_res_bucket_migrate(res_table, i, notify,
						   notify_nl, force)) {
				unsigned long idle_point;

				/* A driver can override the migration
				 * decision if the HW reports that the
				 * bucket is actually not idle. Therefore
				 * remark the bucket as busy again and
				 * update the deadline.
				 */
				nh_res_bucket_set_busy(bucket);
				idle_point = nh_res_bucket_idle_point(res_table,
								      bucket,
								      now);
				nh_res_time_set_deadline(idle_point, &deadline);
			}
		}
	}

	/* If the group is still unbalanced, schedule the next upkeep to
	 * either the deadline computed above, or the minimum deadline,
	 * whichever comes later.
	 */
	if (!nh_res_table_is_balanced(res_table)) {
		unsigned long now = jiffies;
		unsigned long min_deadline;

		min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
		if (time_before(deadline, min_deadline))
			deadline = min_deadline;

		queue_delayed_work(system_power_efficient_wq,
				   &res_table->upkeep_dw, deadline - now);
	}
}

/* Delayed-work entry point for periodic resilient-table upkeep. */
static void nh_res_table_upkeep_dw(struct work_struct *work)
{
	struct delayed_work *dw = to_delayed_work(work);
	struct nh_res_table *res_table;

	res_table = container_of(dw, struct nh_res_table, upkeep_dw);
	nh_res_table_upkeep(res_table, true, true);
}

/* Synchronously stop any pending/running upkeep work for @res_table. */
static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
{
	cancel_delayed_work_sync(&res_table->upkeep_dw);
}

/* Recompute per-entry wanted-bucket counts for a resilient group and
 * rebuild the underweight list. Bucket shares are apportioned by weight
 * using cumulative rounded upper bounds so the totals sum to the table
 * size.
 */
static void nh_res_group_rebalance(struct nh_group *nhg,
				   struct nh_res_table *res_table)
{
	u16 prev_upper_bound = 0;
	u32 total = 0;
	u32 w = 0;
	int i;

	INIT_LIST_HEAD(&res_table->uw_nh_entries);

	for (i = 0; i < nhg->num_nh; ++i)
		total += nhg->nh_entries[i].weight;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		u16 upper_bound;
		u64 btw;

		w += nhge->weight;
		/* 64-bit intermediate: buckets * cumulative weight. */
		btw = ((u64)res_table->num_nh_buckets) * w;
		upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total);
		nhge->res.wants_buckets = upper_bound - prev_upper_bound;
		prev_upper_bound = upper_bound;

		if (nh_res_nhge_is_uw(nhge)) {
			if (list_empty(&res_table->uw_nh_entries))
				res_table->unbalanced_since = jiffies;
			list_add(&nhge->res.uw_nh_entry,
				 &res_table->uw_nh_entries);
		}
	}
}

/* Migrate buckets in res_table so that they reference NHGE's from NHG with
 * the right NH ID. Set those buckets that do not have a corresponding NHGE
 * entry in NHG as not occupied.
 */
static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
					 struct nh_group *nhg)
{
	u16 i;

	for (i = 0; i < res_table->num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		u32 id = rtnl_dereference(bucket->nh_entry)->nh->id;
		bool found = false;
		int j;

		for (j = 0; j < nhg->num_nh; j++) {
			struct nh_grp_entry *nhge = &nhg->nh_entries[j];

			if (nhge->nh->id == id) {
				nh_res_bucket_set_nh(bucket, nhge);
				found = true;
				break;
			}
		}

		if (!found)
			nh_res_bucket_unset_nh(bucket);
	}
}

/* Re-seat a resilient group's bucket table on the new group's entries and
 * rebalance, preserving the original unbalanced_since timestamp when the
 * table was unbalanced both before and after.
 */
static void replace_nexthop_grp_res(struct nh_group *oldg,
				    struct nh_group *newg)
{
	/* For NH group replacement, the new NHG might only have a stub
	 * hash table with 0 buckets, because the number of buckets was not
	 * specified. For NH removal, oldg and newg both reference the same
	 * res_table. So in any case, in the following, we want to work
	 * with oldg->res_table.
	 */
	struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table);
	unsigned long prev_unbalanced_since = old_res_table->unbalanced_since;
	bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);

	nh_res_table_cancel_upkeep(old_res_table);
	nh_res_table_migrate_buckets(old_res_table, newg);
	nh_res_group_rebalance(newg, old_res_table);
	/* Rebalance reset the timestamp; restore it if the table was
	 * already unbalanced before the replacement.
	 */
	if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
		old_res_table->unbalanced_since = prev_unbalanced_since;
	nh_res_table_upkeep(old_res_table, true, false);
}

/* Recompute the hash-threshold upper bounds from the entries' weights.
 * Bounds are cumulative shares of the 31-bit hash space, minus one so the
 * datapath comparison (hash > upper_bound) works.
 */
static void nh_hthr_group_rebalance(struct nh_group *nhg)
{
	u32 total = 0;
	u32 w = 0;
	int i;

	for (i = 0; i < nhg->num_nh; ++i)
		total += nhg->nh_entries[i].weight;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		u32 upper_bound;

		w += nhge->weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
		atomic_set(&nhge->hthr.upper_bound, upper_bound);
	}
}

/* Remove one entry from its group by publishing the group's spare copy
 * without that entry (RCU swap), then notifying drivers and user space.
 * Removing the last entry removes the parent group nexthop itself.
 */
static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
				struct nl_info *nlinfo)
{
	struct nh_grp_entry *nhges, *new_nhges;
	struct nexthop *nhp = nhge->nh_parent;
	struct netlink_ext_ack extack;
	struct nexthop *nh = nhge->nh;
	struct nh_group *nhg, *newg;
	int i, j, err;

	WARN_ON(!nh);

	nhg = rtnl_dereference(nhp->nh_grp);
	newg = nhg->spare;

	/* last entry, keep it visible and remove the parent */
	if (nhg->num_nh == 1) {
		remove_nexthop(net, nhp, nlinfo);
		return;
	}

	newg->has_v4 = false;
	newg->is_multipath = nhg->is_multipath;
	newg->hash_threshold = nhg->hash_threshold;
	newg->resilient = nhg->resilient;
	newg->fdb_nh = nhg->fdb_nh;
	newg->num_nh = nhg->num_nh;

	/* copy old entries to new except the one getting removed */
	nhges = nhg->nh_entries;
	new_nhges = newg->nh_entries;
	for (i = 0, j = 0; i < nhg->num_nh; ++i) {
		struct nh_info *nhi;

		/* current nexthop getting removed */
		if (nhg->nh_entries[i].nh == nh) {
			newg->num_nh--;
			continue;
		}

		nhi = rtnl_dereference(nhges[i].nh->nh_info);
		if (nhi->family == AF_INET)
			newg->has_v4 = true;

		list_del(&nhges[i].nh_list);
		new_nhges[j].stats = nhges[i].stats;
		new_nhges[j].nh_parent = nhges[i].nh_parent;
		new_nhges[j].nh = nhges[i].nh;
		new_nhges[j].weight = nhges[i].weight;
		list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
		j++;
	}

	if (newg->hash_threshold)
		nh_hthr_group_rebalance(newg);
	else if (newg->resilient)
		replace_nexthop_grp_res(nhg, newg);

	rcu_assign_pointer(nhp->nh_grp, newg);

	list_del(&nhge->nh_list);
	free_percpu(nhge->stats);
	nexthop_put(nhge->nh);

	/* Removal of a NH from a resilient group is notified through
	 * bucket notifications.
	 */
	if (newg->hash_threshold) {
		err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
					     &extack);
		if (err)
			pr_err("%s\n", extack._msg);
	}

	if (nlinfo)
		nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
}

/* Remove @nh from every group that references it. */
static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
				       struct nl_info *nlinfo)
{
	struct nh_grp_entry *nhge, *tmp;

	/* If there is nothing to do, let's avoid the costly call to
	 * synchronize_net()
	 */
	if (list_empty(&nh->grp_list))
		return;

	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
		remove_nh_grp_entry(net, nhge, nlinfo);

	/* make sure all see the newly published array before releasing rtnl */
	synchronize_net();
}

/* Tear down a group nexthop: unlink all member entries and, for resilient
 * groups, stop the upkeep work.
 */
static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
{
	struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
	struct nh_res_table *res_table;
	int i, num_nh = nhg->num_nh;

	for (i = 0; i < num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		if (WARN_ON(!nhge->nh))
			continue;

		list_del_init(&nhge->nh_list);
	}

	if (nhg->resilient) {
		res_table = rtnl_dereference(nhg->res_table);
		nh_res_table_cancel_upkeep(res_table);
	}
}

/* not called for nexthop replace */
static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
{
	struct fib6_info *f6i;
	bool do_flush = false;
	struct fib_info *fi;

	/* Mark every IPv4 fib entry using this nexthop dead, then flush. */
	list_for_each_entry(fi, &nh->fi_list, nh_list) {
		fi->fib_flags |= RTNH_F_DEAD;
		do_flush = true;
	}
	if (do_flush)
		fib_flush(net);

	spin_lock_bh(&nh->lock);

	nh->dead = true;

	while (!list_empty(&nh->f6i_list)) {
		f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list);

		/* __ip6_del_rt does a release, so do a hold here */
		fib6_info_hold(f6i);
		/* Drop the lock across ip6_del_rt (it may sleep/notify),
		 * then retake it to re-examine the list from the head.
		 */
		spin_unlock_bh(&nh->lock);
		ipv6_stub->ip6_del_rt(net, f6i,
				      !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode));

		spin_lock_bh(&nh->lock);
	}

	spin_unlock_bh(&nh->lock);
}

/* Unlink a nexthop from fib state and from any groups (single nexthops)
 * or tear down its member entries (groups).
 */
static void __remove_nexthop(struct net *net, struct nexthop *nh,
			     struct nl_info *nlinfo)
{
	__remove_nexthop_fib(net, nh);

	if (nh->is_group) {
		remove_nexthop_group(nh, nlinfo);
	} else {
		struct nh_info *nhi;

		nhi = rtnl_dereference(nh->nh_info);
		if (nhi->fib_nhc.nhc_dev)
			hlist_del(&nhi->dev_hash);

		remove_nexthop_from_groups(net, nh, nlinfo);
	}
}

/* Full removal of a nexthop: notify drivers, unlink from the rb-tree,
 * notify user space, tear down state, and drop the tree's reference.
 */
static void remove_nexthop(struct net *net, struct nexthop *nh,
			   struct nl_info *nlinfo)
{
	call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);

	/* remove from the tree */
	rb_erase(&nh->rb_node, &net->nexthop.rb_root);

	if (nlinfo)
		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);

	__remove_nexthop(net, nh, nlinfo);
	nh_base_seq_inc(net);

	nexthop_put(nh);
}

/* if any FIB entries reference this nexthop, any dst entries
 * need to be regenerated
 */
static void nh_rt_cache_flush(struct net *net, struct nexthop *nh,
			      struct nexthop *replaced_nh)
{
	struct fib6_info *f6i;
	struct nh_group *nhg;
	int i;

	if (!list_empty(&nh->fi_list))
		rt_cache_flush(net);

	list_for_each_entry(f6i, &nh->f6i_list, nh_list)
		ipv6_stub->fib6_update_sernum(net, f6i);

	/* if an IPv6 group was replaced, we have to release all old
	 * dsts to make sure all refcounts are released
	 */
	if (!replaced_nh->is_group)
		return;

	nhg = rtnl_dereference(replaced_nh->nh_grp);
	for (i = 0; i < nhg->num_nh; i++) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info);

		if (nhi->family == AF_INET6)
			ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh);
	}
}

/* Replace group @old with group @new in place: the two nexthops swap
 * their nh_grp payloads (and, for resilient groups, keep the old bucket
 * table) so @old retains its ID with the new member set.
 */
static int replace_nexthop_grp(struct net *net, struct nexthop *old,
			       struct nexthop *new, const struct nh_config *cfg,
			       struct netlink_ext_ack *extack)
{
	struct nh_res_table *tmp_table = NULL;
	struct nh_res_table *new_res_table;
	struct nh_res_table *old_res_table;
	struct nh_group *oldg, *newg;
	int i, err;

	if (!new->is_group) {
		NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
		return -EINVAL;
	}

	oldg = rtnl_dereference(old->nh_grp);
	newg = rtnl_dereference(new->nh_grp);

	if (newg->hash_threshold != oldg->hash_threshold) {
		NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
		return -EINVAL;
	}

	if (newg->hash_threshold) {
		err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
					     extack);
		if (err)
			return err;
	} else if (newg->resilient) {
		new_res_table = rtnl_dereference(newg->res_table);
		old_res_table = rtnl_dereference(oldg->res_table);

		/* Accept if num_nh_buckets was not given, but if it was
		 * given, demand that the value be correct.
		 */
		if (cfg->nh_grp_res_has_num_buckets &&
		    cfg->nh_grp_res_num_buckets !=
		    old_res_table->num_nh_buckets) {
			NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
			return -EINVAL;
		}

		/* Emit a pre-replace notification so that listeners could veto
		 * a potentially unsupported configuration. Otherwise,
		 * individual bucket replacement notifications would need to be
		 * vetoed, which is something that should only happen if the
		 * bucket is currently active.
2270 */ 2271 err = call_nexthop_res_table_notifiers(net, new, extack); 2272 if (err) 2273 return err; 2274 2275 if (cfg->nh_grp_res_has_idle_timer) 2276 old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; 2277 if (cfg->nh_grp_res_has_unbalanced_timer) 2278 old_res_table->unbalanced_timer = 2279 cfg->nh_grp_res_unbalanced_timer; 2280 2281 replace_nexthop_grp_res(oldg, newg); 2282 2283 tmp_table = new_res_table; 2284 rcu_assign_pointer(newg->res_table, old_res_table); 2285 rcu_assign_pointer(newg->spare->res_table, old_res_table); 2286 } 2287 2288 /* update parents - used by nexthop code for cleanup */ 2289 for (i = 0; i < newg->num_nh; i++) 2290 newg->nh_entries[i].nh_parent = old; 2291 2292 rcu_assign_pointer(old->nh_grp, newg); 2293 2294 /* Make sure concurrent readers are not using 'oldg' anymore. */ 2295 synchronize_net(); 2296 2297 if (newg->resilient) { 2298 rcu_assign_pointer(oldg->res_table, tmp_table); 2299 rcu_assign_pointer(oldg->spare->res_table, tmp_table); 2300 } 2301 2302 for (i = 0; i < oldg->num_nh; i++) 2303 oldg->nh_entries[i].nh_parent = new; 2304 2305 rcu_assign_pointer(new->nh_grp, oldg); 2306 2307 return 0; 2308 } 2309 2310 static void nh_group_v4_update(struct nh_group *nhg) 2311 { 2312 struct nh_grp_entry *nhges; 2313 bool has_v4 = false; 2314 int i; 2315 2316 nhges = nhg->nh_entries; 2317 for (i = 0; i < nhg->num_nh; i++) { 2318 struct nh_info *nhi; 2319 2320 nhi = rtnl_dereference(nhges[i].nh->nh_info); 2321 if (nhi->family == AF_INET) 2322 has_v4 = true; 2323 } 2324 nhg->has_v4 = has_v4; 2325 } 2326 2327 static int replace_nexthop_single_notify_res(struct net *net, 2328 struct nh_res_table *res_table, 2329 struct nexthop *old, 2330 struct nh_info *oldi, 2331 struct nh_info *newi, 2332 struct netlink_ext_ack *extack) 2333 { 2334 u32 nhg_id = res_table->nhg_id; 2335 int err; 2336 u16 i; 2337 2338 for (i = 0; i < res_table->num_nh_buckets; i++) { 2339 struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; 2340 struct nh_grp_entry 
*nhge; 2341 2342 nhge = rtnl_dereference(bucket->nh_entry); 2343 if (nhge->nh == old) { 2344 err = __call_nexthop_res_bucket_notifiers(net, nhg_id, 2345 i, true, 2346 oldi, newi, 2347 extack); 2348 if (err) 2349 goto err_notify; 2350 } 2351 } 2352 2353 return 0; 2354 2355 err_notify: 2356 while (i-- > 0) { 2357 struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; 2358 struct nh_grp_entry *nhge; 2359 2360 nhge = rtnl_dereference(bucket->nh_entry); 2361 if (nhge->nh == old) 2362 __call_nexthop_res_bucket_notifiers(net, nhg_id, i, 2363 true, newi, oldi, 2364 extack); 2365 } 2366 return err; 2367 } 2368 2369 static int replace_nexthop_single_notify(struct net *net, 2370 struct nexthop *group_nh, 2371 struct nexthop *old, 2372 struct nh_info *oldi, 2373 struct nh_info *newi, 2374 struct netlink_ext_ack *extack) 2375 { 2376 struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); 2377 struct nh_res_table *res_table; 2378 2379 if (nhg->hash_threshold) { 2380 return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, 2381 group_nh, extack); 2382 } else if (nhg->resilient) { 2383 res_table = rtnl_dereference(nhg->res_table); 2384 return replace_nexthop_single_notify_res(net, res_table, 2385 old, oldi, newi, 2386 extack); 2387 } 2388 2389 return -EINVAL; 2390 } 2391 2392 static int replace_nexthop_single(struct net *net, struct nexthop *old, 2393 struct nexthop *new, 2394 struct netlink_ext_ack *extack) 2395 { 2396 u8 old_protocol, old_nh_flags; 2397 struct nh_info *oldi, *newi; 2398 struct nh_grp_entry *nhge; 2399 int err; 2400 2401 if (new->is_group) { 2402 NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); 2403 return -EINVAL; 2404 } 2405 2406 if (!list_empty(&old->grp_list) && 2407 rtnl_dereference(new->nh_info)->fdb_nh != 2408 rtnl_dereference(old->nh_info)->fdb_nh) { 2409 NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group"); 2410 return -EINVAL; 2411 } 2412 2413 err = call_nexthop_notifiers(net, 
				     NEXTHOP_EVENT_REPLACE, new, extack);
	if (err)
		return err;

	/* Hardware flags were set on 'old' as 'new' is not in the red-black
	 * tree. Therefore, inherit the flags from 'old' to 'new'.
	 */
	new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);

	oldi = rtnl_dereference(old->nh_info);
	newi = rtnl_dereference(new->nh_info);

	/* Swap the nh_info payloads between 'old' and 'new'; save the old
	 * protocol/flags so the swap can be undone on notifier veto below.
	 */
	newi->nh_parent = old;
	oldi->nh_parent = new;

	old_protocol = old->protocol;
	old_nh_flags = old->nh_flags;

	old->protocol = new->protocol;
	old->nh_flags = new->nh_flags;

	rcu_assign_pointer(old->nh_info, newi);
	rcu_assign_pointer(new->nh_info, oldi);

	/* Send a replace notification for all the groups using the nexthop. */
	list_for_each_entry(nhge, &old->grp_list, nh_list) {
		struct nexthop *nhp = nhge->nh_parent;

		err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
						    extack);
		if (err)
			goto err_notify;
	}

	/* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
	 * update IPv4 indication in all the groups using the nexthop.
	 */
	if (oldi->family == AF_INET && newi->family == AF_INET6) {
		list_for_each_entry(nhge, &old->grp_list, nh_list) {
			struct nexthop *nhp = nhge->nh_parent;
			struct nh_group *nhg;

			nhg = rtnl_dereference(nhp->nh_grp);
			nh_group_v4_update(nhg);
		}
	}

	return 0;

err_notify:
	/* Undo the payload swap, then re-notify the groups already processed
	 * (in reverse) with old/new roles inverted, and finally re-announce
	 * the restored 'old'.
	 */
	rcu_assign_pointer(new->nh_info, newi);
	rcu_assign_pointer(old->nh_info, oldi);
	old->nh_flags = old_nh_flags;
	old->protocol = old_protocol;
	oldi->nh_parent = old;
	newi->nh_parent = new;
	list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
		struct nexthop *nhp = nhge->nh_parent;

		replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
	}
	call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
	return err;
}

/* Re-announce (RTM_NEWROUTE) all IPv4 and IPv6 routes that resolve through
 * nexthop 'nh'. Runs under RTNL.
 */
static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
				     struct nl_info *info)
{
	struct fib6_info *f6i;

	if (!list_empty(&nh->fi_list)) {
		struct fib_info *fi;

		/* expectation is a few fib_info per nexthop and then
		 * a lot of routes per fib_info.
		 So mark the fib_info
		 * and then walk the fib tables once
		 */
		list_for_each_entry(fi, &nh->fi_list, nh_list)
			fi->nh_updated = true;

		fib_info_notify_update(net, info);

		list_for_each_entry(fi, &nh->fi_list, nh_list)
			fi->nh_updated = false;
	}

	list_for_each_entry(f6i, &nh->f6i_list, nh_list)
		ipv6_stub->fib6_rt_update(net, f6i, info);
}

/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
 * linked to this nexthop and for all groups that the nexthop
 * is a member of
 */
static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
				   struct nl_info *info)
{
	struct nh_grp_entry *nhge;

	__nexthop_replace_notify(net, nh, info);

	list_for_each_entry(nhge, &nh->grp_list, nh_list)
		__nexthop_replace_notify(net, nhge->nh_parent, info);
}

/* Replace nexthop 'old' with 'new' (group or single) under RTNL.
 * Validates that all FIB entries and all containing groups accept the new
 * definition first, then dispatches to the group/single replace helper.
 * On success the temporary 'new' object is released; 'old' keeps the id.
 */
static int replace_nexthop(struct net *net, struct nexthop *old,
			   struct nexthop *new, const struct nh_config *cfg,
			   struct netlink_ext_ack *extack)
{
	bool new_is_reject = false;
	struct nh_grp_entry *nhge;
	int err;

	/* check that existing FIB entries are ok with the
	 * new nexthop definition
	 */
	err = fib_check_nh_list(old, new, extack);
	if (err)
		return err;

	err = fib6_check_nh_list(old, new, extack);
	if (err)
		return err;

	if (!new->is_group) {
		struct nh_info *nhi = rtnl_dereference(new->nh_info);

		new_is_reject = nhi->reject_nh;
	}

	list_for_each_entry(nhge, &old->grp_list, nh_list) {
		/* if new nexthop is a blackhole, any groups using this
		 * nexthop cannot have more than 1 path
		 */
		if (new_is_reject &&
		    nexthop_num_path(nhge->nh_parent) > 1) {
			NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
			return -EINVAL;
		}

		err = fib_check_nh_list(nhge->nh_parent, new, extack);
		if (err)
			return err;

		err = fib6_check_nh_list(nhge->nh_parent, new, extack);
		if (err)
			return err;
	}

	if (old->is_group)
		err = replace_nexthop_grp(net, old, new, cfg, extack);
	else
		err = replace_nexthop_single(net, old, new, extack);

	if (!err) {
		nh_rt_cache_flush(net, old, new);

		/* 'new' now holds the replaced-out payload; drop it. */
		__remove_nexthop(net, new, NULL);
		nexthop_put(new);
	}

	return err;
}

/* called with rtnl_lock held */
/* Insert 'new_nh' into the per-netns rb-tree keyed by nexthop id, or replace
 * an existing entry when NLM_F_REPLACE is set. Sends the netlink and
 * in-kernel notifications on success.
 */
static int insert_nexthop(struct net *net, struct nexthop *new_nh,
			  struct nh_config *cfg, struct netlink_ext_ack *extack)
{
	struct rb_node **pp, *parent = NULL, *next;
	struct rb_root *root = &net->nexthop.rb_root;
	bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
	bool create = !!(cfg->nlflags & NLM_F_CREATE);
	u32 new_id = new_nh->id;
	int replace_notify = 0;
	int rc = -EEXIST;

	/* Standard rb-tree descent by id. */
	pp = &root->rb_node;
	while (1) {
		struct nexthop *nh;

		next = *pp;
		if (!next)
			break;

		parent = next;

		nh = rb_entry(parent, struct nexthop, rb_node);
		if (new_id < nh->id) {
			pp = &next->rb_left;
		} else if (new_id > nh->id) {
			pp = &next->rb_right;
		} else if (replace) {
			rc = replace_nexthop(net, nh, new_nh, cfg, extack);
			if (!rc) {
				new_nh = nh; /* send notification with old nh */
				replace_notify = 1;
			}
			goto out;
		} else {
			/* id already exists and not a replace */
			goto out;
		}
	}

	if (replace && !create) {
		NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
		rc = -ENOENT;
		goto out;
	}

	if (new_nh->is_group) {
		struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
		struct nh_res_table *res_table;

		if (nhg->resilient) {
			res_table = rtnl_dereference(nhg->res_table);

			/* Not passing the number of buckets is OK when
			 * replacing, but not
			 when creating a new group.
			 */
			if (!cfg->nh_grp_res_has_num_buckets) {
				NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
				rc = -EINVAL;
				goto out;
			}

			nh_res_group_rebalance(nhg, res_table);

			/* Do not send bucket notifications, we do full
			 * notification below.
			 */
			nh_res_table_upkeep(res_table, false, false);
		}
	}

	rb_link_node_rcu(&new_nh->rb_node, parent, pp);
	rb_insert_color(&new_nh->rb_node, root);

	/* The initial insertion is a full notification for hash-threshold as
	 * well as resilient groups.
	 */
	rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
	if (rc)
		rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);

out:
	if (!rc) {
		nh_base_seq_inc(net);
		nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
		if (replace_notify &&
		    READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode))
			nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
	}

	return rc;
}

/* rtnl */
/* remove all nexthops tied to a device being deleted */
static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
{
	unsigned int hash = nh_dev_hashfn(dev->ifindex);
	struct net *net = dev_net(dev);
	struct hlist_head *head = &net->nexthop.devhash[hash];
	struct hlist_node *n;
	struct nh_info *nhi;

	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
		if (nhi->fib_nhc.nhc_dev != dev)
			continue;

		/* Blackhole nexthops sit on the loopback device; keep them
		 * across DOWN/CHANGE events on it.
		 */
		if (nhi->reject_nh &&
		    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
			continue;

		remove_nexthop(net, nhi->nh_parent, NULL);
	}
}

/* rtnl; called when net namespace is deleted */
static void flush_all_nexthops(struct net *net)
{
	struct rb_root *root = &net->nexthop.rb_root;
	struct rb_node *node;
	struct nexthop *nh;

	/* Removal also erases from the tree, so keep taking the first node
	 * until the tree is empty.
	 */
	while ((node = rb_first(root))) {
		nh = rb_entry(node, struct nexthop, rb_node);
		remove_nexthop(net, nh, NULL);
		cond_resched();
	}
}

/* Build a new nexthop group object from the NHA_GROUP request data.
 * Takes a reference on every member nexthop and allocates per-entry
 * per-CPU stats; on error all acquired references/allocations are undone.
 * Returns the new nexthop or ERR_PTR().
 */
static struct nexthop *nexthop_create_group(struct net *net,
					    struct nh_config *cfg)
{
	struct nlattr *grps_attr = cfg->nh_grp;
	struct nexthop_grp *entry = nla_data(grps_attr);
	u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
	struct nh_group *nhg;
	struct nexthop *nh;
	int err;
	int i;

	nh = nexthop_alloc();
	if (!nh)
		return ERR_PTR(-ENOMEM);

	nh->is_group = 1;

	nhg = nexthop_grp_alloc(num_nh);
	if (!nhg) {
		kfree(nh);
		return ERR_PTR(-ENOMEM);
	}

	/* spare group used for removals */
	nhg->spare = nexthop_grp_alloc(num_nh);
	if (!nhg->spare) {
		kfree(nhg);
		kfree(nh);
		return ERR_PTR(-ENOMEM);
	}
	nhg->spare->spare = nhg;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nexthop *nhe;
		struct nh_info *nhi;

		nhe = nexthop_find_by_id(net, entry[i].id);
		if (!nexthop_get(nhe)) {
			err = -ENOENT;
			goto out_no_nh;
		}

		nhi = rtnl_dereference(nhe->nh_info);
		if (nhi->family == AF_INET)
			nhg->has_v4 = true;

		nhg->nh_entries[i].stats =
			netdev_alloc_pcpu_stats(struct nh_grp_entry_stats);
		if (!nhg->nh_entries[i].stats) {
			err = -ENOMEM;
			nexthop_put(nhe);
			goto out_no_nh;
		}
		nhg->nh_entries[i].nh = nhe;
		nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]);

		list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
		nhg->nh_entries[i].nh_parent = nh;
	}

	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
		nhg->hash_threshold = 1;
		nhg->is_multipath = true;
	} else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
		struct nh_res_table *res_table;

		res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
		if (!res_table) {
			err = -ENOMEM;
			goto
out_no_nh; 2776 } 2777 2778 rcu_assign_pointer(nhg->spare->res_table, res_table); 2779 rcu_assign_pointer(nhg->res_table, res_table); 2780 nhg->resilient = true; 2781 nhg->is_multipath = true; 2782 } 2783 2784 WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); 2785 2786 if (nhg->hash_threshold) 2787 nh_hthr_group_rebalance(nhg); 2788 2789 if (cfg->nh_fdb) 2790 nhg->fdb_nh = 1; 2791 2792 if (cfg->nh_hw_stats) 2793 nhg->hw_stats = true; 2794 2795 rcu_assign_pointer(nh->nh_grp, nhg); 2796 2797 return nh; 2798 2799 out_no_nh: 2800 for (i--; i >= 0; --i) { 2801 list_del(&nhg->nh_entries[i].nh_list); 2802 free_percpu(nhg->nh_entries[i].stats); 2803 nexthop_put(nhg->nh_entries[i].nh); 2804 } 2805 2806 kfree(nhg->spare); 2807 kfree(nhg); 2808 kfree(nh); 2809 2810 return ERR_PTR(err); 2811 } 2812 2813 static int nh_create_ipv4(struct net *net, struct nexthop *nh, 2814 struct nh_info *nhi, struct nh_config *cfg, 2815 struct netlink_ext_ack *extack) 2816 { 2817 struct fib_nh *fib_nh = &nhi->fib_nh; 2818 struct fib_config fib_cfg = { 2819 .fc_oif = cfg->nh_ifindex, 2820 .fc_gw4 = cfg->gw.ipv4, 2821 .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, 2822 .fc_flags = cfg->nh_flags, 2823 .fc_nlinfo = cfg->nlinfo, 2824 .fc_encap = cfg->nh_encap, 2825 .fc_encap_type = cfg->nh_encap_type, 2826 }; 2827 u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); 2828 int err; 2829 2830 err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); 2831 if (err) { 2832 fib_nh_release(net, fib_nh); 2833 goto out; 2834 } 2835 2836 if (nhi->fdb_nh) 2837 goto out; 2838 2839 /* sets nh_dev if successful */ 2840 err = fib_check_nh(net, fib_nh, tb_id, 0, extack); 2841 if (!err) { 2842 nh->nh_flags = fib_nh->fib_nh_flags; 2843 fib_info_update_nhc_saddr(net, &fib_nh->nh_common, 2844 !fib_nh->fib_nh_scope ? 
0 : fib_nh->fib_nh_scope - 1); 2845 } else { 2846 fib_nh_release(net, fib_nh); 2847 } 2848 out: 2849 return err; 2850 } 2851 2852 static int nh_create_ipv6(struct net *net, struct nexthop *nh, 2853 struct nh_info *nhi, struct nh_config *cfg, 2854 struct netlink_ext_ack *extack) 2855 { 2856 struct fib6_nh *fib6_nh = &nhi->fib6_nh; 2857 struct fib6_config fib6_cfg = { 2858 .fc_table = l3mdev_fib_table(cfg->dev), 2859 .fc_ifindex = cfg->nh_ifindex, 2860 .fc_gateway = cfg->gw.ipv6, 2861 .fc_flags = cfg->nh_flags, 2862 .fc_nlinfo = cfg->nlinfo, 2863 .fc_encap = cfg->nh_encap, 2864 .fc_encap_type = cfg->nh_encap_type, 2865 .fc_is_fdb = cfg->nh_fdb, 2866 }; 2867 int err; 2868 2869 if (!ipv6_addr_any(&cfg->gw.ipv6)) 2870 fib6_cfg.fc_flags |= RTF_GATEWAY; 2871 2872 /* sets nh_dev if successful */ 2873 err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, 2874 extack); 2875 if (err) { 2876 /* IPv6 is not enabled, don't call fib6_nh_release */ 2877 if (err == -EAFNOSUPPORT) 2878 goto out; 2879 ipv6_stub->fib6_nh_release(fib6_nh); 2880 } else { 2881 nh->nh_flags = fib6_nh->fib_nh_flags; 2882 } 2883 out: 2884 return err; 2885 } 2886 2887 static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, 2888 struct netlink_ext_ack *extack) 2889 { 2890 struct nh_info *nhi; 2891 struct nexthop *nh; 2892 int err = 0; 2893 2894 nh = nexthop_alloc(); 2895 if (!nh) 2896 return ERR_PTR(-ENOMEM); 2897 2898 nhi = kzalloc_obj(*nhi); 2899 if (!nhi) { 2900 kfree(nh); 2901 return ERR_PTR(-ENOMEM); 2902 } 2903 2904 nh->nh_flags = cfg->nh_flags; 2905 nh->net = net; 2906 2907 nhi->nh_parent = nh; 2908 nhi->family = cfg->nh_family; 2909 nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; 2910 2911 if (cfg->nh_fdb) 2912 nhi->fdb_nh = 1; 2913 2914 if (cfg->nh_blackhole) { 2915 nhi->reject_nh = 1; 2916 cfg->nh_ifindex = net->loopback_dev->ifindex; 2917 } 2918 2919 switch (cfg->nh_family) { 2920 case AF_INET: 2921 err = nh_create_ipv4(net, nh, nhi, cfg, extack); 2922 break; 2923 case 
AF_INET6: 2924 err = nh_create_ipv6(net, nh, nhi, cfg, extack); 2925 break; 2926 } 2927 2928 if (err) { 2929 kfree(nhi); 2930 kfree(nh); 2931 return ERR_PTR(err); 2932 } 2933 2934 /* add the entry to the device based hash */ 2935 if (!nhi->fdb_nh) 2936 nexthop_devhash_add(net, nhi); 2937 2938 rcu_assign_pointer(nh->nh_info, nhi); 2939 2940 return nh; 2941 } 2942 2943 /* called with rtnl lock held */ 2944 static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, 2945 struct netlink_ext_ack *extack) 2946 { 2947 struct nexthop *nh; 2948 int err; 2949 2950 if (!cfg->nh_id) { 2951 cfg->nh_id = nh_find_unused_id(net); 2952 if (!cfg->nh_id) { 2953 NL_SET_ERR_MSG(extack, "No unused id"); 2954 return ERR_PTR(-EINVAL); 2955 } 2956 } 2957 2958 if (cfg->nh_grp) 2959 nh = nexthop_create_group(net, cfg); 2960 else 2961 nh = nexthop_create(net, cfg, extack); 2962 2963 if (IS_ERR(nh)) 2964 return nh; 2965 2966 refcount_set(&nh->refcnt, 1); 2967 nh->id = cfg->nh_id; 2968 nh->protocol = cfg->nh_protocol; 2969 nh->net = net; 2970 2971 err = insert_nexthop(net, nh, cfg, extack); 2972 if (err) { 2973 __remove_nexthop(net, nh, NULL); 2974 nexthop_put(nh); 2975 nh = ERR_PTR(err); 2976 } 2977 2978 return nh; 2979 } 2980 2981 static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, 2982 unsigned long *timer_p, bool *has_p, 2983 struct netlink_ext_ack *extack) 2984 { 2985 unsigned long timer; 2986 u32 value; 2987 2988 if (!attr) { 2989 *timer_p = fallback; 2990 *has_p = false; 2991 return 0; 2992 } 2993 2994 value = nla_get_u32(attr); 2995 timer = clock_t_to_jiffies(value); 2996 if (timer == ~0UL) { 2997 NL_SET_ERR_MSG(extack, "Timer value too large"); 2998 return -EINVAL; 2999 } 3000 3001 *timer_p = timer; 3002 *has_p = true; 3003 return 0; 3004 } 3005 3006 static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, 3007 struct netlink_ext_ack *extack) 3008 { 3009 struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; 3010 int err; 
3011 3012 if (res) { 3013 err = nla_parse_nested(tb, 3014 ARRAY_SIZE(rtm_nh_res_policy_new) - 1, 3015 res, rtm_nh_res_policy_new, extack); 3016 if (err < 0) 3017 return err; 3018 } 3019 3020 if (tb[NHA_RES_GROUP_BUCKETS]) { 3021 cfg->nh_grp_res_num_buckets = 3022 nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); 3023 cfg->nh_grp_res_has_num_buckets = true; 3024 if (!cfg->nh_grp_res_num_buckets) { 3025 NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); 3026 return -EINVAL; 3027 } 3028 } 3029 3030 err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], 3031 NH_RES_DEFAULT_IDLE_TIMER, 3032 &cfg->nh_grp_res_idle_timer, 3033 &cfg->nh_grp_res_has_idle_timer, 3034 extack); 3035 if (err) 3036 return err; 3037 3038 return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], 3039 NH_RES_DEFAULT_UNBALANCED_TIMER, 3040 &cfg->nh_grp_res_unbalanced_timer, 3041 &cfg->nh_grp_res_has_unbalanced_timer, 3042 extack); 3043 } 3044 3045 static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, 3046 struct nlmsghdr *nlh, struct nlattr **tb, 3047 struct nh_config *cfg, 3048 struct netlink_ext_ack *extack) 3049 { 3050 struct nhmsg *nhm = nlmsg_data(nlh); 3051 int err; 3052 3053 err = -EINVAL; 3054 if (nhm->resvd || nhm->nh_scope) { 3055 NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); 3056 goto out; 3057 } 3058 if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { 3059 NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); 3060 goto out; 3061 } 3062 3063 switch (nhm->nh_family) { 3064 case AF_INET: 3065 case AF_INET6: 3066 break; 3067 case AF_UNSPEC: 3068 if (tb[NHA_GROUP]) 3069 break; 3070 fallthrough; 3071 default: 3072 NL_SET_ERR_MSG(extack, "Invalid address family"); 3073 goto out; 3074 } 3075 3076 memset(cfg, 0, sizeof(*cfg)); 3077 cfg->nlflags = nlh->nlmsg_flags; 3078 cfg->nlinfo.portid = NETLINK_CB(skb).portid; 3079 cfg->nlinfo.nlh = nlh; 3080 cfg->nlinfo.nl_net = net; 3081 3082 cfg->nh_family = nhm->nh_family; 3083 cfg->nh_protocol = nhm->nh_protocol; 
	cfg->nh_flags = nhm->nh_flags;

	if (tb[NHA_ID])
		cfg->nh_id = nla_get_u32(tb[NHA_ID]);

	if (tb[NHA_FDB]) {
		if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
		    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
			NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
			goto out;
		}
		if (nhm->nh_flags) {
			NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
			goto out;
		}
		cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
	}

	if (tb[NHA_GROUP]) {
		if (nhm->nh_family != AF_UNSPEC) {
			NL_SET_ERR_MSG(extack, "Invalid family for group");
			goto out;
		}
		cfg->nh_grp = tb[NHA_GROUP];

		cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
		if (tb[NHA_GROUP_TYPE])
			cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);

		if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
			NL_SET_ERR_MSG(extack, "Invalid group type");
			goto out;
		}

		err = nh_check_attr_group(net, tb, ARRAY_SIZE(rtm_nh_policy_new),
					  cfg->nh_grp_type, extack);
		if (err)
			goto out;

		if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
			err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
						       cfg, extack);

		if (tb[NHA_HW_STATS_ENABLE])
			cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]);

		/* no other attributes should be set */
		goto out;
	}

	if (tb[NHA_BLACKHOLE]) {
		if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
		    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
			NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
			goto out;
		}

		cfg->nh_blackhole = 1;
		err = 0;
		goto out;
	}

	if (!cfg->nh_fdb && !tb[NHA_OIF]) {
		NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
		goto out;
	}

	err = -EINVAL;
	if (tb[NHA_GATEWAY]) {
		struct nlattr *gwa = tb[NHA_GATEWAY];

		/* Gateway length must match the family from the header. */
		switch (cfg->nh_family) {
		case AF_INET:
			if (nla_len(gwa) != sizeof(u32)) {
				NL_SET_ERR_MSG(extack, "Invalid gateway");
				goto out;
			}
			cfg->gw.ipv4 = nla_get_be32(gwa);
			break;
		case AF_INET6:
			if (nla_len(gwa) != sizeof(struct in6_addr)) {
				NL_SET_ERR_MSG(extack, "Invalid gateway");
				goto out;
			}
			cfg->gw.ipv6 = nla_get_in6_addr(gwa);
			break;
		default:
			NL_SET_ERR_MSG(extack,
				       "Unknown address family for gateway");
			goto out;
		}
	} else {
		/* device only nexthop (no gateway) */
		if (cfg->nh_flags & RTNH_F_ONLINK) {
			NL_SET_ERR_MSG(extack,
				       "ONLINK flag can not be set for nexthop without a gateway");
			goto out;
		}
	}

	if (tb[NHA_ENCAP]) {
		cfg->nh_encap = tb[NHA_ENCAP];

		if (!tb[NHA_ENCAP_TYPE]) {
			NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
			goto out;
		}

		cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
		err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
		if (err < 0)
			goto out;

	} else if (tb[NHA_ENCAP_TYPE]) {
		NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
		goto out;
	}

	if (tb[NHA_HW_STATS_ENABLE]) {
		NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops");
		goto out;
	}

	err = 0;
out:
	return err;
}

/* Second-stage config parsing that needs RTNL: resolve NHA_OIF to a device
 * and require it to be up with carrier. Groups are validated separately.
 * Returns 0, -EINVAL or -ENETDOWN.
 */
static int rtm_to_nh_config_rtnl(struct net *net, struct nlattr **tb,
				 struct nh_config *cfg,
				 struct netlink_ext_ack *extack)
{
	if (tb[NHA_GROUP])
		return nh_check_attr_group_rtnl(net, tb, extack);

	if (tb[NHA_OIF]) {
		cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
		if (cfg->nh_ifindex)
			cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);

		if (!cfg->dev) {
			NL_SET_ERR_MSG(extack, "Invalid device index");
			return -EINVAL;
		}

		if (!(cfg->dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			return -ENETDOWN;
		}

		if (!netif_carrier_ok(cfg->dev)) {
			NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
			return -ENETDOWN;
		}
	}

	return 0;
}

/* rtnl */
/* RTM_NEWNEXTHOP handler: parse attributes, validate the config (the
 * RTNL-requiring part under rtnl_net_lock), and create/replace the nexthop.
 */
static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
	struct net *net = sock_net(skb->sk);
	struct nh_config cfg;
	struct nexthop *nh;
	int err;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_new) - 1,
			  rtm_nh_policy_new, extack);
	if (err < 0)
		goto out;

	err = rtm_to_nh_config(net, skb, nlh, tb, &cfg, extack);
	if (err)
		goto out;

	if (cfg.nlflags & NLM_F_REPLACE && !cfg.nh_id) {
		NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
		err = -EINVAL;
		goto out;
	}

	rtnl_net_lock(net);

	err = rtm_to_nh_config_rtnl(net, tb, &cfg, extack);
	if (err)
		goto unlock;

	nh = nexthop_add(net, &cfg, extack);
	if (IS_ERR(nh))
		err = PTR_ERR(nh);

unlock:
	rtnl_net_unlock(net);
out:
	return err;
}

/* Validate a get/del request: empty header fields and a non-zero NHA_ID.
 * Fills *id and, when op_flags is non-NULL, *op_flags (0 if absent).
 */
static int nh_valid_get_del_req(const struct nlmsghdr *nlh,
				struct nlattr **tb, u32 *id, u32 *op_flags,
				struct netlink_ext_ack *extack)
{
	struct nhmsg *nhm = nlmsg_data(nlh);

	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
		NL_SET_ERR_MSG(extack, "Invalid values in header");
		return -EINVAL;
	}

	if (!tb[NHA_ID]) {
		NL_SET_ERR_MSG(extack, "Nexthop id is missing");
		return -EINVAL;
	}

	*id = nla_get_u32(tb[NHA_ID]);
	if (!(*id)) {
		NL_SET_ERR_MSG(extack, "Invalid nexthop id");
		return -EINVAL;
	}

	if (op_flags)
		*op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0);

	return 0;
}

/* rtnl */
/* RTM_DELNEXTHOP handler: look up the nexthop by id and remove it,
 * notifying userspace via 'nlinfo'.
 */
static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)];
	struct net *net = sock_net(skb->sk);
	struct nl_info nlinfo = {
		.nlh = nlh,
		.nl_net = net,
		.portid = NETLINK_CB(skb).portid,
	};
	struct nexthop *nh;
	int err;
	u32 id;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del,
			  extack);
	if (err < 0)
		return err;

	err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack);
	if (err)
		return err;

	rtnl_net_lock(net);

	nh = nexthop_find_by_id(net, id);
	if (nh)
		remove_nexthop(net, nh, &nlinfo);
	else
		err = -ENOENT;

	rtnl_net_unlock(net);

	return err;
}

/* rtnl */
/* RTM_GETNEXTHOP handler: look up by id, fill a reply skb and unicast it
 * back to the requester.
 */
static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
	struct net *net = sock_net(in_skb->sk);
	struct sk_buff *skb = NULL;
	struct nexthop *nh;
	u32 op_flags;
	int err;
	u32 id;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get,
			  extack);
	if (err < 0)
		return err;

	err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack);
	if (err)
		return err;

	err = -ENOBUFS;
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto out;

	err = -ENOENT;
	nh = nexthop_find_by_id(net, id);
	if (!nh)
		goto errout_free;

	err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
			   nlh->nlmsg_seq, 0, op_flags);
	if (err < 0) {
		/* A single nexthop is expected to fit a GOODSIZE skb. */
		WARN_ON(err == -EMSGSIZE);
		goto errout_free;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
out:
return err; 3393 errout_free: 3394 kfree_skb(skb); 3395 goto out; 3396 } 3397 3398 struct nh_dump_filter { 3399 u32 nh_id; 3400 int dev_idx; 3401 int master_idx; 3402 bool group_filter; 3403 bool fdb_filter; 3404 u32 res_bucket_nh_id; 3405 u32 op_flags; 3406 }; 3407 3408 static bool nh_dump_filtered(struct nexthop *nh, 3409 struct nh_dump_filter *filter, u8 family) 3410 { 3411 const struct net_device *dev; 3412 const struct nh_info *nhi; 3413 3414 if (filter->group_filter && !nh->is_group) 3415 return true; 3416 3417 if (!filter->dev_idx && !filter->master_idx && !family) 3418 return false; 3419 3420 if (nh->is_group) 3421 return true; 3422 3423 nhi = rtnl_dereference(nh->nh_info); 3424 if (family && nhi->family != family) 3425 return true; 3426 3427 dev = nhi->fib_nhc.nhc_dev; 3428 if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) 3429 return true; 3430 3431 if (filter->master_idx) { 3432 struct net_device *master; 3433 3434 if (!dev) 3435 return true; 3436 3437 master = netdev_master_upper_dev_get((struct net_device *)dev); 3438 if (!master || master->ifindex != filter->master_idx) 3439 return true; 3440 } 3441 3442 return false; 3443 } 3444 3445 static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb, 3446 struct nh_dump_filter *filter, 3447 struct netlink_ext_ack *extack) 3448 { 3449 struct nhmsg *nhm; 3450 u32 idx; 3451 3452 if (tb[NHA_OIF]) { 3453 idx = nla_get_u32(tb[NHA_OIF]); 3454 if (idx > INT_MAX) { 3455 NL_SET_ERR_MSG(extack, "Invalid device index"); 3456 return -EINVAL; 3457 } 3458 filter->dev_idx = idx; 3459 } 3460 if (tb[NHA_MASTER]) { 3461 idx = nla_get_u32(tb[NHA_MASTER]); 3462 if (idx > INT_MAX) { 3463 NL_SET_ERR_MSG(extack, "Invalid master device index"); 3464 return -EINVAL; 3465 } 3466 filter->master_idx = idx; 3467 } 3468 filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); 3469 filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); 3470 3471 nhm = nlmsg_data(nlh); 3472 if (nhm->nh_protocol || nhm->resvd || 
nhm->nh_scope || nhm->nh_flags) { 3473 NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); 3474 return -EINVAL; 3475 } 3476 3477 return 0; 3478 } 3479 3480 static int nh_valid_dump_req(const struct nlmsghdr *nlh, 3481 struct nh_dump_filter *filter, 3482 struct netlink_callback *cb) 3483 { 3484 struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; 3485 int err; 3486 3487 err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, 3488 ARRAY_SIZE(rtm_nh_policy_dump) - 1, 3489 rtm_nh_policy_dump, cb->extack); 3490 if (err < 0) 3491 return err; 3492 3493 filter->op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); 3494 3495 return __nh_valid_dump_req(nlh, tb, filter, cb->extack); 3496 } 3497 3498 struct rtm_dump_nh_ctx { 3499 u32 idx; 3500 }; 3501 3502 static struct rtm_dump_nh_ctx * 3503 rtm_dump_nh_ctx(struct netlink_callback *cb) 3504 { 3505 struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; 3506 3507 BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); 3508 return ctx; 3509 } 3510 3511 static int rtm_dump_walk_nexthops(struct sk_buff *skb, 3512 struct netlink_callback *cb, 3513 struct rb_root *root, 3514 struct rtm_dump_nh_ctx *ctx, 3515 int (*nh_cb)(struct sk_buff *skb, 3516 struct netlink_callback *cb, 3517 struct nexthop *nh, void *data), 3518 void *data) 3519 { 3520 struct rb_node *node; 3521 int s_idx; 3522 int err; 3523 3524 s_idx = ctx->idx; 3525 3526 /* If this is not the first invocation, ctx->idx will contain the id of 3527 * the last nexthop we processed. Instead of starting from the very 3528 * first element of the red/black tree again and linearly skipping the 3529 * (potentially large) set of nodes with an id smaller than s_idx, walk 3530 * the tree and find the left-most node whose id is >= s_idx. This 3531 * provides an efficient O(log n) starting point for the dump 3532 * continuation. 
3533 */ 3534 if (s_idx != 0) { 3535 struct rb_node *tmp = root->rb_node; 3536 3537 node = NULL; 3538 while (tmp) { 3539 struct nexthop *nh; 3540 3541 nh = rb_entry(tmp, struct nexthop, rb_node); 3542 if (nh->id < s_idx) { 3543 tmp = tmp->rb_right; 3544 } else { 3545 /* Track current candidate and keep looking on 3546 * the left side to find the left-most 3547 * (smallest id) that is still >= s_idx. 3548 */ 3549 node = tmp; 3550 tmp = tmp->rb_left; 3551 } 3552 } 3553 } else { 3554 node = rb_first(root); 3555 } 3556 3557 for (; node; node = rb_next(node)) { 3558 struct nexthop *nh; 3559 3560 nh = rb_entry(node, struct nexthop, rb_node); 3561 3562 ctx->idx = nh->id; 3563 err = nh_cb(skb, cb, nh, data); 3564 if (err) 3565 return err; 3566 } 3567 3568 return 0; 3569 } 3570 3571 static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, 3572 struct nexthop *nh, void *data) 3573 { 3574 struct nhmsg *nhm = nlmsg_data(cb->nlh); 3575 struct nh_dump_filter *filter = data; 3576 3577 if (nh_dump_filtered(nh, filter, nhm->nh_family)) 3578 return 0; 3579 3580 return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, 3581 NETLINK_CB(cb->skb).portid, 3582 cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags); 3583 } 3584 3585 /* rtnl */ 3586 static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) 3587 { 3588 struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); 3589 struct net *net = sock_net(skb->sk); 3590 struct rb_root *root = &net->nexthop.rb_root; 3591 struct nh_dump_filter filter = {}; 3592 int err; 3593 3594 err = nh_valid_dump_req(cb->nlh, &filter, cb); 3595 if (err < 0) 3596 return err; 3597 3598 err = rtm_dump_walk_nexthops(skb, cb, root, ctx, 3599 &rtm_dump_nexthop_cb, &filter); 3600 3601 cb->seq = net->nexthop.seq; 3602 nl_dump_check_consistent(cb, nlmsg_hdr(skb)); 3603 return err; 3604 } 3605 3606 static struct nexthop * 3607 nexthop_find_group_resilient(struct net *net, u32 id, 3608 struct netlink_ext_ack *extack) 3609 { 3610 struct nh_group 
*nhg; 3611 struct nexthop *nh; 3612 3613 nh = nexthop_find_by_id(net, id); 3614 if (!nh) 3615 return ERR_PTR(-ENOENT); 3616 3617 if (!nh->is_group) { 3618 NL_SET_ERR_MSG(extack, "Not a nexthop group"); 3619 return ERR_PTR(-EINVAL); 3620 } 3621 3622 nhg = rtnl_dereference(nh->nh_grp); 3623 if (!nhg->resilient) { 3624 NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); 3625 return ERR_PTR(-EINVAL); 3626 } 3627 3628 return nh; 3629 } 3630 3631 static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, 3632 struct netlink_ext_ack *extack) 3633 { 3634 u32 idx; 3635 3636 if (attr) { 3637 idx = nla_get_u32(attr); 3638 if (!idx) { 3639 NL_SET_ERR_MSG(extack, "Invalid nexthop id"); 3640 return -EINVAL; 3641 } 3642 *nh_id_p = idx; 3643 } else { 3644 *nh_id_p = 0; 3645 } 3646 3647 return 0; 3648 } 3649 3650 static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, 3651 struct nh_dump_filter *filter, 3652 struct netlink_callback *cb) 3653 { 3654 struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; 3655 struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; 3656 int err; 3657 3658 err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, 3659 ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, 3660 rtm_nh_policy_dump_bucket, NULL); 3661 if (err < 0) 3662 return err; 3663 3664 err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); 3665 if (err) 3666 return err; 3667 3668 if (tb[NHA_RES_BUCKET]) { 3669 size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; 3670 3671 err = nla_parse_nested(res_tb, max, 3672 tb[NHA_RES_BUCKET], 3673 rtm_nh_res_bucket_policy_dump, 3674 cb->extack); 3675 if (err < 0) 3676 return err; 3677 3678 err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], 3679 &filter->res_bucket_nh_id, 3680 cb->extack); 3681 if (err) 3682 return err; 3683 } 3684 3685 return __nh_valid_dump_req(nlh, tb, filter, cb->extack); 3686 } 3687 3688 struct rtm_dump_res_bucket_ctx { 3689 struct rtm_dump_nh_ctx nh; 3690 u16 bucket_index; 3691 }; 

static struct rtm_dump_res_bucket_ctx *
rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
{
	struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx;

	/* Context must fit in the scratch area netlink provides. */
	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
	return ctx;
}

/* Bundle of continuation state and parsed filter passed through the
 * generic nexthop walker to the bucket dump callbacks.
 */
struct rtm_dump_nexthop_bucket_data {
	struct rtm_dump_res_bucket_ctx *ctx;
	struct nh_dump_filter filter;
};

/* Dump all buckets of one resilient group @nh, resuming from
 * dd->ctx->bucket_index.  Returns non-zero when the skb fills up, with
 * the bucket index recorded for the next continuation.  RTNL assumed
 * held (rtnl_dereference).
 */
static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
				      struct netlink_callback *cb,
				      struct nexthop *nh,
				      struct rtm_dump_nexthop_bucket_data *dd)
{
	u32 portid = NETLINK_CB(cb->skb).portid;
	struct nhmsg *nhm = nlmsg_data(cb->nlh);
	struct nh_res_table *res_table;
	struct nh_group *nhg;
	u16 bucket_index;
	int err;

	nhg = rtnl_dereference(nh->nh_grp);
	res_table = rtnl_dereference(nhg->res_table);
	for (bucket_index = dd->ctx->bucket_index;
	     bucket_index < res_table->num_nh_buckets;
	     bucket_index++) {
		struct nh_res_bucket *bucket;
		struct nh_grp_entry *nhge;

		bucket = &res_table->nh_buckets[bucket_index];
		nhge = rtnl_dereference(bucket->nh_entry);
		if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family))
			continue;

		/* Optional filter: only buckets pointing at this nexthop. */
		if (dd->filter.res_bucket_nh_id &&
		    dd->filter.res_bucket_nh_id != nhge->nh->id)
			continue;

		/* Record progress before emitting, so a full skb resumes
		 * at (and retries) this bucket.
		 */
		dd->ctx->bucket_index = bucket_index;
		err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
					 RTM_NEWNEXTHOPBUCKET, portid,
					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
					 cb->extack);
		if (err)
			return err;
	}

	/* Group fully dumped; the next group starts at bucket 0. */
	dd->ctx->bucket_index = 0;

	return 0;
}

/* Per-nexthop callback for the full-table bucket dump: only resilient
 * groups have buckets, everything else is skipped.
 */
static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
				      struct netlink_callback *cb,
				      struct nexthop *nh, void *data)
{
	struct rtm_dump_nexthop_bucket_data *dd = data;
	struct nh_group *nhg;

	if (!nh->is_group)
		return 0;

	nhg = rtnl_dereference(nh->nh_grp);
	if (!nhg->resilient)
		return 0;

	return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd);
}

/* rtnl */
static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
	struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
	struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx };
	struct net *net = sock_net(skb->sk);
	struct nexthop *nh;
	int err;

	err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
	if (err)
		return err;

	if (dd.filter.nh_id) {
		/* Single-group dump: id must name a resilient group. */
		nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
						  cb->extack);
		if (IS_ERR(nh))
			return PTR_ERR(nh);
		err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
	} else {
		struct rb_root *root = &net->nexthop.rb_root;

		err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh,
					     &rtm_dump_nexthop_bucket_cb, &dd);
	}

	/* Let userspace detect concurrent modifications of the tree. */
	cb->seq = net->nexthop.seq;
	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
	return err;
}

/* Parse the nested NHA_RES_BUCKET attribute of a bucket get request;
 * the bucket index is mandatory there.
 */
static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
					      u16 *bucket_index,
					      struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
	int err;

	err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
			       res, rtm_nh_res_bucket_policy_get, extack);
	if (err < 0)
		return err;

	if (!tb[NHA_RES_BUCKET_INDEX]) {
		NL_SET_ERR_MSG(extack, "Bucket index is missing");
		return -EINVAL;
	}

	*bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]);
	return 0;
}

/* Parse and validate an RTM_GETNEXTHOPBUCKET doit request: group id plus
 * a mandatory nested bucket attribute carrying the bucket index.
 */
static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
				   u32 *id, u16 *bucket_index,
				   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
			  rtm_nh_policy_get_bucket, extack);
	if (err < 0)
		return err;

	err = nh_valid_get_del_req(nlh, tb, id, NULL, extack);
	if (err)
		return err;

	if (!tb[NHA_RES_BUCKET]) {
		NL_SET_ERR_MSG(extack, "Bucket information is missing");
		return -EINVAL;
	}

	err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
						 bucket_index, extack);
	if (err)
		return err;

	return 0;
}

/* rtnl */
static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
				  struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nh_res_table *res_table;
	struct sk_buff *skb = NULL;
	struct nh_group *nhg;
	struct nexthop *nh;
	u16 bucket_index;
	int err;
	u32 id;

	err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
	if (err)
		return err;

	nh = nexthop_find_group_resilient(net, id, extack);
	if (IS_ERR(nh))
		return PTR_ERR(nh);

	nhg = rtnl_dereference(nh->nh_grp);
	res_table = rtnl_dereference(nhg->res_table);
	if (bucket_index >= res_table->num_nh_buckets) {
		NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
		return -ENOENT;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
				 bucket_index, RTM_NEWNEXTHOPBUCKET,
				 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				 0, extack);
	if (err < 0) {
		/* A single bucket must fit into NLMSG_GOODSIZE. */
		WARN_ON(err == -EMSGSIZE);
		goto errout_free;
	}

	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	kfree_skb(skb);
	return err;
}

/* Propagate an MTU change on @dev to the cached MTU of every IPv4
 * nexthop using that device.
 */
static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
{
	unsigned int hash = nh_dev_hashfn(dev->ifindex);
	struct net *net = dev_net(dev);
	struct hlist_head *head = &net->nexthop.devhash[hash];
	struct hlist_node *n;
	struct nh_info *nhi;

	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
		if (nhi->fib_nhc.nhc_dev == dev) {
			if (nhi->family == AF_INET)
				fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
						   orig_mtu);
		}
	}
}

/* rtnl */
static int nh_netdev_event(struct notifier_block *this,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct netdev_notifier_info_ext *info_ext;

	switch (event) {
	case NETDEV_DOWN:
	case NETDEV_UNREGISTER:
		nexthop_flush_dev(dev, event);
		break;
	case NETDEV_CHANGE:
		/* Flush only if the device is neither running nor has a
		 * lower layer that is up.
		 */
		if (!(netif_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
			nexthop_flush_dev(dev, event);
		break;
	case NETDEV_CHANGEMTU:
		info_ext = ptr;
		nexthop_sync_mtu(dev, info_ext->ext.mtu);
		rt_cache_flush(dev_net(dev));
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block nh_netdev_notifier = {
	.notifier_call = nh_netdev_event,
};

/* Replay all existing nexthops to a newly (un)registering notifier via
 * @event_type; stops on the first error.
 */
static int nexthops_dump(struct net *net, struct notifier_block *nb,
			 enum nexthop_event_type event_type,
			 struct netlink_ext_ack *extack)
{
	struct rb_root *root = &net->nexthop.rb_root;
	struct rb_node *node;
	int err = 0;

	for (node = rb_first(root); node; node = rb_next(node)) {
		struct nexthop *nh;

		nh = rb_entry(node, struct nexthop, rb_node);
		err = call_nexthop_notifier(nb, net, event_type, nh, extack);
		if (err)
			break;
	}

	return err;
}

/* Register a nexthop notifier, replaying the current nexthop set as
 * REPLACE events under RTNL so the listener starts fully synchronized.
 */
int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
			      struct netlink_ext_ack *extack)
{
	int err;

	rtnl_lock();
	err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack);
	if (err)
		goto unlock;
	err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
					       nb);
unlock:
	rtnl_unlock();
	return err;
}

EXPORT_SYMBOL(register_nexthop_notifier); 3980 3981 int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) 3982 { 3983 int err; 3984 3985 err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, 3986 nb); 3987 if (!err) 3988 nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); 3989 return err; 3990 } 3991 EXPORT_SYMBOL(__unregister_nexthop_notifier); 3992 3993 int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) 3994 { 3995 int err; 3996 3997 rtnl_lock(); 3998 err = __unregister_nexthop_notifier(net, nb); 3999 rtnl_unlock(); 4000 return err; 4001 } 4002 EXPORT_SYMBOL(unregister_nexthop_notifier); 4003 4004 void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) 4005 { 4006 struct nexthop *nexthop; 4007 4008 rcu_read_lock(); 4009 4010 nexthop = nexthop_find_by_id(net, id); 4011 if (!nexthop) 4012 goto out; 4013 4014 nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); 4015 if (offload) 4016 nexthop->nh_flags |= RTNH_F_OFFLOAD; 4017 if (trap) 4018 nexthop->nh_flags |= RTNH_F_TRAP; 4019 4020 out: 4021 rcu_read_unlock(); 4022 } 4023 EXPORT_SYMBOL(nexthop_set_hw_flags); 4024 4025 void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, 4026 bool offload, bool trap) 4027 { 4028 struct nh_res_table *res_table; 4029 struct nh_res_bucket *bucket; 4030 struct nexthop *nexthop; 4031 struct nh_group *nhg; 4032 4033 rcu_read_lock(); 4034 4035 nexthop = nexthop_find_by_id(net, id); 4036 if (!nexthop || !nexthop->is_group) 4037 goto out; 4038 4039 nhg = rcu_dereference(nexthop->nh_grp); 4040 if (!nhg->resilient) 4041 goto out; 4042 4043 if (bucket_index >= nhg->res_table->num_nh_buckets) 4044 goto out; 4045 4046 res_table = rcu_dereference(nhg->res_table); 4047 bucket = &res_table->nh_buckets[bucket_index]; 4048 bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); 4049 if (offload) 4050 bucket->nh_flags |= RTNH_F_OFFLOAD; 4051 if (trap) 4052 bucket->nh_flags |= RTNH_F_TRAP; 4053 4054 
out: 4055 rcu_read_unlock(); 4056 } 4057 EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); 4058 4059 void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, 4060 unsigned long *activity) 4061 { 4062 struct nh_res_table *res_table; 4063 struct nexthop *nexthop; 4064 struct nh_group *nhg; 4065 u16 i; 4066 4067 rcu_read_lock(); 4068 4069 nexthop = nexthop_find_by_id(net, id); 4070 if (!nexthop || !nexthop->is_group) 4071 goto out; 4072 4073 nhg = rcu_dereference(nexthop->nh_grp); 4074 if (!nhg->resilient) 4075 goto out; 4076 4077 /* Instead of silently ignoring some buckets, demand that the sizes 4078 * be the same. 4079 */ 4080 res_table = rcu_dereference(nhg->res_table); 4081 if (num_buckets != res_table->num_nh_buckets) 4082 goto out; 4083 4084 for (i = 0; i < num_buckets; i++) { 4085 if (test_bit(i, activity)) 4086 nh_res_bucket_set_busy(&res_table->nh_buckets[i]); 4087 } 4088 4089 out: 4090 rcu_read_unlock(); 4091 } 4092 EXPORT_SYMBOL(nexthop_res_grp_activity_update); 4093 4094 static void __net_exit nexthop_net_exit_rtnl(struct net *net, 4095 struct list_head *dev_to_kill) 4096 { 4097 ASSERT_RTNL_NET(net); 4098 flush_all_nexthops(net); 4099 } 4100 4101 static void __net_exit nexthop_net_exit(struct net *net) 4102 { 4103 kfree(net->nexthop.devhash); 4104 net->nexthop.devhash = NULL; 4105 } 4106 4107 static int __net_init nexthop_net_init(struct net *net) 4108 { 4109 size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; 4110 4111 net->nexthop.rb_root = RB_ROOT; 4112 net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); 4113 if (!net->nexthop.devhash) 4114 return -ENOMEM; 4115 BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); 4116 4117 return 0; 4118 } 4119 4120 static struct pernet_operations nexthop_net_ops = { 4121 .init = nexthop_net_init, 4122 .exit = nexthop_net_exit, 4123 .exit_rtnl = nexthop_net_exit_rtnl, 4124 }; 4125 4126 static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = { 4127 {.msgtype = 
RTM_NEWNEXTHOP, .doit = rtm_new_nexthop, 4128 .flags = RTNL_FLAG_DOIT_PERNET}, 4129 {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop, 4130 .flags = RTNL_FLAG_DOIT_PERNET}, 4131 {.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop, 4132 .dumpit = rtm_dump_nexthop}, 4133 {.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket, 4134 .dumpit = rtm_dump_nexthop_bucket}, 4135 {.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP, 4136 .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, 4137 {.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP, 4138 .dumpit = rtm_dump_nexthop}, 4139 {.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP, 4140 .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, 4141 {.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP, 4142 .dumpit = rtm_dump_nexthop}, 4143 }; 4144 4145 static int __init nexthop_init(void) 4146 { 4147 register_pernet_subsys(&nexthop_net_ops); 4148 4149 register_netdevice_notifier(&nh_netdev_notifier); 4150 4151 rtnl_register_many(nexthop_rtnl_msg_handlers); 4152 4153 return 0; 4154 } 4155 subsys_initcall(nexthop_init); 4156