From 75cee397ae6f1020fbb75db90aa22a51bc3318ac Mon Sep 17 00:00:00 2001 From: Hoang Huu Le Date: Fri, 16 Oct 2020 09:31:18 +0700 Subject: [PATCH 01/62] tipc: re-configure queue limit for broadcast link The queue limit of the broadcast link is being calculated base on initial MTU. However, when MTU value changed (e.g manual changing MTU on NIC device, MTU negotiation etc.,) we do not re-calculate queue limit. This gives throughput does not reflect with the change. So fix it by calling the function to re-calculate queue limit of the broadcast link. Acked-by: Jon Maloy Signed-off-by: Hoang Huu Le Signed-off-by: Jakub Kicinski --- net/tipc/bcast.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 940d176e0e87..c77fd13e2777 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -108,6 +108,7 @@ static void tipc_bcbase_select_primary(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); int all_dests = tipc_link_bc_peers(bb->link); + int max_win = tipc_link_max_win(bb->link); int i, mtu, prim; bb->primary_bearer = INVALID_BEARER_ID; @@ -121,8 +122,11 @@ static void tipc_bcbase_select_primary(struct net *net) continue; mtu = tipc_bearer_mtu(net, i); - if (mtu < tipc_link_mtu(bb->link)) + if (mtu < tipc_link_mtu(bb->link)) { tipc_link_set_mtu(bb->link, mtu); + tipc_link_set_queue_limits(bb->link, max_win, + max_win); + } bb->bcast_support &= tipc_bearer_bcast_support(net, i); if (bb->dests[i] < all_dests) continue; From ec78e31852c9bb7d96b6557468fecb6f6f3b28f3 Mon Sep 17 00:00:00 2001 From: Hoang Huu Le Date: Fri, 16 Oct 2020 09:31:19 +0700 Subject: [PATCH 02/62] tipc: fix incorrect setting window for bcast link In commit 16ad3f4022bb ("tipc: introduce variable window congestion control"), we applied the algorithm to select window size from minimum window to the configured maximum window for unicast link, and, besides we chose to keep the window size for broadcast link unchanged and equal (i.e fix window 50) However, when setting maximum window variable via command, the window variable was re-initialized to unexpect value (i.e 32). We fix this by updating the fix window for broadcast as we stated. Fixes: 16ad3f4022bb ("tipc: introduce variable window congestion control") Acked-by: Jon Maloy Signed-off-by: Hoang Huu Le Signed-off-by: Jakub Kicinski --- net/tipc/bcast.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index c77fd13e2777..d4beca895992 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -109,6 +109,7 @@ static void tipc_bcbase_select_primary(struct net *net) struct tipc_bc_base *bb = tipc_bc_base(net); int all_dests = tipc_link_bc_peers(bb->link); int max_win = tipc_link_max_win(bb->link); + int min_win = tipc_link_min_win(bb->link); int i, mtu, prim; bb->primary_bearer = INVALID_BEARER_ID; @@ -124,7 +125,8 @@ static void tipc_bcbase_select_primary(struct net *net) mtu = tipc_bearer_mtu(net, i); if (mtu < tipc_link_mtu(bb->link)) { tipc_link_set_mtu(bb->link, mtu); - tipc_link_set_queue_limits(bb->link, max_win, + tipc_link_set_queue_limits(bb->link, + min_win, max_win); } bb->bcast_support &= tipc_bearer_bcast_support(net, i); @@ -589,7 +591,7 @@ static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) if (max_win > TIPC_MAX_LINK_WIN) return -EINVAL; tipc_bcast_lock(net); - tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win); + tipc_link_set_queue_limits(l, tipc_link_min_win(l), max_win); tipc_bcast_unlock(net); return 0; } From 137d23cea1c044b2d4853ac71bc68126b25fdbb2 Mon Sep 17 00:00:00 2001 From: Dylan Hung Date: Wed, 14 Oct 2020 14:06:32 +0800 Subject: [PATCH 03/62] net: ftgmac100: Fix Aspeed ast2600 TX hang issue The new HW arbitration feature on Aspeed ast2600 will cause MAC TX to hang when handling scatter-gather DMA. Disable the problematic feature by setting MAC register 0x58 bit28 and bit27. Fixes: 39bfab8844a0 ("net: ftgmac100: Add support for DT phy-handle property") Signed-off-by: Dylan Hung Reviewed-by: Joel Stanley Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/faraday/ftgmac100.c | 5 +++++ drivers/net/ethernet/faraday/ftgmac100.h | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 87236206366f..00024dd41147 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1817,6 +1817,11 @@ static int ftgmac100_probe(struct platform_device *pdev) priv->rxdes0_edorr_mask = BIT(30); priv->txdes0_edotr_mask = BIT(30); priv->is_aspeed = true; + /* Disable ast2600 problematic HW arbitration */ + if (of_device_is_compatible(np, "aspeed,ast2600-mac")) { + iowrite32(FTGMAC100_TM_DEFAULT, + priv->base + FTGMAC100_OFFSET_TM); + } } else { priv->rxdes0_edorr_mask = BIT(15); priv->txdes0_edotr_mask = BIT(15); diff --git a/drivers/net/ethernet/faraday/ftgmac100.h b/drivers/net/ethernet/faraday/ftgmac100.h index e5876a3fda91..63b3e02fab16 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.h +++ b/drivers/net/ethernet/faraday/ftgmac100.h @@ -169,6 +169,14 @@ #define FTGMAC100_MACCR_FAST_MODE (1 << 19) #define FTGMAC100_MACCR_SW_RST (1 << 31) +/* + * test mode control register + */ +#define FTGMAC100_TM_RQ_TX_VALID_DIS (1 << 28) +#define FTGMAC100_TM_RQ_RR_IDLE_PREV (1 << 27) +#define FTGMAC100_TM_DEFAULT \ + (FTGMAC100_TM_RQ_TX_VALID_DIS | FTGMAC100_TM_RQ_RR_IDLE_PREV) + /* * PHY control register */ From b38e7819cae946e2edf869e604af1e65a5d241c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Oct 2020 11:42:00 -0700 Subject: [PATCH 04/62] icmp: randomize the global rate limiter Keyu Man reported that the ICMP rate limiter could be used by attackers to get useful signal. Details will be provided in an upcoming academic publication. Our solution is to add some noise, so that the attackers no longer can get help from the predictable token bucket limiter. Fixes: 4cdf507d5452 ("icmp: add a global rate limitation") Signed-off-by: Eric Dumazet Reported-by: Keyu Man Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 4 +++- net/ipv4/icmp.c | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 837d51f9e1fa..25e6673a085a 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1142,13 +1142,15 @@ icmp_ratelimit - INTEGER icmp_msgs_per_sec - INTEGER Limit maximal number of ICMP packets sent per second from this host. Only messages whose type matches icmp_ratemask (see below) are - controlled by this limit. + controlled by this limit. For security reasons, the precise count + of messages per second is randomized. Default: 1000 icmp_msgs_burst - INTEGER icmp_msgs_per_sec controls number of ICMP packets sent per second, while icmp_msgs_burst controls the burst size of these packets. + For security reasons, the precise burst size is randomized. Default: 50 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 07f67ced962a..005faea415a4 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -239,7 +239,7 @@ static struct { /** * icmp_global_allow - Are we allowed to send one more ICMP message ? * - * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. + * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. * Note: called with BH disabled */ @@ -267,7 +267,10 @@ bool icmp_global_allow(void) } credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); if (credit) { - credit--; + /* We want to use a credit of one in average, but need to randomize + * it for security reasons. + */ + credit = max_t(int, credit - prandom_u32_max(3), 0); rc = true; } WRITE_ONCE(icmp_global.credit, credit); From f355a55f8202811df304de42f59868f2c6810db1 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Thu, 15 Oct 2020 23:00:23 +0300 Subject: [PATCH 05/62] net: pcs-xpcs: depend on MDIO_BUS instead of selecting it The below compile time error can be seen when PHYLIB is configured as a module. ld: drivers/net/pcs/pcs-xpcs.o: in function `xpcs_read': pcs-xpcs.c:(.text+0x29): undefined reference to `mdiobus_read' ld: drivers/net/pcs/pcs-xpcs.o: in function `xpcs_soft_reset.constprop.7': pcs-xpcs.c:(.text+0x80): undefined reference to `mdiobus_write' ld: drivers/net/pcs/pcs-xpcs.o: in function `xpcs_config_aneg': pcs-xpcs.c:(.text+0x318): undefined reference to `mdiobus_write' ld: pcs-xpcs.c:(.text+0x38e): undefined reference to `mdiobus_write' ld: pcs-xpcs.c:(.text+0x3eb): undefined reference to `mdiobus_write' ld: pcs-xpcs.c:(.text+0x437): undefined reference to `mdiobus_write' ld: drivers/net/pcs/pcs-xpcs.o:pcs-xpcs.c:(.text+0xb1e): more undefined references to `mdiobus_write' follow PHYLIB being a module leads to MDIO_BUS being a module as well while the XPCS is still built-in. What should happen in this configuration is that PCS_XPCS should be forced to build as module. However, that select only acts in the opposite way so we should turn it into a depends. Fix this up by explicitly depending on MDIO_BUS. Reported-by: Randy Dunlap Acked-by: Randy Dunlap # build-tested Fixes: 2fa4e4b799e1 ("net: pcs: Move XPCS into new PCS subdirectory") Signed-off-by: Ioana Ciornei Signed-off-by: Jakub Kicinski --- drivers/net/pcs/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/pcs/Kconfig b/drivers/net/pcs/Kconfig index 074fb3f5db18..22ba7b0b476d 100644 --- a/drivers/net/pcs/Kconfig +++ b/drivers/net/pcs/Kconfig @@ -7,8 +7,7 @@ menu "PCS device drivers" config PCS_XPCS tristate "Synopsys DesignWare XPCS controller" - select MDIO_BUS - depends on MDIO_DEVICE + depends on MDIO_DEVICE && MDIO_BUS help This module provides helper functions for Synopsys DesignWare XPCS controllers. From f981fc3d515a588c389242b7e3a71487b40571a5 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Sat, 17 Oct 2020 20:24:51 +0200 Subject: [PATCH 06/62] net: openvswitch: fix to make sure flow_lookup() is not preempted The flow_lookup() function uses per CPU variables, which must be called with BH disabled. However, this is fine in the general NAPI use case where the local BH is disabled. But, it's also called from the netlink context. The below patch makes sure that even in the netlink path, the BH is disabled. In addition, u64_stats_update_begin() requires a lock to ensure one writer which is not ensured here. Making it per-CPU and disabling NAPI (softirq) ensures that there is always only one writer. Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage") Reported-by: Juri Lelli Signed-off-by: Eelco Chaudron Link: https://lore.kernel.org/r/160295903253.7789.826736662555102345.stgit@ebuild Signed-off-by: Jakub Kicinski --- net/openvswitch/flow_table.c | 58 ++++++++++++++++++++++-------------- net/openvswitch/flow_table.h | 8 +++-- 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 87c286ad660e..f3486a37361a 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -175,7 +175,7 @@ static struct table_instance *table_instance_alloc(int new_size) static void __mask_array_destroy(struct mask_array *ma) { - free_percpu(ma->masks_usage_cntr); + free_percpu(ma->masks_usage_stats); kfree(ma); } @@ -199,15 +199,15 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma) ma->masks_usage_zero_cntr[i] = 0; for_each_possible_cpu(cpu) { - u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, - cpu); + struct mask_array_stats *stats; unsigned int start; u64 counter; + stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { - start = u64_stats_fetch_begin_irq(&ma->syncp); - counter = usage_counters[i]; - } while (u64_stats_fetch_retry_irq(&ma->syncp, start)); + start = u64_stats_fetch_begin_irq(&stats->syncp); + counter = stats->usage_cntrs[i]; + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ma->masks_usage_zero_cntr[i] += counter; } @@ -230,9 +230,10 @@ static struct mask_array *tbl_mask_array_alloc(int size) sizeof(struct sw_flow_mask *) * size); - new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size, - __alignof__(u64)); - if (!new->masks_usage_cntr) { + new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) + + sizeof(u64) * size, + __alignof__(u64)); + if (!new->masks_usage_stats) { kfree(new); return NULL; } @@ -722,6 +723,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, /* Flow lookup does full lookup on flow table. It starts with * mask from index passed in *index. + * This function MUST be called with BH disabled due to the use + * of CPU specific variables. */ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, @@ -731,7 +734,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, u32 *n_cache_hit, u32 *index) { - u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr); + struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats); struct sw_flow *flow; struct sw_flow_mask *mask; int i; @@ -741,9 +744,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, if (mask) { flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { - u64_stats_update_begin(&ma->syncp); - usage_counters[*index]++; - u64_stats_update_end(&ma->syncp); + u64_stats_update_begin(&stats->syncp); + stats->usage_cntrs[*index]++; + u64_stats_update_end(&stats->syncp); (*n_cache_hit)++; return flow; } @@ -762,9 +765,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { /* Found */ *index = i; - u64_stats_update_begin(&ma->syncp); - usage_counters[*index]++; - u64_stats_update_end(&ma->syncp); + u64_stats_update_begin(&stats->syncp); + stats->usage_cntrs[*index]++; + u64_stats_update_end(&stats->syncp); return flow; } } @@ -850,9 +853,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; u32 __always_unused n_cache_hit; + struct sw_flow *flow; u32 index = 0; - return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); + /* This function gets called trough the netlink interface and therefore + * is preemptible. However, flow_lookup() function needs to be called + * with BH disabled due to CPU specific variables. + */ + local_bh_disable(); + flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); + local_bh_enable(); + return flow; } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, @@ -1109,7 +1120,6 @@ void ovs_flow_masks_rebalance(struct flow_table *table) for (i = 0; i < ma->max; i++) { struct sw_flow_mask *mask; - unsigned int start; int cpu; mask = rcu_dereference_ovsl(ma->masks[i]); @@ -1120,14 +1130,16 @@ void ovs_flow_masks_rebalance(struct flow_table *table) masks_and_count[i].counter = 0; for_each_possible_cpu(cpu) { - u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, - cpu); + struct mask_array_stats *stats; + unsigned int start; u64 counter; + stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { - start = u64_stats_fetch_begin_irq(&ma->syncp); - counter = usage_counters[i]; - } while (u64_stats_fetch_retry_irq(&ma->syncp, start)); + start = u64_stats_fetch_begin_irq(&stats->syncp); + counter = stats->usage_cntrs[i]; + } while (u64_stats_fetch_retry_irq(&stats->syncp, + start)); masks_and_count[i].counter += counter; } diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index d8fb7a3a3dfd..9e659db78c05 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -38,12 +38,16 @@ struct mask_count { u64 counter; }; +struct mask_array_stats { + struct u64_stats_sync syncp; + u64 usage_cntrs[]; +}; + struct mask_array { struct rcu_head rcu; int count, max; - u64 __percpu *masks_usage_cntr; + struct mask_array_stats __percpu *masks_usage_stats; u64 *masks_usage_zero_cntr; - struct u64_stats_sync syncp; struct sw_flow_mask __rcu *masks[]; }; From 71a0e29e99405d89b695882d52eec60844173697 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 15 Oct 2020 11:45:25 +0300 Subject: [PATCH 07/62] selftests: forwarding: Add missing 'rp_filter' configuration When 'rp_filter' is configured in strict mode (1) the tests fail because packets received from the macvlan netdevs would not be forwarded through them on the reverse path. Fix this by disabling the 'rp_filter', meaning no source validation is performed. Fixes: 1538812e0880 ("selftests: forwarding: Add a test for VXLAN asymmetric routing") Fixes: 438a4f5665b2 ("selftests: forwarding: Add a test for VXLAN symmetric routing") Signed-off-by: Ido Schimmel Reported-by: Hangbin Liu Tested-by: Hangbin Liu Link: https://lore.kernel.org/r/20201015084525.135121-1-idosch@idosch.org Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/vxlan_asymmetric.sh | 10 ++++++++++ .../selftests/net/forwarding/vxlan_symmetric.sh | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/vxlan_asymmetric.sh b/tools/testing/selftests/net/forwarding/vxlan_asymmetric.sh index a0b5f57d6bd3..0727e2012b68 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_asymmetric.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_asymmetric.sh @@ -215,10 +215,16 @@ switch_create() bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10 bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20 + + sysctl_set net.ipv4.conf.all.rp_filter 0 + sysctl_set net.ipv4.conf.vlan10-v.rp_filter 0 + sysctl_set net.ipv4.conf.vlan20-v.rp_filter 0 } switch_destroy() { + sysctl_restore net.ipv4.conf.all.rp_filter + bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 20 bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 10 @@ -359,6 +365,10 @@ ns_switch_create() bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10 bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20 + + sysctl_set net.ipv4.conf.all.rp_filter 0 + sysctl_set net.ipv4.conf.vlan10-v.rp_filter 0 + sysctl_set net.ipv4.conf.vlan20-v.rp_filter 0 } export -f ns_switch_create diff --git a/tools/testing/selftests/net/forwarding/vxlan_symmetric.sh b/tools/testing/selftests/net/forwarding/vxlan_symmetric.sh index 1209031bc794..5d97fa347d75 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_symmetric.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_symmetric.sh @@ -237,10 +237,16 @@ switch_create() bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10 bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20 + + sysctl_set net.ipv4.conf.all.rp_filter 0 + sysctl_set net.ipv4.conf.vlan10-v.rp_filter 0 + sysctl_set net.ipv4.conf.vlan20-v.rp_filter 0 } switch_destroy() { + sysctl_restore net.ipv4.conf.all.rp_filter + bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 20 bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 10 @@ -402,6 +408,10 @@ ns_switch_create() bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10 bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20 + + sysctl_set net.ipv4.conf.all.rp_filter 0 + sysctl_set net.ipv4.conf.vlan10-v.rp_filter 0 + sysctl_set net.ipv4.conf.vlan20-v.rp_filter 0 } export -f ns_switch_create From 60f1626f0f866809de801724c3d180a9f322841e Mon Sep 17 00:00:00 2001 From: Anant Thazhemadam Date: Sun, 11 Oct 2020 23:00:30 +0530 Subject: [PATCH 08/62] net: usb: rtl8150: don't incorrectly assign random MAC addresses In set_ethernet_addr(), if get_registers() succeeds, the ethernet address that was read must be copied over. Otherwise, a random ethernet address must be assigned. get_registers() returns 0 if successful, and negative error number otherwise. However, in set_ethernet_addr(), this return value is incorrectly checked. Since this return value will never be equal to sizeof(node_id), a random MAC address will always be generated and assigned to the device; even in cases when get_registers() is successful. Correctly modifying the condition that checks if get_registers() was successful or not fixes this problem, and copies the ethernet address appropriately. Fixes: b2a0f274e3f7 ("net: rtl8150: Use the new usb control message API.") Signed-off-by: Anant Thazhemadam Link: https://lore.kernel.org/r/20201011173030.141582-1-anant.thazhemadam@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/rtl8150.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index f020401adf04..bf8a60533f3e 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -261,7 +261,7 @@ static void set_ethernet_addr(rtl8150_t *dev) ret = get_registers(dev, IDR, sizeof(node_id), node_id); - if (ret == sizeof(node_id)) { + if (!ret) { ether_addr_copy(dev->netdev->dev_addr, node_id); } else { eth_hw_addr_random(dev->netdev); From bd7f14df9492e7d3772812a215fca66e6737e598 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 16 Oct 2020 16:20:06 -0700 Subject: [PATCH 09/62] ixgbe: fix probing of multi-port devices with one MDIO Ian reports that after upgrade from v5.8.14 to v5.9 only one of his 4 ixgbe netdevs appear in the system. Quoting the comment on ixgbe_x550em_a_has_mii(): * Returns true if hw points to lowest numbered PCI B:D.F x550_em_a device in * the SoC. There are up to 4 MACs sharing a single MDIO bus on the x550em_a, * but we only want to register one MDIO bus. This matches the symptoms, since the return value from ixgbe_mii_bus_init() is no longer ignored we need to handle the higher ports of x550em without an error. Fixes: 09ef193fef7e ("net: ethernet: ixgbe: check the return value of ixgbe_mii_bus_init()") Reported-by: Ian Kumlien Tested-by: Ian Kumlien Acked-by: Jesse Brandeburg Link: https://lore.kernel.org/r/20201016232006.3352947-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c | 23 ++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c index f77fa3e4fdd1..fc389eecdd2b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c @@ -901,15 +901,13 @@ static bool ixgbe_x550em_a_has_mii(struct ixgbe_hw *hw) **/ s32 ixgbe_mii_bus_init(struct ixgbe_hw *hw) { + s32 (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); + s32 (*read)(struct mii_bus *bus, int addr, int regnum); struct ixgbe_adapter *adapter = hw->back; struct pci_dev *pdev = adapter->pdev; struct device *dev = &adapter->netdev->dev; struct mii_bus *bus; - bus = devm_mdiobus_alloc(dev); - if (!bus) - return -ENOMEM; - switch (hw->device_id) { /* C3000 SoCs */ case IXGBE_DEV_ID_X550EM_A_KR: @@ -922,16 +920,23 @@ s32 ixgbe_mii_bus_init(struct ixgbe_hw *hw) case IXGBE_DEV_ID_X550EM_A_1G_T: case IXGBE_DEV_ID_X550EM_A_1G_T_L: if (!ixgbe_x550em_a_has_mii(hw)) - return -ENODEV; - bus->read = &ixgbe_x550em_a_mii_bus_read; - bus->write = &ixgbe_x550em_a_mii_bus_write; + return 0; + read = &ixgbe_x550em_a_mii_bus_read; + write = &ixgbe_x550em_a_mii_bus_write; break; default: - bus->read = &ixgbe_mii_bus_read; - bus->write = &ixgbe_mii_bus_write; + read = &ixgbe_mii_bus_read; + write = &ixgbe_mii_bus_write; break; } + bus = devm_mdiobus_alloc(dev); + if (!bus) + return -ENOMEM; + + bus->read = read; + bus->write = write; + /* Use the position of the device in the PCI hierarchy as the id */ snprintf(bus->id, MII_BUS_ID_SIZE, "%s-mdio-%s", ixgbe_driver_name, pci_name(pdev)); From 0e8b8d6a2d85344d80dda5beadd98f5f86e8d3d3 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 15 Oct 2020 16:26:06 +0000 Subject: [PATCH 10/62] net: core: use list_del_init() instead of list_del() in netdev_run_todo() dev->unlink_list is reused unless dev is deleted. So, list_del() should not be used. Due to using list_del(), dev->unlink_list can't be reused so that dev->nested_level update logic doesn't work. In order to fix this bug, list_del_init() should be used instead of list_del(). Test commands: ip link add bond0 type bond ip link add bond1 type bond ip link set bond0 master bond1 ip link set bond0 nomaster ip link set bond1 master bond0 ip link set bond1 nomaster Splat looks like: [ 255.750458][ T1030] ============================================ [ 255.751967][ T1030] WARNING: possible recursive locking detected [ 255.753435][ T1030] 5.9.0-rc8+ #772 Not tainted [ 255.754553][ T1030] -------------------------------------------- [ 255.756047][ T1030] ip/1030 is trying to acquire lock: [ 255.757304][ T1030] ffff88811782a280 (&dev_addr_list_lock_key/1){+...}-{2:2}, at: dev_mc_sync_multiple+0xc2/0x150 [ 255.760056][ T1030] [ 255.760056][ T1030] but task is already holding lock: [ 255.761862][ T1030] ffff88811130a280 (&dev_addr_list_lock_key/1){+...}-{2:2}, at: bond_enslave+0x3d4d/0x43e0 [bonding] [ 255.764581][ T1030] [ 255.764581][ T1030] other info that might help us debug this: [ 255.766645][ T1030] Possible unsafe locking scenario: [ 255.766645][ T1030] [ 255.768566][ T1030] CPU0 [ 255.769415][ T1030] ---- [ 255.770259][ T1030] lock(&dev_addr_list_lock_key/1); [ 255.771629][ T1030] lock(&dev_addr_list_lock_key/1); [ 255.772994][ T1030] [ 255.772994][ T1030] *** DEADLOCK *** [ 255.772994][ T1030] [ 255.775091][ T1030] May be due to missing lock nesting notation [ 255.775091][ T1030] [ 255.777182][ T1030] 2 locks held by ip/1030: [ 255.778299][ T1030] #0: ffffffffb1f63250 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x2e4/0x8b0 [ 255.780600][ T1030] #1: ffff88811130a280 (&dev_addr_list_lock_key/1){+...}-{2:2}, at: bond_enslave+0x3d4d/0x43e0 [bonding] [ 255.783411][ T1030] [ 255.783411][ T1030] stack backtrace: [ 255.784874][ T1030] CPU: 7 PID: 1030 Comm: ip Not tainted 5.9.0-rc8+ #772 [ 255.786595][ T1030] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 255.789030][ T1030] Call Trace: [ 255.789850][ T1030] dump_stack+0x99/0xd0 [ 255.790882][ T1030] __lock_acquire.cold.71+0x166/0x3cc [ 255.792285][ T1030] ? register_lock_class+0x1a30/0x1a30 [ 255.793619][ T1030] ? rcu_read_lock_sched_held+0x91/0xc0 [ 255.794963][ T1030] ? rcu_read_lock_bh_held+0xa0/0xa0 [ 255.796246][ T1030] lock_acquire+0x1b8/0x850 [ 255.797332][ T1030] ? dev_mc_sync_multiple+0xc2/0x150 [ 255.798624][ T1030] ? bond_enslave+0x3d4d/0x43e0 [bonding] [ 255.800039][ T1030] ? check_flags+0x50/0x50 [ 255.801143][ T1030] ? lock_contended+0xd80/0xd80 [ 255.802341][ T1030] _raw_spin_lock_nested+0x2e/0x70 [ 255.803592][ T1030] ? dev_mc_sync_multiple+0xc2/0x150 [ 255.804897][ T1030] dev_mc_sync_multiple+0xc2/0x150 [ 255.806168][ T1030] bond_enslave+0x3d58/0x43e0 [bonding] [ 255.807542][ T1030] ? __lock_acquire+0xe53/0x51b0 [ 255.808824][ T1030] ? bond_update_slave_arr+0xdc0/0xdc0 [bonding] [ 255.810451][ T1030] ? check_chain_key+0x236/0x5e0 [ 255.811742][ T1030] ? mutex_is_locked+0x13/0x50 [ 255.812910][ T1030] ? rtnl_is_locked+0x11/0x20 [ 255.814061][ T1030] ? netdev_master_upper_dev_get+0xf/0x120 [ 255.815553][ T1030] do_setlink+0x94c/0x3040 [ ... ] Reported-by: syzbot+4a0f7bc34e3997a6c7df@syzkaller.appspotmail.com Fixes: 1fc70edb7d7b ("net: core: add nested_level variable in net_device") Signed-off-by: Taehee Yoo Link: https://lore.kernel.org/r/20201015162606.9377-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 751e5264fd49..9499a414d67e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10213,7 +10213,7 @@ void netdev_run_todo(void) struct net_device *dev = list_first_entry(&unlink_list, struct net_device, unlink_list); - list_del(&dev->unlink_list); + list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } #endif From 76702a2e7280594a0add4c1283623c81a868373f Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 19 Oct 2020 10:38:46 -0700 Subject: [PATCH 11/62] bpf: Remove unneeded break A break is not needed if it is preceded by a return. Signed-off-by: Tom Rix Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201019173846.1021-1-trix@redhat.com --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1110ecd7d1f3..8f50c9c19f1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2913,7 +2913,6 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: return BPF_PROG_TYPE_CGROUP_SKB; - break; case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: From 93c230e3f5bd6e1d2b2759d582fdfe9c2731473b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 19 Oct 2020 12:42:12 -0700 Subject: [PATCH 12/62] bpf: Enforce id generation for all may-be-null register type The commit af7ec1383361 ("bpf: Add bpf_skc_to_tcp6_sock() helper") introduces RET_PTR_TO_BTF_ID_OR_NULL and the commit eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") introduces RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL. Note that for RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, the reg0->type could become PTR_TO_MEM_OR_NULL which is not covered by BPF_PROBE_MEM. The BPF_REG_0 will then hold a _OR_NULL pointer type. This _OR_NULL pointer type requires the bpf program to explicitly do a NULL check first. After NULL check, the verifier will mark all registers having the same reg->id as safe to use. However, the reg->id is not set for those new _OR_NULL return types. One of the ways that may be wrong is, checking NULL for one btf_id typed pointer will end up validating all other btf_id typed pointers because all of them have id == 0. The later tests will exercise this path. To fix it and also avoid similar issue in the future, this patch moves the id generation logic out of each individual RET type test in check_helper_call(). Instead, it does one reg_type_may_be_null() test and then do the id generation if needed. This patch also adds a WARN_ON_ONCE in mark_ptr_or_null_reg() to catch future breakage. The _OR_NULL pointer usage in the bpf_iter_reg.ctx_arg_info is fine because it just happens that the existing id generation after check_ctx_access() has covered it. It is also using the reg_type_may_be_null() to decide if id generation is needed or not. Fixes: af7ec1383361 ("bpf: Add bpf_skc_to_tcp6_sock() helper") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201019194212.1050855-1-kafai@fb.com --- kernel/bpf/verifier.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 39d7f44e7c92..6200519582a6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5133,24 +5133,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].id = ++env->id_gen; } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { @@ -5199,6 +5194,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (reg_type_may_be_null(regs[BPF_REG_0].type)) + regs[BPF_REG_0].id = ++env->id_gen; + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; @@ -7212,7 +7210,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id) { + if (reg_type_may_be_null(reg->type) && reg->id == id && + !WARN_ON_ONCE(!reg->id)) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL. From e710bcc6d92c47bb7d8e803b41ef529c09ad6a9e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 19 Oct 2020 12:42:19 -0700 Subject: [PATCH 13/62] bpf: selftest: Ensure the return value of bpf_skc_to helpers must be checked This patch tests: int bpf_cls(struct __sk_buff *skb) { /* REG_6: sk * REG_7: tp * REG_8: req_sk */ sk = skb->sk; if (!sk) return 0; tp = bpf_skc_to_tcp_sock(sk); req_sk = bpf_skc_to_tcp_request_sock(sk); if (!req_sk) return 0; /* !tp has not been tested, so verifier should reject. */ return *(__u8 *)tp; } Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201019194219.1051314-1-kafai@fb.com --- tools/testing/selftests/bpf/verifier/sock.c | 25 +++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index b1aac2641498..ce13ece08d51 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -631,3 +631,28 @@ .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, .result = ACCEPT, }, +{ + "mark null check on return value of bpf_skc_to helpers", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_EMIT_CALL(BPF_FUNC_skc_to_tcp_sock), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_EMIT_CALL(BPF_FUNC_skc_to_tcp_request_sock), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_8, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "invalid mem access", + .result_unpriv = REJECT, + .errstr_unpriv = "unknown func", +}, From 8568c3cefd5143fa0dc09f90e1bc9dc8905292f4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 19 Oct 2020 12:42:25 -0700 Subject: [PATCH 14/62] bpf: selftest: Ensure the return value of the bpf_per_cpu_ptr() must be checked This patch tests all pointers returned by bpf_per_cpu_ptr() must be tested for NULL first before it can be accessed. This patch adds a subtest "null_check", so it moves the ".data..percpu" existence check to the very beginning and before doing any subtest. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201019194225.1051596-1-kafai@fb.com --- .../selftests/bpf/prog_tests/ksyms_btf.c | 57 +++++++++++++------ .../bpf/progs/test_ksyms_btf_null_check.c | 31 ++++++++++ 2 files changed, 70 insertions(+), 18 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_ksyms_btf_null_check.c diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c index 28e26bd3e0ca..b58b775d19f3 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -5,18 +5,17 @@ #include #include #include "test_ksyms_btf.skel.h" +#include "test_ksyms_btf_null_check.skel.h" static int duration; -void test_ksyms_btf(void) +static void test_basic(void) { __u64 runqueues_addr, bpf_prog_active_addr; __u32 this_rq_cpu; int this_bpf_prog_active; struct test_ksyms_btf *skel = NULL; struct test_ksyms_btf__data *data; - struct btf *btf; - int percpu_datasec; int err; err = kallsyms_find("runqueues", &runqueues_addr); @@ -31,20 +30,6 @@ void test_ksyms_btf(void) if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_prog_active' not found\n")) return; - btf = libbpf_find_kernel_btf(); - if (CHECK(IS_ERR(btf), "btf_exists", "failed to load kernel BTF: %ld\n", - PTR_ERR(btf))) - return; - - percpu_datasec = btf__find_by_name_kind(btf, ".data..percpu", - BTF_KIND_DATASEC); - if (percpu_datasec < 0) { - printf("%s:SKIP:no PERCPU DATASEC in kernel btf\n", - __func__); - test__skip(); - goto cleanup; - } - skel = test_ksyms_btf__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n")) goto cleanup; @@ -83,6 +68,42 @@ void test_ksyms_btf(void) data->out__bpf_prog_active); cleanup: - btf__free(btf); test_ksyms_btf__destroy(skel); } + +static void test_null_check(void) +{ + struct test_ksyms_btf_null_check *skel; + + skel = test_ksyms_btf_null_check__open_and_load(); + CHECK(skel, "skel_open", "unexpected load of a prog missing null check\n"); + + test_ksyms_btf_null_check__destroy(skel); +} + +void test_ksyms_btf(void) +{ + int percpu_datasec; + struct btf *btf; + + btf = libbpf_find_kernel_btf(); + if (CHECK(IS_ERR(btf), "btf_exists", "failed to load kernel BTF: %ld\n", + PTR_ERR(btf))) + return; + + percpu_datasec = btf__find_by_name_kind(btf, ".data..percpu", + BTF_KIND_DATASEC); + btf__free(btf); + if (percpu_datasec < 0) { + printf("%s:SKIP:no PERCPU DATASEC in kernel btf\n", + __func__); + test__skip(); + return; + } + + if (test__start_subtest("basic")) + test_basic(); + + if (test__start_subtest("null_check")) + test_null_check(); +} diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_null_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_null_check.c new file mode 100644 index 000000000000..8bc8f7c637bc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_null_check.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include "vmlinux.h" + +#include + +extern const struct rq runqueues __ksym; /* struct type global var. */ +extern const int bpf_prog_active __ksym; /* int type global var. */ + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + struct rq *rq; + int *active; + __u32 cpu; + + cpu = bpf_get_smp_processor_id(); + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu); + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) { + /* READ_ONCE */ + *(volatile int *)active; + /* !rq has not been tested, so verifier should reject. */ + *(volatile int *)(&rq->cpu); + } + + return 0; +} + +char _license[] SEC("license") = "GPL"; From 424a646e072a887aa87283b53aa6f8b19c2a7bef Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 18 Oct 2020 18:38:59 +0200 Subject: [PATCH 15/62] r8169: fix operation under forced interrupt threading For several network drivers it was reported that using __napi_schedule_irqoff() is unsafe with forced threading. One way to fix this is switching back to __napi_schedule, but then we lose the benefit of the irqoff version in general. As stated by Eric it doesn't make sense to make the minimal hard irq handlers in drivers using NAPI a thread. Therefore ensure that the hard irq handler is never thread-ified. Fixes: 9a899a35b0d6 ("r8169: switch to napi_schedule_irqoff") Link: https://lkml.org/lkml/2020/10/18/19 Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/4d3ef84a-c812-5072-918a-22a6f6468310@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 7d366b0362cb..3b6ddc706e92 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4694,7 +4694,7 @@ static int rtl8169_close(struct net_device *dev) phy_disconnect(tp->phydev); - pci_free_irq(pdev, 0, tp); + free_irq(pci_irq_vector(pdev, 0), tp); dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray, tp->RxPhyAddr); @@ -4745,8 +4745,8 @@ static int rtl_open(struct net_device *dev) rtl_request_firmware(tp); - retval = pci_request_irq(pdev, 0, rtl8169_interrupt, NULL, tp, - dev->name); + retval = request_irq(pci_irq_vector(pdev, 0), rtl8169_interrupt, + IRQF_NO_THREAD | IRQF_SHARED, dev->name, tp); if (retval < 0) goto err_release_fw_2; @@ -4763,7 +4763,7 @@ static int rtl_open(struct net_device *dev) return retval; err_free_irq: - pci_free_irq(pdev, 0, tp); + free_irq(pci_irq_vector(pdev, 0), tp); err_release_fw_2: rtl_release_firmware(tp); rtl8169_rx_clear(tp); From 3bd57b90554b4bb82dce638e0668ef9dc95d3e96 Mon Sep 17 00:00:00 2001 From: Valentin Vidic Date: Sun, 18 Oct 2020 20:42:55 +0200 Subject: [PATCH 16/62] net: korina: cast KSEG0 address to pointer in kfree Fixes gcc warning: passing argument 1 of 'kfree' makes pointer from integer without a cast Fixes: 3af5f0f5c74e ("net: korina: fix kfree of rx/tx descriptor array") Reported-by: kernel test robot Signed-off-by: Valentin Vidic Link: https://lore.kernel.org/r/20201018184255.28989-1-vvidic@valentin-vidic.from.hr Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/korina.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c index af441d699a57..bf48f0ded9c7 100644 --- a/drivers/net/ethernet/korina.c +++ b/drivers/net/ethernet/korina.c @@ -1113,7 +1113,7 @@ static int korina_probe(struct platform_device *pdev) return rc; probe_err_register: - kfree(KSEG0ADDR(lp->td_ring)); + kfree((struct dma_desc *)KSEG0ADDR(lp->td_ring)); probe_err_td_ring: iounmap(lp->tx_dma_regs); probe_err_dma_tx: @@ -1133,7 +1133,7 @@ static int korina_remove(struct platform_device *pdev) iounmap(lp->eth_regs); iounmap(lp->rx_dma_regs); iounmap(lp->tx_dma_regs); - kfree(KSEG0ADDR(lp->td_ring)); + kfree((struct dma_desc *)KSEG0ADDR(lp->td_ring)); unregister_netdev(bif->dev); free_netdev(bif->dev); From bc7e343dbd4c1a86c490c1d4f08fca9ecdedbeaa Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Fri, 16 Oct 2020 19:16:03 +0200 Subject: [PATCH 17/62] net: dsa: tag_ksz: KSZ8795 and KSZ9477 also use tail tags The Marvell 88E6060 uses tag_trailer.c and the KSZ8795, KSZ9477 and KSZ9893 switches also use tail tags. Fixes: 7a6ffe764be3 ("net: dsa: point out the tail taggers") Signed-off-by: Christian Eggers Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20201016171603.10587-1-ceggers@arri.de Signed-off-by: Jakub Kicinski --- net/dsa/tag_ksz.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 945a9bd5ba35..0a5aa982c60d 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -123,6 +123,7 @@ static const struct dsa_device_ops ksz8795_netdev_ops = { .xmit = ksz8795_xmit, .rcv = ksz8795_rcv, .overhead = KSZ_INGRESS_TAG_LEN, + .tail_tag = true, }; DSA_TAG_DRIVER(ksz8795_netdev_ops); @@ -199,6 +200,7 @@ static const struct dsa_device_ops ksz9477_netdev_ops = { .xmit = ksz9477_xmit, .rcv = ksz9477_rcv, .overhead = KSZ9477_INGRESS_TAG_LEN, + .tail_tag = true, }; DSA_TAG_DRIVER(ksz9477_netdev_ops); From 26ebd6fed9bb3aa480c7c0f147ac0e7b11000f65 Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Mon, 19 Oct 2020 11:09:28 +0800 Subject: [PATCH 18/62] selftests: rtnetlink: load fou module for kci_test_encap_fou() test The kci_test_encap_fou() test from kci_test_encap() in rtnetlink.sh needs the fou module to work. Otherwise it will fail with: $ ip netns exec "$testns" ip fou add port 7777 ipproto 47 RTNETLINK answers: No such file or directory Error talking to the kernel Add the CONFIG_NET_FOU into the config file as well. Which needs at least to be set as a loadable module. Fixes: 6227efc1a20b ("selftests: rtnetlink.sh: add vxlan and fou test cases") Signed-off-by: Po-Hsu Lin Link: https://lore.kernel.org/r/20201019030928.9859-1-po-hsu.lin@canonical.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/config | 1 + tools/testing/selftests/net/rtnetlink.sh | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 43649242adc0..4d5df8e1eee7 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -33,3 +33,4 @@ CONFIG_KALLSYMS=y CONFIG_TRACEPOINTS=y CONFIG_NET_DROP_MONITOR=m CONFIG_NETDEVSIM=m +CONFIG_NET_FOU=m diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index 8a2fe6d64bf2..c9ce3dfa42ee 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -520,6 +520,11 @@ kci_test_encap_fou() return $ksft_skip fi + if ! /sbin/modprobe -q -n fou; then + echo "SKIP: module fou is not found" + return $ksft_skip + fi + /sbin/modprobe -q fou ip -netns "$testns" fou add port 7777 ipproto 47 2>/dev/null if [ $? -ne 0 ];then echo "FAIL: can't add fou port 7777, skipping test" From a15a6afb3bf9388eb83a4b876d3453f305fba909 Mon Sep 17 00:00:00 2001 From: Maxim Kochetkov Date: Mon, 19 Oct 2020 08:06:25 +0300 Subject: [PATCH 19/62] net: dsa: seville: the packet buffer is 2 megabits, not megabytes The VSC9953 Seville switch has 2 megabits of buffer split into 4360 words of 60 bytes each. 2048 * 1024 is 2 megabytes instead of 2 megabits. 2 megabits is (2048 / 8) * 1024 = 256 * 1024. Signed-off-by: Maxim Kochetkov Reviewed-by: Vladimir Oltean Fixes: a63ed92d217f ("net: dsa: seville: fix buffer size of the queue system") Link: https://lore.kernel.org/r/20201019050625.21533-1-fido_max@inbox.ru Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/seville_vsc9953.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c index 76576cf0ba8a..1d420c4a2f0f 100644 --- a/drivers/net/dsa/ocelot/seville_vsc9953.c +++ b/drivers/net/dsa/ocelot/seville_vsc9953.c @@ -1181,7 +1181,7 @@ static const struct felix_info seville_info_vsc9953 = { .stats_layout = vsc9953_stats_layout, .num_stats = ARRAY_SIZE(vsc9953_stats_layout), .vcap = vsc9953_vcap_props, - .shared_queue_sz = 2048 * 1024, + .shared_queue_sz = 256 * 1024, .num_mact_rows = 2048, .num_ports = 10, .mdio_bus_alloc = vsc9953_mdio_bus_alloc, From df6afe2f7c19349de2ee560dc62ea4d9ad3ff889 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 16 Oct 2020 20:29:14 +0300 Subject: [PATCH 20/62] nexthop: Fix performance regression in nexthop deletion While insertion of 16k nexthops all using the same netdev ('dummy10') takes less than a second, deletion takes about 130 seconds: # time -p ip -b nexthop.batch real 0.29 user 0.01 sys 0.15 # time -p ip link set dev dummy10 down real 131.03 user 0.06 sys 0.52 This is because of repeated calls to synchronize_rcu() whenever a nexthop is removed from a nexthop group: # /usr/share/bcc/tools/offcputime -p `pgrep -nx ip` -K ... b'finish_task_switch' b'schedule' b'schedule_timeout' b'wait_for_completion' b'__wait_rcu_gp' b'synchronize_rcu.part.0' b'synchronize_rcu' b'__remove_nexthop' b'remove_nexthop' b'nexthop_flush_dev' b'nh_netdev_event' b'raw_notifier_call_chain' b'call_netdevice_notifiers_info' b'__dev_notify_flags' b'dev_change_flags' b'do_setlink' b'__rtnl_newlink' b'rtnl_newlink' b'rtnetlink_rcv_msg' b'netlink_rcv_skb' b'rtnetlink_rcv' b'netlink_unicast' b'netlink_sendmsg' b'____sys_sendmsg' b'___sys_sendmsg' b'__sys_sendmsg' b'__x64_sys_sendmsg' b'do_syscall_64' b'entry_SYSCALL_64_after_hwframe' - ip (277) 126554955 Since nexthops are always deleted under RTNL, synchronize_net() can be used instead. It will call synchronize_rcu_expedited() which only blocks for several microseconds as opposed to multiple milliseconds like synchronize_rcu(). With this patch deletion of 16k nexthops takes less than a second: # time -p ip link set dev dummy10 down real 0.12 user 0.00 sys 0.04 Tested with fib_nexthops.sh which includes torture tests that prompted the initial change: # ./fib_nexthops.sh ... Tests passed: 134 Tests failed: 0 Fixes: 90f33bffa382 ("nexthops: don't modify published nexthop groups") Signed-off-by: Ido Schimmel Reviewed-by: Jesse Brandeburg Reviewed-by: David Ahern Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20201016172914.643282-1-idosch@idosch.org Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 8c0f17c6863c..0dc43ad28eb9 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -845,7 +845,7 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, remove_nh_grp_entry(net, nhge, nlinfo); /* make sure all see the newly published array before releasing rtnl */ - synchronize_rcu(); + synchronize_net(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) From 79dce09ab02729a90cf6ce49c9aaaf17aa0d21db Mon Sep 17 00:00:00 2001 From: "longguang.yue" Date: Mon, 28 Sep 2020 10:49:38 +0800 Subject: [PATCH 21/62] ipvs: adjust the debug info in function set_tcp_state Outputting client,virtual,dst addresses info when tcp state changes, which makes the connection debug more clear Signed-off-by: longguang.yue Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto_tcp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index dc2e7da2742a..7da51390cea6 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -539,8 +539,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; - IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" - "%s:%d state: %s->%s conn->refcnt:%d\n", + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " + "d:%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), @@ -548,10 +548,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', - IP_VS_DBG_ADDR(cp->daf, &cp->daddr), - ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), + ntohs(cp->dport), tcp_state_name(cp->state), tcp_state_name(new_state), refcount_read(&cp->refcnt)); From 4f25434bccc28cf8a07876ef5142a2869a674353 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Wed, 7 Oct 2020 12:32:52 -0700 Subject: [PATCH 22/62] netfilter: conntrack: connection timeout after re-register If the first packet conntrack sees after a re-register is an outgoing keepalive packet with no data (SEG.SEQ = SND.NXT-1), td_end is set to SND.NXT-1. When the peer correctly acknowledges SND.NXT, tcp_in_window fails check III (Upper bound for valid (s)ack: sack <= receiver.td_end) and returns false, which cascades into nf_conntrack_in setting skb->_nfct = 0 and in later conntrack iptables rules not matching. In cases where iptables are dropping packets that do not match conntrack rules this can result in idle tcp connections to time out. v2: adjust td_end when getting the reply rather than when sending out the keepalive packet. Fixes: f94e63801ab2 ("netfilter: conntrack: reset tcp maxwin on re-register") Signed-off-by: Francesco Ruggeri Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index e8c86ee4c1c4..c8fb2187ad4b 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -541,13 +541,20 @@ static bool tcp_in_window(const struct nf_conn *ct, swin = win << sender->td_scale; sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; - /* - * We haven't seen traffic in the other direction yet - * but we have to tweak window tracking to pass III - * and IV until that happens. - */ - if (receiver->td_maxwin == 0) + if (receiver->td_maxwin == 0) { + /* We haven't seen traffic in the other + * direction yet but we have to tweak window + * tracking to pass III and IV until that + * happens. + */ receiver->td_end = receiver->td_maxend = sack; + } else if (sack == receiver->td_end + 1) { + /* Likely a reply to a keepalive. + * Needed for III. + */ + receiver->td_end++; + } + } } else if (((state->state == TCP_CONNTRACK_SYN_SENT && dir == IP_CT_DIR_ORIGINAL) From 68f9f9c2c3b6a7259f6a92bc26cdc7bd22e7a982 Mon Sep 17 00:00:00 2001 From: Georg Kohmann Date: Tue, 13 Oct 2020 14:23:12 +0200 Subject: [PATCH 23/62] netfilter: Drop fragmented ndisc packets assembled in netfilter Fragmented ndisc packets assembled in netfilter not dropped as specified in RFC 6980, section 5. This behaviour breaks TAHI IPv6 Core Conformance Tests v6LC.2.1.22/23, V6LC.2.2.26/27 and V6LC.2.3.18. Setting IP6SKB_FRAGMENTED flag during reassembly. References: commit b800c3b966bc ("ipv6: drop fragmented ndisc packets by default (RFC 6980)") Signed-off-by: Georg Kohmann Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index fed9666a2f7d..054d287eb13d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -355,6 +355,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, ipv6_hdr(skb)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ if (skb->ip_summed == CHECKSUM_COMPLETE) From 63137bc5882a1882c553d389fdeeeace86ee1741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20COCAULT?= Date: Wed, 14 Oct 2020 12:36:15 +0000 Subject: [PATCH 24/62] netfilter: ebtables: Fixes dropping of small packets in bridge nat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes an error causing small packets to get dropped. skb_ensure_writable expects the second parameter to be a length in the ethernet payload.=20 If we want to write the ethernet header (src, dst), we should pass 0. Otherwise, packets with small payloads (< ETH_ALEN) will get dropped. Fixes: c1a831167901 ("netfilter: bridge: convert skb_make_writable to skb_ensure_writable") Signed-off-by: Timothée COCAULT Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_dnat.c | 2 +- net/bridge/netfilter/ebt_redirect.c | 2 +- net/bridge/netfilter/ebt_snat.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index 12a4f4d93681..3fda71a8579d 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -21,7 +21,7 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; - if (skb_ensure_writable(skb, ETH_ALEN)) + if (skb_ensure_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 0cad62a4052b..307790562b49 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -21,7 +21,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_redirect_info *info = par->targinfo; - if (skb_ensure_writable(skb, ETH_ALEN)) + if (skb_ensure_writable(skb, 0)) return EBT_DROP; if (xt_hooknum(par) != NF_BR_BROUTING) diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c index 27443bf229a3..7dfbcdfc30e5 100644 --- a/net/bridge/netfilter/ebt_snat.c +++ b/net/bridge/netfilter/ebt_snat.c @@ -22,7 +22,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; - if (skb_ensure_writable(skb, ETH_ALEN * 2)) + if (skb_ensure_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_source, info->mac); From 64747d5ed19911a867af733f6679d2a859fb18ae Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Sun, 18 Oct 2020 16:30:19 +0100 Subject: [PATCH 25/62] docs: nf_flowtable: fix typo. "mailined" should be "mainlined." Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/nf_flowtable.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/nf_flowtable.rst b/Documentation/networking/nf_flowtable.rst index b6e1fa141aae..6cdf9a1724b6 100644 --- a/Documentation/networking/nf_flowtable.rst +++ b/Documentation/networking/nf_flowtable.rst @@ -109,7 +109,7 @@ More reading This documentation is based on the LWN.net articles [1]_\ [2]_. Rafal Milecki also made a very complete and comprehensive summary called "A state of network acceleration" that describes how things were before this infrastructure was -mailined [3]_ and it also makes a rough summary of this work [4]_. +mainlined [3]_ and it also makes a rough summary of this work [4]_. .. [1] https://lwn.net/Articles/738214/ .. [2] https://lwn.net/Articles/742164/ From 31cc578ae2de19c748af06d859019dced68e325d Mon Sep 17 00:00:00 2001 From: Saeed Mirzamohammadi Date: Tue, 20 Oct 2020 13:41:36 +0200 Subject: [PATCH 26/62] netfilter: nftables_offload: KASAN slab-out-of-bounds Read in nft_flow_rule_create This patch fixes the issue due to: BUG: KASAN: slab-out-of-bounds in nft_flow_rule_create+0x622/0x6a2 net/netfilter/nf_tables_offload.c:40 Read of size 8 at addr ffff888103910b58 by task syz-executor227/16244 The error happens when expr->ops is accessed early on before performing the boundary check and after nft_expr_next() moves the expr to go out-of-bounds. This patch checks the boundary condition before expr->ops that fixes the slab-out-of-bounds Read issue. Add nft_expr_more() and use it to fix this problem. Signed-off-by: Saeed Mirzamohammadi Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 ++++++ net/netfilter/nf_tables_api.c | 6 +++--- net/netfilter/nf_tables_offload.c | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3f7e56b1171e..55b4cadf290a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -891,6 +891,12 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) return (struct nft_expr *)&rule->data[rule->dlen]; } +static inline bool nft_expr_more(const struct nft_rule *rule, + const struct nft_expr *expr) +{ + return expr != nft_expr_last(rule) && expr->ops; +} + static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9957e0ed8658..65cb8e3c13d9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -302,7 +302,7 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { if (expr->ops->activate) expr->ops->activate(ctx, expr); @@ -317,7 +317,7 @@ static void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { if (expr->ops->deactivate) expr->ops->deactivate(ctx, expr, phase); @@ -3080,7 +3080,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, * is called on error from nf_tables_newrule(). */ expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { next = nft_expr_next(expr); nf_tables_expr_destroy(ctx, expr); expr = next; diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 7c7e06624dc3..9f625724a20f 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -37,7 +37,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr->ops && expr != nft_expr_last(rule)) { + while (nft_expr_more(rule, expr)) { if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) num_actions++; @@ -61,7 +61,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; - while (expr->ops && expr != nft_expr_last(rule)) { + while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; From 7ebb9db011088f9bd357791f49cb7012e66f29e2 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 19 Oct 2020 10:26:07 -0700 Subject: [PATCH 27/62] net: remove unneeded break A break is not needed if it is preceded by a return or goto Signed-off-by: Tom Rix Link: https://lore.kernel.org/r/20201019172607.31622-1-trix@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 1 - drivers/net/ethernet/cisco/enic/enic_ethtool.c | 1 - drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c | 1 - drivers/net/wan/lmc/lmc_proto.c | 4 ---- 4 files changed, 7 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 0f865daeb36d..bf5e0e9bd0e2 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -1163,7 +1163,6 @@ int aq_nic_set_link_ksettings(struct aq_nic_s *self, default: err = -1; goto err_exit; - break; } if (!(self->aq_nic_cfg.aq_hw_caps->link_speed_msk & rate)) { err = -1; diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index a4dd52bba2c3..1a9803f2073e 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -434,7 +434,6 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd) break; default: return -EINVAL; - break; } fsp->h_u.tcp_ip4_spec.ip4src = flow_get_u32_src(&n->keys); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c index de563cfd294d..4b93ba149ec5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c @@ -350,7 +350,6 @@ static s32 ixgbe_calc_eeprom_checksum_X540(struct ixgbe_hw *hw) if (ixgbe_read_eerd_generic(hw, pointer, &length)) { hw_dbg(hw, "EEPROM read failed\n"); return IXGBE_ERR_EEPROM; - break; } /* Skip pointer section if length is invalid. */ diff --git a/drivers/net/wan/lmc/lmc_proto.c b/drivers/net/wan/lmc/lmc_proto.c index e8b0b902b424..4e9cc83b615a 100644 --- a/drivers/net/wan/lmc/lmc_proto.c +++ b/drivers/net/wan/lmc/lmc_proto.c @@ -89,17 +89,13 @@ __be16 lmc_proto_type(lmc_softc_t *sc, struct sk_buff *skb) /*FOLD00*/ switch(sc->if_type){ case LMC_PPP: return hdlc_type_trans(skb, sc->lmc_device); - break; case LMC_NET: return htons(ETH_P_802_2); - break; case LMC_RAW: /* Packet type for skbuff kind of useless */ return htons(ETH_P_802_2); - break; default: printk(KERN_WARNING "%s: No protocol set for this interface, assuming 802.2 (which is wrong!!)\n", sc->name); return htons(ETH_P_802_2); - break; } } From 618355cc6a0d9c23da2be171b72686f1f94a4fc1 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 19 Oct 2020 12:15:00 -0700 Subject: [PATCH 28/62] nfc: remove unneeded break A break is not needed if it is preceded by a return Signed-off-by: Tom Rix Link: https://lore.kernel.org/r/20201019191500.9264-1-trix@redhat.com Signed-off-by: Jakub Kicinski --- drivers/nfc/st21nfca/core.c | 1 - drivers/nfc/trf7970a.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/nfc/st21nfca/core.c b/drivers/nfc/st21nfca/core.c index 2ce17932a073..6ca0d2f56b18 100644 --- a/drivers/nfc/st21nfca/core.c +++ b/drivers/nfc/st21nfca/core.c @@ -794,7 +794,6 @@ static int st21nfca_hci_im_transceive(struct nfc_hci_dev *hdev, skb->len, st21nfca_hci_data_exchange_cb, info); - break; default: return 1; } diff --git a/drivers/nfc/trf7970a.c b/drivers/nfc/trf7970a.c index 3bd97c73f983..c70f62fe321e 100644 --- a/drivers/nfc/trf7970a.c +++ b/drivers/nfc/trf7970a.c @@ -1382,7 +1382,6 @@ static int trf7970a_is_iso15693_write_or_lock(u8 cmd) case ISO15693_CMD_WRITE_DSFID: case ISO15693_CMD_LOCK_DSFID: return 1; - break; default: return 0; } From c5eb48e89286510d638844104a6f6582a4838a32 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Oct 2020 21:59:55 +0200 Subject: [PATCH 29/62] bpf, doc: Fix patchwork URL to point to kernel.org instance Follow-up on ebb034b15bfa ("bpf: Migrate from patchwork.ozlabs.org to patchwork.kernel.org.") in order to fix up the patchwork URL (Q) in the MAINTAINERS file for BPF subsystem. While at it, also add the official website (W) entry. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/f73ae01c7e6f9cf0a3890f2ca988a8e69190c50b.1603223852.git.daniel@iogearbox.net --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0f59b0412953..6d50cbf198b5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3244,7 +3244,8 @@ R: KP Singh L: netdev@vger.kernel.org L: bpf@vger.kernel.org S: Supported -Q: https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147 +W: https://bpf.io/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/?delegate=121173 T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git F: Documentation/bpf/ From acd7aaf51b20263a7e62d2a26569988c63bdd3d8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 18 Oct 2020 18:36:25 +0200 Subject: [PATCH 30/62] netsec: ignore 'phy-mode' device property on ACPI systems Since commit bbc4d71d63549bc ("net: phy: realtek: fix rtl8211e rx/tx delay config"), the Realtek PHY driver will override any TX/RX delay set by hardware straps if the phy-mode device property does not match. This is causing problems on SynQuacer based platforms (the only SoC that incorporates the netsec hardware), since many were built with this Realtek PHY, and shipped with firmware that defines the phy-mode as 'rgmii', even though the PHY is configured for TX and RX delay using pull-ups. From the driver's perspective, we should not make any assumptions in the general case that the PHY hardware does not require any initial configuration. However, the situation is slightly different for ACPI boot, since it implies rich firmware with AML abstractions to handle hardware details that are not exposed to the OS. So in the ACPI case, it is reasonable to assume that the PHY comes up in the right mode, regardless of whether the mode is set by straps, by boot time firmware or by AML executed by the ACPI interpreter. So let's ignore the 'phy-mode' device property when probing the netsec driver in ACPI mode, and hardcode the mode to PHY_INTERFACE_MODE_NA, which should work with any PHY provided that it is configured by the time the driver attaches to it. While at it, document that omitting the mode is permitted for DT probing as well, by setting the phy-mode DT property to the empty string. Fixes: 533dd11a12f6 ("net: socionext: Add Synquacer NetSec driver") Signed-off-by: Ard Biesheuvel Reviewed-by: Ilias Apalodimas Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20201018163625.2392-1-ardb@kernel.org Signed-off-by: Jakub Kicinski --- .../bindings/net/socionext-netsec.txt | 4 +++- drivers/net/ethernet/socionext/netsec.c | 24 +++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/net/socionext-netsec.txt b/Documentation/devicetree/bindings/net/socionext-netsec.txt index 9d6c9feb12ff..a3c1dffaa4bb 100644 --- a/Documentation/devicetree/bindings/net/socionext-netsec.txt +++ b/Documentation/devicetree/bindings/net/socionext-netsec.txt @@ -30,7 +30,9 @@ Optional properties: (See ethernet.txt file in the same directory) - max-frame-size: See ethernet.txt in the same directory. The MAC address will be determined using the optional properties -defined in ethernet.txt. +defined in ethernet.txt. The 'phy-mode' property is required, but may +be set to the empty string if the PHY configuration is programmed by +the firmware or set by hardware straps, and needs to be preserved. Example: eth0: ethernet@522d0000 { diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 806eb651cea3..1503cc9ec6e2 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -1833,6 +1834,14 @@ static const struct net_device_ops netsec_netdev_ops = { static int netsec_of_probe(struct platform_device *pdev, struct netsec_priv *priv, u32 *phy_addr) { + int err; + + err = of_get_phy_mode(pdev->dev.of_node, &priv->phy_interface); + if (err) { + dev_err(&pdev->dev, "missing required property 'phy-mode'\n"); + return err; + } + priv->phy_np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); if (!priv->phy_np) { dev_err(&pdev->dev, "missing required property 'phy-handle'\n"); @@ -1859,6 +1868,14 @@ static int netsec_acpi_probe(struct platform_device *pdev, if (!IS_ENABLED(CONFIG_ACPI)) return -ENODEV; + /* ACPI systems are assumed to configure the PHY in firmware, so + * there is really no need to discover the PHY mode from the DSDT. + * Since firmware is known to exist in the field that configures the + * PHY correctly but passes the wrong mode string in the phy-mode + * device property, we have no choice but to ignore it. + */ + priv->phy_interface = PHY_INTERFACE_MODE_NA; + ret = device_property_read_u32(&pdev->dev, "phy-channel", phy_addr); if (ret) { dev_err(&pdev->dev, @@ -1995,13 +2012,6 @@ static int netsec_probe(struct platform_device *pdev) priv->msg_enable = NETIF_MSG_TX_ERR | NETIF_MSG_HW | NETIF_MSG_DRV | NETIF_MSG_LINK | NETIF_MSG_PROBE; - priv->phy_interface = device_get_phy_mode(&pdev->dev); - if ((int)priv->phy_interface < 0) { - dev_err(&pdev->dev, "missing required property 'phy-mode'\n"); - ret = -ENODEV; - goto free_ndev; - } - priv->ioaddr = devm_ioremap(&pdev->dev, mmio_res->start, resource_size(mmio_res)); if (!priv->ioaddr) { From 47b5d2a107396ab05e83a4dfbd30b563ecbc83af Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 19 Oct 2020 12:02:44 +0300 Subject: [PATCH 31/62] net/sched: act_ct: Fix adding udp port mangle operation Need to use the udp header type and not tcp. Fixes: 9c26ba9b1f45 ("net/sched: act_ct: Instantiate flow table entry actions") Signed-off-by: Roi Dayan Reviewed-by: Paul Blakey Link: https://lore.kernel.org/r/20201019090244.3015186-1-roid@nvidia.com Signed-off-by: Jakub Kicinski --- net/sched/act_ct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 9c79fb92c2da..aba3cd85f284 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -156,11 +156,11 @@ tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple, __be16 target_dst = target.dst.u.udp.port; if (target_src != tuple->src.u.udp.port) - tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP, + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP, offsetof(struct udphdr, source), 0xFFFF, be16_to_cpu(target_src)); if (target_dst != tuple->dst.u.udp.port) - tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP, + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP, offsetof(struct udphdr, dest), 0xFFFF, be16_to_cpu(target_dst)); } From fe2d9b1a0e7805384770ec0ddd34c9f1e9fe6fa8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 19 Oct 2020 18:23:15 +0800 Subject: [PATCH 32/62] mptcp: initialize mptcp_options_received's ahmac Initialize mptcp_options_received's ahmac to zero, otherwise it will be a random number when receiving ADD_ADDR suboption with echo-flag=1. Fixes: 3df523ab582c5 ("mptcp: Add ADD_ADDR handling") Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 092a2d48bfd3..1ff3469938b6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -297,6 +297,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->mp_capable = 0; mp_opt->mp_join = 0; mp_opt->add_addr = 0; + mp_opt->ahmac = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; From 65b8c8a620a390a901522f19beed1476e2274feb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 19 Oct 2020 18:23:16 +0800 Subject: [PATCH 33/62] mptcp: move mptcp_options_received's port initialization Move mptcp_options_received's port initialization from mptcp_parse_option to mptcp_get_options, put it together with the other fields initializations of mptcp_options_received. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1ff3469938b6..a044dd43411d 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -241,7 +241,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, } mp_opt->add_addr = 1; - mp_opt->port = 0; mp_opt->addr_id = *ptr++; pr_debug("ADD_ADDR: id=%d, echo=%d", mp_opt->addr_id, mp_opt->echo); if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { @@ -298,6 +297,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->mp_join = 0; mp_opt->add_addr = 0; mp_opt->ahmac = 0; + mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; From b142083b585c2c03af24cca4d274f797796a4064 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 19 Oct 2020 13:32:40 +0200 Subject: [PATCH 34/62] mptcp: MPTCP_KUNIT_TESTS should depend on MPTCP instead of selecting it MPTCP_KUNIT_TESTS selects MPTCP, thus enabling an optional feature the user may not want to enable. Fix this by making the test depend on MPTCP instead. Fixes: a00a582203dbc43e ("mptcp: move crypto test to KUNIT") Signed-off-by: Geert Uytterhoeven Reviewed-by: Matthieu Baerts Link: https://lore.kernel.org/r/20201019113240.11516-1-geert@linux-m68k.org Signed-off-by: Jakub Kicinski --- net/mptcp/Kconfig | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index 698bc3525160..abb0a992d4a0 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -22,11 +22,8 @@ config MPTCP_IPV6 select IPV6 default y -endif - config MPTCP_KUNIT_TESTS tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS - select MPTCP depends on KUNIT default KUNIT_ALL_TESTS help @@ -39,3 +36,4 @@ config MPTCP_KUNIT_TESTS If unsure, say N. +endif From 280e3ebdafb863b3cb50d5842f056267e15bf40c Mon Sep 17 00:00:00 2001 From: Defang Bo Date: Mon, 19 Oct 2020 19:38:58 +0800 Subject: [PATCH 35/62] nfc: Ensure presence of NFC_ATTR_FIRMWARE_NAME attribute in nfc_genl_fw_download() Check that the NFC_ATTR_FIRMWARE_NAME attributes are provided by the netlink client prior to accessing them.This prevents potential unhandled NULL pointer dereference exceptions which can be triggered by malicious user-mode programs, if they omit one or both of these attributes. Similar to commit a0323b979f81 ("nfc: Ensure presence of required attributes in the activate_target handler"). Fixes: 9674da8759df ("NFC: Add firmware upload netlink command") Signed-off-by: Defang Bo Link: https://lore.kernel.org/r/1603107538-4744-1-git-send-email-bodefang@126.com Signed-off-by: Jakub Kicinski --- net/nfc/netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index e894254c17d4..8709f3d4e7c4 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1217,7 +1217,7 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) u32 idx; char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1]; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); From 0fb5f0160a36d7acaa8e84ce873af99f94b60484 Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Mon, 19 Oct 2020 17:20:20 +0530 Subject: [PATCH 36/62] chelsio/chtls: fix socket lock In chtls_sendpage() socket lock is released but not acquired, fix it by taking lock. Fixes: 36bedb3f2e5b ("crypto: chtls - Inline TLS record Tx") Signed-off-by: Vinay Kumar Yadav Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c index 2e9acae1cba3..28c6c538032d 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c @@ -1240,6 +1240,7 @@ int chtls_sendpage(struct sock *sk, struct page *page, copied = 0; csk = rcu_dereference_sk_user_data(sk); cdev = csk->cdev; + lock_sock(sk); timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); err = sk_stream_wait_connect(sk, &timeo); From 81519d1f7df7ed1bd5b1397540c8884438f57ae2 Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Mon, 19 Oct 2020 17:20:21 +0530 Subject: [PATCH 37/62] chelsio/chtls: correct netdevice for vlan interface Check if netdevice is a vlan interface and find real vlan netdevice. Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Venkatesh Ellapu Signed-off-by: Vinay Kumar Yadav Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 05520dccd906..2f9eceaf706d 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1157,6 +1157,9 @@ static struct sock *chtls_recv_sock(struct sock *lsk, ndev = n->dev; if (!ndev) goto free_dst; + if (is_vlan_dev(ndev)) + ndev = vlan_dev_real_dev(ndev); + port_id = cxgb4_port_idx(ndev); csk = chtls_sock_create(cdev); From 86cdf9ca4409d997a391103e480b3f77b7ccc19b Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Mon, 19 Oct 2020 17:20:22 +0530 Subject: [PATCH 38/62] chelsio/chtls: fix panic when server is on ipv6 Netdev is filled in egress_dev when connection is established, If connection is closed before establishment, then egress_dev is NULL, Fix it using ip_dev_find() rather then extracting from egress_dev. Fixes: 6abde0b24122 ("crypto/chtls: IPv6 support for inline TLS") Signed-off-by: Venkatesh Ellapu Signed-off-by: Vinay Kumar Yadav Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 2f9eceaf706d..e46228ca49ad 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -736,14 +736,13 @@ void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET6) { - struct chtls_sock *csk; + struct net_device *ndev = chtls_find_netdev(cdev, sk); int addr_type = 0; - csk = rcu_dereference_sk_user_data(sk); addr_type = ipv6_addr_type((const struct in6_addr *) &sk->sk_v6_rcv_saddr); if (addr_type != IPV6_ADDR_ANY) - cxgb4_clip_release(csk->egress_dev, (const u32 *) + cxgb4_clip_release(ndev, (const u32 *) &sk->sk_v6_rcv_saddr, 1); } #endif From 9819f22c410b4bf6589d3126e8bc3952db507cbf Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Mon, 19 Oct 2020 17:20:23 +0530 Subject: [PATCH 39/62] chelsio/chtls: Fix panic when listen on multiadapter Add the logic to compare net_device returned by ip_dev_find() with the net_device list in cdev->ports[] array and return net_device if matched else NULL. Fixes: 6abde0b24122 ("crypto/chtls: IPv6 support for inline TLS") Signed-off-by: Venkatesh Ellapu Signed-off-by: Vinay Kumar Yadav Signed-off-by: Jakub Kicinski --- .../ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index e46228ca49ad..bdb53fa41022 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -92,11 +92,13 @@ static void chtls_sock_release(struct kref *ref) static struct net_device *chtls_find_netdev(struct chtls_dev *cdev, struct sock *sk) { + struct adapter *adap = pci_get_drvdata(cdev->pdev); struct net_device *ndev = cdev->ports[0]; #if IS_ENABLED(CONFIG_IPV6) struct net_device *temp; int addr_type; #endif + int i; switch (sk->sk_family) { case PF_INET: @@ -127,8 +129,12 @@ static struct net_device *chtls_find_netdev(struct chtls_dev *cdev, return NULL; if (is_vlan_dev(ndev)) - return vlan_dev_real_dev(ndev); - return ndev; + ndev = vlan_dev_real_dev(ndev); + + for_each_port(adap, i) + if (cdev->ports[i] == ndev) + return ndev; + return NULL; } static void assign_rxopt(struct sock *sk, unsigned int opt) From 8580a61aede28d441e1c80588803411ee86aa299 Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Mon, 19 Oct 2020 17:20:24 +0530 Subject: [PATCH 40/62] chelsio/chtls: correct function return and return type csk_mem_free() should return true if send buffer is available, false otherwise. Fixes: 3b8305f5c844 ("crypto: chtls - wait for memory sendmsg, sendpage") Signed-off-by: Vinay Kumar Yadav Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c index 28c6c538032d..9fb5ca6682ea 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c @@ -902,9 +902,9 @@ static int chtls_skb_copy_to_page_nocache(struct sock *sk, return 0; } -static int csk_mem_free(struct chtls_dev *cdev, struct sock *sk) +static bool csk_mem_free(struct chtls_dev *cdev, struct sock *sk) { - return (cdev->max_host_sndbuf - sk->sk_wmem_queued); + return (cdev->max_host_sndbuf - sk->sk_wmem_queued > 0); } static int csk_wait_memory(struct chtls_dev *cdev, From da1a039bcf293e4699d413c9f65d975da2d7c0bd Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Mon, 19 Oct 2020 17:20:25 +0530 Subject: [PATCH 41/62] chelsio/chtls: fix writing freed memory When chtls_sock *csk is freed, same memory can be allocated to different csk in chtls_sock_create(). csk->cdev = NULL; statement might ends up modifying wrong csk, eventually causing kernel panic. removing (csk->cdev = NULL) statement as it is not required. Fixes: 3a0a97838923 ("crypto/chtls: Fix chtls crash in connection cleanup") Signed-off-by: Vinay Kumar Yadav Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index bdb53fa41022..ec4f79049a06 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -483,7 +483,6 @@ void chtls_destroy_sock(struct sock *sk) chtls_purge_write_queue(sk); free_tls_keyid(sk); kref_put(&csk->kref, chtls_sock_release); - csk->cdev = NULL; if (sk->sk_family == AF_INET) sk->sk_prot = &tcp_prot; #if IS_ENABLED(CONFIG_IPV6) From 6e915b274860dcb9301ba7d6e9e033903fbf6137 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 19 Oct 2020 11:10:59 -0700 Subject: [PATCH 42/62] net: chelsio: inline_crypto: fix Kconfig and build errors Fix build errors when TLS=m, TLS_TOE=y, and CRYPTO_DEV_CHELSIO_TLS=y. Having (tristate) CRYPTO_DEV_CHELSIO_TLS depend on (bool) TLS_TOE is not strong enough to prevent the bad combination of TLS=m and CRYPTO_DEV_CHELSIO_TLS=y, so add a dependency on TLS to prevent the problematic kconfig combination. Fixes these build errors: hppa-linux-ld: drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.o: in function `chtls_free_uld': drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c:165: undefined reference to `tls_toe_unregister_device' hppa-linux-ld: drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.o: in function `chtls_register_dev': drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c:204: undefined reference to `tls_toe_register_device' Fixes: 53b4414a7003 ("net/tls: allow compiling TLS TOE out") Reported-by: kernel test robot Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20201019181059.22634-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/Kconfig b/drivers/net/ethernet/chelsio/inline_crypto/Kconfig index 7dfa57348d54..bc06e83fd3c6 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/Kconfig +++ b/drivers/net/ethernet/chelsio/inline_crypto/Kconfig @@ -16,6 +16,7 @@ if CHELSIO_INLINE_CRYPTO config CRYPTO_DEV_CHELSIO_TLS tristate "Chelsio Crypto Inline TLS Driver" depends on CHELSIO_T4 + depends on TLS depends on TLS_TOE help Support Chelsio Inline TLS with Chelsio crypto accelerator. From 010b430d5df556d5d232e3751ac691ba9e88c041 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 20 Oct 2020 09:38:39 +0200 Subject: [PATCH 43/62] mptcp: MPTCP_IPV6 should depend on IPV6 instead of selecting it MPTCP_IPV6 selects IPV6, thus enabling an optional feature the user may not want to enable. Fix this by making MPTCP_IPV6 depend on IPV6, like is done for all other IPv6 features. Fixes: f870fa0b5768842c ("mptcp: Add MPTCP socket stubs") Signed-off-by: Geert Uytterhoeven Reviewed-by: Matthieu Baerts Link: https://lore.kernel.org/r/20201020073839.29226-1-geert@linux-m68k.org Signed-off-by: Jakub Kicinski --- net/mptcp/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index abb0a992d4a0..8936604b3bf9 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -19,7 +19,7 @@ config INET_MPTCP_DIAG config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" - select IPV6 + depends on IPV6 default y config MPTCP_KUNIT_TESTS From d978d6d008fa7a90a435ba7f101dfcbcc1c816a9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 20 Oct 2020 17:50:29 +0100 Subject: [PATCH 44/62] net: dsa: bcm_sf2: make const array static, makes object smaller Don't populate the const array rate_table on the stack but instead it static. Makes the object code smaller by 46 bytes. Before: text data bss dec hex filename 29812 3824 192 33828 8424 drivers/net/dsa/bcm_sf2.o After: text data bss dec hex filename 29670 3920 192 33782 83f6 drivers/net/dsa/bcm_sf2.o (gcc version 10.2.0) Signed-off-by: Colin Ian King Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20201020165029.56383-1-colin.king@canonical.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/bcm_sf2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 0b5b2b33b3b6..1e9a0adda2d6 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -54,7 +54,7 @@ static void bcm_sf2_recalc_clock(struct dsa_switch *ds) unsigned long new_rate; unsigned int ports_active; /* Frequenty in Mhz */ - const unsigned long rate_table[] = { + static const unsigned long rate_table[] = { 59220000, 60820000, 62500000, From b130762161374b1ef31549bef8ebd4abeb998d94 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 20 Oct 2020 17:34:31 +0200 Subject: [PATCH 45/62] net/sched: act_gate: Unlock ->tcfa_lock in tc_setup_flow_action() We need to jump to the "err_out_locked" label when tcf_gate_get_entries() fails. Otherwise, tc_setup_flow_action() exits with ->tcfa_lock still held. Fixes: d29bdd69ecdd ("net: schedule: add action gate offloading") Signed-off-by: Guillaume Nault Acked-by: Cong Wang Link: https://lore.kernel.org/r/12f60e385584c52c22863701c0185e40ab08a7a7.1603207948.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 41a55c6cbeb8..faeabff283a2 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3712,7 +3712,7 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->gate.num_entries = tcf_gate_num_entries(act); err = tcf_gate_get_entries(entry, act); if (err) - goto err_out; + goto err_out_locked; } else { err = -EOPNOTSUPP; goto err_out_locked; From a7a12b5a0f950bc6b9f7153390634ea798738db9 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 21 Oct 2020 00:02:40 +0200 Subject: [PATCH 46/62] net/sched: act_tunnel_key: fix OOB write in case of IPv6 ERSPAN tunnels the following command # tc action add action tunnel_key \ > set src_ip 2001:db8::1 dst_ip 2001:db8::2 id 10 erspan_opts 1:6789:0:0 generates the following splat: BUG: KASAN: slab-out-of-bounds in tunnel_key_copy_opts+0xcc9/0x1010 [act_tunnel_key] Write of size 4 at addr ffff88813f5f1cc8 by task tc/873 CPU: 2 PID: 873 Comm: tc Not tainted 5.9.0+ #282 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 Call Trace: dump_stack+0x99/0xcb print_address_description.constprop.7+0x1e/0x230 kasan_report.cold.13+0x37/0x7c tunnel_key_copy_opts+0xcc9/0x1010 [act_tunnel_key] tunnel_key_init+0x160c/0x1f40 [act_tunnel_key] tcf_action_init_1+0x5b5/0x850 tcf_action_init+0x15d/0x370 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x29b/0x3a0 rtnetlink_rcv_msg+0x341/0x8d0 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x719/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5ba/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f872a96b338 Code: 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 43 2c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 41 89 d4 55 RSP: 002b:00007ffffe367518 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 000000005f8f5aed RCX: 00007f872a96b338 RDX: 0000000000000000 RSI: 00007ffffe367580 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000000001c R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000001 R13: 0000000000686760 R14: 0000000000000601 R15: 0000000000000000 Allocated by task 873: kasan_save_stack+0x19/0x40 __kasan_kmalloc.constprop.7+0xc1/0xd0 __kmalloc+0x151/0x310 metadata_dst_alloc+0x20/0x40 tunnel_key_init+0xfff/0x1f40 [act_tunnel_key] tcf_action_init_1+0x5b5/0x850 tcf_action_init+0x15d/0x370 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x29b/0x3a0 rtnetlink_rcv_msg+0x341/0x8d0 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x719/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5ba/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff88813f5f1c00 which belongs to the cache kmalloc-256 of size 256 The buggy address is located 200 bytes inside of 256-byte region [ffff88813f5f1c00, ffff88813f5f1d00) The buggy address belongs to the page: page:0000000011b48a19 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x13f5f0 head:0000000011b48a19 order:1 compound_mapcount:0 flags: 0x17ffffc0010200(slab|head) raw: 0017ffffc0010200 0000000000000000 0000000d00000001 ffff888107c43400 raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88813f5f1b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88813f5f1c00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff88813f5f1c80: 00 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc ^ ffff88813f5f1d00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88813f5f1d80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc using IPv6 tunnels, act_tunnel_key allocates a fixed amount of memory for the tunnel metadata, but then it expects additional bytes to store tunnel specific metadata with tunnel_key_copy_opts(). Fix the arguments of __ipv6_tun_set_dst(), so that 'md_size' contains the size previously computed by tunnel_key_get_opts_len(), like it's done for IPv4 tunnels. Fixes: 0ed5269f9e41 ("net/sched: add tunnel option support to act_tunnel_key") Reported-by: Shuang Li Signed-off-by: Davide Caratti Acked-by: Cong Wang Link: https://lore.kernel.org/r/36ebe969f6d13ff59912d6464a4356fe6f103766.1603231100.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/act_tunnel_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index a229751ee8c4..85c0d0d5b9da 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -459,7 +459,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port, 0, flags, - key_id, 0); + key_id, opts_len); } else { NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst"); ret = -EINVAL; From b7c24497baeaf21172b447f7cca36b0e99bd11e3 Mon Sep 17 00:00:00 2001 From: Alexander Ovechkin Date: Tue, 20 Oct 2020 14:43:33 +0300 Subject: [PATCH 47/62] mpls: load mpls_gso after mpls_iptunnel mpls_iptunnel is used only for mpls encapsuation, and if encaplusated packet is larger than MTU we need mpls_gso for segmentation. Signed-off-by: Alexander Ovechkin Acked-by: Dmitry Yakunin Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20201020114333.26866-1-ovov@yandex-team.ru Signed-off-by: Jakub Kicinski --- net/mpls/mpls_iptunnel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 2def85718d94..ef59e25dc482 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -300,5 +300,6 @@ static void __exit mpls_iptunnel_exit(void) module_exit(mpls_iptunnel_exit); MODULE_ALIAS_RTNL_LWT(MPLS); +MODULE_SOFTDEP("post: mpls_gso"); MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); MODULE_LICENSE("GPL v2"); From 05f90bf3d5df40e1a705527520e5fd56b2b6f09e Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 20 Oct 2020 15:35:08 +0100 Subject: [PATCH 48/62] sfc: move initialisation of efx->filter_sem to efx_init_struct() efx_probe_filters() has not been called yet when EF100 calls into efx_mcdi_filter_table_probe(), for which it wants to take the filter_sem. Fixes: a9dc3d5612ce ("sfc_ef100: RX filter table management and related gubbins") Signed-off-by: Edward Cree Link: https://lore.kernel.org/r/24fad43e-887d-051e-25e3-506f23f63abf@solarflare.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/efx_common.c | 1 + drivers/net/ethernet/sfc/rx_common.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx_common.c b/drivers/net/ethernet/sfc/efx_common.c index 72a3f0e09f52..de797e1ac5a9 100644 --- a/drivers/net/ethernet/sfc/efx_common.c +++ b/drivers/net/ethernet/sfc/efx_common.c @@ -1014,6 +1014,7 @@ int efx_init_struct(struct efx_nic *efx, efx->num_mac_stats = MC_CMD_MAC_NSTATS; BUILD_BUG_ON(MC_CMD_MAC_NSTATS - 1 != MC_CMD_MAC_GENERATION_END); mutex_init(&efx->mac_lock); + init_rwsem(&efx->filter_sem); #ifdef CONFIG_RFS_ACCEL mutex_init(&efx->rps_mutex); spin_lock_init(&efx->rps_hash_lock); diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index 5e29284c89c9..19cf7cac1e6e 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -797,7 +797,6 @@ int efx_probe_filters(struct efx_nic *efx) { int rc; - init_rwsem(&efx->filter_sem); mutex_lock(&efx->mac_lock); down_write(&efx->filter_sem); rc = efx->type->filter_table_probe(efx); From 0ed37ac586c01fd5bf3f7559de79f1d621ccf192 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Wed, 21 Oct 2020 12:51:53 +0200 Subject: [PATCH 49/62] mptcp: depends on IPV6 but not as a module Like TCP, MPTCP cannot be compiled as a module. Obviously, MPTCP IPv6' support also depends on CONFIG_IPV6. But not all functions from IPv6 code are exported. To simplify the code and reduce modifications outside MPTCP, it was decided from the beginning to support MPTCP with IPv6 only if CONFIG_IPV6 was built inlined. That's also why CONFIG_MPTCP_IPV6 was created. More modifications are needed to support CONFIG_IPV6=m. Even if it was not explicit, until recently, we were forcing CONFIG_IPV6 to be built-in because we had "select IPV6" in Kconfig. Now that we have "depends on IPV6", we have to explicitly set "IPV6=y" to force CONFIG_IPV6 not to be built as a module. In other words, we can now only have CONFIG_MPTCP_IPV6=y if CONFIG_IPV6=y. Note that the new dependency might hide the fact IPv6 is not supported in MPTCP even if we have CONFIG_IPV6=m. But selecting IPV6 like we did before was forcing it to be built-in while it was maybe not what the user wants. Reported-by: Geert Uytterhoeven Fixes: 010b430d5df5 ("mptcp: MPTCP_IPV6 should depend on IPV6 instead of selecting it") Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20201021105154.628257-1-matthieu.baerts@tessares.net Signed-off-by: Jakub Kicinski --- net/mptcp/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index 8936604b3bf9..a014149aa323 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -19,7 +19,7 @@ config INET_MPTCP_DIAG config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" - depends on IPV6 + depends on IPV6=y default y config MPTCP_KUNIT_TESTS From ba452c9e996d8a4c347b32805f91abb70de5de7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 20 Oct 2020 23:25:56 +0200 Subject: [PATCH 50/62] bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on the discussion in [0], update the bpf_redirect_neigh() helper to accept an optional parameter specifying the nexthop information. This makes it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without incurring a duplicate FIB lookup - since the FIB lookup helper will return the nexthop information even if no neighbour is present, this can simply be passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns BPF_FIB_LKUP_RET_NO_NEIGH. Thus fix & extend it before helper API is frozen. [0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: David Ahern Link: https://lore.kernel.org/bpf/160322915615.32199.1187570224032024535.stgit@toke.dk --- include/linux/filter.h | 9 ++ include/uapi/linux/bpf.h | 22 ++++- net/core/filter.c | 158 +++++++++++++++++++++------------ scripts/bpf_helpers_doc.py | 1 + tools/include/uapi/linux/bpf.h | 22 ++++- 5 files changed, 145 insertions(+), 67 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 20fc24c9779a..72d62cbc1578 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -607,12 +607,21 @@ struct bpf_skb_data_end { void *data_end; }; +struct bpf_nh_params { + u32 nh_family; + union { + u32 ipv4_nh; + struct in6_addr ipv6_nh; + }; +}; + struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; struct bpf_map *map; u32 kern_flags; + struct bpf_nh_params nh; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bf5a99d803e4..e6ceac3f7d62 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. * * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,16 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/net/core/filter.c b/net/core/filter.c index c5e2a1c5fd8d..6d0fa65a4a46 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2165,12 +2165,12 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, } #if IS_ENABLED(CONFIG_IPV6) -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *nexthop; + struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { @@ -2196,8 +2196,13 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), - &ipv6_hdr(skb)->daddr); + if (!nh) { + dst = skb_dst(skb); + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + &ipv6_hdr(skb)->daddr); + } else { + nexthop = &nh->ipv6_nh; + } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; @@ -2210,36 +2215,43 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) return ret; } rcu_read_unlock_bh(); - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + if (dst) + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct dst_entry *dst; - struct flowi6 fl6 = { - .flowi6_flags = FLOWI_FLAG_ANYSRC, - .flowi6_mark = skb->mark, - .flowlabel = ip6_flowinfo(ip6h), - .flowi6_oif = dev->ifindex, - .flowi6_proto = ip6h->nexthdr, - .daddr = ip6h->daddr, - .saddr = ip6h->saddr, - }; - dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); - if (IS_ERR(dst)) + if (!nh) { + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; + + skb_dst_set(skb, dst); + } else if (nh->nh_family != AF_INET6) { goto out_drop; + } - skb_dst_set(skb, dst); - - err = bpf_out_neigh_v6(net, skb); + err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2252,7 +2264,8 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) return ret; } #else -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; @@ -2260,11 +2273,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) -static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; @@ -2292,7 +2303,21 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + if (!nh) { + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = container_of(dst, struct rtable, dst); + + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + } else if (nh->nh_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); + is_v6gw = true; + } else if (nh->nh_family == AF_INET) { + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); + } else { + rcu_read_unlock_bh(); + goto out_drop; + } + if (likely(!IS_ERR(neigh))) { int ret; @@ -2309,33 +2334,37 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) return -ENETDOWN; } -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct rtable *rt; - struct flowi4 fl4 = { - .flowi4_flags = FLOWI_FLAG_ANYSRC, - .flowi4_mark = skb->mark, - .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_oif = dev->ifindex, - .flowi4_proto = ip4h->protocol, - .daddr = ip4h->daddr, - .saddr = ip4h->saddr, - }; - rt = ip_route_output_flow(net, &fl4, NULL); - if (IS_ERR(rt)) - goto out_drop; - if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { - ip_rt_put(rt); - goto out_drop; + if (!nh) { + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + struct rtable *rt; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } + + skb_dst_set(skb, &rt->dst); } - skb_dst_set(skb, &rt->dst); - - err = bpf_out_neigh_v4(net, skb); + err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2348,14 +2377,16 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) return ret; } #else -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_INET */ -static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); @@ -2370,9 +2401,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) - return __bpf_redirect_neigh_v4(skb, dev); + return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) - return __bpf_redirect_neigh_v6(skb, dev); + return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; @@ -2382,7 +2413,8 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) enum { BPF_F_NEIGH = (1ULL << 1), BPF_F_PEER = (1ULL << 2), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) + BPF_F_NEXTHOP = (1ULL << 3), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2455,7 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb) return -EAGAIN; } return flags & BPF_F_NEIGH ? - __bpf_redirect_neigh(skb, dev) : + __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? + &ri->nh : NULL) : __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); @@ -2504,16 +2537,21 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, + int, plen, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags)) + if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; - ri->flags = BPF_F_NEIGH; + ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); ri->tgt_index = ifindex; + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); + if (plen) + memcpy(&ri->nh, params, sizeof(ri->nh)); + return TC_ACT_REDIRECT; } @@ -2522,7 +2560,9 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 7d86fdd190be..6769caae142f 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -453,6 +453,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bf5a99d803e4..e6ceac3f7d62 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. * * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,16 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ From adfd272c4ccbe43d9761bb17dd8a4387d7815382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 20 Oct 2020 23:25:57 +0200 Subject: [PATCH 51/62] bpf, selftests: Extend test_tc_redirect to use modified bpf_redirect_neigh() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This updates the test_tc_neigh prog in selftests to use the new syntax of bpf_redirect_neigh(). To exercise the helper both with and without the optional parameter, add an additional test_tc_neigh_fib test program, which does a bpf_fib_lookup() followed by a call to bpf_redirect_neigh() instead of looking up the ifindex in a map. Update the test_tc_redirect.sh script to run both versions of the test, and while we're add it, fix it to work on systems that have a consolidated dual-stack 'ping' binary instead of separate ping/ping6 versions. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/160322915724.32199.17530068594636950447.stgit@toke.dk --- .../selftests/bpf/progs/test_tc_neigh.c | 5 +- .../selftests/bpf/progs/test_tc_neigh_fib.c | 155 ++++++++++++++++++ .../testing/selftests/bpf/test_tc_redirect.sh | 18 +- 3 files changed, 173 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh.c b/tools/testing/selftests/bpf/progs/test_tc_neigh.c index fe182616b112..b985ac4e7a81 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_neigh.c +++ b/tools/testing/selftests/bpf/progs/test_tc_neigh.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include @@ -118,7 +119,7 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(get_dev_ifindex(dev_src), 0); + return bpf_redirect_neigh(get_dev_ifindex(dev_src), NULL, 0, 0); } SEC("src_ingress") int tc_src(struct __sk_buff *skb) @@ -142,7 +143,7 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb) if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(get_dev_ifindex(dev_dst), 0); + return bpf_redirect_neigh(get_dev_ifindex(dev_dst), NULL, 0, 0); } char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c b/tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c new file mode 100644 index 000000000000..d82ed3457030 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifndef ctx_ptr +# define ctx_ptr(field) (void *)(long)(field) +#endif + +#define AF_INET 2 +#define AF_INET6 10 + +static __always_inline int fill_fib_params_v4(struct __sk_buff *skb, + struct bpf_fib_lookup *fib_params) +{ + void *data_end = ctx_ptr(skb->data_end); + void *data = ctx_ptr(skb->data); + struct iphdr *ip4h; + + if (data + sizeof(struct ethhdr) > data_end) + return -1; + + ip4h = (struct iphdr *)(data + sizeof(struct ethhdr)); + if ((void *)(ip4h + 1) > data_end) + return -1; + + fib_params->family = AF_INET; + fib_params->tos = ip4h->tos; + fib_params->l4_protocol = ip4h->protocol; + fib_params->sport = 0; + fib_params->dport = 0; + fib_params->tot_len = bpf_ntohs(ip4h->tot_len); + fib_params->ipv4_src = ip4h->saddr; + fib_params->ipv4_dst = ip4h->daddr; + + return 0; +} + +static __always_inline int fill_fib_params_v6(struct __sk_buff *skb, + struct bpf_fib_lookup *fib_params) +{ + struct in6_addr *src = (struct in6_addr *)fib_params->ipv6_src; + struct in6_addr *dst = (struct in6_addr *)fib_params->ipv6_dst; + void *data_end = ctx_ptr(skb->data_end); + void *data = ctx_ptr(skb->data); + struct ipv6hdr *ip6h; + + if (data + sizeof(struct ethhdr) > data_end) + return -1; + + ip6h = (struct ipv6hdr *)(data + sizeof(struct ethhdr)); + if ((void *)(ip6h + 1) > data_end) + return -1; + + fib_params->family = AF_INET6; + fib_params->flowinfo = 0; + fib_params->l4_protocol = ip6h->nexthdr; + fib_params->sport = 0; + fib_params->dport = 0; + fib_params->tot_len = bpf_ntohs(ip6h->payload_len); + *src = ip6h->saddr; + *dst = ip6h->daddr; + + return 0; +} + +SEC("chk_egress") int tc_chk(struct __sk_buff *skb) +{ + void *data_end = ctx_ptr(skb->data_end); + void *data = ctx_ptr(skb->data); + __u32 *raw = data; + + if (data + sizeof(struct ethhdr) > data_end) + return TC_ACT_SHOT; + + return !raw[0] && !raw[1] && !raw[2] ? TC_ACT_SHOT : TC_ACT_OK; +} + +static __always_inline int tc_redir(struct __sk_buff *skb) +{ + struct bpf_fib_lookup fib_params = { .ifindex = skb->ingress_ifindex }; + __u8 zero[ETH_ALEN * 2]; + int ret = -1; + + switch (skb->protocol) { + case __bpf_constant_htons(ETH_P_IP): + ret = fill_fib_params_v4(skb, &fib_params); + break; + case __bpf_constant_htons(ETH_P_IPV6): + ret = fill_fib_params_v6(skb, &fib_params); + break; + } + + if (ret) + return TC_ACT_OK; + + ret = bpf_fib_lookup(skb, &fib_params, sizeof(fib_params), 0); + if (ret == BPF_FIB_LKUP_RET_NOT_FWDED || ret < 0) + return TC_ACT_OK; + + __builtin_memset(&zero, 0, sizeof(zero)); + if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) + return TC_ACT_SHOT; + + if (ret == BPF_FIB_LKUP_RET_NO_NEIGH) { + struct bpf_redir_neigh nh_params = {}; + + nh_params.nh_family = fib_params.family; + __builtin_memcpy(&nh_params.ipv6_nh, &fib_params.ipv6_dst, + sizeof(nh_params.ipv6_nh)); + + return bpf_redirect_neigh(fib_params.ifindex, &nh_params, + sizeof(nh_params), 0); + + } else if (ret == BPF_FIB_LKUP_RET_SUCCESS) { + void *data_end = ctx_ptr(skb->data_end); + struct ethhdr *eth = ctx_ptr(skb->data); + + if (eth + 1 > data_end) + return TC_ACT_SHOT; + + __builtin_memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); + __builtin_memcpy(eth->h_source, fib_params.smac, ETH_ALEN); + + return bpf_redirect(fib_params.ifindex, 0); + } + + return TC_ACT_SHOT; +} + +/* these are identical, but keep them separate for compatibility with the + * section names expected by test_tc_redirect.sh + */ +SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) +{ + return tc_redir(skb); +} + +SEC("src_ingress") int tc_src(struct __sk_buff *skb) +{ + return tc_redir(skb); +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh index 6d7482562140..8868aa1ca902 100755 --- a/tools/testing/selftests/bpf/test_tc_redirect.sh +++ b/tools/testing/selftests/bpf/test_tc_redirect.sh @@ -24,8 +24,7 @@ command -v timeout >/dev/null 2>&1 || \ { echo >&2 "timeout is not available"; exit 1; } command -v ping >/dev/null 2>&1 || \ { echo >&2 "ping is not available"; exit 1; } -command -v ping6 >/dev/null 2>&1 || \ - { echo >&2 "ping6 is not available"; exit 1; } +if command -v ping6 >/dev/null 2>&1; then PING6=ping6; else PING6=ping; fi command -v perl >/dev/null 2>&1 || \ { echo >&2 "perl is not available"; exit 1; } command -v jq >/dev/null 2>&1 || \ @@ -152,7 +151,7 @@ netns_test_connectivity() echo -e "${TEST}: ${GREEN}PASS${NC}" TEST="ICMPv6 connectivity test" - ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} + ip netns exec ${NS_SRC} $PING6 $PING_ARG ${IP6_DST} if [ $? -ne 0 ]; then echo -e "${TEST}: ${RED}FAIL${NC}" exit 1 @@ -170,6 +169,7 @@ hex_mem_str() netns_setup_bpf() { local obj=$1 + local use_forwarding=${2:-0} ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress @@ -179,6 +179,14 @@ netns_setup_bpf() ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj $obj sec chk_egress + if [ "$use_forwarding" -eq "1" ]; then + # bpf_fib_lookup() checks if forwarding is enabled + ip netns exec ${NS_FWD} sysctl -w net.ipv4.ip_forward=1 + ip netns exec ${NS_FWD} sysctl -w net.ipv6.conf.veth_dst_fwd.forwarding=1 + ip netns exec ${NS_FWD} sysctl -w net.ipv6.conf.veth_src_fwd.forwarding=1 + return 0 + fi + veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex) veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex) @@ -200,5 +208,9 @@ netns_setup_bpf test_tc_neigh.o netns_test_connectivity netns_cleanup netns_setup +netns_setup_bpf test_tc_neigh_fib.o 1 +netns_test_connectivity +netns_cleanup +netns_setup netns_setup_bpf test_tc_peer.o netns_test_connectivity From 3652c9a1b1fe6cbdd4510eb220db548bff8704ae Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 21 Oct 2020 22:32:57 +0200 Subject: [PATCH 52/62] bpf, libbpf: Guard bpf inline asm from bpf_tail_call_static Yaniv reported a compilation error after pulling latest libbpf: [...] ../libbpf/src/root/usr/include/bpf/bpf_helpers.h:99:10: error: unknown register name 'r0' in asm : "r0", "r1", "r2", "r3", "r4", "r5"); [...] The issue got triggered given Yaniv was compiling tracing programs with native target (e.g. x86) instead of BPF target, hence no BTF generated vmlinux.h nor CO-RE used, and later llc with -march=bpf was invoked to compile from LLVM IR to BPF object file. Given that clang was expecting x86 inline asm and not BPF one the error complained that these regs don't exist on the former. Guard bpf_tail_call_static() with defined(__bpf__) where BPF inline asm is valid to use. BPF tracing programs on more modern kernels use BPF target anyway and thus the bpf_tail_call_static() function will be available for them. BPF inline asm is supported since clang 7 (clang <= 6 otherwise throws same above error), and __bpf_unreachable() since clang 8, therefore include the latter condition in order to prevent compilation errors for older clang versions. Given even an old Ubuntu 18.04 LTS has official LLVM packages all the way up to llvm-10, I did not bother to special case the __bpf_unreachable() inside bpf_tail_call_static() further. Also, undo the sockex3_kern's use of bpf_tail_call_static() sample given they still have the old hacky way to even compile networking progs with native instead of BPF target so bpf_tail_call_static() won't be defined there anymore. Fixes: 0e9f6841f664 ("bpf, libbpf: Add bpf_tail_call_static helper for bpf programs") Reported-by: Yaniv Agman Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Tested-by: Yaniv Agman Link: https://lore.kernel.org/bpf/CAMy7=ZUk08w5Gc2Z-EKi4JFtuUCaZYmE4yzhJjrExXpYKR4L8w@mail.gmail.com Link: https://lore.kernel.org/bpf/20201021203257.26223-1-daniel@iogearbox.net --- samples/bpf/sockex3_kern.c | 8 ++++---- tools/lib/bpf/bpf_helpers.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index 8142d02b33e6..b363503357e5 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -44,17 +44,17 @@ static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto) switch (proto) { case ETH_P_8021Q: case ETH_P_8021AD: - bpf_tail_call_static(skb, &jmp_table, PARSE_VLAN); + bpf_tail_call(skb, &jmp_table, PARSE_VLAN); break; case ETH_P_MPLS_UC: case ETH_P_MPLS_MC: - bpf_tail_call_static(skb, &jmp_table, PARSE_MPLS); + bpf_tail_call(skb, &jmp_table, PARSE_MPLS); break; case ETH_P_IP: - bpf_tail_call_static(skb, &jmp_table, PARSE_IP); + bpf_tail_call(skb, &jmp_table, PARSE_IP); break; case ETH_P_IPV6: - bpf_tail_call_static(skb, &jmp_table, PARSE_IPV6); + bpf_tail_call(skb, &jmp_table, PARSE_IPV6); break; } } diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 2bdb7d6dbad2..72b251110c4d 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -72,6 +72,7 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. */ +#if __clang_major__ >= 8 && defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -98,6 +99,7 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot) : "r0", "r1", "r2", "r3", "r4", "r5"); } +#endif /* * Helper structure used by eBPF C program From 01c4ceae0a38a0bdbfea6896f41efcd985a9c064 Mon Sep 17 00:00:00 2001 From: Xie He Date: Mon, 19 Oct 2020 18:31:52 -0700 Subject: [PATCH 53/62] net: hdlc: In hdlc_rcv, check to make sure dev is an HDLC device The hdlc_rcv function is used as hdlc_packet_type.func to process any skb received in the kernel with skb->protocol == htons(ETH_P_HDLC). The purpose of this function is to provide second-stage processing for skbs not assigned a "real" L3 skb->protocol value in the first stage. This function assumes the device from which the skb is received is an HDLC device (a device created by this module). It assumes that netdev_priv(dev) returns a pointer to "struct hdlc_device". However, it is possible that some driver in the kernel (not necessarily in our control) submits a received skb with skb->protocol == htons(ETH_P_HDLC), from a non-HDLC device. In this case, the skb would still be received by hdlc_rcv. This will cause problems. hdlc_rcv should be able to recognize and drop invalid skbs. It should first make sure "dev" is actually an HDLC device, before starting its processing. This patch adds this check to hdlc_rcv. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: Krzysztof Halasa Signed-off-by: Xie He Link: https://lore.kernel.org/r/20201020013152.89259-1-xie.he.0141@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/hdlc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/wan/hdlc.c b/drivers/net/wan/hdlc.c index 9b00708676cf..1bdd3df0867a 100644 --- a/drivers/net/wan/hdlc.c +++ b/drivers/net/wan/hdlc.c @@ -46,7 +46,15 @@ static struct hdlc_proto *first_proto; static int hdlc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p, struct net_device *orig_dev) { - struct hdlc_device *hdlc = dev_to_hdlc(dev); + struct hdlc_device *hdlc; + + /* First make sure "dev" is an HDLC device */ + if (!(dev->priv_flags & IFF_WAN_HDLC)) { + kfree_skb(skb); + return NET_RX_SUCCESS; + } + + hdlc = dev_to_hdlc(dev); if (!net_eq(dev_net(dev), &init_net)) { kfree_skb(skb); From 5fce1e43e2d5bf2f7e3224d7b99b1c65ab2c26e2 Mon Sep 17 00:00:00 2001 From: Xie He Date: Mon, 19 Oct 2020 23:34:20 -0700 Subject: [PATCH 54/62] net: hdlc_raw_eth: Clear the IFF_TX_SKB_SHARING flag after calling ether_setup This driver calls ether_setup to set up the network device. The ether_setup function would add the IFF_TX_SKB_SHARING flag to the device. This flag indicates that it is safe to transmit shared skbs to the device. However, this is not true. This driver may pad the frame (in eth_tx) before transmission, so the skb may be modified. Fixes: 550fd08c2ceb ("net: Audit drivers to identify those needing IFF_TX_SKB_SHARING cleared") Cc: Neil Horman Cc: Krzysztof Halasa Signed-off-by: Xie He Link: https://lore.kernel.org/r/20201020063420.187497-1-xie.he.0141@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/hdlc_raw_eth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wan/hdlc_raw_eth.c b/drivers/net/wan/hdlc_raw_eth.c index 08e0a46501de..c70a518b8b47 100644 --- a/drivers/net/wan/hdlc_raw_eth.c +++ b/drivers/net/wan/hdlc_raw_eth.c @@ -99,6 +99,7 @@ static int raw_eth_ioctl(struct net_device *dev, struct ifreq *ifr) old_qlen = dev->tx_queue_len; ether_setup(dev); dev->tx_queue_len = old_qlen; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; eth_hw_addr_random(dev); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); netif_dormant_off(dev); From 5403caf21648d739bf2b1266c33e34384c313379 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 20 Oct 2020 09:35:15 +0200 Subject: [PATCH 55/62] net: ethernet: mtk-star-emac: select REGMAP_MMIO The driver depends on mmio regmap API but doesn't select the appropriate Kconfig option. This fixes it. Fixes: 8c7bd5a454ff ("net: ethernet: mtk-star-emac: new driver") Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20201020073515.22769-1-brgl@bgdev.pl Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mediatek/Kconfig b/drivers/net/ethernet/mediatek/Kconfig index 62a820b1eb16..3362b148de23 100644 --- a/drivers/net/ethernet/mediatek/Kconfig +++ b/drivers/net/ethernet/mediatek/Kconfig @@ -17,6 +17,7 @@ config NET_MEDIATEK_SOC config NET_MEDIATEK_STAR_EMAC tristate "MediaTek STAR Ethernet MAC support" select PHYLIB + select REGMAP_MMIO help This driver supports the ethernet MAC IP first used on MediaTek MT85** SoCs. From ebfe3c5183733f784264450a41646a482f964e5e Mon Sep 17 00:00:00 2001 From: Di Zhu Date: Wed, 21 Oct 2020 10:00:53 +0800 Subject: [PATCH 56/62] rtnetlink: fix data overflow in rtnl_calcit() "ip addr show" command execute error when we have a physical network card with a large number of VFs The return value of if_nlmsg_size() in rtnl_calcit() will exceed range of u16 data type when any network cards has a larger number of VFs. rtnl_vfinfo_size() will significant increase needed dump size when the value of num_vfs is larger. Eventually we get a wrong value of min_ifinfo_dump_size because of overflow which decides the memory size needed by netlink dump and netlink_dump() will return -EMSGSIZE because of not enough memory was allocated. So fix it by promoting min_dump_alloc data type to u32 to avoid whole netlink message size overflow and it's also align with the data type of struct netlink_callback{}.min_dump_alloc which is assigned by return value of rtnl_calcit() Signed-off-by: Di Zhu Link: https://lore.kernel.org/r/20201021020053.1401-1-zhudi21@huawei.com Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 2 +- net/core/rtnetlink.c | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 666cd0390699..9f118771e248 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -240,7 +240,7 @@ struct netlink_dump_control { int (*done)(struct netlink_callback *); void *data; struct module *module; - u16 min_dump_alloc; + u32 min_dump_alloc; }; int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 68e0682450c6..7d7223691783 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3709,13 +3709,13 @@ static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); } -static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) +static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - struct net_device *dev; + size_t min_ifinfo_dump_size = 0; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; - u16 min_ifinfo_dump_size = 0; + struct net_device *dev; int hdrlen; /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ @@ -3735,9 +3735,8 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) */ rcu_read_lock(); for_each_netdev_rcu(net, dev) { - min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, - if_nlmsg_size(dev, - ext_filter_mask)); + min_ifinfo_dump_size = max(min_ifinfo_dump_size, + if_nlmsg_size(dev, ext_filter_mask)); } rcu_read_unlock(); @@ -5494,7 +5493,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { struct sock *rtnl; rtnl_dumpit_func dumpit; - u16 min_dump_alloc = 0; + u32 min_dump_alloc = 0; link = rtnl_get_link(family, type); if (!link || !link->dumpit) { From cf8691cbc286592177dee7db7bba684e95058f96 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 21 Oct 2020 10:30:43 -0400 Subject: [PATCH 57/62] Revert "virtio-net: ethtool configurable RXCSUM" This reverts commit 3618ad2a7c0e78e4258386394d5d5f92a3dbccf8. When control vq is not negotiated, that commit causes a crash: [ 72.229171] kernel BUG at drivers/net/virtio_net.c:1667! [ 72.230266] invalid opcode: 0000 [#1] PREEMPT SMP [ 72.231172] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.9.0-rc8-02934-g3618ad2a7c0e7 #1 [ 72.231172] EIP: virtnet_send_command+0x120/0x140 [ 72.231172] Code: 00 0f 94 c0 8b 7d f0 65 33 3d 14 00 00 00 75 1c 8d 65 f4 5b 5e 5f 5d c3 66 90 be 01 00 00 00 e9 6e ff ff ff 8d b6 00 +00 00 00 <0f> 0b e8 d9 bb 82 00 eb 17 8d b4 26 00 00 00 00 8d b4 26 00 00 00 [ 72.231172] EAX: 0000000d EBX: f72895c0 ECX: 00000017 EDX: 00000011 [ 72.231172] ESI: f7197800 EDI: ed69bd00 EBP: ed69bcf4 ESP: ed69bc98 [ 72.231172] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010246 [ 72.231172] CR0: 80050033 CR2: 00000000 CR3: 02c84000 CR4: 000406f0 [ 72.231172] Call Trace: [ 72.231172] ? __virt_addr_valid+0x45/0x60 [ 72.231172] ? ___cache_free+0x51f/0x760 [ 72.231172] ? kobject_uevent_env+0xf4/0x560 [ 72.231172] virtnet_set_guest_offloads+0x4d/0x80 [ 72.231172] virtnet_set_features+0x85/0x120 [ 72.231172] ? virtnet_set_guest_offloads+0x80/0x80 [ 72.231172] __netdev_update_features+0x27a/0x8e0 [ 72.231172] ? kobject_uevent+0xa/0x20 [ 72.231172] ? netdev_register_kobject+0x12c/0x160 [ 72.231172] register_netdevice+0x4fe/0x740 [ 72.231172] register_netdev+0x1c/0x40 [ 72.231172] virtnet_probe+0x728/0xb60 [ 72.231172] ? _raw_spin_unlock+0x1d/0x40 [ 72.231172] ? virtio_vdpa_get_status+0x1c/0x20 [ 72.231172] virtio_dev_probe+0x1c6/0x271 [ 72.231172] really_probe+0x195/0x2e0 [ 72.231172] driver_probe_device+0x26/0x60 [ 72.231172] device_driver_attach+0x49/0x60 [ 72.231172] __driver_attach+0x46/0xc0 [ 72.231172] ? device_driver_attach+0x60/0x60 [ 72.231172] bus_add_driver+0x197/0x1c0 [ 72.231172] driver_register+0x66/0xc0 [ 72.231172] register_virtio_driver+0x1b/0x40 [ 72.231172] virtio_net_driver_init+0x61/0x86 [ 72.231172] ? veth_init+0x14/0x14 [ 72.231172] do_one_initcall+0x76/0x2e4 [ 72.231172] ? rdinit_setup+0x2a/0x2a [ 72.231172] do_initcalls+0xb2/0xd5 [ 72.231172] kernel_init_freeable+0x14f/0x179 [ 72.231172] ? rest_init+0x100/0x100 [ 72.231172] kernel_init+0xd/0xe0 [ 72.231172] ret_from_fork+0x1c/0x30 [ 72.231172] Modules linked in: [ 72.269563] ---[ end trace a6ebc4afea0e6cb1 ]--- The reason is that virtnet_set_features now calls virtnet_set_guest_offloads unconditionally, it used to only call it when there is something to configure. If device does not have a control vq, everything breaks. Revert the original commit for now. Cc: Tonghao Zhang Fixes: 3618ad2a7c0e7 ("virtio-net: ethtool configurable RXCSUM") Reported-by: kernel test robot Signed-off-by: Michael S. Tsirkin Acked-by: Willem de Bruijn Acked-by: Jason Wang Link: https://lore.kernel.org/r/20201021142944.13615-1-mst@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 50 +++++++++++----------------------------- 1 file changed, 13 insertions(+), 37 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d2d2c4a53cf2..21b71148c532 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -68,8 +68,6 @@ static const unsigned long guest_offloads[] = { (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ (1ULL << VIRTIO_NET_F_GUEST_UFO)) -#define GUEST_OFFLOAD_CSUM_MASK (1ULL << VIRTIO_NET_F_GUEST_CSUM) - struct virtnet_stat_desc { char desc[ETH_GSTRING_LEN]; size_t offset; @@ -2524,48 +2522,29 @@ static int virtnet_get_phys_port_name(struct net_device *dev, char *buf, return 0; } -static netdev_features_t virtnet_fix_features(struct net_device *netdev, - netdev_features_t features) -{ - /* If Rx checksum is disabled, LRO should also be disabled. */ - if (!(features & NETIF_F_RXCSUM)) - features &= ~NETIF_F_LRO; - - return features; -} - static int virtnet_set_features(struct net_device *dev, netdev_features_t features) { struct virtnet_info *vi = netdev_priv(dev); - u64 offloads = vi->guest_offloads; + u64 offloads; int err; - /* Don't allow configuration while XDP is active. */ - if (vi->xdp_queue_pairs) - return -EBUSY; - if ((dev->features ^ features) & NETIF_F_LRO) { + if (vi->xdp_queue_pairs) + return -EBUSY; + if (features & NETIF_F_LRO) - offloads |= GUEST_OFFLOAD_LRO_MASK & - vi->guest_offloads_capable; + offloads = vi->guest_offloads_capable; else - offloads &= ~GUEST_OFFLOAD_LRO_MASK; + offloads = vi->guest_offloads_capable & + ~GUEST_OFFLOAD_LRO_MASK; + + err = virtnet_set_guest_offloads(vi, offloads); + if (err) + return err; + vi->guest_offloads = offloads; } - if ((dev->features ^ features) & NETIF_F_RXCSUM) { - if (features & NETIF_F_RXCSUM) - offloads |= GUEST_OFFLOAD_CSUM_MASK & - vi->guest_offloads_capable; - else - offloads &= ~GUEST_OFFLOAD_CSUM_MASK; - } - - err = virtnet_set_guest_offloads(vi, offloads); - if (err) - return err; - - vi->guest_offloads = offloads; return 0; } @@ -2584,7 +2563,6 @@ static const struct net_device_ops virtnet_netdev = { .ndo_features_check = passthru_features_check, .ndo_get_phys_port_name = virtnet_get_phys_port_name, .ndo_set_features = virtnet_set_features, - .ndo_fix_features = virtnet_fix_features, }; static void virtnet_config_changed_work(struct work_struct *work) @@ -3035,10 +3013,8 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) dev->features |= NETIF_F_LRO; - if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) { - dev->hw_features |= NETIF_F_RXCSUM; + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) dev->hw_features |= NETIF_F_LRO; - } dev->vlan_features = dev->features; From 287d35405989cfe0090e3059f7788dc531879a8d Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Wed, 21 Oct 2020 17:55:49 +0200 Subject: [PATCH 58/62] selftests: mptcp: depends on built-in IPv6 Recently, CONFIG_MPTCP_IPV6 no longer selects CONFIG_IPV6. As a consequence, if CONFIG_MPTCP_IPV6=y is added to the kconfig, it will no longer ensure CONFIG_IPV6=y. If it is not enabled, CONFIG_MPTCP_IPV6 will stay disabled and selftests will fail. We also need CONFIG_IPV6 to be built-in. For more details, please see commit 0ed37ac586c0 ("mptcp: depends on IPV6 but not as a module"). Note that 'make kselftest-merge' will take all 'config' files found in 'tools/testsing/selftests'. Because some of them already set CONFIG_IPV6=y, MPTCP selftests were still passing. But they will fail if MPTCP selftests are launched manually after having executed this command to prepare the kernel config: ./scripts/kconfig/merge_config.sh -m .config \ ./tools/testing/selftests/net/mptcp/config Fixes: 010b430d5df5 ("mptcp: MPTCP_IPV6 should depend on IPV6 instead of selecting it") Signed-off-by: Matthieu Baerts Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20201021155549.933731-1-matthieu.baerts@tessares.net Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config index 8df5cb8f71ff..741a1c4f4ae8 100644 --- a/tools/testing/selftests/net/mptcp/config +++ b/tools/testing/selftests/net/mptcp/config @@ -1,4 +1,5 @@ CONFIG_MPTCP=y +CONFIG_IPV6=y CONFIG_MPTCP_IPV6=y CONFIG_INET_DIAG=m CONFIG_INET_MPTCP_DIAG=m From d9b0e599b2b892422f1cbc5d2658049b895b2b58 Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Tue, 20 Oct 2020 17:39:19 -0500 Subject: [PATCH 59/62] ibmvnic: save changed mac address to adapter->mac_addr After mac address change request completes successfully, the new mac address need to be saved to adapter->mac_addr as well as netdev->dev_addr. Otherwise, adapter->mac_addr still holds old data. Fixes: 62740e97881c ("net/ibmvnic: Update MAC address settings after adapter reset") Signed-off-by: Lijun Pan Link: https://lore.kernel.org/r/20201020223919.46106-1-ljp@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 1f7fe6b3dd5a..8148f796a807 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4235,8 +4235,13 @@ static int handle_change_mac_rsp(union ibmvnic_crq *crq, dev_err(dev, "Error %ld in CHANGE_MAC_ADDR_RSP\n", rc); goto out; } + /* crq->change_mac_addr.mac_addr is the requested one + * crq->change_mac_addr_rsp.mac_addr is the returned valid one. + */ ether_addr_copy(netdev->dev_addr, &crq->change_mac_addr_rsp.mac_addr[0]); + ether_addr_copy(adapter->mac_addr, + &crq->change_mac_addr_rsp.mac_addr[0]); out: complete(&adapter->fw_done); return rc; From c77761c8a59405cb7aa44188b30fffe13fbdd02d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 21 Oct 2020 12:55:52 +0200 Subject: [PATCH 60/62] netfilter: nf_fwd_netdev: clear timestamp in forwarding path Similar to 7980d2eabde8 ("ipvs: clear skb->tstamp in forwarding path"). fq qdisc requires tstamp to be cleared in forwarding path. Fixes: 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths") Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_dup_netdev.c | 1 + net/netfilter/nft_fwd_netdev.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index 2b01a151eaa8..a579e59ee5c5 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -19,6 +19,7 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) skb_push(skb, skb->mac_len); skb->dev = dev; + skb->tstamp = 0; dev_queue_xmit(skb); } diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 3087e23297db..b77985986b24 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -138,6 +138,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, return; skb->dev = dev; + skb->tstamp = 0; neigh_xmit(neigh_table, dev, addr, skb); out: regs->verdict.code = verdict; From 700465fd338fe5df08a1b2e27fa16981f562547f Mon Sep 17 00:00:00 2001 From: Ke Li Date: Thu, 22 Oct 2020 02:41:46 -0400 Subject: [PATCH 61/62] net: Properly typecast int values to set sk_max_pacing_rate In setsockopt(SO_MAX_PACING_RATE) on 64bit systems, sk_max_pacing_rate, after extended from 'u32' to 'unsigned long', takes unintentionally hiked value whenever assigned from an 'int' value with MSB=1, due to binary sign extension in promoting s32 to u64, e.g. 0x80000000 becomes 0xFFFFFFFF80000000. Thus inflated sk_max_pacing_rate causes subsequent getsockopt to return ~0U unexpectedly. It may also result in increased pacing rate. Fix by explicitly casting the 'int' value to 'unsigned int' before assigning it to sk_max_pacing_rate, for zero extension to happen. Fixes: 76a9ebe811fb ("net: extend sk_pacing_rate to unsigned long") Signed-off-by: Ji Li Signed-off-by: Ke Li Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20201022064146.79873-1-keli@akamai.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 3 ++- net/core/sock.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 6d0fa65a4a46..2ca5eecebacf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4733,7 +4733,8 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); - sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val; + sk->sk_max_pacing_rate = (val == ~0U) ? + ~0UL : (unsigned int)val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); break; diff --git a/net/core/sock.c b/net/core/sock.c index 4e8729357122..727ea1cc633c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1163,7 +1163,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, case SO_MAX_PACING_RATE: { - unsigned long ulval = (val == ~0U) ? ~0UL : val; + unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && From 18ded910b589839e38a51623a179837ab4cc3789 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 22 Oct 2020 10:33:31 -0400 Subject: [PATCH 62/62] tcp: fix to update snd_wl1 in bulk receiver fast path In the header prediction fast path for a bulk data receiver, if no data is newly acknowledged then we do not call tcp_ack() and do not call tcp_ack_update_window(). This means that a bulk receiver that receives large amounts of data can have the incoming sequence numbers wrap, so that the check in tcp_may_update_window fails: after(ack_seq, tp->snd_wl1) If the incoming receive windows are zero in this state, and then the connection that was a bulk data receiver later wants to send data, that connection can find itself persistently rejecting the window updates in incoming ACKs. This means the connection can persistently fail to discover that the receive window has opened, which in turn means that the connection is unable to send anything, and the connection's sending process can get permanently "stuck". The fix is to update snd_wl1 in the header prediction fast path for a bulk data receiver, so that it keeps up and does not see wrapping problems. This fix is based on a very nice and thorough analysis and diagnosis by Apollon Oikonomopoulos (see link below). This is a stable candidate but there is no Fixes tag here since the bug predates current git history. Just for fun: looks like the bug dates back to when header prediction was added in Linux v2.1.8 in Nov 1996. In that version tcp_rcv_established() was added, and the code only updates snd_wl1 in tcp_ack(), and in the new "Bulk data transfer: receiver" code path it does not call tcp_ack(). This fix seems to apply cleanly at least as far back as v3.2. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Neal Cardwell Reported-by: Apollon Oikonomopoulos Tested-by: Apollon Oikonomopoulos Link: https://www.spinics.net/lists/netdev/msg692430.html Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20201022143331.1887495-1-ncardwell.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 67f10d3ec240..fc445833b5e5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5827,6 +5827,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; + } else { + tcp_update_wl(tp, TCP_SKB_CB(skb)->seq); } __tcp_ack_snd_check(sk, 0);