From 303216e76dcab6049c9d42390b1032f0649a8206 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 22 Jul 2020 20:36:43 +0200 Subject: [PATCH 001/110] batman-adv: Avoid uninitialized chaddr when handling DHCP The gateway client code can try to optimize the delivery of DHCP packets to avoid broadcasting them through the whole mesh. But also transmissions to the client can be optimized by looking up the destination via the chaddr of the DHCP packet. But the chaddr is currently only done when chaddr is fully inside the non-paged area of the skbuff. Otherwise it will not be initialized and the unoptimized path should have been taken. But the implementation didn't handle this correctly. It didn't retrieve the correct chaddr but still tried to perform the TT lookup with this uninitialized memory. Reported-by: syzbot+ab16e463b903f5a37036@syzkaller.appspotmail.com Fixes: 6c413b1c22a2 ("batman-adv: send every DHCP packet as bat-unicast") Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Simon Wunderlich --- net/batman-adv/gateway_client.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index a18dcc686dc3..ef3f85b576c4 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -703,8 +703,10 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, chaddr_offset = *header_len + BATADV_DHCP_CHADDR_OFFSET; /* store the client address if the message is going to a client */ - if (ret == BATADV_DHCP_TO_CLIENT && - pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) { + if (ret == BATADV_DHCP_TO_CLIENT) { + if (!pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) + return BATADV_DHCP_NO; + /* check if the DHCP packet carries an Ethernet DHCP */ p = skb->data + *header_len + BATADV_DHCP_HTYPE_OFFSET; if (*p != BATADV_DHCP_HTYPE_ETHERNET) From d8bf0c01642275c7dca1e5d02c34e4199c200b1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Thu, 23 Jul 2020 14:28:08 +0200 Subject: [PATCH 002/110] batman-adv: Fix own OGM check in aggregated OGMs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The own OGM check is currently misplaced and can lead to the following issues: For one thing we might receive an aggregated OGM from a neighbor node which has our own OGM in the first place. We would then not only skip our own OGM but erroneously also any other, following OGM in the aggregate. For another, we might receive an OGM aggregate which has our own OGM in a place other then the first one. Then we would wrongly not skip this OGM, leading to populating the orginator and gateway table with ourself. Fixes: 9323158ef9f4 ("batman-adv: OGMv2 - implement originators logic") Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_ogm.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 0f8495b9eeb1..717fe657561d 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -881,6 +881,12 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl, ogm_packet->version, ntohs(ogm_packet->tvlv_len)); + if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) { + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Drop packet: originator packet from ourself\n"); + return; + } + /* If the throughput metric is 0, immediately drop the packet. No need * to create orig_node / neigh_node for an unusable route. */ @@ -1008,11 +1014,6 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb, if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) goto free_skb; - ogm_packet = (struct batadv_ogm2_packet *)skb->data; - - if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) - goto free_skb; - batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, skb->len + ETH_HLEN); From 279e89b2281af3b1a9f04906e157992c19c9f163 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 18 Aug 2020 17:46:10 +0300 Subject: [PATCH 003/110] batman-adv: bla: use netif_rx_ni when not in interrupt context batadv_bla_send_claim() gets called from worker thread context through batadv_bla_periodic_work(), thus netif_rx_ni needs to be used in that case. This fixes "NOHZ: local_softirq_pending 08" log messages seen when batman-adv is enabled. Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code") Signed-off-by: Jussi Kivilinna Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 91a04ca373dc..8500f56cbd10 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -437,7 +437,10 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); - netif_rx(skb); + if (in_interrupt()) + netif_rx(skb); + else + netif_rx_ni(skb); out: if (primary_if) batadv_hardif_put(primary_if); From cc5453a5b7e90c39f713091a7ebc53c1f87d1700 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Aug 2020 16:15:58 +0200 Subject: [PATCH 004/110] netfilter: conntrack: allow sctp hearbeat after connection re-use If an sctp connection gets re-used, heartbeats are flagged as invalid because their vtag doesn't match. Handle this in a similar way as TCP conntrack when it suspects that the endpoints and conntrack are out-of-sync. When a HEARTBEAT request fails its vtag validation, flag this in the conntrack state and accept the packet. When a HEARTBEAT_ACK is received with an invalid vtag in the reverse direction after we allowed such a HEARTBEAT through, assume we are out-of-sync and re-set the vtag info. v2: remove left-over snippet from an older incarnation that moved new_state/old_state assignments, thats not needed so keep that as-is. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_sctp.h | 2 ++ net/netfilter/nf_conntrack_proto_sctp.c | 39 ++++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h index 9a33f171aa82..625f491b95de 100644 --- a/include/linux/netfilter/nf_conntrack_sctp.h +++ b/include/linux/netfilter/nf_conntrack_sctp.h @@ -9,6 +9,8 @@ struct ip_ct_sctp { enum sctp_conntrack state; __be32 vtag[IP_CT_DIR_MAX]; + u8 last_dir; + u8 flags; }; #endif /* _NF_CONNTRACK_SCTP_H */ diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 4f897b14b606..810cca24b399 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -62,6 +62,8 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, }; +#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 + #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED #define sCW SCTP_CONNTRACK_COOKIE_WAIT @@ -369,6 +371,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, u_int32_t offset, count; unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; + bool ignore = false; if (sctp_error(skb, dataoff, state)) return -NF_ACCEPT; @@ -427,15 +430,39 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* Sec 8.5.1 (D) */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; - } else if (sch->type == SCTP_CID_HEARTBEAT || - sch->type == SCTP_CID_HEARTBEAT_ACK) { + } else if (sch->type == SCTP_CID_HEARTBEAT) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { + if (test_bit(SCTP_CID_DATA, map) || ignore) + goto out_unlock; + + ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED; + ct->proto.sctp.last_dir = dir; + ignore = true; + continue; + } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { + ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; + } + } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); - goto out_unlock; + if (test_bit(SCTP_CID_DATA, map) || ignore) + goto out_unlock; + + if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 || + ct->proto.sctp.last_dir == dir) + goto out_unlock; + + ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; + ct->proto.sctp.vtag[dir] = sh->vtag; + ct->proto.sctp.vtag[!dir] = 0; + } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { + ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } @@ -470,6 +497,10 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } spin_unlock_bh(&ct->lock); + /* allow but do not refresh timeout */ + if (ignore) + return NF_ACCEPT; + timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; From fce2ff728f95b8894db14f51c9274dc56c37616f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Aug 2020 15:35:18 +0200 Subject: [PATCH 005/110] nl80211: fix NL80211_ATTR_HE_6GHZ_CAPABILITY usage In nl80211_set_station(), we check NL80211_ATTR_HE_6GHZ_CAPABILITY and then use NL80211_ATTR_HE_CAPABILITY, which is clearly wrong. Fix this to use NL80211_ATTR_HE_6GHZ_CAPABILITY as well. Cc: stable@vger.kernel.org Fixes: 43e64bf301fd ("cfg80211: handle 6 GHz capability of new station") Link: https://lore.kernel.org/r/20200805153516.310cef625955.I0abc04dc8abb2c7c005c88ef8fa2d0e3c9fb95c4@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c04fc6cf6583..19dc0ee807f6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6011,7 +6011,7 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) params.he_6ghz_capa = - nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]) params.airtime_weight = From 68528d937dcd675e79973061c1a314db598162d1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Aug 2020 14:12:33 +0100 Subject: [PATCH 006/110] rxrpc: Keep the ACK serial in a var in rxrpc_input_ack() Keep the ACK serial number in a variable in rxrpc_input_ack() as it's used frequently. Signed-off-by: David Howells --- net/rxrpc/input.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 767579328a06..a7699e56eac8 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -843,7 +843,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_ackinfo info; u8 acks[RXRPC_MAXACKS]; } buf; - rxrpc_serial_t acked_serial; + rxrpc_serial_t ack_serial, acked_serial; rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; @@ -856,6 +856,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } offset += sizeof(buf.ack); + ack_serial = sp->hdr.serial; acked_serial = ntohl(buf.ack.serial); first_soft_ack = ntohl(buf.ack.firstPacket); prev_pkt = ntohl(buf.ack.previousPacket); @@ -864,31 +865,31 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? buf.ack.reason : RXRPC_ACK__INVALID); - trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial, + trace_rxrpc_rx_ack(call, ack_serial, acked_serial, first_soft_ack, prev_pkt, summary.ack_reason, nr_acks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) rxrpc_input_ping_response(call, skb->tstamp, acked_serial, - sp->hdr.serial); + ack_serial); if (buf.ack.reason == RXRPC_ACK_REQUESTED) rxrpc_input_requested_ack(call, skb->tstamp, acked_serial, - sp->hdr.serial); + ack_serial); if (buf.ack.reason == RXRPC_ACK_PING) { - _proto("Rx ACK %%%u PING Request", sp->hdr.serial); + _proto("Rx ACK %%%u PING Request", ack_serial); rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - sp->hdr.serial, true, true, + ack_serial, true, true, rxrpc_propose_ack_respond_to_ping); } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, - sp->hdr.serial, true, true, + ack_serial, true, true, rxrpc_propose_ack_respond_to_ack); } /* Discard any out-of-order or duplicate ACKs (outside lock). */ if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { - trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, + trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, first_soft_ack, call->ackr_first_seq, prev_pkt, call->ackr_prev_seq); return; @@ -904,7 +905,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) /* Discard any out-of-order or duplicate ACKs (inside lock). */ if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { - trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, + trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, first_soft_ack, call->ackr_first_seq, prev_pkt, call->ackr_prev_seq); goto out; @@ -964,7 +965,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) RXRPC_TX_ANNO_LAST && summary.nr_acks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) - rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, + rxrpc_propose_ACK(call, RXRPC_ACK_PING, ack_serial, false, true, rxrpc_propose_ack_ping_for_lost_reply); From 4700c4d80b7bb171f6996016ef121e1508860b42 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 19 Aug 2020 23:29:16 +0100 Subject: [PATCH 007/110] rxrpc: Fix loss of RTT samples due to interposed ACK The Rx protocol has a mechanism to help generate RTT samples that works by a client transmitting a REQUESTED-type ACK when it receives a DATA packet that has the REQUEST_ACK flag set. The peer, however, may interpose other ACKs before transmitting the REQUESTED-ACK, as can be seen in the following trace excerpt: rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0 ... DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx). The incoming ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the sequence number of the DATA packet), causing it to be discarded from the Tx ring. The ACK that was requested (labelled REQ, r=xx references the serial of the DATA packet) comes after the ping, but the sk_buff holding the timestamp has gone and the RTT sample is lost. This is particularly noticeable on RPC calls used to probe the service offered by the peer. A lot of peers end up with an unknown RTT because we only ever sent a single RPC. This confuses the server rotation algorithm. Fix this by caching the information about the outgoing packet in RTT calculations in the rxrpc_call struct rather than looking in the Tx ring. A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and PING-ACK transmissions are recorded in there. When the appropriate response ACK is received, the buffer is checked for a match and, if found, an RTT sample is recorded. If a received ACK refers to a packet with a later serial number than an entry in the cache, that entry is presumed lost and the entry is made available to record a new transmission. ACKs types other than REQUESTED-type and PING-type cause any matching sample to be cancelled as they don't necessarily represent a useful measurement. If there's no space in the buffer on ping/data transmission, the sample base is discarded. Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets") Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 27 +++++++-- net/rxrpc/ar-internal.h | 13 +++-- net/rxrpc/call_object.c | 1 + net/rxrpc/input.c | 104 ++++++++++++++++++++--------------- net/rxrpc/output.c | 82 ++++++++++++++++++++------- net/rxrpc/rtt.c | 3 +- 6 files changed, 154 insertions(+), 76 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 059b6e45a028..c33079b986e8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -138,11 +138,16 @@ enum rxrpc_recvmsg_trace { }; enum rxrpc_rtt_tx_trace { + rxrpc_rtt_tx_cancel, rxrpc_rtt_tx_data, + rxrpc_rtt_tx_no_slot, rxrpc_rtt_tx_ping, }; enum rxrpc_rtt_rx_trace { + rxrpc_rtt_rx_cancel, + rxrpc_rtt_rx_lost, + rxrpc_rtt_rx_obsolete, rxrpc_rtt_rx_ping_response, rxrpc_rtt_rx_requested_ack, }; @@ -339,10 +344,15 @@ enum rxrpc_tx_point { E_(rxrpc_recvmsg_wait, "WAIT") #define rxrpc_rtt_tx_traces \ + EM(rxrpc_rtt_tx_cancel, "CNCE") \ EM(rxrpc_rtt_tx_data, "DATA") \ + EM(rxrpc_rtt_tx_no_slot, "FULL") \ E_(rxrpc_rtt_tx_ping, "PING") #define rxrpc_rtt_rx_traces \ + EM(rxrpc_rtt_rx_cancel, "CNCL") \ + EM(rxrpc_rtt_rx_obsolete, "OBSL") \ + EM(rxrpc_rtt_rx_lost, "LOST") \ EM(rxrpc_rtt_rx_ping_response, "PONG") \ E_(rxrpc_rtt_rx_requested_ack, "RACK") @@ -1087,38 +1097,43 @@ TRACE_EVENT(rxrpc_recvmsg, TRACE_EVENT(rxrpc_rtt_tx, TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_tx_trace why, - rxrpc_serial_t send_serial), + int slot, rxrpc_serial_t send_serial), - TP_ARGS(call, why, send_serial), + TP_ARGS(call, why, slot, send_serial), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum rxrpc_rtt_tx_trace, why ) + __field(int, slot ) __field(rxrpc_serial_t, send_serial ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; + __entry->slot = slot; __entry->send_serial = send_serial; ), - TP_printk("c=%08x %s sr=%08x", + TP_printk("c=%08x [%d] %s sr=%08x", __entry->call, + __entry->slot, __print_symbolic(__entry->why, rxrpc_rtt_tx_traces), __entry->send_serial) ); TRACE_EVENT(rxrpc_rtt_rx, TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + int slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, u32 rtt, u32 rto), - TP_ARGS(call, why, send_serial, resp_serial, rtt, rto), + TP_ARGS(call, why, slot, send_serial, resp_serial, rtt, rto), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum rxrpc_rtt_rx_trace, why ) + __field(int, slot ) __field(rxrpc_serial_t, send_serial ) __field(rxrpc_serial_t, resp_serial ) __field(u32, rtt ) @@ -1128,14 +1143,16 @@ TRACE_EVENT(rxrpc_rtt_rx, TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; + __entry->slot = slot; __entry->send_serial = send_serial; __entry->resp_serial = resp_serial; __entry->rtt = rtt; __entry->rto = rto; ), - TP_printk("c=%08x %s sr=%08x rr=%08x rtt=%u rto=%u", + TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u rto=%u", __entry->call, + __entry->slot, __print_symbolic(__entry->why, rxrpc_rtt_rx_traces), __entry->send_serial, __entry->resp_serial, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6d29a3603a3e..884cff7bb169 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -488,7 +488,6 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ - RXRPC_CALL_PINGING, /* Ping in process */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ @@ -673,9 +672,13 @@ struct rxrpc_call { rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */ rxrpc_seq_t ackr_seen; /* Highest packet shown seen */ - /* ping management */ - rxrpc_serial_t ping_serial; /* Last ping sent */ - ktime_t ping_time; /* Time last ping sent */ + /* RTT management */ + rxrpc_serial_t rtt_serial[4]; /* Serial number of DATA or PING sent */ + ktime_t rtt_sent_at[4]; /* Time packet sent */ + unsigned long rtt_avail; /* Mask of available slots in bits 0-3, + * Mask of pending samples in 8-11 */ +#define RXRPC_CALL_RTT_AVAIL_MASK 0xf +#define RXRPC_CALL_RTT_PEND_SHIFT 8 /* transmission-phase ACK management */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ @@ -1037,7 +1040,7 @@ static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, /* * rtt.c */ -void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, +void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int, rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *, bool); void rxrpc_peer_init_rtt(struct rxrpc_peer *); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 38a46167523f..a40fae013942 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -153,6 +153,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1; call->rxnet = rxnet; + call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; atomic_inc(&rxnet->nr_calls); return call; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index a7699e56eac8..19ddfc9807e8 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -608,36 +608,57 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) } /* - * Process a requested ACK. + * See if there's a cached RTT probe to complete. */ -static void rxrpc_input_requested_ack(struct rxrpc_call *call, - ktime_t resp_time, - rxrpc_serial_t orig_serial, - rxrpc_serial_t ack_serial) +static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, + ktime_t resp_time, + rxrpc_serial_t acked_serial, + rxrpc_serial_t ack_serial, + enum rxrpc_rtt_rx_trace type) { - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; + rxrpc_serial_t orig_serial; + unsigned long avail; ktime_t sent_at; - int ix; + bool matched = false; + int i; - for (ix = 0; ix < RXRPC_RXTX_BUFF_SIZE; ix++) { - skb = call->rxtx_buffer[ix]; - if (!skb) + avail = READ_ONCE(call->rtt_avail); + smp_rmb(); /* Read avail bits before accessing data. */ + + for (i = 0; i < ARRAY_SIZE(call->rtt_serial); i++) { + if (!test_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &avail)) continue; - sent_at = skb->tstamp; - smp_rmb(); /* Read timestamp before serial. */ - sp = rxrpc_skb(skb); - if (sp->hdr.serial != orig_serial) - continue; - goto found; + sent_at = call->rtt_sent_at[i]; + orig_serial = call->rtt_serial[i]; + + if (orig_serial == acked_serial) { + clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + smp_mb(); /* Read data before setting avail bit */ + set_bit(i, &call->rtt_avail); + if (type != rxrpc_rtt_rx_cancel) + rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, + sent_at, resp_time); + else + trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_cancel, i, + orig_serial, acked_serial, 0, 0); + matched = true; + } + + /* If a later serial is being acked, then mark this slot as + * being available. + */ + if (after(acked_serial, orig_serial)) { + trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_obsolete, i, + orig_serial, acked_serial, 0, 0); + clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + smp_wmb(); + set_bit(i, &call->rtt_avail); + } } - return; - -found: - rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_requested_ack, - orig_serial, ack_serial, sent_at, resp_time); + if (!matched) + trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0); } /* @@ -682,27 +703,11 @@ static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call) */ static void rxrpc_input_ping_response(struct rxrpc_call *call, ktime_t resp_time, - rxrpc_serial_t orig_serial, + rxrpc_serial_t acked_serial, rxrpc_serial_t ack_serial) { - rxrpc_serial_t ping_serial; - ktime_t ping_time; - - ping_time = call->ping_time; - smp_rmb(); - ping_serial = READ_ONCE(call->ping_serial); - - if (orig_serial == call->acks_lost_ping) + if (acked_serial == call->acks_lost_ping) rxrpc_input_check_for_lost_ack(call); - - if (before(orig_serial, ping_serial) || - !test_and_clear_bit(RXRPC_CALL_PINGING, &call->flags)) - return; - if (after(orig_serial, ping_serial)) - return; - - rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_ping_response, - orig_serial, ack_serial, ping_time, resp_time); } /* @@ -869,12 +874,23 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) first_soft_ack, prev_pkt, summary.ack_reason, nr_acks); - if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) + switch (buf.ack.reason) { + case RXRPC_ACK_PING_RESPONSE: rxrpc_input_ping_response(call, skb->tstamp, acked_serial, ack_serial); - if (buf.ack.reason == RXRPC_ACK_REQUESTED) - rxrpc_input_requested_ack(call, skb->tstamp, acked_serial, - ack_serial); + rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, + rxrpc_rtt_rx_ping_response); + break; + case RXRPC_ACK_REQUESTED: + rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, + rxrpc_rtt_rx_requested_ack); + break; + default: + if (acked_serial != 0) + rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, + rxrpc_rtt_rx_cancel); + break; + } if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", ack_serial); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 1ba43c3df4ad..3cfff7922ba8 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -123,6 +123,49 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, return top - hard_ack + 3; } +/* + * Record the beginning of an RTT probe. + */ +static int rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, + enum rxrpc_rtt_tx_trace why) +{ + unsigned long avail = call->rtt_avail; + int rtt_slot = 9; + + if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK)) + goto no_slot; + + rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK); + if (!test_and_clear_bit(rtt_slot, &call->rtt_avail)) + goto no_slot; + + call->rtt_serial[rtt_slot] = serial; + call->rtt_sent_at[rtt_slot] = ktime_get_real(); + smp_wmb(); /* Write data before avail bit */ + set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + + trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); + return rtt_slot; + +no_slot: + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); + return -1; +} + +/* + * Cancel an RTT probe. + */ +static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call, + rxrpc_serial_t serial, int rtt_slot) +{ + if (rtt_slot != -1) { + clear_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + smp_wmb(); /* Clear pending bit before setting slot */ + set_bit(rtt_slot, &call->rtt_avail); + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_cancel, rtt_slot, serial); + } +} + /* * Send an ACK call packet. */ @@ -136,7 +179,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; size_t len, n; - int ret; + int ret, rtt_slot = -1; u8 reason; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) @@ -196,18 +239,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, if (_serial) *_serial = serial; - if (ping) { - call->ping_serial = serial; - smp_wmb(); - /* We need to stick a time in before we send the packet in case - * the reply gets back before kernel_sendmsg() completes - but - * asking UDP to send the packet can take a relatively long - * time. - */ - call->ping_time = ktime_get_real(); - set_bit(RXRPC_CALL_PINGING, &call->flags); - trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial); - } + if (ping) + rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); @@ -221,8 +254,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { - if (ping) - clear_bit(RXRPC_CALL_PINGING, &call->flags); + rxrpc_cancel_rtt_probe(call, serial, rtt_slot); rxrpc_propose_ACK(call, pkt->ack.reason, ntohl(pkt->ack.serial), false, true, @@ -321,7 +353,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, struct kvec iov[2]; rxrpc_serial_t serial; size_t len; - int ret; + int ret, rtt_slot = -1; _enter(",{%d}", skb->len); @@ -397,6 +429,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, sp->hdr.serial = serial; smp_wmb(); /* Set serial before timestamp */ skb->tstamp = ktime_get_real(); + if (whdr.flags & RXRPC_REQUEST_ACK) + rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet @@ -408,12 +442,15 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, conn->params.peer->last_tx_at = ktime_get_seconds(); up_read(&conn->params.local->defrag_sem); - if (ret < 0) + if (ret < 0) { + rxrpc_cancel_rtt_probe(call, serial, rtt_slot); trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_data_nofrag); - else + } else { trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_nofrag); + } + rxrpc_tx_backoff(call, ret); if (ret == -EMSGSIZE) goto send_fragmentable; @@ -422,7 +459,6 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, if (ret >= 0) { if (whdr.flags & RXRPC_REQUEST_ACK) { call->peer->rtt_last_req = skb->tstamp; - trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); if (call->peer->rtt_count > 1) { unsigned long nowj = jiffies, ack_lost_at; @@ -469,6 +505,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, sp->hdr.serial = serial; smp_wmb(); /* Set serial before timestamp */ skb->tstamp = ktime_get_real(); + if (whdr.flags & RXRPC_REQUEST_ACK) + rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); switch (conn->params.local->srx.transport.family) { case AF_INET6: @@ -487,12 +525,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, BUG(); } - if (ret < 0) + if (ret < 0) { + rxrpc_cancel_rtt_probe(call, serial, rtt_slot); trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_data_frag); - else + } else { trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_frag); + } rxrpc_tx_backoff(call, ret); up_write(&conn->params.local->defrag_sem); diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index 928d8b34a3ee..1221b0637a7e 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -146,6 +146,7 @@ static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) * exclusive access to the peer RTT data. */ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { @@ -162,7 +163,7 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, peer->rtt_count++; spin_unlock(&peer->rtt_input_lock); - trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, + trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, peer->srtt_us >> 3, peer->rto_j); } From 1d4adfaf65746203861c72d9d78de349eb97d528 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Aug 2020 15:13:00 +0100 Subject: [PATCH 008/110] rxrpc: Make rxrpc_kernel_get_srtt() indicate validity Fix rxrpc_kernel_get_srtt() to indicate the validity of the returned smoothed RTT. If we haven't had any valid samples yet, the SRTT isn't useful. Fixes: c410bf01933e ("rxrpc: Fix the excessive initial retransmission timeout") Signed-off-by: David Howells --- fs/afs/fs_probe.c | 4 ++-- fs/afs/vl_probe.c | 4 ++-- include/net/af_rxrpc.h | 2 +- net/rxrpc/peer_object.c | 16 +++++++++++++--- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index 5d9ef517cf81..e7e98ad63a91 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -161,8 +161,8 @@ void afs_fileserver_probe_result(struct afs_call *call) } } - rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); - if (rtt_us < server->probe.rtt) { + if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && + rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index e3aa013c2177..081b7e5b13f5 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -92,8 +92,8 @@ void afs_vlserver_probe_result(struct afs_call *call) } } - rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall); - if (rtt_us < server->probe.rtt) { + if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && + rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; alist->preferred = index; have_result = true; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 91eacbdcf33d..f6abcc0bbd6e 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -59,7 +59,7 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); -u32 rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *); +bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t, unsigned int); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index ca29976bb193..68396d052052 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -502,11 +502,21 @@ EXPORT_SYMBOL(rxrpc_kernel_get_peer); * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT * @sock: The socket on which the call is in progress. * @call: The call to query + * @_srtt: Where to store the SRTT value. * - * Get the call's peer smoothed RTT. + * Get the call's peer smoothed RTT in uS. */ -u32 rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call) +bool rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call, + u32 *_srtt) { - return call->peer->srtt_us >> 3; + struct rxrpc_peer *peer = call->peer; + + if (peer->rtt_count == 0) { + *_srtt = 1000000; /* 1S */ + return false; + } + + *_srtt = call->peer->srtt_us >> 3; + return true; } EXPORT_SYMBOL(rxrpc_kernel_get_srtt); From 4f4c2c05eb7703b485b4285b8e2eee908c7b4508 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 19 Aug 2020 15:13:44 +0100 Subject: [PATCH 009/110] afs: Remove afs_vlserver->probe.have_result Remove afs_vlserver->probe.have_result as it's neither read nor waited upon. Fixes: 3bf0fb6f33dd ("afs: Probe multiple fileservers simultaneously") Signed-off-by: David Howells --- fs/afs/internal.h | 1 - fs/afs/vl_probe.c | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 792ac711985e..2e6ae6388c72 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -412,7 +412,6 @@ struct afs_vlserver { unsigned int rtt; /* RTT as ktime/64 */ u32 abort_code; short error; - bool have_result; bool responded:1; bool is_yfs:1; bool not_yfs:1; diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index 081b7e5b13f5..ee59188433b9 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -109,11 +109,8 @@ void afs_vlserver_probe_result(struct afs_call *call) server_index, index, &alist->addrs[index].transport, rtt_us, ret); have_result |= afs_vl_probe_done(server); - if (have_result) { - server->probe.have_result = true; - wake_up_var(&server->probe.have_result); + if (have_result) wake_up_all(&server->probe_wq); - } } /* From fb72cd3d484ce548597c75ebeecc70483fe8bb6e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Aug 2020 15:01:54 +0100 Subject: [PATCH 010/110] afs: Expose information from afs_vlserver through /proc for debugging Convert various bitfields in afs_vlserver::probe to a mask and then expose this and some other bits of information through /proc/net/afs//vlservers to make it easier to debug VL server communication issues. Signed-off-by: David Howells --- fs/afs/internal.h | 9 +++++---- fs/afs/proc.c | 5 +++++ fs/afs/vl_probe.c | 20 ++++++++++---------- fs/afs/vl_rotate.c | 3 ++- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 2e6ae6388c72..b29dfcfe5fc2 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -412,10 +412,11 @@ struct afs_vlserver { unsigned int rtt; /* RTT as ktime/64 */ u32 abort_code; short error; - bool responded:1; - bool is_yfs:1; - bool not_yfs:1; - bool local_failure:1; + unsigned short flags; +#define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */ +#define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */ +#define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */ +#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */ } probe; u16 port; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index e817fc740ba0..5e837155f1a0 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -310,6 +310,11 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) alist->preferred == i ? '>' : '-', &alist->addrs[i].transport); } + seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->probe.rtt); + seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n", + vlserver->probe.flags, vlserver->probe.error, + vlserver->probe.abort_code, + atomic_read(&vlserver->probe_outstanding)); return 0; } diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index ee59188433b9..a6d04b4fbf56 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -45,14 +45,14 @@ void afs_vlserver_probe_result(struct afs_call *call) server->probe.error = 0; goto responded; case -ECONNABORTED: - if (!server->probe.responded) { + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) { server->probe.abort_code = call->abort_code; server->probe.error = ret; } goto responded; case -ENOMEM: case -ENONET: - server->probe.local_failure = true; + server->probe.flags |= AFS_VLSERVER_PROBE_LOCAL_FAILURE; afs_io_error(call, afs_io_error_vl_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. */ @@ -67,7 +67,7 @@ void afs_vlserver_probe_result(struct afs_call *call) default: clear_bit(index, &alist->responded); set_bit(index, &alist->failed); - if (!server->probe.responded && + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) && (server->probe.error == 0 || server->probe.error == -ETIMEDOUT || server->probe.error == -ETIME)) @@ -81,12 +81,12 @@ void afs_vlserver_probe_result(struct afs_call *call) clear_bit(index, &alist->failed); if (call->service_id == YFS_VL_SERVICE) { - server->probe.is_yfs = true; + server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS; set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); alist->addrs[index].srx_service = call->service_id; } else { - server->probe.not_yfs = true; - if (!server->probe.is_yfs) { + server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS; + if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) { clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); alist->addrs[index].srx_service = call->service_id; } @@ -100,7 +100,7 @@ void afs_vlserver_probe_result(struct afs_call *call) } smp_wmb(); /* Set rtt before responded. */ - server->probe.responded = true; + server->probe.flags |= AFS_VLSERVER_PROBE_RESPONDED; set_bit(AFS_VLSERVER_FL_PROBED, &server->flags); out: spin_unlock(&server->probe_lock); @@ -202,7 +202,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, server = vllist->servers[i].server; if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) __clear_bit(i, &untried); - if (server->probe.responded) + if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) have_responders = true; } } @@ -228,7 +228,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, for (i = 0; i < vllist->nr_servers; i++) { if (test_bit(i, &untried)) { server = vllist->servers[i].server; - if (server->probe.responded) + if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) goto stop; if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) still_probing = true; @@ -246,7 +246,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, for (i = 0; i < vllist->nr_servers; i++) { if (test_bit(i, &untried)) { server = vllist->servers[i].server; - if (server->probe.responded && + if ((server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) && server->probe.rtt < rtt) { pref = i; rtt = server->probe.rtt; diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index f405ca8b240a..ed2609e82695 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -192,7 +192,8 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; - if (!test_bit(i, &vc->untried) || !s->probe.responded) + if (!test_bit(i, &vc->untried) || + !(s->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) continue; if (s->probe.rtt < rtt) { vc->index = i; From b95b30940ee46a4d892ca3de3ffb8249c4985b1f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 19 Aug 2020 15:27:17 +0100 Subject: [PATCH 011/110] afs: Don't use VL probe running state to make decisions outside probe code Don't use the running state for VL server probes to make decisions about which server to use as the state is cleared at the start of a probe and intermediate values might also be misleading. Instead, add a separate 'latest known' rtt in the afs_vlserver struct and a flag to indicate if the server is known to be responding and update these as and when we know what to change them to. Fixes: 3bf0fb6f33dd ("afs: Probe multiple fileservers simultaneously") Signed-off-by: David Howells --- fs/afs/internal.h | 4 +++- fs/afs/proc.c | 2 +- fs/afs/vl_list.c | 1 + fs/afs/vl_probe.c | 59 ++++++++++++++++++++++++++++++++-------------- fs/afs/vl_rotate.c | 2 +- 5 files changed, 47 insertions(+), 21 deletions(-) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index b29dfcfe5fc2..18042b7dab6a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -401,15 +401,17 @@ struct afs_vlserver { #define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */ #define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */ #define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */ +#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */ rwlock_t lock; /* Lock on addresses */ atomic_t usage; + unsigned int rtt; /* Server's current RTT in uS */ /* Probe state */ wait_queue_head_t probe_wq; atomic_t probe_outstanding; spinlock_t probe_lock; struct { - unsigned int rtt; /* RTT as ktime/64 */ + unsigned int rtt; /* RTT in uS */ u32 abort_code; short error; unsigned short flags; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 5e837155f1a0..e8babb62ed44 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -310,7 +310,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) alist->preferred == i ? '>' : '-', &alist->addrs[i].transport); } - seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->probe.rtt); + seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt); seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n", vlserver->probe.flags, vlserver->probe.error, vlserver->probe.abort_code, diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index 8fea54eba0c2..38b2ba1d9ec0 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -21,6 +21,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, rwlock_init(&vlserver->lock); init_waitqueue_head(&vlserver->probe_wq); spin_lock_init(&vlserver->probe_lock); + vlserver->rtt = UINT_MAX; vlserver->name_len = name_len; vlserver->port = port; memcpy(vlserver->name, name, name_len); diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index a6d04b4fbf56..d1c7068b4346 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -11,15 +11,33 @@ #include "internal.h" #include "protocol_yfs.h" -static bool afs_vl_probe_done(struct afs_vlserver *server) -{ - if (!atomic_dec_and_test(&server->probe_outstanding)) - return false; - wake_up_var(&server->probe_outstanding); +/* + * Handle the completion of a set of probes. + */ +static void afs_finished_vl_probe(struct afs_vlserver *server) +{ + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) { + server->rtt = UINT_MAX; + clear_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags); + } + clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags); wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING); - return true; +} + +/* + * Handle the completion of a probe RPC call. + */ +static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up) +{ + if (atomic_dec_and_test(&server->probe_outstanding)) { + afs_finished_vl_probe(server); + wake_up = true; + } + + if (wake_up) + wake_up_all(&server->probe_wq); } /* @@ -52,8 +70,13 @@ void afs_vlserver_probe_result(struct afs_call *call) goto responded; case -ENOMEM: case -ENONET: + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EKEYREJECTED: server->probe.flags |= AFS_VLSERVER_PROBE_LOCAL_FAILURE; - afs_io_error(call, afs_io_error_vl_probe_fail); + if (server->probe.error == 0) + server->probe.error = ret; + trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. */ case -ERFKILL: @@ -72,7 +95,7 @@ void afs_vlserver_probe_result(struct afs_call *call) server->probe.error == -ETIMEDOUT || server->probe.error == -ETIME)) server->probe.error = ret; - afs_io_error(call, afs_io_error_vl_probe_fail); + trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail); goto out; } @@ -95,22 +118,22 @@ void afs_vlserver_probe_result(struct afs_call *call) if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; + server->rtt = rtt_us; alist->preferred = index; - have_result = true; } smp_wmb(); /* Set rtt before responded. */ server->probe.flags |= AFS_VLSERVER_PROBE_RESPONDED; set_bit(AFS_VLSERVER_FL_PROBED, &server->flags); + set_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags); + have_result = true; out: spin_unlock(&server->probe_lock); _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", server_index, index, &alist->addrs[index].transport, rtt_us, ret); - have_result |= afs_vl_probe_done(server); - if (have_result) - wake_up_all(&server->probe_wq); + afs_done_one_vl_probe(server, have_result); } /* @@ -148,11 +171,10 @@ static bool afs_do_probe_vlserver(struct afs_net *net, in_progress = true; } else { afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code); + afs_done_one_vl_probe(server, false); } } - if (!in_progress) - afs_vl_probe_done(server); return in_progress; } @@ -190,7 +212,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, { struct wait_queue_entry *waits; struct afs_vlserver *server; - unsigned int rtt = UINT_MAX; + unsigned int rtt = UINT_MAX, rtt_s; bool have_responders = false; int pref = -1, i; @@ -246,10 +268,11 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, for (i = 0; i < vllist->nr_servers; i++) { if (test_bit(i, &untried)) { server = vllist->servers[i].server; - if ((server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) && - server->probe.rtt < rtt) { + rtt_s = READ_ONCE(server->rtt); + if (test_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags) && + rtt_s < rtt) { pref = i; - rtt = server->probe.rtt; + rtt = rtt_s; } remove_wait_queue(&server->probe_wq, &waits[i]); diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index ed2609e82695..ab2beb4ba20e 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -193,7 +193,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) struct afs_vlserver *s = vc->server_list->servers[i].server; if (!test_bit(i, &vc->untried) || - !(s->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) + !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) continue; if (s->probe.rtt < rtt) { vc->index = i; From e4686c79b103c0a73b8ed41e5fd967da201b6fba Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Aug 2020 16:13:05 +0100 Subject: [PATCH 012/110] afs: Fix error handling in VL server rotation The error handling in the VL server rotation in the case of there being no contactable servers is not correct. In such a case, the records of all the servers in the list are scanned and the errors and abort codes are mapped and prioritised and one error is chosen. This is then forgotten and the default error is used (EDESTADDRREQ). Fix this by using the calculated error. Also we need to note whether a server responded on one of its endpoints so that we can priorise an error from an abort message over local and network errors. Fixes: 4584ae96ae30 ("afs: Fix missing net error handling") Signed-off-by: David Howells --- fs/afs/vl_rotate.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index ab2beb4ba20e..c0458c903b31 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -263,10 +263,14 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; + if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) + e.responded = true; afs_prioritise_error(&e, READ_ONCE(s->probe.error), s->probe.abort_code); } + error = e.error; + failed_set_error: vc->error = error; failed: From 226a88de473e475cb9f993682a1c7d0c2b451ad8 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 19 Aug 2020 23:59:14 +0200 Subject: [PATCH 013/110] netfilter: nft_set_rbtree: Handle outcomes of tree rotations in overlap detection Checks for partial overlaps on insertion assume that end elements are always descendant nodes of their corresponding start, because they are inserted later. However, this is not the case if a previous delete operation caused a tree rotation as part of rebalancing. Taking the issue reported by Andreas Fischer as an example, if we omit delete operations, the existing procedure works because, equivalently, we are inserting a start item with value 40 in the this region of the red-black tree with single-sized intervals: overlap flag 10 (start) / \ false 20 (start) / \ false 30 (start) / \ false 60 (start) / \ false 50 (end) / \ false 20 (end) / \ false 40 (start) if we now delete interval 30 - 30, the tree can be rearranged in a way similar to this (note the rotation involving 50 - 50): overlap flag 10 (start) / \ false 20 (start) / \ false 25 (start) / \ false 70 (start) / \ false 50 (end) / \ true (from rule a1.) 50 (start) / \ true 40 (start) and we traverse interval 50 - 50 from the opposite direction compared to what was expected. To deal with those cases, add a start-before-start rule, b4., that covers traversal of existing intervals from the right. We now need to restrict start-after-end rule b3. to cases where there are no occurring nodes between existing start and end elements, because addition of rule b4. isn't sufficient to ensure that the pre-existing end element we encounter while descending the tree corresponds to a start element of an interval that we already traversed entirely. Different types of overlap detection on trees with rotations resulting from re-balancing will be covered by nft test case sets/0044interval_overlap_1. Reported-by: Andreas Fischer Bugzilla: https://bugzilla.netfilter.org/show_bug.cgi?id=1449 Cc: # 5.6.x Fixes: 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion") Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 4b2834fd17b2..27668f4e44ea 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -238,21 +238,27 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, * * b1. _ _ __>| !_ _ __| (insert end before existing start) * b2. _ _ ___| !_ _ _>| (insert end after existing start) - * b3. _ _ ___! >|_ _ __| (insert start after existing end) + * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf) + * '--' no nodes falling in this range + * b4. >|_ _ ! (insert start before existing start) * * Case a3. resolves to b3.: * - if the inserted start element is the leftmost, because the '0' * element in the tree serves as end element - * - otherwise, if an existing end is found. Note that end elements are - * always inserted after corresponding start elements. + * - otherwise, if an existing end is found immediately to the left. If + * there are existing nodes in between, we need to further descend the + * tree before we can conclude the new start isn't causing an overlap + * + * or to b4., which, preceded by a3., means we already traversed one or + * more existing intervals entirely, from the right. * * For a new, rightmost pair of elements, we'll hit cases b3. and b2., * in that order. * * The flag is also cleared in two special cases: * - * b4. |__ _ _!|<_ _ _ (insert start right before existing end) - * b5. |__ _ >|!__ _ _ (insert end right after existing start) + * b5. |__ _ _!|<_ _ _ (insert start right before existing end) + * b6. |__ _ >|!__ _ _ (insert end right after existing start) * * which always happen as last step and imply that no further * overlapping is possible. @@ -272,7 +278,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, if (nft_rbtree_interval_start(new)) { if (nft_rbtree_interval_end(rbe) && nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) + !nft_set_elem_expired(&rbe->ext) && !*p) overlap = false; } else { overlap = nft_rbtree_interval_end(rbe) && @@ -288,10 +294,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, nft_set_elem_active(&rbe->ext, genmask) && !nft_set_elem_expired(&rbe->ext); - } else if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask) && + } else if (nft_set_elem_active(&rbe->ext, genmask) && !nft_set_elem_expired(&rbe->ext)) { - overlap = true; + overlap = nft_rbtree_interval_end(rbe); } } else { if (nft_rbtree_interval_end(rbe) && From 0726763043dc10dd4c12481f050b1a5ef8f15410 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 19 Aug 2020 23:59:15 +0200 Subject: [PATCH 014/110] netfilter: nft_set_rbtree: Detect partial overlap with start endpoint match Getting creative with nft and omitting the interval_overlap() check from the set_overlap() function, without omitting set_overlap() altogether, led to the observation of a partial overlap that wasn't detected, and would actually result in replacement of the end element of an existing interval. This is due to the fact that we'll return -EEXIST on a matching, pre-existing start element, instead of -ENOTEMPTY, and the error is cleared by API if NLM_F_EXCL is not given. At this point, we can insert a matching start, and duplicate the end element as long as we don't end up into other intervals. For instance, inserting interval 0 - 2 with an existing 0 - 3 interval would result in a single 0 - 2 interval, and a dangling '3' end element. This is because nft will proceed after inserting the '0' start element as no error is reported, and no further conflicting intervals are detected on insertion of the end element. This needs a different approach as it's a local condition that can be detected by looking for duplicate ends coming from left and right, separately. Track those and directly report -ENOTEMPTY on duplicated end elements for a matching start. Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 27668f4e44ea..217ab3644c25 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -218,11 +218,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) { + bool overlap = false, dup_end_left = false, dup_end_right = false; struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); struct nft_rbtree_elem *rbe; struct rb_node *parent, **p; - bool overlap = false; int d; /* Detect overlaps as we descend the tree. Set the flag in these cases: @@ -262,6 +262,20 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, * * which always happen as last step and imply that no further * overlapping is possible. + * + * Another special case comes from the fact that start elements matching + * an already existing start element are allowed: insertion is not + * performed but we return -EEXIST in that case, and the error will be + * cleared by the caller if NLM_F_EXCL is not present in the request. + * This way, request for insertion of an exact overlap isn't reported as + * error to userspace if not desired. + * + * However, if the existing start matches a pre-existing start, but the + * end element doesn't match the corresponding pre-existing end element, + * we need to report a partial overlap. This is a local condition that + * can be noticed without need for a tracking flag, by checking for a + * local duplicated end for a corresponding start, from left and right, + * separately. */ parent = NULL; @@ -281,19 +295,35 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, !nft_set_elem_expired(&rbe->ext) && !*p) overlap = false; } else { + if (dup_end_left && !*p) + return -ENOTEMPTY; + overlap = nft_rbtree_interval_end(rbe) && nft_set_elem_active(&rbe->ext, genmask) && !nft_set_elem_expired(&rbe->ext); + + if (overlap) { + dup_end_right = true; + continue; + } } } else if (d > 0) { p = &parent->rb_right; if (nft_rbtree_interval_end(new)) { + if (dup_end_right && !*p) + return -ENOTEMPTY; + overlap = nft_rbtree_interval_end(rbe) && nft_set_elem_active(&rbe->ext, genmask) && !nft_set_elem_expired(&rbe->ext); + + if (overlap) { + dup_end_left = true; + continue; + } } else if (nft_set_elem_active(&rbe->ext, genmask) && !nft_set_elem_expired(&rbe->ext)) { overlap = nft_rbtree_interval_end(rbe); @@ -321,6 +351,8 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, p = &parent->rb_left; } } + + dup_end_left = dup_end_right = false; } if (overlap) From 6f03bf43ee05b31d3822def2a80f11b3591c55b3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 20 Aug 2020 14:12:54 +0200 Subject: [PATCH 015/110] netfilter: nf_tables: add NFTA_SET_USERDATA if not null Kernel sends an empty NFTA_SET_USERDATA attribute with no value if userspace adds a set with no NFTA_SET_USERDATA attribute. Fixes: e6d8ecac9e68 ("netfilter: nf_tables: Add new attributes into nft_set to store user data.") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd814e514f94..71e501c5ad21 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3770,7 +3770,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } - if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) + if (set->udata && + nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_SET_DESC); From da9125df854ea48a6240c66e8a67be06e2c12c03 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 20 Aug 2020 14:12:55 +0200 Subject: [PATCH 016/110] netfilter: nf_tables: incorrect enum nft_list_attributes definition This should be NFTA_LIST_UNSPEC instead of NFTA_LIST_UNPEC, all other similar attribute definitions are postfixed with _UNSPEC. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 42f351c1f5c5..2b8e12f7a4a6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -133,7 +133,7 @@ enum nf_tables_msg_types { * @NFTA_LIST_ELEM: list element (NLA_NESTED) */ enum nft_list_attributes { - NFTA_LIST_UNPEC, + NFTA_LIST_UNSPEC, NFTA_LIST_ELEM, __NFTA_LIST_MAX }; From 1e105e6afa6c3d32bfb52c00ffa393894a525c27 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Aug 2020 21:05:50 +0200 Subject: [PATCH 017/110] netfilter: nf_tables: fix destination register zeroing Following bug was reported via irc: nft list ruleset set knock_candidates_ipv4 { type ipv4_addr . inet_service size 65535 elements = { 127.0.0.1 . 123, 127.0.0.1 . 123 } } .. udp dport 123 add @knock_candidates_ipv4 { ip saddr . 123 } udp dport 123 add @knock_candidates_ipv4 { ip saddr . udp dport } It should not have been possible to add a duplicate set entry. After some debugging it turned out that the problem is the immediate value (123) in the second-to-last rule. Concatenations use 32bit registers, i.e. the elements are 8 bytes each, not 6 and it turns out the kernel inserted inet firewall @knock_candidates_ipv4 element 0100007f ffff7b00 : 0 [end] element 0100007f 00007b00 : 0 [end] Note the non-zero upper bits of the first element. It turns out that nft_immediate doesn't zero the destination register, but this is needed when the length isn't a multiple of 4. Furthermore, the zeroing in nft_payload is broken. We can't use [len / 4] = 0 -- if len is a multiple of 4, index is off by one. Skip zeroing in this case and use a conditional instead of (len -1) / 4. Fixes: 49499c3e6e18 ("netfilter: nf_tables: switch registers to 32 bit addressing") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_payload.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bf9491b77d16..224d194ad29d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -143,6 +143,8 @@ static inline u64 nft_reg_load64(const u32 *sreg) static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { + if (len % NFT_REG32_SIZE) + dst[len / NFT_REG32_SIZE] = 0; memcpy(dst, src, len); } diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index ed7cb9f747f6..7a2e59638499 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -87,7 +87,9 @@ void nft_payload_eval(const struct nft_expr *expr, u32 *dest = ®s->data[priv->dreg]; int offset; - dest[priv->len / NFT_REG32_SIZE] = 0; + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; + switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) From 100e3345c6e719d2291e1efd5de311cc24bb9c0b Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Mon, 24 Aug 2020 13:44:42 +0800 Subject: [PATCH 018/110] net: hns: Fix memleak in hns_nic_dev_probe hns_nic_dev_probe allocates ndev, but not free it on two error handling paths, which may lead to memleak. Fixes: 63434888aaf1b ("net: hns: net: hns: enet adds support of acpi") Signed-off-by: Dinghao Liu Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index 23f278e46975..22522f8a5299 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -2282,8 +2282,10 @@ static int hns_nic_dev_probe(struct platform_device *pdev) priv->enet_ver = AE_VERSION_1; else if (acpi_dev_found(hns_enet_acpi_match[1].id)) priv->enet_ver = AE_VERSION_2; - else - return -ENXIO; + else { + ret = -ENXIO; + goto out_read_prop_fail; + } /* try to find port-idx-in-ae first */ ret = acpi_node_get_property_reference(dev->fwnode, @@ -2299,7 +2301,8 @@ static int hns_nic_dev_probe(struct platform_device *pdev) priv->fwnode = args.fwnode; } else { dev_err(dev, "cannot read cfg data from OF or acpi\n"); - return -ENXIO; + ret = -ENXIO; + goto out_read_prop_fail; } ret = device_property_read_u32(dev, "port-idx-in-ae", &port_id); From 7ef1fc57301f3cef7201497aa27e89ccb91737fe Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Mon, 24 Aug 2020 13:58:31 +0800 Subject: [PATCH 019/110] net: systemport: Fix memleak in bcm_sysport_probe When devm_kcalloc() fails, dev should be freed just like what we've done in the subsequent error paths. Fixes: 7b78be48a8eb6 ("net: systemport: Dynamically allocate number of TX rings") Signed-off-by: Dinghao Liu Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index dfed9ade6950..0762d5d1a810 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2491,8 +2491,10 @@ static int bcm_sysport_probe(struct platform_device *pdev) priv->tx_rings = devm_kcalloc(&pdev->dev, txq, sizeof(struct bcm_sysport_tx_ring), GFP_KERNEL); - if (!priv->tx_rings) - return -ENOMEM; + if (!priv->tx_rings) { + ret = -ENOMEM; + goto err_free_netdev; + } priv->is_lite = params->is_lite; priv->num_rx_desc_words = params->num_rx_desc_words; From 3622adb02623cde94ea85c68e9a968d0059de545 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 24 Aug 2020 13:46:22 +0200 Subject: [PATCH 020/110] ipv6: ndisc: adjust ndisc_ifinfo_sysctl_change prototype Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ndisc_ifinfo_sysctl_change to take a kernel pointer. Adjust its prototype in net/ndisc.h as well to fix the following sparse warning: net/ipv6/ndisc.c:1838:5: error: symbol 'ndisc_ifinfo_sysctl_change' redeclared with different type (incompatible argument 3 (different address spaces)): net/ipv6/ndisc.c:1838:5: int extern [addressable] [signed] [toplevel] ndisc_ifinfo_sysctl_change( ... ) net/ipv6/ndisc.c: note: in included file (through include/net/ipv6.h): ./include/net/ndisc.h:496:5: note: previously declared as: ./include/net/ndisc.h:496:5: int extern [addressable] [signed] [toplevel] ndisc_ifinfo_sysctl_change( ... ) net/ipv6/ndisc.c: note: in included file (through include/net/ip6_route.h): Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Cc: Christoph Hellwig Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- include/net/ndisc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 9205a76d967a..38e4094960ce 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -494,7 +494,7 @@ int igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int ndisc_ifinfo_sysctl_strategy(struct ctl_table *ctl, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen); From 1838d6c62f57836639bd3d83e7855e0ee4f6defc Mon Sep 17 00:00:00 2001 From: Yuusuke Ashizuka Date: Thu, 20 Aug 2020 18:43:07 +0900 Subject: [PATCH 021/110] ravb: Fixed to be able to unload modules When this driver is built as a module, I cannot rmmod it after insmoding it. This is because that this driver calls ravb_mdio_init() at the time of probe, and module->refcnt is incremented by alloc_mdio_bitbang() called after that. Therefore, even if ifup is not performed, the driver is in use and rmmod cannot be performed. $ lsmod Module Size Used by ravb 40960 1 $ rmmod ravb rmmod: ERROR: Module ravb is in use Call ravb_mdio_init() at open and free_mdio_bitbang() at close, thereby rmmod is possible in the ifdown state. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Yuusuke Ashizuka Reviewed-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/ravb_main.c | 110 +++++++++++------------ 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 99f7aae102ce..df89d09b253e 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1342,6 +1342,51 @@ static inline int ravb_hook_irq(unsigned int irq, irq_handler_t handler, return error; } +/* MDIO bus init function */ +static int ravb_mdio_init(struct ravb_private *priv) +{ + struct platform_device *pdev = priv->pdev; + struct device *dev = &pdev->dev; + int error; + + /* Bitbang init */ + priv->mdiobb.ops = &bb_ops; + + /* MII controller setting */ + priv->mii_bus = alloc_mdio_bitbang(&priv->mdiobb); + if (!priv->mii_bus) + return -ENOMEM; + + /* Hook up MII support for ethtool */ + priv->mii_bus->name = "ravb_mii"; + priv->mii_bus->parent = dev; + snprintf(priv->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x", + pdev->name, pdev->id); + + /* Register MDIO bus */ + error = of_mdiobus_register(priv->mii_bus, dev->of_node); + if (error) + goto out_free_bus; + + return 0; + +out_free_bus: + free_mdio_bitbang(priv->mii_bus); + return error; +} + +/* MDIO bus release function */ +static int ravb_mdio_release(struct ravb_private *priv) +{ + /* Unregister mdio bus */ + mdiobus_unregister(priv->mii_bus); + + /* Free bitbang info */ + free_mdio_bitbang(priv->mii_bus); + + return 0; +} + /* Network device open function for Ethernet AVB */ static int ravb_open(struct net_device *ndev) { @@ -1350,6 +1395,13 @@ static int ravb_open(struct net_device *ndev) struct device *dev = &pdev->dev; int error; + /* MDIO bus init */ + error = ravb_mdio_init(priv); + if (error) { + netdev_err(ndev, "failed to initialize MDIO\n"); + return error; + } + napi_enable(&priv->napi[RAVB_BE]); napi_enable(&priv->napi[RAVB_NC]); @@ -1427,6 +1479,7 @@ static int ravb_open(struct net_device *ndev) out_napi_off: napi_disable(&priv->napi[RAVB_NC]); napi_disable(&priv->napi[RAVB_BE]); + ravb_mdio_release(priv); return error; } @@ -1736,6 +1789,8 @@ static int ravb_close(struct net_device *ndev) ravb_ring_free(ndev, RAVB_BE); ravb_ring_free(ndev, RAVB_NC); + ravb_mdio_release(priv); + return 0; } @@ -1887,51 +1942,6 @@ static const struct net_device_ops ravb_netdev_ops = { .ndo_set_features = ravb_set_features, }; -/* MDIO bus init function */ -static int ravb_mdio_init(struct ravb_private *priv) -{ - struct platform_device *pdev = priv->pdev; - struct device *dev = &pdev->dev; - int error; - - /* Bitbang init */ - priv->mdiobb.ops = &bb_ops; - - /* MII controller setting */ - priv->mii_bus = alloc_mdio_bitbang(&priv->mdiobb); - if (!priv->mii_bus) - return -ENOMEM; - - /* Hook up MII support for ethtool */ - priv->mii_bus->name = "ravb_mii"; - priv->mii_bus->parent = dev; - snprintf(priv->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x", - pdev->name, pdev->id); - - /* Register MDIO bus */ - error = of_mdiobus_register(priv->mii_bus, dev->of_node); - if (error) - goto out_free_bus; - - return 0; - -out_free_bus: - free_mdio_bitbang(priv->mii_bus); - return error; -} - -/* MDIO bus release function */ -static int ravb_mdio_release(struct ravb_private *priv) -{ - /* Unregister mdio bus */ - mdiobus_unregister(priv->mii_bus); - - /* Free bitbang info */ - free_mdio_bitbang(priv->mii_bus); - - return 0; -} - static const struct of_device_id ravb_match_table[] = { { .compatible = "renesas,etheravb-r8a7790", .data = (void *)RCAR_GEN2 }, { .compatible = "renesas,etheravb-r8a7794", .data = (void *)RCAR_GEN2 }, @@ -2174,13 +2184,6 @@ static int ravb_probe(struct platform_device *pdev) eth_hw_addr_random(ndev); } - /* MDIO bus init */ - error = ravb_mdio_init(priv); - if (error) { - dev_err(&pdev->dev, "failed to initialize MDIO\n"); - goto out_dma_free; - } - netif_napi_add(ndev, &priv->napi[RAVB_BE], ravb_poll, 64); netif_napi_add(ndev, &priv->napi[RAVB_NC], ravb_poll, 64); @@ -2202,8 +2205,6 @@ static int ravb_probe(struct platform_device *pdev) out_napi_del: netif_napi_del(&priv->napi[RAVB_NC]); netif_napi_del(&priv->napi[RAVB_BE]); - ravb_mdio_release(priv); -out_dma_free: dma_free_coherent(ndev->dev.parent, priv->desc_bat_size, priv->desc_bat, priv->desc_bat_dma); @@ -2235,7 +2236,6 @@ static int ravb_remove(struct platform_device *pdev) unregister_netdev(ndev); netif_napi_del(&priv->napi[RAVB_NC]); netif_napi_del(&priv->napi[RAVB_BE]); - ravb_mdio_release(priv); pm_runtime_disable(&pdev->dev); free_netdev(ndev); platform_set_drvdata(pdev, NULL); From 3106ecb43a05dc3e009779764b9da245a5d082de Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 21 Aug 2020 14:59:38 +0800 Subject: [PATCH 022/110] sctp: not disable bh in the whole sctp_get_port_local() With disabling bh in the whole sctp_get_port_local(), when snum == 0 and too many ports have been used, the do-while loop will take the cpu for a long time and cause cpu stuck: [ ] watchdog: BUG: soft lockup - CPU#11 stuck for 22s! [ ] RIP: 0010:native_queued_spin_lock_slowpath+0x4de/0x940 [ ] Call Trace: [ ] _raw_spin_lock+0xc1/0xd0 [ ] sctp_get_port_local+0x527/0x650 [sctp] [ ] sctp_do_bind+0x208/0x5e0 [sctp] [ ] sctp_autobind+0x165/0x1e0 [sctp] [ ] sctp_connect_new_asoc+0x355/0x480 [sctp] [ ] __sctp_connect+0x360/0xb10 [sctp] There's no need to disable bh in the whole function of sctp_get_port_local. So fix this cpu stuck by removing local_bh_disable() called at the beginning, and using spin_lock_bh() instead. The same thing was actually done for inet_csk_get_port() in Commit ea8add2b1903 ("tcp/dccp: better use of ephemeral ports in bind()"). Thanks to Marcelo for pointing the buggy code out. v1->v2: - use cond_resched() to yield cpu to other tasks if needed, as Eric noticed. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Ying Xu Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ec1fba1fbe71..836615f71a7d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8060,8 +8060,6 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) pr_debug("%s: begins, snum:%d\n", __func__, snum); - local_bh_disable(); - if (snum == 0) { /* Search for an available port. */ int low, high, remaining, index; @@ -8079,20 +8077,21 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) continue; index = sctp_phashfn(net, rover); head = &sctp_port_hashtable[index]; - spin_lock(&head->lock); + spin_lock_bh(&head->lock); sctp_for_each_hentry(pp, &head->chain) if ((pp->port == rover) && net_eq(net, pp->net)) goto next; break; next: - spin_unlock(&head->lock); + spin_unlock_bh(&head->lock); + cond_resched(); } while (--remaining > 0); /* Exhausted local port range during search? */ ret = 1; if (remaining <= 0) - goto fail; + return ret; /* OK, here is the one we will use. HEAD (the port * hash table list entry) is non-NULL and we hold it's @@ -8107,7 +8106,7 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) * port iterator, pp being NULL. */ head = &sctp_port_hashtable[sctp_phashfn(net, snum)]; - spin_lock(&head->lock); + spin_lock_bh(&head->lock); sctp_for_each_hentry(pp, &head->chain) { if ((pp->port == snum) && net_eq(pp->net, net)) goto pp_found; @@ -8207,10 +8206,7 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) ret = 0; fail_unlock: - spin_unlock(&head->lock); - -fail: - local_bh_enable(); + spin_unlock_bh(&head->lock); return ret; } From d3b990b7f327e2afa98006e7666fb8ada8ed8683 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 21 Aug 2020 16:34:52 -0400 Subject: [PATCH 023/110] netlabel: fix problems with mapping removal This patch fixes two main problems seen when removing NetLabel mappings: memory leaks and potentially extra audit noise. The memory leaks are caused by not properly free'ing the mapping's address selector struct when free'ing the entire entry as well as not properly cleaning up a temporary mapping entry when adding new address selectors to an existing entry. This patch fixes both these problems such that kmemleak reports no NetLabel associated leaks after running the SELinux test suite. The potentially extra audit noise was caused by the auditing code in netlbl_domhsh_remove_entry() being called regardless of the entry's validity. If another thread had already marked the entry as invalid, but not removed/free'd it from the list of mappings, then it was possible that an additional mapping removal audit record would be generated. This patch fixes this by returning early from the removal function when the entry was previously marked invalid. This change also had the side benefit of improving the code by decreasing the indentation level of large chunk of code by one (accounting for most of the diffstat). Fixes: 63c416887437 ("netlabel: Add network address selectors to the NetLabel/LSM domain mapping") Reported-by: Stephen Smalley Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- net/netlabel/netlabel_domainhash.c | 65 +++++++++++++++--------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index d07de2c0fbc7..f73a8382c275 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -85,6 +85,7 @@ static void netlbl_domhsh_free_entry(struct rcu_head *entry) kfree(netlbl_domhsh_addr6_entry(iter6)); } #endif /* IPv6 */ + kfree(ptr->def.addrsel); } kfree(ptr->domain); kfree(ptr); @@ -537,6 +538,8 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, goto add_return; } #endif /* IPv6 */ + /* cleanup the new entry since we've moved everything over */ + netlbl_domhsh_free_entry(&entry->rcu); } else ret_val = -EINVAL; @@ -580,6 +583,12 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, { int ret_val = 0; struct audit_buffer *audit_buf; + struct netlbl_af4list *iter4; + struct netlbl_domaddr4_map *map4; +#if IS_ENABLED(CONFIG_IPV6) + struct netlbl_af6list *iter6; + struct netlbl_domaddr6_map *map6; +#endif /* IPv6 */ if (entry == NULL) return -ENOENT; @@ -597,6 +606,9 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, ret_val = -ENOENT; spin_unlock(&netlbl_domhsh_lock); + if (ret_val) + return ret_val; + audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, @@ -606,40 +618,29 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, audit_log_end(audit_buf); } - if (ret_val == 0) { - struct netlbl_af4list *iter4; - struct netlbl_domaddr4_map *map4; -#if IS_ENABLED(CONFIG_IPV6) - struct netlbl_af6list *iter6; - struct netlbl_domaddr6_map *map6; -#endif /* IPv6 */ - - switch (entry->def.type) { - case NETLBL_NLTYPE_ADDRSELECT: - netlbl_af4list_foreach_rcu(iter4, - &entry->def.addrsel->list4) { - map4 = netlbl_domhsh_addr4_entry(iter4); - cipso_v4_doi_putdef(map4->def.cipso); - } -#if IS_ENABLED(CONFIG_IPV6) - netlbl_af6list_foreach_rcu(iter6, - &entry->def.addrsel->list6) { - map6 = netlbl_domhsh_addr6_entry(iter6); - calipso_doi_putdef(map6->def.calipso); - } -#endif /* IPv6 */ - break; - case NETLBL_NLTYPE_CIPSOV4: - cipso_v4_doi_putdef(entry->def.cipso); - break; -#if IS_ENABLED(CONFIG_IPV6) - case NETLBL_NLTYPE_CALIPSO: - calipso_doi_putdef(entry->def.calipso); - break; -#endif /* IPv6 */ + switch (entry->def.type) { + case NETLBL_NLTYPE_ADDRSELECT: + netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4) { + map4 = netlbl_domhsh_addr4_entry(iter4); + cipso_v4_doi_putdef(map4->def.cipso); } - call_rcu(&entry->rcu, netlbl_domhsh_free_entry); +#if IS_ENABLED(CONFIG_IPV6) + netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) { + map6 = netlbl_domhsh_addr6_entry(iter6); + calipso_doi_putdef(map6->def.calipso); + } +#endif /* IPv6 */ + break; + case NETLBL_NLTYPE_CIPSOV4: + cipso_v4_doi_putdef(entry->def.cipso); + break; +#if IS_ENABLED(CONFIG_IPV6) + case NETLBL_NLTYPE_CALIPSO: + calipso_doi_putdef(entry->def.calipso); + break; +#endif /* IPv6 */ } + call_rcu(&entry->rcu, netlbl_domhsh_free_entry); return ret_val; } From 1ee39c1448c4e0d480c5b390e2db1987561fb5c2 Mon Sep 17 00:00:00 2001 From: Xie He Date: Fri, 21 Aug 2020 14:26:59 -0700 Subject: [PATCH 024/110] drivers/net/wan/lapbether: Added needed_tailroom The underlying Ethernet device may request necessary tailroom to be allocated by setting needed_tailroom. This driver should also set needed_tailroom to request the tailroom needed by the underlying Ethernet device to be allocated. Cc: Willem de Bruijn Cc: Martin Schiller Signed-off-by: Xie He Signed-off-by: David S. Miller --- drivers/net/wan/lapbether.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index 1ea15f2123ed..cc297ea9c6ec 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -340,6 +340,7 @@ static int lapbeth_new_device(struct net_device *dev) */ ndev->needed_headroom = -1 + 3 + 2 + dev->hard_header_len + dev->needed_headroom; + ndev->needed_tailroom = dev->needed_tailroom; lapbeth = netdev_priv(ndev); lapbeth->axdev = ndev; From be769db2f95861cc8c7c8fedcc71a8c39b803b10 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 22 Aug 2020 08:23:29 +1000 Subject: [PATCH 025/110] net: Get rid of consume_skb when tracing is off The function consume_skb is only meaningful when tracing is enabled. This patch makes it conditional on CONFIG_TRACEPOINTS. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++++ net/core/skbuff.c | 2 ++ 2 files changed, 11 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 46881d902124..e8bca74857a3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1056,7 +1056,16 @@ void kfree_skb(struct sk_buff *skb); void kfree_skb_list(struct sk_buff *segs); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); + +#ifdef CONFIG_TRACEPOINTS void consume_skb(struct sk_buff *skb); +#else +static inline void consume_skb(struct sk_buff *skb) +{ + return kfree_skb(skb); +} +#endif + void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); extern struct kmem_cache *skbuff_head_cache; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e18184ffa9c3..6faf73d6a0f7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -820,6 +820,7 @@ void skb_tx_error(struct sk_buff *skb) } EXPORT_SYMBOL(skb_tx_error); +#ifdef CONFIG_TRACEPOINTS /** * consume_skb - free an skbuff * @skb: buffer to free @@ -837,6 +838,7 @@ void consume_skb(struct sk_buff *skb) __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); +#endif /** * consume_stateless_skb - free an skbuff, assuming it is stateless From 5978fac03ea3faf793dffeedcbe60dd8da673f89 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 21 Aug 2020 15:25:16 -0700 Subject: [PATCH 026/110] net: dsa: sja1105: Do not use address of compatible member in sja1105_check_device_id Clang warns: drivers/net/dsa/sja1105/sja1105_main.c:3418:38: warning: address of array 'match->compatible' will always evaluate to 'true' [-Wpointer-bool-conversion] for (match = sja1105_dt_ids; match->compatible; match++) { ~~~ ~~~~~~~^~~~~~~~~~ 1 warning generated. We should check the value of the first character in compatible to see if it is empty or not. This matches how the rest of the tree iterates over IDs. Fixes: 0b0e299720bb ("net: dsa: sja1105: use detected device id instead of DT one on mismatch") Link: https://github.com/ClangBuiltLinux/linux/issues/1139 Signed-off-by: Nathan Chancellor Acked-by: Florian Fainelli Acked-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index c3f6f124e5f0..5a28dfb36ec3 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -3415,7 +3415,7 @@ static int sja1105_check_device_id(struct sja1105_private *priv) sja1105_unpack(prod_id, &part_no, 19, 4, SJA1105_SIZE_DEVICE_ID); - for (match = sja1105_dt_ids; match->compatible; match++) { + for (match = sja1105_dt_ids; match->compatible[0]; match++) { const struct sja1105_info *info = match->data; /* Is what's been probed in our match table at all? */ From d0cac91817c81140957a9b3deb1581e4f908907d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 22 Aug 2020 13:11:21 -0700 Subject: [PATCH 027/110] MAINTAINERS: GENET: Add missing platform data file When commit b0ba512e25d7 ("net: bcmgenet: enable driver to work without a device tree") added include/linux/platform_data/bcmgenet.h, the file was not added to the GENET MAINTAINERS file section, add it now. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f0068bceeb61..2bfa6def184b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3580,6 +3580,7 @@ L: bcm-kernel-feedback-list@broadcom.com L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/broadcom/genet/ +F: include/linux/platform_data/bcmgenet.h BROADCOM IPROC ARM ARCHITECTURE M: Ray Jui From 9fac261c1eb5db61aad52b9071d6b569ffdabe55 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 22 Aug 2020 13:11:22 -0700 Subject: [PATCH 028/110] MAINTAINERS: B53: Add DT binding file When the binding was added with 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch"), it was not explicitly added to the B53 MAINTAINERS file section, add it now. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2bfa6def184b..8a99fd4f426a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3388,6 +3388,7 @@ M: Florian Fainelli L: netdev@vger.kernel.org L: openwrt-devel@lists.openwrt.org (subscribers-only) S: Supported +F: Documentation/devicetree/bindings/net/dsa/b53.txt F: drivers/net/dsa/b53/* F: include/linux/platform_data/b53.h From f69ccc563df0c2097eb9fdcf80d426a4458835f3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 22 Aug 2020 13:11:23 -0700 Subject: [PATCH 029/110] MAINTAINERS: GENET: Add DT binding file When the DT binding was added in aab5127d94e6 ("Documentation: add Device tree bindings for Broadcom GENET"), the file was not explicitly listed under the GENET MAINTAINERS section, do that now. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8a99fd4f426a..a523ab65f2d4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3580,6 +3580,7 @@ M: Florian Fainelli L: bcm-kernel-feedback-list@broadcom.com L: netdev@vger.kernel.org S: Supported +F: Documentation/devicetree/bindings/net/brcm,bcmgenet.txt F: drivers/net/ethernet/broadcom/genet/ F: include/linux/platform_data/bcmgenet.h From ccaab4d3df9879a3d4b930f160b229fa7c2a8477 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 22 Aug 2020 13:11:24 -0700 Subject: [PATCH 030/110] MAINTAINERS: GENET: Add UniMAC MDIO controller files In preparation for removing myself from the PHYLIB entry, add the UniMAC MDIO controller files (DT binding, driver and platform_data header) to the GENET entry. The UniMAC MDIO controller is essential to the GENET operation, therefore it makes sense to group them together. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a523ab65f2d4..31004daf11a9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3581,8 +3581,11 @@ L: bcm-kernel-feedback-list@broadcom.com L: netdev@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/net/brcm,bcmgenet.txt +F: Documentation/devicetree/bindings/net/brcm,unimac-mdio.txt F: drivers/net/ethernet/broadcom/genet/ +F: drivers/net/mdio/mdio-bcm-unimac.c F: include/linux/platform_data/bcmgenet.h +F: include/linux/platform_data/mdio-bcm-unimac.h BROADCOM IPROC ARM ARCHITECTURE M: Ray Jui From e063713c05bc22f11d9e9aac0bc846ca233b9f41 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 22 Aug 2020 13:11:25 -0700 Subject: [PATCH 031/110] MAINTAINERS: Add entry for Broadcom Ethernet PHY drivers Add an entry for the Broadcom Ethernet PHY drivers covering the BCM63xx, BCM7xxx, BCM87xx, BCM54140, BCM84881, the venerable broadcom.c driver and the companion library files. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 31004daf11a9..da2206d8c21e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3574,6 +3574,16 @@ L: bcm-kernel-feedback-list@broadcom.com S: Maintained F: drivers/phy/broadcom/phy-brcm-usb* +BROADCOM ETHERNET PHY DRIVERS +M: Florian Fainelli +L: bcm-kernel-feedback-list@broadcom.com +L: netdev@vger.kernel.org +S: Supported +F: Documentation/devicetree/bindings/net/broadcom-bcm87xx.txt +F: drivers/net/phy/bcm*.[ch] +F: drivers/net/phy/broadcom.c +F: include/linux/brcmphy.h + BROADCOM GENET ETHERNET DRIVER M: Doug Berger M: Florian Fainelli From 3ad1b1e16dbff695f430b7d7ac0b6e98c02065c2 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 22 Aug 2020 13:11:26 -0700 Subject: [PATCH 032/110] MAINTAINERS: Remove self from PHY LIBRARY My last significant achievements to the PHY library was ensuring we would have small bus factor by having Andrew and Heiner added. The world has moved on past 1G, but I have not, so let more competent maintainers take over. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index da2206d8c21e..f8b2991b45ee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6510,7 +6510,6 @@ F: net/bridge/ ETHERNET PHY LIBRARY M: Andrew Lunn -M: Florian Fainelli M: Heiner Kallweit R: Russell King L: netdev@vger.kernel.org From f97c04c316d8fea16dca449fdfbe101fbdfee6a2 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sun, 23 Aug 2020 15:23:43 +0800 Subject: [PATCH 033/110] NFC: st95hf: Fix memleak in st95hf_in_send_cmd When down_killable() fails, skb_resp should be freed just like when st95hf_spi_send() fails. Signed-off-by: Dinghao Liu Signed-off-by: David S. Miller --- drivers/nfc/st95hf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nfc/st95hf/core.c b/drivers/nfc/st95hf/core.c index 9642971e89ce..457854765983 100644 --- a/drivers/nfc/st95hf/core.c +++ b/drivers/nfc/st95hf/core.c @@ -966,7 +966,7 @@ static int st95hf_in_send_cmd(struct nfc_digital_dev *ddev, rc = down_killable(&stcontext->exchange_lock); if (rc) { WARN(1, "Semaphore is not found up in st95hf_in_send_cmd\n"); - return rc; + goto free_skb_resp; } rc = st95hf_spi_send(&stcontext->spicontext, skb->data, From e2d79cd8875fa8c3cc7defa98a8cc99a1ed0c62f Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sun, 23 Aug 2020 16:56:47 +0800 Subject: [PATCH 034/110] net: arc_emac: Fix memleak in arc_mdio_probe When devm_gpiod_get_optional() fails, bus should be freed just like when of_mdiobus_register() fails. Fixes: 1bddd96cba03d ("net: arc_emac: support the phy reset for emac driver") Signed-off-by: Dinghao Liu Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/arc/emac_mdio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/arc/emac_mdio.c b/drivers/net/ethernet/arc/emac_mdio.c index 0187dbf3b87d..54cdafdd067d 100644 --- a/drivers/net/ethernet/arc/emac_mdio.c +++ b/drivers/net/ethernet/arc/emac_mdio.c @@ -153,6 +153,7 @@ int arc_mdio_probe(struct arc_emac_priv *priv) if (IS_ERR(data->reset_gpio)) { error = PTR_ERR(data->reset_gpio); dev_err(priv->dev, "Failed to request gpio: %d\n", error); + mdiobus_free(bus); return error; } From 15ac5cdafb9202424206dc5bd376437a358963f9 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sun, 23 Aug 2020 19:29:35 +0800 Subject: [PATCH 035/110] firestream: Fix memleak in fs_open When make_rate() fails, vcc should be freed just like other error paths in fs_open(). Signed-off-by: Dinghao Liu Signed-off-by: David S. Miller --- drivers/atm/firestream.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 2ca9ec802734..510250cf5c87 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -998,6 +998,7 @@ static int fs_open(struct atm_vcc *atm_vcc) error = make_rate (pcr, r, &tmc0, NULL); if (error) { kfree(tc); + kfree(vcc); return error; } } From 4341b7d9164093c14edc686deac6a9f36eb661b5 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Sun, 23 Aug 2020 14:18:36 +0200 Subject: [PATCH 036/110] dt-bindings: net: dsa: Fix typo Fix spelling mistake documenation -> documentation. Fixes: 5a18bb14c0f7 ("dt-bindings: net: dsa: Let dsa.txt refer to dsa.yaml") Signed-off-by: Kurt Kanzenbach Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/dsa.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt index bf7328aba330..dab208b5c7c7 100644 --- a/Documentation/devicetree/bindings/net/dsa/dsa.txt +++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt @@ -1,4 +1,4 @@ Distributed Switch Architecture Device Tree Bindings ---------------------------------------------------- -See Documentation/devicetree/bindings/net/dsa/dsa.yaml for the documenation. +See Documentation/devicetree/bindings/net/dsa/dsa.yaml for the documentation. From 59ebb4305c432b6ca34fd9704c9294c1f16e5847 Mon Sep 17 00:00:00 2001 From: Sumera Priyadarsini Date: Sun, 23 Aug 2020 19:22:45 +0530 Subject: [PATCH 037/110] net: ocelot: Add of_node_put() before return statement Every iteration of for_each_available_child_of_node() decrements the reference count of the previous node, however when control is transferred from the middle of the loop, as in the case of a return or break or goto, there is no decrement thus ultimately resulting in a memory leak. Fix a potential memory leak in felix.c by inserting of_node_put() before the return statement. Issue found with Coccinelle. Signed-off-by: Sumera Priyadarsini Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index c69d9592a2b7..04bfa6e465ff 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -400,6 +400,7 @@ static int felix_parse_ports_node(struct felix *felix, if (err < 0) { dev_err(dev, "Unsupported PHY mode %s on port %d\n", phy_modes(phy_mode), port); + of_node_put(child); return err; } From 966b8266a4629a84042f8c237d767d133c456a10 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 24 Aug 2020 16:18:51 +0100 Subject: [PATCH 038/110] sfc: fix boolreturn.cocci warning and rename function check_fcs() was returning bool as 0/1, which was a sign that the sense of the function was unclear: false was good, which doesn't really match a name like 'check_$thing'. So rename it to ef100_has_fcs_error(), and use proper booleans in the return statements. Reported-by: kernel test robot Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef100_rx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef100_rx.c b/drivers/net/ethernet/sfc/ef100_rx.c index 012925e878f4..85207acf7dee 100644 --- a/drivers/net/ethernet/sfc/ef100_rx.c +++ b/drivers/net/ethernet/sfc/ef100_rx.c @@ -36,7 +36,7 @@ bool ef100_rx_buf_hash_valid(const u8 *prefix) return PREFIX_FIELD(prefix, RSS_HASH_VALID); } -static bool check_fcs(struct efx_channel *channel, u32 *prefix) +static bool ef100_has_fcs_error(struct efx_channel *channel, u32 *prefix) { u16 rxclass; u8 l2status; @@ -46,11 +46,11 @@ static bool check_fcs(struct efx_channel *channel, u32 *prefix) if (likely(l2status == ESE_GZ_RH_HCLASS_L2_STATUS_OK)) /* Everything is ok */ - return 0; + return false; if (l2status == ESE_GZ_RH_HCLASS_L2_STATUS_FCS_ERR) channel->n_rx_eth_crc_err++; - return 1; + return true; } void __ef100_rx_packet(struct efx_channel *channel) @@ -63,7 +63,7 @@ void __ef100_rx_packet(struct efx_channel *channel) prefix = (u32 *)(eh - ESE_GZ_RX_PKT_PREFIX_LEN); - if (check_fcs(channel, prefix) && + if (ef100_has_fcs_error(channel, prefix) && unlikely(!(efx->net_dev->features & NETIF_F_RXALL))) goto out; From b474959d5afda6e341a02c85f9595d85d39189ae Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 12:10:54 -0700 Subject: [PATCH 039/110] bpf: Fix a buffer out-of-bound access when filling raw_tp link_info Commit f2e10bff16a0 ("bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link") added link query for raw_tp. One of fields in link_info is to fill a user buffer with tp_name. The Scurrent checking only declares "ulen && !ubuf" as invalid. So "!ulen && ubuf" will be valid. Later on, we do "copy_to_user(ubuf, tp_name, ulen - 1)" which may overwrite user memory incorrectly. This patch fixed the problem by disallowing "!ulen && ubuf" case as well. Fixes: f2e10bff16a0 ("bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200821191054.714731-1-yhs@fb.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 86299a292214..ac6c784c0576 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2634,7 +2634,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, u32 ulen = info->raw_tracepoint.tp_name_len; size_t tp_len = strlen(tp_name); - if (ulen && !ubuf) + if (!ulen ^ !ubuf) return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; From 2b10af318ad305b8e56f1f7ad78ea3ba20aadc01 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 24 Aug 2020 13:57:20 +0200 Subject: [PATCH 040/110] selftests/bpf: Fix test_progs-flavor run getting number of tests Commit 643e7233aa94 ("selftests/bpf: Test_progs option for getting number of tests") introduced ability to getting number of tests, which is targeted towards scripting. As demonstrate in the commit the number can be use as a shell variable for further scripting. The test_progs program support "flavor", which is detected by the binary have a "-flavor" in the executable name. One example is test_progs-no_alu32, which load bpf-progs compiled with disabled alu32, located in dir 'no_alu32/'. The problem is that invoking a "flavor" binary prints to stdout e.g.: "Switching to flavor 'no_alu32' subdirectory..." Thus, intermixing with the number of tests, making it unusable for scripting. Fix the issue by only printing "flavor" info when verbose -v option is used. Fixes: 643e7233aa94 ("selftests/bpf: Test_progs option for getting number of tests") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159827024012.923543.7104106594870150597.stgit@firesoul --- tools/testing/selftests/bpf/test_progs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index b1e4dadacd9b..22943b58d752 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -618,7 +618,9 @@ int cd_flavor_subdir(const char *exec_name) if (!flavor) return 0; flavor++; - fprintf(stdout, "Switching to flavor '%s' subdirectory...\n", flavor); + if (env.verbosity > VERBOSE_NONE) + fprintf(stdout, "Switching to flavor '%s' subdirectory...\n", flavor); + return chdir(flavor); } From 7787b6fc938e16aa418613c4a765c1dbb268ed9f Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 24 Aug 2020 16:20:47 +0200 Subject: [PATCH 041/110] bpf, sysctl: Let bpf_stats_handler take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of bpf_stats_handler to match ctl_table.proc_handler which fixes the following sparse warning: kernel/sysctl.c:226:49: warning: incorrect type in argument 3 (different address spaces) kernel/sysctl.c:226:49: expected void * kernel/sysctl.c:226:49: got void [noderef] __user *buffer kernel/sysctl.c:2640:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces)) kernel/sysctl.c:2640:35: expected int ( [usertype] *proc_handler )( ... ) kernel/sysctl.c:2640:35: got int ( * )( ... ) Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Tobias Klauser Signed-off-by: Alexei Starovoitov Cc: Christoph Hellwig Link: https://lore.kernel.org/bpf/20200824142047.22043-1-tklauser@distanz.ch --- kernel/sysctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 287862f91717..09e70ee2332e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -204,8 +204,7 @@ static int max_extfrag_threshold = 1000; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int bpf_stats_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static int saved_val; From b274e47d9e3f4dcd4ad4028a316ec22dc4533ac7 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 25 Aug 2020 14:59:40 +0200 Subject: [PATCH 042/110] gtp: add GTPA_LINK info to msg sent to userspace During a dump, this attribute is essential, it enables the userspace to know on which interface the context is linked to. Fixes: 459aa660eb1d ("gtp: add initial driver for datapath of GPRS Tunneling Protocol (GTP-U)") Signed-off-by: Nicolas Dichtel Tested-by: Gabriel Ganne Signed-off-by: David S. Miller --- drivers/net/gtp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 21640a035d7d..8e47d0112e5d 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1179,6 +1179,7 @@ static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, goto nlmsg_failure; if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version) || + nla_put_u32(skb, GTPA_LINK, pctx->dev->ifindex) || nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer_addr_ip4.s_addr) || nla_put_be32(skb, GTPA_MS_ADDRESS, pctx->ms_addr_ip4.s_addr)) goto nla_put_failure; From 8e4efd4706f77d76c99f78f8859e1d61fae5142a Mon Sep 17 00:00:00 2001 From: Sumera Priyadarsini Date: Tue, 25 Aug 2020 01:33:11 +0530 Subject: [PATCH 043/110] net: dsa: mt7530: Add of_node_put() before break and return statements Every iteration of for_each_child_of_node() decrements the reference count of the previous node, however when control is transferred from the middle of the loop, as in the case of a return or break or goto, there is no decrement thus ultimately resulting in a memory leak. Fix a potential memory leak in mt7530.c by inserting of_node_put() before the break and return statements. Issue found with Coccinelle. Signed-off-by: Sumera Priyadarsini Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 8dcb8a49ab67..4b4701c69fe1 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1326,14 +1326,17 @@ mt7530_setup(struct dsa_switch *ds) if (phy_node->parent == priv->dev->of_node->parent) { ret = of_get_phy_mode(mac_np, &interface); - if (ret && ret != -ENODEV) + if (ret && ret != -ENODEV) { + of_node_put(mac_np); return ret; + } id = of_mdio_parse_addr(ds->dev, phy_node); if (id == 0) priv->p5_intf_sel = P5_INTF_SEL_PHY_P0; if (id == 4) priv->p5_intf_sel = P5_INTF_SEL_PHY_P4; } + of_node_put(mac_np); of_node_put(phy_node); break; } From e104684108fc8b846dbca889f28257a65a956874 Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Mon, 24 Aug 2020 18:08:06 -0400 Subject: [PATCH 044/110] net: caif: fix error code handling cfpkt_peek_head return 0 and 1, caller is checking error using <0 Signed-off-by: Tong Zhang Signed-off-by: David S. Miller --- net/caif/cfrfml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index ce2767e9cec6..7b0af33bdb97 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -116,7 +116,7 @@ static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt) if (segmented) { if (rfml->incomplete_frm == NULL) { /* Initial Segment */ - if (cfpkt_peek_head(pkt, rfml->seghead, 6) < 0) + if (cfpkt_peek_head(pkt, rfml->seghead, 6) != 0) goto out; rfml->pdu_size = get_unaligned_le16(rfml->seghead+4); @@ -233,7 +233,7 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt) if (cfpkt_getlen(pkt) > rfml->fragment_size + RFM_HEAD_SIZE) err = cfpkt_peek_head(pkt, head, 6); - if (err < 0) + if (err != 0) goto out; while (cfpkt_getlen(frontpkt) > rfml->fragment_size + RFM_HEAD_SIZE) { From 99d469fc64d06f0c81c0fe2a1c01645a67e0cbf0 Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Mon, 24 Aug 2020 11:10:52 -0400 Subject: [PATCH 045/110] net: ethernet: ti: cpsw: fix clean up of vlan mc entries for host port To flush the vid + mc entries from ALE, which is required when a VLAN interface is removed, driver needs to call cpsw_ale_flush_multicast() with ALE_PORT_HOST for port mask as these entries are added only for host port. Without this, these entries remain in the ALE table even after removing the VLAN interface. cpsw_ale_flush_multicast() calls cpsw_ale_flush_mcast which expects a port mask to do the job. Fixes: 15180eca569b ("net: ethernet: ti: cpsw: fix vlan mcast") Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 9b17bbbe102f..4a65edc5a375 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1116,7 +1116,7 @@ static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev, HOST_PORT_NUM, ALE_VLAN, vid); ret |= cpsw_ale_del_mcast(cpsw->ale, priv->ndev->broadcast, 0, ALE_VLAN, vid); - ret |= cpsw_ale_flush_multicast(cpsw->ale, 0, vid); + ret |= cpsw_ale_flush_multicast(cpsw->ale, ALE_PORT_HOST, vid); err: pm_runtime_put(cpsw->dev); return ret; From 2c6500e82e5190b038f0b79f85a20da55bdd4b86 Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Mon, 24 Aug 2020 11:10:53 -0400 Subject: [PATCH 046/110] net: ethernet: ti: cpsw_new: fix clean up of vlan mc entries for host port To flush the vid + mc entries from ALE, which is required when a VLAN interface is removed, driver needs to call cpsw_ale_flush_multicast() with ALE_PORT_HOST for port mask as these entries are added only for host port. Without this, these entries remain in the ALE table even after removing the VLAN interface. cpsw_ale_flush_multicast() calls cpsw_ale_flush_mcast which expects a port mask to do the job. Fixes: ed3525eda4c4 ("net: ethernet: ti: introduce cpsw switchdev based driver part 1 - dual-emac") Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw_new.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 1247d35d42ef..8d0a2bc7128d 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -1044,7 +1044,7 @@ static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev, HOST_PORT_NUM, ALE_VLAN, vid); cpsw_ale_del_mcast(cpsw->ale, priv->ndev->broadcast, 0, ALE_VLAN, vid); - cpsw_ale_flush_multicast(cpsw->ale, 0, vid); + cpsw_ale_flush_multicast(cpsw->ale, ALE_PORT_HOST, vid); err: pm_runtime_put(cpsw->dev); return ret; From 9f13457377907fa253aef560e1a37e1ca4197f9b Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Tue, 25 Aug 2020 13:26:41 -0400 Subject: [PATCH 047/110] ibmvnic fix NULL tx_pools and rx_tools issue at do_reset At the time of do_rest, ibmvnic tries to re-initalize the tx_pools and rx_pools to avoid re-allocating the long term buffer. However there is a window inside do_reset that the tx_pools and rx_pools were freed before re-initialized making it possible to deference null pointers. This patch fix this issue by always check the tx_pool and rx_pool are not NULL after ibmvnic_login. If so, re-allocating the pools. This will avoid getting into calling reset_tx/rx_pools with NULL adapter tx_pools/rx_pools pointer. Also add null pointer check in reset_tx_pools and reset_rx_pools to safe handle NULL pointer case. Signed-off-by: Mingming Cao Signed-off-by: Dany Madden Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5afb3c9c52d2..d3a774331afc 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -479,6 +479,9 @@ static int reset_rx_pools(struct ibmvnic_adapter *adapter) int i, j, rc; u64 *size_array; + if (!adapter->rx_pool) + return -1; + size_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + be32_to_cpu(adapter->login_rsp_buf->off_rxadd_buff_size)); @@ -649,6 +652,9 @@ static int reset_tx_pools(struct ibmvnic_adapter *adapter) int tx_scrqs; int i, rc; + if (!adapter->tx_pool) + return -1; + tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs); for (i = 0; i < tx_scrqs; i++) { rc = reset_one_tx_pool(adapter, &adapter->tso_pool[i]); @@ -2011,7 +2017,10 @@ static int do_reset(struct ibmvnic_adapter *adapter, adapter->req_rx_add_entries_per_subcrq != old_num_rx_slots || adapter->req_tx_entries_per_subcrq != - old_num_tx_slots) { + old_num_tx_slots || + !adapter->rx_pool || + !adapter->tso_pool || + !adapter->tx_pool) { release_rx_pools(adapter); release_tx_pools(adapter); release_napi(adapter); @@ -2024,10 +2033,14 @@ static int do_reset(struct ibmvnic_adapter *adapter, } else { rc = reset_tx_pools(adapter); if (rc) + netdev_dbg(adapter->netdev, "reset tx pools failed (%d)\n", + rc); goto out; rc = reset_rx_pools(adapter); if (rc) + netdev_dbg(adapter->netdev, "reset rx pools failed (%d)\n", + rc); goto out; } ibmvnic_disable_irqs(adapter); From 2e1ec861a605d1d116f8c774f45e9f6a2b593cbb Mon Sep 17 00:00:00 2001 From: Daniel Gorsulowski Date: Wed, 26 Aug 2020 07:00:14 +0200 Subject: [PATCH 048/110] net: dp83869: Fix RGMII internal delay configuration The RGMII control register at 0x32 indicates the states for the bits RGMII_TX_CLK_DELAY and RGMII_RX_CLK_DELAY as follows: RGMII Transmit/Receive Clock Delay 0x0 = RGMII transmit clock is shifted with respect to transmit/receive data. 0x1 = RGMII transmit clock is aligned with respect to transmit/receive data. This commit fixes the inversed behavior of these bits Fixes: 736b25afe284 ("net: dp83869: Add RGMII internal delay configuration") Signed-off-by: Daniel Gorsulowski Acked-by: Dan Murphy Signed-off-by: David S. Miller --- drivers/net/phy/dp83869.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c index 58103152c601..6b98d74b5102 100644 --- a/drivers/net/phy/dp83869.c +++ b/drivers/net/phy/dp83869.c @@ -427,18 +427,18 @@ static int dp83869_config_init(struct phy_device *phydev) return ret; val = phy_read_mmd(phydev, DP83869_DEVADDR, DP83869_RGMIICTL); - val &= ~(DP83869_RGMII_TX_CLK_DELAY_EN | - DP83869_RGMII_RX_CLK_DELAY_EN); + val |= (DP83869_RGMII_TX_CLK_DELAY_EN | + DP83869_RGMII_RX_CLK_DELAY_EN); if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID) - val |= (DP83869_RGMII_TX_CLK_DELAY_EN | - DP83869_RGMII_RX_CLK_DELAY_EN); + val &= ~(DP83869_RGMII_TX_CLK_DELAY_EN | + DP83869_RGMII_RX_CLK_DELAY_EN); if (phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID) - val |= DP83869_RGMII_TX_CLK_DELAY_EN; + val &= ~DP83869_RGMII_TX_CLK_DELAY_EN; if (phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID) - val |= DP83869_RGMII_RX_CLK_DELAY_EN; + val &= ~DP83869_RGMII_RX_CLK_DELAY_EN; ret = phy_write_mmd(phydev, DP83869_DEVADDR, DP83869_RGMIICTL, val); From c1c2d77408022a398a1a7c51cf20488c922629de Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Wed, 26 Aug 2020 01:08:32 -0400 Subject: [PATCH 049/110] bnxt_en: Don't query FW when netif_running() is false. In rare conditions like two stage OS installation, the ethtool's get_channels function may be called when the device is in D3 state, leading to uncorrectable PCI error. Check netif_running() first before making any query to FW which involves writing to BAR. Fixes: db4723b3cd2d ("bnxt_en: Check max_tx_scheduler_inputs value from firmware.") Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 64da654f1038..3890c1aaeed6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -806,7 +806,7 @@ static void bnxt_get_channels(struct net_device *dev, int max_tx_sch_inputs; /* Get the most up-to-date max_tx_sch_inputs. */ - if (BNXT_NEW_RM(bp)) + if (netif_running(dev) && BNXT_NEW_RM(bp)) bnxt_hwrm_func_resc_qcaps(bp, false); max_tx_sch_inputs = hw_resc->max_tx_sch_inputs; From dbbfa96ad920c50d58bcaefa57f5f33ceef9d00e Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Wed, 26 Aug 2020 01:08:33 -0400 Subject: [PATCH 050/110] bnxt_en: Check for zero dir entries in NVRAM. If firmware goes into unstable state, HWRM_NVM_GET_DIR_INFO firmware command may return zero dir entries. Return error in such case to avoid zero length dma buffer request. Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 3890c1aaeed6..5d1a0cd0b53e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2323,6 +2323,9 @@ static int bnxt_get_nvram_directory(struct net_device *dev, u32 len, u8 *data) if (rc != 0) return rc; + if (!dir_entries || !entry_length) + return -EIO; + /* Insert 2 bytes of directory info (count and size of entries) */ if (len < 2) return -EINVAL; From 7de651490c27ebc5edb5c7224c368bd0cd5b3862 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 26 Aug 2020 01:08:34 -0400 Subject: [PATCH 051/110] bnxt_en: Fix ethtool -S statitics with XDP or TCs enabled. We are returning the wrong count for ETH_SS_STATS in get_sset_count() when XDP or TCs are enabled. In a recent commit, we got rid of irrelevant counters when the ring is RX only or TX only, but we did not make the proper adjustments for the count. As a result, when we have XDP or TCs enabled, we are returning an excess count because some of the rings are TX only. This causes ethtool -S to display extra counters with no counter names. Fix bnxt_get_num_ring_stats() by not assuming that all rings will always have RX and TX counters in combined mode. Fixes: 125592fbf467 ("bnxt_en: show only relevant ethtool stats for a TX or RX ring") Reviewed-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 5d1a0cd0b53e..1bc5130458bf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -472,20 +472,13 @@ static int bnxt_get_num_tpa_ring_stats(struct bnxt *bp) static int bnxt_get_num_ring_stats(struct bnxt *bp) { int rx, tx, cmn; - bool sh = false; - - if (bp->flags & BNXT_FLAG_SHARED_RINGS) - sh = true; rx = NUM_RING_RX_HW_STATS + NUM_RING_RX_SW_STATS + bnxt_get_num_tpa_ring_stats(bp); tx = NUM_RING_TX_HW_STATS; cmn = NUM_RING_CMN_SW_STATS; - if (sh) - return (rx + tx + cmn) * bp->cp_nr_rings; - else - return rx * bp->rx_nr_rings + tx * bp->tx_nr_rings + - cmn * bp->cp_nr_rings; + return rx * bp->rx_nr_rings + tx * bp->tx_nr_rings + + cmn * bp->cp_nr_rings; } static int bnxt_get_num_stats(struct bnxt *bp) From df3875ec550396974b1d8a518bd120d034738236 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Wed, 26 Aug 2020 01:08:35 -0400 Subject: [PATCH 052/110] bnxt_en: Fix PCI AER error recovery flow When a PCI error is detected the PCI state could be corrupt, save the PCI state after initialization and restore it after the slot reset. Fixes: 6316ea6db93d ("bnxt_en: Enable AER support.") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 31fb5a28e1c4..4389a74c0d53 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12339,6 +12339,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) (long)pci_resource_start(pdev, 0), dev->dev_addr); pcie_print_link_status(pdev); + pci_save_state(pdev); return 0; init_err_cleanup: @@ -12536,6 +12537,8 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev) "Cannot re-enable PCI device after reset.\n"); } else { pci_set_master(pdev); + pci_restore_state(pdev); + pci_save_state(pdev); err = bnxt_hwrm_func_reset(bp); if (!err) { From b148bb238c02f0c7797efed026e9bba5892d2172 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 26 Aug 2020 01:08:36 -0400 Subject: [PATCH 053/110] bnxt_en: Fix possible crash in bnxt_fw_reset_task(). bnxt_fw_reset_task() is run from a delayed workqueue. The current code is not cancelling the workqueue in the driver's .remove() method and it can potentially crash if the device is removed with the workqueue still pending. The fix is to clear the BNXT_STATE_IN_FW_RESET flag and then cancel the delayed workqueue in bnxt_remove_one(). bnxt_queue_fw_reset_work() also needs to check that this flag is set before scheduling. This will guarantee that no rescheduling will be done after it is cancelled. Fixes: 230d1f0de754 ("bnxt_en: Handle firmware reset.") Reviewed-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4389a74c0d53..d6f35929020c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1141,6 +1141,9 @@ static int bnxt_discard_rx(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, static void bnxt_queue_fw_reset_work(struct bnxt *bp, unsigned long delay) { + if (!(test_bit(BNXT_STATE_IN_FW_RESET, &bp->state))) + return; + if (BNXT_PF(bp)) queue_delayed_work(bnxt_pf_wq, &bp->fw_reset_task, delay); else @@ -1157,10 +1160,12 @@ static void bnxt_queue_sp_work(struct bnxt *bp) static void bnxt_cancel_sp_work(struct bnxt *bp) { - if (BNXT_PF(bp)) + if (BNXT_PF(bp)) { flush_workqueue(bnxt_pf_wq); - else + } else { cancel_work_sync(&bp->sp_task); + cancel_delayed_work_sync(&bp->fw_reset_task); + } } static void bnxt_sched_reset(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) @@ -11761,6 +11766,7 @@ static void bnxt_remove_one(struct pci_dev *pdev) unregister_netdev(dev); bnxt_dl_unregister(bp); bnxt_shutdown_tc(bp); + clear_bit(BNXT_STATE_IN_FW_RESET, &bp->state); bnxt_cancel_sp_work(bp); bp->sp_event = 0; From 12cce90b934bf2b0ed9c339b4d5503e69954351a Mon Sep 17 00:00:00 2001 From: Edwin Peer Date: Wed, 26 Aug 2020 01:08:37 -0400 Subject: [PATCH 054/110] bnxt_en: fix HWRM error when querying VF temperature Firmware returns RESOURCE_ACCESS_DENIED for HWRM_TEMP_MONITORY_QUERY for VFs. This produces unpleasing error messages in the log when temp1_input is queried via the hwmon sysfs interface from a VF. The error is harmless and expected, so silence it and return unknown as the value. Since the device temperature is not particularly sensitive information, provide flexibility to change this policy in future by silencing the error rather than avoiding the HWRM call entirely for VFs. Fixes: cde49a42a9bb ("bnxt_en: Add hwmon sysfs support to read temperature") Cc: Marc Smith Reported-by: Marc Smith Signed-off-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d6f35929020c..a23ccb0392a6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9289,16 +9289,19 @@ static ssize_t bnxt_show_temp(struct device *dev, struct hwrm_temp_monitor_query_input req = {0}; struct hwrm_temp_monitor_query_output *resp; struct bnxt *bp = dev_get_drvdata(dev); - u32 temp = 0; + u32 len = 0; resp = bp->hwrm_cmd_resp_addr; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_TEMP_MONITOR_QUERY, -1, -1); mutex_lock(&bp->hwrm_cmd_lock); - if (!_hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT)) - temp = resp->temp * 1000; /* display millidegree */ + if (!_hwrm_send_message_silent(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT)) + len = sprintf(buf, "%u\n", resp->temp * 1000); /* display millidegree */ mutex_unlock(&bp->hwrm_cmd_lock); - return sprintf(buf, "%u\n", temp); + if (len) + return len; + + return sprintf(buf, "unknown\n"); } static SENSOR_DEVICE_ATTR(temp1_input, 0444, bnxt_show_temp, NULL, 0); From 5fa65524f6e0b9528c6ceac3f33f7e8f0c3a084a Mon Sep 17 00:00:00 2001 From: Edwin Peer Date: Wed, 26 Aug 2020 01:08:38 -0400 Subject: [PATCH 055/110] bnxt_en: init RSS table for Minimal-Static VF reservation There are no VF rings available during probe when the device is configured using the Minimal-Static reservation strategy. In this case, the RSS indirection table can only be initialized later, during bnxt_open_nic(). However, this was not happening because the rings will already have been reserved via bnxt_init_dflt_ring_mode(), causing bnxt_need_reserve_rings() to return false in bnxt_reserve_rings() and bypass the RSS table init. Solve this by pushing the call to bnxt_set_dflt_rss_indir_tbl() into __bnxt_reserve_rings(), which is common to both paths and is called whenever ring configuration is changed. After doing this, the RSS table init that must be called from bnxt_init_one() happens implicitly via bnxt_set_default_rings(), necessitating doing the allocation earlier in order to avoid a null pointer dereference. Fixes: bd3191b5d87d ("bnxt_en: Implement ethtool -X to set indirection table.") Signed-off-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a23ccb0392a6..27df5721353d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6219,6 +6219,9 @@ static int __bnxt_reserve_rings(struct bnxt *bp) if (!tx || !rx || !cp || !grp || !vnic || !stat) return -ENOMEM; + if (!netif_is_rxfh_configured(bp->dev)) + bnxt_set_dflt_rss_indir_tbl(bp); + return rc; } @@ -8500,9 +8503,6 @@ int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init) rc = bnxt_init_int_mode(bp); bnxt_ulp_irq_restart(bp, rc); } - if (!netif_is_rxfh_configured(bp->dev)) - bnxt_set_dflt_rss_indir_tbl(bp); - if (rc) { netdev_err(bp->dev, "ring reservation/IRQ init failure rc: %d\n", rc); return rc; @@ -12209,6 +12209,10 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (BNXT_CHIP_P5(bp)) bp->flags |= BNXT_FLAG_CHIP_P5; + rc = bnxt_alloc_rss_indir_tbl(bp); + if (rc) + goto init_err_pci_clean; + rc = bnxt_fw_init_one_p2(bp); if (rc) goto init_err_pci_clean; @@ -12313,11 +12317,6 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) */ bp->tx_nr_rings_per_tc = bp->tx_nr_rings; - rc = bnxt_alloc_rss_indir_tbl(bp); - if (rc) - goto init_err_pci_clean; - bnxt_set_dflt_rss_indir_tbl(bp); - if (BNXT_PF(bp)) { if (!bnxt_pf_wq) { bnxt_pf_wq = From b43b9f53fbb06faa4f2fcdbf235db3289026e2e4 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 26 Aug 2020 01:08:39 -0400 Subject: [PATCH 056/110] bnxt_en: Setup default RSS map in all scenarios. The recent changes to support user-defined RSS map assume that RX rings are always reserved and the default RSS map is set after the RX rings are successfully reserved. If the firmware spec is older than 1.6.1, no ring reservations are required and the default RSS map is not setup at all. In another scenario where the fw Resource Manager is older, RX rings are not reserved and we also end up with no valid RSS map. Fix both issues in bnxt_need_reserve_rings(). In both scenarios described above, we don't need to reserve RX rings so we need to call this new function bnxt_check_rss_map_no_rmgr() to setup the default RSS map when needed. Without valid RSS map, the NIC won't receive packets properly. Fixes: 1667cbf6a4eb ("bnxt_en: Add logical RSS indirection table structure.") Reviewed-by: Vasundhara Volam Reviewed-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 39 +++++++++++++++++------ 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 27df5721353d..316227136f5b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6107,6 +6107,21 @@ static int bnxt_get_func_stat_ctxs(struct bnxt *bp) return cp + ulp_stat; } +/* Check if a default RSS map needs to be setup. This function is only + * used on older firmware that does not require reserving RX rings. + */ +static void bnxt_check_rss_tbl_no_rmgr(struct bnxt *bp) +{ + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + + /* The RSS map is valid for RX rings set to resv_rx_rings */ + if (hw_resc->resv_rx_rings != bp->rx_nr_rings) { + hw_resc->resv_rx_rings = bp->rx_nr_rings; + if (!netif_is_rxfh_configured(bp->dev)) + bnxt_set_dflt_rss_indir_tbl(bp); + } +} + static bool bnxt_need_reserve_rings(struct bnxt *bp) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; @@ -6115,22 +6130,28 @@ static bool bnxt_need_reserve_rings(struct bnxt *bp) int rx = bp->rx_nr_rings, stat; int vnic = 1, grp = rx; - if (bp->hwrm_spec_code < 0x10601) - return false; - - if (hw_resc->resv_tx_rings != bp->tx_nr_rings) + if (hw_resc->resv_tx_rings != bp->tx_nr_rings && + bp->hwrm_spec_code >= 0x10601) return true; + /* Old firmware does not need RX ring reservations but we still + * need to setup a default RSS map when needed. With new firmware + * we go through RX ring reservations first and then set up the + * RSS map for the successfully reserved RX rings when needed. + */ + if (!BNXT_NEW_RM(bp)) { + bnxt_check_rss_tbl_no_rmgr(bp); + return false; + } if ((bp->flags & BNXT_FLAG_RFS) && !(bp->flags & BNXT_FLAG_CHIP_P5)) vnic = rx + 1; if (bp->flags & BNXT_FLAG_AGG_RINGS) rx <<= 1; stat = bnxt_get_func_stat_ctxs(bp); - if (BNXT_NEW_RM(bp) && - (hw_resc->resv_rx_rings != rx || hw_resc->resv_cp_rings != cp || - hw_resc->resv_vnics != vnic || hw_resc->resv_stat_ctxs != stat || - (hw_resc->resv_hw_ring_grps != grp && - !(bp->flags & BNXT_FLAG_CHIP_P5)))) + if (hw_resc->resv_rx_rings != rx || hw_resc->resv_cp_rings != cp || + hw_resc->resv_vnics != vnic || hw_resc->resv_stat_ctxs != stat || + (hw_resc->resv_hw_ring_grps != grp && + !(bp->flags & BNXT_FLAG_CHIP_P5))) return true; if ((bp->flags & BNXT_FLAG_CHIP_P5) && BNXT_PF(bp) && hw_resc->resv_irqs != nq) From a156998fc92d3859c8e820f1583f6d0541d643c3 Mon Sep 17 00:00:00 2001 From: Yi Li Date: Wed, 26 Aug 2020 13:11:50 +0800 Subject: [PATCH 057/110] net: hns3: Fix for geneve tx checksum bug when skb->encapsulation is 0, skb->ip_summed is CHECKSUM_PARTIAL and it is udp packet, which has a dest port as the IANA assigned. the hardware is expected to do the checksum offload, but the hardware will not do the checksum offload when udp dest port is 6081. This patch fixes it by doing the checksum in software. Reported-by: Li Bing Signed-off-by: Yi Li Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 87776ce3539b..7d83c45369c2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "hnae3.h" #include "hns3_enet.h" @@ -780,7 +781,7 @@ static int hns3_get_l4_protocol(struct sk_buff *skb, u8 *ol4_proto, * and it is udp packet, which has a dest port as the IANA assigned. * the hardware is expected to do the checksum offload, but the * hardware will not do the checksum offload when udp dest port is - * 4789. + * 4789 or 6081. */ static bool hns3_tunnel_csum_bug(struct sk_buff *skb) { @@ -789,7 +790,8 @@ static bool hns3_tunnel_csum_bug(struct sk_buff *skb) l4.hdr = skb_transport_header(skb); if (!(!skb->encapsulation && - l4.udp->dest == htons(IANA_VXLAN_UDP_PORT))) + (l4.udp->dest == htons(IANA_VXLAN_UDP_PORT) || + l4.udp->dest == htons(GENEVE_UDP_PORT)))) return false; skb_checksum_help(skb); From 5fd99b5d9950d6300467ded18ff4e44af0b4ae55 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 26 Aug 2020 14:52:31 +0800 Subject: [PATCH 058/110] net: cdc_ncm: Fix build error If USB_NET_CDC_NCM is y and USB_NET_CDCETHER is m, build fails: drivers/net/usb/cdc_ncm.o:(.rodata+0x1d8): undefined reference to `usbnet_cdc_update_filter' Select USB_NET_CDCETHER for USB_NET_CDC_NCM to fix this. Reported-by: Hulk Robot Fixes: e10dcb1b6ba7 ("net: cdc_ncm: hook into set_rx_mode to admit multicast traffic") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/usb/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index a7fbc3ccd29e..c7bcfca7d70b 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -252,6 +252,7 @@ config USB_NET_CDC_EEM config USB_NET_CDC_NCM tristate "CDC NCM support" depends on USB_USBNET + select USB_NET_CDCETHER default y help This driver provides support for CDC NCM (Network Control Model From 09e31cf0c528dac3358a081dc4e773d1b3de1bc9 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 25 Aug 2020 10:44:04 -0700 Subject: [PATCH 059/110] taprio: Fix using wrong queues in gate mask Since commit 9c66d1564676 ("taprio: Add support for hardware offloading") there's a bit of inconsistency when offloading schedules to the hardware: In software mode, the gate masks are specified in terms of traffic classes, so if say "sched-entry S 03 20000", it means that the traffic classes 0 and 1 are open for 20us; when taprio is offloaded to hardware, the gate masks are specified in terms of hardware queues. The idea here is to fix hardware offloading, so schedules in hardware and software mode have the same behavior. What's needed to do is to map traffic classes to queues when applying the offload to the driver. Fixes: 9c66d1564676 ("taprio: Add support for hardware offloading") Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index e981992634dd..fe53c1e38c7d 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1176,9 +1176,27 @@ static void taprio_offload_config_changed(struct taprio_sched *q) spin_unlock(&q->current_entry_lock); } -static void taprio_sched_to_offload(struct taprio_sched *q, +static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) +{ + u32 i, queue_mask = 0; + + for (i = 0; i < dev->num_tc; i++) { + u32 offset, count; + + if (!(tc_mask & BIT(i))) + continue; + + offset = dev->tc_to_txq[i].offset; + count = dev->tc_to_txq[i].count; + + queue_mask |= GENMASK(offset + count - 1, offset); + } + + return queue_mask; +} + +static void taprio_sched_to_offload(struct net_device *dev, struct sched_gate_list *sched, - const struct tc_mqprio_qopt *mqprio, struct tc_taprio_qopt_offload *offload) { struct sched_entry *entry; @@ -1193,7 +1211,8 @@ static void taprio_sched_to_offload(struct taprio_sched *q, e->command = entry->command; e->interval = entry->interval; - e->gate_mask = entry->gate_mask; + e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); + i++; } @@ -1201,7 +1220,6 @@ static void taprio_sched_to_offload(struct taprio_sched *q, } static int taprio_enable_offload(struct net_device *dev, - struct tc_mqprio_qopt *mqprio, struct taprio_sched *q, struct sched_gate_list *sched, struct netlink_ext_ack *extack) @@ -1223,7 +1241,7 @@ static int taprio_enable_offload(struct net_device *dev, return -ENOMEM; } offload->enable = 1; - taprio_sched_to_offload(q, sched, mqprio, offload); + taprio_sched_to_offload(dev, sched, offload); err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { @@ -1485,7 +1503,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, } if (FULL_OFFLOAD_IS_ENABLED(q->flags)) - err = taprio_enable_offload(dev, mqprio, q, new_admin, extack); + err = taprio_enable_offload(dev, q, new_admin, extack); else err = taprio_disable_offload(dev, q, extack); if (err) From 1cec170d458b1d18f6f1654ca84c0804a701c5ef Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 01:31:05 +0200 Subject: [PATCH 060/110] mptcp: free acked data before waiting for more memory After subflow lock is dropped, more wmem might have been made available. This fixes a deadlock in mptcp_connect.sh 'mmap' mode: wmem is exhausted. But as the mptcp socket holds on to already-acked data (for retransmit) no wakeup will occur. Using 'goto restart' calls mptcp_clean_una(sk) which will free pages that have been acked completely in the mean time. Fixes: fb529e62d3f3 ("mptcp: break and restart in case mptcp sndbuf is full") Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1aad411a0e46..8ccd4a151dcb 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -892,7 +892,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; } -wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); while (!sk_stream_memory_free(sk) || @@ -982,7 +981,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) */ mptcp_set_timeout(sk, ssk); release_sock(ssk); - goto wait_for_sndbuf; + goto restart; } } } From 91244d108441013b7367b3b4dcc6869998676473 Mon Sep 17 00:00:00 2001 From: Xie He Date: Tue, 25 Aug 2020 20:03:53 -0700 Subject: [PATCH 061/110] drivers/net/wan/lapbether: Set network_header before transmitting Set the skb's network_header before it is passed to the underlying Ethernet device for transmission. This patch fixes the following issue: When we use this driver with AF_PACKET sockets, there would be error messages of: protocol 0805 is buggy, dev (Ethernet interface name) printed in the system "dmesg" log. This is because skbs passed down to the Ethernet device for transmission don't have their network_header properly set, and the dev_queue_xmit_nit function in net/core/dev.c complains about this. Reason of setting the network_header to this place (at the end of the Ethernet header, and at the beginning of the Ethernet payload): Because when this driver receives an skb from the Ethernet device, the network_header is also set at this place. Cc: Martin Schiller Signed-off-by: Xie He Signed-off-by: David S. Miller --- drivers/net/wan/lapbether.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index cc297ea9c6ec..e61616b0b91c 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -210,6 +210,8 @@ static void lapbeth_data_transmit(struct net_device *ndev, struct sk_buff *skb) skb->dev = dev = lapbeth->ethdev; + skb_reset_network_header(skb); + dev_hard_header(skb, dev, ETH_P_DEC, bcast_addr, NULL, 0); dev_queue_xmit(skb); From 7f6f32bb7d3355cd78ebf1dece9a6ea7a0ca8158 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 26 Aug 2020 19:48:10 +0300 Subject: [PATCH 062/110] ipv4: Silence suspicious RCU usage warning fib_info_notify_update() is always called with RTNL held, but not from an RCU read-side critical section. This leads to the following warning [1] when the FIB table list is traversed with hlist_for_each_entry_rcu(), but without a proper lockdep expression. Since modification of the list is protected by RTNL, silence the warning by adding a lockdep expression which verifies RTNL is held. [1] ============================= WARNING: suspicious RCU usage 5.9.0-rc1-custom-14233-g2f26e122d62f #129 Not tainted ----------------------------- net/ipv4/fib_trie.c:2124 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by ip/834: #0: ffffffff85a3b6b0 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x49a/0xbd0 stack backtrace: CPU: 0 PID: 834 Comm: ip Not tainted 5.9.0-rc1-custom-14233-g2f26e122d62f #129 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x100/0x184 lockdep_rcu_suspicious+0x143/0x14d fib_info_notify_update+0x8d1/0xa60 __nexthop_replace_notify+0xd2/0x290 rtm_new_nexthop+0x35e2/0x5946 rtnetlink_rcv_msg+0x4f7/0xbd0 netlink_rcv_skb+0x17a/0x480 rtnetlink_rcv+0x22/0x30 netlink_unicast+0x5ae/0x890 netlink_sendmsg+0x98a/0xf40 ____sys_sendmsg+0x879/0xa00 ___sys_sendmsg+0x122/0x190 __sys_sendmsg+0x103/0x1d0 __x64_sys_sendmsg+0x7d/0xb0 do_syscall_64+0x32/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fde28c3be57 Code: 0c 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 RSP: 002b:00007ffc09330028 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fde28c3be57 RDX: 0000000000000000 RSI: 00007ffc09330090 RDI: 0000000000000003 RBP: 000000005f45f911 R08: 0000000000000001 R09: 00007ffc0933012c R10: 0000000000000076 R11: 0000000000000246 R12: 0000000000000001 R13: 00007ffc09330290 R14: 00007ffc09330eee R15: 00005610e48ed020 Fixes: 1bff1a0c9bbd ("ipv4: Add function to send route updates") Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index c89b46fec153..ffc5332f1390 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2121,7 +2121,8 @@ void fib_info_notify_update(struct net *net, struct nl_info *info) struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, head, tb_hlist) + hlist_for_each_entry_rcu(tb, head, tb_hlist, + lockdep_rtnl_is_held()) __fib_info_notify_update(net, tb, info); } } From 96e97bc07e90f175a8980a22827faf702ca4cb30 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Aug 2020 12:40:06 -0700 Subject: [PATCH 063/110] net: disable netpoll on fresh napis napi_disable() makes sure to set the NAPI_STATE_NPSVC bit to prevent netpoll from accessing rings before init is complete. However, the same is not done for fresh napi instances in netif_napi_add(), even though we expect NAPI instances to be added as disabled. This causes crashes during driver reconfiguration (enabling XDP, changing the channel count) - if there is any printk() after netif_napi_add() but before napi_enable(). To ensure memory ordering is correct we need to use RCU accessors. Reported-by: Rob Sherwood Fixes: 2d8bff12699a ("netpoll: Close race condition between poll_one_napi and napi_disable") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- net/core/netpoll.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index d42c9ea0c3c0..95ac7568f693 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6612,12 +6612,13 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, netdev_err_once(dev, "%s() called with weight %d\n", __func__, weight); napi->weight = weight; - list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); + set_bit(NAPI_STATE_NPSVC, &napi->state); + list_add_rcu(&napi->dev_list, &dev->napi_list); napi_hash_add(napi); } EXPORT_SYMBOL(netif_napi_add); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 093e90e52bc2..2338753e936b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -162,7 +162,7 @@ static void poll_napi(struct net_device *dev) struct napi_struct *napi; int cpu = smp_processor_id(); - list_for_each_entry(napi, &dev->napi_list, dev_list) { + list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { poll_one_napi(napi); smp_store_release(&napi->poll_owner, -1); From 96ecdcc992eb7f468b2cf829b0f5408a1fad4668 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Aug 2020 12:40:07 -0700 Subject: [PATCH 064/110] bnxt: don't enable NAPI until rings are ready Netpoll can try to poll napi as soon as napi_enable() is called. It crashes trying to access a doorbell which is still NULL: BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 59 PID: 6039 Comm: ethtool Kdump: loaded Tainted: G S 5.9.0-rc1-00469-g5fd99b5d9950-dirty #26 RIP: 0010:bnxt_poll+0x121/0x1c0 Code: c4 20 44 89 e0 5b 5d 41 5c 41 5d 41 5e 41 5f c3 41 8b 86 a0 01 00 00 41 23 85 18 01 00 00 49 8b 96 a8 01 00 00 0d 00 00 00 24 <89> 02 41 f6 45 77 02 74 cb 49 8b ae d8 01 00 00 31 c0 c7 44 24 1a netpoll_poll_dev+0xbd/0x1a0 __netpoll_send_skb+0x1b2/0x210 netpoll_send_udp+0x2c9/0x406 write_ext_msg+0x1d7/0x1f0 console_unlock+0x23c/0x520 vprintk_emit+0xe0/0x1d0 printk+0x58/0x6f x86_vector_activate.cold+0xf/0x46 __irq_domain_activate_irq+0x50/0x80 __irq_domain_activate_irq+0x32/0x80 __irq_domain_activate_irq+0x32/0x80 irq_domain_activate_irq+0x25/0x40 __setup_irq+0x2d2/0x700 request_threaded_irq+0xfb/0x160 __bnxt_open_nic+0x3b1/0x750 bnxt_open_nic+0x19/0x30 ethtool_set_channels+0x1ac/0x220 dev_ethtool+0x11ba/0x2240 dev_ioctl+0x1cf/0x390 sock_do_ioctl+0x95/0x130 Reported-by: Rob Sherwood Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Signed-off-by: Jakub Kicinski Reviewed-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 316227136f5b..57d0e195cddf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9504,15 +9504,15 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) } } - bnxt_enable_napi(bp); - bnxt_debug_dev_init(bp); - rc = bnxt_init_nic(bp, irq_re_init); if (rc) { netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc); - goto open_err; + goto open_err_irq; } + bnxt_enable_napi(bp); + bnxt_debug_dev_init(bp); + if (link_re_init) { mutex_lock(&bp->link_lock); rc = bnxt_update_phy_setting(bp); @@ -9543,10 +9543,6 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) bnxt_vf_reps_open(bp); return 0; -open_err: - bnxt_debug_dev_exit(bp); - bnxt_disable_napi(bp); - open_err_irq: bnxt_del_napi(bp); From 8ed37e79196033dbdb09a4eea9465af6a0e29fe9 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 21 Aug 2020 18:30:43 +0200 Subject: [PATCH 065/110] mac80211: use rate provided via status->rate on ieee80211_tx_status_ext for AQL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since ieee80211_tx_info does not have enough room to encode HE rates, HE drivers use status->rate to provide rate info. Store it in struct sta_info and use it for AQL. Signed-off-by: Felix Fietkau Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20200821163045.62140-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/airtime.c | 53 +++++++++++++++++++++++++++++++++++++++-- net/mac80211/sta_info.h | 1 + net/mac80211/status.c | 12 ++++++---- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c index 366f76c9003d..656c9a033714 100644 --- a/net/mac80211/airtime.c +++ b/net/mac80211/airtime.c @@ -487,14 +487,61 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime); +static bool ieee80211_fill_rate_info(struct ieee80211_hw *hw, + struct ieee80211_rx_status *stat, u8 band, + struct rate_info *ri) +{ + struct ieee80211_supported_band *sband = hw->wiphy->bands[band]; + int i; + + if (!ri || !sband) + return false; + + stat->bw = ri->bw; + stat->nss = ri->nss; + stat->rate_idx = ri->mcs; + + if (ri->flags & RATE_INFO_FLAGS_HE_MCS) + stat->encoding = RX_ENC_HE; + else if (ri->flags & RATE_INFO_FLAGS_VHT_MCS) + stat->encoding = RX_ENC_VHT; + else if (ri->flags & RATE_INFO_FLAGS_MCS) + stat->encoding = RX_ENC_HT; + else + stat->encoding = RX_ENC_LEGACY; + + if (ri->flags & RATE_INFO_FLAGS_SHORT_GI) + stat->enc_flags |= RX_ENC_FLAG_SHORT_GI; + + stat->he_gi = ri->he_gi; + + if (stat->encoding != RX_ENC_LEGACY) + return true; + + stat->rate_idx = 0; + for (i = 0; i < sband->n_bitrates; i++) { + if (ri->legacy != sband->bitrates[i].bitrate) + continue; + + stat->rate_idx = i; + return true; + } + + return false; +} + static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw, struct ieee80211_tx_rate *rate, + struct rate_info *ri, u8 band, int len) { struct ieee80211_rx_status stat = { .band = band, }; + if (ieee80211_fill_rate_info(hw, &stat, band, ri)) + goto out; + if (rate->idx < 0 || !rate->count) return 0; @@ -522,6 +569,7 @@ static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw, stat.encoding = RX_ENC_LEGACY; } +out: return ieee80211_calc_rx_airtime(hw, &stat, len); } @@ -536,7 +584,7 @@ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_tx_rate *rate = &info->status.rates[i]; u32 cur_duration; - cur_duration = ieee80211_calc_tx_airtime_rate(hw, rate, + cur_duration = ieee80211_calc_tx_airtime_rate(hw, rate, NULL, info->band, len); if (!cur_duration) break; @@ -573,6 +621,7 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_tx_rate *rate = &sta->tx_stats.last_rate; + struct rate_info *ri = &sta->tx_stats.last_rate_info; u32 airtime; if (!(rate->flags & (IEEE80211_TX_RC_VHT_MCS | @@ -586,7 +635,7 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, * This will not be very accurate, but much better than simply * assuming un-aggregated tx. */ - airtime = ieee80211_calc_tx_airtime_rate(hw, rate, band, + airtime = ieee80211_calc_tx_airtime_rate(hw, rate, ri, band, ampdu ? len * 16 : len); if (ampdu) airtime /= 16; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 9d398c9daa4c..8060d142e4f1 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -611,6 +611,7 @@ struct sta_info { u64 packets[IEEE80211_NUM_ACS]; u64 bytes[IEEE80211_NUM_ACS]; struct ieee80211_tx_rate last_rate; + struct rate_info last_rate_info; u64 msdu[IEEE80211_NUM_TIDS + 1]; } tx_stats; u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1]; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index adb1d30ce06e..6de63f1d8c7b 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -1137,9 +1137,17 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_tx_info *info = status->info; struct ieee80211_sta *pubsta = status->sta; struct ieee80211_supported_band *sband; + struct sta_info *sta; int retry_count; bool acked, noack_success; + if (pubsta) { + sta = container_of(pubsta, struct sta_info, sta); + + if (status->rate) + sta->tx_stats.last_rate_info = *status->rate; + } + if (status->skb) return __ieee80211_tx_status(hw, status); @@ -1154,10 +1162,6 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); if (pubsta) { - struct sta_info *sta; - - sta = container_of(pubsta, struct sta_info, sta); - if (!acked && !noack_success) sta->status_stats.retry_failed++; sta->status_stats.retry_count += retry_count; From 43cd72c5892c46fe69b8a1682fb4905cf158c39c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 21 Aug 2020 18:30:44 +0200 Subject: [PATCH 066/110] mac80211: factor out code to look up the average packet length duration for a rate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will be used to enhance AQL estimated aggregation length Signed-off-by: Felix Fietkau Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20200821163045.62140-2-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/airtime.c | 139 ++++++++++++++++++++++++----------------- 1 file changed, 83 insertions(+), 56 deletions(-) diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c index 656c9a033714..ba95f4ee1f71 100644 --- a/net/mac80211/airtime.c +++ b/net/mac80211/airtime.c @@ -405,18 +405,14 @@ ieee80211_calc_legacy_rate_duration(u16 bitrate, bool short_pre, return duration; } -u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, - struct ieee80211_rx_status *status, - int len) +static u32 ieee80211_get_rate_duration(struct ieee80211_hw *hw, + struct ieee80211_rx_status *status, + u32 *overhead) { - struct ieee80211_supported_band *sband; - const struct ieee80211_rate *rate; bool sgi = status->enc_flags & RX_ENC_FLAG_SHORT_GI; - bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE; int bw, streams; int group, idx; u32 duration; - bool cck; switch (status->bw) { case RATE_INFO_BW_20: @@ -437,20 +433,6 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, } switch (status->encoding) { - case RX_ENC_LEGACY: - if (WARN_ON_ONCE(status->band > NL80211_BAND_5GHZ)) - return 0; - - sband = hw->wiphy->bands[status->band]; - if (!sband || status->rate_idx >= sband->n_bitrates) - return 0; - - rate = &sband->bitrates[status->rate_idx]; - cck = rate->flags & IEEE80211_RATE_MANDATORY_B; - - return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp, - cck, len); - case RX_ENC_VHT: streams = status->nss; idx = status->rate_idx; @@ -477,13 +459,47 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, duration = airtime_mcs_groups[group].duration[idx]; duration <<= airtime_mcs_groups[group].shift; + *overhead = 36 + (streams << 2); + + return duration; +} + + +u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, + struct ieee80211_rx_status *status, + int len) +{ + struct ieee80211_supported_band *sband; + u32 duration, overhead = 0; + + if (status->encoding == RX_ENC_LEGACY) { + const struct ieee80211_rate *rate; + bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE; + bool cck; + + if (WARN_ON_ONCE(status->band > NL80211_BAND_5GHZ)) + return 0; + + sband = hw->wiphy->bands[status->band]; + if (!sband || status->rate_idx >= sband->n_bitrates) + return 0; + + rate = &sband->bitrates[status->rate_idx]; + cck = rate->flags & IEEE80211_RATE_MANDATORY_B; + + return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp, + cck, len); + } + + duration = ieee80211_get_rate_duration(hw, status, &overhead); + if (!duration) + return 0; + duration *= len; duration /= AVG_PKT_SIZE; duration /= 1024; - duration += 36 + (streams << 2); - - return duration; + return duration + overhead; } EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime); @@ -530,46 +546,57 @@ static bool ieee80211_fill_rate_info(struct ieee80211_hw *hw, return false; } +static int ieee80211_fill_rx_status(struct ieee80211_rx_status *stat, + struct ieee80211_hw *hw, + struct ieee80211_tx_rate *rate, + struct rate_info *ri, u8 band, int len) +{ + memset(stat, 0, sizeof(*stat)); + stat->band = band; + + if (ieee80211_fill_rate_info(hw, stat, band, ri)) + return 0; + + if (rate->idx < 0 || !rate->count) + return -1; + + if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH) + stat->bw = RATE_INFO_BW_80; + else if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + stat->bw = RATE_INFO_BW_40; + else + stat->bw = RATE_INFO_BW_20; + + stat->enc_flags = 0; + if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE) + stat->enc_flags |= RX_ENC_FLAG_SHORTPRE; + if (rate->flags & IEEE80211_TX_RC_SHORT_GI) + stat->enc_flags |= RX_ENC_FLAG_SHORT_GI; + + stat->rate_idx = rate->idx; + if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { + stat->encoding = RX_ENC_VHT; + stat->rate_idx = ieee80211_rate_get_vht_mcs(rate); + stat->nss = ieee80211_rate_get_vht_nss(rate); + } else if (rate->flags & IEEE80211_TX_RC_MCS) { + stat->encoding = RX_ENC_HT; + } else { + stat->encoding = RX_ENC_LEGACY; + } + + return 0; +} + static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw, struct ieee80211_tx_rate *rate, struct rate_info *ri, u8 band, int len) { - struct ieee80211_rx_status stat = { - .band = band, - }; + struct ieee80211_rx_status stat; - if (ieee80211_fill_rate_info(hw, &stat, band, ri)) - goto out; - - if (rate->idx < 0 || !rate->count) + if (ieee80211_fill_rx_status(&stat, hw, rate, ri, band, len)) return 0; - if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH) - stat.bw = RATE_INFO_BW_80; - else if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) - stat.bw = RATE_INFO_BW_40; - else - stat.bw = RATE_INFO_BW_20; - - stat.enc_flags = 0; - if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE) - stat.enc_flags |= RX_ENC_FLAG_SHORTPRE; - if (rate->flags & IEEE80211_TX_RC_SHORT_GI) - stat.enc_flags |= RX_ENC_FLAG_SHORT_GI; - - stat.rate_idx = rate->idx; - if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { - stat.encoding = RX_ENC_VHT; - stat.rate_idx = ieee80211_rate_get_vht_mcs(rate); - stat.nss = ieee80211_rate_get_vht_nss(rate); - } else if (rate->flags & IEEE80211_TX_RC_MCS) { - stat.encoding = RX_ENC_HT; - } else { - stat.encoding = RX_ENC_LEGACY; - } - -out: return ieee80211_calc_rx_airtime(hw, &stat, len); } From f01cfbaf9b2971126f094acfbda2589bd2394f4d Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 21 Aug 2020 18:30:45 +0200 Subject: [PATCH 067/110] mac80211: improve AQL aggregation estimation for low data rates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Links with low data rates use much smaller aggregates and are much more sensitive to latency added by bufferbloat. Tune the assumed aggregation length based on the tx rate duration. Signed-off-by: Felix Fietkau Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20200821163045.62140-3-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/airtime.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c index ba95f4ee1f71..314973033d03 100644 --- a/net/mac80211/airtime.c +++ b/net/mac80211/airtime.c @@ -647,27 +647,41 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, if (pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + struct ieee80211_rx_status stat; struct ieee80211_tx_rate *rate = &sta->tx_stats.last_rate; struct rate_info *ri = &sta->tx_stats.last_rate_info; - u32 airtime; + u32 duration, overhead; + u8 agg_shift; - if (!(rate->flags & (IEEE80211_TX_RC_VHT_MCS | - IEEE80211_TX_RC_MCS))) - ampdu = false; + if (ieee80211_fill_rx_status(&stat, hw, rate, ri, band, len)) + return 0; + if (stat.encoding == RX_ENC_LEGACY || !ampdu) + return ieee80211_calc_rx_airtime(hw, &stat, len); + + duration = ieee80211_get_rate_duration(hw, &stat, &overhead); /* * Assume that HT/VHT transmission on any AC except VO will * use aggregation. Since we don't have reliable reporting - * of aggregation length, assume an average of 16. + * of aggregation length, assume an average size based on the + * tx rate. * This will not be very accurate, but much better than simply - * assuming un-aggregated tx. + * assuming un-aggregated tx in all cases. */ - airtime = ieee80211_calc_tx_airtime_rate(hw, rate, ri, band, - ampdu ? len * 16 : len); - if (ampdu) - airtime /= 16; + if (duration > 400) /* <= VHT20 MCS2 1S */ + agg_shift = 1; + else if (duration > 250) /* <= VHT20 MCS3 1S or MCS1 2S */ + agg_shift = 2; + else if (duration > 150) /* <= VHT20 MCS5 1S or MCS3 2S */ + agg_shift = 3; + else + agg_shift = 4; - return airtime; + duration *= len; + duration /= AVG_PKT_SIZE; + duration /= 1024; + + return duration + (overhead >> agg_shift); } if (!conf) From 3579994476b65cb5e272ff0f720a1fd31322e53f Mon Sep 17 00:00:00 2001 From: Shay Bar Date: Wed, 26 Aug 2020 17:31:39 +0300 Subject: [PATCH 068/110] wireless: fix wrong 160/80+80 MHz setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix cfg80211_chandef_usable(): consider IEEE80211_VHT_CAP_EXT_NSS_BW when verifying 160/80+80 MHz. Based on: "Table 9-272 — Setting of the Supported Channel Width Set subfield and Extended NSS BW Support subfield at a STA transmitting the VHT Capabilities Information field" From "Draft P802.11REVmd_D3.0.pdf" Signed-off-by: Aviad Brikman Signed-off-by: Shay Bar Link: https://lore.kernel.org/r/20200826143139.25976-1-shay.bar@celeno.com [reformat the code a bit and use u32_get_bits()] Signed-off-by: Johannes Berg --- net/wireless/chan.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 90f0f82cd9ca..edee3a645c06 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -10,6 +10,7 @@ */ #include +#include #include #include "core.h" #include "rdev-ops.h" @@ -912,6 +913,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, struct ieee80211_sta_vht_cap *vht_cap; struct ieee80211_edmg *edmg_cap; u32 width, control_freq, cap; + bool support_80_80 = false; if (WARN_ON(!cfg80211_chandef_valid(chandef))) return false; @@ -979,9 +981,13 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, return false; break; case NL80211_CHAN_WIDTH_80P80: - cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; - if (chandef->chan->band != NL80211_BAND_6GHZ && - cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) + cap = vht_cap->cap; + support_80_80 = + (cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || + (cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && + cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || + u32_get_bits(cap, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) > 1; + if (chandef->chan->band != NL80211_BAND_6GHZ && !support_80_80) return false; /* fall through */ case NL80211_CHAN_WIDTH_80: @@ -1001,7 +1007,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, return false; cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && - cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) + cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ && + !(vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)) return false; break; default: From 47caf685a6854593348f216e0b489b71c10cbe03 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 19 Aug 2020 10:46:48 +0200 Subject: [PATCH 069/110] cfg80211: regulatory: reject invalid hints Reject invalid hints early in order to not cause a kernel WARN later if they're restored to or similar. Reported-by: syzbot+d451401ffd00a60677ee@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=d451401ffd00a60677ee Link: https://lore.kernel.org/r/20200819084648.13956-1-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- net/wireless/reg.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 35b8847a2f6d..d8a90d397423 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2946,6 +2946,9 @@ int regulatory_hint_user(const char *alpha2, if (WARN_ON(!alpha2)) return -EINVAL; + if (!is_world_regdom(alpha2) && !is_an_alpha2(alpha2)) + return -EINVAL; + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); if (!request) return -ENOMEM; From 47df8e059b49a80c179fae39256bcd7096810934 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 8 Aug 2020 19:25:42 +0200 Subject: [PATCH 070/110] mac80211: reduce packet loss event false positives When running a large number of packets per second with a high data rate and long A-MPDUs, the packet loss threshold can be reached very quickly when the link conditions change. This frequently shows up as spurious disconnects. Mitigate false positives by using a similar logic for regular stations as the one being used for TDLS, though with a more aggressive timeout. Packet loss events are only reported if no ACK was received for a second. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20200808172542.41628-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/sta_info.h | 4 ++-- net/mac80211/status.c | 31 +++++++++++++++---------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 8060d142e4f1..d5010116cf4d 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -524,7 +524,7 @@ struct ieee80211_sta_rx_stats { * @status_stats.retry_failed: # of frames that failed after retry * @status_stats.retry_count: # of retries attempted * @status_stats.lost_packets: # of lost packets - * @status_stats.last_tdls_pkt_time: timestamp of last TDLS packet + * @status_stats.last_pkt_time: timestamp of last ACKed packet * @status_stats.msdu_retries: # of MSDU retries * @status_stats.msdu_failed: # of failed MSDUs * @status_stats.last_ack: last ack timestamp (jiffies) @@ -597,7 +597,7 @@ struct sta_info { unsigned long filtered; unsigned long retry_failed, retry_count; unsigned int lost_packets; - unsigned long last_tdls_pkt_time; + unsigned long last_pkt_time; u64 msdu_retries[IEEE80211_NUM_TIDS + 1]; u64 msdu_failed[IEEE80211_NUM_TIDS + 1]; unsigned long last_ack; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 6de63f1d8c7b..0794396a7988 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -755,12 +755,16 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, * - current throughput (higher value for higher tpt)? */ #define STA_LOST_PKT_THRESHOLD 50 +#define STA_LOST_PKT_TIME HZ /* 1 sec since last ACK */ #define STA_LOST_TDLS_PKT_THRESHOLD 10 #define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */ static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { + unsigned long pkt_time = STA_LOST_PKT_TIME; + unsigned int pkt_thr = STA_LOST_PKT_THRESHOLD; + /* If driver relies on its own algorithm for station kickout, skip * mac80211 packet loss mechanism. */ @@ -773,21 +777,20 @@ static void ieee80211_lost_packet(struct sta_info *sta, return; sta->status_stats.lost_packets++; - if (!sta->sta.tdls && - sta->status_stats.lost_packets < STA_LOST_PKT_THRESHOLD) - return; + if (sta->sta.tdls) { + pkt_time = STA_LOST_TDLS_PKT_TIME; + pkt_thr = STA_LOST_PKT_THRESHOLD; + } /* * If we're in TDLS mode, make sure that all STA_LOST_TDLS_PKT_THRESHOLD * of the last packets were lost, and that no ACK was received in the * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss * mechanism. + * For non-TDLS, use STA_LOST_PKT_THRESHOLD and STA_LOST_PKT_TIME */ - if (sta->sta.tdls && - (sta->status_stats.lost_packets < STA_LOST_TDLS_PKT_THRESHOLD || - time_before(jiffies, - sta->status_stats.last_tdls_pkt_time + - STA_LOST_TDLS_PKT_TIME))) + if (sta->status_stats.lost_packets < pkt_thr || + !time_after(jiffies, sta->status_stats.last_pkt_time + pkt_time)) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, @@ -1033,9 +1036,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, sta->status_stats.lost_packets = 0; /* Track when last TDLS packet was ACKed */ - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) - sta->status_stats.last_tdls_pkt_time = - jiffies; + sta->status_stats.last_pkt_time = jiffies; } else if (noack_success) { /* nothing to do here, do not account as lost */ } else { @@ -1172,9 +1173,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, if (sta->status_stats.lost_packets) sta->status_stats.lost_packets = 0; - /* Track when last TDLS packet was ACKed */ - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) - sta->status_stats.last_tdls_pkt_time = jiffies; + /* Track when last packet was ACKed */ + sta->status_stats.last_pkt_time = jiffies; } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { return; } else if (noack_success) { @@ -1263,8 +1263,7 @@ void ieee80211_tx_status_8023(struct ieee80211_hw *hw, if (sta->status_stats.lost_packets) sta->status_stats.lost_packets = 0; - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) - sta->status_stats.last_tdls_pkt_time = jiffies; + sta->status_stats.last_pkt_time = jiffies; } else { ieee80211_lost_packet(sta, info); } From 2d9b55508556ccee6410310fb9ea2482fd3328eb Mon Sep 17 00:00:00 2001 From: Amar Singhal Date: Fri, 19 Jun 2020 13:52:01 -0700 Subject: [PATCH 071/110] cfg80211: Adjust 6 GHz frequency to channel conversion Adjust the 6 GHz frequency to channel conversion function, the other way around was previously handled. Signed-off-by: Amar Singhal Link: https://lore.kernel.org/r/1592599921-10607-1-git-send-email-asinghal@codeaurora.org [rewrite commit message, hard-code channel 2] Signed-off-by: Johannes Berg --- net/wireless/util.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/wireless/util.c b/net/wireless/util.c index dfad1c0f57ad..c62eb3d9ab3d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -123,11 +123,13 @@ int ieee80211_freq_khz_to_channel(u32 freq) return (freq - 2407) / 5; else if (freq >= 4910 && freq <= 4980) return (freq - 4000) / 5; - else if (freq < 5945) + else if (freq < 5925) return (freq - 5000) / 5; + else if (freq == 5935) + return 2; else if (freq <= 45000) /* DMG band lower limit */ - /* see 802.11ax D4.1 27.3.22.2 */ - return (freq - 5940) / 5; + /* see 802.11ax D6.1 27.3.22.2 */ + return (freq - 5950) / 5; else if (freq >= 58320 && freq <= 70200) return (freq - 56160) / 2160; else From a092b7233f0e000cc6f2c71a49e2ecc6f917a5fc Mon Sep 17 00:00:00 2001 From: Himadri Pandya Date: Thu, 27 Aug 2020 12:23:55 +0530 Subject: [PATCH 072/110] net: usb: Fix uninit-was-stored issue in asix_read_phy_addr() The buffer size is 2 Bytes and we expect to receive the same amount of data. But sometimes we receive less data and run into uninit-was-stored issue upon read. Hence modify the error check on the return value to match with the buffer size as a prevention. Reported-and-tested by: syzbot+a7e220df5a81d1ab400e@syzkaller.appspotmail.com Signed-off-by: Himadri Pandya Signed-off-by: David S. Miller --- drivers/net/usb/asix_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index e39f41efda3e..7bc6e8f856fe 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -296,7 +296,7 @@ int asix_read_phy_addr(struct usbnet *dev, int internal) netdev_dbg(dev->net, "asix_get_phy_addr()\n"); - if (ret < 0) { + if (ret < 2) { netdev_err(dev->net, "Error reading PHYID register: %02x\n", ret); goto out; } From 645f08975f49441b3e753d8dc5b740cbcb226594 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 27 Aug 2020 07:27:49 -0400 Subject: [PATCH 073/110] net: Fix some comments Fix some comments, including wrong function name, duplicated word and so on. Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- include/uapi/linux/in.h | 2 +- net/core/sock.c | 2 +- net/ipv4/raw.c | 2 +- net/l3mdev/l3mdev.c | 2 +- net/socket.c | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e8bca74857a3..8d9ab50b08c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -71,7 +71,7 @@ * NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or - * IPv4|UDP where the Next Header field in the IPv6 + * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with @@ -2667,7 +2667,7 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) * * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS) * to reduce average number of cache lines per packet. - * get_rps_cpus() for example only access one 64 bytes aligned block : + * get_rps_cpu() for example only access one 64 bytes aligned block : * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8) */ #ifndef NET_SKB_PAD diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 3d0d8231dc19..7d6687618d80 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -135,7 +135,7 @@ struct in_addr { * this socket to prevent accepting spoofed ones. */ #define IP_PMTUDISC_INTERFACE 4 -/* weaker version of IP_PMTUDISC_INTERFACE, which allos packets to get +/* weaker version of IP_PMTUDISC_INTERFACE, which allows packets to get * fragmented if they exeed the interface mtu */ #define IP_PMTUDISC_OMIT 5 diff --git a/net/core/sock.c b/net/core/sock.c index e4f40b175acb..8eb2c924805a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3254,7 +3254,7 @@ void sk_common_release(struct sock *sk) sk->sk_prot->destroy(sk); /* - * Observation: when sock_common_release is called, processes have + * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. * Step one, detach it from networking: * diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6fd4330287c2..407956be7deb 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -610,7 +610,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) { ipc.oif = inet->uc_index; } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { - /* oif is set, packet is to local broadcast and + /* oif is set, packet is to local broadcast * and uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index e71ca5aec684..864326f150e2 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -154,7 +154,7 @@ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu); /** - * l3mdev_fib_table - get FIB table id associated with an L3 + * l3mdev_fib_table_rcu - get FIB table id associated with an L3 * master interface * @dev: targeted interface */ diff --git a/net/socket.c b/net/socket.c index dbbe8ea7d395..0c0144604f81 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3610,7 +3610,7 @@ int kernel_getsockname(struct socket *sock, struct sockaddr *addr) EXPORT_SYMBOL(kernel_getsockname); /** - * kernel_peername - get the address which the socket is connected (kernel space) + * kernel_getpeername - get the address which the socket is connected (kernel space) * @sock: socket * @addr: address holder * @@ -3671,7 +3671,7 @@ int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, EXPORT_SYMBOL(kernel_sendpage_locked); /** - * kernel_shutdown - shut down part of a full-duplex connection (kernel space) + * kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space) * @sock: socket * @how: connection part * From af8ea111134624855710a0ef5543b871d49b0162 Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Thu, 27 Aug 2020 10:38:39 -0400 Subject: [PATCH 074/110] net: ethernet: ti: cpsw_new: fix error handling in cpsw_ndo_vlan_rx_kill_vid() This patch fixes a bunch of issues in cpsw_ndo_vlan_rx_kill_vid() - pm_runtime_get_sync() returns non zero value. This results in non zero value return to caller which will be interpreted as error. So overwrite ret with zero. - If VID matches with port VLAN VID, then set error code. - Currently when VLAN interface is deleted, all of the VLAN mc addresses are removed from ALE table, however the return values from ale function calls are not checked. These functions can return error code -ENOENT. But that shouldn't happen in a normal case. So add error print to catch the situations so that these can be investigated and addressed. return zero in these cases as these are not real error case, but only serve to catch ALE table update related issues and help address the same in the driver. Fixes: ed3525eda4c4 ("net: ethernet: ti: introduce cpsw switchdev based driver part 1 - dual-emac") Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw_new.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 8d0a2bc7128d..8ed78577cded 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -1032,19 +1032,34 @@ static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev, return ret; } + /* reset the return code as pm_runtime_get_sync() can return + * non zero values as well. + */ + ret = 0; for (i = 0; i < cpsw->data.slaves; i++) { if (cpsw->slaves[i].ndev && - vid == cpsw->slaves[i].port_vlan) + vid == cpsw->slaves[i].port_vlan) { + ret = -EINVAL; goto err; + } } dev_dbg(priv->dev, "removing vlanid %d from vlan filter\n", vid); - cpsw_ale_del_vlan(cpsw->ale, vid, 0); - cpsw_ale_del_ucast(cpsw->ale, priv->mac_addr, - HOST_PORT_NUM, ALE_VLAN, vid); - cpsw_ale_del_mcast(cpsw->ale, priv->ndev->broadcast, - 0, ALE_VLAN, vid); + ret = cpsw_ale_del_vlan(cpsw->ale, vid, 0); + if (ret) + dev_err(priv->dev, "cpsw_ale_del_vlan() failed: ret %d\n", ret); + ret = cpsw_ale_del_ucast(cpsw->ale, priv->mac_addr, + HOST_PORT_NUM, ALE_VLAN, vid); + if (ret) + dev_err(priv->dev, "cpsw_ale_del_ucast() failed: ret %d\n", + ret); + ret = cpsw_ale_del_mcast(cpsw->ale, priv->ndev->broadcast, + 0, ALE_VLAN, vid); + if (ret) + dev_err(priv->dev, "cpsw_ale_del_mcast failed. ret %d\n", + ret); cpsw_ale_flush_multicast(cpsw->ale, ALE_PORT_HOST, vid); + ret = 0; err: pm_runtime_put(cpsw->dev); return ret; From b43c75abfd084b9e961a47c6331e01f2f91176b6 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Thu, 27 Aug 2020 16:55:46 +0100 Subject: [PATCH 075/110] rxrpc: Fix memory leak in rxkad_verify_response() Fix a memory leak in rxkad_verify_response() whereby the response buffer doesn't get freed if we fail to allocate a ticket buffer. Fixes: ef68622da9cc ("rxrpc: Handle temporary errors better in rxkad security") Signed-off-by: Dinghao Liu Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/rxkad.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 52a24d4ef5d8..e08130e5746b 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -1137,7 +1137,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, ret = -ENOMEM; ticket = kmalloc(ticket_len, GFP_NOFS); if (!ticket) - goto temporary_error; + goto temporary_error_free_resp; eproto = tracepoint_string("rxkad_tkt_short"); abort_code = RXKADPACKETSHORT; @@ -1230,6 +1230,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, temporary_error_free_ticket: kfree(ticket); +temporary_error_free_resp: kfree(response); temporary_error: /* Ignore the response packet if we got a temporary error such as From fa4505675e093e895b7ec49a76d44f6b5ad9602e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 26 Aug 2020 10:17:36 +0200 Subject: [PATCH 076/110] selftests/bpf: Fix massive output from test_maps When stdout output from the selftests tool 'test_maps' gets redirected into e.g file or pipe, then the output lines increase a lot (from 21 to 33949 lines). This is caused by the printf that happens before the fork() call, and there are user-space buffered printf data that seems to be duplicated into the forked process. To fix this fflush() stdout before the fork loop in __run_parallel(). Fixes: 1a97cf1fe503 ("selftests/bpf: speedup test_maps") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/159842985651.1050885.2154399297503372406.stgit@firesoul --- tools/testing/selftests/bpf/test_maps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 754cf611723e..0d92ebcb335d 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1274,6 +1274,8 @@ static void __run_parallel(unsigned int tasks, pid_t pid[tasks]; int i; + fflush(stdout); + for (i = 0; i < tasks; i++) { pid[i] = fork(); if (pid[i] == 0) { From 174bce38caf18b47816f352156fab926f4fe072a Mon Sep 17 00:00:00 2001 From: zhudi Date: Wed, 26 Aug 2020 20:01:13 +0800 Subject: [PATCH 077/110] netlink: fix a data race in netlink_rcv_wake() The data races were reported by KCSAN: BUG: KCSAN: data-race in netlink_recvmsg / skb_queue_tail write (marked) to 0xffff8c0986e5a8c8 of 8 bytes by interrupt on cpu 3: skb_queue_tail+0xcc/0x120 __netlink_sendskb+0x55/0x80 netlink_broadcast_filtered+0x465/0x7e0 nlmsg_notify+0x8f/0x120 rtnl_notify+0x8e/0xb0 __neigh_notify+0xf2/0x120 neigh_update+0x927/0xde0 arp_process+0x8a3/0xf50 arp_rcv+0x27c/0x3b0 __netif_receive_skb_core+0x181c/0x1840 __netif_receive_skb+0x38/0xf0 netif_receive_skb_internal+0x77/0x1c0 napi_gro_receive+0x1bd/0x1f0 e1000_clean_rx_irq+0x538/0xb20 [e1000] e1000_clean+0x5e4/0x1340 [e1000] net_rx_action+0x310/0x9d0 __do_softirq+0xe8/0x308 irq_exit+0x109/0x110 do_IRQ+0x7f/0xe0 ret_from_intr+0x0/0x1d 0xffffffffffffffff read to 0xffff8c0986e5a8c8 of 8 bytes by task 1463 on cpu 0: netlink_recvmsg+0x40b/0x820 sock_recvmsg+0xc9/0xd0 ___sys_recvmsg+0x1a4/0x3b0 __sys_recvmsg+0x86/0x120 __x64_sys_recvmsg+0x52/0x70 do_syscall_64+0xb5/0x360 entry_SYSCALL_64_after_hwframe+0x65/0xca 0xffffffffffffffff Since the write is under sk_receive_queue->lock but the read is done as lockless. so fix it by using skb_queue_empty_lockless() instead of skb_queue_empty() for the read in netlink_rcv_wake() Signed-off-by: zhudi Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b5f30d7d30d0..d2d1448274f5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -353,7 +353,7 @@ static void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); - if (skb_queue_empty(&sk->sk_receive_queue)) + if (skb_queue_empty_lockless(&sk->sk_receive_queue)) clear_bit(NETLINK_S_CONGESTED, &nlk->state); if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); From f272285f6abb9178d029375599626baf3d5f4e8a Mon Sep 17 00:00:00 2001 From: Landen Chao Date: Thu, 27 Aug 2020 17:15:47 +0800 Subject: [PATCH 078/110] net: dsa: mt7530: fix advertising unsupported 1000baseT_Half Remove 1000baseT_Half to advertise correct hardware capability in phylink_validate() callback function. Fixes: 38f790a80560 ("net: dsa: mt7530: Add support for port 5") Signed-off-by: Landen Chao Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 4b4701c69fe1..e044d9e8d016 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1504,7 +1504,7 @@ static void mt7530_phylink_validate(struct dsa_switch *ds, int port, phylink_set(mask, 100baseT_Full); if (state->interface != PHY_INTERFACE_MODE_MII) { - phylink_set(mask, 1000baseT_Half); + /* This switch only supports 1G full-duplex. */ phylink_set(mask, 1000baseT_Full); if (port == 5) phylink_set(mask, 1000baseX_Full); From 5438dd45831ee33869779bd1919b05816ae4dbc9 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 27 Aug 2020 10:40:41 -0700 Subject: [PATCH 079/110] net_sched: fix error path in red_init() When ->init() fails, ->destroy() is called to clean up. So it is unnecessary to clean up in red_init(), and it would cause some refcount underflow. Fixes: aee9caa03fc3 ("net: sched: sch_red: Add qevents "early_drop" and "mark"") Reported-and-tested-by: syzbot+b33c1cb0a30ebdc8a5f9@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+e5ea5f8a3ecfd4427a1c@syzkaller.appspotmail.com Cc: Petr Machata Signed-off-by: Cong Wang Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- net/sched/sch_red.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index deac82f3ad7b..e89fab6ccb34 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -353,23 +353,11 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt, FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, tb[TCA_RED_EARLY_DROP_BLOCK], extack); if (err) - goto err_early_drop_init; + return err; - err = tcf_qevent_init(&q->qe_mark, sch, - FLOW_BLOCK_BINDER_TYPE_RED_MARK, - tb[TCA_RED_MARK_BLOCK], extack); - if (err) - goto err_mark_init; - - return 0; - -err_mark_init: - tcf_qevent_destroy(&q->qe_early_drop, sch); -err_early_drop_init: - del_timer_sync(&q->adapt_timer); - red_offload(sch, false); - qdisc_put(q->qdisc); - return err; + return tcf_qevent_init(&q->qe_mark, sch, + FLOW_BLOCK_BINDER_TYPE_RED_MARK, + tb[TCA_RED_MARK_BLOCK], extack); } static int red_change(struct Qdisc *sch, struct nlattr *opt, From 4b7ddc58e61ada82be1f47f82d5139f0ab2ba478 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 22 Aug 2020 18:07:27 -0700 Subject: [PATCH 080/110] netfilter: delete repeated words Drop duplicated words in net/netfilter/ and net/ipv4/netfilter/. Signed-off-by: Randy Dunlap Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_pptp.c | 2 +- net/netfilter/nf_conntrack_pptp.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 2 +- net/netfilter/xt_recent.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 7afde8828b4c..3f248a19faa3 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -3,7 +3,7 @@ * nf_nat_pptp.c * * NAT support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. + * PPTP is a protocol for creating virtual private networks. * It is a specification defined by Microsoft and some vendors * working with Microsoft. PPTP is built on top of a modified * version of the Internet Generic Routing Encapsulation Protocol. diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 1f44d523b512..5105d4250012 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. + * PPTP is a protocol for creating virtual private networks. * It is a specification defined by Microsoft and some vendors * working with Microsoft. PPTP is built on top of a modified * version of the Internet Generic Routing Encapsulation Protocol. diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 6892e497781c..e8c86ee4c1c4 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1152,7 +1152,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) && new_state == TCP_CONNTRACK_ESTABLISHED) { - /* Set ASSURED if we see see valid ack in ESTABLISHED + /* Set ASSURED if we see valid ack in ESTABLISHED after SYN_RECV or a valid answer for a picked up connection. */ set_bit(IPS_ASSURED_BIT, &ct->status); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 19bef176145e..606411869698 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -640,7 +640,7 @@ static void __net_exit recent_proc_net_exit(struct net *net) struct recent_table *t; /* recent_net_exit() is called before recent_mt_destroy(). Make sure - * that the parent xt_recent proc entry is is empty before trying to + * that the parent xt_recent proc entry is empty before trying to * remove it. */ spin_lock_bh(&recent_lock); From ee921183557af39c1a0475f982d43b0fcac25e2e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 23 Aug 2020 13:55:36 +0200 Subject: [PATCH 081/110] netfilter: nfnetlink: nfnetlink_unicast() reports EAGAIN instead of ENOBUFS Frontend callback reports EAGAIN to nfnetlink to retry a command, this is used to signal that module autoloading is required. Unfortunately, nlmsg_unicast() reports EAGAIN in case the receiver socket buffer gets full, so it enters a busy-loop. This patch updates nfnetlink_unicast() to turn EAGAIN into ENOBUFS and to use nlmsg_unicast(). Remove the flags field in nfnetlink_unicast() since this is always MSG_DONTWAIT in the existing code which is exactly what nlmsg_unicast() passes to netlink_unicast() as parameter. Fixes: 96518518cc41 ("netfilter: add nftables") Reported-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 3 +- net/netfilter/nf_tables_api.c | 61 ++++++++++++++--------------- net/netfilter/nfnetlink.c | 11 ++++-- net/netfilter/nfnetlink_log.c | 3 +- net/netfilter/nfnetlink_queue.c | 2 +- 5 files changed, 40 insertions(+), 40 deletions(-) diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 851425c3178f..89016d08f6a2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -43,8 +43,7 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, - int flags); +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid); static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 71e501c5ad21..b7dc1cbf40ea 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -815,11 +815,11 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) - goto err; + goto err_fill_table_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_table_info: kfree_skb(skb2); return err; } @@ -1563,11 +1563,11 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain); if (err < 0) - goto err; + goto err_fill_chain_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_chain_info: kfree_skb(skb2); return err; } @@ -3008,11 +3008,11 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule, NULL); if (err < 0) - goto err; + goto err_fill_rule_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_rule_info: kfree_skb(skb2); return err; } @@ -3968,11 +3968,11 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0); if (err < 0) - goto err; + goto err_fill_set_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); -err: +err_fill_set_info: kfree_skb(skb2); return err; } @@ -4860,24 +4860,18 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb == NULL) - goto err1; + return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, NFT_MSG_NEWSETELEM, 0, set, &elem); if (err < 0) - goto err2; + goto err_fill_setelem; - err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT); - /* This avoids a loop in nfnetlink. */ - if (err < 0) - goto err1; + return nfnetlink_unicast(skb, ctx->net, ctx->portid); - return 0; -err2: +err_fill_setelem: kfree_skb(skb); -err1: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return err; } /* called with rcu_read_lock held */ @@ -6182,10 +6176,11 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); if (err < 0) - goto err; + goto err_fill_obj_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_obj_info: kfree_skb(skb2); return err; } @@ -7045,10 +7040,11 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, NFT_MSG_NEWFLOWTABLE, 0, family, flowtable, &flowtable->hook_list); if (err < 0) - goto err; + goto err_fill_flowtable_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_flowtable_info: kfree_skb(skb2); return err; } @@ -7234,10 +7230,11 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk, err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq); if (err < 0) - goto err; + goto err_fill_gen_info; - return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); -err: + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + +err_fill_gen_info: kfree_skb(skb2); return err; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 5f24edf95830..3a2e64e13b22 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -149,10 +149,15 @@ int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) } EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, - int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid) { - return netlink_unicast(net->nfnl, skb, portid, flags); + int err; + + err = nlmsg_unicast(net->nfnl, skb, portid); + if (err == -EAGAIN) + err = -ENOBUFS; + + return err; } EXPORT_SYMBOL_GPL(nfnetlink_unicast); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index f02992419850..b35e8d9a5b37 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -356,8 +356,7 @@ __nfulnl_send(struct nfulnl_instance *inst) goto out; } } - nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid, - MSG_DONTWAIT); + nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid); out: inst->qlen = 0; inst->skb = NULL; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index dadfc06245a3..d1d8bca03b4f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -681,7 +681,7 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); + err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; From da2f849e89ed621f3e0688ec5ba92725ed9f0f92 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sun, 23 Aug 2020 20:15:37 +0200 Subject: [PATCH 082/110] selftests: netfilter: fix header example nft_flowtable.sh is made for bash not sh. Also give values which not return "RTNETLINK answers: Invalid argument" Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_flowtable.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index a47d1d832210..28e32fddf9b2 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -11,7 +11,7 @@ # result in fragmentation and/or PMTU discovery. # # You can check with different Orgininator/Link/Responder MTU eg: -# sh nft_flowtable.sh -o1000 -l500 -r100 +# nft_flowtable.sh -o8000 -l1500 -r2000 # From a7bf670ebe192120cbe0a0ab6448baad6fbf7983 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sun, 23 Aug 2020 20:15:59 +0200 Subject: [PATCH 083/110] selftests: netfilter: exit on invalid parameters exit script with comments when parameters are wrong during address addition. No need for a message when trying to change MTU with lower values: output is self-explanatory. Use short testing sequence to avoid shellcheck warnings (suggested by Stefano Brivio). Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- .../testing/selftests/netfilter/nft_flowtable.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 28e32fddf9b2..dc05c9940597 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -96,10 +96,16 @@ do esac done -ip -net nsr1 link set veth0 mtu $omtu +if ! ip -net nsr1 link set veth0 mtu $omtu; then + exit 1 +fi + ip -net ns1 link set eth0 mtu $omtu -ip -net nsr2 link set veth1 mtu $rmtu +if ! ip -net nsr2 link set veth1 mtu $rmtu; then + exit 1 +fi + ip -net ns2 link set eth0 mtu $rmtu # transfer-net between nsr1 and nsr2. @@ -120,7 +126,10 @@ for i in 1 2; do ip -net ns$i route add default via 10.0.$i.1 ip -net ns$i addr add dead:$i::99/64 dev eth0 ip -net ns$i route add default via dead:$i::1 - ip netns exec ns$i sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null + if ! ip netns exec ns$i sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then + echo "ERROR: Check Originator/Responder values (problem during address addition)" + exit 1 + fi # don't set ip DF bit for first two tests ip netns exec ns$i sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null From d721b68654d0fcf8930fd0d2edfff78df82fb8c4 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sun, 23 Aug 2020 20:17:07 +0200 Subject: [PATCH 084/110] selftests: netfilter: remove unused variable in make_file() 'who' variable was not used in make_file() Problem found using Shellcheck Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_flowtable.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index dc05c9940597..1058952d1b36 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -212,7 +212,6 @@ ns2out=$(mktemp) make_file() { name=$1 - who=$2 SIZE=$((RANDOM % (1024 * 8))) TSIZE=$((SIZE * 1024)) @@ -304,8 +303,8 @@ test_tcp_forwarding_nat() return $lret } -make_file "$ns1in" "ns1" -make_file "$ns2in" "ns2" +make_file "$ns1in" +make_file "$ns2in" # First test: # No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed. From 2f4bba4ef77cf01ae805554e8f1d98e57b28f25f Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sun, 23 Aug 2020 20:17:39 +0200 Subject: [PATCH 085/110] selftests: netfilter: simplify command testing Fix some shellcheck SC2181 warnings: "Check exit code directly with e.g. 'if mycmd;', not indirectly with $?." as suggested by Stefano Brivio. Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- .../selftests/netfilter/nft_flowtable.sh | 34 ++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 1058952d1b36..44a879826236 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -27,8 +27,7 @@ ns2out="" log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) checktool (){ - $1 > /dev/null 2>&1 - if [ $? -ne 0 ];then + if ! $1 > /dev/null 2>&1; then echo "SKIP: Could not $2" exit $ksft_skip fi @@ -187,15 +186,13 @@ if [ $? -ne 0 ]; then fi # test basic connectivity -ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null -if [ $? -ne 0 ];then +if ! ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then echo "ERROR: ns1 cannot reach ns2" 1>&2 bash exit 1 fi -ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null -if [ $? -ne 0 ];then +if ! ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then echo "ERROR: ns2 cannot reach ns1" 1>&2 exit 1 fi @@ -230,8 +227,7 @@ check_transfer() out=$2 what=$3 - cmp "$in" "$out" > /dev/null 2>&1 - if [ $? -ne 0 ] ;then + if ! cmp "$in" "$out" > /dev/null 2>&1; then echo "FAIL: file mismatch for $what" 1>&2 ls -l "$in" ls -l "$out" @@ -268,13 +264,11 @@ test_tcp_forwarding_ip() wait - check_transfer "$ns1in" "$ns2out" "ns1 -> ns2" - if [ $? -ne 0 ];then + if ! check_transfer "$ns1in" "$ns2out" "ns1 -> ns2"; then lret=1 fi - check_transfer "$ns2in" "$ns1out" "ns1 <- ns2" - if [ $? -ne 0 ];then + if ! check_transfer "$ns2in" "$ns1out" "ns1 <- ns2"; then lret=1 fi @@ -308,8 +302,7 @@ make_file "$ns2in" # First test: # No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed. -test_tcp_forwarding ns1 ns2 -if [ $? -eq 0 ] ;then +if test_tcp_forwarding ns1 ns2; then echo "PASS: flow offloaded for ns1/ns2" else echo "FAIL: flow offload for ns1/ns2:" 1>&2 @@ -340,9 +333,7 @@ table ip nat { } EOF -test_tcp_forwarding_nat ns1 ns2 - -if [ $? -eq 0 ] ;then +if test_tcp_forwarding_nat ns1 ns2; then echo "PASS: flow offloaded for ns1/ns2 with NAT" else echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2 @@ -354,8 +345,7 @@ fi # Same as second test, but with PMTU discovery enabled. handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2) -ip netns exec nsr1 nft delete rule inet filter forward $handle -if [ $? -ne 0 ] ;then +if ! ip netns exec nsr1 nft delete rule inet filter forward $handle; then echo "FAIL: Could not delete large-packet accept rule" exit 1 fi @@ -363,8 +353,7 @@ fi ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -test_tcp_forwarding_nat ns1 ns2 -if [ $? -eq 0 ] ;then +if test_tcp_forwarding_nat ns1 ns2; then echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery" else echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 @@ -410,8 +399,7 @@ ip -net ns2 route del 192.168.10.1 via 10.0.2.1 ip -net ns2 route add default via 10.0.2.1 ip -net ns2 route add default via dead:2::1 -test_tcp_forwarding ns1 ns2 -if [ $? -eq 0 ] ;then +if test_tcp_forwarding ns1 ns2; then echo "PASS: ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" From 67afbda69645a89adb365f1dfa35181e09cd7e80 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sun, 23 Aug 2020 20:18:06 +0200 Subject: [PATCH 086/110] selftests: netfilter: add command usage Avoid bad command arguments. Based on tools/power/cpupower/bench/cpufreq-bench_plot.sh Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_flowtable.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 44a879826236..431296c0f91c 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -86,12 +86,23 @@ omtu=9000 lmtu=1500 rmtu=2000 +usage(){ + echo "nft_flowtable.sh [OPTIONS]" + echo + echo "MTU options" + echo " -o originator" + echo " -l link" + echo " -r responder" + exit 1 +} + while getopts "o:l:r:" o do case $o in o) omtu=$OPTARG;; l) lmtu=$OPTARG;; r) rmtu=$OPTARG;; + *) usage;; esac done From c46172147ebbeb70094db48d76ab7945d96c638b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 00:07:18 +0200 Subject: [PATCH 087/110] netfilter: conntrack: do not auto-delete clash entries on reply Its possible that we have more than one packet with the same ct tuple simultaneously, e.g. when an application emits n packets on same UDP socket from multiple threads. NAT rules might be applied to those packets. With the right set of rules, n packets will be mapped to m destinations, where at least two packets end up with the same destination. When this happens, the existing clash resolution may merge the skb that is processed after the first has been received with the identical tuple already in hash table. However, its possible that this identical tuple is a NAT_CLASH tuple. In that case the second skb will be sent, but no reply can be received since the reply that is processed first removes the NAT_CLASH tuple. Do not auto-delete, this gives a 1 second window for replies to be passed back to originator. Packets that are coming later (udp stream case) will not be affected: they match the original ct entry, not a NAT_CLASH one. Also prevent NAT_CLASH entries from getting offloaded. Fixes: 6a757c07e51f ("netfilter: conntrack: allow insertion of clashing entries") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_udp.c | 26 ++++++++++---------------- net/netfilter/nft_flow_offload.c | 2 +- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 760ca2422816..af402f458ee0 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -81,18 +81,6 @@ static bool udp_error(struct sk_buff *skb, return false; } -static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct, - struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - u32 extra_jiffies) -{ - if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY && - ct->status & IPS_NAT_CLASH)) - nf_ct_kill(ct); - else - nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies); -} - /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, @@ -124,12 +112,15 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, nf_ct_refresh_acct(ct, ctinfo, skb, extra); + /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ + if (unlikely((ct->status & IPS_NAT_CLASH))) + return NF_ACCEPT; + /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, - timeouts[UDP_CT_UNREPLIED]); + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } @@ -206,12 +197,15 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct, if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_REPLIED]); + + if (unlikely((ct->status & IPS_NAT_CLASH))) + return NF_ACCEPT; + /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, - timeouts[UDP_CT_UNREPLIED]); + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 3b9b97aa4b32..3a6c84fb2c90 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -102,7 +102,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, } if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || - ct->status & IPS_SEQ_ADJUST) + ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) goto out; if (!nf_ct_is_confirmed(ct)) From bb8872a1e6bc911869a729240781076ed950764b Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Sun, 30 Aug 2020 02:37:55 +0700 Subject: [PATCH 088/110] tipc: fix using smp_processor_id() in preemptible The 'this_cpu_ptr()' is used to obtain the AEAD key' TFM on the current CPU for encryption, however the execution can be preemptible since it's actually user-space context, so the 'using smp_processor_id() in preemptible' has been observed. We fix the issue by using the 'get/put_cpu_ptr()' API which consists of a 'preempt_disable()' instead. Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/crypto.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index c38babaa4e57..7c523dc81575 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -326,7 +326,8 @@ static void tipc_aead_free(struct rcu_head *rp) if (aead->cloned) { tipc_aead_put(aead->cloned); } else { - head = *this_cpu_ptr(aead->tfm_entry); + head = *get_cpu_ptr(aead->tfm_entry); + put_cpu_ptr(aead->tfm_entry); list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) { crypto_free_aead(tfm_entry->tfm); list_del(&tfm_entry->list); @@ -399,10 +400,15 @@ static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) */ static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) { - struct tipc_tfm **tfm_entry = this_cpu_ptr(aead->tfm_entry); + struct tipc_tfm **tfm_entry; + struct crypto_aead *tfm; + tfm_entry = get_cpu_ptr(aead->tfm_entry); *tfm_entry = list_next_entry(*tfm_entry, list); - return (*tfm_entry)->tfm; + tfm = (*tfm_entry)->tfm; + put_cpu_ptr(tfm_entry); + + return tfm; } /** From 9dda51101a77abb05ae49612164523bb91bf92f0 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 27 Aug 2020 16:44:22 -0700 Subject: [PATCH 089/110] ionic: fix txrx work accounting Take the tx accounting out of the work_done calculation to prevent a possible duplicate napi_schedule call when under high Tx stress but low Rx traffic. Fixes: b14e4e95f9ec ("ionic: tx separate servicing") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 8107d32c2767..def65fee27b5 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -496,9 +496,7 @@ int ionic_txrx_napi(struct napi_struct *napi, int budget) struct ionic_cq *txcq; u32 rx_work_done = 0; u32 tx_work_done = 0; - u32 work_done = 0; u32 flags = 0; - bool unmask; lif = rxcq->bound_q->lif; idev = &lif->ionic->idev; @@ -512,17 +510,12 @@ int ionic_txrx_napi(struct napi_struct *napi, int budget) if (rx_work_done) ionic_rx_fill_cb(rxcq->bound_q); - unmask = (rx_work_done < budget) && (tx_work_done < lif->tx_budget); - - if (unmask && napi_complete_done(napi, rx_work_done)) { + if (rx_work_done < budget && napi_complete_done(napi, rx_work_done)) { flags |= IONIC_INTR_CRED_UNMASK; DEBUG_STATS_INTR_REARM(rxcq->bound_intr); - work_done = rx_work_done; - } else { - work_done = budget; } - if (work_done || flags) { + if (rx_work_done || flags) { flags |= IONIC_INTR_CRED_RESET_COALESCE; ionic_intr_credits(idev->intr_ctrl, rxcq->bound_intr->index, tx_work_done + rx_work_done, flags); @@ -531,7 +524,7 @@ int ionic_txrx_napi(struct napi_struct *napi, int budget) DEBUG_STATS_NAPI_POLL(qcq, rx_work_done); DEBUG_STATS_NAPI_POLL(qcq, tx_work_done); - return work_done; + return rx_work_done; } static dma_addr_t ionic_tx_map_single(struct ionic_queue *q, From 1a545ebe380bf4c1433e3c136e35a77764fda5ad Mon Sep 17 00:00:00 2001 From: Xie He Date: Fri, 28 Aug 2020 00:07:52 -0700 Subject: [PATCH 090/110] drivers/net/wan/hdlc_cisco: Add hard_header_len This driver didn't set hard_header_len. This patch sets hard_header_len for it according to its header_ops->create function. This driver's header_ops->create function (cisco_hard_header) creates a header of (struct hdlc_header), so hard_header_len should be set to sizeof(struct hdlc_header). Cc: Martin Schiller Signed-off-by: Xie He Acked-by: Krzysztof Halasa Signed-off-by: David S. Miller --- drivers/net/wan/hdlc_cisco.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c index d8cba3625c18..444130655d8e 100644 --- a/drivers/net/wan/hdlc_cisco.c +++ b/drivers/net/wan/hdlc_cisco.c @@ -370,6 +370,7 @@ static int cisco_ioctl(struct net_device *dev, struct ifreq *ifr) memcpy(&state(hdlc)->settings, &new_settings, size); spin_lock_init(&state(hdlc)->lock); dev->header_ops = &cisco_header_ops; + dev->hard_header_len = sizeof(struct hdlc_header); dev->type = ARPHRD_CISCO; call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); netif_dormant_on(dev); From 6b6382a857d824c0866056d5736bbcb597a922ed Mon Sep 17 00:00:00 2001 From: Potnuri Bharat Teja Date: Fri, 28 Aug 2020 21:14:40 +0530 Subject: [PATCH 091/110] cxgb4: fix thermal zone device registration When multiple adapters are present in the system, pci hot-removing second adapter leads to the following warning as both the adapters registered thermal zone device with same thermal zone name/type. Therefore, use unique thermal zone name during thermal zone device initialization. Also mark thermal zone dev NULL once unregistered. [ 414.370143] ------------[ cut here ]------------ [ 414.370944] sysfs group 'power' not found for kobject 'hwmon0' [ 414.371747] WARNING: CPU: 9 PID: 2661 at fs/sysfs/group.c:281 sysfs_remove_group+0x76/0x80 [ 414.382550] CPU: 9 PID: 2661 Comm: bash Not tainted 5.8.0-rc6+ #33 [ 414.383593] Hardware name: Supermicro X10SRA-F/X10SRA-F, BIOS 2.0a 06/23/2016 [ 414.384669] RIP: 0010:sysfs_remove_group+0x76/0x80 [ 414.385738] Code: 48 89 df 5b 5d 41 5c e9 d8 b5 ff ff 48 89 df e8 60 b0 ff ff eb cb 49 8b 14 24 48 8b 75 00 48 c7 c7 90 ae 13 bb e8 6a 27 d0 ff <0f> 0b 5b 5d 41 5c c3 0f 1f 00 0f 1f 44 00 00 48 85 f6 74 31 41 54 [ 414.388404] RSP: 0018:ffffa22bc080fcb0 EFLAGS: 00010286 [ 414.389638] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 414.390829] RDX: 0000000000000001 RSI: ffff8ee2de3e9510 RDI: ffff8ee2de3e9510 [ 414.392064] RBP: ffffffffbaef2ee0 R08: 0000000000000000 R09: 0000000000000000 [ 414.393224] R10: 0000000000000000 R11: 000000002b30006c R12: ffff8ee260720008 [ 414.394388] R13: ffff8ee25e0a40e8 R14: ffffa22bc080ff08 R15: ffff8ee2c3be5020 [ 414.395661] FS: 00007fd2a7171740(0000) GS:ffff8ee2de200000(0000) knlGS:0000000000000000 [ 414.396825] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 414.398011] CR2: 00007f178ffe5020 CR3: 000000084c5cc003 CR4: 00000000003606e0 [ 414.399172] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 414.400352] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 414.401473] Call Trace: [ 414.402685] device_del+0x89/0x400 [ 414.403819] device_unregister+0x16/0x60 [ 414.405024] hwmon_device_unregister+0x44/0xa0 [ 414.406112] thermal_remove_hwmon_sysfs+0x196/0x200 [ 414.407256] thermal_zone_device_unregister+0x1b5/0x1f0 [ 414.408415] cxgb4_thermal_remove+0x3c/0x4f [cxgb4] [ 414.409668] remove_one+0x212/0x290 [cxgb4] [ 414.410875] pci_device_remove+0x36/0xb0 [ 414.412004] device_release_driver_internal+0xe2/0x1c0 [ 414.413276] pci_stop_bus_device+0x64/0x90 [ 414.414433] pci_stop_and_remove_bus_device_locked+0x16/0x30 [ 414.415609] remove_store+0x75/0x90 [ 414.416790] kernfs_fop_write+0x114/0x1b0 [ 414.417930] vfs_write+0xcf/0x210 [ 414.419059] ksys_write+0xa7/0xe0 [ 414.420120] do_syscall_64+0x4c/0xa0 [ 414.421278] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 414.422335] RIP: 0033:0x7fd2a686afd0 [ 414.423396] Code: Bad RIP value. [ 414.424549] RSP: 002b:00007fffc1446148 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 414.425638] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fd2a686afd0 [ 414.426830] RDX: 0000000000000002 RSI: 00007fd2a7196000 RDI: 0000000000000001 [ 414.427927] RBP: 00007fd2a7196000 R08: 000000000000000a R09: 00007fd2a7171740 [ 414.428923] R10: 00007fd2a7171740 R11: 0000000000000246 R12: 00007fd2a6b43400 [ 414.430082] R13: 0000000000000002 R14: 0000000000000001 R15: 0000000000000000 [ 414.431027] irq event stamp: 76300 [ 414.435678] ---[ end trace 13865acb4d5ab00f ]--- Fixes: b18719157762 ("cxgb4: Add thermal zone support") Signed-off-by: Potnuri Bharat Teja Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_thermal.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_thermal.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_thermal.c index e3510e9b21f3..9a6d65243334 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_thermal.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_thermal.c @@ -62,6 +62,7 @@ static struct thermal_zone_device_ops cxgb4_thermal_ops = { int cxgb4_thermal_init(struct adapter *adap) { struct ch_thermal *ch_thermal = &adap->ch_thermal; + char ch_tz_name[THERMAL_NAME_LENGTH]; int num_trip = CXGB4_NUM_TRIPS; u32 param, val; int ret; @@ -82,7 +83,8 @@ int cxgb4_thermal_init(struct adapter *adap) ch_thermal->trip_type = THERMAL_TRIP_CRITICAL; } - ch_thermal->tzdev = thermal_zone_device_register("cxgb4", num_trip, + snprintf(ch_tz_name, sizeof(ch_tz_name), "cxgb4_%s", adap->name); + ch_thermal->tzdev = thermal_zone_device_register(ch_tz_name, num_trip, 0, adap, &cxgb4_thermal_ops, NULL, 0, 0); @@ -105,7 +107,9 @@ int cxgb4_thermal_init(struct adapter *adap) int cxgb4_thermal_remove(struct adapter *adap) { - if (adap->ch_thermal.tzdev) + if (adap->ch_thermal.tzdev) { thermal_zone_device_unregister(adap->ch_thermal.tzdev); + adap->ch_thermal.tzdev = NULL; + } return 0; } From c2f89219f559502c9292d79f695bab9dcec532d1 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Sun, 30 Aug 2020 20:34:32 +0300 Subject: [PATCH 092/110] net: ethernet: ti: am65-cpsw: fix rmii 100Mbit link mode In RMII link mode it's required to set bit 15 IFCTL_A in MAC_SL MAC_CONTROL register to enable support for 100Mbit link speed. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Signed-off-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index cb994f66c3be..9baf3f3da91e 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -174,6 +174,8 @@ void am65_cpsw_nuss_adjust_link(struct net_device *ndev) if (phy->speed == 10 && phy_interface_is_rgmii(phy)) /* Can be used with in band mode only */ mac_control |= CPSW_SL_CTL_EXT_EN; + if (phy->speed == 100 && phy->interface == PHY_INTERFACE_MODE_RMII) + mac_control |= CPSW_SL_CTL_IFCTL_A; if (phy->duplex) mac_control |= CPSW_SL_CTL_FULLDUPLEX; From 355db39110100b968c8291a1a7d897f92e17a8df Mon Sep 17 00:00:00 2001 From: Leesoo Ahn Date: Tue, 1 Sep 2020 22:04:47 +0900 Subject: [PATCH 093/110] pktgen: fix error message with wrong function name Error on calling kthread_create_on_node prints wrong function name, kernel_thread. Fixes: 94dcf29a11b3 ("kthread: use kthread_create_on_node()") Signed-off-by: Leesoo Ahn Acked-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b53b6d38c4df..fce968b673b3 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3699,7 +3699,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) cpu_to_node(cpu), "kpktgend_%d", cpu); if (IS_ERR(p)) { - pr_err("kernel_thread() failed for cpu %d\n", t->cpu); + pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu); list_del(&t->th_list); kfree(t); return PTR_ERR(p); From cbedcb044e9cc4e14bbe6658111224bb923094f4 Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Mon, 31 Aug 2020 22:37:09 +0800 Subject: [PATCH 094/110] net: ethernet: mlx4: Fix memory allocation in mlx4_buddy_init() On machines with much memory (> 2 TByte) and log_mtts_per_seg == 0, a max_order of 31 will be passed to mlx_buddy_init(), which results in s = BITS_TO_LONGS(1 << 31) becoming a negative value, leading to kvmalloc_array() failure when it is converted to size_t. mlx4_core 0000:b1:00.0: Failed to initialize memory region table, aborting mlx4_core: probe of 0000:b1:00.0 failed with error -12 Fix this issue by changing the left shifting operand from a signed literal to an unsigned one. Fixes: 225c7b1feef1 ("IB/mlx4: Add a driver Mellanox ConnectX InfiniBand adapters") Signed-off-by: Shung-Hsi Yu Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index d2986f1f2db0..d7444782bfdd 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -114,7 +114,7 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) goto err_out; for (i = 0; i <= buddy->max_order; ++i) { - s = BITS_TO_LONGS(1 << (buddy->max_order - i)); + s = BITS_TO_LONGS(1UL << (buddy->max_order - i)); buddy->bits[i] = kvmalloc_array(s, sizeof(long), GFP_KERNEL | __GFP_ZERO); if (!buddy->bits[i]) goto err_out_free; From ae6961de541d2307076d02fe9ae31f25c9f13611 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Tue, 1 Sep 2020 10:39:09 +0800 Subject: [PATCH 095/110] vhost: fix typo in error message "enable" should be "disable" when the function name is vhost_disable_notify(), which does the disabling work. Signed-off-by: Yunsheng Lin Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 5857d4eec9d7..b45519ca66a7 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2537,7 +2537,7 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { r = vhost_update_used_flags(vq); if (r) - vq_err(vq, "Failed to enable notification at %p: %d\n", + vq_err(vq, "Failed to disable notification at %p: %d\n", &vq->used->flags, r); } } From a609d0259183a841621f252e067f40f8cc25d6f6 Mon Sep 17 00:00:00 2001 From: Kamil Lorenc Date: Tue, 1 Sep 2020 10:57:38 +0200 Subject: [PATCH 096/110] net: usb: dm9601: Add USB ID of Keenetic Plus DSL Keenetic Plus DSL is a xDSL modem that uses dm9620 as its USB interface. Signed-off-by: Kamil Lorenc Signed-off-by: David S. Miller --- drivers/net/usb/dm9601.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c index b91f92e4e5f2..915ac75b55fc 100644 --- a/drivers/net/usb/dm9601.c +++ b/drivers/net/usb/dm9601.c @@ -625,6 +625,10 @@ static const struct usb_device_id products[] = { USB_DEVICE(0x0a46, 0x1269), /* DM9621A USB to Fast Ethernet Adapter */ .driver_info = (unsigned long)&dm9601_info, }, + { + USB_DEVICE(0x0586, 0x3427), /* ZyXEL Keenetic Plus DSL xDSL modem */ + .driver_info = (unsigned long)&dm9601_info, + }, {}, // END }; From 7deedd9f0e43bb3f4ea531b3ea9ecc27bf1f4871 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Wed, 2 Sep 2020 09:28:07 +0000 Subject: [PATCH 097/110] amd-xgbe: Add support for new port mode Add support for a new port mode that is a backplane connection without support for auto negotiation. Signed-off-by: Shyam Sundar S K Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index 46c3c1ca38d6..859ded0c06b0 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -166,6 +166,7 @@ enum xgbe_port_mode { XGBE_PORT_MODE_10GBASE_T, XGBE_PORT_MODE_10GBASE_R, XGBE_PORT_MODE_SFP, + XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG, XGBE_PORT_MODE_MAX, }; @@ -1634,6 +1635,7 @@ static enum xgbe_mode xgbe_phy_an73_redrv_outcome(struct xgbe_prv_data *pdata) if (ad_reg & 0x80) { switch (phy_data->port_mode) { case XGBE_PORT_MODE_BACKPLANE: + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: mode = XGBE_MODE_KR; break; default: @@ -1643,6 +1645,7 @@ static enum xgbe_mode xgbe_phy_an73_redrv_outcome(struct xgbe_prv_data *pdata) } else if (ad_reg & 0x20) { switch (phy_data->port_mode) { case XGBE_PORT_MODE_BACKPLANE: + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: mode = XGBE_MODE_KX_1000; break; case XGBE_PORT_MODE_1000BASE_X: @@ -1782,6 +1785,7 @@ static void xgbe_phy_an_advertising(struct xgbe_prv_data *pdata, switch (phy_data->port_mode) { case XGBE_PORT_MODE_BACKPLANE: + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: XGBE_SET_ADV(dlks, 10000baseKR_Full); break; case XGBE_PORT_MODE_BACKPLANE_2500: @@ -1874,6 +1878,7 @@ static enum xgbe_an_mode xgbe_phy_an_mode(struct xgbe_prv_data *pdata) switch (phy_data->port_mode) { case XGBE_PORT_MODE_BACKPLANE: return XGBE_AN_MODE_CL73; + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: case XGBE_PORT_MODE_BACKPLANE_2500: return XGBE_AN_MODE_NONE; case XGBE_PORT_MODE_1000BASE_T: @@ -2156,6 +2161,7 @@ static enum xgbe_mode xgbe_phy_switch_mode(struct xgbe_prv_data *pdata) switch (phy_data->port_mode) { case XGBE_PORT_MODE_BACKPLANE: + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: return xgbe_phy_switch_bp_mode(pdata); case XGBE_PORT_MODE_BACKPLANE_2500: return xgbe_phy_switch_bp_2500_mode(pdata); @@ -2251,6 +2257,7 @@ static enum xgbe_mode xgbe_phy_get_mode(struct xgbe_prv_data *pdata, switch (phy_data->port_mode) { case XGBE_PORT_MODE_BACKPLANE: + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: return xgbe_phy_get_bp_mode(speed); case XGBE_PORT_MODE_BACKPLANE_2500: return xgbe_phy_get_bp_2500_mode(speed); @@ -2426,6 +2433,7 @@ static bool xgbe_phy_use_mode(struct xgbe_prv_data *pdata, enum xgbe_mode mode) switch (phy_data->port_mode) { case XGBE_PORT_MODE_BACKPLANE: + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: return xgbe_phy_use_bp_mode(pdata, mode); case XGBE_PORT_MODE_BACKPLANE_2500: return xgbe_phy_use_bp_2500_mode(pdata, mode); @@ -2515,6 +2523,7 @@ static bool xgbe_phy_valid_speed(struct xgbe_prv_data *pdata, int speed) switch (phy_data->port_mode) { case XGBE_PORT_MODE_BACKPLANE: + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: return xgbe_phy_valid_speed_bp_mode(speed); case XGBE_PORT_MODE_BACKPLANE_2500: return xgbe_phy_valid_speed_bp_2500_mode(speed); @@ -2792,6 +2801,7 @@ static bool xgbe_phy_port_mode_mismatch(struct xgbe_prv_data *pdata) switch (phy_data->port_mode) { case XGBE_PORT_MODE_BACKPLANE: + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: if ((phy_data->port_speeds & XGBE_PHY_PORT_SPEED_1000) || (phy_data->port_speeds & XGBE_PHY_PORT_SPEED_10000)) return false; @@ -2844,6 +2854,7 @@ static bool xgbe_phy_conn_type_mismatch(struct xgbe_prv_data *pdata) switch (phy_data->port_mode) { case XGBE_PORT_MODE_BACKPLANE: + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: case XGBE_PORT_MODE_BACKPLANE_2500: if (phy_data->conn_type == XGBE_CONN_TYPE_BACKPLANE) return false; @@ -3160,6 +3171,8 @@ static int xgbe_phy_init(struct xgbe_prv_data *pdata) /* Backplane support */ case XGBE_PORT_MODE_BACKPLANE: XGBE_SET_SUP(lks, Autoneg); + fallthrough; + case XGBE_PORT_MODE_BACKPLANE_NO_AUTONEG: XGBE_SET_SUP(lks, Pause); XGBE_SET_SUP(lks, Asym_Pause); XGBE_SET_SUP(lks, Backplane); From 1996cf46e4673a25ef2478eb266714f409a98221 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 2 Sep 2020 14:18:45 +0300 Subject: [PATCH 098/110] net: bcmgenet: fix mask check in bcmgenet_validate_flow() VALIDATE_MASK(eth_mask->h_source) is checked twice in a row in bcmgenet_validate_flow(). Add VALIDATE_MASK(eth_mask->h_dest) instead. Fixes: 3e370952287c ("net: bcmgenet: add support for ethtool rxnfc flows") Signed-off-by: Denis Efremov Acked-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 1fecc25767bd..d95f46ef8040 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -1364,7 +1364,7 @@ static int bcmgenet_validate_flow(struct net_device *dev, case ETHER_FLOW: eth_mask = &cmd->fs.m_u.ether_spec; /* don't allow mask which isn't valid */ - if (VALIDATE_MASK(eth_mask->h_source) || + if (VALIDATE_MASK(eth_mask->h_dest) || VALIDATE_MASK(eth_mask->h_source) || VALIDATE_MASK(eth_mask->h_proto)) { netdev_err(dev, "rxnfc: Unsupported mask\n"); From eb0f3bc463d59d86402f19c59aa44e82dc3fab6d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 2 Sep 2020 14:56:31 +0300 Subject: [PATCH 099/110] net: gemini: Fix another missing clk_disable_unprepare() in probe We recently added some calls to clk_disable_unprepare() but we missed the last error path if register_netdev() fails. I made a couple cleanups so we avoid mistakes like this in the future. First I reversed the "if (!ret)" condition and pulled the code in one indent level. Also, the "port->netdev = NULL;" is not required because "port" isn't used again outside this function so I deleted that line. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/cortina/gemini.c | 36 +++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 62e271aea4a5..ffec0f3dd957 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -2446,8 +2446,8 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev) port->reset = devm_reset_control_get_exclusive(dev, NULL); if (IS_ERR(port->reset)) { dev_err(dev, "no reset\n"); - clk_disable_unprepare(port->pclk); - return PTR_ERR(port->reset); + ret = PTR_ERR(port->reset); + goto unprepare; } reset_control_reset(port->reset); usleep_range(100, 500); @@ -2502,25 +2502,25 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev) IRQF_SHARED, port_names[port->id], port); - if (ret) { - clk_disable_unprepare(port->pclk); - return ret; - } + if (ret) + goto unprepare; ret = register_netdev(netdev); - if (!ret) { - netdev_info(netdev, - "irq %d, DMA @ 0x%pap, GMAC @ 0x%pap\n", - port->irq, &dmares->start, - &gmacres->start); - ret = gmac_setup_phy(netdev); - if (ret) - netdev_info(netdev, - "PHY init failed, deferring to ifup time\n"); - return 0; - } + if (ret) + goto unprepare; - port->netdev = NULL; + netdev_info(netdev, + "irq %d, DMA @ 0x%pap, GMAC @ 0x%pap\n", + port->irq, &dmares->start, + &gmacres->start); + ret = gmac_setup_phy(netdev); + if (ret) + netdev_info(netdev, + "PHY init failed, deferring to ifup time\n"); + return 0; + +unprepare: + clk_disable_unprepare(port->pclk); return ret; } From 2b7bcd967a0f5b7ac9bb0c37b92de36e073dd119 Mon Sep 17 00:00:00 2001 From: Xie He Date: Wed, 2 Sep 2020 05:07:06 -0700 Subject: [PATCH 100/110] drivers/net/wan/hdlc: Change the default of hard_header_len to 0 Change the default value of hard_header_len in hdlc.c from 16 to 0. Currently there are 6 HDLC protocol drivers, among them: hdlc_raw_eth, hdlc_cisco, hdlc_ppp, hdlc_x25 set hard_header_len when attaching the protocol, overriding the default. So this patch does not affect them. hdlc_raw and hdlc_fr don't set hard_header_len when attaching the protocol. So this patch will change the hard_header_len of the HDLC device for them from 16 to 0. This is the correct change because both hdlc_raw and hdlc_fr don't have header_ops, and the code in net/packet/af_packet.c expects the value of hard_header_len to be consistent with header_ops. In net/packet/af_packet.c, in the packet_snd function, for AF_PACKET/DGRAM sockets it would reserve a headroom of hard_header_len and call dev_hard_header to fill in that headroom, and for AF_PACKET/RAW sockets, it does not reserve the headroom and does not call dev_hard_header, but checks if the user has provided a header of length hard_header_len (in function dev_validate_header). Cc: Krzysztof Halasa Cc: Martin Schiller Signed-off-by: Xie He Signed-off-by: David S. Miller --- drivers/net/wan/hdlc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wan/hdlc.c b/drivers/net/wan/hdlc.c index 386ed2aa31fd..9b00708676cf 100644 --- a/drivers/net/wan/hdlc.c +++ b/drivers/net/wan/hdlc.c @@ -229,7 +229,7 @@ static void hdlc_setup_dev(struct net_device *dev) dev->min_mtu = 68; dev->max_mtu = HDLC_MAX_MTU; dev->type = ARPHRD_RAWHDLC; - dev->hard_header_len = 16; + dev->hard_header_len = 0; dev->needed_headroom = 0; dev->addr_len = 0; dev->header_ops = &hdlc_null_ops; From 05d4487197b2b71d5363623c28924fd58c71c0b6 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 2 Sep 2020 16:16:59 +0300 Subject: [PATCH 101/110] ipv6: Fix sysctl max for fib_multipath_hash_policy Cited commit added the possible value of '2', but it cannot be set. Fix it by adjusting the maximum value to '2'. This is consistent with the corresponding IPv4 sysctl. Before: # sysctl -w net.ipv6.fib_multipath_hash_policy=2 sysctl: setting key "net.ipv6.fib_multipath_hash_policy": Invalid argument net.ipv6.fib_multipath_hash_policy = 2 # sysctl net.ipv6.fib_multipath_hash_policy net.ipv6.fib_multipath_hash_policy = 0 After: # sysctl -w net.ipv6.fib_multipath_hash_policy=2 net.ipv6.fib_multipath_hash_policy = 2 # sysctl net.ipv6.fib_multipath_hash_policy net.ipv6.fib_multipath_hash_policy = 2 Fixes: d8f74f0975d8 ("ipv6: Support multipath hashing on inner IP pkts") Signed-off-by: Ido Schimmel Reviewed-by: Stephen Suryaputra Signed-off-by: David S. Miller --- net/ipv6/sysctl_net_ipv6.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index fac2135aa47b..5b60a4bdd36a 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -21,6 +21,7 @@ #include #endif +static int two = 2; static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; @@ -150,7 +151,7 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = &two, }, { .procname = "seg6_flowlabel", From 2a63866c8b51a3f72cea388dfac259d0e14c4ba6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 2 Sep 2020 22:44:16 +0900 Subject: [PATCH 102/110] tipc: fix shutdown() of connectionless socket syzbot is reporting hung task at nbd_ioctl() [1], for there are two problems regarding TIPC's connectionless socket's shutdown() operation. ---------- #include #include #include #include #include int main(int argc, char *argv[]) { const int fd = open("/dev/nbd0", 3); alarm(5); ioctl(fd, NBD_SET_SOCK, socket(PF_TIPC, SOCK_DGRAM, 0)); ioctl(fd, NBD_DO_IT, 0); /* To be interrupted by SIGALRM. */ return 0; } ---------- One problem is that wait_for_completion() from flush_workqueue() from nbd_start_device_ioctl() from nbd_ioctl() cannot be completed when nbd_start_device_ioctl() received a signal at wait_event_interruptible(), for tipc_shutdown() from kernel_sock_shutdown(SHUT_RDWR) from nbd_mark_nsock_dead() from sock_shutdown() from nbd_start_device_ioctl() is failing to wake up a WQ thread sleeping at wait_woken() from tipc_wait_for_rcvmsg() from sock_recvmsg() from sock_xmit() from nbd_read_stat() from recv_work() scheduled by nbd_start_device() from nbd_start_device_ioctl(). Fix this problem by always invoking sk->sk_state_change() (like inet_shutdown() does) when tipc_shutdown() is called. The other problem is that tipc_wait_for_rcvmsg() cannot return when tipc_shutdown() is called, for tipc_shutdown() sets sk->sk_shutdown to SEND_SHUTDOWN (despite "how" is SHUT_RDWR) while tipc_wait_for_rcvmsg() needs sk->sk_shutdown set to RCV_SHUTDOWN or SHUTDOWN_MASK. Fix this problem by setting sk->sk_shutdown to SHUTDOWN_MASK (like inet_shutdown() does) when the socket is connectionless. [1] https://syzkaller.appspot.com/bug?id=3fe51d307c1f0a845485cf1798aa059d12bf18b2 Reported-by: syzbot Signed-off-by: Tetsuo Handa Signed-off-by: David S. Miller --- net/tipc/socket.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 07419f36116a..d49c6173954a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2771,18 +2771,21 @@ static int tipc_shutdown(struct socket *sock, int how) trace_tipc_sk_shutdown(sk, NULL, TIPC_DUMP_ALL, " "); __tipc_shutdown(sock, TIPC_CONN_SHUTDOWN); - sk->sk_shutdown = SEND_SHUTDOWN; + if (tipc_sk_type_connectionless(sk)) + sk->sk_shutdown = SHUTDOWN_MASK; + else + sk->sk_shutdown = SEND_SHUTDOWN; if (sk->sk_state == TIPC_DISCONNECTING) { /* Discard any unreceived messages */ __skb_queue_purge(&sk->sk_receive_queue); - /* Wake up anyone sleeping in poll */ - sk->sk_state_change(sk); res = 0; } else { res = -ENOTCONN; } + /* Wake up anyone sleeping in poll. */ + sk->sk_state_change(sk); release_sock(sk); return res; From f614e536704d37326b0975da9cc33dd61d28c378 Mon Sep 17 00:00:00 2001 From: Louis Peens Date: Wed, 2 Sep 2020 17:04:58 +0200 Subject: [PATCH 103/110] nfp: flower: fix ABI mismatch between driver and firmware Fix an issue where the driver wrongly detected ipv6 neighbour updates from the NFP as corrupt. Add a reserved field on the kernel side so it is similar to the ipv4 version of the struct and has space for the extra bytes from the card. Fixes: 9ea9bfa12240 ("nfp: flower: support ipv6 tunnel keep-alive messages from fw") Signed-off-by: Louis Peens Signed-off-by: Simon Horman Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index 2df3deedf9fd..7248d248f604 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -61,6 +61,7 @@ struct nfp_tun_active_tuns { * @flags: options part of the request * @tun_info.ipv6: dest IPv6 address of active route * @tun_info.egress_port: port the encapsulated packet egressed + * @tun_info.extra: reserved for future use * @tun_info: tunnels that have sent traffic in reported period */ struct nfp_tun_active_tuns_v6 { @@ -70,6 +71,7 @@ struct nfp_tun_active_tuns_v6 { struct route_ip_info_v6 { struct in6_addr ipv6; __be32 egress_port; + __be32 extra[2]; } tun_info[]; }; From 8b4a11c67da538504d60ae917ffe5254f59b1248 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 2 Sep 2020 14:27:04 -0500 Subject: [PATCH 104/110] net: dp83867: Fix WoL SecureOn password Fix the registers being written to as the values were being over written when writing the same registers. Fixes: caabee5b53f5 ("net: phy: dp83867: support Wake on LAN") Signed-off-by: Dan Murphy Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/dp83867.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index f3c04981b8da..cd7032628a28 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -215,9 +215,9 @@ static int dp83867_set_wol(struct phy_device *phydev, if (wol->wolopts & WAKE_MAGICSECURE) { phy_write_mmd(phydev, DP83867_DEVADDR, DP83867_RXFSOP1, (wol->sopass[1] << 8) | wol->sopass[0]); - phy_write_mmd(phydev, DP83867_DEVADDR, DP83867_RXFSOP1, + phy_write_mmd(phydev, DP83867_DEVADDR, DP83867_RXFSOP2, (wol->sopass[3] << 8) | wol->sopass[2]); - phy_write_mmd(phydev, DP83867_DEVADDR, DP83867_RXFSOP1, + phy_write_mmd(phydev, DP83867_DEVADDR, DP83867_RXFSOP3, (wol->sopass[5] << 8) | wol->sopass[4]); val_rxcfg |= DP83867_WOL_SEC_EN; From af0ae997a3a7ef7e9d0b1358a0983e088084f086 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Thu, 3 Sep 2020 09:49:25 +0100 Subject: [PATCH 105/110] doc: net: dsa: Fix typo in config code sample In the "single port" example code for configuring a DSA switch without tagging support from userspace the command to bring up the "lan2" link was typo'd. Signed-off-by: Paul Barker Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/configuration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/dsa/configuration.rst b/Documentation/networking/dsa/configuration.rst index af029b3ca2ab..11bd5e6108c0 100644 --- a/Documentation/networking/dsa/configuration.rst +++ b/Documentation/networking/dsa/configuration.rst @@ -180,7 +180,7 @@ The configuration can only be set up via VLAN tagging and bridge setup. # bring up the slave interfaces ip link set lan1 up - ip link set lan1 up + ip link set lan2 up ip link set lan3 up # create bridge From 556699341efa98243e08e34401b3f601da91f5a3 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 3 Sep 2020 14:28:54 -0400 Subject: [PATCH 106/110] tg3: Fix soft lockup when tg3_reset_task() fails. If tg3_reset_task() fails, the device state is left in an inconsistent state with IFF_RUNNING still set but NAPI state not enabled. A subsequent operation, such as ifdown or AER error can cause it to soft lock up when it tries to disable NAPI state. Fix it by bringing down the device to !IFF_RUNNING state when tg3_reset_task() fails. tg3_reset_task() running from workqueue will now call tg3_close() when the reset fails. We need to modify tg3_reset_task_cancel() slightly to avoid tg3_close() calling cancel_work_sync() to cancel tg3_reset_task(). Otherwise cancel_work_sync() will wait forever for tg3_reset_task() to finish. Reported-by: David Christensen Reported-by: Baptiste Covolato Fixes: db2199737990 ("tg3: Schedule at most one tg3_reset_task run") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index ebff1fc0d8ce..4515804d1ce4 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -7221,8 +7221,8 @@ static inline void tg3_reset_task_schedule(struct tg3 *tp) static inline void tg3_reset_task_cancel(struct tg3 *tp) { - cancel_work_sync(&tp->reset_task); - tg3_flag_clear(tp, RESET_TASK_PENDING); + if (test_and_clear_bit(TG3_FLAG_RESET_TASK_PENDING, tp->tg3_flags)) + cancel_work_sync(&tp->reset_task); tg3_flag_clear(tp, TX_RECOVERY_PENDING); } @@ -11209,18 +11209,27 @@ static void tg3_reset_task(struct work_struct *work) tg3_halt(tp, RESET_KIND_SHUTDOWN, 0); err = tg3_init_hw(tp, true); - if (err) + if (err) { + tg3_full_unlock(tp); + tp->irq_sync = 0; + tg3_napi_enable(tp); + /* Clear this flag so that tg3_reset_task_cancel() will not + * call cancel_work_sync() and wait forever. + */ + tg3_flag_clear(tp, RESET_TASK_PENDING); + dev_close(tp->dev); goto out; + } tg3_netif_start(tp); -out: tg3_full_unlock(tp); if (!err) tg3_phy_start(tp); tg3_flag_clear(tp, RESET_TASK_PENDING); +out: rtnl_unlock(); } From fffe83c8c40a61978f8998a85cc94bb2ca19b6dd Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 3 Sep 2020 21:53:15 +0200 Subject: [PATCH 107/110] net/smc: fix toleration of fake add_link messages Older SMCR implementations had no link failover support and used one link only. Because the handshake protocol requires to try the establishment of a second link the old code sent a fake add_link message and declined any server response afterwards. The current code supports multiple links and inspects the received fake add_link message more closely. To tolerate the fake add_link messages smc_llc_is_local_add_link() needs an improved check of the message to be able to separate between locally enqueued and fake add_link messages. And smc_llc_cli_add_link() needs to check if the provided qp_mtu size is invalid and reject the add_link request in that case. Fixes: c48254fa48e5 ("net/smc: move add link processing for new device into llc layer") Reviewed-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index df5b0a6ea848..3ea33466ebe9 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -841,6 +841,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) struct smc_init_info ini; int lnk_idx, rc = 0; + if (!llc->qp_mtu) + goto out_reject; + ini.vlan_id = lgr->vlan_id; smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && @@ -917,10 +920,20 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link, kfree(qentry); } +static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(llc->raw.data); i++) + if (llc->raw.data[i]) + return false; + return true; +} + static bool smc_llc_is_local_add_link(union smc_llc_msg *llc) { if (llc->raw.hdr.common.type == SMC_LLC_ADD_LINK && - !llc->add_link.qp_mtu && !llc->add_link.link_num) + smc_llc_is_empty_llc_message(llc)) return true; return false; } From 2d2bfeb8c5c8c95a93c473ca2c7d07cad6943a34 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 3 Sep 2020 21:53:16 +0200 Subject: [PATCH 108/110] net/smc: set rx_off for SMCR explicitly SMC tries to make use of SMCD first. If a problem shows up, it tries to switch to SMCR. If the SMCD initializing problem shows up after the SMCD connection has already been initialized, field rx_off keeps the wrong SMCD value for SMCR, which results in corrupted data at the receiver. This patch adds an explicit (re-)setting of field rx_off to zero if the connection uses SMCR. Fixes: be244f28d22f ("net/smc: add SMC-D support in data transfer") Reviewed-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index b42fa3b00d00..a6ac0eb20c99 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1356,6 +1356,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ + } else { + conn->rx_off = 0; } #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); From 1d8df41d896d75c6eababaa912e3a814585d9f61 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 3 Sep 2020 21:53:17 +0200 Subject: [PATCH 109/110] net/smc: reset sndbuf_desc if freed When an SMC connection is created, and there is a problem to create an RMB or DMB, the previously created send buffer is thrown away as well including buffer descriptor freeing. Make sure the connection no longer references the freed buffer descriptor, otherwise bugs like this are possible: [71556.835148] ============================================================================= [71556.835168] BUG kmalloc-128 (Tainted: G B OE ): Poison overwritten [71556.835172] ----------------------------------------------------------------------------- [71556.835179] INFO: 0x00000000d20894be-0x00000000aaef63e9 @offset=2724. First byte 0x0 instead of 0x6b [71556.835215] INFO: Allocated in __smc_buf_create+0x184/0x578 [smc] age=0 cpu=5 pid=46726 [71556.835234] ___slab_alloc+0x5a4/0x690 [71556.835239] __slab_alloc.constprop.0+0x70/0xb0 [71556.835243] kmem_cache_alloc_trace+0x38e/0x3f8 [71556.835250] __smc_buf_create+0x184/0x578 [smc] [71556.835257] smc_buf_create+0x2e/0xe8 [smc] [71556.835264] smc_listen_work+0x516/0x6a0 [smc] [71556.835275] process_one_work+0x280/0x478 [71556.835280] worker_thread+0x66/0x368 [71556.835287] kthread+0x17a/0x1a0 [71556.835294] ret_from_fork+0x28/0x2c [71556.835301] INFO: Freed in smc_buf_create+0xd8/0xe8 [smc] age=0 cpu=5 pid=46726 [71556.835307] __slab_free+0x246/0x560 [71556.835311] kfree+0x398/0x3f8 [71556.835318] smc_buf_create+0xd8/0xe8 [smc] [71556.835324] smc_listen_work+0x516/0x6a0 [smc] [71556.835328] process_one_work+0x280/0x478 [71556.835332] worker_thread+0x66/0x368 [71556.835337] kthread+0x17a/0x1a0 [71556.835344] ret_from_fork+0x28/0x2c [71556.835348] INFO: Slab 0x00000000a0744551 objects=51 used=51 fp=0x0000000000000000 flags=0x1ffff00000010200 [71556.835352] INFO: Object 0x00000000563480a1 @offset=2688 fp=0x00000000289567b2 [71556.835359] Redzone 000000006783cde2: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ [71556.835363] Redzone 00000000e35b876e: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ [71556.835367] Redzone 0000000023074562: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ [71556.835372] Redzone 00000000b9564b8c: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ [71556.835376] Redzone 00000000810c6362: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ [71556.835380] Redzone 0000000065ef52c3: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ [71556.835384] Redzone 00000000c5dd6984: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ [71556.835388] Redzone 000000004c480f8f: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ [71556.835392] Object 00000000563480a1: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk [71556.835397] Object 000000009c479d06: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk [71556.835401] Object 000000006e1dce92: 6b 6b 6b 6b 00 00 00 00 6b 6b 6b 6b 6b 6b 6b 6b kkkk....kkkkkkkk [71556.835405] Object 00000000227f7cf8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk [71556.835410] Object 000000009a701215: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk [71556.835414] Object 000000003731ce76: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk [71556.835418] Object 00000000f7085967: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk [71556.835422] Object 0000000007f99927: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkkkkkkkkkk. [71556.835427] Redzone 00000000579c4913: bb bb bb bb bb bb bb bb ........ [71556.835431] Padding 00000000305aef82: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ [71556.835435] Padding 00000000b1cdd722: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ [71556.835438] Padding 00000000c7568199: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ [71556.835442] Padding 00000000fad4c4d4: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ [71556.835451] CPU: 0 PID: 47939 Comm: kworker/0:15 Tainted: G B OE 5.9.0-rc1uschi+ #54 [71556.835456] Hardware name: IBM 3906 M03 703 (LPAR) [71556.835464] Workqueue: events smc_listen_work [smc] [71556.835470] Call Trace: [71556.835478] [<00000000d5eaeb10>] show_stack+0x90/0xf8 [71556.835493] [<00000000d66fc0f8>] dump_stack+0xa8/0xe8 [71556.835499] [<00000000d61a511c>] check_bytes_and_report+0x104/0x130 [71556.835504] [<00000000d61a57b2>] check_object+0x26a/0x2e0 [71556.835509] [<00000000d61a59bc>] alloc_debug_processing+0x194/0x238 [71556.835514] [<00000000d61a8c14>] ___slab_alloc+0x5a4/0x690 [71556.835519] [<00000000d61a9170>] __slab_alloc.constprop.0+0x70/0xb0 [71556.835524] [<00000000d61aaf66>] kmem_cache_alloc_trace+0x38e/0x3f8 [71556.835530] [<000003ff80549bbc>] __smc_buf_create+0x184/0x578 [smc] [71556.835538] [<000003ff8054a396>] smc_buf_create+0x2e/0xe8 [smc] [71556.835545] [<000003ff80540c16>] smc_listen_work+0x516/0x6a0 [smc] [71556.835549] [<00000000d5f0f448>] process_one_work+0x280/0x478 [71556.835554] [<00000000d5f0f6a6>] worker_thread+0x66/0x368 [71556.835559] [<00000000d5f18692>] kthread+0x17a/0x1a0 [71556.835563] [<00000000d6abf3b8>] ret_from_fork+0x28/0x2c [71556.835569] INFO: lockdep is turned off. [71556.835573] FIX kmalloc-128: Restoring 0x00000000d20894be-0x00000000aaef63e9=0x6b [71556.835577] FIX kmalloc-128: Marking all objects used Fixes: fd7f3a746582 ("net/smc: remove freed buffer from list") Reviewed-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a6ac0eb20c99..a406627b1d55 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1779,6 +1779,7 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) list_del(&smc->conn.sndbuf_desc->list); mutex_unlock(&smc->conn.lgr->sndbufs_lock); smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); + smc->conn.sndbuf_desc = NULL; } return rc; } From 5fb8642a17aa2ab9077372977d67a72d3899a98d Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 3 Sep 2020 21:53:18 +0200 Subject: [PATCH 110/110] net/smc: fix sock refcounting in case of termination When an ISM device is removed, all its linkgroups are terminated, i.e. all the corresponding connections are killed. Connection killing invokes smc_close_active_abort(), which decreases the sock refcount for certain states to simulate passive closing. And it cancels the close worker and has to give up the sock lock for this timeframe. This opens the door for a passive close worker or a socket close to run in between. In this case smc_close_active_abort() and passive close worker resp. smc_release() might do a sock_put for passive closing. This causes: [ 1323.315943] refcount_t: underflow; use-after-free. [ 1323.316055] WARNING: CPU: 3 PID: 54469 at lib/refcount.c:28 refcount_warn_saturate+0xe8/0x130 [ 1323.316069] Kernel panic - not syncing: panic_on_warn set ... [ 1323.316084] CPU: 3 PID: 54469 Comm: uperf Not tainted 5.9.0-20200826.rc2.git0.46328853ed20.300.fc32.s390x+debug #1 [ 1323.316096] Hardware name: IBM 2964 NC9 702 (z/VM 6.4.0) [ 1323.316108] Call Trace: [ 1323.316125] [<00000000c0d4aae8>] show_stack+0x90/0xf8 [ 1323.316143] [<00000000c15989b0>] dump_stack+0xa8/0xe8 [ 1323.316158] [<00000000c0d8344e>] panic+0x11e/0x288 [ 1323.316173] [<00000000c0d83144>] __warn+0xac/0x158 [ 1323.316187] [<00000000c1597a7a>] report_bug+0xb2/0x130 [ 1323.316201] [<00000000c0d36424>] monitor_event_exception+0x44/0xc0 [ 1323.316219] [<00000000c195c716>] pgm_check_handler+0x1da/0x238 [ 1323.316234] [<00000000c151844c>] refcount_warn_saturate+0xec/0x130 [ 1323.316280] ([<00000000c1518448>] refcount_warn_saturate+0xe8/0x130) [ 1323.316310] [<000003ff801f2e2a>] smc_release+0x192/0x1c8 [smc] [ 1323.316323] [<00000000c169f1fa>] __sock_release+0x5a/0xe0 [ 1323.316334] [<00000000c169f2ac>] sock_close+0x2c/0x40 [ 1323.316350] [<00000000c1086de0>] __fput+0xb8/0x278 [ 1323.316362] [<00000000c0db1e0e>] task_work_run+0x76/0xb8 [ 1323.316393] [<00000000c0d8ab84>] do_exit+0x26c/0x520 [ 1323.316408] [<00000000c0d8af08>] do_group_exit+0x48/0xc0 [ 1323.316421] [<00000000c0d8afa8>] __s390x_sys_exit_group+0x28/0x38 [ 1323.316433] [<00000000c195c32c>] system_call+0xe0/0x2b4 [ 1323.316446] 1 lock held by uperf/54469: [ 1323.316456] #0: 0000000044125e60 (&sb->s_type->i_mutex_key#9){+.+.}-{3:3}, at: __sock_release+0x44/0xe0 The patch rechecks sock state in smc_close_active_abort() after smc_close_cancel_work() to avoid duplicate decrease of sock refcount for the same purpose. Fixes: 611b63a12732 ("net/smc: cancel tx worker in case of socket aborts") Reviewed-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_close.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 290270c821ca..35e30d39510d 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -116,7 +116,6 @@ static void smc_close_cancel_work(struct smc_sock *smc) cancel_work_sync(&smc->conn.close_work); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); - sk->sk_state = SMC_CLOSED; } /* terminate smc socket abnormally - active abort @@ -134,22 +133,22 @@ void smc_close_active_abort(struct smc_sock *smc) } switch (sk->sk_state) { case SMC_ACTIVE: - sk->sk_state = SMC_PEERABORTWAIT; - smc_close_cancel_work(smc); - sk->sk_state = SMC_CLOSED; - sock_put(sk); /* passive closing */ - break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: + sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); + if (sk->sk_state != SMC_PEERABORTWAIT) + break; sk->sk_state = SMC_CLOSED; - sock_put(sk); /* postponed passive closing */ + sock_put(sk); /* (postponed) passive closing */ break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); + if (sk->sk_state != SMC_PEERABORTWAIT) + break; sk->sk_state = SMC_CLOSED; smc_conn_free(&smc->conn); release_clcsock = true; @@ -159,6 +158,8 @@ void smc_close_active_abort(struct smc_sock *smc) case SMC_APPFINCLOSEWAIT: sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); + if (sk->sk_state != SMC_PEERABORTWAIT) + break; sk->sk_state = SMC_CLOSED; smc_conn_free(&smc->conn); release_clcsock = true;