forked from luck/tmp_suning_uos_patched
492135557d
This work is a follow-up of commit f7b3bec6f5
("net: allow setting ecn via routing table") and adds RFC3168 section 6.1.1.1. fallback for outgoing ECN connections. In other words, this work adds a retry with a non-ECN setup SYN packet, as suggested by the RFC on the first timeout: [...] A host that receives no reply to an ECN-setup SYN within the normal SYN retransmission timeout interval MAY resend the SYN and any subsequent SYN retransmissions with CWR and ECE cleared. [...] Schematic client-side view when assuming the server is in tcp_ecn=2 mode, that is, the Linux default since 2009 via commit 255cac91c3
("tcp: extend ECN sysctl to allow server-side only ECN"): 1) Normal ECN-capable path: SYN ECE CWR -----> <----- SYN ACK ECE ACK -----> 2) Path with broken middlebox, when client has fallback: SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN -----> <----- SYN ACK ACK -----> In case we would not have the fallback implemented, the middlebox drop point would basically end up as: SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) In any case, it's rather a smaller percentage of sites where there would occur such additional setup latency: it was found in end of 2014 that ~56% of IPv4 and 65% of IPv6 servers of Alexa 1 million list would negotiate ECN (aka tcp_ecn=2 default), 0.42% of these webservers will fail to connect when trying to negotiate with ECN (tcp_ecn=1) due to timeouts, which the fallback would mitigate with a slight latency trade-off. Recent related paper on this topic: Brian Trammell, Mirja Kühlewind, Damiano Boppart, Iain Learmonth, Gorry Fairhurst, and Richard Scheffenegger: "Enabling Internet-Wide Deployment of Explicit Congestion Notification." Proc. PAM 2015, New York. http://ecn.ethz.ch/ecn-pam15.pdf Thus, when net.ipv4.tcp_ecn=1 is being set, the patch will perform RFC3168, section 6.1.1.1. fallback on timeout. For users explicitly not wanting this which can be in DC use case, we add a net.ipv4.tcp_ecn_fallback knob that allows for disabling the fallback. tp->ecn_flags are not being cleared in tcp_ecn_clear_syn() on output, but rather we let tcp_ecn_rcv_synack() take that over on input path in case a SYN ACK ECE was delayed. Thus a spurious SYN retransmission will not prevent ECN being negotiated eventually in that case. 
Reference: https://www.ietf.org/proceedings/92/slides/slides-92-iccrg-1.pdf Reference: https://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Mirja Kühlewind <mirja.kuehlewind@tik.ee.ethz.ch> Signed-off-by: Brian Trammell <trammell@tik.ee.ethz.ch> Cc: Eric Dumazet <edumazet@google.com> Cc: Dave That <dave.taht@gmail.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
112 lines
2.4 KiB
C
112 lines
2.4 KiB
C
/*
 * ipv4 in net namespaces
 */
|
|
|
|
#ifndef __NETNS_IPV4_H__
|
|
#define __NETNS_IPV4_H__
|
|
|
|
#include <linux/uidgid.h>
|
|
#include <net/inet_frag.h>
|
|
#include <linux/rcupdate.h>
|
|
|
|
/*
 * Forward declarations: the definitions below only hold pointers to these
 * types, so the full definitions are not needed in this header.
 */
struct tcpm_hash_bucket;
struct ctl_table_header;
struct ipv4_devconf;
struct fib_rules_ops;
struct hlist_head;
struct fib_table;
struct sock;
|
|
struct local_ports {
|
|
seqlock_t lock;
|
|
int range[2];
|
|
};
|
|
|
|
struct ping_group_range {
|
|
seqlock_t lock;
|
|
kgid_t range[2];
|
|
};
|
|
|
|
struct netns_ipv4 {
|
|
#ifdef CONFIG_SYSCTL
|
|
struct ctl_table_header *forw_hdr;
|
|
struct ctl_table_header *frags_hdr;
|
|
struct ctl_table_header *ipv4_hdr;
|
|
struct ctl_table_header *route_hdr;
|
|
struct ctl_table_header *xfrm4_hdr;
|
|
#endif
|
|
struct ipv4_devconf *devconf_all;
|
|
struct ipv4_devconf *devconf_dflt;
|
|
#ifdef CONFIG_IP_MULTIPLE_TABLES
|
|
struct fib_rules_ops *rules_ops;
|
|
bool fib_has_custom_rules;
|
|
struct fib_table __rcu *fib_local;
|
|
struct fib_table __rcu *fib_main;
|
|
struct fib_table __rcu *fib_default;
|
|
#endif
|
|
#ifdef CONFIG_IP_ROUTE_CLASSID
|
|
int fib_num_tclassid_users;
|
|
#endif
|
|
struct hlist_head *fib_table_hash;
|
|
bool fib_offload_disabled;
|
|
struct sock *fibnl;
|
|
|
|
struct sock * __percpu *icmp_sk;
|
|
struct sock *mc_autojoin_sk;
|
|
|
|
struct inet_peer_base *peers;
|
|
struct sock * __percpu *tcp_sk;
|
|
struct netns_frags frags;
|
|
#ifdef CONFIG_NETFILTER
|
|
struct xt_table *iptable_filter;
|
|
struct xt_table *iptable_mangle;
|
|
struct xt_table *iptable_raw;
|
|
struct xt_table *arptable_filter;
|
|
#ifdef CONFIG_SECURITY
|
|
struct xt_table *iptable_security;
|
|
#endif
|
|
struct xt_table *nat_table;
|
|
#endif
|
|
|
|
int sysctl_icmp_echo_ignore_all;
|
|
int sysctl_icmp_echo_ignore_broadcasts;
|
|
int sysctl_icmp_ignore_bogus_error_responses;
|
|
int sysctl_icmp_ratelimit;
|
|
int sysctl_icmp_ratemask;
|
|
int sysctl_icmp_errors_use_inbound_ifaddr;
|
|
|
|
struct local_ports ip_local_ports;
|
|
|
|
int sysctl_tcp_ecn;
|
|
int sysctl_tcp_ecn_fallback;
|
|
|
|
int sysctl_ip_no_pmtu_disc;
|
|
int sysctl_ip_fwd_use_pmtu;
|
|
int sysctl_ip_nonlocal_bind;
|
|
|
|
int sysctl_fwmark_reflect;
|
|
int sysctl_tcp_fwmark_accept;
|
|
int sysctl_tcp_mtu_probing;
|
|
int sysctl_tcp_base_mss;
|
|
int sysctl_tcp_probe_threshold;
|
|
u32 sysctl_tcp_probe_interval;
|
|
|
|
struct ping_group_range ping_group_range;
|
|
|
|
atomic_t dev_addr_genid;
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
unsigned long *sysctl_local_reserved_ports;
|
|
#endif
|
|
|
|
#ifdef CONFIG_IP_MROUTE
|
|
#ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES
|
|
struct mr_table *mrt;
|
|
#else
|
|
struct list_head mr_tables;
|
|
struct fib_rules_ops *mr_rules_ops;
|
|
#endif
|
|
#endif
|
|
atomic_t rt_genid;
|
|
};
|
|
#endif
|