kernel_optimize_test/net/ipv4/ip_options.c

664 lines
15 KiB
C
Raw Normal View History

/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* The options processing module for ip.c
*
* Authors: A.N.Kuznetsov
*
*/
#define pr_fmt(fmt) "IPv4: " fmt
#include <linux/capability.h>
#include <linux/module.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <linux/types.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/route.h>
#include <net/cipso_ipv4.h>
#include <net/ip_fib.h>
/*
* Write options to IP header, record destination address to
* source route option, address of outgoing interface
* (we should already know it, so that this function is allowed be
* called only after routing decision) and timestamp,
* if we originate this datagram.
*
* daddr is real destination address, next hop is recorded in IP header.
* saddr is address of outgoing interface.
*/
void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
__be32 daddr, struct rtable *rt, int is_frag)
{
unsigned char *iph = skb_network_header(skb);
memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
opt = &(IPCB(skb)->opt);
if (opt->srr)
memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
if (!is_frag) {
if (opt->rr_needaddr)
ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
if (opt->ts_needaddr)
ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
if (opt->ts_needtime) {
struct timespec tv;
__be32 midtime;
getnstimeofday(&tv);
midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
}
return;
}
if (opt->rr) {
memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
opt->rr = 0;
opt->rr_needaddr = 0;
}
if (opt->ts) {
memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
opt->ts = 0;
opt->ts_needaddr = opt->ts_needtime = 0;
}
}
/*
* Provided (sopt, skb) points to received options,
* build in dopt compiled option set appropriate for answering.
* i.e. invert SRR option, copy anothers,
* and grab room in RR/TS options.
*
* NOTE: dopt cannot point to skb.
*/
int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
const struct ip_options *sopt)
{
unsigned char *sptr, *dptr;
int soffset, doffset;
int optlen;
memset(dopt, 0, sizeof(struct ip_options));
if (sopt->optlen == 0)
return 0;
sptr = skb_network_header(skb);
dptr = dopt->__data;
if (sopt->rr) {
optlen = sptr[sopt->rr+1];
soffset = sptr[sopt->rr+2];
dopt->rr = dopt->optlen + sizeof(struct iphdr);
memcpy(dptr, sptr+sopt->rr, optlen);
if (sopt->rr_needaddr && soffset <= optlen) {
if (soffset + 3 > optlen)
return -EINVAL;
dptr[2] = soffset + 4;
dopt->rr_needaddr = 1;
}
dptr += optlen;
dopt->optlen += optlen;
}
if (sopt->ts) {
optlen = sptr[sopt->ts+1];
soffset = sptr[sopt->ts+2];
dopt->ts = dopt->optlen + sizeof(struct iphdr);
memcpy(dptr, sptr+sopt->ts, optlen);
if (soffset <= optlen) {
if (sopt->ts_needaddr) {
if (soffset + 3 > optlen)
return -EINVAL;
dopt->ts_needaddr = 1;
soffset += 4;
}
if (sopt->ts_needtime) {
if (soffset + 3 > optlen)
return -EINVAL;
if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
dopt->ts_needtime = 1;
soffset += 4;
} else {
dopt->ts_needtime = 0;
if (soffset + 7 <= optlen) {
__be32 addr;
memcpy(&addr, dptr+soffset-1, 4);
if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
dopt->ts_needtime = 1;
soffset += 8;
}
}
}
}
dptr[2] = soffset;
}
dptr += optlen;
dopt->optlen += optlen;
}
if (sopt->srr) {
unsigned char *start = sptr+sopt->srr;
__be32 faddr;
optlen = start[1];
soffset = start[2];
doffset = 0;
if (soffset > optlen)
soffset = optlen + 1;
soffset -= 4;
if (soffset > 3) {
memcpy(&faddr, &start[soffset-1], 4);
for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4)
memcpy(&dptr[doffset-1], &start[soffset-1], 4);
/*
* RFC1812 requires to fix illegal source routes.
*/
if (memcmp(&ip_hdr(skb)->saddr,
&start[soffset + 3], 4) == 0)
doffset -= 4;
}
if (doffset > 3) {
__be32 daddr = fib_compute_spec_dst(skb);
memcpy(&start[doffset-1], &daddr, 4);
dopt->faddr = faddr;
dptr[0] = start[0];
dptr[1] = doffset+3;
dptr[2] = 4;
dptr += doffset+3;
dopt->srr = dopt->optlen + sizeof(struct iphdr);
dopt->optlen += doffset+3;
dopt->is_strictroute = sopt->is_strictroute;
}
}
if (sopt->cipso) {
optlen = sptr[sopt->cipso+1];
dopt->cipso = dopt->optlen+sizeof(struct iphdr);
memcpy(dptr, sptr+sopt->cipso, optlen);
dptr += optlen;
dopt->optlen += optlen;
}
while (dopt->optlen & 3) {
*dptr++ = IPOPT_END;
dopt->optlen++;
}
return 0;
}
/*
* Options "fragmenting", just fill options not
* allowed in fragments with NOOPs.
* Simple and stupid 8), but the most efficient way.
*/
void ip_options_fragment(struct sk_buff *skb)
{
unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
struct ip_options *opt = &(IPCB(skb)->opt);
int l = opt->optlen;
int optlen;
while (l > 0) {
switch (*optptr) {
case IPOPT_END:
return;
case IPOPT_NOOP:
l--;
optptr++;
continue;
}
optlen = optptr[1];
if (optlen < 2 || optlen > l)
return;
if (!IPOPT_COPIED(*optptr))
memset(optptr, IPOPT_NOOP, optlen);
l -= optlen;
optptr += optlen;
}
opt->ts = 0;
opt->rr = 0;
opt->rr_needaddr = 0;
opt->ts_needaddr = 0;
opt->ts_needtime = 0;
}
/* helper used by ip_options_compile() to call fib_compute_spec_dst()
* at most one time.
*/
static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
{
if (*spec_dst == htonl(INADDR_ANY))
*spec_dst = fib_compute_spec_dst(skb);
}
/*
* Verify options and fill pointers in struct options.
* Caller should clear *opt, and set opt->data.
* If opt == NULL, then skb->data should point to IP header.
*/
int ip_options_compile(struct net *net,
struct ip_options *opt, struct sk_buff *skb)
{
__be32 spec_dst = htonl(INADDR_ANY);
unsigned char *pp_ptr = NULL;
struct rtable *rt = NULL;
unsigned char *optptr;
unsigned char *iph;
int optlen, l;
if (skb) {
rt = skb_rtable(skb);
optptr = (unsigned char *)&(ip_hdr(skb)[1]);
} else
optptr = opt->__data;
iph = optptr - sizeof(struct iphdr);
for (l = opt->optlen; l > 0; ) {
switch (*optptr) {
case IPOPT_END:
for (optptr++, l--; l > 0; optptr++, l--) {
if (*optptr != IPOPT_END) {
*optptr = IPOPT_END;
opt->is_changed = 1;
}
}
goto eol;
case IPOPT_NOOP:
l--;
optptr++;
continue;
}
ipv4: fix buffer overflow in ip_options_compile() There is a benign buffer overflow in ip_options_compile spotted by AddressSanitizer[1] : Its benign because we always can access one extra byte in skb->head (because header is followed by struct skb_shared_info), and in this case this byte is not even used. [28504.910798] ================================================================== [28504.912046] AddressSanitizer: heap-buffer-overflow in ip_options_compile [28504.913170] Read of size 1 by thread T15843: [28504.914026] [<ffffffff81802f91>] ip_options_compile+0x121/0x9c0 [28504.915394] [<ffffffff81804a0d>] ip_options_get_from_user+0xad/0x120 [28504.916843] [<ffffffff8180dedf>] do_ip_setsockopt.isra.15+0x8df/0x1630 [28504.918175] [<ffffffff8180ec60>] ip_setsockopt+0x30/0xa0 [28504.919490] [<ffffffff8181e59b>] tcp_setsockopt+0x5b/0x90 [28504.920835] [<ffffffff8177462f>] sock_common_setsockopt+0x5f/0x70 [28504.922208] [<ffffffff817729c2>] SyS_setsockopt+0xa2/0x140 [28504.923459] [<ffffffff818cfb69>] system_call_fastpath+0x16/0x1b [28504.924722] [28504.925106] Allocated by thread T15843: [28504.925815] [<ffffffff81804995>] ip_options_get_from_user+0x35/0x120 [28504.926884] [<ffffffff8180dedf>] do_ip_setsockopt.isra.15+0x8df/0x1630 [28504.927975] [<ffffffff8180ec60>] ip_setsockopt+0x30/0xa0 [28504.929175] [<ffffffff8181e59b>] tcp_setsockopt+0x5b/0x90 [28504.930400] [<ffffffff8177462f>] sock_common_setsockopt+0x5f/0x70 [28504.931677] [<ffffffff817729c2>] SyS_setsockopt+0xa2/0x140 [28504.932851] [<ffffffff818cfb69>] system_call_fastpath+0x16/0x1b [28504.934018] [28504.934377] The buggy address ffff880026382828 is located 0 bytes to the right [28504.934377] of 40-byte region [ffff880026382800, ffff880026382828) [28504.937144] [28504.937474] Memory state around the buggy address: [28504.938430] ffff880026382300: ........ rrrrrrrr rrrrrrrr rrrrrrrr [28504.939884] ffff880026382400: ffffffff rrrrrrrr rrrrrrrr rrrrrrrr [28504.941294] ffff880026382500: .....rrr rrrrrrrr rrrrrrrr rrrrrrrr [28504.942504] ffff880026382600: ffffffff rrrrrrrr rrrrrrrr rrrrrrrr [28504.943483] ffff880026382700: ffffffff rrrrrrrr rrrrrrrr rrrrrrrr [28504.944511] >ffff880026382800: .....rrr rrrrrrrr rrrrrrrr rrrrrrrr [28504.945573] ^ [28504.946277] ffff880026382900: ffffffff rrrrrrrr rrrrrrrr rrrrrrrr [28505.094949] ffff880026382a00: ffffffff rrrrrrrr rrrrrrrr rrrrrrrr [28505.096114] ffff880026382b00: ffffffff rrrrrrrr rrrrrrrr rrrrrrrr [28505.097116] ffff880026382c00: ffffffff rrrrrrrr rrrrrrrr rrrrrrrr [28505.098472] ffff880026382d00: ffffffff rrrrrrrr rrrrrrrr rrrrrrrr [28505.099804] Legend: [28505.100269] f - 8 freed bytes [28505.100884] r - 8 redzone bytes [28505.101649] . - 8 allocated bytes [28505.102406] x=1..7 - x allocated bytes + (8-x) redzone bytes [28505.103637] ================================================================== [1] https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-21 13:17:42 +08:00
if (unlikely(l < 2)) {
pp_ptr = optptr;
goto error;
}
optlen = optptr[1];
if (optlen < 2 || optlen > l) {
pp_ptr = optptr;
goto error;
}
switch (*optptr) {
case IPOPT_SSRR:
case IPOPT_LSRR:
if (optlen < 3) {
pp_ptr = optptr + 1;
goto error;
}
if (optptr[2] < 4) {
pp_ptr = optptr + 2;
goto error;
}
/* NB: cf RFC-1812 5.2.4.1 */
if (opt->srr) {
pp_ptr = optptr;
goto error;
}
if (!skb) {
if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
pp_ptr = optptr + 1;
goto error;
}
memcpy(&opt->faddr, &optptr[3], 4);
if (optlen > 7)
memmove(&optptr[3], &optptr[7], optlen-7);
}
opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
opt->srr = optptr - iph;
break;
case IPOPT_RR:
if (opt->rr) {
pp_ptr = optptr;
goto error;
}
if (optlen < 3) {
pp_ptr = optptr + 1;
goto error;
}
if (optptr[2] < 4) {
pp_ptr = optptr + 2;
goto error;
}
if (optptr[2] <= optlen) {
if (optptr[2]+3 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
if (rt) {
spec_dst_fill(&spec_dst, skb);
memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
opt->is_changed = 1;
}
optptr[2] += 4;
opt->rr_needaddr = 1;
}
opt->rr = optptr - iph;
break;
case IPOPT_TIMESTAMP:
if (opt->ts) {
pp_ptr = optptr;
goto error;
}
if (optlen < 4) {
pp_ptr = optptr + 1;
goto error;
}
if (optptr[2] < 5) {
pp_ptr = optptr + 2;
goto error;
}
if (optptr[2] <= optlen) {
unsigned char *timeptr = NULL;
if (optptr[2]+3 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
switch (optptr[3]&0xF) {
case IPOPT_TS_TSONLY:
if (skb)
timeptr = &optptr[optptr[2]-1];
opt->ts_needtime = 1;
optptr[2] += 4;
break;
case IPOPT_TS_TSANDADDR:
if (optptr[2]+7 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
if (rt) {
spec_dst_fill(&spec_dst, skb);
memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
timeptr = &optptr[optptr[2]+3];
}
opt->ts_needaddr = 1;
opt->ts_needtime = 1;
optptr[2] += 8;
break;
case IPOPT_TS_PRESPEC:
if (optptr[2]+7 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
{
__be32 addr;
memcpy(&addr, &optptr[optptr[2]-1], 4);
if (inet_addr_type(net, addr) == RTN_UNICAST)
break;
if (skb)
timeptr = &optptr[optptr[2]+3];
}
opt->ts_needtime = 1;
optptr[2] += 8;
break;
default:
net: Allow userns root to control ipv4 Allow an unpriviled user who has created a user namespace, and then created a network namespace to effectively use the new network namespace, by reducing capable(CAP_NET_ADMIN) and capable(CAP_NET_RAW) calls to be ns_capable(net->user_ns, CAP_NET_ADMIN), or capable(net->user_ns, CAP_NET_RAW) calls. Settings that merely control a single network device are allowed. Either the network device is a logical network device where restrictions make no difference or the network device is hardware NIC that has been explicity moved from the initial network namespace. In general policy and network stack state changes are allowed while resource control is left unchanged. Allow creating raw sockets. Allow the SIOCSARP ioctl to control the arp cache. Allow the SIOCSIFFLAG ioctl to allow setting network device flags. Allow the SIOCSIFADDR ioctl to allow setting a netdevice ipv4 address. Allow the SIOCSIFBRDADDR ioctl to allow setting a netdevice ipv4 broadcast address. Allow the SIOCSIFDSTADDR ioctl to allow setting a netdevice ipv4 destination address. Allow the SIOCSIFNETMASK ioctl to allow setting a netdevice ipv4 netmask. Allow the SIOCADDRT and SIOCDELRT ioctls to allow adding and deleting ipv4 routes. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting gre tunnels. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting ipip tunnels. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting ipsec virtual tunnel interfaces. Allow setting the MRT_INIT, MRT_DONE, MRT_ADD_VIF, MRT_DEL_VIF, MRT_ADD_MFC, MRT_DEL_MFC, MRT_ASSERT, MRT_PIM, MRT_TABLE socket options on multicast routing sockets. Allow setting and receiving IPOPT_CIPSO, IP_OPT_SEC, IP_OPT_SID and arbitrary ip options. Allow setting IP_SEC_POLICY/IP_XFRM_POLICY ipv4 socket option. Allow setting the IP_TRANSPARENT ipv4 socket option. Allow setting the TCP_REPAIR socket option. Allow setting the TCP_CONGESTION socket option. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-16 11:03:05 +08:00
if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
pp_ptr = optptr + 3;
goto error;
}
break;
}
if (timeptr) {
struct timespec tv;
u32 midtime;
getnstimeofday(&tv);
midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
put_unaligned_be32(midtime, timeptr);
opt->is_changed = 1;
}
} else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
unsigned int overflow = optptr[3]>>4;
if (overflow == 15) {
pp_ptr = optptr + 3;
goto error;
}
if (skb) {
optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
opt->is_changed = 1;
}
}
opt->ts = optptr - iph;
break;
case IPOPT_RA:
if (optlen < 4) {
pp_ptr = optptr + 1;
goto error;
}
if (optptr[2] == 0 && optptr[3] == 0)
opt->router_alert = optptr - iph;
break;
case IPOPT_CIPSO:
net: Allow userns root to control ipv4 Allow an unpriviled user who has created a user namespace, and then created a network namespace to effectively use the new network namespace, by reducing capable(CAP_NET_ADMIN) and capable(CAP_NET_RAW) calls to be ns_capable(net->user_ns, CAP_NET_ADMIN), or capable(net->user_ns, CAP_NET_RAW) calls. Settings that merely control a single network device are allowed. Either the network device is a logical network device where restrictions make no difference or the network device is hardware NIC that has been explicity moved from the initial network namespace. In general policy and network stack state changes are allowed while resource control is left unchanged. Allow creating raw sockets. Allow the SIOCSARP ioctl to control the arp cache. Allow the SIOCSIFFLAG ioctl to allow setting network device flags. Allow the SIOCSIFADDR ioctl to allow setting a netdevice ipv4 address. Allow the SIOCSIFBRDADDR ioctl to allow setting a netdevice ipv4 broadcast address. Allow the SIOCSIFDSTADDR ioctl to allow setting a netdevice ipv4 destination address. Allow the SIOCSIFNETMASK ioctl to allow setting a netdevice ipv4 netmask. Allow the SIOCADDRT and SIOCDELRT ioctls to allow adding and deleting ipv4 routes. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting gre tunnels. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting ipip tunnels. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting ipsec virtual tunnel interfaces. Allow setting the MRT_INIT, MRT_DONE, MRT_ADD_VIF, MRT_DEL_VIF, MRT_ADD_MFC, MRT_DEL_MFC, MRT_ASSERT, MRT_PIM, MRT_TABLE socket options on multicast routing sockets. Allow setting and receiving IPOPT_CIPSO, IP_OPT_SEC, IP_OPT_SID and arbitrary ip options. Allow setting IP_SEC_POLICY/IP_XFRM_POLICY ipv4 socket option. Allow setting the IP_TRANSPARENT ipv4 socket option. Allow setting the TCP_REPAIR socket option. Allow setting the TCP_CONGESTION socket option. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-16 11:03:05 +08:00
if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {
pp_ptr = optptr;
goto error;
}
opt->cipso = optptr - iph;
if (cipso_v4_validate(skb, &optptr)) {
pp_ptr = optptr;
goto error;
}
break;
case IPOPT_SEC:
case IPOPT_SID:
default:
net: Allow userns root to control ipv4 Allow an unpriviled user who has created a user namespace, and then created a network namespace to effectively use the new network namespace, by reducing capable(CAP_NET_ADMIN) and capable(CAP_NET_RAW) calls to be ns_capable(net->user_ns, CAP_NET_ADMIN), or capable(net->user_ns, CAP_NET_RAW) calls. Settings that merely control a single network device are allowed. Either the network device is a logical network device where restrictions make no difference or the network device is hardware NIC that has been explicity moved from the initial network namespace. In general policy and network stack state changes are allowed while resource control is left unchanged. Allow creating raw sockets. Allow the SIOCSARP ioctl to control the arp cache. Allow the SIOCSIFFLAG ioctl to allow setting network device flags. Allow the SIOCSIFADDR ioctl to allow setting a netdevice ipv4 address. Allow the SIOCSIFBRDADDR ioctl to allow setting a netdevice ipv4 broadcast address. Allow the SIOCSIFDSTADDR ioctl to allow setting a netdevice ipv4 destination address. Allow the SIOCSIFNETMASK ioctl to allow setting a netdevice ipv4 netmask. Allow the SIOCADDRT and SIOCDELRT ioctls to allow adding and deleting ipv4 routes. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting gre tunnels. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting ipip tunnels. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting ipsec virtual tunnel interfaces. Allow setting the MRT_INIT, MRT_DONE, MRT_ADD_VIF, MRT_DEL_VIF, MRT_ADD_MFC, MRT_DEL_MFC, MRT_ASSERT, MRT_PIM, MRT_TABLE socket options on multicast routing sockets. Allow setting and receiving IPOPT_CIPSO, IP_OPT_SEC, IP_OPT_SID and arbitrary ip options. Allow setting IP_SEC_POLICY/IP_XFRM_POLICY ipv4 socket option. Allow setting the IP_TRANSPARENT ipv4 socket option. Allow setting the TCP_REPAIR socket option. Allow setting the TCP_CONGESTION socket option. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-16 11:03:05 +08:00
if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
pp_ptr = optptr;
goto error;
}
break;
}
l -= optlen;
optptr += optlen;
}
eol:
if (!pp_ptr)
return 0;
error:
if (skb) {
icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
}
return -EINVAL;
}
EXPORT_SYMBOL(ip_options_compile);
/*
* Undo all the changes done by ip_options_compile().
*/
void ip_options_undo(struct ip_options *opt)
{
if (opt->srr) {
unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
memmove(optptr+7, optptr+3, optptr[1]-7);
memcpy(optptr+3, &opt->faddr, 4);
}
if (opt->rr_needaddr) {
unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
optptr[2] -= 4;
memset(&optptr[optptr[2]-1], 0, 4);
}
if (opt->ts) {
unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
if (opt->ts_needtime) {
optptr[2] -= 4;
memset(&optptr[optptr[2]-1], 0, 4);
if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
optptr[2] -= 4;
}
if (opt->ts_needaddr) {
optptr[2] -= 4;
memset(&optptr[optptr[2]-1], 0, 4);
}
}
}
static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
{
return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
GFP_KERNEL);
}
static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
struct ip_options_rcu *opt, int optlen)
{
while (optlen & 3)
opt->opt.__data[optlen++] = IPOPT_END;
opt->opt.optlen = optlen;
if (optlen && ip_options_compile(net, &opt->opt, NULL)) {
kfree(opt);
return -EINVAL;
}
kfree(*optp);
*optp = opt;
return 0;
}
int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
unsigned char __user *data, int optlen)
{
struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
if (!opt)
return -ENOMEM;
if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {
kfree(opt);
return -EFAULT;
}
return ip_options_get_finish(net, optp, opt, optlen);
}
int ip_options_get(struct net *net, struct ip_options_rcu **optp,
unsigned char *data, int optlen)
{
struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
if (!opt)
return -ENOMEM;
if (optlen)
memcpy(opt->opt.__data, data, optlen);
return ip_options_get_finish(net, optp, opt, optlen);
}
void ip_forward_options(struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
unsigned char *optptr;
struct rtable *rt = skb_rtable(skb);
unsigned char *raw = skb_network_header(skb);
if (opt->rr_needaddr) {
optptr = (unsigned char *)raw + opt->rr;
ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);
opt->is_changed = 1;
}
if (opt->srr_is_hit) {
int srrptr, srrspace;
optptr = raw + opt->srr;
for ( srrptr = optptr[2], srrspace = optptr[1];
srrptr <= srrspace;
srrptr += 4
) {
if (srrptr + 3 > srrspace)
break;
if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0)
break;
}
if (srrptr + 3 <= srrspace) {
opt->is_changed = 1;
ip_hdr(skb)->daddr = opt->nexthop;
ip_rt_get_source(&optptr[srrptr-1], skb, rt);
optptr[2] = srrptr+4;
} else {
net_crit_ratelimited("%s(): Argh! Destination lost!\n",
__func__);
}
if (opt->ts_needaddr) {
optptr = raw + opt->ts;
ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
opt->is_changed = 1;
}
}
if (opt->is_changed) {
opt->is_changed = 0;
ip_send_check(ip_hdr(skb));
}
}
int ip_options_rcv_srr(struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
int srrspace, srrptr;
__be32 nexthop;
struct iphdr *iph = ip_hdr(skb);
unsigned char *optptr = skb_network_header(skb) + opt->srr;
struct rtable *rt = skb_rtable(skb);
struct rtable *rt2;
unsigned long orefdst;
int err;
if (!rt)
return 0;
if (skb->pkt_type != PACKET_HOST)
return -EINVAL;
if (rt->rt_type == RTN_UNICAST) {
if (!opt->is_strictroute)
return 0;
icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
return -EINVAL;
}
if (rt->rt_type != RTN_LOCAL)
return -EINVAL;
for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
if (srrptr + 3 > srrspace) {
icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
return -EINVAL;
}
memcpy(&nexthop, &optptr[srrptr-1], 4);
orefdst = skb->_skb_refdst;
skb_dst_set(skb, NULL);
err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
rt2 = skb_rtable(skb);
if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
skb_dst_drop(skb);
skb->_skb_refdst = orefdst;
return -EINVAL;
}
refdst_drop(orefdst);
if (rt2->rt_type != RTN_LOCAL)
break;
/* Superfast 8) loopback forward */
iph->daddr = nexthop;
opt->is_changed = 1;
}
if (srrptr <= srrspace) {
opt->srr_is_hit = 1;
opt->nexthop = nexthop;
opt->is_changed = 1;
}
return 0;
}
EXPORT_SYMBOL(ip_options_rcv_srr);