net: add a sysctl to reflect the fwmark on replies
Kernel-originated IP packets that have no user socket associated
with them (e.g., ICMP errors and echo replies, TCP RSTs, etc.)
are emitted with a mark of zero. Add a sysctl to make them have
the same mark as the packet they are replying to.
This allows an administrator that wishes to do so to use
mark-based routing, firewalling, etc. for these replies by
marking the original packets inbound.
Tested using user-mode linux:
- ICMP/ICMPv6 echo replies and errors.
- TCP RST packets (IPv4 and IPv6).
Change-Id: I95d896647b278d092ef331d1377b959da1deb042
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Git-commit: 3356997e1e1b2aa9959f046203e6d0b193bbd7f7
Git-repo: https://android.googlesource.com/kernel/common.git
[imaund@codeaurora.org: Resolve trivial merge conflicts.]
Signed-off-by: Ian Maund <imaund@codeaurora.org>
Signed-off-by: Samir Mehta <samirn@codeaurora.org>
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 2d4a6db..4af5946 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -22,6 +22,13 @@
min_pmtu - INTEGER
default 552 - minimum discovered Path MTU
+fwmark_reflect - BOOLEAN
+ Controls the fwmark of kernel-generated IPv4 reply packets that are not
+ associated with a socket for example, TCP RSTs or ICMP echo replies).
+ If unset, these packets have a fwmark of zero. If set, they have the
+ fwmark of the packet they are replying to.
+ Default: 0
+
route/max_size - INTEGER
Maximum number of routes allowed in the kernel. Increase
this when using large numbers of interfaces and/or routes.
@@ -1048,6 +1055,13 @@
2 NDP packets are sent to userspace, where a userspace proxy
can be implemented
+fwmark_reflect - BOOLEAN
+ Controls the fwmark of kernel-generated IPv6 reply packets that are not
+ associated with a socket for example, TCP RSTs or ICMPv6 echo replies).
+ If unset, these packets have a fwmark of zero. If set, they have the
+ fwmark of the packet they are replying to.
+ Default: 0
+
conf/interface/*:
Change special settings per interface.
diff --git a/include/net/ip.h b/include/net/ip.h
index b53d65f..f6c3f02 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -237,6 +237,9 @@
extern void ip_static_sysctl_init(void);
+#define IP4_REPLY_MARK(net, mark) \
+ ((net)->ipv4.sysctl_fwmark_reflect ? (mark) : 0)
+
static inline bool ip_is_fragment(const struct iphdr *iph)
{
return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index a5a9e4d..2161a39 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -109,6 +109,9 @@
#define IP6_MF 0x0001
+#define IP6_REPLY_MARK(net, mark) \
+ ((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0)
+
#include <net/sock.h>
/* sysctls */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index bbd023a..446db34 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -56,6 +56,7 @@
unsigned int sysctl_ping_group_range[2];
long sysctl_tcp_mem[3];
+ int sysctl_fwmark_reflect;
atomic_t rt_genid;
atomic_t dev_addr_genid;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 81abfcb..20b76ab 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -25,6 +25,7 @@
int ip6_rt_mtu_expires;
int ip6_rt_min_advmss;
int icmpv6_time;
+ int fwmark_reflect;
};
struct netns_ipv6 {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2e109ff..ea63a57 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -335,6 +335,7 @@
struct sock *sk;
struct inet_sock *inet;
__be32 daddr;
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
return;
@@ -347,6 +348,7 @@
icmp_param->data.icmph.checksum = 0;
inet->tos = ip_hdr(skb)->tos;
+ sk->sk_mark = mark;
daddr = ipc.addr = ip_hdr(skb)->saddr;
ipc.opt = NULL;
ipc.tx_flags = 0;
@@ -358,6 +360,7 @@
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
fl4.saddr = rt->rt_spec_dst;
+ fl4.flowi4_mark = mark;
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -376,7 +379,7 @@
struct flowi4 *fl4,
struct sk_buff *skb_in,
const struct iphdr *iph,
- __be32 saddr, u8 tos,
+ __be32 saddr, u8 tos, u32 mark,
int type, int code,
struct icmp_bxm *param)
{
@@ -388,6 +391,7 @@
fl4->daddr = (param->replyopts.opt.opt.srr ?
param->replyopts.opt.opt.faddr : iph->saddr);
fl4->saddr = saddr;
+ fl4->flowi4_mark = mark;
fl4->flowi4_tos = RT_TOS(tos);
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
@@ -485,6 +489,7 @@
struct flowi4 fl4;
__be32 saddr;
u8 tos;
+ u32 mark;
struct net *net;
struct sock *sk;
@@ -581,6 +586,7 @@
tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
IPTOS_PREC_INTERNETCONTROL) :
iph->tos;
+ mark = IP4_REPLY_MARK(net, skb_in->mark);
if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
goto out_unlock;
@@ -597,11 +603,12 @@
icmp_param.skb = skb_in;
icmp_param.offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
+ sk->sk_mark = mark;
ipc.addr = iph->saddr;
ipc.opt = &icmp_param.replyopts.opt;
ipc.tx_flags = 0;
- rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
+ rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
type, code, &icmp_param);
if (IS_ERR(rt))
goto out_unlock;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4910176..859ac52 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1500,7 +1500,8 @@
daddr = replyopts.opt.opt.faddr;
}
- flowi4_init_output(&fl4, arg->bound_dev_if, 0,
+ flowi4_init_output(&fl4, arg->bound_dev_if,
+ IP4_REPLY_MARK(sock_net(sk), skb->mark),
RT_TOS(arg->tos),
RT_SCOPE_UNIVERSE, sk->sk_protocol,
ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3171f7f..f4ae36f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -788,6 +788,13 @@
.mode = 0644,
.proc_handler = ipv4_tcp_mem,
},
+ {
+ .procname = "fwmark_reflect",
+ .data = &init_net.ipv4.sysctl_fwmark_reflect,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index ba0c147..b595aa6 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -396,6 +396,7 @@
int len;
int hlimit;
int err = 0;
+ u32 mark = IP6_REPLY_MARK(net, skb->mark);
if ((u8 *)hdr < skb->head ||
(skb->network_header + sizeof(*hdr)) > skb->tail)
@@ -461,6 +462,7 @@
fl6.daddr = hdr->saddr;
if (saddr)
fl6.saddr = *saddr;
+ fl6.flowi6_mark = mark;
fl6.flowi6_oif = iif;
fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code;
@@ -469,6 +471,7 @@
sk = icmpv6_xmit_lock(net);
if (sk == NULL)
return;
+ sk->sk_mark = mark;
np = inet6_sk(sk);
if (!icmpv6_xrlim_allow(sk, type, &fl6))
@@ -543,6 +546,7 @@
struct dst_entry *dst;
int err = 0;
int hlimit;
+ u32 mark = IP6_REPLY_MARK(net, skb->mark);
saddr = &ipv6_hdr(skb)->daddr;
@@ -559,11 +563,13 @@
fl6.saddr = *saddr;
fl6.flowi6_oif = skb->dev->ifindex;
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
+ fl6.flowi6_mark = mark;
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
if (sk == NULL)
return;
+ sk->sk_mark = mark;
np = inet6_sk(sk);
if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 166a57c..1f872c3 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -48,6 +48,13 @@
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "fwmark_reflect",
+ .data = &init_net.ipv6.sysctl.fwmark_reflect,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{ }
};
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 98256cf..035812b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -897,6 +897,7 @@
fl6.flowi6_proto = IPPROTO_TCP;
fl6.flowi6_oif = inet6_iif(skb);
+ fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));