blob: a5bd0b4acc614d020a9bff3a2e53dbc846c37ff5 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Joe Perchesafd465032012-03-12 07:03:32 +000065#define pr_fmt(fmt) "IPv4: " fmt
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070069#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070072#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070073#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070083#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070085#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090094#include <linux/slab.h>
Stephen Rothwellb9eda062011-12-22 17:03:29 +110095#include <linux/prefetch.h>
Herbert Xu352e5122007-11-13 21:34:06 -080096#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020097#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070098#include <net/protocol.h>
99#include <net/ip.h>
100#include <net/route.h>
101#include <net/inetpeer.h>
102#include <net/sock.h>
103#include <net/ip_fib.h>
104#include <net/arp.h>
105#include <net/tcp.h>
106#include <net/icmp.h>
107#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700108#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700109#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h>
Shan Wei7426a562012-04-18 18:05:46 +0000112#include <linux/kmemleak.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700113#endif
David S. Miller6e5714e2011-08-03 20:50:44 -0700114#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115
/* Strip a flow's TOS down to the bits that matter for route lookup,
 * keeping the RTO_ONLINK flag that callers may have OR'ed in.
 */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Upper bound for an IPv4 route MTU (just under the 16-bit field limit). */
#define IP_MAX_MTU	0xFFF0

/* Default expiry for unreferenced cache entries. */
#define RT_GC_TIMEOUT (300*HZ)

/* Routing-cache tuning knobs.  These are the compiled-in defaults;
 * presumably exposed through sysctl elsewhere in this file (not visible
 * in this chunk) — confirm before relying on that.
 */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
/* Longest hash chain tolerated before the cache is considered degenerate. */
static int rt_chain_length_max __read_mostly	= 20;

/* Deferred work that expires stale cache entries; the timestamp tracks
 * how far the scan has progressed.
 */
static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 * Interface to generic destination cache.
 * Forward declarations for the dst_ops callbacks installed in
 * ipv4_dst_ops below; definitions live later in this file.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700156
/* dst_ops->ifdown hook: nothing to tear down per-entry for IPv4,
 * so this is deliberately a no-op.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700161
/* dst_ops->cow_metrics hook: should never be reached for IPv4 routes;
 * warn loudly and refuse to hand out a writable metrics array.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
167
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

/* The dst_ops vtable wiring the generic destination cache to the IPv4
 * implementations above/below.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
189
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Map the 4 TOS bits of an IPv4 header to a traffic-control priority.
 * Entries alternate plain class / ECN_OR_COST(class) because the
 * low-order bit of the index is the "minimize monetary cost" TOS bit.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
212/*
213 * Route cache.
214 */
215
216/* The locking scheme is rather straight forward:
217 *
218 * 1) Read-Copy Update protects the buckets of the central route hash.
219 * 2) Only writers remove entries, and they hold the lock
220 * as they look at rtable reference counts.
221 * 3) Only readers acquire references to rtable entries,
222 * they do so with atomic increments and with the
223 * lock held.
224 */
225
/* One bucket of the central route-cache hash: an RCU-protected singly
 * linked chain of rtable entries (linked through dst.rt_next).
 */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
Neil Horman1080d702008-10-27 12:28:25 -0700229
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Map a hash-bucket index onto its (shared) spinlock; RT_HASH_LOCK_SZ is
 * a power of two, so the mask is a cheap modulo.
 */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialize the lock table at boot; allocation failure
 * here is fatal (panic) since the route cache cannot operate without it.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP without lock debugging: writers need no bucket locks at all. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275
/* The central route-cache hash table, its index mask (size - 1) and
 * log2 of its size; all set once at init, hence __read_mostly.
 */
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned int		rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

/* Per-CPU cache statistics, bumped locklessly on the owning CPU. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700282
/* Hash (daddr, saddr, ifindex) plus the per-netns generation id into a
 * bucket index.  Including the genid makes a cache flush equivalent to
 * rehashing everything.
 */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
290
/* Current route-cache generation for a namespace; entries stamped with
 * an older generation are considered flushed.
 */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
295
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296#ifdef CONFIG_PROC_FS
/* Iterator state for the /proc/net/rt_cache seq_file walk: current
 * bucket index and the generation id captured at ->start time.
 */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
302
/* Find the first live entry for this netns/generation, scanning buckets
 * from the top down.  On success we return with rcu_read_lock_bh() held
 * (released later by rt_cache_seq_stop); empty buckets are skipped
 * without taking the lock at all.
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;	/* BH-RCU still held */
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;	/* NULL: nothing matched, lock not held */
}
323
/* Advance to the next cache entry after @r, crossing bucket boundaries.
 * Called with rcu_read_lock_bh() held; the lock is dropped while probing
 * for the next non-empty bucket and re-taken before dereferencing it.
 * Returns NULL (lock released) once all buckets are exhausted.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
341
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900342static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800343 struct rtable *r)
344{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900345 struct rt_cache_iter_state *st = seq->private;
346 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700347 if (dev_net(r->dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800348 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800349 if (r->rt_genid == st->genid)
350 break;
351 }
352 return r;
353}
354
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900355static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700356{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900357 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700358
359 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900360 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700361 --pos;
362 return pos ? NULL : r;
363}
364
/* seq_file ->start: emit a header token on a fresh read (*pos == 0) and
 * capture the generation id then; on a resumed read (*pos != 0) seek to
 * the entry before *pos, reusing the genid captured earlier.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
373
374static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
375{
Eric Dumazet29e75252008-01-31 17:05:09 -0800376 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700377
378 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900379 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900381 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700382 ++*pos;
383 return r;
384}
385
/* seq_file ->stop: drop the BH-RCU read lock that rt_cache_get_first
 * left held — but only if iteration actually reached a real entry.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
391
/* seq_file ->show: print the column header for the start token, else one
 * route-cache entry in the legacy /proc/net/rt_cache format.  %n records
 * the printed width so the line can be space-padded to 127 columns.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		/* Several historic columns (Metric, IRTT, HHRef, ...) are
		 * emitted as constants for ABI compatibility.
		 */
		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW), 0,
			   r->rt_key_tos,
			   -1, 0, 0, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
419
/* seq_file callbacks for /proc/net/rt_cache. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
426
/* open() handler: per-netns seq_file with rt_cache_iter_state private data. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
432
/* file_operations for /proc/net/rt_cache. */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
440
441
/* seq_file ->start for the per-CPU statistics file: header token first,
 * then one record per possible CPU.  *pos encodes (cpu index + 1) so a
 * resumed read picks up at the next possible CPU.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
457
/* seq_file ->next: advance to the next possible CPU's statistics record,
 * or NULL once every CPU has been shown.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}
471
/* seq_file ->stop: no locks taken during the per-CPU walk, nothing to do. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
476
/* seq_file ->show: column header for the start token, else one CPU's
 * route-cache counters.  The first field is the global dst entry count,
 * repeated on every line for historical format compatibility.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
510
/* seq_file callbacks for the per-CPU statistics file. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
517
518
/* open() handler for the per-CPU statistics file (no private state). */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
523
/* file_operations for the per-CPU route-cache statistics file. */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
531
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Sum the per-CPU routing-classid accounting counters into a temporary
 * 256-entry table and dump it as raw binary via seq_write.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

/* Single-shot open: whole table rendered in one ->show call. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800570
/* Per-netns setup of the /proc entries: net/rt_cache, net/stat/rt_cache
 * and (with classid support) net/rt_acct.  Uses goto-based unwind so a
 * failure removes exactly the entries created so far.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800601
/* Per-netns teardown: mirror of ip_rt_do_proc_init. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
610
/* Register the per-netns /proc hooks above. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* No procfs: provide a successful no-op so callers need no #ifdefs. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900627
/* Free a route after a BH-RCU grace period (readers may still hold it). */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
632
/* Like rt_free() but also drops the caller's reference first. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
638
/* True when an entry may be evicted aggressively: input-path
 * broadcast/multicast routes that are not the last entry on their chain.
 */
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
646
/* True when an entry is worth keeping: carries redirect/notify flags or
 * has an expiry timestamp attached.
 */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}
652
653static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
654{
655 unsigned long age;
656 int ret = 0;
657
Changli Gaod8d1f302010-06-10 23:31:35 -0700658 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659 goto out;
660
Changli Gaod8d1f302010-06-10 23:31:35 -0700661 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700662 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
663 (age <= tmo2 && rt_valuable(rth)))
664 goto out;
665 ret = 1;
666out: return ret;
667}
668
669/* Bits of score are:
670 * 31: very valuable
671 * 30: not quite useless
672 * 29..0: usage counter
673 */
674static inline u32 rt_score(struct rtable *rt)
675{
Changli Gaod8d1f302010-06-10 23:31:35 -0700676 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700677
678 score = ~score & ~(3<<30);
679
680 if (rt_valuable(rt))
681 score |= (1<<31);
682
David S. Millerc7537962010-11-11 17:07:48 -0800683 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700684 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
685 score |= (1<<30);
686
687 return score;
688}
689
/* True while route caching is enabled for this namespace, i.e. the cache
 * has not been forced into rebuild mode too many times.
 */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
695
/* Compare only the fields that feed rt_hash() (dst, src, input iface).
 * Implemented branch-free: OR the XOR of each pair and test for zero.
 */
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}
703
David S. Miller5e2b61f2011-03-04 21:47:09 -0800704static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700705{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800706 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
707 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
708 (rt1->rt_mark ^ rt2->rt_mark) |
David S. Miller475949d2011-05-03 19:45:15 -0700709 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
Julian Anastasovd547f722011-08-07 22:20:20 -0700710 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000711 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700712}
713
Denis V. Lunevb5921912008-01-22 23:50:25 -0800714static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
715{
Changli Gaod8d1f302010-06-10 23:31:35 -0700716 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800717}
718
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700719static inline int rt_is_expired(struct rtable *rth)
720{
Changli Gaod8d1f302010-06-10 23:31:35 -0700721 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700722}
723
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800724/*
725 * Perform a full scan of hash table and free all entries.
726 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary
728 */
/* Walk every hash bucket and free all cached routes, or only those
 * belonging to @net when it is non-NULL.  Entries are unlinked under
 * the per-bucket lock onto a private list, then freed after the lock
 * is dropped so rt_free() (RCU-deferred) runs outside the bucket lock.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		/* Long scan: yield the CPU when running in process context. */
		if (process_context && need_resched())
			cond_resched();
		/* Cheap unlocked peek: skip empty buckets entirely. */
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		/* Unlink matching entries onto @list, keeping the others
		 * chained via *pprev.
		 */
		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free the collected entries outside the bucket lock. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
774
Neil Horman1080d702008-10-27 12:28:25 -0700775/*
776 * While freeing expired entries, we compute average chain length
777 * and standard deviation, using fixed-point arithmetic.
778 * This to have an estimation of rt_chain_length_max
779 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
781 */
782
783#define FRACT_BITS 3
784#define ONE (1UL << FRACT_BITS)
785
Eric Dumazet98376382010-03-08 03:20:00 +0000786/*
787 * Given a hash chain and an item in this hash chain,
788 * find if a previous entry has the same hash_inputs
789 * (but differs on tos, mark or oif)
790 * Returns 0 if an alias is found.
791 * Returns ONE if rth has no alias before itself.
792 */
793static int has_noalias(const struct rtable *head, const struct rtable *rth)
794{
795 const struct rtable *aux = head;
796
797 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800798 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000799 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000800 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000801 }
802 return ONE;
803}
804
/* Scan a slice of the hash table (sized by the time elapsed since the
 * previous scan), freeing expired entries, and update the adaptive
 * rt_chain_length_max from the average and standard deviation of the
 * observed chain lengths (fixed point, FRACT_BITS fractional bits).
 * Runs from the rt_worker_func() delayed work.
 */
static void rt_check_expire(void)
{
	static unsigned int rover;	/* resume point between invocations */
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	/* Number of buckets to visit is proportional to the time since
	 * the last pass, scaled by table size over gc timeout.
	 */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		/* Unlocked peek; empty buckets still count as samples. */
		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth) ||
			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}

			/* We only count entries on a chain with equal
			 * hash inputs once so that entries for
			 * different QOS levels, and other non-hash
			 * input attributes don't unfairly skew the
			 * length computation
			 */
			tmo >>= 1;	/* deeper entries expire more eagerly */
			rthp = &rth->dst.rt_next;
			length += has_noalias(rt_hash_table[i].chain, rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* rt_chain_length_max = max(elasticity, avg + 4*sd) */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
873
874/*
875 * rt_worker_func() is run in process context.
876 * we call rt_check_expire() to scan part of the hash table
877 */
static void rt_worker_func(struct work_struct *work)
{
	/* Scan part of the hash table, then re-arm ourselves so the
	 * expiry scan keeps running every ip_rt_gc_interval jiffies.
	 */
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
883
Eric Dumazet29e75252008-01-31 17:05:09 -0800884/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300885 * Perturbation of rt_genid by a small quantity [1..256]
Eric Dumazet29e75252008-01-31 17:05:09 -0800886 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
887 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700889 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	/* Bump the namespace generation id by a random 1..256 so all
	 * existing entries fail the rt_is_expired() check and recent
	 * genid values are not reissued soon.
	 */
	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
897
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800898/*
Eric Dumazet29e75252008-01-31 17:05:09 -0800899 * delay < 0 : invalidate cache (fast : entries will be deleted later)
900 * delay >= 0 : invalidate & flush cache (can be long)
901 */
/* Invalidate the cache for @net; when delay >= 0 also walk the table
 * and free entries immediately (slow), otherwise leave them to be
 * reaped lazily by the expiry scan.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
908
Eric W. Biedermana5ee1552009-11-29 15:45:58 +0000909/* Flush previous cache invalidated entries from the cache */
/* Flush previously invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	/* Free entries for @net now; reschedules when in process context. */
	rt_do_flush(net, !in_softirq());
}
914
/* Chain length exceeded rt_chain_length_max: warn and invalidate the
 * whole cache for @net by bumping its generation id.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
920
Linus Torvalds1da177e2005-04-16 15:20:36 -0700921/*
922 Short description of GC goals.
923
   We want to build an algorithm which will keep the routing cache
925 at some equilibrium point, when number of aged off entries
926 is kept approximately equal to newly generated ones.
927
928 Current expiration strength is variable "expire".
929 We try to adjust it dynamically, so that if networking
930 is idle expires is large enough to keep enough of warm entries,
931 and when load increases it reduces to limit cache size.
932 */
933
/* dst_ops garbage collector for the IPv4 routing cache.  Tries to free
 * enough entries to reach a dynamically adjusted equilibrium; the
 * static "expire" threshold adapts between calls (halved on a missed
 * goal, grown again in work_done).  Returns 0 on success, 1 when the
 * cache is irreparably over ip_rt_max_size.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;		/* bucket scan resume point */
	static int equilibrium;		/* target number of cached entries */
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Keep it; halve tmo so entries deeper
					 * in the chain expire more easily.
					 */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	net_warn_ratelimited("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
1052
Eric Dumazet98376382010-03-08 03:20:00 +00001053/*
1054 * Returns number of entries in a hash chain that have different hash_inputs
1055 */
1056static int slow_chain_length(const struct rtable *head)
1057{
1058 int length = 0;
1059 const struct rtable *rth = head;
1060
1061 while (rth) {
1062 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001063 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001064 }
1065 return length >> FRACT_BITS;
1066}
1067
David S. Millerf894cbf2012-07-02 21:52:24 -07001068static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1069 struct sk_buff *skb,
1070 const void *daddr)
David Miller3769cff2011-07-11 22:44:24 +00001071{
David S. Millerd3aaeb32011-07-18 00:40:17 -07001072 struct net_device *dev = dst->dev;
1073 const __be32 *pkey = daddr;
David S. Miller39232972012-01-26 15:22:32 -05001074 const struct rtable *rt;
David Miller3769cff2011-07-11 22:44:24 +00001075 struct neighbour *n;
1076
David S. Miller39232972012-01-26 15:22:32 -05001077 rt = (const struct rtable *) dst;
David S. Millera263b302012-07-02 02:02:15 -07001078 if (rt->rt_gateway)
David S. Miller39232972012-01-26 15:22:32 -05001079 pkey = (const __be32 *) &rt->rt_gateway;
David S. Millerf894cbf2012-07-02 21:52:24 -07001080 else if (skb)
1081 pkey = &ip_hdr(skb)->daddr;
David S. Millerd3aaeb32011-07-18 00:40:17 -07001082
David S. Miller80703d22012-02-15 17:48:35 -05001083 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001084 if (n)
1085 return n;
David Miller32092ec2011-07-25 00:01:41 +00001086 return neigh_create(&arp_tbl, pkey, dev);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001087}
1088
/* Insert @rt into hash bucket @hash, or reuse an already cached entry
 * with identical keys.  Returns the route to use and, when @skb is
 * given, attaches it via skb_dst_set().  May rebuild the hash (genid
 * bump) and restart when a chain grows past rt_chain_length_max.
 * Readers traverse the chains locklessly under RCU, so all list
 * updates below use rcu_assign_pointer() in a publication-safe order.
 */
static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;	/* cand = eviction candidate */
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		/* Opportunistically drop stale entries while scanning. */
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Duplicate: drop the new entry, return the cached one. */
			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		/* Track the lowest-scoring unreferenced entry as an
		 * eviction candidate in case the chain is too long.
		 */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable entry but the chain is overlong: trigger an
		 * emergency rebuild (genid bump), rehash and retry.
		 */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1221
Linus Torvalds1da177e2005-04-16 15:20:36 -07001222/*
1223 * Peer allocation may fail only in serious out-of-memory conditions. However
1224 * we still can generate some output.
1225 * Random ID selection looks a bit dangerous because we have no chances to
1226 * select ID being unique in a reasonable period of time.
1227 * But broken packet identifier may be better than no packet at all.
1228 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;	/* rolling state shared by all callers */
	u32 salt;

	/* Fallback IP id: mix the previous state with the destination
	 * through secure_ip_id() under a spinlock, use the low 16 bits
	 * as the datagram id, and keep the full value as next state.
	 */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1241
1242void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1243{
David S. Miller1d861aa2012-07-10 03:58:16 -07001244 struct net *net = dev_net(dst->dev);
1245 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001246
David S. Miller1d861aa2012-07-10 03:58:16 -07001247 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1248 if (peer) {
1249 iph->id = htons(inet_getid(peer, more));
1250 inet_putpeer(peer);
1251 return;
1252 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001253
1254 ip_select_fb_ident(iph);
1255}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001256EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001257
/* Remove @rt from hash bucket @hash (releasing the caller's reference)
 * and drop any expired entries encountered in the same chain.
 */
static void rt_del(unsigned int hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1277
/* Fill @fl4 from the given header fields; when @sk is provided, its
 * bound device, mark, tos and protocol override the packet-derived
 * values.
 */
static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}
1296
1297static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
1298{
1299 const struct iphdr *iph = ip_hdr(skb);
1300 int oif = skb->dev->ifindex;
1301 u8 tos = RT_TOS(iph->tos);
1302 u8 prot = iph->protocol;
1303 u32 mark = skb->mark;
1304
1305 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
1306}
1307
/* Build a flow key purely from socket state.  The destination is the
 * connected peer, unless a source-routing IP option (srr) redirects it
 * to the first-hop address; inet_opt is read under RCU.
 */
static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}
1325
/* Build @fl4 from the packet when one is available, otherwise from the
 * socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
				 struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
1334
1335static DEFINE_SPINLOCK(fnhe_lock);
1336
/* Return the exception with the oldest fnhe_stamp in this bucket, for
 * reuse when the chain is full.  Precondition: the chain is non-empty
 * (the sole caller only reclaims when depth > FNHE_RECLAIM_DEPTH).
 * NOTE(review): @daddr is currently unused here.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	return oldest;
}
1349
1350static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
1351{
1352 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1353 struct fib_nh_exception *fnhe;
1354 int depth;
1355 u32 hval;
1356
1357 if (!hash) {
1358 hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
1359 GFP_ATOMIC);
1360 if (!hash)
1361 return NULL;
1362 }
1363
1364 hval = (__force u32) daddr;
1365 hval ^= (hval >> 11) ^ (hval >> 22);
1366 hash += hval;
1367
1368 depth = 0;
1369 for (fnhe = rcu_dereference(hash->chain); fnhe;
1370 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1371 if (fnhe->fnhe_daddr == daddr)
1372 goto out;
1373 depth++;
1374 }
1375
1376 if (depth > FNHE_RECLAIM_DEPTH) {
1377 fnhe = fnhe_oldest(hash + hval, daddr);
1378 goto out_daddr;
1379 }
1380 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
1381 if (!fnhe)
1382 return NULL;
1383
1384 fnhe->fnhe_next = hash->chain;
1385 rcu_assign_pointer(hash->chain, fnhe);
1386
1387out_daddr:
1388 fnhe->fnhe_daddr = daddr;
1389out:
1390 fnhe->fnhe_stamp = jiffies;
1391 return fnhe;
1392}
1393
/* Handle an ICMP redirect carried in @skb for cached route @rt.
 * After validating the advertised gateway, record it in a nexthop
 * exception for the FIB nexthop and rewrite rt->rt_gateway; bogus
 * redirects are optionally logged and ignored.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	/* Only the four redirect code values are acted upon. */
	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* The redirect must come from our current gateway for this route. */
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject no-op, administratively disabled, or non-unicast gateways. */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			/* Gateway not resolved yet: kick ARP resolution
			 * instead of switching to an unverified gateway.
			 */
			neigh_event_send(n, NULL);
		} else {
			/* Persist the new gateway as a nexthop exception
			 * so future routes for this destination use it.
			 */
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);
				struct fib_nh_exception *fnhe;

				spin_lock_bh(&fnhe_lock);
				fnhe = find_or_create_fnhe(nh, fl4->daddr);
				if (fnhe)
					fnhe->fnhe_gw = new_gw;
				spin_unlock_bh(&fnhe_lock);
			}
			rt->rt_gateway = new_gw;
			rt->rt_flags |= RTCF_REDIRECTED;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
1476
David S. Miller4895c772012-07-17 04:19:00 -07001477static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1478{
1479 struct rtable *rt;
1480 struct flowi4 fl4;
1481
1482 rt = (struct rtable *) dst;
1483
1484 ip_rt_build_flow_key(&fl4, sk, skb);
1485 __ip_do_redirect(rt, skb, &fl4);
1486}
1487
/* dst_ops negative_advice hook: called when a cached route looks unusable.
 * Drops obsolete entries, and evicts redirected or expiring entries from
 * the route cache hash.  Returns the (possibly unchanged) dst, or NULL if
 * the entry was released/removed.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			/* Already obsoleted: just drop our reference. */
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			/* Redirected or time-limited entry: unhash it so a
			 * fresh lookup rebuilds it from the FIB.
			 */
			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
1508
1509/*
1510 * Algorithm:
1511 * 1. The first ip_rt_redirect_number redirects are sent
1512 * with exponential backoff, then we stop sending them at all,
1513 * assuming that the host ignores our redirects.
1514 * 2. If we did not see packets requiring redirects
1515 * during ip_rt_redirect_silence, we assume that the host
1516 * forgot redirected route and start to send redirects again.
1517 *
1518 * This algorithm is much cheaper and more intelligent than dumb load limiting
1519 * in icmp.c.
1520 *
1521 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1522 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1523 */
1524
/* Send an ICMP host redirect for @skb, rate-limited per source address via
 * the inet_peer cache (exponential backoff in rate_tokens, reset after
 * ip_rt_redirect_silence).  See the algorithm comment above.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	/* Snapshot device flags under RCU; bail if TX redirects are off. */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		/* No peer entry available: send unthrottled. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log exactly once, when the host hits the ignore limit. */
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, rt->rt_iif,
					     &rt->rt_dst, &rt->rt_gateway);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
1584
1585static int ip_error(struct sk_buff *skb)
1586{
David S. Miller251da412012-06-26 16:27:09 -07001587 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
Eric Dumazet511c3f92009-06-02 05:14:27 +00001588 struct rtable *rt = skb_rtable(skb);
David S. Miller92d86822011-02-04 15:55:25 -08001589 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001590 unsigned long now;
David S. Miller251da412012-06-26 16:27:09 -07001591 struct net *net;
David S. Miller92d86822011-02-04 15:55:25 -08001592 bool send;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001593 int code;
1594
David S. Miller251da412012-06-26 16:27:09 -07001595 net = dev_net(rt->dst.dev);
1596 if (!IN_DEV_FORWARD(in_dev)) {
1597 switch (rt->dst.error) {
1598 case EHOSTUNREACH:
1599 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1600 break;
1601
1602 case ENETUNREACH:
1603 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1604 break;
1605 }
1606 goto out;
1607 }
1608
Changli Gaod8d1f302010-06-10 23:31:35 -07001609 switch (rt->dst.error) {
Joe Perches4500ebf2011-07-01 09:43:07 +00001610 case EINVAL:
1611 default:
1612 goto out;
1613 case EHOSTUNREACH:
1614 code = ICMP_HOST_UNREACH;
1615 break;
1616 case ENETUNREACH:
1617 code = ICMP_NET_UNREACH;
David S. Miller251da412012-06-26 16:27:09 -07001618 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
Joe Perches4500ebf2011-07-01 09:43:07 +00001619 break;
1620 case EACCES:
1621 code = ICMP_PKT_FILTERED;
1622 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001623 }
1624
David S. Miller1d861aa2012-07-10 03:58:16 -07001625 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001626
1627 send = true;
1628 if (peer) {
1629 now = jiffies;
1630 peer->rate_tokens += now - peer->rate_last;
1631 if (peer->rate_tokens > ip_rt_error_burst)
1632 peer->rate_tokens = ip_rt_error_burst;
1633 peer->rate_last = now;
1634 if (peer->rate_tokens >= ip_rt_error_cost)
1635 peer->rate_tokens -= ip_rt_error_cost;
1636 else
1637 send = false;
David S. Miller1d861aa2012-07-10 03:58:16 -07001638 inet_putpeer(peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001639 }
David S. Miller92d86822011-02-04 15:55:25 -08001640 if (send)
1641 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001642
1643out: kfree_skb(skb);
1644 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001645}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001646
/* Apply a learned path MTU to @rt (clamped to ip_rt_min_pmtu) and record it
 * in a per-nexthop exception so other routes to the same destination pick
 * it up.  The dst itself expires after ip_rt_mtu_expires.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct fib_result res;

	/* Never go below the configured minimum PMTU. */
	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);
		struct fib_nh_exception *fnhe;

		/* fnhe_lock serializes exception-table updates. */
		spin_lock_bh(&fnhe_lock);
		fnhe = find_or_create_fnhe(nh, fl4->daddr);
		if (fnhe) {
			fnhe->fnhe_pmtu = mtu;
			fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
		}
		spin_unlock_bh(&fnhe_lock);
	}
	rt->rt_pmtu = mtu;
	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
}
1669
David S. Miller4895c772012-07-17 04:19:00 -07001670static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1671 struct sk_buff *skb, u32 mtu)
1672{
1673 struct rtable *rt = (struct rtable *) dst;
1674 struct flowi4 fl4;
1675
1676 ip_rt_build_flow_key(&fl4, sk, skb);
1677 __ip_rt_update_pmtu(rt, &fl4, mtu);
1678}
1679
David S. Miller36393392012-06-14 22:21:46 -07001680void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1681 int oif, u32 mark, u8 protocol, int flow_flags)
1682{
David S. Miller4895c772012-07-17 04:19:00 -07001683 const struct iphdr *iph = (const struct iphdr *) skb->data;
David S. Miller36393392012-06-14 22:21:46 -07001684 struct flowi4 fl4;
1685 struct rtable *rt;
1686
David S. Miller4895c772012-07-17 04:19:00 -07001687 __build_flow_key(&fl4, NULL, iph, oif,
1688 RT_TOS(iph->tos), protocol, mark, flow_flags);
David S. Miller36393392012-06-14 22:21:46 -07001689 rt = __ip_route_output_key(net, &fl4);
1690 if (!IS_ERR(rt)) {
David S. Miller4895c772012-07-17 04:19:00 -07001691 __ip_rt_update_pmtu(rt, &fl4, mtu);
David S. Miller36393392012-06-14 22:21:46 -07001692 ip_rt_put(rt);
1693 }
1694}
1695EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1696
1697void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1698{
David S. Miller4895c772012-07-17 04:19:00 -07001699 const struct iphdr *iph = (const struct iphdr *) skb->data;
1700 struct flowi4 fl4;
1701 struct rtable *rt;
David S. Miller36393392012-06-14 22:21:46 -07001702
David S. Miller4895c772012-07-17 04:19:00 -07001703 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1704 rt = __ip_route_output_key(sock_net(sk), &fl4);
1705 if (!IS_ERR(rt)) {
1706 __ip_rt_update_pmtu(rt, &fl4, mtu);
1707 ip_rt_put(rt);
1708 }
David S. Miller36393392012-06-14 22:21:46 -07001709}
1710EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
David S. Millerf39925d2011-02-09 22:00:16 -08001711
David S. Millerb42597e2012-07-11 21:25:45 -07001712void ipv4_redirect(struct sk_buff *skb, struct net *net,
1713 int oif, u32 mark, u8 protocol, int flow_flags)
1714{
David S. Miller4895c772012-07-17 04:19:00 -07001715 const struct iphdr *iph = (const struct iphdr *) skb->data;
David S. Millerb42597e2012-07-11 21:25:45 -07001716 struct flowi4 fl4;
1717 struct rtable *rt;
1718
David S. Miller4895c772012-07-17 04:19:00 -07001719 __build_flow_key(&fl4, NULL, iph, oif,
1720 RT_TOS(iph->tos), protocol, mark, flow_flags);
David S. Millerb42597e2012-07-11 21:25:45 -07001721 rt = __ip_route_output_key(net, &fl4);
1722 if (!IS_ERR(rt)) {
David S. Miller4895c772012-07-17 04:19:00 -07001723 __ip_do_redirect(rt, skb, &fl4);
David S. Millerb42597e2012-07-11 21:25:45 -07001724 ip_rt_put(rt);
1725 }
1726}
1727EXPORT_SYMBOL_GPL(ipv4_redirect);
1728
1729void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1730{
David S. Miller4895c772012-07-17 04:19:00 -07001731 const struct iphdr *iph = (const struct iphdr *) skb->data;
1732 struct flowi4 fl4;
1733 struct rtable *rt;
David S. Millerb42597e2012-07-11 21:25:45 -07001734
David S. Miller4895c772012-07-17 04:19:00 -07001735 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1736 rt = __ip_route_output_key(sock_net(sk), &fl4);
1737 if (!IS_ERR(rt)) {
1738 __ip_do_redirect(rt, skb, &fl4);
1739 ip_rt_put(rt);
1740 }
David S. Millerb42597e2012-07-11 21:25:45 -07001741}
1742EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1743
David S. Millerefbc3682011-12-01 13:38:59 -05001744static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1745{
1746 struct rtable *rt = (struct rtable *) dst;
1747
1748 if (rt_is_expired(rt))
1749 return NULL;
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001750 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001751}
1752
1753static void ipv4_dst_destroy(struct dst_entry *dst)
1754{
1755 struct rtable *rt = (struct rtable *) dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001756
David S. Miller62fa8a82011-01-26 20:51:05 -08001757 if (rt->fi) {
1758 fib_info_put(rt->fi);
1759 rt->fi = NULL;
1760 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001761}
1762
Linus Torvalds1da177e2005-04-16 15:20:36 -07001763
/* dst_ops link_failure hook: tell the sender the host is unreachable and
 * force the attached route to expire immediately so it gets re-looked-up.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		/* Expiry of 0 = already expired. */
		dst_set_expires(&rt->dst, 0);
}
1774
/* Placeholder dst output/input handler that must never be reached; it is
 * installed on routes that should not transmit (e.g. multicast input
 * routes).  Logs the offending flow, drops the skb and warns loudly.
 */
static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
1784
1785/*
1786 We do not cache source address of outgoing interface,
1787 because it is used only by IP RR, TS and SRR options,
1788 so that it out of fast path.
1789
1790 BTW remember: "addr" is allowed to be not aligned
1791 in IP options!
1792 */
1793
/* Copy into @addr the source address our side would use on this route, as
 * needed by the IP RR/TS/SRR options (see the comment above).
 *
 * For output routes the packet's own source suffices; for input routes a
 * reverse FIB lookup (destination/source swapped into @fl4 as-is from the
 * header) finds the preferred source, falling back to an address scoped to
 * the gateway.  @addr may be unaligned, hence the memcpy.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Rebuild the flow key from the received header. */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			/* No FIB match: pick an address reaching the gw. */
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1825
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge a routing classid tag into the dst: each 16-bit half of the tag is
 * taken only if the corresponding half of dst.tclassid is still unset.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 cur = rt->dst.tclassid;

	if (!(cur & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(cur & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1835
David S. Miller0dbaee32010-12-13 12:52:14 -08001836static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1837{
1838 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1839
1840 if (advmss == 0) {
1841 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1842 ip_rt_min_advmss);
1843 if (advmss > 65535 - 40)
1844 advmss = 65535 - 40;
1845 }
1846 return advmss;
1847}
1848
/* dst_ops mtu hook: effective MTU for this route.
 *
 * Precedence: unexpired learned PMTU (rt_pmtu), then the RTAX_MTU metric
 * (which, on output routes, is returned directly), then the device MTU —
 * shrunk to 576 for locked-MTU gatewayed routes — capped at IP_MAX_MTU.
 */
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	/* A learned PMTU only counts while the dst has not expired. */
	if (mtu && time_after_eq(jiffies, rt->dst.expires))
		mtu = 0;

	if (!mtu)
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		/* Locked MTU on a gatewayed route: fall back to the
		 * minimal IPv4 datagram size every host must accept.
		 */
		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
1876
/* Point the route's metrics at the fib_info's metric array.  When the
 * fib_info carries real (non-default) metrics, pin it with a reference via
 * rt->fi so the array outlives the route; dropped in ipv4_dst_destroy().
 */
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	if (fi->fib_metrics != (u32 *) dst_default_metrics) {
		rt->fi = fi;
		atomic_inc(&fi->fib_clntref);
	}
	dst_init_metrics(&rt->dst, fi->fib_metrics, true);
}
1886
David S. Miller4895c772012-07-17 04:19:00 -07001887static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1888{
1889 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1890 struct fib_nh_exception *fnhe;
1891 u32 hval;
1892
1893 hval = (__force u32) daddr;
1894 hval ^= (hval >> 11) ^ (hval >> 22);
1895
1896 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1897 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1898 if (fnhe->fnhe_daddr == daddr) {
1899 if (fnhe->fnhe_pmtu) {
1900 unsigned long expires = fnhe->fnhe_expires;
1901 unsigned long diff = jiffies - expires;
1902
1903 if (time_before(jiffies, expires)) {
1904 rt->rt_pmtu = fnhe->fnhe_pmtu;
1905 dst_set_expires(&rt->dst, diff);
1906 }
1907 }
1908 if (fnhe->fnhe_gw)
1909 rt->rt_gateway = fnhe->fnhe_gw;
1910 fnhe->fnhe_stamp = jiffies;
1911 break;
1912 }
1913 }
1914}
1915
/* Finish initialising a freshly built route from its FIB result: gateway
 * (for on-link nexthops), any cached nexthop exception (PMTU/redirect),
 * metrics, and — when configured — the traffic-classifier tags.
 */
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Only link-scoped nexthops provide a real gateway. */
		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = nh->nh_gw;
		if (unlikely(nh->nh_exceptions))
			rt_bind_exception(rt, nh, fl4->daddr);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	/* Rule-supplied classid first; it wins over the input tag below. */
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1940
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001941static struct rtable *rt_dst_alloc(struct net_device *dev,
1942 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08001943{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001944 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1945 DST_HOST |
1946 (nopolicy ? DST_NOPOLICY : 0) |
1947 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08001948}
1949
/* called in rcu_read_lock() section */
/* Input route resolution for a multicast destination: validate the source,
 * build a multicast route entry (delivered locally iff @our, forwarded via
 * ip_mr_input when multicast routing applies) and insert it into the route
 * cache.  Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Source may not be multicast/broadcast, and only IP is routed. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	/* Loopback sources are martian unless route_localnet is enabled. */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 source is only valid for link-local multicast
		 * (e.g. IGMP from an unconfigured host).
		 */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast routes never transmit via dst.output directly. */
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= daddr;
	rth->fi = NULL;
	if (our) {
		/* We are a member of this group: deliver locally too. */
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
2029
2030
/* Account (and, with CONFIG_IP_ROUTE_VERBOSE and the per-device
 * log_martians setting, rate-limited-log) a packet whose source address
 * failed validation, including a hex dump of its link-layer header.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
2055
/* called in rcu_read_lock() section */
/* Build a forwarding route cache entry for an input packet from a FIB
 * result: validate the source (flagging RTCF_DIRECTSRC/RTCF_DOREDIRECT as
 * appropriate), reject proxy-ARP-invalid non-IP frames, allocate and fill
 * the rtable, and return it via @result.  Returns 0 or a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* err > 0 from fib_validate_source marks a directly connected src. */
	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Same in/out device with an on-link (or shared-media) source:
	 * the sender should be told to go direct.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= daddr;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002143
Stephen Hemminger5969f712008-04-10 01:52:09 -07002144static int ip_mkroute_input(struct sk_buff *skb,
2145 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002146 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002147 struct in_device *in_dev,
2148 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002149{
Daniel Baluta5e73ea12012-04-15 01:34:41 +00002150 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002151 int err;
Eric Dumazet95c96172012-04-15 05:58:06 +00002152 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153
2154#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002155 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002156 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002157#endif
2158
2159 /* create a routing cache entry */
2160 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2161 if (err)
2162 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002163
2164 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002165 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002166 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002167 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002168 if (IS_ERR(rth))
2169 return PTR_ERR(rth);
2170 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171}
2172
/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 * Called with rcu_read_lock().
 */
2183
Al Viro9e12bb22006-09-26 21:25:20 -07002184static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
David S. Millerc10237e2012-06-27 17:05:06 -07002185 u8 tos, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002186{
2187 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002188 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05002189 struct flowi4 fl4;
Eric Dumazet95c96172012-04-15 05:58:06 +00002190 unsigned int flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002191 u32 itag = 0;
Eric Dumazet95c96172012-04-15 05:58:06 +00002192 struct rtable *rth;
2193 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002194 int err = -EINVAL;
Daniel Baluta5e73ea12012-04-15 01:34:41 +00002195 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196
2197 /* IP on this device is disabled. */
2198
2199 if (!in_dev)
2200 goto out;
2201
2202 /* Check for the most weird martians, which can be not detected
2203 by fib_lookup.
2204 */
2205
Thomas Grafd0daebc32012-06-12 00:44:01 +00002206 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207 goto martian_source;
2208
Andy Walls27a954b2010-10-17 15:11:22 +00002209 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210 goto brd_input;
2211
2212 /* Accept zero addresses only to limited broadcast;
2213 * I even do not know to fix it or not. Waiting for complains :-)
2214 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002215 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002216 goto martian_source;
2217
Thomas Grafd0daebc32012-06-12 00:44:01 +00002218 if (ipv4_is_zeronet(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002219 goto martian_destination;
2220
Thomas Grafd0daebc32012-06-12 00:44:01 +00002221 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2222 if (ipv4_is_loopback(daddr))
2223 goto martian_destination;
2224
2225 if (ipv4_is_loopback(saddr))
2226 goto martian_source;
2227 }
2228
Linus Torvalds1da177e2005-04-16 15:20:36 -07002229 /*
2230 * Now we are ready to route packet.
2231 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002232 fl4.flowi4_oif = 0;
2233 fl4.flowi4_iif = dev->ifindex;
2234 fl4.flowi4_mark = skb->mark;
2235 fl4.flowi4_tos = tos;
2236 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2237 fl4.daddr = daddr;
2238 fl4.saddr = saddr;
2239 err = fib_lookup(net, &fl4, &res);
David S. Miller251da412012-06-26 16:27:09 -07002240 if (err != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002241 goto no_route;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002242
2243 RT_CACHE_STAT_INC(in_slow_tot);
2244
2245 if (res.type == RTN_BROADCAST)
2246 goto brd_input;
2247
2248 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002249 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002250 net->loopback_dev->ifindex,
David S. Miller9e56e382012-06-28 18:54:02 -07002251 dev, in_dev, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002252 if (err < 0)
2253 goto martian_source_keep_err;
2254 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002255 flags |= RTCF_DIRECTSRC;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002256 goto local_input;
2257 }
2258
2259 if (!IN_DEV_FORWARD(in_dev))
David S. Miller251da412012-06-26 16:27:09 -07002260 goto no_route;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002261 if (res.type != RTN_UNICAST)
2262 goto martian_destination;
2263
David S. Miller68a5e3d2011-03-11 20:07:33 -05002264 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002265out: return err;
2266
2267brd_input:
2268 if (skb->protocol != htons(ETH_P_IP))
2269 goto e_inval;
2270
David S. Miller41347dc2012-06-28 04:05:27 -07002271 if (!ipv4_is_zeronet(saddr)) {
David S. Miller9e56e382012-06-28 18:54:02 -07002272 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2273 in_dev, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002274 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002275 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002276 if (err)
2277 flags |= RTCF_DIRECTSRC;
2278 }
2279 flags |= RTCF_BROADCAST;
2280 res.type = RTN_BROADCAST;
2281 RT_CACHE_STAT_INC(in_brd);
2282
2283local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002284 rth = rt_dst_alloc(net->loopback_dev,
2285 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002286 if (!rth)
2287 goto e_nobufs;
2288
David S. Millercf911662011-04-28 14:31:47 -07002289 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002290 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002291#ifdef CONFIG_IP_ROUTE_CLASSID
2292 rth->dst.tclassid = itag;
2293#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002294
David S. Miller5e2b61f2011-03-04 21:47:09 -08002295 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002296 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002297 rth->rt_genid = rt_genid(net);
2298 rth->rt_flags = flags|RTCF_LOCAL;
2299 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002300 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002301 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002302 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002303 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002304 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002305 rth->rt_oif = 0;
2306 rth->rt_mark = skb->mark;
David S. Miller59436342012-07-10 06:58:42 -07002307 rth->rt_pmtu = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002308 rth->rt_gateway = daddr;
David S. Millercf911662011-04-28 14:31:47 -07002309 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002310 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002311 rth->dst.input= ip_error;
2312 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002313 rth->rt_flags &= ~RTCF_LOCAL;
2314 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002315 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2316 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002317 err = 0;
2318 if (IS_ERR(rth))
2319 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002320 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002321
2322no_route:
2323 RT_CACHE_STAT_INC(in_no_route);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002324 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002325 if (err == -ESRCH)
2326 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002327 goto local_input;
2328
2329 /*
2330 * Do not cache martian addresses: they should be logged (RFC1812)
2331 */
2332martian_destination:
2333 RT_CACHE_STAT_INC(in_martian_dst);
2334#ifdef CONFIG_IP_ROUTE_VERBOSE
Joe Perchese87cc472012-05-13 21:56:26 +00002335 if (IN_DEV_LOG_MARTIANS(in_dev))
2336 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2337 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002338#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002339
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340e_inval:
2341 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002342 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002343
2344e_nobufs:
2345 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002346 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002347
2348martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002349 err = -EINVAL;
2350martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002351 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002352 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002353}
2354
Eric Dumazet407eadd2010-05-10 11:32:55 +00002355int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
David S. Millerc10237e2012-06-27 17:05:06 -07002356 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002357{
Eric Dumazet95c96172012-04-15 05:58:06 +00002358 struct rtable *rth;
2359 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002360 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002361 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002362 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002363
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002364 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002365
Eric Dumazet96d36222010-06-02 19:21:31 +00002366 rcu_read_lock();
2367
Neil Horman1080d702008-10-27 12:28:25 -07002368 if (!rt_caching(net))
2369 goto skip_cache;
2370
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002372 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002373
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002375 rth = rcu_dereference(rth->dst.rt_next)) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08002376 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2377 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
Julian Anastasov97a80412011-08-09 04:01:16 +00002378 (rth->rt_route_iif ^ iif) |
David S. Miller475949d2011-05-03 19:45:15 -07002379 (rth->rt_key_tos ^ tos)) == 0 &&
David S. Miller5e2b61f2011-03-04 21:47:09 -08002380 rth->rt_mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002381 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002382 !rt_is_expired(rth)) {
Eric Dumazet407eadd2010-05-10 11:32:55 +00002383 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002384 dst_use_noref(&rth->dst, jiffies);
2385 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002386 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002387 dst_use(&rth->dst, jiffies);
2388 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002389 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002390 RT_CACHE_STAT_INC(in_hit);
2391 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002392 return 0;
2393 }
2394 RT_CACHE_STAT_INC(in_hlist_search);
2395 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002396
Neil Horman1080d702008-10-27 12:28:25 -07002397skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002398 /* Multicast recognition logic is moved from route cache to here.
2399 The problem was that too many Ethernet cards have broken/missing
2400 hardware multicast filters :-( As result the host on multicasting
2401 network acquires a lot of useless route cache entries, sort of
2402 SDR messages from all the world. Now we try to get rid of them.
2403 Really, provided software IP multicast filter is organized
2404 reasonably (at least, hashed), it does not result in a slowdown
2405 comparing with route cache reject entries.
2406 Note, that multicast routers are not affected, because
2407 route cache entry is created eventually.
2408 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002409 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002410 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002411
Eric Dumazet96d36222010-06-02 19:21:31 +00002412 if (in_dev) {
David S. Millerdbdd9a52011-03-10 16:34:38 -08002413 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2414 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002415 if (our
2416#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002417 ||
2418 (!ipv4_is_local_multicast(daddr) &&
2419 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002420#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002421 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002422 int res = ip_route_input_mc(skb, daddr, saddr,
2423 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002424 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002425 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002426 }
2427 }
2428 rcu_read_unlock();
2429 return -EINVAL;
2430 }
David S. Millerc10237e2012-06-27 17:05:06 -07002431 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
Eric Dumazet96d36222010-06-02 19:21:31 +00002432 rcu_read_unlock();
2433 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002434}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002435EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002436
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002437/* called with rcu_read_lock() */
David S. Miller982721f2011-02-16 21:44:24 -08002438static struct rtable *__mkroute_output(const struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002439 const struct flowi4 *fl4,
David S. Miller813b3b52011-04-28 14:48:42 -07002440 __be32 orig_daddr, __be32 orig_saddr,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002441 int orig_oif, __u8 orig_rtos,
2442 struct net_device *dev_out,
David S. Miller5ada5522011-02-17 15:29:00 -08002443 unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002444{
David S. Miller982721f2011-02-16 21:44:24 -08002445 struct fib_info *fi = res->fi;
David S. Miller5ada5522011-02-17 15:29:00 -08002446 struct in_device *in_dev;
David S. Miller982721f2011-02-16 21:44:24 -08002447 u16 type = res->type;
David S. Miller5ada5522011-02-17 15:29:00 -08002448 struct rtable *rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002449
Thomas Grafd0daebc32012-06-12 00:44:01 +00002450 in_dev = __in_dev_get_rcu(dev_out);
2451 if (!in_dev)
David S. Miller5ada5522011-02-17 15:29:00 -08002452 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002453
Thomas Grafd0daebc32012-06-12 00:44:01 +00002454 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2455 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2456 return ERR_PTR(-EINVAL);
2457
David S. Miller68a5e3d2011-03-11 20:07:33 -05002458 if (ipv4_is_lbcast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002459 type = RTN_BROADCAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002460 else if (ipv4_is_multicast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002461 type = RTN_MULTICAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002462 else if (ipv4_is_zeronet(fl4->daddr))
David S. Miller5ada5522011-02-17 15:29:00 -08002463 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002464
2465 if (dev_out->flags & IFF_LOOPBACK)
2466 flags |= RTCF_LOCAL;
2467
David S. Miller982721f2011-02-16 21:44:24 -08002468 if (type == RTN_BROADCAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002469 flags |= RTCF_BROADCAST | RTCF_LOCAL;
David S. Miller982721f2011-02-16 21:44:24 -08002470 fi = NULL;
2471 } else if (type == RTN_MULTICAST) {
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002472 flags |= RTCF_MULTICAST | RTCF_LOCAL;
David S. Miller813b3b52011-04-28 14:48:42 -07002473 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2474 fl4->flowi4_proto))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002475 flags &= ~RTCF_LOCAL;
2476 /* If multicast route do not exist use
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002477 * default one, but do not gateway in this case.
2478 * Yes, it is hack.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002479 */
David S. Miller982721f2011-02-16 21:44:24 -08002480 if (fi && res->prefixlen < 4)
2481 fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002482 }
2483
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002484 rth = rt_dst_alloc(dev_out,
2485 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002486 IN_DEV_CONF_GET(in_dev, NOXFRM));
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002487 if (!rth)
David S. Miller5ada5522011-02-17 15:29:00 -08002488 return ERR_PTR(-ENOBUFS);
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002489
David S. Millercf911662011-04-28 14:31:47 -07002490 rth->dst.output = ip_output;
2491
David S. Miller813b3b52011-04-28 14:48:42 -07002492 rth->rt_key_dst = orig_daddr;
2493 rth->rt_key_src = orig_saddr;
David S. Millercf911662011-04-28 14:31:47 -07002494 rth->rt_genid = rt_genid(dev_net(dev_out));
2495 rth->rt_flags = flags;
2496 rth->rt_type = type;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002497 rth->rt_key_tos = orig_rtos;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002498 rth->rt_dst = fl4->daddr;
2499 rth->rt_src = fl4->saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002500 rth->rt_route_iif = 0;
David S. Miller813b3b52011-04-28 14:48:42 -07002501 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2502 rth->rt_oif = orig_oif;
2503 rth->rt_mark = fl4->flowi4_mark;
David S. Miller59436342012-07-10 06:58:42 -07002504 rth->rt_pmtu = 0;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002505 rth->rt_gateway = fl4->daddr;
David S. Millercf911662011-04-28 14:31:47 -07002506 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002507
2508 RT_CACHE_STAT_INC(out_slow_tot);
2509
David S. Miller41347dc2012-06-28 04:05:27 -07002510 if (flags & RTCF_LOCAL)
Changli Gaod8d1f302010-06-10 23:31:35 -07002511 rth->dst.input = ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002512 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002513 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002514 !(dev_out->flags & IFF_LOOPBACK)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002515 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516 RT_CACHE_STAT_INC(out_slow_mc);
2517 }
2518#ifdef CONFIG_IP_MROUTE
David S. Miller982721f2011-02-16 21:44:24 -08002519 if (type == RTN_MULTICAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002520 if (IN_DEV_MFORWARD(in_dev) &&
David S. Miller813b3b52011-04-28 14:48:42 -07002521 !ipv4_is_local_multicast(fl4->daddr)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002522 rth->dst.input = ip_mr_input;
2523 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002524 }
2525 }
2526#endif
2527 }
2528
David S. Miller813b3b52011-04-28 14:48:42 -07002529 rt_set_nexthop(rth, fl4, res, fi, type, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002530
Eric Dumazet7586ece2012-06-20 05:02:19 +00002531 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2532 rth->dst.flags |= DST_NOCACHE;
2533
David S. Miller5ada5522011-02-17 15:29:00 -08002534 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002535}
2536
Linus Torvalds1da177e2005-04-16 15:20:36 -07002537/*
2538 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002539 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002540 */
2541
David S. Miller813b3b52011-04-28 14:48:42 -07002542static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002543{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002544 struct net_device *dev_out = NULL;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002545 __u8 tos = RT_FL_TOS(fl4);
David S. Miller813b3b52011-04-28 14:48:42 -07002546 unsigned int flags = 0;
2547 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002548 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002549 __be32 orig_daddr;
2550 __be32 orig_saddr;
2551 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002552
David S. Miller85b91b02012-07-13 08:21:29 -07002553 res.tclassid = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002554 res.fi = NULL;
David S. Miller8b96d222012-06-11 02:01:56 -07002555 res.table = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002556
David S. Miller813b3b52011-04-28 14:48:42 -07002557 orig_daddr = fl4->daddr;
2558 orig_saddr = fl4->saddr;
2559 orig_oif = fl4->flowi4_oif;
2560
2561 fl4->flowi4_iif = net->loopback_dev->ifindex;
2562 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2563 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2564 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002565
David S. Miller010c2702011-02-17 15:37:09 -08002566 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002567 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002568 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002569 if (ipv4_is_multicast(fl4->saddr) ||
2570 ipv4_is_lbcast(fl4->saddr) ||
2571 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002572 goto out;
2573
Linus Torvalds1da177e2005-04-16 15:20:36 -07002574 /* I removed check for oif == dev_out->oif here.
2575 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002576 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2577 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002578 2. Moreover, we are allowed to send packets with saddr
2579 of another iface. --ANK
2580 */
2581
David S. Miller813b3b52011-04-28 14:48:42 -07002582 if (fl4->flowi4_oif == 0 &&
2583 (ipv4_is_multicast(fl4->daddr) ||
2584 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002585 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002586 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002587 if (dev_out == NULL)
2588 goto out;
2589
Linus Torvalds1da177e2005-04-16 15:20:36 -07002590 /* Special hack: user can direct multicasts
2591 and limited broadcast via necessary interface
2592 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2593 This hack is not just for fun, it allows
2594 vic,vat and friends to work.
2595 They bind socket to loopback, set ttl to zero
2596 and expect that it will work.
2597 From the viewpoint of routing cache they are broken,
2598 because we are not allowed to build multicast path
2599 with loopback source addr (look, routing cache
2600 cannot know, that ttl is zero, so that packet
2601 will not leave this host and route is valid).
2602 Luckily, this hack is good workaround.
2603 */
2604
David S. Miller813b3b52011-04-28 14:48:42 -07002605 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002606 goto make_route;
2607 }
Julian Anastasova210d012008-10-01 07:28:28 -07002608
David S. Miller813b3b52011-04-28 14:48:42 -07002609 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002610 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002611 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002612 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002613 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002614 }
2615
2616
David S. Miller813b3b52011-04-28 14:48:42 -07002617 if (fl4->flowi4_oif) {
2618 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002619 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002620 if (dev_out == NULL)
2621 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002622
2623 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002624 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002625 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002626 goto out;
2627 }
David S. Miller813b3b52011-04-28 14:48:42 -07002628 if (ipv4_is_local_multicast(fl4->daddr) ||
2629 ipv4_is_lbcast(fl4->daddr)) {
2630 if (!fl4->saddr)
2631 fl4->saddr = inet_select_addr(dev_out, 0,
2632 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002633 goto make_route;
2634 }
David S. Miller813b3b52011-04-28 14:48:42 -07002635 if (fl4->saddr) {
2636 if (ipv4_is_multicast(fl4->daddr))
2637 fl4->saddr = inet_select_addr(dev_out, 0,
2638 fl4->flowi4_scope);
2639 else if (!fl4->daddr)
2640 fl4->saddr = inet_select_addr(dev_out, 0,
2641 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002642 }
2643 }
2644
David S. Miller813b3b52011-04-28 14:48:42 -07002645 if (!fl4->daddr) {
2646 fl4->daddr = fl4->saddr;
2647 if (!fl4->daddr)
2648 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002649 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002650 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002651 res.type = RTN_LOCAL;
2652 flags |= RTCF_LOCAL;
2653 goto make_route;
2654 }
2655
David S. Miller813b3b52011-04-28 14:48:42 -07002656 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002657 res.fi = NULL;
David S. Miller8b96d222012-06-11 02:01:56 -07002658 res.table = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002659 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002660 /* Apparently, routing tables are wrong. Assume,
2661 that the destination is on link.
2662
2663 WHY? DW.
2664 Because we are allowed to send to iface
2665 even if it has NO routes and NO assigned
2666 addresses. When oif is specified, routing
2667 tables are looked up with only one purpose:
2668 to catch if destination is gatewayed, rather than
2669 direct. Moreover, if MSG_DONTROUTE is set,
2670 we send packet, ignoring both routing tables
2671 and ifaddr state. --ANK
2672
2673
2674 We could make it even if oif is unknown,
2675 likely IPv6, but we do not.
2676 */
2677
David S. Miller813b3b52011-04-28 14:48:42 -07002678 if (fl4->saddr == 0)
2679 fl4->saddr = inet_select_addr(dev_out, 0,
2680 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002681 res.type = RTN_UNICAST;
2682 goto make_route;
2683 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002684 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002685 goto out;
2686 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002687
2688 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002689 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002690 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002691 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002692 else
David S. Miller813b3b52011-04-28 14:48:42 -07002693 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002694 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002695 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002696 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002697 res.fi = NULL;
2698 flags |= RTCF_LOCAL;
2699 goto make_route;
2700 }
2701
2702#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002703 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002704 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002705 else
2706#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002707 if (!res.prefixlen &&
2708 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002709 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002710 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002711
David S. Miller813b3b52011-04-28 14:48:42 -07002712 if (!fl4->saddr)
2713 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002714
Linus Torvalds1da177e2005-04-16 15:20:36 -07002715 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002716 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002717
2718
2719make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002720 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002721 tos, dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002722 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002723 unsigned int hash;
2724
David S. Miller813b3b52011-04-28 14:48:42 -07002725 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002726 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002727 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002728 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002729
David S. Miller010c2702011-02-17 15:37:09 -08002730out:
2731 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002732 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002733}
2734
/*
 * Output-route lookup: try the routing cache first, fall back to the
 * slow path (FIB lookup) on a miss or when caching is disabled.
 *
 * @net:  namespace to resolve the route in
 * @flp4: flow key; on a cache hit with wildcard (zero) saddr/daddr the
 *        resolved addresses are written back into it
 *
 * Returns the route (refcounted via dst_use) or an ERR_PTR from the
 * slow path.  Runs under rcu_read_lock_bh() while walking the chain.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		/* Match on the full output flow key: addresses, direction,
		 * oif, mark, and TOS bits relevant to routing (ONLINK too).
		 * Entries from another netns or a stale generation are skipped.
		 */
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			/* Take a reference before leaving the RCU section. */
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			/* Fill in wildcard addresses from the cached entry. */
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2774
/* Blackhole dsts are never valid for reuse: always report "stale". */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2779
Steffen Klassertebb762f2011-11-23 02:12:51 +00002780static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002781{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002782 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2783
2784 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002785}
2786
/* Intentional no-op: PMTU updates are meaningless on a blackhole route. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}
2791
/* Intentional no-op: redirects are ignored on a blackhole route. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2796
/* Blackhole routes never get writable metrics; refuse the COW request. */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2802
/*
 * dst_ops for blackhole routes (used e.g. by xfrm when a real route
 * must be replaced by a non-forwarding stand-in).  Shares destroy,
 * advmss and neigh_lookup with the regular IPv4 dst_ops; everything
 * that could mutate or revalidate the entry is stubbed out above.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2815
/*
 * Clone @dst_orig into a blackhole route: same key/identity fields,
 * but input/output handlers discard all packets.
 *
 * Consumes a reference on @dst_orig (dst_release at the end) in all
 * cases.  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Blackhole: drop everything in both directions. */
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the routing key and cached identity of the original. */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		/* Share the fib_info; take an extra client reference. */
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		/* Not hashed into the route cache: mark free-on-last-put. */
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2858
David S. Miller9d6ec932011-03-12 01:12:47 -05002859struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002860 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002861{
David S. Miller9d6ec932011-03-12 01:12:47 -05002862 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002863
David S. Millerb23dd4f2011-03-02 14:31:35 -08002864 if (IS_ERR(rt))
2865 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002866
David S. Miller56157872011-05-02 14:37:45 -07002867 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002868 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2869 flowi4_to_flowi(flp4),
2870 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002871
David S. Millerb23dd4f2011-03-02 14:31:35 -08002872 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002873}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002874EXPORT_SYMBOL_GPL(ip_route_output_flow);
2875
/*
 * Serialize a route cache entry into an RTM_* netlink message on @skb.
 *
 * @net:    namespace the dump runs in
 * @skb:    message buffer being filled
 * @pid:    destination netlink port id
 * @seq:    netlink sequence number
 * @event:  message type (e.g. RTM_NEWROUTE)
 * @nowait: for multicast input routes, whether ipmr_get_route() may
 *          defer resolution instead of blocking
 * @flags:  netlink message flags (NLM_F_MULTI etc.)
 *
 * Returns the result of nlmsg_end() on success, 0 when ipmr queued the
 * answer itself, or -EMSGSIZE when @skb ran out of room (message is
 * cancelled before returning).
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* Fixed rtmsg header: cached routes are always /32 host routes. */
	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
		goto nla_put_failure;
	/* Only emit RTA_SRC when the lookup key actually had a source. */
	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* Output routes report the selected preferred source address when
	 * it differs from the key's source.
	 */
	if (!rt_is_input_route(rt) &&
	    rt->rt_src != rt->rt_key_src) {
		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
			goto nla_put_failure;
	}
	if (rt->rt_dst != rt->rt_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark &&
	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
		goto nla_put_failure;

	error = rt->dst.error;
	/* Convert the absolute expiry into a remaining-time delta. */
	expires = rt->dst.expires;
	if (expires) {
		if (time_before(jiffies, expires))
			expires -= jiffies;
		else
			expires = 0;
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Non-local multicast with forwarding enabled is answered by
		 * the multicast routing code, which may resolve asynchronously
		 * (honouring @nowait).
		 */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2980
/*
 * RTM_GETROUTE handler: resolve one route for the parameters given in
 * the netlink request and unicast the answer back to the sender.
 *
 * With RTA_IIF set the request simulates an *input* lookup on that
 * device via ip_route_input() on a dummy skb; otherwise a normal
 * output lookup is done with ip_route_output_key().
 *
 * Returns 0 on success or a negative errno; the reply skb is freed on
 * every error path via errout_free.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* All attributes are optional; absent ones default to zero. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		/* ip_route_input() expects softirq context. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* A successful lookup may still yield an error route. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	/* rtnl_unicast() consumes skb regardless of outcome. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
3075
/*
 * Netlink dump callback for the route cache: walk every hash bucket
 * and emit one RTM_NEWROUTE message per live entry belonging to the
 * caller's namespace.
 *
 * Resumable: cb->args[0] holds the bucket and cb->args[1] the index
 * within the bucket where the previous invocation stopped.  Returns
 * skb->len per the standard dump-callback contract.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			/* Skip foreign-netns entries and ones already dumped. */
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* Borrow the dst without a refcount; dropped below
			 * before leaving the RCU section either way.
			 */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	/* Record resume position for the next dump invocation. */
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3117
/* Multicast configuration changed on @in_dev: flush that namespace's
 * route cache immediately (delay 0).
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3122
3123#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003124static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003125 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003126 size_t *lenp, loff_t *ppos)
3127{
3128 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003129 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003130 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003131 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003132
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003133 memcpy(&ctl, __ctl, sizeof(ctl));
3134 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003135 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003136
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003137 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003138 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003139 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003140 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003141
3142 return -EINVAL;
3143}
3144
/*
 * Global (not per-netns) tunables for the IPv4 routing cache, exposed
 * under /proc/sys/net/ipv4/route/.  All entries are ints; the
 * *_jiffies / *_ms_jiffies handlers convert between user units and
 * jiffies on read/write.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003255
/*
 * Per-netns "flush" control (write-only, no ->data: the handler reads
 * the value itself and takes the owning netns from ->extra1, which
 * sysctl_route_net_init() fills in).
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3265
/*
 * Per-netns setup: register net/ipv4/route sysctls.  The init netns
 * uses the static flush table directly; every other netns gets its own
 * kmemdup'd copy so each can carry its own ->extra1 back-pointer.
 * Returns 0 or -ENOMEM (goto-cleanup frees the copy on failure).
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	/* Let the flush handler find its namespace. */
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	/* Only free a duplicated table, never the static one. */
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
3289
/*
 * Per-netns teardown: unregister the sysctls and free the duplicated
 * table.  The BUG_ON guards against freeing the static init-netns
 * table (init_net is never torn down, so its table never reaches here).
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
3299
/* pernet hooks tying the sysctl setup/teardown above to netns lifetime. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003304#endif
3305
/*
 * Seed the per-netns route-cache generation counters with random
 * values; bumping rt_genid later invalidates all cached routes at once.
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
3314
/* pernet hook: seed generation counters for each new namespace. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3318
/*
 * Allocate and initialize this namespace's inet_peer base (per-peer
 * state such as PMTU/redirect info).  Returns 0 or -ENOMEM.
 */
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}
3329
/*
 * Tear down the namespace's inet_peer base: detach it first so no new
 * lookups find it, then invalidate the peer tree and free the base.
 */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
3338
/* pernet hooks for the inet_peer base lifetime. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003343
Patrick McHardyc7066f72011-01-14 13:36:42 +01003344#ifdef CONFIG_IP_ROUTE_CLASSID
Tejun Heo7d720c32010-02-16 15:20:26 +00003345struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
Patrick McHardyc7066f72011-01-14 13:36:42 +01003346#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003347
3348static __initdata unsigned long rhash_entries;
3349static int __init set_rhash_entries(char *str)
3350{
Eldad Zack413c27d2012-05-19 14:13:18 +00003351 ssize_t ret;
3352
Linus Torvalds1da177e2005-04-16 15:20:36 -07003353 if (!str)
3354 return 0;
Eldad Zack413c27d2012-05-19 14:13:18 +00003355
3356 ret = kstrtoul(str, 0, &rhash_entries);
3357 if (ret)
3358 return 0;
3359
Linus Torvalds1da177e2005-04-16 15:20:36 -07003360 return 1;
3361}
3362__setup("rhash_entries=", set_rhash_entries);
3363
/*
 * One-time boot initialization of the IPv4 routing layer: slab caches,
 * dst entry counters, the route-cache hash table, sub-module inits,
 * the periodic cache-expiry worker, proc/netlink/sysctl registration.
 *
 * Always returns 0; early allocation failures panic() instead, since
 * IPv4 routing cannot function without them.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* Per-cpu classid accounting table (256 entries per cpu). */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	/* Slab cache for struct rtable; shared with the blackhole ops. */
	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	/* Per-cpu dst entry counters for both ops tables. */
	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/*
	 * Route-cache hash table.  Size honours the "rhash_entries=" boot
	 * parameter when given; otherwise it is auto-scaled from total RAM
	 * (shift 15 for boxes under 512MB at 4K pages, 17 above), with a
	 * 512K-entry cap on the automatic sizing.
	 */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC thresholds scale with the hash table actually obtained. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/*
	 * Kick off the periodic route-cache expiry worker; the initial
	 * delay is randomized (interval..2*interval) to avoid synchronized
	 * wakeups across the system.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* proc files are best-effort: warn and keep going on failure. */
	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	/* Handle RTM_GETROUTE requests over rtnetlink. */
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
3426
Al Viroa1bc6eb2008-07-30 06:32:52 -04003427#ifdef CONFIG_SYSCTL
Al Viroeeb61f72008-07-27 08:59:33 +01003428/*
3429 * We really need to sanitize the damn ipv4 init order, then all
3430 * this nonsense will go away.
3431 */
3432void __init ip_static_sysctl_init(void)
3433{
Eric W. Biederman4e5ca782012-04-19 13:32:39 +00003434 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
Al Viroeeb61f72008-07-27 08:59:33 +01003435}
Al Viroa1bc6eb2008-07-30 06:32:52 -04003436#endif