blob: 9e7909eef8d10107008f8d629f9f2d75fde52eb2 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Joe Perchesafd465032012-03-12 07:03:32 +000065#define pr_fmt(fmt) "IPv4: " fmt
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070069#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070072#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070073#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070083#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070085#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090094#include <linux/slab.h>
Stephen Rothwellb9eda062011-12-22 17:03:29 +110095#include <linux/prefetch.h>
Herbert Xu352e5122007-11-13 21:34:06 -080096#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020097#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070098#include <net/protocol.h>
99#include <net/ip.h>
100#include <net/route.h>
101#include <net/inetpeer.h>
102#include <net/sock.h>
103#include <net/ip_fib.h>
104#include <net/arp.h>
105#include <net/tcp.h>
106#include <net/icmp.h>
107#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700108#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700109#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h>
112#endif
David S. Miller6e5714e2011-08-03 20:50:44 -0700113#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700114
David S. Miller68a5e3d2011-03-11 20:07:33 -0500115#define RT_FL_TOS(oldflp4) \
Julian Anastasovf61759e2011-12-02 11:39:42 +0000116 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700117
118#define IP_MAX_MTU 0xFFF0
119
120#define RT_GC_TIMEOUT (300*HZ)
121
Linus Torvalds1da177e2005-04-16 15:20:36 -0700122static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700123static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500124static int ip_rt_gc_interval __read_mostly = 60 * HZ;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700125static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
126static int ip_rt_redirect_number __read_mostly = 9;
127static int ip_rt_redirect_load __read_mostly = HZ / 50;
128static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
129static int ip_rt_error_cost __read_mostly = HZ;
130static int ip_rt_error_burst __read_mostly = 5 * HZ;
131static int ip_rt_gc_elasticity __read_mostly = 8;
132static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
133static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
134static int ip_rt_min_advmss __read_mostly = 256;
Neil Horman1080d702008-10-27 12:28:25 -0700135static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500137static struct delayed_work expires_work;
138static unsigned long expires_ljiffies;
139
Linus Torvalds1da177e2005-04-16 15:20:36 -0700140/*
141 * Interface to generic destination cache.
142 */
143
144static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800145static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
Steffen Klassertebb762f2011-11-23 02:12:51 +0000146static unsigned int ipv4_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700147static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149static void ipv4_link_failure(struct sk_buff *skb);
150static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800151static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700152
Marcelo Ricardo Leitnerb6153ea2014-08-14 16:44:52 -0300153static void __rt_garbage_collect(struct work_struct *w);
154static DECLARE_WORK(rt_gc_worker, __rt_garbage_collect);
155
Eric Dumazet72cdd1d2010-11-11 07:14:07 +0000156static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
157 int how)
158{
159}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700160
David S. Miller62fa8a82011-01-26 20:51:05 -0800161static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
162{
David S. Miller06582542011-01-27 14:58:42 -0800163 struct rtable *rt = (struct rtable *) dst;
164 struct inet_peer *peer;
165 u32 *p = NULL;
David S. Miller62fa8a82011-01-26 20:51:05 -0800166
David S. Miller06582542011-01-27 14:58:42 -0800167 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -0400168 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller06582542011-01-27 14:58:42 -0800169
170 peer = rt->peer;
171 if (peer) {
David S. Miller62fa8a82011-01-26 20:51:05 -0800172 u32 *old_p = __DST_METRICS_PTR(old);
173 unsigned long prev, new;
174
David S. Miller06582542011-01-27 14:58:42 -0800175 p = peer->metrics;
176 if (inet_metrics_new(peer))
177 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
David S. Miller62fa8a82011-01-26 20:51:05 -0800178
179 new = (unsigned long) p;
180 prev = cmpxchg(&dst->_metrics, old, new);
181
182 if (prev != old) {
David S. Miller62fa8a82011-01-26 20:51:05 -0800183 p = __DST_METRICS_PTR(prev);
184 if (prev & DST_METRICS_READ_ONLY)
185 p = NULL;
186 } else {
David S. Miller62fa8a82011-01-26 20:51:05 -0800187 if (rt->fi) {
188 fib_info_put(rt->fi);
189 rt->fi = NULL;
190 }
191 }
192 }
193 return p;
194}
195
David S. Millerd3aaeb32011-07-18 00:40:17 -0700196static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
197
Linus Torvalds1da177e2005-04-16 15:20:36 -0700198static struct dst_ops ipv4_dst_ops = {
199 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -0800200 .protocol = cpu_to_be16(ETH_P_IP),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700201 .gc = rt_garbage_collect,
202 .check = ipv4_dst_check,
David S. Miller0dbaee32010-12-13 12:52:14 -0800203 .default_advmss = ipv4_default_advmss,
Steffen Klassertebb762f2011-11-23 02:12:51 +0000204 .mtu = ipv4_mtu,
David S. Miller62fa8a82011-01-26 20:51:05 -0800205 .cow_metrics = ipv4_cow_metrics,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206 .destroy = ipv4_dst_destroy,
207 .ifdown = ipv4_dst_ifdown,
208 .negative_advice = ipv4_negative_advice,
209 .link_failure = ipv4_link_failure,
210 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu1ac06e02008-05-20 14:32:14 -0700211 .local_out = __ip_local_out,
David S. Millerd3aaeb32011-07-18 00:40:17 -0700212 .neigh_lookup = ipv4_neigh_lookup,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700213};
214
215#define ECN_OR_COST(class) TC_PRIO_##class
216
Philippe De Muyter4839c522007-07-09 15:32:57 -0700217const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700218 TC_PRIO_BESTEFFORT,
Dan Siemon4a2b9c32011-03-15 13:56:07 +0000219 ECN_OR_COST(BESTEFFORT),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700220 TC_PRIO_BESTEFFORT,
221 ECN_OR_COST(BESTEFFORT),
222 TC_PRIO_BULK,
223 ECN_OR_COST(BULK),
224 TC_PRIO_BULK,
225 ECN_OR_COST(BULK),
226 TC_PRIO_INTERACTIVE,
227 ECN_OR_COST(INTERACTIVE),
228 TC_PRIO_INTERACTIVE,
229 ECN_OR_COST(INTERACTIVE),
230 TC_PRIO_INTERACTIVE_BULK,
231 ECN_OR_COST(INTERACTIVE_BULK),
232 TC_PRIO_INTERACTIVE_BULK,
233 ECN_OR_COST(INTERACTIVE_BULK)
234};
235
236
237/*
238 * Route cache.
239 */
240
241/* The locking scheme is rather straight forward:
242 *
243 * 1) Read-Copy Update protects the buckets of the central route hash.
244 * 2) Only writers remove entries, and they hold the lock
245 * as they look at rtable reference counts.
246 * 3) Only readers acquire references to rtable entries,
247 * they do so with atomic increments and with the
248 * lock held.
249 */
250
251struct rt_hash_bucket {
Eric Dumazet1c317202010-10-25 21:02:07 +0000252 struct rtable __rcu *chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700253};
Neil Horman1080d702008-10-27 12:28:25 -0700254
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700255#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
256 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700257/*
258 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
259 * The size of this table is a power of two and depends on the number of CPUS.
Ingo Molnar62051202006-07-03 00:24:59 -0700260 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700261 */
Ingo Molnar62051202006-07-03 00:24:59 -0700262#ifdef CONFIG_LOCKDEP
263# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700264#else
Ingo Molnar62051202006-07-03 00:24:59 -0700265# if NR_CPUS >= 32
266# define RT_HASH_LOCK_SZ 4096
267# elif NR_CPUS >= 16
268# define RT_HASH_LOCK_SZ 2048
269# elif NR_CPUS >= 8
270# define RT_HASH_LOCK_SZ 1024
271# elif NR_CPUS >= 4
272# define RT_HASH_LOCK_SZ 512
273# else
274# define RT_HASH_LOCK_SZ 256
275# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700276#endif
277
278static spinlock_t *rt_hash_locks;
279# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800280
281static __init void rt_hash_lock_init(void)
282{
283 int i;
284
285 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
286 GFP_KERNEL);
287 if (!rt_hash_locks)
288 panic("IP: failed to allocate rt_hash_locks\n");
289
290 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
291 spin_lock_init(&rt_hash_locks[i]);
292}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700293#else
294# define rt_hash_lock_addr(slot) NULL
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800295
296static inline void rt_hash_lock_init(void)
297{
298}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700299#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700300
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700301static struct rt_hash_bucket *rt_hash_table __read_mostly;
302static unsigned rt_hash_mask __read_mostly;
303static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700304
Eric Dumazet2f970d82006-01-17 02:54:36 -0800305static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Eric Dumazet27f39c72010-05-19 22:07:23 +0000306#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700307
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700308static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700309 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700310{
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700311 return jhash_3words((__force u32)daddr, (__force u32)saddr,
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700312 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800313 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700314}
315
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700316static inline int rt_genid(struct net *net)
317{
318 return atomic_read(&net->ipv4.rt_genid);
319}
320
Linus Torvalds1da177e2005-04-16 15:20:36 -0700321#ifdef CONFIG_PROC_FS
322struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800323 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700324 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800325 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700326};
327
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900328static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900330 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700331 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700332
333 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
Eric Dumazet33d480c2011-08-11 19:30:52 +0000334 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
Eric Dumazeta6272662008-08-28 01:11:25 -0700335 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -0800337 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Eric Dumazet29e75252008-01-31 17:05:09 -0800338 while (r) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700339 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800340 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800341 return r;
Changli Gaod8d1f302010-06-10 23:31:35 -0700342 r = rcu_dereference_bh(r->dst.rt_next);
Eric Dumazet29e75252008-01-31 17:05:09 -0800343 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700344 rcu_read_unlock_bh();
345 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800346 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700347}
348
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900349static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800350 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900352 struct rt_cache_iter_state *st = seq->private;
Eric Dumazeta6272662008-08-28 01:11:25 -0700353
Eric Dumazet1c317202010-10-25 21:02:07 +0000354 r = rcu_dereference_bh(r->dst.rt_next);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700355 while (!r) {
356 rcu_read_unlock_bh();
Eric Dumazeta6272662008-08-28 01:11:25 -0700357 do {
358 if (--st->bucket < 0)
359 return NULL;
Eric Dumazet33d480c2011-08-11 19:30:52 +0000360 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700361 rcu_read_lock_bh();
Eric Dumazet1c317202010-10-25 21:02:07 +0000362 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700363 }
Eric Dumazet1c317202010-10-25 21:02:07 +0000364 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700365}
366
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900367static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800368 struct rtable *r)
369{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900370 struct rt_cache_iter_state *st = seq->private;
371 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700372 if (dev_net(r->dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800373 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800374 if (r->rt_genid == st->genid)
375 break;
376 }
377 return r;
378}
379
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900380static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700381{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900382 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383
384 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900385 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700386 --pos;
387 return pos ? NULL : r;
388}
389
390static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
391{
Eric Dumazet29e75252008-01-31 17:05:09 -0800392 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800393 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900394 return rt_cache_get_idx(seq, *pos - 1);
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700395 st->genid = rt_genid(seq_file_net(seq));
Eric Dumazet29e75252008-01-31 17:05:09 -0800396 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700397}
398
399static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
400{
Eric Dumazet29e75252008-01-31 17:05:09 -0800401 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700402
403 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900404 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700405 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900406 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700407 ++*pos;
408 return r;
409}
410
411static void rt_cache_seq_stop(struct seq_file *seq, void *v)
412{
413 if (v && v != SEQ_START_TOKEN)
414 rcu_read_unlock_bh();
415}
416
417static int rt_cache_seq_show(struct seq_file *seq, void *v)
418{
419 if (v == SEQ_START_TOKEN)
420 seq_printf(seq, "%-127s\n",
421 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
422 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
423 "HHUptod\tSpecDst");
424 else {
425 struct rtable *r = v;
David S. Miller69cce1d2011-07-17 23:09:49 -0700426 struct neighbour *n;
Eric Dumazet218fa902011-11-29 20:05:55 +0000427 int len, HHUptod;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700428
Eric Dumazet218fa902011-11-29 20:05:55 +0000429 rcu_read_lock();
David Miller27217452011-12-02 16:52:08 +0000430 n = dst_get_neighbour_noref(&r->dst);
Eric Dumazet218fa902011-11-29 20:05:55 +0000431 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
432 rcu_read_unlock();
433
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700434 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
435 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
Changli Gaod8d1f302010-06-10 23:31:35 -0700436 r->dst.dev ? r->dst.dev->name : "*",
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700437 (__force u32)r->rt_dst,
438 (__force u32)r->rt_gateway,
Changli Gaod8d1f302010-06-10 23:31:35 -0700439 r->rt_flags, atomic_read(&r->dst.__refcnt),
440 r->dst.__use, 0, (__force u32)r->rt_src,
David S. Miller0dbaee32010-12-13 12:52:14 -0800441 dst_metric_advmss(&r->dst) + 40,
Changli Gaod8d1f302010-06-10 23:31:35 -0700442 dst_metric(&r->dst, RTAX_WINDOW),
443 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
444 dst_metric(&r->dst, RTAX_RTTVAR)),
David S. Miller475949d2011-05-03 19:45:15 -0700445 r->rt_key_tos,
David S. Millerf6b72b62011-07-14 07:53:20 -0700446 -1,
Eric Dumazet218fa902011-11-29 20:05:55 +0000447 HHUptod,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700448 r->rt_spec_dst, &len);
449
450 seq_printf(seq, "%*s\n", 127 - len, "");
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900451 }
452 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700453}
454
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700455static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700456 .start = rt_cache_seq_start,
457 .next = rt_cache_seq_next,
458 .stop = rt_cache_seq_stop,
459 .show = rt_cache_seq_show,
460};
461
462static int rt_cache_seq_open(struct inode *inode, struct file *file)
463{
Denis V. Luneva75e9362008-02-28 20:50:55 -0800464 return seq_open_net(inode, file, &rt_cache_seq_ops,
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700465 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700466}
467
Arjan van de Ven9a321442007-02-12 00:55:35 -0800468static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700469 .owner = THIS_MODULE,
470 .open = rt_cache_seq_open,
471 .read = seq_read,
472 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800473 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700474};
475
476
477static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
478{
479 int cpu;
480
481 if (*pos == 0)
482 return SEQ_START_TOKEN;
483
Rusty Russell0f231742008-12-29 12:23:42 +0000484 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700485 if (!cpu_possible(cpu))
486 continue;
487 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800488 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700489 }
490 return NULL;
491}
492
493static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
494{
495 int cpu;
496
Rusty Russell0f231742008-12-29 12:23:42 +0000497 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700498 if (!cpu_possible(cpu))
499 continue;
500 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800501 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700502 }
503 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900504
Linus Torvalds1da177e2005-04-16 15:20:36 -0700505}
506
507static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
508{
509
510}
511
512static int rt_cpu_seq_show(struct seq_file *seq, void *v)
513{
514 struct rt_cache_stat *st = v;
515
516 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700517 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700518 return 0;
519 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900520
Linus Torvalds1da177e2005-04-16 15:20:36 -0700521 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
522 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
Eric Dumazetfc66f952010-10-08 06:37:34 +0000523 dst_entries_get_slow(&ipv4_dst_ops),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700524 st->in_hit,
525 st->in_slow_tot,
526 st->in_slow_mc,
527 st->in_no_route,
528 st->in_brd,
529 st->in_martian_dst,
530 st->in_martian_src,
531
532 st->out_hit,
533 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900534 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700535
536 st->gc_total,
537 st->gc_ignored,
538 st->gc_goal_miss,
539 st->gc_dst_overflow,
540 st->in_hlist_search,
541 st->out_hlist_search
542 );
543 return 0;
544}
545
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700546static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700547 .start = rt_cpu_seq_start,
548 .next = rt_cpu_seq_next,
549 .stop = rt_cpu_seq_stop,
550 .show = rt_cpu_seq_show,
551};
552
553
554static int rt_cpu_seq_open(struct inode *inode, struct file *file)
555{
556 return seq_open(file, &rt_cpu_seq_ops);
557}
558
Arjan van de Ven9a321442007-02-12 00:55:35 -0800559static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700560 .owner = THIS_MODULE,
561 .open = rt_cpu_seq_open,
562 .read = seq_read,
563 .llseek = seq_lseek,
564 .release = seq_release,
565};
566
Patrick McHardyc7066f72011-01-14 13:36:42 +0100567#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800568static int rt_acct_proc_show(struct seq_file *m, void *v)
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800569{
Alexey Dobriyana661c412009-11-25 15:40:35 -0800570 struct ip_rt_acct *dst, *src;
571 unsigned int i, j;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800572
Alexey Dobriyana661c412009-11-25 15:40:35 -0800573 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
574 if (!dst)
575 return -ENOMEM;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800576
Alexey Dobriyana661c412009-11-25 15:40:35 -0800577 for_each_possible_cpu(i) {
578 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
579 for (j = 0; j < 256; j++) {
580 dst[j].o_bytes += src[j].o_bytes;
581 dst[j].o_packets += src[j].o_packets;
582 dst[j].i_bytes += src[j].i_bytes;
583 dst[j].i_packets += src[j].i_packets;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800584 }
585 }
Alexey Dobriyana661c412009-11-25 15:40:35 -0800586
587 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
588 kfree(dst);
589 return 0;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800590}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800591
592static int rt_acct_proc_open(struct inode *inode, struct file *file)
593{
594 return single_open(file, rt_acct_proc_show, NULL);
595}
596
597static const struct file_operations rt_acct_proc_fops = {
598 .owner = THIS_MODULE,
599 .open = rt_acct_proc_open,
600 .read = seq_read,
601 .llseek = seq_lseek,
602 .release = single_release,
603};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800604#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800605
Denis V. Lunev73b38712008-02-28 20:51:18 -0800606static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800607{
608 struct proc_dir_entry *pde;
609
610 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
611 &rt_cache_seq_fops);
612 if (!pde)
613 goto err1;
614
Wang Chen77020722008-02-28 14:14:25 -0800615 pde = proc_create("rt_cache", S_IRUGO,
616 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800617 if (!pde)
618 goto err2;
619
Patrick McHardyc7066f72011-01-14 13:36:42 +0100620#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800621 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800622 if (!pde)
623 goto err3;
624#endif
625 return 0;
626
Patrick McHardyc7066f72011-01-14 13:36:42 +0100627#ifdef CONFIG_IP_ROUTE_CLASSID
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800628err3:
629 remove_proc_entry("rt_cache", net->proc_net_stat);
630#endif
631err2:
632 remove_proc_entry("rt_cache", net->proc_net);
633err1:
634 return -ENOMEM;
635}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800636
637static void __net_exit ip_rt_do_proc_exit(struct net *net)
638{
639 remove_proc_entry("rt_cache", net->proc_net_stat);
640 remove_proc_entry("rt_cache", net->proc_net);
Patrick McHardyc7066f72011-01-14 13:36:42 +0100641#ifdef CONFIG_IP_ROUTE_CLASSID
Denis V. Lunev73b38712008-02-28 20:51:18 -0800642 remove_proc_entry("rt_acct", net->proc_net);
Alexey Dobriyan0a931ac2010-01-17 03:32:50 +0000643#endif
Denis V. Lunev73b38712008-02-28 20:51:18 -0800644}
645
646static struct pernet_operations ip_rt_proc_ops __net_initdata = {
647 .init = ip_rt_do_proc_init,
648 .exit = ip_rt_do_proc_exit,
649};
650
651static int __init ip_rt_proc_init(void)
652{
653 return register_pernet_subsys(&ip_rt_proc_ops);
654}
655
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800656#else
Denis V. Lunev73b38712008-02-28 20:51:18 -0800657static inline int ip_rt_proc_init(void)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800658{
659 return 0;
660}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700661#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900662
Stephen Hemminger5969f712008-04-10 01:52:09 -0700663static inline void rt_free(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700664{
Changli Gaod8d1f302010-06-10 23:31:35 -0700665 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700666}
667
Stephen Hemminger5969f712008-04-10 01:52:09 -0700668static inline void rt_drop(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700669{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700670 ip_rt_put(rt);
Changli Gaod8d1f302010-06-10 23:31:35 -0700671 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700672}
673
Stephen Hemminger5969f712008-04-10 01:52:09 -0700674static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700675{
676 /* Kill broadcast/multicast entries very aggresively, if they
677 collide in hash table with more useful entries */
678 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
David S. Millerc7537962010-11-11 17:07:48 -0800679 rt_is_input_route(rth) && rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700680}
681
Stephen Hemminger5969f712008-04-10 01:52:09 -0700682static inline int rt_valuable(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700683{
684 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
David S. Miller2c8cec52011-02-09 20:42:07 -0800685 (rth->peer && rth->peer->pmtu_expires);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700686}
687
688static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
689{
690 unsigned long age;
691 int ret = 0;
692
Changli Gaod8d1f302010-06-10 23:31:35 -0700693 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700694 goto out;
695
Changli Gaod8d1f302010-06-10 23:31:35 -0700696 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700697 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
698 (age <= tmo2 && rt_valuable(rth)))
699 goto out;
700 ret = 1;
701out: return ret;
702}
703
704/* Bits of score are:
705 * 31: very valuable
706 * 30: not quite useless
707 * 29..0: usage counter
708 */
709static inline u32 rt_score(struct rtable *rt)
710{
Changli Gaod8d1f302010-06-10 23:31:35 -0700711 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700712
713 score = ~score & ~(3<<30);
714
715 if (rt_valuable(rt))
716 score |= (1<<31);
717
David S. Millerc7537962010-11-11 17:07:48 -0800718 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700719 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
720 score |= (1<<30);
721
722 return score;
723}
724
Neil Horman1080d702008-10-27 12:28:25 -0700725static inline bool rt_caching(const struct net *net)
726{
727 return net->ipv4.current_rt_cache_rebuild_count <=
728 net->ipv4.sysctl_rt_cache_rebuild_count;
729}
730
David S. Miller5e2b61f2011-03-04 21:47:09 -0800731static inline bool compare_hash_inputs(const struct rtable *rt1,
732 const struct rtable *rt2)
Neil Horman1080d702008-10-27 12:28:25 -0700733{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800734 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
735 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000736 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
Neil Horman1080d702008-10-27 12:28:25 -0700737}
738
David S. Miller5e2b61f2011-03-04 21:47:09 -0800739static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700740{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800741 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
742 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
743 (rt1->rt_mark ^ rt2->rt_mark) |
David S. Miller475949d2011-05-03 19:45:15 -0700744 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
Julian Anastasovd547f722011-08-07 22:20:20 -0700745 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000746 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700747}
748
Denis V. Lunevb5921912008-01-22 23:50:25 -0800749static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
750{
Changli Gaod8d1f302010-06-10 23:31:35 -0700751 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800752}
753
/* An entry whose generation id differs from its namespace's current
 * rt_genid was invalidated wholesale (see rt_cache_invalidate()).
 */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
758
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 * @net == NULL flushes every namespace's entries; otherwise only
 * entries whose device belongs to @net are reaped.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Lockless peek: skip buckets that are empty anyway. */
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		/* Unlink matching entries onto a private list under the
		 * bucket lock, then free them after the lock is dropped,
		 * keeping the critical section short.
		 */
		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
809
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This to have an estimation of rt_chain_length_max
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
820
Eric Dumazet98376382010-03-08 03:20:00 +0000821/*
822 * Given a hash chain and an item in this hash chain,
823 * find if a previous entry has the same hash_inputs
824 * (but differs on tos, mark or oif)
825 * Returns 0 if an alias is found.
826 * Returns ONE if rth has no alias before itself.
827 */
828static int has_noalias(const struct rtable *head, const struct rtable *rth)
829{
830 const struct rtable *aux = head;
831
832 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800833 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000834 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000835 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000836 }
837 return ONE;
838}
839
/* Incremental garbage collection pass over the routing cache.
 * Scans a number of buckets proportional to the time elapsed since the
 * previous pass (so the whole table is covered about once per
 * ip_rt_gc_timeout), reaps expired/expendable entries, and updates the
 * rt_chain_length_max estimate from the observed chain-length
 * average/standard deviation (fixed-point, see FRACT_BITS).
 * Runs in process context (see rt_worker_func()).
 */
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	/* goal = buckets_to_scan = table_size * delta / gc_timeout */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					/* Halving tmo makes deeper chain
					 * positions expendable sooner. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
920
/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table, then
 * re-arm ourselves to run again after ip_rt_gc_interval.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
930
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 * All existing entries then fail the rt_is_expired() check and are
 * lazily reaped; the inetpeer tree is invalidated alongside.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	inetpeer_invalidate_tree(AF_INET);
}
945
/*
 * delay < 0 : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay < 0)
		return;
	rt_do_flush(net, !in_softirq());
}
956
/* Flush previous cache invalidated entries from the cache
 * (entries already expired by a generation bump are actually freed;
 * may reschedule when called from process context).
 */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
962
/* Called when a hash chain grows beyond rt_chain_length_max: instead of
 * rebuilding the table, invalidate the whole cache generation for @net.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		pr_warn("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
969
/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

/* Core garbage collector.  @elasticity bounds the desired cache size
 * (entries <= elasticity << rt_hash_log); @min_interval throttles how
 * often a full pass is allowed.  Serialized by the local rt_gc_lock;
 * static locals (expire/last_gc/rover/equilibrium) carry GC state
 * across invocations.
 */
static void __do_rt_garbage_collect(int elasticity, int min_interval)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	static DEFINE_SPINLOCK(rt_gc_lock);
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	spin_lock(&rt_gc_lock);

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* Sweep buckets starting after where the last pass stopped
		 * (rover), halving tmo per surviving chain entry so deeper
		 * entries expire sooner.
		 */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		pr_warn("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	goto out;

work_done:
	/* Relax expiration strength again when there is headroom. */
	expire += min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:
	spin_unlock(&rt_gc_lock);
}
1106
/* Work item wrapper: run GC with the default sysctl tunables. */
static void __rt_garbage_collect(struct work_struct *w)
{
	__do_rt_garbage_collect(ip_rt_gc_elasticity, ip_rt_gc_min_interval);
}
1111
1112static int rt_garbage_collect(struct dst_ops *ops)
1113{
1114 if (!work_pending(&rt_gc_worker))
1115 schedule_work(&rt_gc_worker);
1116
1117 if (dst_entries_get_fast(&ipv4_dst_ops) >= ip_rt_max_size ||
1118 dst_entries_get_slow(&ipv4_dst_ops) >= ip_rt_max_size) {
1119 RT_CACHE_STAT_INC(gc_dst_overflow);
1120 return 1;
1121 }
1122 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001123}
1124
Eric Dumazet98376382010-03-08 03:20:00 +00001125/*
1126 * Returns number of entries in a hash chain that have different hash_inputs
1127 */
1128static int slow_chain_length(const struct rtable *head)
1129{
1130 int length = 0;
1131 const struct rtable *rth = head;
1132
1133 while (rth) {
1134 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001135 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001136 }
1137 return length >> FRACT_BITS;
1138}
1139
/* dst_ops.neigh_lookup callback: resolve the ARP neighbour for @dst.
 * Key selection: loopback/point-to-point devices key on INADDR_ANY,
 * routes with a gateway key on the gateway address, otherwise on the
 * supplied @daddr.  Creates a new neighbour entry when none is cached;
 * may therefore return an ERR_PTR() from neigh_create().
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;

	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;
	else if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
1160
/* Attach the neighbour entry for @rt's gateway to its dst.
 * Returns 0 on success or the PTR_ERR() from ipv4_neigh_lookup()
 * (e.g. -ENOBUFS when the neighbour table is full).
 */
static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
1170
/* Insert @rt into cache bucket @hash, or return an already-cached
 * equivalent entry.  Returns the rtable to use (possibly not @rt) or
 * ERR_PTR() on failure; when @skb is given its dst is set to the
 * returned route.  The caller's reference on @rt is either transferred
 * to the return value or dropped (rt_drop/ip_rt_put) on every other
 * path — callers must not touch @rt afterwards.
 */
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;
	int attempts = 1;

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					pr_warn("Neighbour table failure & not caching routes\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			/* Reap stale-generation entries as we walk. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			/* Track the lowest-scoring unreferenced entry as an
			 * eviction candidate should the chain be too long. */
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Genid changed: rehash and retry the insertion. */
			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (!in_softirq() && attempts-- > 0) {
				static DEFINE_SPINLOCK(lock);

				/* Only one CPU runs the emergency GC; the
				 * others just wait for it to finish. */
				if (spin_trylock(&lock)) {
					__do_rt_garbage_collect(1, 0);
					spin_unlock(&lock);
				} else {
					spin_unlock_wait(&lock);
				}
				goto restart;
			}

			if (net_ratelimit())
				pr_warn("Neighbour table overflow\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1350
/* Global generation counter for inet_peer bindings; cached entries
 * record it at bind time (see rt_bind_peer) to detect staleness later.
 */
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
1357
/* Bind @rt to the inet_peer for @daddr (@create: allocate if absent).
 * The cmpxchg arbitrates concurrent binders: a loser releases its own
 * peer reference; otherwise (winner, or no peer found at all) the
 * current peer genid is recorded on the route.
 */
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
1369
#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
	atomic_t id;	/* next IP identifier for this bucket */
	u32 stamp32;	/* low 32 bits of jiffies at last use */
};

/* Hashed array of ID generators; allocated elsewhere at init time. */
static struct ip_ident_bucket *ip_idents __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(bucket->stamp32);
	u32 now = (u32)jiffies;
	u32 delta = 0;

	/* Bucket idle since a previous jiffy: add a random jump scaled
	 * by the idle time (only one CPU wins the cmpxchg per tick). */
	if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
		u64 x = random32();

		x *= (now - old);
		delta = (u32)(x >> 32);
	}

	/* Reserve @segs consecutive ids, returning the first one. */
	return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
Eric Dumazetad52eef2014-06-02 05:26:03 -07001399
1400void __ip_select_ident(struct iphdr *iph, int segs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001401{
Eric Dumazetad52eef2014-06-02 05:26:03 -07001402 static u32 ip_idents_hashrnd __read_mostly;
1403 static bool hashrnd_initialized = false;
1404 u32 hash, id;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001405
Eric Dumazetad52eef2014-06-02 05:26:03 -07001406 if (unlikely(!hashrnd_initialized)) {
1407 hashrnd_initialized = true;
1408 get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1409 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001410
Eric Dumazet509a15a2014-07-26 08:58:10 +02001411 hash = jhash_3words((__force u32)iph->daddr,
1412 (__force u32)iph->saddr,
1413 iph->protocol,
1414 ip_idents_hashrnd);
Eric Dumazetad52eef2014-06-02 05:26:03 -07001415 id = ip_idents_reserve(hash, segs);
1416 iph->id = htons(id);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001417}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001418EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001419
/* Remove @rt from the hash chain at @hash.  While walking the chain we
 * also unlink any entry whose generation id has expired.  Runs under
 * the per-chain spinlock with BHs disabled.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);		/* drop the caller's reference */
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			/* Unlink; rt_free() defers the actual free until
			 * an RCU grace period has elapsed, so concurrent
			 * lockless readers stay safe.
			 */
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1439
David S. Millerde398fb2011-12-05 13:21:42 -05001440static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001441{
1442 struct rtable *rt = (struct rtable *) dst;
1443 __be32 orig_gw = rt->rt_gateway;
1444 struct neighbour *n, *old_n;
1445
1446 dst_confirm(&rt->dst);
1447
1448 rt->rt_gateway = peer->redirect_learned.a4;
1449
1450 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
David S. Millerde398fb2011-12-05 13:21:42 -05001451 if (IS_ERR(n)) {
1452 rt->rt_gateway = orig_gw;
1453 return;
1454 }
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001455 old_n = xchg(&rt->dst._neighbour, n);
1456 if (old_n)
1457 neigh_release(old_n);
David S. Millerde398fb2011-12-05 13:21:42 -05001458 if (!(n->nud_state & NUD_VALID)) {
1459 neigh_event_send(n, NULL);
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001460 } else {
1461 rt->rt_flags |= RTCF_REDIRECTED;
1462 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1463 }
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001464}
1465
Eric Dumazeted7865a42010-06-07 21:49:44 -07001466/* called in rcu_read_lock() section */
Al Virof7655222006-09-26 21:25:43 -07001467void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1468 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001469{
Flavio Leitner7cc91502011-10-24 02:56:38 -04001470 int s, i;
Eric Dumazeted7865a42010-06-07 21:49:44 -07001471 struct in_device *in_dev = __in_dev_get_rcu(dev);
Flavio Leitner7cc91502011-10-24 02:56:38 -04001472 __be32 skeys[2] = { saddr, 0 };
1473 int ikeys[2] = { dev->ifindex, 0 };
David S. Millerf39925d2011-02-09 22:00:16 -08001474 struct inet_peer *peer;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001475 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001476
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477 if (!in_dev)
1478 return;
1479
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001480 net = dev_net(dev);
Joe Perches9d4fb272009-11-23 10:41:23 -08001481 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1482 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1483 ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001484 goto reject_redirect;
1485
1486 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1487 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1488 goto reject_redirect;
1489 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1490 goto reject_redirect;
1491 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001492 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001493 goto reject_redirect;
1494 }
1495
Flavio Leitner7cc91502011-10-24 02:56:38 -04001496 for (s = 0; s < 2; s++) {
1497 for (i = 0; i < 2; i++) {
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001498 unsigned int hash;
1499 struct rtable __rcu **rthp;
1500 struct rtable *rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001501
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001502 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1503
1504 rthp = &rt_hash_table[hash].chain;
1505
1506 while ((rt = rcu_dereference(*rthp)) != NULL) {
1507 rthp = &rt->dst.rt_next;
1508
1509 if (rt->rt_key_dst != daddr ||
1510 rt->rt_key_src != skeys[s] ||
1511 rt->rt_oif != ikeys[i] ||
1512 rt_is_input_route(rt) ||
1513 rt_is_expired(rt) ||
1514 !net_eq(dev_net(rt->dst.dev), net) ||
1515 rt->dst.error ||
1516 rt->dst.dev != dev ||
1517 rt->rt_gateway != old_gw)
1518 continue;
1519
1520 if (!rt->peer)
1521 rt_bind_peer(rt, rt->rt_dst, 1);
1522
1523 peer = rt->peer;
1524 if (peer) {
Steffen Klassertac3f48d2012-03-06 21:21:10 +00001525 if (peer->redirect_learned.a4 != new_gw) {
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001526 peer->redirect_learned.a4 = new_gw;
1527 atomic_inc(&__rt_peer_genid);
1528 }
1529 check_peer_redir(&rt->dst, peer);
1530 }
Flavio Leitner7cc91502011-10-24 02:56:38 -04001531 }
Flavio Leitner7cc91502011-10-24 02:56:38 -04001532 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001533 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001534 return;
1535
1536reject_redirect:
1537#ifdef CONFIG_IP_ROUTE_VERBOSE
1538 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001539 pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
Harvey Harrison673d57e2008-10-31 00:53:57 -07001540 " Advised path = %pI4 -> %pI4\n",
Joe Perches058bd4d2012-03-11 18:36:11 +00001541 &old_gw, dev->name, &new_gw,
1542 &saddr, &daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543#endif
Eric Dumazeted7865a42010-06-07 21:49:44 -07001544 ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545}
1546
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001547static bool peer_pmtu_expired(struct inet_peer *peer)
1548{
1549 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1550
1551 return orig &&
1552 time_after_eq(jiffies, orig) &&
1553 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1554}
1555
1556static bool peer_pmtu_cleaned(struct inet_peer *peer)
1557{
1558 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1559
1560 return orig &&
1561 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1562}
1563
Linus Torvalds1da177e2005-04-16 15:20:36 -07001564static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1565{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001566 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001567 struct dst_entry *ret = dst;
1568
1569 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001570 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001571 ip_rt_put(rt);
1572 ret = NULL;
David S. Miller2c8cec52011-02-09 20:42:07 -08001573 } else if (rt->rt_flags & RTCF_REDIRECTED) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08001574 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1575 rt->rt_oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001576 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001577 rt_del(hash, rt);
1578 ret = NULL;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001579 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1580 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001581 }
1582 }
1583 return ret;
1584}
1585
1586/*
1587 * Algorithm:
1588 * 1. The first ip_rt_redirect_number redirects are sent
1589 * with exponential backoff, then we stop sending them at all,
1590 * assuming that the host ignores our redirects.
1591 * 2. If we did not see packets requiring redirects
1592 * during ip_rt_redirect_silence, we assume that the host
1593 * forgot redirected route and start to send redirects again.
1594 *
1595 * This algorithm is much cheaper and more intelligent than dumb load limiting
1596 * in icmp.c.
1597 *
1598 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1599 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1600 */
1601
/* Send an ICMP redirect for @skb's route, rate-limited per inet_peer
 * using the algorithm described above (exponential backoff on
 * rate_tokens, reset after ip_rt_redirect_silence of quiet).  Without
 * a peer entry we fall back to sending unconditionally.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	/* Sample the sysctl while still inside the RCU section. */
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		/* No rate-limit state available: send unconditionally. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
				&ip_hdr(skb)->saddr, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1660
1661static int ip_error(struct sk_buff *skb)
1662{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001663 struct rtable *rt = skb_rtable(skb);
David S. Miller92d86822011-02-04 15:55:25 -08001664 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001665 unsigned long now;
David S. Miller92d86822011-02-04 15:55:25 -08001666 bool send;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001667 int code;
1668
Changli Gaod8d1f302010-06-10 23:31:35 -07001669 switch (rt->dst.error) {
Joe Perches4500ebf2011-07-01 09:43:07 +00001670 case EINVAL:
1671 default:
1672 goto out;
1673 case EHOSTUNREACH:
1674 code = ICMP_HOST_UNREACH;
1675 break;
1676 case ENETUNREACH:
1677 code = ICMP_NET_UNREACH;
1678 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1679 IPSTATS_MIB_INNOROUTES);
1680 break;
1681 case EACCES:
1682 code = ICMP_PKT_FILTERED;
1683 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001684 }
1685
David S. Miller92d86822011-02-04 15:55:25 -08001686 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001687 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001688 peer = rt->peer;
1689
1690 send = true;
1691 if (peer) {
1692 now = jiffies;
1693 peer->rate_tokens += now - peer->rate_last;
1694 if (peer->rate_tokens > ip_rt_error_burst)
1695 peer->rate_tokens = ip_rt_error_burst;
1696 peer->rate_last = now;
1697 if (peer->rate_tokens >= ip_rt_error_cost)
1698 peer->rate_tokens -= ip_rt_error_cost;
1699 else
1700 send = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001701 }
David S. Miller92d86822011-02-04 15:55:25 -08001702 if (send)
1703 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001704
1705out: kfree_skb(skb);
1706 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001707}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708
1709/*
1710 * The last two values are not from the RFC but
1711 * are needed for AMPRnet AX.25 paths.
1712 */
1713
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001714static const unsigned short mtu_plateau[] =
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1716
Stephen Hemminger5969f712008-04-10 01:52:09 -07001717static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001718{
1719 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001720
Linus Torvalds1da177e2005-04-16 15:20:36 -07001721 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1722 if (old_mtu > mtu_plateau[i])
1723 return mtu_plateau[i];
1724 return 68;
1725}
1726
Eric Dumazetb71d1d42011-04-22 04:53:02 +00001727unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001728 unsigned short new_mtu,
1729 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731 unsigned short old_mtu = ntohs(iph->tot_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001732 unsigned short est_mtu = 0;
David S. Miller2c8cec52011-02-09 20:42:07 -08001733 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001734
David S. Miller2c8cec52011-02-09 20:42:07 -08001735 peer = inet_getpeer_v4(iph->daddr, 1);
1736 if (peer) {
1737 unsigned short mtu = new_mtu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001738
David S. Miller2c8cec52011-02-09 20:42:07 -08001739 if (new_mtu < 68 || new_mtu >= old_mtu) {
1740 /* BSD 4.2 derived systems incorrectly adjust
1741 * tot_len by the IP header length, and report
1742 * a zero MTU in the ICMP message.
1743 */
1744 if (mtu == 0 &&
1745 old_mtu >= 68 + (iph->ihl << 2))
1746 old_mtu -= iph->ihl << 2;
1747 mtu = guess_mtu(old_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001748 }
David S. Miller2c8cec52011-02-09 20:42:07 -08001749
1750 if (mtu < ip_rt_min_pmtu)
1751 mtu = ip_rt_min_pmtu;
1752 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001753 unsigned long pmtu_expires;
1754
1755 pmtu_expires = jiffies + ip_rt_mtu_expires;
1756 if (!pmtu_expires)
1757 pmtu_expires = 1UL;
1758
David S. Miller2c8cec52011-02-09 20:42:07 -08001759 est_mtu = mtu;
1760 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001761 peer->pmtu_expires = pmtu_expires;
Gao feng59445b62011-10-19 15:34:09 +00001762 atomic_inc(&__rt_peer_genid);
David S. Miller2c8cec52011-02-09 20:42:07 -08001763 }
1764
1765 inet_putpeer(peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001766 }
1767 return est_mtu ? : new_mtu;
1768}
1769
David S. Miller2c8cec52011-02-09 20:42:07 -08001770static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1771{
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001772 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
David S. Miller2c8cec52011-02-09 20:42:07 -08001773
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001774 if (!expires)
1775 return;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001776 if (time_before(jiffies, expires)) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001777 u32 orig_dst_mtu = dst_mtu(dst);
1778 if (peer->pmtu_learned < orig_dst_mtu) {
1779 if (!peer->pmtu_orig)
1780 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1781 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1782 }
1783 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1784 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1785}
1786
Linus Torvalds1da177e2005-04-16 15:20:36 -07001787static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1788{
David S. Miller2c8cec52011-02-09 20:42:07 -08001789 struct rtable *rt = (struct rtable *) dst;
1790 struct inet_peer *peer;
1791
1792 dst_confirm(dst);
1793
1794 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001795 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller2c8cec52011-02-09 20:42:07 -08001796 peer = rt->peer;
1797 if (peer) {
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001798 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1799
David S. Miller2c8cec52011-02-09 20:42:07 -08001800 if (mtu < ip_rt_min_pmtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001801 mtu = ip_rt_min_pmtu;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001802 if (!pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001803
1804 pmtu_expires = jiffies + ip_rt_mtu_expires;
1805 if (!pmtu_expires)
1806 pmtu_expires = 1UL;
1807
David S. Miller2c8cec52011-02-09 20:42:07 -08001808 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001809 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001810
1811 atomic_inc(&__rt_peer_genid);
1812 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001813 }
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001814 check_peer_pmtu(dst, peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815 }
1816}
1817
David S. Millerf39925d2011-02-09 22:00:16 -08001818
David S. Millerde398fb2011-12-05 13:21:42 -05001819static void ipv4_validate_peer(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820{
David S. Miller6431cbc2011-02-07 20:38:06 -08001821 if (rt->rt_peer_genid != rt_peer_genid()) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001822 struct inet_peer *peer;
1823
David S. Miller6431cbc2011-02-07 20:38:06 -08001824 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001825 rt_bind_peer(rt, rt->rt_dst, 0);
David S. Miller6431cbc2011-02-07 20:38:06 -08001826
David S. Miller2c8cec52011-02-09 20:42:07 -08001827 peer = rt->peer;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001828 if (peer) {
David S. Millerefbc3682011-12-01 13:38:59 -05001829 check_peer_pmtu(&rt->dst, peer);
David S. Miller2c8cec52011-02-09 20:42:07 -08001830
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001831 if (peer->redirect_learned.a4 &&
David S. Millerde398fb2011-12-05 13:21:42 -05001832 peer->redirect_learned.a4 != rt->rt_gateway)
1833 check_peer_redir(&rt->dst, peer);
David S. Millerf39925d2011-02-09 22:00:16 -08001834 }
1835
David S. Miller6431cbc2011-02-07 20:38:06 -08001836 rt->rt_peer_genid = rt_peer_genid();
1837 }
David S. Millerefbc3682011-12-01 13:38:59 -05001838}
1839
1840static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1841{
1842 struct rtable *rt = (struct rtable *) dst;
1843
1844 if (rt_is_expired(rt))
1845 return NULL;
David S. Millerde398fb2011-12-05 13:21:42 -05001846 ipv4_validate_peer(rt);
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001847 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848}
1849
1850static void ipv4_dst_destroy(struct dst_entry *dst)
1851{
1852 struct rtable *rt = (struct rtable *) dst;
1853 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001854
David S. Miller62fa8a82011-01-26 20:51:05 -08001855 if (rt->fi) {
1856 fib_info_put(rt->fi);
1857 rt->fi = NULL;
1858 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001859 if (peer) {
1860 rt->peer = NULL;
1861 inet_putpeer(peer);
1862 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001863}
1864
Linus Torvalds1da177e2005-04-16 15:20:36 -07001865
1866static void ipv4_link_failure(struct sk_buff *skb)
1867{
1868 struct rtable *rt;
1869
1870 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1871
Eric Dumazet511c3f92009-06-02 05:14:27 +00001872 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001873 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1874 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001875}
1876
/* Catch-all dst input/output handler for routes that must never carry
 * packets (e.g. multicast entries before ip_local_deliver is set up):
 * log the offending addresses, drop the skb, and warn loudly.
 */
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
1886
1887/*
1888 We do not cache source address of outgoing interface,
1889 because it is used only by IP RR, TS and SRR options,
1890 so that it out of fast path.
1891
1892 BTW remember: "addr" is allowed to be not aligned
1893 in IP options!
1894 */
1895
David S. Miller8e363602011-05-13 17:29:41 -04001896void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001897{
Al Viroa61ced52006-09-26 21:27:54 -07001898 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001899
David S. Millerc7537962010-11-11 17:07:48 -08001900 if (rt_is_output_route(rt))
David S. Millerc5be24f2011-05-13 18:01:21 -04001901 src = ip_hdr(skb)->saddr;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001902 else {
David S. Miller8e363602011-05-13 17:29:41 -04001903 struct fib_result res;
1904 struct flowi4 fl4;
1905 struct iphdr *iph;
1906
1907 iph = ip_hdr(skb);
1908
1909 memset(&fl4, 0, sizeof(fl4));
1910 fl4.daddr = iph->daddr;
1911 fl4.saddr = iph->saddr;
Julian Anastasovb0fe4a32011-07-23 02:00:41 +00001912 fl4.flowi4_tos = RT_TOS(iph->tos);
David S. Miller8e363602011-05-13 17:29:41 -04001913 fl4.flowi4_oif = rt->dst.dev->ifindex;
1914 fl4.flowi4_iif = skb->dev->ifindex;
1915 fl4.flowi4_mark = skb->mark;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001916
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001917 rcu_read_lock();
David S. Miller68a5e3d2011-03-11 20:07:33 -05001918 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
David S. Miller436c3b62011-03-24 17:42:21 -07001919 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001920 else
1921 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001922 RT_SCOPE_UNIVERSE);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001923 rcu_read_unlock();
1924 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001925 memcpy(addr, &src, 4);
1926}
1927
Patrick McHardyc7066f72011-01-14 13:36:42 +01001928#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001929static void set_class_tag(struct rtable *rt, u32 tag)
1930{
Changli Gaod8d1f302010-06-10 23:31:35 -07001931 if (!(rt->dst.tclassid & 0xFFFF))
1932 rt->dst.tclassid |= tag & 0xFFFF;
1933 if (!(rt->dst.tclassid & 0xFFFF0000))
1934 rt->dst.tclassid |= tag & 0xFFFF0000;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001935}
1936#endif
1937
David S. Miller0dbaee32010-12-13 12:52:14 -08001938static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1939{
1940 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1941
1942 if (advmss == 0) {
1943 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1944 ip_rt_min_advmss);
1945 if (advmss > 65535 - 40)
1946 advmss = 65535 - 40;
1947 }
1948 return advmss;
1949}
1950
Steffen Klassertebb762f2011-11-23 02:12:51 +00001951static unsigned int ipv4_mtu(const struct dst_entry *dst)
David S. Millerd33e4552010-12-14 13:01:14 -08001952{
Steffen Klassert261663b2011-11-23 02:14:50 +00001953 const struct rtable *rt = (const struct rtable *) dst;
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001954 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1955
Steffen Klassert261663b2011-11-23 02:14:50 +00001956 if (mtu && rt_is_output_route(rt))
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001957 return mtu;
1958
1959 mtu = dst->dev->mtu;
David S. Millerd33e4552010-12-14 13:01:14 -08001960
1961 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
David S. Millerd33e4552010-12-14 13:01:14 -08001962
1963 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1964 mtu = 576;
1965 }
1966
1967 if (mtu > IP_MAX_MTU)
1968 mtu = IP_MAX_MTU;
1969
1970 return mtu;
1971}
1972
David S. Miller813b3b52011-04-28 14:48:42 -07001973static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001974 struct fib_info *fi)
David S. Millera4daad62011-01-27 22:01:53 -08001975{
David S. Miller0131ba42011-02-04 14:37:30 -08001976 struct inet_peer *peer;
1977 int create = 0;
1978
1979 /* If a peer entry exists for this destination, we must hook
1980 * it up in order to get at cached metrics.
1981 */
David S. Miller813b3b52011-04-28 14:48:42 -07001982 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
David S. Miller0131ba42011-02-04 14:37:30 -08001983 create = 1;
1984
David S. Miller3c0afdc2011-03-04 21:26:07 -08001985 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
David S. Miller0131ba42011-02-04 14:37:30 -08001986 if (peer) {
David S. Miller3c0afdc2011-03-04 21:26:07 -08001987 rt->rt_peer_genid = rt_peer_genid();
David S. Miller0131ba42011-02-04 14:37:30 -08001988 if (inet_metrics_new(peer))
1989 memcpy(peer->metrics, fi->fib_metrics,
1990 sizeof(u32) * RTAX_MAX);
1991 dst_init_metrics(&rt->dst, peer->metrics, false);
David S. Miller2c8cec52011-02-09 20:42:07 -08001992
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001993 check_peer_pmtu(&rt->dst, peer);
Steffen Klassertac3f48d2012-03-06 21:21:10 +00001994
David S. Millerf39925d2011-02-09 22:00:16 -08001995 if (peer->redirect_learned.a4 &&
1996 peer->redirect_learned.a4 != rt->rt_gateway) {
1997 rt->rt_gateway = peer->redirect_learned.a4;
1998 rt->rt_flags |= RTCF_REDIRECTED;
1999 }
David S. Miller0131ba42011-02-04 14:37:30 -08002000 } else {
David S. Millerb8dad612011-01-28 14:07:16 -08002001 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
2002 rt->fi = fi;
2003 atomic_inc(&fi->fib_clntref);
2004 }
David S. Millera4daad62011-01-27 22:01:53 -08002005 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
David S. Millera4daad62011-01-27 22:01:53 -08002006 }
2007}
2008
David S. Miller813b3b52011-04-28 14:48:42 -07002009static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08002010 const struct fib_result *res,
David S. Miller982721f2011-02-16 21:44:24 -08002011 struct fib_info *fi, u16 type, u32 itag)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002012{
David S. Millerdefb3512010-12-08 21:16:57 -08002013 struct dst_entry *dst = &rt->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002014
2015 if (fi) {
2016 if (FIB_RES_GW(*res) &&
2017 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
2018 rt->rt_gateway = FIB_RES_GW(*res);
David S. Miller813b3b52011-04-28 14:48:42 -07002019 rt_init_metrics(rt, fl4, fi);
Patrick McHardyc7066f72011-01-14 13:36:42 +01002020#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Millerdefb3512010-12-08 21:16:57 -08002021 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002022#endif
David S. Millerd33e4552010-12-14 13:01:14 -08002023 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002024
David S. Millerdefb3512010-12-08 21:16:57 -08002025 if (dst_mtu(dst) > IP_MAX_MTU)
2026 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
David S. Miller0dbaee32010-12-13 12:52:14 -08002027 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
David S. Millerdefb3512010-12-08 21:16:57 -08002028 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002029
Patrick McHardyc7066f72011-01-14 13:36:42 +01002030#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07002031#ifdef CONFIG_IP_MULTIPLE_TABLES
2032 set_class_tag(rt, fib_rules_tclass(res));
2033#endif
2034 set_class_tag(rt, itag);
2035#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002036}
2037
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002038static struct rtable *rt_dst_alloc(struct net_device *dev,
2039 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08002040{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002041 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2042 DST_HOST |
2043 (nopolicy ? DST_NOPOLICY : 0) |
2044 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08002045}
2046
Eric Dumazet96d36222010-06-02 19:21:31 +00002047/* called in rcu_read_lock() section */
Al Viro9e12bb22006-09-26 21:25:20 -07002048static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002049 u8 tos, struct net_device *dev, int our)
2050{
Eric Dumazet96d36222010-06-02 19:21:31 +00002051 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002052 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07002053 __be32 spec_dst;
Eric Dumazet96d36222010-06-02 19:21:31 +00002054 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002055 u32 itag = 0;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002056 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002057
2058 /* Primary sanity checks. */
2059
2060 if (in_dev == NULL)
2061 return -EINVAL;
2062
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002063 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002064 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002065 goto e_inval;
2066
Joe Perchesf97c1e02007-12-16 13:45:43 -08002067 if (ipv4_is_zeronet(saddr)) {
2068 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002069 goto e_inval;
2070 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002071 } else {
Michael Smith5c04c812011-04-07 04:51:50 +00002072 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2073 &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002074 if (err < 0)
2075 goto e_err;
2076 }
Benjamin LaHaise4e7b2f12012-03-27 15:55:32 +00002077 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002078 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079 if (!rth)
2080 goto e_nobufs;
2081
Patrick McHardyc7066f72011-01-14 13:36:42 +01002082#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002083 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002084#endif
David S. Millercf911662011-04-28 14:31:47 -07002085 rth->dst.output = ip_rt_bug;
2086
2087 rth->rt_key_dst = daddr;
2088 rth->rt_key_src = saddr;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002089 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002090 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08002091 rth->rt_type = RTN_MULTICAST;
David S. Miller475949d2011-05-03 19:45:15 -07002092 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002093 rth->rt_dst = daddr;
2094 rth->rt_src = saddr;
2095 rth->rt_route_iif = dev->ifindex;
2096 rth->rt_iif = dev->ifindex;
2097 rth->rt_oif = 0;
2098 rth->rt_mark = skb->mark;
2099 rth->rt_gateway = daddr;
2100 rth->rt_spec_dst= spec_dst;
2101 rth->rt_peer_genid = 0;
2102 rth->peer = NULL;
2103 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002104 if (our) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002105 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002106 rth->rt_flags |= RTCF_LOCAL;
2107 }
2108
2109#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08002110 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Changli Gaod8d1f302010-06-10 23:31:35 -07002111 rth->dst.input = ip_mr_input;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002112#endif
2113 RT_CACHE_STAT_INC(in_slow_mc);
2114
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002115 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
David S. Millerb23dd4f2011-03-02 14:31:35 -08002116 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
Eric Dumazet9aa3c942011-06-18 11:59:18 -07002117 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002118
2119e_nobufs:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002120 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002121e_inval:
Eric Dumazet96d36222010-06-02 19:21:31 +00002122 return -EINVAL;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002123e_err:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002124 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002125}
2126
2127
2128static void ip_handle_martian_source(struct net_device *dev,
2129 struct in_device *in_dev,
2130 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07002131 __be32 daddr,
2132 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133{
2134 RT_CACHE_STAT_INC(in_martian_src);
2135#ifdef CONFIG_IP_ROUTE_VERBOSE
2136 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2137 /*
2138 * RFC1812 recommendation, if source is martian,
2139 * the only hint is MAC header.
2140 */
Joe Perches058bd4d2012-03-11 18:36:11 +00002141 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002142 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07002143 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00002144 print_hex_dump(KERN_WARNING, "ll header: ",
2145 DUMP_PREFIX_OFFSET, 16, 1,
2146 skb_mac_header(skb),
2147 dev->hard_header_len, true);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002148 }
2149 }
2150#endif
2151}
2152
/* called in rcu_read_lock() section */
/*
 * Build a forwarding route cache entry for a unicast packet received on
 * @in_dev whose FIB lookup result is @res.  On success, stores the newly
 * allocated rtable in *@result and returns 0; on failure returns a
 * negative errno and *@result is left untouched.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		/* FIB gave us a nexthop device with no IPv4 config -
		 * should be impossible, hence the loud complaint.
		 */
		if (net_ratelimit())
			pr_crit("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}


	/* Reverse-path check of the source address; also yields the
	 * preferred local address (spec_dst) and classid tag (itag).
	 */
	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* err > 0 means the source is directly reachable on this link */
	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would go back out the interface it arrived on from an
	 * on-link source: candidate for an ICMP redirect.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	/* Cache lookup key: original addresses, tos, mark and input iif */
	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	/* Forwarding path: deliver via ip_forward then ip_output */
	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002244
Stephen Hemminger5969f712008-04-10 01:52:09 -07002245static int ip_mkroute_input(struct sk_buff *skb,
2246 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002247 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002248 struct in_device *in_dev,
2249 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002250{
Chuck Short7abaa272005-06-22 22:10:23 -07002251 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002252 int err;
2253 unsigned hash;
2254
2255#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002256 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002257 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002258#endif
2259
2260 /* create a routing cache entry */
2261 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2262 if (err)
2263 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264
2265 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002266 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002267 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002268 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002269 if (IS_ERR(rth))
2270 return PTR_ERR(rth);
2271 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002272}
2273
Linus Torvalds1da177e2005-04-16 15:20:36 -07002274/*
2275 * NOTE. We drop all the packets that has local source
2276 * addresses, because every properly looped back packet
2277 * must have correct destination already attached by output routine.
2278 *
2279 * Such approach solves two big problems:
2280 * 1. Not simplex devices are handled properly.
2281 * 2. IP spoofing attempts are filtered with 100% of guarantee.
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002282 * called with rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07002283 */
2284
Al Viro9e12bb22006-09-26 21:25:20 -07002285static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002286 u8 tos, struct net_device *dev)
2287{
2288 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002289 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05002290 struct flowi4 fl4;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291 unsigned flags = 0;
2292 u32 itag = 0;
2293 struct rtable * rth;
2294 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002295 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002296 int err = -EINVAL;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002297 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002298
2299 /* IP on this device is disabled. */
2300
2301 if (!in_dev)
2302 goto out;
2303
2304 /* Check for the most weird martians, which can be not detected
2305 by fib_lookup.
2306 */
2307
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002308 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002309 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002310 goto martian_source;
2311
Andy Walls27a954b2010-10-17 15:11:22 +00002312 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002313 goto brd_input;
2314
2315 /* Accept zero addresses only to limited broadcast;
2316 * I even do not know to fix it or not. Waiting for complains :-)
2317 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002318 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002319 goto martian_source;
2320
Andy Walls27a954b2010-10-17 15:11:22 +00002321 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002322 goto martian_destination;
2323
2324 /*
2325 * Now we are ready to route packet.
2326 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002327 fl4.flowi4_oif = 0;
2328 fl4.flowi4_iif = dev->ifindex;
2329 fl4.flowi4_mark = skb->mark;
2330 fl4.flowi4_tos = tos;
2331 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2332 fl4.daddr = daddr;
2333 fl4.saddr = saddr;
2334 err = fib_lookup(net, &fl4, &res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002335 if (err != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002336 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002337 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002338 goto no_route;
2339 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340
2341 RT_CACHE_STAT_INC(in_slow_tot);
2342
2343 if (res.type == RTN_BROADCAST)
2344 goto brd_input;
2345
2346 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002347 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002348 net->loopback_dev->ifindex,
Michael Smith5c04c812011-04-07 04:51:50 +00002349 dev, &spec_dst, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002350 if (err < 0)
2351 goto martian_source_keep_err;
2352 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002353 flags |= RTCF_DIRECTSRC;
2354 spec_dst = daddr;
2355 goto local_input;
2356 }
2357
2358 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002359 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002360 if (res.type != RTN_UNICAST)
2361 goto martian_destination;
2362
David S. Miller68a5e3d2011-03-11 20:07:33 -05002363 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002364out: return err;
2365
2366brd_input:
2367 if (skb->protocol != htons(ETH_P_IP))
2368 goto e_inval;
2369
Joe Perchesf97c1e02007-12-16 13:45:43 -08002370 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2372 else {
Michael Smith5c04c812011-04-07 04:51:50 +00002373 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2374 &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002375 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002376 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002377 if (err)
2378 flags |= RTCF_DIRECTSRC;
2379 }
2380 flags |= RTCF_BROADCAST;
2381 res.type = RTN_BROADCAST;
2382 RT_CACHE_STAT_INC(in_brd);
2383
2384local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002385 rth = rt_dst_alloc(net->loopback_dev,
2386 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002387 if (!rth)
2388 goto e_nobufs;
2389
David S. Millercf911662011-04-28 14:31:47 -07002390 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002391 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002392#ifdef CONFIG_IP_ROUTE_CLASSID
2393 rth->dst.tclassid = itag;
2394#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002395
David S. Miller5e2b61f2011-03-04 21:47:09 -08002396 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002397 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002398 rth->rt_genid = rt_genid(net);
2399 rth->rt_flags = flags|RTCF_LOCAL;
2400 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002401 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002402 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002403 rth->rt_src = saddr;
Patrick McHardyc7066f72011-01-14 13:36:42 +01002404#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002405 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002406#endif
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002407 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002408 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002409 rth->rt_oif = 0;
2410 rth->rt_mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002411 rth->rt_gateway = daddr;
2412 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002413 rth->rt_peer_genid = 0;
2414 rth->peer = NULL;
2415 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002416 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002417 rth->dst.input= ip_error;
2418 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002419 rth->rt_flags &= ~RTCF_LOCAL;
2420 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002421 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2422 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002423 err = 0;
2424 if (IS_ERR(rth))
2425 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002426 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002427
2428no_route:
2429 RT_CACHE_STAT_INC(in_no_route);
2430 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2431 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002432 if (err == -ESRCH)
2433 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002434 goto local_input;
2435
2436 /*
2437 * Do not cache martian addresses: they should be logged (RFC1812)
2438 */
2439martian_destination:
2440 RT_CACHE_STAT_INC(in_martian_dst);
2441#ifdef CONFIG_IP_ROUTE_VERBOSE
2442 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00002443 pr_warn("martian destination %pI4 from %pI4, dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002444 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002445#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002446
2447e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002448 err = -EHOSTUNREACH;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002449 goto out;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002450
Linus Torvalds1da177e2005-04-16 15:20:36 -07002451e_inval:
2452 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002453 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002454
2455e_nobufs:
2456 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002457 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002458
2459martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002460 err = -EINVAL;
2461martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002462 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002463 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002464}
2465
/*
 * Input route lookup entry point: try the route cache first, then fall
 * back to multicast handling or the slow path.  @noref selects whether
 * the skb takes a refcounted or RCU-protected reference on the dst.
 * Returns 0 on success or a negative errno.
 */
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable	*rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	/* Walk the hash chain; the xor-of-differences trick compares all
	 * four key fields with a single branch.
	 */
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				/* RCU-protected use: no refcount taken */
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				/* NOTE(review): this inner 'res' shadows the
				 * outer one; harmless but worth cleaning up.
				 */
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002548
/* called with rcu_read_lock() */
/*
 * Build an output route cache entry for the flow described by @fl4,
 * leaving via @dev_out.  @orig_daddr/@orig_saddr/@orig_oif/@orig_rtos
 * are the caller's pre-rewrite key values used for cache keying.
 * Returns the new rtable or an ERR_PTR() on failure.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	/* Loopback source may only leave via a loopback device */
	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	/* Override the FIB type from the destination address class */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;	/* broadcast never uses nexthop info */
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Only deliver locally if we joined this group */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	/* Cache key uses the caller's original (pre-rewrite) values */
	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		/* Locally-delivered bcast/mcast still goes on the wire too */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
2649
Linus Torvalds1da177e2005-04-16 15:20:36 -07002650/*
2651 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002652 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002653 */
2654
David S. Miller813b3b52011-04-28 14:48:42 -07002655static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002656{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002657 struct net_device *dev_out = NULL;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002658 __u8 tos = RT_FL_TOS(fl4);
David S. Miller813b3b52011-04-28 14:48:42 -07002659 unsigned int flags = 0;
2660 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002661 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002662 __be32 orig_daddr;
2663 __be32 orig_saddr;
2664 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002665
2666 res.fi = NULL;
2667#ifdef CONFIG_IP_MULTIPLE_TABLES
2668 res.r = NULL;
2669#endif
2670
David S. Miller813b3b52011-04-28 14:48:42 -07002671 orig_daddr = fl4->daddr;
2672 orig_saddr = fl4->saddr;
2673 orig_oif = fl4->flowi4_oif;
2674
2675 fl4->flowi4_iif = net->loopback_dev->ifindex;
2676 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2677 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2678 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002679
David S. Miller010c2702011-02-17 15:37:09 -08002680 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002681 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002682 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002683 if (ipv4_is_multicast(fl4->saddr) ||
2684 ipv4_is_lbcast(fl4->saddr) ||
2685 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002686 goto out;
2687
Linus Torvalds1da177e2005-04-16 15:20:36 -07002688 /* I removed check for oif == dev_out->oif here.
2689 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002690 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2691 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002692 2. Moreover, we are allowed to send packets with saddr
2693 of another iface. --ANK
2694 */
2695
David S. Miller813b3b52011-04-28 14:48:42 -07002696 if (fl4->flowi4_oif == 0 &&
2697 (ipv4_is_multicast(fl4->daddr) ||
2698 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002699 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002700 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002701 if (dev_out == NULL)
2702 goto out;
2703
Linus Torvalds1da177e2005-04-16 15:20:36 -07002704 /* Special hack: user can direct multicasts
2705 and limited broadcast via necessary interface
2706 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2707 This hack is not just for fun, it allows
2708 vic,vat and friends to work.
2709 They bind socket to loopback, set ttl to zero
2710 and expect that it will work.
2711 From the viewpoint of routing cache they are broken,
2712 because we are not allowed to build multicast path
2713 with loopback source addr (look, routing cache
2714 cannot know, that ttl is zero, so that packet
2715 will not leave this host and route is valid).
2716 Luckily, this hack is good workaround.
2717 */
2718
David S. Miller813b3b52011-04-28 14:48:42 -07002719 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002720 goto make_route;
2721 }
Julian Anastasova210d012008-10-01 07:28:28 -07002722
David S. Miller813b3b52011-04-28 14:48:42 -07002723 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002724 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002725 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002726 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002727 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002728 }
2729
2730
David S. Miller813b3b52011-04-28 14:48:42 -07002731 if (fl4->flowi4_oif) {
2732 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002733 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002734 if (dev_out == NULL)
2735 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002736
2737 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002738 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002739 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002740 goto out;
2741 }
David S. Miller813b3b52011-04-28 14:48:42 -07002742 if (ipv4_is_local_multicast(fl4->daddr) ||
2743 ipv4_is_lbcast(fl4->daddr)) {
2744 if (!fl4->saddr)
2745 fl4->saddr = inet_select_addr(dev_out, 0,
2746 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002747 goto make_route;
2748 }
Jiri Bencad61d4c2013-10-04 17:04:48 +02002749 if (!fl4->saddr) {
David S. Miller813b3b52011-04-28 14:48:42 -07002750 if (ipv4_is_multicast(fl4->daddr))
2751 fl4->saddr = inet_select_addr(dev_out, 0,
2752 fl4->flowi4_scope);
2753 else if (!fl4->daddr)
2754 fl4->saddr = inet_select_addr(dev_out, 0,
2755 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002756 }
2757 }
2758
David S. Miller813b3b52011-04-28 14:48:42 -07002759 if (!fl4->daddr) {
2760 fl4->daddr = fl4->saddr;
2761 if (!fl4->daddr)
2762 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002763 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002764 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002765 res.type = RTN_LOCAL;
2766 flags |= RTCF_LOCAL;
2767 goto make_route;
2768 }
2769
David S. Miller813b3b52011-04-28 14:48:42 -07002770 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002771 res.fi = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002772 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002773 /* Apparently, routing tables are wrong. Assume,
2774 that the destination is on link.
2775
2776 WHY? DW.
2777 Because we are allowed to send to iface
2778 even if it has NO routes and NO assigned
2779 addresses. When oif is specified, routing
2780 tables are looked up with only one purpose:
2781 to catch if destination is gatewayed, rather than
2782 direct. Moreover, if MSG_DONTROUTE is set,
2783 we send packet, ignoring both routing tables
2784 and ifaddr state. --ANK
2785
2786
2787 We could make it even if oif is unknown,
2788 likely IPv6, but we do not.
2789 */
2790
David S. Miller813b3b52011-04-28 14:48:42 -07002791 if (fl4->saddr == 0)
2792 fl4->saddr = inet_select_addr(dev_out, 0,
2793 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794 res.type = RTN_UNICAST;
2795 goto make_route;
2796 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002797 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002798 goto out;
2799 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002800
2801 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002802 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002803 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002804 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002805 else
David S. Miller813b3b52011-04-28 14:48:42 -07002806 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002807 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002808 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002809 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810 res.fi = NULL;
2811 flags |= RTCF_LOCAL;
2812 goto make_route;
2813 }
2814
2815#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002816 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002817 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002818 else
2819#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002820 if (!res.prefixlen &&
2821 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002822 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002823 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002824
David S. Miller813b3b52011-04-28 14:48:42 -07002825 if (!fl4->saddr)
2826 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002827
Linus Torvalds1da177e2005-04-16 15:20:36 -07002828 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002829 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002830
2831
2832make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002833 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002834 tos, dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002835 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002836 unsigned int hash;
2837
David S. Miller813b3b52011-04-28 14:48:42 -07002838 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002839 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002840 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002841 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002842
David S. Miller010c2702011-02-17 15:37:09 -08002843out:
2844 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002845 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002846}
2847
/*
 * Output route resolution with a routing-cache front end.
 *
 * First probes the global cache (rt_hash_table) under rcu_read_lock_bh();
 * on a hit the cached rtable's use count is bumped and it is returned.
 * On a miss, or when caching is disabled (rt_caching()), the full lookup
 * is delegated to ip_route_output_slow().
 *
 * Wildcard (zero) saddr/daddr in @flp4 are filled in from the chosen
 * route on a cache hit.  Returns a valid rtable or an ERR_PTR().
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		/* An entry matches only if every field used to build it
		 * agrees with the flow: addresses, output direction, oif,
		 * mark, the TOS bits relevant to routing
		 * (IPTOS_RT_MASK | RTO_ONLINK), the namespace, and the
		 * cache generation (rt_is_expired()).
		 */
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			/* Resolve wildcard addresses from the cached route. */
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002887EXPORT_SYMBOL_GPL(__ip_route_output_key);
2888
/*
 * ->check for blackhole routes: a blackhole dst is never valid for
 * reuse, so returning NULL forces callers to perform a fresh lookup.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2893
Steffen Klassertebb762f2011-11-23 02:12:51 +00002894static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002895{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002896 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2897
2898 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002899}
2900
/*
 * ->update_pmtu for blackhole routes: path-MTU feedback is meaningless
 * on a route that discards every packet, so it is deliberately ignored.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2904
/*
 * ->cow_metrics for blackhole routes: never hand out a writable metrics
 * block; the shared (read-only) metrics copied at creation stay in place.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2910
/*
 * dst_ops for the "blackhole" routes built by ipv4_blackhole_route():
 * they never revalidate (.check returns NULL), ignore PMTU updates and
 * refuse copy-on-write metrics, while reusing the regular IPv4 destroy,
 * advmss and neighbour-lookup helpers.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2922
/*
 * Clone @dst_orig into a "blackhole" route: an rtable carrying the same
 * keys, metrics, device, peer and fib info as the original, but whose
 * input/output handlers discard every packet and whose dst_ops
 * (ipv4_dst_blackhole_ops) refuse PMTU updates and cache revalidation.
 *
 * NOTE(review): callers are not visible in this file; presumably used by
 * the xfrm layer while SAs resolve — confirm against xfrm_lookup() users.
 *
 * Consumes the caller's reference on @dst_orig (dst_release() at the
 * end).  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Both directions silently drop traffic. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Mirror the cache-key and routing fields of the original. */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Shared peer and fib info need their own references. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		/* NOTE(review): dst_free() here appears to pair with the
		 * initial reference from dst_alloc() so the blackhole dst
		 * (never cached) dies when the caller's reference from the
		 * return value is dropped — confirm refcount convention. */
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2969
David S. Miller9d6ec932011-03-12 01:12:47 -05002970struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002971 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002972{
David S. Miller9d6ec932011-03-12 01:12:47 -05002973 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002974
David S. Millerb23dd4f2011-03-02 14:31:35 -08002975 if (IS_ERR(rt))
2976 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002977
David S. Miller56157872011-05-02 14:37:45 -07002978 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002979 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2980 flowi4_to_flowi(flp4),
2981 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002982
David S. Millerb23dd4f2011-03-02 14:31:35 -08002983 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002984}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002985EXPORT_SYMBOL_GPL(ip_route_output_flow);
2986
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002987static int rt_fill_info(struct net *net,
2988 struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002989 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002990{
Eric Dumazet511c3f92009-06-02 05:14:27 +00002991 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002992 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002993 struct nlmsghdr *nlh;
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00002994 unsigned long expires = 0;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00002995 const struct inet_peer *peer = rt->peer;
Thomas Grafe3703b32006-11-27 09:27:07 -08002996 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002997
2998 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2999 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08003000 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07003001
3002 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003003 r->rtm_family = AF_INET;
3004 r->rtm_dst_len = 32;
3005 r->rtm_src_len = 0;
David S. Miller475949d2011-05-03 19:45:15 -07003006 r->rtm_tos = rt->rt_key_tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003007 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07003008 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003009 r->rtm_type = rt->rt_type;
3010 r->rtm_scope = RT_SCOPE_UNIVERSE;
3011 r->rtm_protocol = RTPROT_UNSPEC;
3012 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
3013 if (rt->rt_flags & RTCF_NOTIFY)
3014 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07003015
Al Viro17fb2c62006-09-26 22:15:25 -07003016 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07003017
David S. Miller5e2b61f2011-03-04 21:47:09 -08003018 if (rt->rt_key_src) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003019 r->rtm_src_len = 32;
David S. Miller5e2b61f2011-03-04 21:47:09 -08003020 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003021 }
Changli Gaod8d1f302010-06-10 23:31:35 -07003022 if (rt->dst.dev)
3023 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
Patrick McHardyc7066f72011-01-14 13:36:42 +01003024#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07003025 if (rt->dst.tclassid)
3026 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003027#endif
David S. Millerc7537962010-11-11 17:07:48 -08003028 if (rt_is_input_route(rt))
Al Viro17fb2c62006-09-26 22:15:25 -07003029 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
David S. Miller5e2b61f2011-03-04 21:47:09 -08003030 else if (rt->rt_src != rt->rt_key_src)
Al Viro17fb2c62006-09-26 22:15:25 -07003031 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07003032
Linus Torvalds1da177e2005-04-16 15:20:36 -07003033 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07003034 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07003035
David S. Millerdefb3512010-12-08 21:16:57 -08003036 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07003037 goto nla_put_failure;
3038
David S. Miller5e2b61f2011-03-04 21:47:09 -08003039 if (rt->rt_mark)
3040 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
Eric Dumazet963bfee2010-07-20 22:03:14 +00003041
Changli Gaod8d1f302010-06-10 23:31:35 -07003042 error = rt->dst.error;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003043 if (peer) {
Eric Dumazet317fe0e2010-06-16 04:52:13 +00003044 inet_peer_refcheck(rt->peer);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003045 if (peer->tcp_ts_stamp) {
3046 ts = peer->tcp_ts;
3047 tsage = get_seconds() - peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003048 }
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003049 expires = ACCESS_ONCE(peer->pmtu_expires);
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00003050 if (expires) {
3051 if (time_before(jiffies, expires))
3052 expires -= jiffies;
3053 else
3054 expires = 0;
3055 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003056 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07003057
David S. Millerc7537962010-11-11 17:07:48 -08003058 if (rt_is_input_route(rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003059#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07003060 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003061
Joe Perchesf97c1e02007-12-16 13:45:43 -08003062 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003063 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
David S. Miller9a1b9492011-05-04 12:18:54 -07003064 int err = ipmr_get_route(net, skb,
3065 rt->rt_src, rt->rt_dst,
3066 r, nowait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003067 if (err <= 0) {
3068 if (!nowait) {
3069 if (err == 0)
3070 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07003071 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003072 } else {
3073 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07003074 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08003075 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003076 }
3077 }
3078 } else
3079#endif
David S. Miller5e2b61f2011-03-04 21:47:09 -08003080 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003081 }
3082
Changli Gaod8d1f302010-06-10 23:31:35 -07003083 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
Thomas Grafe3703b32006-11-27 09:27:07 -08003084 expires, error) < 0)
3085 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003086
Thomas Grafbe403ea2006-08-17 18:15:17 -07003087 return nlmsg_end(skb, nlh);
3088
3089nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08003090 nlmsg_cancel(skb, nlh);
3091 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003092}
3093
/*
 * RTM_GETROUTE handler: resolve the route described by the request's
 * RTA_SRC/RTA_DST/RTA_IIF/RTA_OIF/RTA_MARK attributes and answer the
 * requester with a single RTM_NEWROUTE message built by rt_fill_info().
 *
 * With RTA_IIF present the lookup emulates packet reception on that
 * device (ip_route_input on a dummy skb); otherwise a normal output
 * lookup is performed.  Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* Absent attributes default to zero (wildcard). */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		/* Input-route query: pretend the dummy skb arrived on iif. */
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		/* ip_route_input() expects softirq context. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		/* Output-route query. */
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	/* Ownership of skb passes to rtnl_unicast() on this path. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
3188
/*
 * Netlink dump callback for the routing cache: walk every hash bucket
 * and emit one RTM_NEWROUTE message per live cache entry belonging to
 * the requesting namespace.
 *
 * Resume state across dump invocations lives in cb->args[0] (bucket)
 * and cb->args[1] (index within the bucket's chain).
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
			rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			/* Skip foreign-namespace entries and entries
			 * already emitted in a previous invocation. */
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* Borrow the entry's dst without a refcount; it is
			 * dropped again before rcu_read_unlock_bh(). */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* Message buffer full: stop here and let the
				 * saved args resume the dump later. */
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3230
3231void ip_rt_multicast_event(struct in_device *in_dev)
3232{
Denis V. Lunev76e6ebf2008-07-05 19:00:44 -07003233 rt_cache_flush(dev_net(in_dev->dev), 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003234}
3235
3236#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003237static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003238 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003239 size_t *lenp, loff_t *ppos)
3240{
3241 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003242 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003243 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003244 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003245
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003246 memcpy(&ctl, __ctl, sizeof(ctl));
3247 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003248 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003249
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003250 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003251 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003252 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003253 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003254
3255 return -EINVAL;
3256}
3257
Al Viroeeb61f72008-07-27 08:59:33 +01003258static ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003259 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003260 .procname = "gc_thresh",
3261 .data = &ipv4_dst_ops.gc_thresh,
3262 .maxlen = sizeof(int),
3263 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003264 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003265 },
3266 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003267 .procname = "max_size",
3268 .data = &ip_rt_max_size,
3269 .maxlen = sizeof(int),
3270 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003271 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003272 },
3273 {
3274 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003275
Linus Torvalds1da177e2005-04-16 15:20:36 -07003276 .procname = "gc_min_interval",
3277 .data = &ip_rt_gc_min_interval,
3278 .maxlen = sizeof(int),
3279 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003280 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003281 },
3282 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003283 .procname = "gc_min_interval_ms",
3284 .data = &ip_rt_gc_min_interval,
3285 .maxlen = sizeof(int),
3286 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003287 .proc_handler = proc_dointvec_ms_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003288 },
3289 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003290 .procname = "gc_timeout",
3291 .data = &ip_rt_gc_timeout,
3292 .maxlen = sizeof(int),
3293 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003294 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003295 },
3296 {
Eric Dumazet9f28a2f2011-12-21 15:47:16 -05003297 .procname = "gc_interval",
3298 .data = &ip_rt_gc_interval,
3299 .maxlen = sizeof(int),
3300 .mode = 0644,
3301 .proc_handler = proc_dointvec_jiffies,
3302 },
3303 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003304 .procname = "redirect_load",
3305 .data = &ip_rt_redirect_load,
3306 .maxlen = sizeof(int),
3307 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003308 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003309 },
3310 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003311 .procname = "redirect_number",
3312 .data = &ip_rt_redirect_number,
3313 .maxlen = sizeof(int),
3314 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003315 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003316 },
3317 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003318 .procname = "redirect_silence",
3319 .data = &ip_rt_redirect_silence,
3320 .maxlen = sizeof(int),
3321 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003322 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003323 },
3324 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003325 .procname = "error_cost",
3326 .data = &ip_rt_error_cost,
3327 .maxlen = sizeof(int),
3328 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003329 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003330 },
3331 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003332 .procname = "error_burst",
3333 .data = &ip_rt_error_burst,
3334 .maxlen = sizeof(int),
3335 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003336 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003337 },
3338 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003339 .procname = "gc_elasticity",
3340 .data = &ip_rt_gc_elasticity,
3341 .maxlen = sizeof(int),
3342 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003343 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003344 },
3345 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003346 .procname = "mtu_expires",
3347 .data = &ip_rt_mtu_expires,
3348 .maxlen = sizeof(int),
3349 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003350 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003351 },
3352 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003353 .procname = "min_pmtu",
3354 .data = &ip_rt_min_pmtu,
3355 .maxlen = sizeof(int),
3356 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003357 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003358 },
3359 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003360 .procname = "min_adv_mss",
3361 .data = &ip_rt_min_advmss,
3362 .maxlen = sizeof(int),
3363 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003364 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003365 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003366 { }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003367};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003368
/*
 * Empty placeholder table: makes "/proc/sys/net/ipv4/neigh" exist from
 * early boot so neighbour tables can register their entries under it
 * later, regardless of init ordering.
 */
static struct ctl_table empty[1];

/* Static skeleton of the ipv4 sysctl tree: "route" (the table above in
 * this file) and "neigh" (populated elsewhere).  0555 = read-only dirs. */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
3379
/* Registration path "net/ipv4" for the skeleton tree above. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003385
/* Per-namespace "flush" sysctl.  Written (never read: mode 0200) to force
 * a route cache flush.  ->extra1 is filled in with the owning struct net
 * at registration time by sysctl_route_net_init(). */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3395
/* Registration path "net/ipv4/route" for the per-namespace flush table. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3402
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003403static __net_init int sysctl_route_net_init(struct net *net)
3404{
3405 struct ctl_table *tbl;
3406
3407 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003408 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003409 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3410 if (tbl == NULL)
3411 goto err_dup;
3412 }
3413 tbl[0].extra1 = net;
3414
3415 net->ipv4.route_hdr =
3416 register_net_sysctl_table(net, ipv4_route_path, tbl);
3417 if (net->ipv4.route_hdr == NULL)
3418 goto err_reg;
3419 return 0;
3420
3421err_reg:
3422 if (tbl != ipv4_route_flush_table)
3423 kfree(tbl);
3424err_dup:
3425 return -ENOMEM;
3426}
3427
3428static __net_exit void sysctl_route_net_exit(struct net *net)
3429{
3430 struct ctl_table *tbl;
3431
3432 tbl = net->ipv4.route_hdr->ctl_table_arg;
3433 unregister_net_sysctl_table(net->ipv4.route_hdr);
3434 BUG_ON(tbl == ipv4_route_flush_table);
3435 kfree(tbl);
3436}
3437
/* Tie the flush sysctl to network-namespace creation/destruction. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003442#endif
3443
/*
 * Per-namespace init: seed the route-cache generation id and the device
 * address generation id with random bytes so each namespace (and each
 * boot) starts from an unpredictable value.
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
3452
/* No .exit needed: the genids are plain fields, nothing to tear down. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3456
3457
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route classid accounting counters; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003461
3462static __initdata unsigned long rhash_entries;
3463static int __init set_rhash_entries(char *str)
3464{
3465 if (!str)
3466 return 0;
3467 rhash_entries = simple_strtoul(str, &str, 0);
3468 return 1;
3469}
3470__setup("rhash_entries=", set_rhash_entries);
3471
/*
 * Boot-time initialisation of the IPv4 routing layer: IP-id generator,
 * dst caches, the route cache hash table, proc/sysctl hooks and the
 * periodic cache-expiry worker.  Ordering matters: the hash table must
 * be sized before gc_thresh/ip_rt_max_size are derived from it, and
 * ip_rt_max_size must be known before xfrm4_init().
 */
int __init ip_rt_init(void)
{
	int rc = 0;

	/* Array backing IP header ID generation; seeded randomly below. */
	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* 256 classid accounting slots per cpu. */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	/* SLAB_PANIC: no need to check the return value here. */
	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the same slab cache. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/*
	 * Route cache hash table.  Size honours the "rhash_entries=" boot
	 * param when given, otherwise scales with RAM (2^15 max order
	 * below 512MB, 2^17 above) capped at 512K entries.  rt_hash_mask
	 * and rt_hash_log come back filled in.
	 */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC limits are derived from the actual table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* Kick off the periodic route-cache expiry worker with a small
	 * random offset to avoid synchronized wakeups across machines. */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* proc failure is non-fatal: routing works without the files. */
	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3538
#ifdef CONFIG_SYSCTL
/*
 * Register the static "net/ipv4" sysctl skeleton early so that later
 * registrations (route table, neigh entries) have a parent to attach to.
 *
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif