blob: 90de2f9b4549bd06b5661a6f4656a4538026677c [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Joe Perchesafd465032012-03-12 07:03:32 +000065#define pr_fmt(fmt) "IPv4: " fmt
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070069#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070072#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070073#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070083#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070085#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090094#include <linux/slab.h>
Stephen Rothwellb9eda062011-12-22 17:03:29 +110095#include <linux/prefetch.h>
Herbert Xu352e5122007-11-13 21:34:06 -080096#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020097#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070098#include <net/protocol.h>
99#include <net/ip.h>
100#include <net/route.h>
101#include <net/inetpeer.h>
102#include <net/sock.h>
103#include <net/ip_fib.h>
104#include <net/arp.h>
105#include <net/tcp.h>
106#include <net/icmp.h>
107#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700108#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700109#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h>
112#endif
David S. Miller6e5714e2011-08-03 20:50:44 -0700113#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700114
/* Mask a flowi4 TOS field down to the bits relevant for route lookup,
 * preserving the RTO_ONLINK flag carried in the same field.
 */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Upper bound used for IPv4 path-MTU values. */
#define IP_MAX_MTU	0xFFF0

/* Default expiry for unreferenced cached routes. */
#define RT_GC_TIMEOUT	(300*HZ)

static int ip_rt_max_size;

/* Route-cache GC, redirect and PMTU tunables.  Marked __read_mostly:
 * written rarely (presumably via sysctl — the table is not in this chunk),
 * read on hot paths.
 */
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
/* Hash-chain length beyond which a bucket is considered pathological. */
static int rt_chain_length_max __read_mostly	= 20;

/* Deferred work that expires cached routes; last jiffies it ran. */
static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
139
/*
 * Interface to generic destination cache.
 *
 * Forward declarations for the dst_ops callbacks wired into
 * ipv4_dst_ops below.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int		 rt_garbage_collect(struct dst_ops *ops);

/* Workqueue-driven garbage collection of the route cache. */
static void __rt_garbage_collect(struct work_struct *w);
static DECLARE_WORK(rt_gc_worker, __rt_garbage_collect);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	/* Nothing to tear down per-route when a device goes away;
	 * mandatory dst_ops hook, intentionally empty.
	 */
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700160
/* Copy-on-write the (shared, read-only) metrics of @dst into writable
 * storage owned by the route's inet_peer.
 *
 * @dst: route whose metrics are about to be written
 * @old: current dst->_metrics word (pointer bits plus flag bits)
 *
 * Returns a writable metrics array, or NULL if none could be obtained
 * (no peer, or we lost the race to a read-only publication).
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	/* Make sure the route has an inet_peer to hang the metrics on. */
	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		/* First user of this peer's metrics: seed from the old
		 * (read-only) array before publishing.
		 */
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		/* Atomically publish the peer's array; only one writer
		 * can win this cmpxchg against concurrent COW attempts.
		 */
		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			/* Lost the race: use whatever is installed now,
			 * unless it is still marked read-only.
			 */
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			/* Won the race: the fib_info's metrics are no
			 * longer referenced through this dst, drop it.
			 */
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
195
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

/* Destination-cache operations for IPv4 routes; glue between the
 * generic dst layer and the IPv4-specific handlers above.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
214
215#define ECN_OR_COST(class) TC_PRIO_##class
216
Philippe De Muyter4839c522007-07-09 15:32:57 -0700217const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700218 TC_PRIO_BESTEFFORT,
Dan Siemon4a2b9c32011-03-15 13:56:07 +0000219 ECN_OR_COST(BESTEFFORT),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700220 TC_PRIO_BESTEFFORT,
221 ECN_OR_COST(BESTEFFORT),
222 TC_PRIO_BULK,
223 ECN_OR_COST(BULK),
224 TC_PRIO_BULK,
225 ECN_OR_COST(BULK),
226 TC_PRIO_INTERACTIVE,
227 ECN_OR_COST(INTERACTIVE),
228 TC_PRIO_INTERACTIVE,
229 ECN_OR_COST(INTERACTIVE),
230 TC_PRIO_INTERACTIVE_BULK,
231 ECN_OR_COST(INTERACTIVE_BULK),
232 TC_PRIO_INTERACTIVE_BULK,
233 ECN_OR_COST(INTERACTIVE_BULK)
234};
235
236
237/*
238 * Route cache.
239 */
240
241/* The locking scheme is rather straight forward:
242 *
243 * 1) Read-Copy Update protects the buckets of the central route hash.
244 * 2) Only writers remove entries, and they hold the lock
245 * as they look at rtable reference counts.
246 * 3) Only readers acquire references to rtable entries,
247 * they do so with atomic increments and with the
248 * lock held.
249 */
250
/* One bucket of the central route-cache hash: an RCU-protected chain
 * of rtables (readers traverse lock-free, see locking notes above).
 */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Map a bucket slot to its (shared) writer lock; RT_HASH_LOCK_SZ is a
 * power of two, so the mask folds the slot into the lock table.
 */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP without lock debugging: no writer locks needed at all. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700300
/* The central route-cache hash table, its mask (size - 1) and log2 size;
 * sized once at boot, hence __read_mostly.
 */
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

/* Per-CPU cache statistics, updated lock-free on the local CPU. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700307
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700308static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700309 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700310{
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700311 return jhash_3words((__force u32)daddr, (__force u32)saddr,
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700312 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800313 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700314}
315
/* Current route-cache generation for this netns; cached entries whose
 * rt_genid differs are treated as stale by the lookup paths.
 */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
320
Linus Torvalds1da177e2005-04-16 15:20:36 -0700321#ifdef CONFIG_PROC_FS
/* Iterator state for the /proc/net/rt_cache seq_file walk:
 * current hash bucket and the cache generation snapshotted at start
 * (entries from other generations are skipped).
 */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
327
/* Find the first displayable cache entry, scanning buckets from the top
 * of the table downwards.
 *
 * On success, returns with rcu_read_lock_bh() HELD (released either by
 * __rt_cache_get_next() when moving between buckets or by
 * rt_cache_seq_stop()).  Returns NULL with the lock released when the
 * table holds no matching entry.
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		/* Cheap empty-bucket check without entering RCU. */
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			/* Only show entries of this netns and of the
			 * generation snapshotted at seq_start time.
			 */
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
348
/* Advance to the raw next cache entry after @r, crossing bucket
 * boundaries as needed (no netns/genid filtering here — see
 * rt_cache_get_next()).
 *
 * Called with rcu_read_lock_bh() held; keeps it held on success,
 * drops it when moving past a bucket and re-takes it for the next one.
 * Returns NULL (lock released) when the table is exhausted.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		/* Skip empty buckets without holding the RCU lock. */
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
366
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900367static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800368 struct rtable *r)
369{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900370 struct rt_cache_iter_state *st = seq->private;
371 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700372 if (dev_net(r->dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800373 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800374 if (r->rt_genid == st->genid)
375 break;
376 }
377 return r;
378}
379
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900380static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700381{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900382 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383
384 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900385 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700386 --pos;
387 return pos ? NULL : r;
388}
389
/* seq_file ->start: snapshot the cache generation on a fresh walk
 * (*pos == 0) so a mid-dump cache flush does not mix generations;
 * resuming walks (*pos != 0) reuse the previously stored genid.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
398
399static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
400{
Eric Dumazet29e75252008-01-31 17:05:09 -0800401 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700402
403 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900404 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700405 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900406 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700407 ++*pos;
408 return r;
409}
410
/* seq_file ->stop: drop the BH-disabled RCU read lock that
 * rt_cache_get_first() left held — but only if the walk actually
 * reached a real entry (not NULL / not just the header token).
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
416
/* seq_file ->show: emit one /proc/net/rt_cache row (or the header for
 * SEQ_START_TOKEN).  The format string ends in %n to learn the row
 * length, which is then padded to a fixed 127 columns.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		/* Peek at the neighbour under plain RCU to report whether
		 * the cached hardware header is usable (NUD_CONNECTED).
		 */
		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,	/* HHRef: no longer tracked, always -1 */
			HHUptod,
			r->rt_spec_dst, &len);

		/* Pad every row to the same width as the header. */
		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
454
/* seq_file plumbing for /proc/net/rt_cache. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	/* seq_open_net allocates rt_cache_iter_state (which embeds
	 * seq_net_private) and binds the walk to the opener's netns.
	 */
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
475
476
/* seq_file ->start for /proc/net/stat/rt_cache: position 0 yields the
 * header token; otherwise resume at the (*pos - 1)'th possible CPU.
 * *pos is stored as cpu + 1 so position 0 stays reserved for the header.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

/* seq_file ->next: advance to the following possible CPU's stats block
 * (same cpu+1 position encoding as ->start above).
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}
506
/* seq_file ->stop: per-CPU stats need no lock, so nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

/* seq_file ->show: one line of per-CPU route-cache counters, preceded
 * by a header naming each column.  The leading "entries" field is the
 * global dst count, repeated on every CPU line.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
545
/* seq_file plumbing for /proc/net/stat/rt_cache (per-CPU counters). */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	/* No private iterator state needed: v *is* the per-CPU slot. */
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
566
Patrick McHardyc7066f72011-01-14 13:36:42 +0100567#ifdef CONFIG_IP_ROUTE_CLASSID
/* Dump /proc/net/rt_acct: sum the 256-entry per-CPU ip_rt_acct tables
 * into one scratch array and write it out as raw binary.
 *
 * Returns 0 on success, -ENOMEM if the scratch buffer cannot be
 * allocated.  Counters are read without synchronization, so the dump is
 * only approximately consistent across CPUs.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	/* Binary dump, not text: consumers parse the struct layout. */
	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800591
/* Single-shot seq_file wrapper for the rt_acct dump above. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800604#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800605
/* Per-netns setup of the three route-cache proc entries:
 *   /proc/net/rt_cache, /proc/net/stat/rt_cache and (with
 *   CONFIG_IP_ROUTE_CLASSID) /proc/net/rt_acct.
 * Uses goto-chained cleanup so a failure removes exactly the entries
 * already created.  Returns 0 or -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	/* Same name, different directory: this one lives under stat/. */
	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800636
/* Per-netns teardown: remove every proc entry ip_rt_do_proc_init()
 * created, in reverse order.
 */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
645
/* Register the proc setup/teardown pair with the netns machinery so
 * every network namespace gets its own route-cache proc files.
 */
static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* No CONFIG_PROC_FS: nothing to register, always succeed. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700661#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900662
/* Queue @rt for freeing after an RCU-BH grace period, so concurrent
 * lock-free readers of the hash chains can finish first.
 */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

/* Like rt_free(), but also drops the reference the caller holds. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
673
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggresively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

/* An entry worth keeping: it was created by a redirect, is being
 * watched (RTCF_NOTIFY), or carries live learned-PMTU state.
 */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
687
688static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
689{
690 unsigned long age;
691 int ret = 0;
692
Changli Gaod8d1f302010-06-10 23:31:35 -0700693 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700694 goto out;
695
Changli Gaod8d1f302010-06-10 23:31:35 -0700696 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700697 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
698 (age <= tmo2 && rt_valuable(rth)))
699 goto out;
700 ret = 1;
701out: return ret;
702}
703
704/* Bits of score are:
705 * 31: very valuable
706 * 30: not quite useless
707 * 29..0: usage counter
708 */
709static inline u32 rt_score(struct rtable *rt)
710{
Changli Gaod8d1f302010-06-10 23:31:35 -0700711 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700712
713 score = ~score & ~(3<<30);
714
715 if (rt_valuable(rt))
716 score |= (1<<31);
717
David S. Millerc7537962010-11-11 17:07:48 -0800718 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700719 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
720 score |= (1<<30);
721
722 return score;
723}
724
Neil Horman1080d702008-10-27 12:28:25 -0700725static inline bool rt_caching(const struct net *net)
726{
727 return net->ipv4.current_rt_cache_rebuild_count <=
728 net->ipv4.sysctl_rt_cache_rebuild_count;
729}
730
David S. Miller5e2b61f2011-03-04 21:47:09 -0800731static inline bool compare_hash_inputs(const struct rtable *rt1,
732 const struct rtable *rt2)
Neil Horman1080d702008-10-27 12:28:25 -0700733{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800734 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
735 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000736 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
Neil Horman1080d702008-10-27 12:28:25 -0700737}
738
David S. Miller5e2b61f2011-03-04 21:47:09 -0800739static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700740{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800741 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
742 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
743 (rt1->rt_mark ^ rt2->rt_mark) |
David S. Miller475949d2011-05-03 19:45:15 -0700744 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
Julian Anastasovd547f722011-08-07 22:20:20 -0700745 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000746 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700747}
748
Denis V. Lunevb5921912008-01-22 23:50:25 -0800749static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
750{
Changli Gaod8d1f302010-06-10 23:31:35 -0700751 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800752}
753
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700754static inline int rt_is_expired(struct rtable *rth)
755{
Changli Gaod8d1f302010-06-10 23:31:35 -0700756 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700757}
758
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the later case, we want to be reschedule if necessary
 *
 * @net: if non-NULL, only entries belonging to this namespace are freed.
 * @process_context: non-zero when sleeping/rescheduling is allowed.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Lockless peek: skip empty buckets without taking the lock. */
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		/* Unlink matching entries onto a private "list" under the
		 * bucket lock, then free them after the lock is dropped.
		 */
		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* RCU-deferred free of everything we unlinked above. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
809
Neil Horman1080d702008-10-27 12:28:25 -0700810/*
811 * While freeing expired entries, we compute average chain length
812 * and standard deviation, using fixed-point arithmetic.
813 * This to have an estimation of rt_chain_length_max
814 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
816 */
817
818#define FRACT_BITS 3
819#define ONE (1UL << FRACT_BITS)
820
Eric Dumazet98376382010-03-08 03:20:00 +0000821/*
822 * Given a hash chain and an item in this hash chain,
823 * find if a previous entry has the same hash_inputs
824 * (but differs on tos, mark or oif)
825 * Returns 0 if an alias is found.
826 * Returns ONE if rth has no alias before itself.
827 */
828static int has_noalias(const struct rtable *head, const struct rtable *rth)
829{
830 const struct rtable *aux = head;
831
832 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800833 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000834 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000835 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000836 }
837 return ONE;
838}
839
/*
 * Scan a slice of the hash table (sized by time elapsed since the last
 * run, so a full pass takes about ip_rt_gc_timeout) and free expired
 * entries.  Also re-derives rt_chain_length_max from the mean and
 * standard deviation of observed chain lengths (fixed point, FRACT_BITS).
 * Called from rt_worker_func() in process context.
 */
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	/* Number of buckets to visit this round: (elapsed / gc_timeout)
	 * fraction of the whole table, capped at one full pass.
	 */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		/* Empty bucket: nothing to do, skip the lock. */
		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					/* Keep this entry; halve the grace
					 * period for the rest of the chain.
					 */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* rt_chain_length_max = max(elasticity, avg + 4*sd) */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
920
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	/* Re-arm ourselves so the partial scans keep running periodically. */
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
930
Eric Dumazet29e75252008-01-31 17:05:09 -0800931/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300932 * Perturbation of rt_genid by a small quantity [1..256]
Eric Dumazet29e75252008-01-31 17:05:09 -0800933 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
934 * many times (2^24) without giving recent rt_genid.
935 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700936 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700937static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700938{
Eric Dumazet29e75252008-01-31 17:05:09 -0800939 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700940
Eric Dumazet29e75252008-01-31 17:05:09 -0800941 get_random_bytes(&shuffle, sizeof(shuffle));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700942 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
Steffen Klassert5faa5df2012-03-06 21:20:26 +0000943 inetpeer_invalidate_tree(AF_INET);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700944}
945
/*
 * delay < 0  : invalidate the cache only (entries are reaped lazily)
 * delay >= 0 : invalidate and flush synchronously (can take a while)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay < 0)
		return;
	rt_do_flush(net, !in_softirq());
}
956
/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	/* May reschedule when called outside softirq context. */
	rt_do_flush(net, !in_softirq());
}
962
/* A hash chain grew past rt_chain_length_max: warn (rate limited) and
 * invalidate the whole cache for @net rather than keep scanning it.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		pr_warn("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
969
Linus Torvalds1da177e2005-04-16 15:20:36 -0700970/*
971 Short description of GC goals.
972
973 We want to build algorithm, which will keep routing cache
974 at some equilibrium point, when number of aged off entries
975 is kept approximately equal to newly generated ones.
976
977 Current expiration strength is variable "expire".
978 We try to adjust it dynamically, so that if networking
979 is idle expires is large enough to keep enough of warm entries,
980 and when load increases it reduces to limit cache size.
981 */
982
/*
 * Evict cache entries until the computed goal is met, adapting the static
 * "expire" threshold between calls (see the GC-goals comment above).
 * @elasticity:   allowed average chain length before GC turns aggressive
 * @min_interval: minimum interval between effective GC runs
 * NOTE(review): relies on function-static state (expire, last_gc, rover,
 * equilibrium); callers appear to be the GC worker and rt_intern_hash() —
 * serialization between them is not visible here, confirm before changing.
 */
static void __do_rt_garbage_collect(int elasticity, int min_interval)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (elasticity << rt_hash_log);
	if (goal <= 0) {
		/* Under the elasticity limit: aim for the equilibrium point
		 * instead, nudging it upward while we overshoot.
		 */
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* One pass over the table, resuming at "rover" so repeated
		 * calls spread the work over different buckets.
		 */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
				lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
				    !rt_may_expire(rth, tmo, expire)) {
					/* Keep it; tighten the timeout for
					 * the rest of this chain.
					 */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halfed.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		pr_warn("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return;

work_done:
	/* Goal reached (or nothing to do): relax the expire threshold. */
	expire += min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return;
}
1102
/* Deferred GC worker body: run a collection with the sysctl-tuned
 * elasticity and minimum interval.
 */
static void __rt_garbage_collect(struct work_struct *w)
{
	__do_rt_garbage_collect(ip_rt_gc_elasticity, ip_rt_gc_min_interval);
}
1107
1108static int rt_garbage_collect(struct dst_ops *ops)
1109{
1110 if (!work_pending(&rt_gc_worker))
1111 schedule_work(&rt_gc_worker);
1112
1113 if (dst_entries_get_fast(&ipv4_dst_ops) >= ip_rt_max_size ||
1114 dst_entries_get_slow(&ipv4_dst_ops) >= ip_rt_max_size) {
1115 RT_CACHE_STAT_INC(gc_dst_overflow);
1116 return 1;
1117 }
1118 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001119}
1120
Eric Dumazet98376382010-03-08 03:20:00 +00001121/*
1122 * Returns number of entries in a hash chain that have different hash_inputs
1123 */
1124static int slow_chain_length(const struct rtable *head)
1125{
1126 int length = 0;
1127 const struct rtable *rth = head;
1128
1129 while (rth) {
1130 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001131 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001132 }
1133 return length >> FRACT_BITS;
1134}
1135
David S. Millerd3aaeb32011-07-18 00:40:17 -07001136static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
David Miller3769cff2011-07-11 22:44:24 +00001137{
David S. Millerd3aaeb32011-07-18 00:40:17 -07001138 static const __be32 inaddr_any = 0;
1139 struct net_device *dev = dst->dev;
1140 const __be32 *pkey = daddr;
David S. Miller39232972012-01-26 15:22:32 -05001141 const struct rtable *rt;
David Miller3769cff2011-07-11 22:44:24 +00001142 struct neighbour *n;
1143
David S. Miller39232972012-01-26 15:22:32 -05001144 rt = (const struct rtable *) dst;
1145
David Miller3769cff2011-07-11 22:44:24 +00001146 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
David S. Millerd3aaeb32011-07-18 00:40:17 -07001147 pkey = &inaddr_any;
David S. Miller39232972012-01-26 15:22:32 -05001148 else if (rt->rt_gateway)
1149 pkey = (const __be32 *) &rt->rt_gateway;
David S. Millerd3aaeb32011-07-18 00:40:17 -07001150
David S. Miller80703d22012-02-15 17:48:35 -05001151 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001152 if (n)
1153 return n;
David Miller32092ec2011-07-25 00:01:41 +00001154 return neigh_create(&arp_tbl, pkey, dev);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001155}
1156
1157static int rt_bind_neighbour(struct rtable *rt)
1158{
1159 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
David Miller3769cff2011-07-11 22:44:24 +00001160 if (IS_ERR(n))
1161 return PTR_ERR(n);
David S. Miller69cce1d2011-07-17 23:09:49 -07001162 dst_set_neighbour(&rt->dst, n);
David Miller3769cff2011-07-11 22:44:24 +00001163
1164 return 0;
1165}
1166
/*
 * Insert @rt into cache bucket @hash, or return an existing entry with
 * identical keys (moved to the front of its chain).  The returned route
 * (possibly not @rt) carries the caller's reference and, when @skb is
 * given, is attached to it.  When caching is disabled the route is marked
 * DST_NOCACHE and returned unhashed.  Returns ERR_PTR() on neighbour
 * binding failure.
 */
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;
	int attempts = !in_softirq();	/* retry GC only in process context */

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					pr_warn("Neighbour table failure & not caching routes\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
		lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			/* Reap stale entries while we walk the chain. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Duplicate found: drop the new route, hand back
			 * the existing one.
			 */
			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		/* Track the lowest-scoring unreferenced entry as an
		 * eviction candidate (see rt_score()).
		 */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable entry and the chain is very long: trigger an
		 * emergency rebuild, then retry with the new genid hash.
		 */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				__do_rt_garbage_collect(1, 0);
				goto restart;
			}

			if (net_ratelimit())
				pr_warn("Neighbour table overflow\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1339
/* Global inet_peer generation counter; routes snapshot it in
 * rt_peer_genid so stale peer state can be detected later.
 */
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
1346
/* Attach the inet_peer for @daddr to @rt (creating it when @create).
 * The cmpxchg ties a peer to the route exactly once; if another CPU won
 * the race, our extra reference is dropped.  Otherwise — including when
 * no peer was found — the route records the current peer generation.
 */
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
1358
#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
	atomic_t id;	/* next IP ID counter for flows hashing here */
	u32 stamp32;	/* low 32 bits of jiffies at last reservation */
};

/* Hash-indexed ID generators; allocated elsewhere during init. */
static struct ip_ident_bucket *ip_idents __read_mostly;
1366
/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(bucket->stamp32);
	u32 now = (u32)jiffies;
	u32 delta = 0;

	/* First caller in a new jiffy wins the cmpxchg and computes a
	 * random jump proportional to how long the bucket sat idle.
	 */
	if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
		u64 x = random32();

		x *= (now - old);
		delta = (u32)(x >> 32);
	}

	/* Reserve @segs ids (plus the perturbation) and return the first. */
	return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
Eric Dumazetad52eef2014-06-02 05:26:03 -07001388
1389void __ip_select_ident(struct iphdr *iph, int segs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001390{
Eric Dumazetad52eef2014-06-02 05:26:03 -07001391 static u32 ip_idents_hashrnd __read_mostly;
1392 static bool hashrnd_initialized = false;
1393 u32 hash, id;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001394
Eric Dumazetad52eef2014-06-02 05:26:03 -07001395 if (unlikely(!hashrnd_initialized)) {
1396 hashrnd_initialized = true;
1397 get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1398 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001399
Eric Dumazet509a15a2014-07-26 08:58:10 +02001400 hash = jhash_3words((__force u32)iph->daddr,
1401 (__force u32)iph->saddr,
1402 iph->protocol,
1403 ip_idents_hashrnd);
Eric Dumazetad52eef2014-06-02 05:26:03 -07001404 id = ip_idents_reserve(hash, segs);
1405 iph->id = htons(id);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001406}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001407EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001408
/* Unlink @rt from hash chain @hash and drop the reference the cache held
 * on it.  While walking the chain under the per-bucket lock, any expired
 * entries encountered are opportunistically freed as well.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	/* rcu_dereference_protected(): we hold the bucket lock, so plain
	 * reads of the chain pointers are safe (lockdep-checked).
	 */
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			/* Unlink and free; do not advance rthp, the next
			 * entry slid into *rthp.
			 */
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1428
/* Apply a gateway learned from an ICMP redirect (stored in the peer) to
 * the route @dst: rebind the neighbour entry to the new gateway, falling
 * back to the original gateway if the neighbour lookup fails.
 */
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		/* Could not get a neighbour for the new gateway:
		 * restore the old one and bail out.
		 */
		rt->rt_gateway = orig_gw;
		return;
	}
	/* Swap in the new neighbour atomically and drop our reference
	 * on the old one, if any.
	 */
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		/* New neighbour not resolved yet: kick resolution. */
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
1454
/* called in rcu_read_lock() section */
/* Process an incoming ICMP redirect: old gateway @old_gw told us to reach
 * @daddr (packets from @saddr on @dev) via @new_gw.  After sanity checks,
 * walk the four relevant cache chains ({saddr,0} x {ifindex,0}) and record
 * the learned gateway in each matching route's peer.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject no-op, administratively refused, and obviously bogus
	 * (multicast/broadcast/zeronet) gateways.
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* On non-shared media the new gateway must be on-link,
		 * and (if secure redirects) a known default gateway.
		 */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				/* Only healthy output routes through
				 * (dev, old_gw) in this netns qualify.
				 */
				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw) {
						peer->redirect_learned.a4 = new_gw;
						/* Invalidate cached peer state
						 * everywhere (genid bump).
						 */
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
			" Advised path = %pI4 -> %pI4\n",
			&old_gw, dev->name, &new_gw,
			&saddr, &daddr);
#endif
	;
}
1535
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001536static bool peer_pmtu_expired(struct inet_peer *peer)
1537{
1538 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1539
1540 return orig &&
1541 time_after_eq(jiffies, orig) &&
1542 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1543}
1544
1545static bool peer_pmtu_cleaned(struct inet_peer *peer)
1546{
1547 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1548
1549 return orig &&
1550 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1551}
1552
/* dst_ops->negative_advice: called when a socket reports trouble with this
 * route.  Drop obsolete routes, evict redirected ones from the cache, and
 * restore the pre-PMTU MTU metric if the learned PMTU has expired.
 * Returns the (possibly NULL) dst the caller should use from now on.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			/* Route already invalidated: release it. */
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			/* Redirected route misbehaving: remove it from
			 * the cache entirely.
			 */
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			/* Learned PMTU timed out: go back to the original
			 * MTU metric.
			 */
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
1574
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and we start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
1590
/* Send an ICMP host redirect to the sender of @skb, implementing the
 * backoff/silence algorithm described in the comment above.  Rate state
 * (rate_tokens/rate_last) lives in the route's inet_peer entry; with no
 * peer available the redirect is sent unthrottled.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	/* Only need two flags from in_dev; copy them out so the RCU
	 * read section stays short.
	 */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		/* No peer state: send without rate limiting. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log exactly once, when the host hits the give-up
		 * threshold.
		 */
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
				&ip_hdr(skb)->saddr, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1649
/* dst input handler for routes carrying an error: translate dst.error into
 * an ICMP destination-unreachable code and send it, rate-limited by the
 * peer's token bucket.  Consumes @skb; always returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		/* No ICMP equivalent: just drop. */
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		/* Token bucket: tokens accrue with elapsed jiffies,
		 * capped at ip_rt_error_burst; each ICMP costs
		 * ip_rt_error_cost tokens.
		 */
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697
1698/*
1699 * The last two values are not from the RFC but
1700 * are needed for AMPRnet AX.25 paths.
1701 */
1702
/* RFC 1191 plateau table, descending; the last two values are non-RFC
 * additions for AMPRnet AX.25 paths (see comment above).
 */
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/* Pick the largest plateau strictly below @old_mtu; 68 (the IPv4
 * minimum) when nothing in the table is smaller.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const int n = (int)(sizeof(mtu_plateau) / sizeof(mtu_plateau[0]));
	int idx;

	for (idx = 0; idx < n; idx++) {
		if (old_mtu > mtu_plateau[idx])
			return mtu_plateau[idx];
	}
	return 68;
}
1715
/* Handle an ICMP "fragmentation needed" for @iph: validate/estimate the
 * next-hop MTU and record it (with an expiry) in the destination's peer
 * entry.  Returns the MTU actually learned, or @new_mtu unchanged if
 * nothing was recorded.
 */
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		/* Implausible advertised MTU (below the IPv4 minimum or
		 * not smaller than the packet): fall back to the plateau
		 * table.
		 */
		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			/* 0 means "no expiry set"; avoid it as a real
			 * deadline value.
			 */
			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
1758
/* Sync the dst's MTU metric with the peer's learned PMTU state: apply a
 * still-valid learned PMTU (saving the original MTU first), or restore
 * the original once the learned value expires.  The cmpxchg() ensures
 * only one caller performs the restore.
 */
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			/* Remember the pre-PMTU value once, so it can be
			 * restored after expiry.
			 */
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
1775
/* dst_ops->update_pmtu: record a newly discovered path MTU @mtu (clamped
 * to ip_rt_min_pmtu) in the route's peer with a fresh expiry, then apply
 * it to the dst's MTU metric via check_peer_pmtu().
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		/* Record only if nothing is recorded yet, or the new MTU
		 * is smaller than what we already learned.
		 */
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			/* 0 means "unset"; never use it as a deadline. */
			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
1806
David S. Millerf39925d2011-02-09 22:00:16 -08001807
/* Re-sync @rt with its peer's learned PMTU/redirect state if the global
 * peer generation counter moved since this route last looked.
 */
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		/* Lazy-bind the peer (create == 0: lookup only). */
		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			/* Apply a learned redirect not yet reflected in
			 * this route's gateway.
			 */
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}
1828
1829static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1830{
1831 struct rtable *rt = (struct rtable *) dst;
1832
1833 if (rt_is_expired(rt))
1834 return NULL;
David S. Millerde398fb2011-12-05 13:21:42 -05001835 ipv4_validate_peer(rt);
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001836 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001837}
1838
1839static void ipv4_dst_destroy(struct dst_entry *dst)
1840{
1841 struct rtable *rt = (struct rtable *) dst;
1842 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843
David S. Miller62fa8a82011-01-26 20:51:05 -08001844 if (rt->fi) {
1845 fib_info_put(rt->fi);
1846 rt->fi = NULL;
1847 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848 if (peer) {
1849 rt->peer = NULL;
1850 inet_putpeer(peer);
1851 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001852}
1853
Linus Torvalds1da177e2005-04-16 15:20:36 -07001854
1855static void ipv4_link_failure(struct sk_buff *skb)
1856{
1857 struct rtable *rt;
1858
1859 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1860
Eric Dumazet511c3f92009-06-02 05:14:27 +00001861 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001862 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1863 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001864}
1865
1866static int ip_rt_bug(struct sk_buff *skb)
1867{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001868 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1869 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001870 skb->dev ? skb->dev->name : "?");
1871 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001872 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873 return 0;
1874}
1875
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1884
/* Write the 4-byte source address of the outgoing interface for @rt into
 * @addr (byte-wise memcpy because IP-option buffers may be unaligned).
 * Output routes take the packet's own saddr; input routes re-run a FIB
 * lookup (or fall back to inet_select_addr()) to find the address this
 * host would use toward the packet's source.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Rebuild the reverse flow key from the received header. */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1916
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge routing-classifier tag @tag into the dst's tclassid: each 16-bit
 * half is taken from @tag only if that half is still unset, so earlier
 * (more specific) tags win.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1926
David S. Miller0dbaee32010-12-13 12:52:14 -08001927static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1928{
1929 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1930
1931 if (advmss == 0) {
1932 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1933 ip_rt_min_advmss);
1934 if (advmss > 65535 - 40)
1935 advmss = 65535 - 40;
1936 }
1937 return advmss;
1938}
1939
Steffen Klassertebb762f2011-11-23 02:12:51 +00001940static unsigned int ipv4_mtu(const struct dst_entry *dst)
David S. Millerd33e4552010-12-14 13:01:14 -08001941{
Steffen Klassert261663b2011-11-23 02:14:50 +00001942 const struct rtable *rt = (const struct rtable *) dst;
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001943 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1944
Steffen Klassert261663b2011-11-23 02:14:50 +00001945 if (mtu && rt_is_output_route(rt))
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001946 return mtu;
1947
1948 mtu = dst->dev->mtu;
David S. Millerd33e4552010-12-14 13:01:14 -08001949
1950 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
David S. Millerd33e4552010-12-14 13:01:14 -08001951
1952 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1953 mtu = 576;
1954 }
1955
1956 if (mtu > IP_MAX_MTU)
1957 mtu = IP_MAX_MTU;
1958
1959 return mtu;
1960}
1961
/* Attach metrics to a freshly built route @rt.  With a peer entry the
 * peer's (possibly cached) metrics array backs the dst, and any learned
 * PMTU/redirect state is applied immediately; without one the fib_info's
 * metrics are used read-only (taking a fib_info reference when they are
 * not the shared defaults).
 */
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		/* First user of a fresh peer seeds its metrics from the
		 * fib_info.
		 */
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);

		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		/* Non-default metrics pin the fib_info for the route's
		 * lifetime (released in ipv4_dst_destroy()).
		 */
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
1997
/* Finish route setup from a FIB lookup result: pick up the next-hop
 * gateway, initialize metrics, clamp MTU/ADVMSS to protocol limits and
 * apply routing-classifier tags where configured.
 */
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		/* Only adopt the FIB gateway for directly reachable
		 * (link-scope) next hops.
		 */
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	/* Sanity-clamp metrics to IPv4 limits. */
	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
2026
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002027static struct rtable *rt_dst_alloc(struct net_device *dev,
2028 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08002029{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002030 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2031 DST_HOST |
2032 (nopolicy ? DST_NOPOLICY : 0) |
2033 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08002034}
2035
/* called in rcu_read_lock() section */
/* Build and cache an input route for a multicast packet received on @dev.
 * @our is non-zero when this host is a member of the destination group
 * (deliver locally).  Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Source must be a plain unicast address in an IPv4 frame. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sources are only valid for link-local groups
		 * (e.g. hosts that have not yet acquired an address).
		 */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		/* Reverse-path check; also yields spec_dst and the
		 * classifier tag.
		 */
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups may need multicast forwarding. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
2115
2116
2117static void ip_handle_martian_source(struct net_device *dev,
2118 struct in_device *in_dev,
2119 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07002120 __be32 daddr,
2121 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002122{
2123 RT_CACHE_STAT_INC(in_martian_src);
2124#ifdef CONFIG_IP_ROUTE_VERBOSE
2125 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2126 /*
2127 * RFC1812 recommendation, if source is martian,
2128 * the only hint is MAC header.
2129 */
Joe Perches058bd4d2012-03-11 18:36:11 +00002130 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002131 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07002132 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00002133 print_hex_dump(KERN_WARNING, "ll header: ",
2134 DUMP_PREFIX_OFFSET, 16, 1,
2135 skb_mac_header(skb),
2136 dev->hard_header_len, true);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002137 }
2138 }
2139#endif
2140}
2141
/* called in rcu_read_lock() section */
/*
 * __mkroute_input - build a forwarding (unicast input) route cache entry.
 *
 * @skb:	packet being routed
 * @res:	FIB lookup result that selected the output path
 * @in_dev:	inet device of the interface the packet arrived on
 * @daddr:	destination address
 * @saddr:	source address
 * @tos:	IP TOS bits of the packet
 * @result:	on success, *result is set to the new rtable entry
 *
 * Returns 0 on success or a negative errno on failure.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		/* The FIB result should always reference a device with an
		 * inet config here; a NULL out_dev indicates a kernel bug.
		 */
		if (net_ratelimit())
			pr_crit("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}


	/* Reverse-path check the source address.  err < 0 is a hard
	 * failure (martian source); err > 0 means the source is directly
	 * reachable on this interface (RTCF_DIRECTSRC).
	 */
	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would go back out the interface it came in on and the
	 * sender is on-link: candidate for an ICMP redirect.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	/* Fill in the cache entry: lookup key fields (rt_key_*), result
	 * fields, and the input/output handlers for a forwarded packet.
	 */
	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002233
Stephen Hemminger5969f712008-04-10 01:52:09 -07002234static int ip_mkroute_input(struct sk_buff *skb,
2235 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002236 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002237 struct in_device *in_dev,
2238 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002239{
Chuck Short7abaa272005-06-22 22:10:23 -07002240 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002241 int err;
2242 unsigned hash;
2243
2244#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002245 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002246 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002247#endif
2248
2249 /* create a routing cache entry */
2250 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2251 if (err)
2252 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002253
2254 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002255 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002256 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002257 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002258 if (IS_ERR(rth))
2259 return PTR_ERR(rth);
2260 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002261}
2262
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	Called with rcu_read_lock().
 */
2273
/*
 * ip_route_input_slow - full (non-cached) input route resolution.
 *
 * @skb:   packet being routed
 * @daddr: destination address
 * @saddr: source address
 * @tos:   IP TOS bits
 * @dev:   interface the packet arrived on
 *
 * Performs martian filtering, a FIB lookup, and builds either a
 * forwarding entry (via ip_mkroute_input) or a local/broadcast entry
 * that is inserted into the route cache here.  Returns 0 on success
 * or a negative errno.  Caller holds rcu_read_lock().
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	/* Limited broadcast destination (and all-zero src/dst) is handled
	 * on the broadcast input path without a FIB lookup.
	 */
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		/* No route found: hosts that do not forward report
		 * host-unreachable, routers build an unreachable entry.
		 */
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		/* Destined to this host: validate the source, then deliver
		 * locally.  err > 0 marks a directly-reachable source.
		 */
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Unicast forwarding path: build and cache a forward entry. */
	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Build a local-delivery cache entry (also reached from no_route
	 * with res.type == RTN_UNREACHABLE, which rewires dst.input below).
	 */
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		pr_warn("martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
	/* fall through: log the source but keep the err set above */
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2454
Eric Dumazet407eadd2010-05-10 11:32:55 +00002455int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2456 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002457{
2458 struct rtable * rth;
2459 unsigned hash;
2460 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002461 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002462 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002463
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002464 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002465
Eric Dumazet96d36222010-06-02 19:21:31 +00002466 rcu_read_lock();
2467
Neil Horman1080d702008-10-27 12:28:25 -07002468 if (!rt_caching(net))
2469 goto skip_cache;
2470
Linus Torvalds1da177e2005-04-16 15:20:36 -07002471 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002472 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002473
Linus Torvalds1da177e2005-04-16 15:20:36 -07002474 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002475 rth = rcu_dereference(rth->dst.rt_next)) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08002476 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2477 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
Julian Anastasov97a80412011-08-09 04:01:16 +00002478 (rth->rt_route_iif ^ iif) |
David S. Miller475949d2011-05-03 19:45:15 -07002479 (rth->rt_key_tos ^ tos)) == 0 &&
David S. Miller5e2b61f2011-03-04 21:47:09 -08002480 rth->rt_mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002481 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002482 !rt_is_expired(rth)) {
David S. Millerde398fb2011-12-05 13:21:42 -05002483 ipv4_validate_peer(rth);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002484 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002485 dst_use_noref(&rth->dst, jiffies);
2486 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002487 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002488 dst_use(&rth->dst, jiffies);
2489 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002490 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491 RT_CACHE_STAT_INC(in_hit);
2492 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002493 return 0;
2494 }
2495 RT_CACHE_STAT_INC(in_hlist_search);
2496 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002497
Neil Horman1080d702008-10-27 12:28:25 -07002498skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002499 /* Multicast recognition logic is moved from route cache to here.
2500 The problem was that too many Ethernet cards have broken/missing
2501 hardware multicast filters :-( As result the host on multicasting
2502 network acquires a lot of useless route cache entries, sort of
2503 SDR messages from all the world. Now we try to get rid of them.
2504 Really, provided software IP multicast filter is organized
2505 reasonably (at least, hashed), it does not result in a slowdown
2506 comparing with route cache reject entries.
2507 Note, that multicast routers are not affected, because
2508 route cache entry is created eventually.
2509 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002510 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002511 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002512
Eric Dumazet96d36222010-06-02 19:21:31 +00002513 if (in_dev) {
David S. Millerdbdd9a52011-03-10 16:34:38 -08002514 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2515 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516 if (our
2517#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002518 ||
2519 (!ipv4_is_local_multicast(daddr) &&
2520 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002521#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002522 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002523 int res = ip_route_input_mc(skb, daddr, saddr,
2524 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002525 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002526 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002527 }
2528 }
2529 rcu_read_unlock();
2530 return -EINVAL;
2531 }
Eric Dumazet96d36222010-06-02 19:21:31 +00002532 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2533 rcu_read_unlock();
2534 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002535}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002536EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002537
/* called with rcu_read_lock() */
/*
 * __mkroute_output - build an output route cache entry.
 *
 * @res:	FIB lookup result for the flow
 * @fl4:	the (possibly rewritten) flow key used for the lookup
 * @orig_daddr:	caller's original destination (cache key)
 * @orig_saddr:	caller's original source (cache key)
 * @orig_oif:	caller's original output interface (cache key)
 * @orig_rtos:	caller's original TOS (cache key)
 * @dev_out:	selected output device
 * @flags:	initial RTCF_* flags
 *
 * Note the rt_key_* fields are set from the orig_* values while the
 * rt_dst/rt_src fields come from @fl4, which may have been filled in
 * by source-address selection.  Returns the new rtable or an ERR_PTR.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	/* A loopback source may only go out a loopback device. */
	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	/* Destination address class overrides the FIB result type. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		/* broadcast is delivered link-local: drop fib_info */
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Only deliver locally if this host is a member of the
		 * group for this protocol.
		 */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	/* rt_iif falls back to the output device when no oif was given */
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		/* Locally-delivered bcast/mcast leaving a real device
		 * goes through ip_mc_output for the extra copy.
		 */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
2638
Linus Torvalds1da177e2005-04-16 15:20:36 -07002639/*
2640 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002641 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002642 */
2643
David S. Miller813b3b52011-04-28 14:48:42 -07002644static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002645{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002646 struct net_device *dev_out = NULL;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002647 __u8 tos = RT_FL_TOS(fl4);
David S. Miller813b3b52011-04-28 14:48:42 -07002648 unsigned int flags = 0;
2649 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002650 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002651 __be32 orig_daddr;
2652 __be32 orig_saddr;
2653 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002654
2655 res.fi = NULL;
2656#ifdef CONFIG_IP_MULTIPLE_TABLES
2657 res.r = NULL;
2658#endif
2659
David S. Miller813b3b52011-04-28 14:48:42 -07002660 orig_daddr = fl4->daddr;
2661 orig_saddr = fl4->saddr;
2662 orig_oif = fl4->flowi4_oif;
2663
2664 fl4->flowi4_iif = net->loopback_dev->ifindex;
2665 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2666 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2667 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002668
David S. Miller010c2702011-02-17 15:37:09 -08002669 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002670 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002671 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002672 if (ipv4_is_multicast(fl4->saddr) ||
2673 ipv4_is_lbcast(fl4->saddr) ||
2674 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002675 goto out;
2676
Linus Torvalds1da177e2005-04-16 15:20:36 -07002677 /* I removed check for oif == dev_out->oif here.
2678 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002679 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2680 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002681 2. Moreover, we are allowed to send packets with saddr
2682 of another iface. --ANK
2683 */
2684
David S. Miller813b3b52011-04-28 14:48:42 -07002685 if (fl4->flowi4_oif == 0 &&
2686 (ipv4_is_multicast(fl4->daddr) ||
2687 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002688 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002689 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002690 if (dev_out == NULL)
2691 goto out;
2692
Linus Torvalds1da177e2005-04-16 15:20:36 -07002693 /* Special hack: user can direct multicasts
2694 and limited broadcast via necessary interface
2695 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2696 This hack is not just for fun, it allows
2697 vic,vat and friends to work.
2698 They bind socket to loopback, set ttl to zero
2699 and expect that it will work.
2700 From the viewpoint of routing cache they are broken,
2701 because we are not allowed to build multicast path
2702 with loopback source addr (look, routing cache
2703 cannot know, that ttl is zero, so that packet
2704 will not leave this host and route is valid).
2705 Luckily, this hack is good workaround.
2706 */
2707
David S. Miller813b3b52011-04-28 14:48:42 -07002708 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002709 goto make_route;
2710 }
Julian Anastasova210d012008-10-01 07:28:28 -07002711
David S. Miller813b3b52011-04-28 14:48:42 -07002712 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002713 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002714 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002715 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002716 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002717 }
2718
2719
David S. Miller813b3b52011-04-28 14:48:42 -07002720 if (fl4->flowi4_oif) {
2721 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002722 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002723 if (dev_out == NULL)
2724 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002725
2726 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002727 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002728 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002729 goto out;
2730 }
David S. Miller813b3b52011-04-28 14:48:42 -07002731 if (ipv4_is_local_multicast(fl4->daddr) ||
2732 ipv4_is_lbcast(fl4->daddr)) {
2733 if (!fl4->saddr)
2734 fl4->saddr = inet_select_addr(dev_out, 0,
2735 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002736 goto make_route;
2737 }
Jiri Bencad61d4c2013-10-04 17:04:48 +02002738 if (!fl4->saddr) {
David S. Miller813b3b52011-04-28 14:48:42 -07002739 if (ipv4_is_multicast(fl4->daddr))
2740 fl4->saddr = inet_select_addr(dev_out, 0,
2741 fl4->flowi4_scope);
2742 else if (!fl4->daddr)
2743 fl4->saddr = inet_select_addr(dev_out, 0,
2744 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002745 }
2746 }
2747
David S. Miller813b3b52011-04-28 14:48:42 -07002748 if (!fl4->daddr) {
2749 fl4->daddr = fl4->saddr;
2750 if (!fl4->daddr)
2751 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002752 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002753 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002754 res.type = RTN_LOCAL;
2755 flags |= RTCF_LOCAL;
2756 goto make_route;
2757 }
2758
David S. Miller813b3b52011-04-28 14:48:42 -07002759 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002760 res.fi = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002761 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002762 /* Apparently, routing tables are wrong. Assume,
2763 that the destination is on link.
2764
2765 WHY? DW.
2766 Because we are allowed to send to iface
2767 even if it has NO routes and NO assigned
2768 addresses. When oif is specified, routing
2769 tables are looked up with only one purpose:
2770 to catch if destination is gatewayed, rather than
2771 direct. Moreover, if MSG_DONTROUTE is set,
2772 we send packet, ignoring both routing tables
2773 and ifaddr state. --ANK
2774
2775
2776 We could make it even if oif is unknown,
2777 likely IPv6, but we do not.
2778 */
2779
David S. Miller813b3b52011-04-28 14:48:42 -07002780 if (fl4->saddr == 0)
2781 fl4->saddr = inet_select_addr(dev_out, 0,
2782 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002783 res.type = RTN_UNICAST;
2784 goto make_route;
2785 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002786 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002787 goto out;
2788 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002789
2790 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002791 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002792 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002793 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002794 else
David S. Miller813b3b52011-04-28 14:48:42 -07002795 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002796 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002797 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002798 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002799 res.fi = NULL;
2800 flags |= RTCF_LOCAL;
2801 goto make_route;
2802 }
2803
2804#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002805 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002806 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002807 else
2808#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002809 if (!res.prefixlen &&
2810 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002811 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002812 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002813
David S. Miller813b3b52011-04-28 14:48:42 -07002814 if (!fl4->saddr)
2815 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002816
Linus Torvalds1da177e2005-04-16 15:20:36 -07002817 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002818 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002819
2820
2821make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002822 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002823 tos, dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002824 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002825 unsigned int hash;
2826
David S. Miller813b3b52011-04-28 14:48:42 -07002827 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002828 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002829 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002830 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002831
David S. Miller010c2702011-02-17 15:37:09 -08002832out:
2833 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002834 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002835}
2836
/*
 * Resolve an output route for the flow key in @flp4.
 *
 * Fast path: probe the route-cache hash chain for an entry whose key
 * fields (daddr, saddr, oif, mark, and the TOS bits covered by
 * IPTOS_RT_MASK | RTO_ONLINK) match @flp4 exactly, that is an output
 * route, belongs to @net, and has not expired.  On a hit the cached
 * entry's use count/timestamp is bumped and it is returned; wildcard
 * (zero) saddr/daddr in @flp4 are filled in from the cached route.
 *
 * On a miss, or when caching is disabled for @net, fall through to the
 * full lookup in ip_route_output_slow().
 *
 * Returns a struct rtable on success or an ERR_PTR() propagated from
 * the slow path.  Takes rcu_read_lock_bh() internally for the chain
 * walk; callers need no locks.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			/* Fill wildcard addresses from the cached result. */
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2877
/*
 * dst_ops->check for blackhole routes: always returns NULL, so a
 * blackhole dst never passes revalidation.  @cookie is ignored.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2882
Steffen Klassertebb762f2011-11-23 02:12:51 +00002883static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002884{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002885 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2886
2887 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002888}
2889
/*
 * dst_ops->update_pmtu for blackhole routes: intentionally a no-op;
 * PMTU feedback is discarded along with the traffic.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2893
/*
 * dst_ops->cow_metrics for blackhole routes: returns NULL so the
 * metrics are never made writable on a blackhole entry.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2899
/*
 * dst_ops for blackhole routes built by ipv4_blackhole_route().
 * Shares destroy/default_advmss/neigh_lookup with regular IPv4 dsts,
 * but stubs out check, update_pmtu and cow_metrics (see the
 * ipv4_*blackhole* handlers above) so the entry stays inert.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2911
/*
 * Clone @dst_orig into a "blackhole" route: a copy carrying the
 * original's lookup keys, metrics, gateway, peer and fib_info, but
 * whose input/output hooks are dst_discard, so every packet routed
 * through it is silently dropped.
 *
 * Consumes a reference on @dst_orig in all cases (dst_release at the
 * end).  Returns the new dst_entry, or ERR_PTR(-ENOMEM) when
 * dst_alloc() failed.
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Drop all traffic, in both directions. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Mirror the original's lookup key fields ... */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		/* ... and its result fields. */
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Share the inet_peer and fib_info, taking extra refs. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2958
/*
 * Resolve an output route for @flp4 via __ip_route_output_key() and,
 * when the flow specifies a transport protocol, pass the result
 * through xfrm_lookup() so any matching transform policy applies.
 *
 * @sk may be NULL; it is forwarded to xfrm_lookup() only.
 * Returns a struct rtable or an ERR_PTR() from either lookup stage.
 */
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	/* Flows without a protocol are not subject to xfrm policy. */
	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
2975
/*
 * Build an rtnetlink route message describing the route attached to
 * @skb (via skb_rtable()) into @skb itself.
 *
 * @net:    namespace used for the multicast-forwarding check
 * @pid, @seq, @event, @flags: netlink message header fields
 * @nowait: forwarded to ipmr_get_route(); when zero an unresolved
 *          multicast route may be queued (function then returns 0)
 *
 * Returns the value of nlmsg_end() on success, 0 when an unresolved
 * mroute request was queued, or -EMSGSIZE when @skb ran out of room
 * (the partial message is cancelled via nlmsg_cancel()).
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* Fixed rtmsg header: cached routes are always /32 unicast-style
	 * entries in the main table with the RTM_F_CLONED flag set.
	 */
	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	/* Gather cacheinfo inputs from the shared inet_peer, if any:
	 * TCP timestamp state and the remaining PMTU-expiry interval
	 * (converted from an absolute jiffies deadline to a delta).
	 */
	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					/* err == 0: request queued for
					 * resolution; nothing to emit now.
					 */
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3082
/*
 * RTM_GETROUTE handler: answer a userspace "what route would this
 * packet take?" query.
 *
 * Parses RTA_SRC/RTA_DST/RTA_IIF/RTA_OIF/RTA_MARK attributes from
 * @nlh.  When an input interface (RTA_IIF) is given, the lookup is
 * performed as if a packet arrived on that device, by running a dummy
 * skb through ip_route_input(); otherwise an output lookup via
 * ip_route_output_key() is done.  The result is serialized with
 * rt_fill_info() and unicast back to the requester.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* Absent attributes default to zero (wildcard). */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		/* Input-route query: simulate reception on @iif. */
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* Lookup may "succeed" with an error route attached. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		/* Output-route query. */
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
3177
/*
 * Netlink dump callback: walk the whole route-cache hash table and
 * emit one RTM_NEWROUTE message per live entry belonging to the
 * requester's namespace.
 *
 * Resumable: cb->args[0] holds the hash bucket and cb->args[1] the
 * chain index to restart from when the skb fills up; both are stored
 * back before returning so the next invocation continues where this
 * one stopped.  Each chain is walked under rcu_read_lock_bh().
 *
 * Returns skb->len, per dump-callback convention.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			/* Skip foreign-namespace and already-dumped entries. */
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* Borrow (noref) the dst so rt_fill_info can read it. */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* skb full: record position and stop. */
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3219
/*
 * Multicast configuration on @in_dev changed: flush the route cache
 * of that device's namespace immediately (delay 0).
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3224
3225#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003226static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003227 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003228 size_t *lenp, loff_t *ppos)
3229{
3230 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003231 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003232 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003233 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003234
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003235 memcpy(&ctl, __ctl, sizeof(ctl));
3236 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003237 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003238
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003239 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003240 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003241 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003242 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003243
3244 return -EINVAL;
3245}
3246
/*
 * Global (non-per-namespace) tunables exposed under
 * /proc/sys/net/ipv4/route/.  All are plain ints; the *_interval and
 * *_expires entries are stored in jiffies and converted by their
 * proc_dointvec_jiffies / proc_dointvec_ms_jiffies handlers.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, exposed in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* sentinel */
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003357
/*
 * Placeholder directory: gives "net.ipv4.neigh" an (initially empty)
 * sysctl directory before any neighbour tables register under it.
 */
static struct ctl_table empty[1];

/*
 * Static skeleton of the net.ipv4 sysctl subtree: "route" points at
 * ipv4_route_table, "neigh" starts out empty.  Registered once at boot
 * by ip_static_sysctl_init() under the "net.ipv4" path.
 */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }	/* sentinel */
};
3368
/* sysctl path "net.ipv4" under which ipv4_skeleton is registered. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },	/* sentinel */
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003374
/*
 * Per-namespace "net.ipv4.route.flush" entry: a write-only (0200)
 * trigger handled by ipv4_sysctl_rtcache_flush().  .extra1 is filled
 * in with the owning struct net in sysctl_route_net_init().
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },	/* sentinel */
};
3384
/* sysctl path "net.ipv4.route" for the per-namespace flush table. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },	/* sentinel */
};
3391
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003392static __net_init int sysctl_route_net_init(struct net *net)
3393{
3394 struct ctl_table *tbl;
3395
3396 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003397 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003398 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3399 if (tbl == NULL)
3400 goto err_dup;
3401 }
3402 tbl[0].extra1 = net;
3403
3404 net->ipv4.route_hdr =
3405 register_net_sysctl_table(net, ipv4_route_path, tbl);
3406 if (net->ipv4.route_hdr == NULL)
3407 goto err_reg;
3408 return 0;
3409
3410err_reg:
3411 if (tbl != ipv4_route_flush_table)
3412 kfree(tbl);
3413err_dup:
3414 return -ENOMEM;
3415}
3416
3417static __net_exit void sysctl_route_net_exit(struct net *net)
3418{
3419 struct ctl_table *tbl;
3420
3421 tbl = net->ipv4.route_hdr->ctl_table_arg;
3422 unregister_net_sysctl_table(net->ipv4.route_hdr);
3423 BUG_ON(tbl == ipv4_route_flush_table);
3424 kfree(tbl);
3425}
3426
/* Hooks the per-namespace route sysctl setup/teardown into netns. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003431#endif
3432
/*
 * Per-namespace init: seed the route and device-address generation
 * counters with random bytes so they do not start from a predictable
 * value.  (Presumably these counters are bumped elsewhere to
 * invalidate cached state — not visible in this file section.)
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
3441
/* Per-namespace genid seeding; no .exit — nothing to tear down. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3445
3446
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU route accounting array; allocated (256 slots) in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003450
3451static __initdata unsigned long rhash_entries;
3452static int __init set_rhash_entries(char *str)
3453{
3454 if (!str)
3455 return 0;
3456 rhash_entries = simple_strtoul(str, &str, 0);
3457 return 1;
3458}
3459__setup("rhash_entries=", set_rhash_entries);
3460
/*
 * Boot-time initialization of the IPv4 routing subsystem: IP-ID pool,
 * optional classid accounting, dst caches and counters, the route-cache
 * hash table, device/FIB init, the periodic GC worker, procfs entries,
 * XFRM hooks, the RTM_GETROUTE handler and the per-netns subsystems.
 * Failures of the core allocations are fatal (panic) — without them the
 * IP stack cannot operate.  Call order matters; do not reorder lightly.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

	/* Pool backing IP header ID generation; seeded randomly below. */
	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* 256 per-CPU accounting slots (see ip_rt_acct declaration). */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	/* Slab cache for rtable entries; SLAB_PANIC makes failure fatal. */
	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the same slab cache. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/*
	 * Route-cache hash table.  Size comes from the "rhash_entries="
	 * boot parameter when given, otherwise it is scaled from total
	 * RAM (order 15 below 512MB worth of pages, 17 above), capped at
	 * 512K entries when auto-sized.
	 */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC threshold and max cache size derived from the table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* Periodic GC worker, first run jittered by net_random(). */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* proc files are nice-to-have; warn but keep booting on failure. */
	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3527
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/*
 * Register the static net.ipv4.{route,neigh} sysctl directories
 * (ipv4_skeleton) under the "net.ipv4" path, once at boot.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif