blob: f8921b448c39590c236799403720c035ba6f4f7e [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Joe Perchesafd465032012-03-12 07:03:32 +000065#define pr_fmt(fmt) "IPv4: " fmt
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070069#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070072#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070073#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070083#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070085#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090094#include <linux/slab.h>
Stephen Rothwellb9eda062011-12-22 17:03:29 +110095#include <linux/prefetch.h>
Herbert Xu352e5122007-11-13 21:34:06 -080096#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020097#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070098#include <net/protocol.h>
99#include <net/ip.h>
100#include <net/route.h>
101#include <net/inetpeer.h>
102#include <net/sock.h>
103#include <net/ip_fib.h>
104#include <net/arp.h>
105#include <net/tcp.h>
106#include <net/icmp.h>
107#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700108#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700109#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h>
Shan Wei7426a562012-04-18 18:05:46 +0000112#include <linux/kmemleak.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700113#endif
David S. Miller6e5714e2011-08-03 20:50:44 -0700114#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115
/* Mask the flow TOS down to the bits routing actually compares. */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Upper bound for the MTU stored on a cached route. */
#define IP_MAX_MTU	0xFFF0

/* Default garbage-collection timeout for cached routes. */
#define RT_GC_TIMEOUT	(300*HZ)

/* Route-cache sizing, GC and ICMP rate-limit tunables.  Marked
 * __read_mostly: written rarely, read in fast paths.
 */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

/* Periodic cache-expiry work and its last-run timestamp. */
static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 * Interface to generic destination cache.
 */

/* Forward declarations for the dst_ops callbacks wired up in
 * ipv4_dst_ops below; definitions live later in this file (outside
 * this excerpt).
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
/* ->ifdown callback: deliberately a no-op — no per-device state is
 * kept on IPv4 cached routes that must be torn down here.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

/* ->cow_metrics callback: IPv4 never copies metrics on write through
 * this hook, so reaching it indicates a bug — warn and return NULL.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
164
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

/* dst_ops instance for IPv4: glue between the generic destination
 * cache and the IPv4-specific handlers declared above.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
185
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Map IP TOS values to packet-scheduler priority bands; exported
 * (EXPORT_SYMBOL) for other users of the stack.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700207
/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

/* One hash bucket: head of an RCU-protected chain of rtables. */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Map a bucket index to its (shared) spinlock. */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialize the shared bucket-lock table; panics on
 * allocation failure since routing cannot work without it.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	/* kcalloc() does the count * size multiplication overflow-safely
	 * (and zeroes the array), unlike open-coded kmalloc(n * size).
	 */
	rt_hash_locks = kcalloc(RT_HASH_LOCK_SZ, sizeof(spinlock_t),
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700271
/* The central route-cache hash table, its mask (size - 1) and log2 size. */
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned int		rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

/* Per-CPU cache statistics, exposed via /proc/net/stat/rt_cache below. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

/* Hash a (daddr, saddr, ifindex) triple into a bucket index.  The
 * generation id is mixed in so all hashes change when the cache is
 * invalidated by bumping the generation.
 */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
286
/* Current route-cache generation for this netns; entries stamped with
 * an older generation are treated as expired.
 */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
/* Iterator state for the /proc/net/rt_cache seq_file walk. */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;		/* current hash bucket; walked top-down */
	int genid;		/* generation snapshot taken at ->start */
};
298
/* Find the first live entry for this walk: scan buckets from the top
 * down, take the RCU BH read lock on the first non-empty bucket and
 * return its first entry matching our netns and generation snapshot.
 * The RCU lock is held on return iff an entry was found; it is dropped
 * either here (empty bucket), in __rt_cache_get_next() when crossing
 * buckets, or in rt_cache_seq_stop().
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	/* r is NULL here: the table held no matching entry. */
	return r;
}

/* Advance to the raw next entry, releasing/re-taking the RCU BH lock
 * as the walk crosses bucket boundaries.  Returns NULL with the lock
 * released when the table is exhausted.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

/* Like __rt_cache_get_next() but skips entries belonging to another
 * netns or to an older cache generation.
 */
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
350
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900351static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700352{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900353 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354
355 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900356 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700357 --pos;
358 return pos ? NULL : r;
359}
360
/* seq_file ->start: a fresh walk (*pos == 0) snapshots the current
 * generation and yields the header token; a continued walk re-positions
 * to entry *pos - 1.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
369
370static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
371{
Eric Dumazet29e75252008-01-31 17:05:09 -0800372 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700373
374 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900375 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700376 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900377 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700378 ++*pos;
379 return r;
380}
381
/* seq_file ->stop: release the RCU BH lock held while positioned on a
 * real entry (not the header token, not end-of-table).
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

/* Emit one /proc/net/rt_cache record (or the column header).  Each
 * record is space-padded to a fixed 127-character width; %n captures
 * the printed length so the pad can be computed.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW), 0,
			   r->rt_key_tos,
			   -1, 0, 0, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
415
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

/* Open /proc/net/rt_cache with per-netns iterator state. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
436
437
/* seq_file ->start for /proc/net/stat/rt_cache: position 0 is the
 * header token; position n + 1 maps to the n'th possible CPU's stats.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

/* Advance to the next possible CPU's stats, updating *pos to point
 * one past it.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

/* Nothing to release: the per-CPU stats walk takes no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
472
/* Emit one CPU's route-cache statistics line (or the header).  The
 * first column is the global dst entry count, not a per-CPU value.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
506
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

/* Open /proc/net/stat/rt_cache; plain seq_open, no private state. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
527
#ifdef CONFIG_IP_ROUTE_CLASSID
/* /proc/net/rt_acct: sum the 256 per-class byte/packet counters over
 * all possible CPUs and emit the totals as one binary blob.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	/* kcalloc zeroes the accumulator array. */
	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800552
/* Open /proc/net/rt_acct as a single-shot seq_file. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800565#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800566
/* Create the per-netns /proc entries: /proc/net/rt_cache,
 * /proc/net/stat/rt_cache and (with CONFIG_IP_ROUTE_CLASSID)
 * /proc/net/rt_acct.  On failure, unwind whatever was already
 * registered via the goto ladder and return -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800597
/* Remove the per-netns /proc entries created by ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
606
static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

/* Register the per-netns route-cache /proc entries. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* procfs disabled: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900623
/* Free an rtable after an RCU (BH) grace period. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

/* Drop our reference, then free after an RCU (BH) grace period. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

/* An entry is worth keeping if it was redirected or is watched
 * (RTCF_NOTIFY), or if it carries an expiry time.
 */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}
648
649static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
650{
651 unsigned long age;
652 int ret = 0;
653
Changli Gaod8d1f302010-06-10 23:31:35 -0700654 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700655 goto out;
656
Changli Gaod8d1f302010-06-10 23:31:35 -0700657 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700658 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
659 (age <= tmo2 && rt_valuable(rth)))
660 goto out;
661 ret = 1;
662out: return ret;
663}
664
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	/* Older entries (larger jiffies delta) end up with lower scores
	 * after the bitwise negation below.
	 */
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	/* Output routes and unicast input routes get the second bit. */
	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
685
/* Caching stays enabled until this netns has had to rebuild the cache
 * more times than its configured limit.
 */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

/* Compare only dst, src and input interface — the minimal identity of
 * a cached route.  XOR/OR trick: result is 0 iff all fields match.
 */
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

/* Full lookup-key comparison: dst, src, mark, tos and both interfaces
 * must all match.
 */
static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

/* Do the two cached routes belong to the same network namespace? */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

/* Entry is stale if its generation differs from the netns' current one. */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
719
/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 *
 * @net: only free entries belonging to this namespace; NULL means all.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Lockless peek: skip empty buckets without taking the lock. */
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		/* Under the bucket lock, unlink every matching entry and
		 * collect it on a private list so the freeing can be done
		 * after the lock is dropped.
		 */
		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			/* !net means flush entries of every namespace. */
			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free the collected entries outside the bucket lock. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
770
Neil Horman1080d702008-10-27 12:28:25 -0700771/*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This gives an estimate for rt_chain_length_max:
 *	rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
777 */
778
779#define FRACT_BITS 3
780#define ONE (1UL << FRACT_BITS)
781
Eric Dumazet98376382010-03-08 03:20:00 +0000782/*
783 * Given a hash chain and an item in this hash chain,
784 * find if a previous entry has the same hash_inputs
785 * (but differs on tos, mark or oif)
786 * Returns 0 if an alias is found.
787 * Returns ONE if rth has no alias before itself.
788 */
789static int has_noalias(const struct rtable *head, const struct rtable *rth)
790{
791 const struct rtable *aux = head;
792
793 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800794 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000795 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000796 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000797 }
798 return ONE;
799}
800
/*
 * Scan a slice of the hash table and evict expired entries.
 *
 * The number of buckets visited ("goal") is scaled by the time elapsed
 * since the previous run, so that the full table is covered roughly once
 * per ip_rt_gc_timeout.  While scanning, chain lengths are sampled and
 * rt_chain_length_max is recomputed as avg + 4*sd in FRACT_BITS
 * fixed-point arithmetic.
 */
static void rt_check_expire(void)
{
	static unsigned int rover;	/* resume point between invocations */
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	/* goal = buckets * delta / ip_rt_gc_timeout, clamped to table size */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		/* Lockless peek: empty buckets need no lock. */
		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth) ||
			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}

			/* We only count entries on a chain with equal
			 * hash inputs once so that entries for
			 * different QOS levels, and other non-hash
			 * input attributes don't unfairly skew the
			 * length computation
			 */
			/* tmo halves per surviving entry: deep entries
			 * get progressively stricter expiry. */
			tmo >>= 1;
			rthp = &rth->dst.rt_next;
			length += has_noalias(rt_hash_table[i].chain, rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
869
/*
 * rt_worker_func() runs in process context from the shared workqueue.
 * It scans part of the hash table via rt_check_expire() and re-arms
 * itself to run again after ip_rt_gc_interval.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
879
Eric Dumazet29e75252008-01-31 17:05:09 -0800880/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300881 * Perturbation of rt_genid by a small quantity [1..256]
Eric Dumazet29e75252008-01-31 17:05:09 -0800882 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
883 * many times (2^24) without giving recent rt_genid.
884 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700885 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700886static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700887{
Eric Dumazet29e75252008-01-31 17:05:09 -0800888 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700889
Eric Dumazet29e75252008-01-31 17:05:09 -0800890 get_random_bytes(&shuffle, sizeof(shuffle));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700891 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700892}
893
/*
 * delay < 0  : invalidate cache only (fast: entries are freed later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay < 0)
		return;
	rt_do_flush(net, !in_softirq());
}
904
/* Flush previously cache-invalidated entries from the cache.
 * Allows rescheduling when not called from softirq context.
 */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
910
/* The chain-length limit was exceeded: warn and invalidate the whole
 * cache, which effectively rebuilds the hash as entries are re-added
 * under the new generation id.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
916
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, when the number of aged-off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	/* Persistent GC state: current expiry strength, time of last run,
	 * resume bucket and the target cache population.
	 */
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Use the slow (exact) entry count for the sizing decisions. */
	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Halve tmo per kept entry: deep
					 * chain positions expire sooner. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	net_warn_ratelimited("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Relax the expiry strength again after a successful run. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
1048
Eric Dumazet98376382010-03-08 03:20:00 +00001049/*
1050 * Returns number of entries in a hash chain that have different hash_inputs
1051 */
1052static int slow_chain_length(const struct rtable *head)
1053{
1054 int length = 0;
1055 const struct rtable *rth = head;
1056
1057 while (rth) {
1058 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001059 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001060 }
1061 return length >> FRACT_BITS;
1062}
1063
David S. Millerf894cbf2012-07-02 21:52:24 -07001064static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1065 struct sk_buff *skb,
1066 const void *daddr)
David Miller3769cff2011-07-11 22:44:24 +00001067{
David S. Millerd3aaeb32011-07-18 00:40:17 -07001068 struct net_device *dev = dst->dev;
1069 const __be32 *pkey = daddr;
David S. Miller39232972012-01-26 15:22:32 -05001070 const struct rtable *rt;
David Miller3769cff2011-07-11 22:44:24 +00001071 struct neighbour *n;
1072
David S. Miller39232972012-01-26 15:22:32 -05001073 rt = (const struct rtable *) dst;
David S. Millera263b302012-07-02 02:02:15 -07001074 if (rt->rt_gateway)
David S. Miller39232972012-01-26 15:22:32 -05001075 pkey = (const __be32 *) &rt->rt_gateway;
David S. Millerf894cbf2012-07-02 21:52:24 -07001076 else if (skb)
1077 pkey = &ip_hdr(skb)->daddr;
David S. Millerd3aaeb32011-07-18 00:40:17 -07001078
David S. Miller80703d22012-02-15 17:48:35 -05001079 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001080 if (n)
1081 return n;
David Miller32092ec2011-07-25 00:01:41 +00001082 return neigh_create(&arp_tbl, pkey, dev);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001083}
1084
/*
 * Insert @rt into hash bucket @hash, or reuse an existing equivalent
 * entry.  Returns the cached route actually in use (which may be an
 * older entry; @rt is dropped in that case) and, if @skb is given,
 * attaches that route to the skb.  When caching is disabled the route
 * is marked DST_NOCACHE and returned without touching the table.
 */
static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		/* Purge stale entries encountered on the way. */
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		/* Track the unreferenced entry with the lowest score as an
		 * eviction candidate in case the chain is too long. */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable entry: if the chain really is too long,
		 * trigger an emergency rebuild and retry under the new
		 * generation's hash. */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1217
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	/* Mix the previous fallback id with the destination so packets to
	 * different hosts do not share one id sequence. */
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1237
1238void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1239{
David S. Miller1d861aa2012-07-10 03:58:16 -07001240 struct net *net = dev_net(dst->dev);
1241 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001242
David S. Miller1d861aa2012-07-10 03:58:16 -07001243 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1244 if (peer) {
1245 iph->id = htons(inet_getid(peer, more));
1246 inet_putpeer(peer);
1247 return;
1248 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001249
1250 ip_select_fb_ident(iph);
1251}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001252EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001253
/*
 * Remove @rt from hash bucket @hash and drop the caller's reference.
 * Any expired entries found while walking the chain are purged too.
 */
static void rt_del(unsigned int hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1273
David S. Millerd0da7202012-07-11 20:27:54 -07001274static void ip_do_redirect(struct rtable *rt, __be32 old_gw, __be32 new_gw)
1275{
1276 struct neighbour *n;
1277
1278 if (rt->rt_gateway != old_gw)
1279 return;
1280
1281 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1282 if (n) {
1283 if (!(n->nud_state & NUD_VALID)) {
1284 neigh_event_send(n, NULL);
1285 } else {
1286 rt->rt_gateway = new_gw;
1287 rt->rt_flags |= RTCF_REDIRECTED;
1288 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1289 }
1290 neigh_release(n);
1291 }
1292}
1293
/* called in rcu_read_lock() section */
/*
 * Handle an incoming ICMP redirect for the destination in @skb's
 * payload, advising @new_gw as the gateway.  After sanity-checking the
 * advised gateway, every matching output route in the cache (over both
 * source-key and ifindex wildcards) is updated via ip_do_redirect().
 */
void ip_rt_redirect(struct sk_buff *skb, __be32 new_gw)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	__be32 old_gw = ip_hdr(skb)->saddr;
	__be32 daddr = iph->daddr;
	__be32 saddr = iph->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	/* Each key is probed both with its real value and with 0, the
	 * wildcard used when the route was created without that key. */
	int ikeys[2] = { dev->ifindex, 0 };
	__be32 skeys[2] = { saddr, 0 };
	struct net *net;
	int s, i;

	if (!in_dev)
		return;

	/* Only the four defined redirect codes are honoured. */
	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev)
					continue;

				ip_do_redirect(rt, old_gw, new_gw);
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
#endif
	;
}
1377
1378static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1379{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001380 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381 struct dst_entry *ret = dst;
1382
1383 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001384 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001385 ip_rt_put(rt);
1386 ret = NULL;
David S. Miller59436342012-07-10 06:58:42 -07001387 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1388 rt->dst.expires) {
Eric Dumazet95c96172012-04-15 05:58:06 +00001389 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001390 rt->rt_oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001391 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001392 rt_del(hash, rt);
1393 ret = NULL;
1394 }
1395 }
1396 return ret;
1397}
1398
1399/*
1400 * Algorithm:
1401 * 1. The first ip_rt_redirect_number redirects are sent
1402 * with exponential backoff, then we stop sending them at all,
1403 * assuming that the host ignores our redirects.
1404 * 2. If we did not see packets requiring redirects
1405 * during ip_rt_redirect_silence, we assume that the host
 *	forgot the redirected route and start sending redirects again.
1407 *
1408 * This algorithm is much cheaper and more intelligent than dumb load limiting
1409 * in icmp.c.
1410 *
1411 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1412 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1413 */
1414
/* Send an ICMP redirect back to the sender of @skb, rate limited per
 * source host via the inet_peer cache (see the algorithm comment above:
 * exponential backoff over the first ip_rt_redirect_number redirects,
 * then silence until ip_rt_redirect_silence has elapsed).
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	/* Snapshot the per-device flags under RCU; we deliberately do
	 * not hold the read lock across the peer lookup below. */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	/* Final argument 1 == create the peer entry if it doesn't exist. */
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		/* No peer state available (allocation failure):
		 * send the redirect unthrottled. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set peer->rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.  The delay doubles with each token consumed.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log exactly once, when the budget is exhausted. */
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, rt->rt_iif,
					     &rt->rt_dst, &rt->rt_gateway);
#endif
	}
out_put_peer:
	inet_putpeer(peer);	/* drop the reference taken by inet_getpeer_v4() */
}
1474
1475static int ip_error(struct sk_buff *skb)
1476{
David S. Miller251da412012-06-26 16:27:09 -07001477 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
Eric Dumazet511c3f92009-06-02 05:14:27 +00001478 struct rtable *rt = skb_rtable(skb);
David S. Miller92d86822011-02-04 15:55:25 -08001479 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001480 unsigned long now;
David S. Miller251da412012-06-26 16:27:09 -07001481 struct net *net;
David S. Miller92d86822011-02-04 15:55:25 -08001482 bool send;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001483 int code;
1484
David S. Miller251da412012-06-26 16:27:09 -07001485 net = dev_net(rt->dst.dev);
1486 if (!IN_DEV_FORWARD(in_dev)) {
1487 switch (rt->dst.error) {
1488 case EHOSTUNREACH:
1489 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1490 break;
1491
1492 case ENETUNREACH:
1493 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1494 break;
1495 }
1496 goto out;
1497 }
1498
Changli Gaod8d1f302010-06-10 23:31:35 -07001499 switch (rt->dst.error) {
Joe Perches4500ebf2011-07-01 09:43:07 +00001500 case EINVAL:
1501 default:
1502 goto out;
1503 case EHOSTUNREACH:
1504 code = ICMP_HOST_UNREACH;
1505 break;
1506 case ENETUNREACH:
1507 code = ICMP_NET_UNREACH;
David S. Miller251da412012-06-26 16:27:09 -07001508 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
Joe Perches4500ebf2011-07-01 09:43:07 +00001509 break;
1510 case EACCES:
1511 code = ICMP_PKT_FILTERED;
1512 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001513 }
1514
David S. Miller1d861aa2012-07-10 03:58:16 -07001515 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001516
1517 send = true;
1518 if (peer) {
1519 now = jiffies;
1520 peer->rate_tokens += now - peer->rate_last;
1521 if (peer->rate_tokens > ip_rt_error_burst)
1522 peer->rate_tokens = ip_rt_error_burst;
1523 peer->rate_last = now;
1524 if (peer->rate_tokens >= ip_rt_error_cost)
1525 peer->rate_tokens -= ip_rt_error_cost;
1526 else
1527 send = false;
David S. Miller1d861aa2012-07-10 03:58:16 -07001528 inet_putpeer(peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001529 }
David S. Miller92d86822011-02-04 15:55:25 -08001530 if (send)
1531 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532
1533out: kfree_skb(skb);
1534 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001535}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001536
Linus Torvalds1da177e2005-04-16 15:20:36 -07001537static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1538{
David S. Miller2c8cec52011-02-09 20:42:07 -08001539 struct rtable *rt = (struct rtable *) dst;
David S. Miller2c8cec52011-02-09 20:42:07 -08001540
1541 dst_confirm(dst);
1542
David S. Miller59436342012-07-10 06:58:42 -07001543 if (mtu < ip_rt_min_pmtu)
1544 mtu = ip_rt_min_pmtu;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001545
David S. Miller59436342012-07-10 06:58:42 -07001546 rt->rt_pmtu = mtu;
1547 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001548}
1549
David S. Miller36393392012-06-14 22:21:46 -07001550void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1551 int oif, u32 mark, u8 protocol, int flow_flags)
1552{
1553 const struct iphdr *iph = (const struct iphdr *)skb->data;
1554 struct flowi4 fl4;
1555 struct rtable *rt;
1556
1557 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
David S. Miller3e129392012-07-10 04:01:57 -07001558 protocol, flow_flags,
David S. Miller36393392012-06-14 22:21:46 -07001559 iph->daddr, iph->saddr, 0, 0);
1560 rt = __ip_route_output_key(net, &fl4);
1561 if (!IS_ERR(rt)) {
1562 ip_rt_update_pmtu(&rt->dst, mtu);
1563 ip_rt_put(rt);
1564 }
1565}
1566EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1567
1568void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1569{
1570 const struct inet_sock *inet = inet_sk(sk);
1571
1572 return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1573 sk->sk_bound_dev_if, sk->sk_mark,
1574 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1575 inet_sk_flowi_flags(sk));
1576}
1577EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
David S. Millerf39925d2011-02-09 22:00:16 -08001578
David S. Millerefbc3682011-12-01 13:38:59 -05001579static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1580{
1581 struct rtable *rt = (struct rtable *) dst;
1582
1583 if (rt_is_expired(rt))
1584 return NULL;
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001585 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001586}
1587
1588static void ipv4_dst_destroy(struct dst_entry *dst)
1589{
1590 struct rtable *rt = (struct rtable *) dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001591
David S. Miller62fa8a82011-01-26 20:51:05 -08001592 if (rt->fi) {
1593 fib_info_put(rt->fi);
1594 rt->fi = NULL;
1595 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001596}
1597
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598
1599static void ipv4_link_failure(struct sk_buff *skb)
1600{
1601 struct rtable *rt;
1602
1603 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1604
Eric Dumazet511c3f92009-06-02 05:14:27 +00001605 rt = skb_rtable(skb);
David S. Miller59436342012-07-10 06:58:42 -07001606 if (rt)
1607 dst_set_expires(&rt->dst, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001608}
1609
1610static int ip_rt_bug(struct sk_buff *skb)
1611{
Joe Perches91df42b2012-05-15 14:11:54 +00001612 pr_debug("%s: %pI4 -> %pI4, %s\n",
1613 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1614 skb->dev ? skb->dev->name : "?");
Linus Torvalds1da177e2005-04-16 15:20:36 -07001615 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001616 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001617 return 0;
1618}
1619
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is kept out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */
1628
David S. Miller8e363602011-05-13 17:29:41 -04001629void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001630{
Al Viroa61ced52006-09-26 21:27:54 -07001631 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001632
David S. Millerc7537962010-11-11 17:07:48 -08001633 if (rt_is_output_route(rt))
David S. Millerc5be24f2011-05-13 18:01:21 -04001634 src = ip_hdr(skb)->saddr;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001635 else {
David S. Miller8e363602011-05-13 17:29:41 -04001636 struct fib_result res;
1637 struct flowi4 fl4;
1638 struct iphdr *iph;
1639
1640 iph = ip_hdr(skb);
1641
1642 memset(&fl4, 0, sizeof(fl4));
1643 fl4.daddr = iph->daddr;
1644 fl4.saddr = iph->saddr;
Julian Anastasovb0fe4a32011-07-23 02:00:41 +00001645 fl4.flowi4_tos = RT_TOS(iph->tos);
David S. Miller8e363602011-05-13 17:29:41 -04001646 fl4.flowi4_oif = rt->dst.dev->ifindex;
1647 fl4.flowi4_iif = skb->dev->ifindex;
1648 fl4.flowi4_mark = skb->mark;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001649
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001650 rcu_read_lock();
David S. Miller68a5e3d2011-03-11 20:07:33 -05001651 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
David S. Miller436c3b62011-03-24 17:42:21 -07001652 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001653 else
1654 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001655 RT_SCOPE_UNIVERSE);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001656 rcu_read_unlock();
1657 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001658 memcpy(addr, &src, 4);
1659}
1660
Patrick McHardyc7066f72011-01-14 13:36:42 +01001661#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001662static void set_class_tag(struct rtable *rt, u32 tag)
1663{
Changli Gaod8d1f302010-06-10 23:31:35 -07001664 if (!(rt->dst.tclassid & 0xFFFF))
1665 rt->dst.tclassid |= tag & 0xFFFF;
1666 if (!(rt->dst.tclassid & 0xFFFF0000))
1667 rt->dst.tclassid |= tag & 0xFFFF0000;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001668}
1669#endif
1670
David S. Miller0dbaee32010-12-13 12:52:14 -08001671static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1672{
1673 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1674
1675 if (advmss == 0) {
1676 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1677 ip_rt_min_advmss);
1678 if (advmss > 65535 - 40)
1679 advmss = 65535 - 40;
1680 }
1681 return advmss;
1682}
1683
Steffen Klassertebb762f2011-11-23 02:12:51 +00001684static unsigned int ipv4_mtu(const struct dst_entry *dst)
David S. Millerd33e4552010-12-14 13:01:14 -08001685{
Steffen Klassert261663b2011-11-23 02:14:50 +00001686 const struct rtable *rt = (const struct rtable *) dst;
David S. Miller59436342012-07-10 06:58:42 -07001687 unsigned int mtu = rt->rt_pmtu;
1688
1689 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1690 mtu = 0;
1691
1692 if (!mtu)
1693 mtu = dst_metric_raw(dst, RTAX_MTU);
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001694
Steffen Klassert261663b2011-11-23 02:14:50 +00001695 if (mtu && rt_is_output_route(rt))
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001696 return mtu;
1697
1698 mtu = dst->dev->mtu;
David S. Millerd33e4552010-12-14 13:01:14 -08001699
1700 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
David S. Millerd33e4552010-12-14 13:01:14 -08001701
1702 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1703 mtu = 576;
1704 }
1705
1706 if (mtu > IP_MAX_MTU)
1707 mtu = IP_MAX_MTU;
1708
1709 return mtu;
1710}
1711
David S. Miller813b3b52011-04-28 14:48:42 -07001712static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001713 struct fib_info *fi)
David S. Millera4daad62011-01-27 22:01:53 -08001714{
David S. Millerf1850712012-07-10 07:26:01 -07001715 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1716 rt->fi = fi;
1717 atomic_inc(&fi->fib_clntref);
David S. Millera4daad62011-01-27 22:01:53 -08001718 }
David S. Millerf1850712012-07-10 07:26:01 -07001719 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
David S. Millera4daad62011-01-27 22:01:53 -08001720}
1721
David S. Miller813b3b52011-04-28 14:48:42 -07001722static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001723 const struct fib_result *res,
David S. Miller982721f2011-02-16 21:44:24 -08001724 struct fib_info *fi, u16 type, u32 itag)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001725{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001726 if (fi) {
1727 if (FIB_RES_GW(*res) &&
1728 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1729 rt->rt_gateway = FIB_RES_GW(*res);
David S. Miller813b3b52011-04-28 14:48:42 -07001730 rt_init_metrics(rt, fl4, fi);
Patrick McHardyc7066f72011-01-14 13:36:42 +01001731#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Miller710ab6c2012-07-10 07:02:09 -07001732 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001733#endif
David S. Millerd33e4552010-12-14 13:01:14 -08001734 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001735
Patrick McHardyc7066f72011-01-14 13:36:42 +01001736#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001737#ifdef CONFIG_IP_MULTIPLE_TABLES
1738 set_class_tag(rt, fib_rules_tclass(res));
1739#endif
1740 set_class_tag(rt, itag);
1741#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001742}
1743
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001744static struct rtable *rt_dst_alloc(struct net_device *dev,
1745 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08001746{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001747 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1748 DST_HOST |
1749 (nopolicy ? DST_NOPOLICY : 0) |
1750 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08001751}
1752
Eric Dumazet96d36222010-06-02 19:21:31 +00001753/* called in rcu_read_lock() section */
Al Viro9e12bb22006-09-26 21:25:20 -07001754static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001755 u8 tos, struct net_device *dev, int our)
1756{
Eric Dumazet96d36222010-06-02 19:21:31 +00001757 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001758 struct rtable *rth;
Eric Dumazet96d36222010-06-02 19:21:31 +00001759 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001760 u32 itag = 0;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001761 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001762
1763 /* Primary sanity checks. */
1764
1765 if (in_dev == NULL)
1766 return -EINVAL;
1767
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001768 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Thomas Grafd0daebc32012-06-12 00:44:01 +00001769 skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001770 goto e_inval;
1771
Thomas Grafd0daebc32012-06-12 00:44:01 +00001772 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1773 if (ipv4_is_loopback(saddr))
1774 goto e_inval;
1775
Joe Perchesf97c1e02007-12-16 13:45:43 -08001776 if (ipv4_is_zeronet(saddr)) {
1777 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001778 goto e_inval;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001779 } else {
David S. Miller9e56e382012-06-28 18:54:02 -07001780 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1781 in_dev, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001782 if (err < 0)
1783 goto e_err;
1784 }
Benjamin LaHaise4e7b2f12012-03-27 15:55:32 +00001785 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001786 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001787 if (!rth)
1788 goto e_nobufs;
1789
Patrick McHardyc7066f72011-01-14 13:36:42 +01001790#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07001791 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001792#endif
David S. Millercf911662011-04-28 14:31:47 -07001793 rth->dst.output = ip_rt_bug;
1794
1795 rth->rt_key_dst = daddr;
1796 rth->rt_key_src = saddr;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001797 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001798 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001799 rth->rt_type = RTN_MULTICAST;
David S. Miller475949d2011-05-03 19:45:15 -07001800 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07001801 rth->rt_dst = daddr;
1802 rth->rt_src = saddr;
1803 rth->rt_route_iif = dev->ifindex;
1804 rth->rt_iif = dev->ifindex;
1805 rth->rt_oif = 0;
1806 rth->rt_mark = skb->mark;
David S. Miller59436342012-07-10 06:58:42 -07001807 rth->rt_pmtu = 0;
David S. Millercf911662011-04-28 14:31:47 -07001808 rth->rt_gateway = daddr;
David S. Millercf911662011-04-28 14:31:47 -07001809 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001810 if (our) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001811 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001812 rth->rt_flags |= RTCF_LOCAL;
1813 }
1814
1815#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001816 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Changli Gaod8d1f302010-06-10 23:31:35 -07001817 rth->dst.input = ip_mr_input;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001818#endif
1819 RT_CACHE_STAT_INC(in_slow_mc);
1820
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001821 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
David S. Millerb23dd4f2011-03-02 14:31:35 -08001822 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
Eric Dumazet9aa3c942011-06-18 11:59:18 -07001823 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001824
1825e_nobufs:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001827e_inval:
Eric Dumazet96d36222010-06-02 19:21:31 +00001828 return -EINVAL;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001829e_err:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001830 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001831}
1832
1833
1834static void ip_handle_martian_source(struct net_device *dev,
1835 struct in_device *in_dev,
1836 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001837 __be32 daddr,
1838 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001839{
1840 RT_CACHE_STAT_INC(in_martian_src);
1841#ifdef CONFIG_IP_ROUTE_VERBOSE
1842 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1843 /*
1844 * RFC1812 recommendation, if source is martian,
1845 * the only hint is MAC header.
1846 */
Joe Perches058bd4d2012-03-11 18:36:11 +00001847 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07001848 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001849 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00001850 print_hex_dump(KERN_WARNING, "ll header: ",
1851 DUMP_PREFIX_OFFSET, 16, 1,
1852 skb_mac_header(skb),
1853 dev->hard_header_len, true);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001854 }
1855 }
1856#endif
1857}
1858
Eric Dumazet47360222010-06-03 04:13:21 +00001859/* called in rcu_read_lock() section */
Stephen Hemminger5969f712008-04-10 01:52:09 -07001860static int __mkroute_input(struct sk_buff *skb,
David S. Miller982721f2011-02-16 21:44:24 -08001861 const struct fib_result *res,
Stephen Hemminger5969f712008-04-10 01:52:09 -07001862 struct in_device *in_dev,
1863 __be32 daddr, __be32 saddr, u32 tos,
1864 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001865{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001866 struct rtable *rth;
1867 int err;
1868 struct in_device *out_dev;
Eric Dumazet47360222010-06-03 04:13:21 +00001869 unsigned int flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001870 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001871
1872 /* get a working reference to the output device */
Eric Dumazet47360222010-06-03 04:13:21 +00001873 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001874 if (out_dev == NULL) {
Joe Perchese87cc472012-05-13 21:56:26 +00001875 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07001876 return -EINVAL;
1877 }
1878
1879
Michael Smith5c04c812011-04-07 04:51:50 +00001880 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
David S. Miller9e56e382012-06-28 18:54:02 -07001881 in_dev->dev, in_dev, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001882 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001883 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001884 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001885
Linus Torvalds1da177e2005-04-16 15:20:36 -07001886 goto cleanup;
1887 }
1888
1889 if (err)
1890 flags |= RTCF_DIRECTSRC;
1891
Thomas Graf51b77ca2008-06-03 16:36:01 -07001892 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001893 (IN_DEV_SHARED_MEDIA(out_dev) ||
1894 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1895 flags |= RTCF_DOREDIRECT;
1896
1897 if (skb->protocol != htons(ETH_P_IP)) {
1898 /* Not IP (i.e. ARP). Do not create route, if it is
1899 * invalid for proxy arp. DNAT routes are always valid.
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00001900 *
1901 * Proxy arp feature have been extended to allow, ARP
1902 * replies back to the same interface, to support
1903 * Private VLAN switch technologies. See arp.c.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001904 */
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00001905 if (out_dev == in_dev &&
1906 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001907 err = -EINVAL;
1908 goto cleanup;
1909 }
1910 }
1911
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001912 rth = rt_dst_alloc(out_dev->dev,
1913 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08001914 IN_DEV_CONF_GET(out_dev, NOXFRM));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001915 if (!rth) {
1916 err = -ENOBUFS;
1917 goto cleanup;
1918 }
1919
David S. Miller5e2b61f2011-03-04 21:47:09 -08001920 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001921 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07001922 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1923 rth->rt_flags = flags;
1924 rth->rt_type = res->type;
David S. Miller475949d2011-05-03 19:45:15 -07001925 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07001926 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001927 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07001928 rth->rt_route_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001929 rth->rt_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001930 rth->rt_oif = 0;
David S. Millercf911662011-04-28 14:31:47 -07001931 rth->rt_mark = skb->mark;
David S. Miller59436342012-07-10 06:58:42 -07001932 rth->rt_pmtu = 0;
David S. Millercf911662011-04-28 14:31:47 -07001933 rth->rt_gateway = daddr;
David S. Millercf911662011-04-28 14:31:47 -07001934 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001935
Changli Gaod8d1f302010-06-10 23:31:35 -07001936 rth->dst.input = ip_forward;
1937 rth->dst.output = ip_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001938
David S. Miller5e2b61f2011-03-04 21:47:09 -08001939 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001940
Linus Torvalds1da177e2005-04-16 15:20:36 -07001941 *result = rth;
1942 err = 0;
1943 cleanup:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001944 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001945}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001946
Stephen Hemminger5969f712008-04-10 01:52:09 -07001947static int ip_mkroute_input(struct sk_buff *skb,
1948 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05001949 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07001950 struct in_device *in_dev,
1951 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001952{
Daniel Baluta5e73ea12012-04-15 01:34:41 +00001953 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001954 int err;
Eric Dumazet95c96172012-04-15 05:58:06 +00001955 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001956
1957#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08001958 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08001959 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001960#endif
1961
1962 /* create a routing cache entry */
1963 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1964 if (err)
1965 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001966
1967 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05001968 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07001969 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05001970 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001971 if (IS_ERR(rth))
1972 return PTR_ERR(rth);
1973 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001974}
1975
Linus Torvalds1da177e2005-04-16 15:20:36 -07001976/*
1977 * NOTE. We drop all the packets that has local source
1978 * addresses, because every properly looped back packet
1979 * must have correct destination already attached by output routine.
1980 *
1981 * Such approach solves two big problems:
1982 * 1. Not simplex devices are handled properly.
1983 * 2. IP spoofing attempts are filtered with 100% of guarantee.
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001984 * called with rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07001985 */
1986
Al Viro9e12bb22006-09-26 21:25:20 -07001987static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
David S. Millerc10237e2012-06-27 17:05:06 -07001988 u8 tos, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001989{
1990 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00001991 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05001992 struct flowi4 fl4;
Eric Dumazet95c96172012-04-15 05:58:06 +00001993 unsigned int flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001994 u32 itag = 0;
Eric Dumazet95c96172012-04-15 05:58:06 +00001995 struct rtable *rth;
1996 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001997 int err = -EINVAL;
Daniel Baluta5e73ea12012-04-15 01:34:41 +00001998 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001999
2000 /* IP on this device is disabled. */
2001
2002 if (!in_dev)
2003 goto out;
2004
2005 /* Check for the most weird martians, which can be not detected
2006 by fib_lookup.
2007 */
2008
Thomas Grafd0daebc32012-06-12 00:44:01 +00002009 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002010 goto martian_source;
2011
Andy Walls27a954b2010-10-17 15:11:22 +00002012 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002013 goto brd_input;
2014
2015 /* Accept zero addresses only to limited broadcast;
2016 * I even do not know to fix it or not. Waiting for complains :-)
2017 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002018 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002019 goto martian_source;
2020
Thomas Grafd0daebc32012-06-12 00:44:01 +00002021 if (ipv4_is_zeronet(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002022 goto martian_destination;
2023
Thomas Grafd0daebc32012-06-12 00:44:01 +00002024 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2025 if (ipv4_is_loopback(daddr))
2026 goto martian_destination;
2027
2028 if (ipv4_is_loopback(saddr))
2029 goto martian_source;
2030 }
2031
Linus Torvalds1da177e2005-04-16 15:20:36 -07002032 /*
2033 * Now we are ready to route packet.
2034 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002035 fl4.flowi4_oif = 0;
2036 fl4.flowi4_iif = dev->ifindex;
2037 fl4.flowi4_mark = skb->mark;
2038 fl4.flowi4_tos = tos;
2039 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2040 fl4.daddr = daddr;
2041 fl4.saddr = saddr;
2042 err = fib_lookup(net, &fl4, &res);
David S. Miller251da412012-06-26 16:27:09 -07002043 if (err != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002044 goto no_route;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002045
2046 RT_CACHE_STAT_INC(in_slow_tot);
2047
2048 if (res.type == RTN_BROADCAST)
2049 goto brd_input;
2050
2051 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002052 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002053 net->loopback_dev->ifindex,
David S. Miller9e56e382012-06-28 18:54:02 -07002054 dev, in_dev, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002055 if (err < 0)
2056 goto martian_source_keep_err;
2057 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002058 flags |= RTCF_DIRECTSRC;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002059 goto local_input;
2060 }
2061
2062 if (!IN_DEV_FORWARD(in_dev))
David S. Miller251da412012-06-26 16:27:09 -07002063 goto no_route;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002064 if (res.type != RTN_UNICAST)
2065 goto martian_destination;
2066
David S. Miller68a5e3d2011-03-11 20:07:33 -05002067 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002068out: return err;
2069
2070brd_input:
2071 if (skb->protocol != htons(ETH_P_IP))
2072 goto e_inval;
2073
David S. Miller41347dc2012-06-28 04:05:27 -07002074 if (!ipv4_is_zeronet(saddr)) {
David S. Miller9e56e382012-06-28 18:54:02 -07002075 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2076 in_dev, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002077 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002078 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079 if (err)
2080 flags |= RTCF_DIRECTSRC;
2081 }
2082 flags |= RTCF_BROADCAST;
2083 res.type = RTN_BROADCAST;
2084 RT_CACHE_STAT_INC(in_brd);
2085
2086local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002087 rth = rt_dst_alloc(net->loopback_dev,
2088 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002089 if (!rth)
2090 goto e_nobufs;
2091
David S. Millercf911662011-04-28 14:31:47 -07002092 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002093 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002094#ifdef CONFIG_IP_ROUTE_CLASSID
2095 rth->dst.tclassid = itag;
2096#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002097
David S. Miller5e2b61f2011-03-04 21:47:09 -08002098 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002099 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002100 rth->rt_genid = rt_genid(net);
2101 rth->rt_flags = flags|RTCF_LOCAL;
2102 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002103 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002104 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002105 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002106 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002107 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002108 rth->rt_oif = 0;
2109 rth->rt_mark = skb->mark;
David S. Miller59436342012-07-10 06:58:42 -07002110 rth->rt_pmtu = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002111 rth->rt_gateway = daddr;
David S. Millercf911662011-04-28 14:31:47 -07002112 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002113 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002114 rth->dst.input= ip_error;
2115 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002116 rth->rt_flags &= ~RTCF_LOCAL;
2117 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002118 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2119 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002120 err = 0;
2121 if (IS_ERR(rth))
2122 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002123 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002124
2125no_route:
2126 RT_CACHE_STAT_INC(in_no_route);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002127 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002128 if (err == -ESRCH)
2129 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002130 goto local_input;
2131
2132 /*
2133 * Do not cache martian addresses: they should be logged (RFC1812)
2134 */
2135martian_destination:
2136 RT_CACHE_STAT_INC(in_martian_dst);
2137#ifdef CONFIG_IP_ROUTE_VERBOSE
Joe Perchese87cc472012-05-13 21:56:26 +00002138 if (IN_DEV_LOG_MARTIANS(in_dev))
2139 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2140 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002141#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002142
Linus Torvalds1da177e2005-04-16 15:20:36 -07002143e_inval:
2144 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002145 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002146
2147e_nobufs:
2148 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002149 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002150
2151martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002152 err = -EINVAL;
2153martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002154 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002155 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002156}
2157
/*
 * Resolve the route for an incoming IPv4 packet.
 *
 * Fast path: look the (daddr, saddr, iif, tos, mark, netns) tuple up in
 * the route hash cache under rcu_read_lock(); on a hit the cached dst is
 * attached to the skb and 0 is returned.  @noref selects the refcount-free
 * attach variant (skb presumably does not outlive the RCU grace period —
 * NOTE(review): confirm against callers).
 *
 * Slow path: multicast destinations go to ip_route_input_mc(), everything
 * else to ip_route_input_slow().  Returns 0 or a negative errno.
 */
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	unsigned int hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		/*
		 * Branch-free key compare: XOR each field against the
		 * candidate and OR the results; zero means all matched.
		 */
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			/* Accept if the group is joined locally, or (with
			 * CONFIG_IP_MROUTE) if this host forwards non-local
			 * multicast on this device.
			 */
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002239
/*
 * Build an output rtable entry from a successful FIB lookup.
 * Called with rcu_read_lock() held.
 *
 * @res is the FIB result, @fl4 the (possibly rewritten) flow key, and
 * the orig_* values are the caller's original key fields, stored as the
 * cache key of the new entry.  Returns the new rtable or an ERR_PTR().
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* Unless route_localnet is enabled on the device, a loopback
	 * source address may not leave through a non-loopback device.
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	/* Reclassify the destination; a zeronet daddr is never routable. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;	/* broadcast never gateways via fib_info */
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	/* The cache key records the caller's original lookup values,
	 * not the rewritten fl4 fields.
	 */
	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;	/* 0 marks this as an output route */
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = fl4->daddr;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
		rth->dst.flags |= DST_NOCACHE;

	return rth;
}
2339
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340/*
2341 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002342 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002343 */
2344
David S. Miller813b3b52011-04-28 14:48:42 -07002345static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002346{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002347 struct net_device *dev_out = NULL;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002348 __u8 tos = RT_FL_TOS(fl4);
David S. Miller813b3b52011-04-28 14:48:42 -07002349 unsigned int flags = 0;
2350 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002351 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002352 __be32 orig_daddr;
2353 __be32 orig_saddr;
2354 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002355
2356 res.fi = NULL;
David S. Miller8b96d222012-06-11 02:01:56 -07002357 res.table = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002358#ifdef CONFIG_IP_MULTIPLE_TABLES
2359 res.r = NULL;
2360#endif
2361
David S. Miller813b3b52011-04-28 14:48:42 -07002362 orig_daddr = fl4->daddr;
2363 orig_saddr = fl4->saddr;
2364 orig_oif = fl4->flowi4_oif;
2365
2366 fl4->flowi4_iif = net->loopback_dev->ifindex;
2367 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2368 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2369 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002370
David S. Miller010c2702011-02-17 15:37:09 -08002371 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002372 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002373 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002374 if (ipv4_is_multicast(fl4->saddr) ||
2375 ipv4_is_lbcast(fl4->saddr) ||
2376 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002377 goto out;
2378
Linus Torvalds1da177e2005-04-16 15:20:36 -07002379 /* I removed check for oif == dev_out->oif here.
2380 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002381 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2382 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002383 2. Moreover, we are allowed to send packets with saddr
2384 of another iface. --ANK
2385 */
2386
David S. Miller813b3b52011-04-28 14:48:42 -07002387 if (fl4->flowi4_oif == 0 &&
2388 (ipv4_is_multicast(fl4->daddr) ||
2389 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002390 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002391 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002392 if (dev_out == NULL)
2393 goto out;
2394
Linus Torvalds1da177e2005-04-16 15:20:36 -07002395 /* Special hack: user can direct multicasts
2396 and limited broadcast via necessary interface
2397 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2398 This hack is not just for fun, it allows
2399 vic,vat and friends to work.
2400 They bind socket to loopback, set ttl to zero
2401 and expect that it will work.
2402 From the viewpoint of routing cache they are broken,
2403 because we are not allowed to build multicast path
2404 with loopback source addr (look, routing cache
2405 cannot know, that ttl is zero, so that packet
2406 will not leave this host and route is valid).
2407 Luckily, this hack is good workaround.
2408 */
2409
David S. Miller813b3b52011-04-28 14:48:42 -07002410 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002411 goto make_route;
2412 }
Julian Anastasova210d012008-10-01 07:28:28 -07002413
David S. Miller813b3b52011-04-28 14:48:42 -07002414 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002415 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002416 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002417 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002418 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002419 }
2420
2421
David S. Miller813b3b52011-04-28 14:48:42 -07002422 if (fl4->flowi4_oif) {
2423 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002424 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425 if (dev_out == NULL)
2426 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002427
2428 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002429 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002430 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002431 goto out;
2432 }
David S. Miller813b3b52011-04-28 14:48:42 -07002433 if (ipv4_is_local_multicast(fl4->daddr) ||
2434 ipv4_is_lbcast(fl4->daddr)) {
2435 if (!fl4->saddr)
2436 fl4->saddr = inet_select_addr(dev_out, 0,
2437 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002438 goto make_route;
2439 }
David S. Miller813b3b52011-04-28 14:48:42 -07002440 if (fl4->saddr) {
2441 if (ipv4_is_multicast(fl4->daddr))
2442 fl4->saddr = inet_select_addr(dev_out, 0,
2443 fl4->flowi4_scope);
2444 else if (!fl4->daddr)
2445 fl4->saddr = inet_select_addr(dev_out, 0,
2446 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002447 }
2448 }
2449
David S. Miller813b3b52011-04-28 14:48:42 -07002450 if (!fl4->daddr) {
2451 fl4->daddr = fl4->saddr;
2452 if (!fl4->daddr)
2453 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002454 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002455 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002456 res.type = RTN_LOCAL;
2457 flags |= RTCF_LOCAL;
2458 goto make_route;
2459 }
2460
David S. Miller813b3b52011-04-28 14:48:42 -07002461 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002462 res.fi = NULL;
David S. Miller8b96d222012-06-11 02:01:56 -07002463 res.table = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002464 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002465 /* Apparently, routing tables are wrong. Assume,
2466 that the destination is on link.
2467
2468 WHY? DW.
2469 Because we are allowed to send to iface
2470 even if it has NO routes and NO assigned
2471 addresses. When oif is specified, routing
2472 tables are looked up with only one purpose:
2473 to catch if destination is gatewayed, rather than
2474 direct. Moreover, if MSG_DONTROUTE is set,
2475 we send packet, ignoring both routing tables
2476 and ifaddr state. --ANK
2477
2478
2479 We could make it even if oif is unknown,
2480 likely IPv6, but we do not.
2481 */
2482
David S. Miller813b3b52011-04-28 14:48:42 -07002483 if (fl4->saddr == 0)
2484 fl4->saddr = inet_select_addr(dev_out, 0,
2485 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002486 res.type = RTN_UNICAST;
2487 goto make_route;
2488 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002489 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002490 goto out;
2491 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002492
2493 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002494 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002495 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002496 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002497 else
David S. Miller813b3b52011-04-28 14:48:42 -07002498 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002499 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002500 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002501 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002502 res.fi = NULL;
2503 flags |= RTCF_LOCAL;
2504 goto make_route;
2505 }
2506
2507#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002508 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002509 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002510 else
2511#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002512 if (!res.prefixlen &&
2513 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002514 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002515 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516
David S. Miller813b3b52011-04-28 14:48:42 -07002517 if (!fl4->saddr)
2518 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002519
Linus Torvalds1da177e2005-04-16 15:20:36 -07002520 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002521 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002522
2523
2524make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002525 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002526 tos, dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002527 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002528 unsigned int hash;
2529
David S. Miller813b3b52011-04-28 14:48:42 -07002530 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002531 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002532 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002533 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002534
David S. Miller010c2702011-02-17 15:37:09 -08002535out:
2536 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002537 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002538}
2539
/*
 * Output-route lookup: try the route hash cache first (under
 * rcu_read_lock_bh), falling back to ip_route_output_slow() on a miss
 * or when caching is disabled.  On a cache hit any unset saddr/daddr in
 * @flp4 is filled in from the cached entry.  Returns the rtable or an
 * ERR_PTR() from the slow path.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		/* TOS must match in the routing bits; RTO_ONLINK is part
		 * of the key as well.
		 */
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2579
/* dst_ops.check for blackhole routes: always report the entry as
 * invalid (NULL) so it is never reused from a socket's dst cache.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2584
Steffen Klassertebb762f2011-11-23 02:12:51 +00002585static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002586{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002587 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2588
2589 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002590}
2591
/* dst_ops.update_pmtu for blackhole routes: intentionally a no-op —
 * PMTU updates are discarded on a blackhole entry.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2595
/* dst_ops.cow_metrics for blackhole routes: refuse to create a writable
 * metrics copy (returns NULL), so metrics stay read-only.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2601
/* dst_ops vtable for blackhole routes (see ipv4_blackhole_route below):
 * shares destroy/advmss/neigh_lookup with normal IPv4 dsts but stubs out
 * check, PMTU updates and metrics COW so the entry stays inert.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2613
/*
 * Clone @dst_orig into a "blackhole" route whose input/output handlers
 * simply discard packets (dst_discard) and whose ops table ignores PMTU
 * and metric updates.  The original dst's key and routing fields are
 * copied so the clone still identifies the same flow.  Consumes the
 * reference on @dst_orig; returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the cache key and routing identity of the original. */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		/* The clone is never hashed into the route cache; mark it
		 * for immediate freeing once its refcount drops.
		 */
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2656
/*
 * Resolve an output route for @flp4 and, when the flow carries a
 * transport protocol, pass the result through the xfrm (IPsec) lookup
 * so that policy transformations can replace the plain dst.
 *
 * Returns a valid rtable pointer or an ERR_PTR(); never NULL.
 */
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	/* Only flows with a known L4 protocol are subject to xfrm policy;
	 * xfrm_lookup() may substitute (and own) a transformed dst.
	 */
	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
2673
/*
 * Serialize one cached route (taken from skb's dst) into an RTM_NEWROUTE
 * netlink message appended to @skb.
 *
 * @pid/@seq/@event/@flags: netlink header fields for the reply.
 * @nowait: for multicast input routes, tells ipmr_get_route() not to
 *          block resolving an unresolved cache entry.
 *
 * Returns the message length from nlmsg_end() on success, 0 when
 * ipmr_get_route() consumed the message itself, or -EMSGSIZE when the
 * skb ran out of tailroom (message is cancelled first).
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Cache entries are always reported as cloned routes; only the
	 * upper 16 flag bits of rt_flags are meaningful to userspace here.
	 */
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
		goto nla_put_failure;
	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* For output routes, report the chosen preferred source address
	 * when it differs from the one the lookup key asked for.
	 */
	if (!rt_is_input_route(rt) &&
	    rt->rt_src != rt->rt_key_src) {
		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
			goto nla_put_failure;
	}
	if (rt->rt_dst != rt->rt_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark &&
	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
		goto nla_put_failure;

	error = rt->dst.error;
	/* Convert the absolute expiry (jiffies) into a remaining delta;
	 * an already-expired entry is reported as 0.
	 */
	expires = rt->dst.expires;
	if (expires) {
		if (time_before(jiffies, expires))
			expires -= jiffies;
		else
			expires = 0;
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Non-local multicast with forwarding enabled: defer to the
		 * multicast routing code, which may answer the query itself.
		 */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					/* report the mroute error via
					 * the cacheinfo attribute */
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2778
/*
 * RTM_GETROUTE handler: resolve a single route query from userspace.
 *
 * Builds a dummy skb so the query can be pushed through the real
 * routing code paths: with RTA_IIF present it simulates packet *input*
 * via ip_route_input(), otherwise it performs an *output* lookup.
 * The resulting route is serialized with rt_fill_info() and unicast
 * back to the requester.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* All attributes are optional; missing ones default to 0. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		/* ip_route_input() expects BH context, as on real receive. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* A "successful" input lookup may still yield an error dst
		 * (e.g. unreachable); surface that as the errno.
		 */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	/* rtnl_unicast() consumes skb regardless of outcome. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2873
/*
 * Netlink dump callback: walk the whole route cache hash table and emit
 * one RTM_NEWROUTE message per live entry belonging to the caller's netns.
 *
 * cb->args[0]/args[1] hold the resume position (hash bucket, index in
 * chain) across successive dump invocations.  Each bucket chain is
 * traversed under rcu_read_lock_bh(), matching the cache's update side.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			/* Skip other netns' entries and already-dumped ones. */
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* noref: we stay inside the RCU section, so no
			 * refcount is taken on the dst while filling.
			 */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* skb full: drop borrowed dst, record resume
				 * point and return what we have so far.
				 */
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
2915
/* Multicast configuration changed on @in_dev: invalidate the whole
 * route cache for that device's netns (delay 0 = flush soon).
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
2920
2921#ifdef CONFIG_SYSCTL
/*
 * Handler for the write-only /proc/sys/net/ipv4/route/flush sysctl.
 * The written integer is a flush delay, parsed via proc_dointvec()
 * through a stack copy of the ctl_table whose ->data is redirected to a
 * local, so concurrent writers cannot race on shared storage.
 * The owning netns was stashed in ->extra1 at registration time.
 * Reads are rejected with -EINVAL.
 */
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
2942
/*
 * Global (not per-netns) tunables exposed under
 * /proc/sys/net/ipv4/route/.  Jiffies-valued entries use the
 * *_jiffies proc handlers so userspace reads/writes seconds (or ms).
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Millisecond view of the same variable as above. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003053
/* Per-netns template for the write-only "flush" sysctl; ->extra1 is
 * filled with the owning struct net at registration time.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3063
/*
 * Per-netns sysctl setup: register "net/ipv4/route" for @net.
 * Non-init netns get their own kmemdup()'d copy of the flush table so
 * each can carry its own ->extra1 back-pointer; init_net uses the
 * static template directly.  Returns 0 or -ENOMEM.
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	/* Let the flush handler find its netns (see ->extra1 use there). */
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
3087
/*
 * Per-netns sysctl teardown: unregister and free the duplicated table.
 * The static template is never handed to a non-init netns, so freeing
 * it here would indicate a logic error — hence the BUG_ON.
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
3097
/* Hooks the per-netns route sysctls into netns create/destroy. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003102#endif
3103
/*
 * Seed the per-netns route-cache generation id and the device-address
 * generation id with random values so cache invalidation cookies are
 * unpredictable across namespaces.  Cannot fail.
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
3112
/* Per-netns genid seeding; no exit hook needed (plain integers). */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3116
/*
 * Allocate and initialize the per-netns inet_peer tree base used for
 * IPv4 peer (per-destination) state.  Returns 0 or -ENOMEM.
 */
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}
3127
/*
 * Tear down the per-netns inet_peer base: detach it from the netns
 * first (so no new lookups find it), then invalidate the whole tree
 * and free the base.
 */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
3136
/* Per-netns lifecycle for the IPv4 inet_peer storage. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003141
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route classification accounting; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003145
/* Boot-time override for the route cache hash size ("rhash_entries=N").
 * 0 (the default) lets alloc_large_system_hash() pick a size.
 */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	/* Malformed values are silently ignored (returning 0 tells the
	 * boot code the parameter was not consumed).
	 */
	ret = kstrtoul(str, 0, &rhash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
3161
/*
 * Boot-time initialization of the IPv4 routing layer: dst slab caches,
 * the route cache hash table, the periodic GC worker, /proc files,
 * xfrm glue, the RTM_GETROUTE handler and the per-netns subsystems.
 * Allocation failures this early are fatal (panic), matching the rest
 * of core networking init.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the same slab as ordinary rtables. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Size the cache hash from available RAM unless overridden by
	 * the rhash_entries= boot parameter; also yields rt_hash_mask
	 * and rt_hash_log.
	 */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* Periodic route cache expiry worker, jittered by net_random(). */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
3224
Al Viroa1bc6eb2008-07-30 06:32:52 -04003225#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Early registration of the static route sysctls for init_net only;
 * per-netns registration happens later via sysctl_route_ops.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
Al Viroa1bc6eb2008-07-30 06:32:52 -04003234#endif