blob: 80997333db0cf0c750acdf4d8b2299788546a913 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090093#include <linux/slab.h>
Herbert Xu352e5122007-11-13 21:34:06 -080094#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020095#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070096#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700106#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700107#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
111
112#define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
Linus Torvalds1da177e2005-04-16 15:20:36 -0700119static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700120static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123static int ip_rt_redirect_number __read_mostly = 9;
124static int ip_rt_redirect_load __read_mostly = HZ / 50;
125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126static int ip_rt_error_cost __read_mostly = HZ;
127static int ip_rt_error_burst __read_mostly = 5 * HZ;
128static int ip_rt_gc_elasticity __read_mostly = 8;
129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256;
Neil Horman1080d702008-10-27 12:28:25 -0700132static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700133
Eric Dumazet125bb8f2009-06-11 20:10:07 +0000134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136
137/*
138 * Interface to generic destination cache.
139 */
140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800142static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700143static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145static void ipv4_link_failure(struct sk_buff *skb);
146static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800147static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148
/* dst_ops.ifdown hook: intentionally a no-op for IPv4 route cache entries;
 * device teardown is handled elsewhere, so nothing to do per-dst here.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153
/* Generic destination-cache operations for IPv4: wires the dst core to the
 * IPv4-specific GC, validity check, PMTU update and output callbacks above.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};
167
/* ECN_OR_COST() expands to the TC_PRIO_* constant used for the odd table
 * slots below (the variants with the low-order TOS bit set).
 */
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Lookup table mapping the 4 IPv4 TOS bits (16 values) to a traffic-control
 * priority (TC_PRIO_*). Indexed directly by the TOS nibble.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
188
189
190/*
191 * Route cache.
192 */
193
194/* The locking scheme is rather straight forward:
195 *
196 * 1) Read-Copy Update protects the buckets of the central route hash.
197 * 2) Only writers remove entries, and they hold the lock
198 * as they look at rtable reference counts.
199 * 3) Only readers acquire references to rtable entries,
200 * they do so with atomic increments and with the
201 * lock held.
202 */
203
/* One bucket of the central route-cache hash: an RCU-protected singly linked
 * chain of rtable entries (linked through dst.rt_next).
 */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Map a bucket index to its (shared) spinlock; RT_HASH_LOCK_SZ is a power
 * of two, so masking distributes buckets over the lock table.
 */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialize the bucket-lock table; boot-time only, hence the
 * panic() on allocation failure.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP build without lock debugging: no bucket locks are needed at all. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700253
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700254static struct rt_hash_bucket *rt_hash_table __read_mostly;
255static unsigned rt_hash_mask __read_mostly;
256static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700257
Eric Dumazet2f970d82006-01-17 02:54:36 -0800258static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Eric Dumazet27f39c72010-05-19 22:07:23 +0000259#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260
/* Hash (daddr, saddr, idx) plus the per-netns generation id into a bucket
 * index. Mixing in genid lets a cache flush invalidate old entries by simply
 * bumping the generation counter.
 */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
268
/* Current route-cache generation for this network namespace. */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
273
Linus Torvalds1da177e2005-04-16 15:20:36 -0700274#ifdef CONFIG_PROC_FS
275struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800276 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800278 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700279};
280
/* Find the first live entry for /proc/net/rt_cache iteration: walk buckets
 * from the top of the table down, returning the first entry that belongs to
 * this seq file's netns and matches the genid snapshot taken at seq_start.
 *
 * On success we return with rcu_read_lock_bh() held (released by
 * rt_cache_seq_stop or when iteration moves past the last entry).
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		/* Cheap unlocked peek to skip empty buckets. */
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
301
/* Advance to the next cache entry after @r, crossing bucket boundaries as
 * needed; no netns/genid filtering here (done by rt_cache_get_next).
 * Drops and re-takes rcu_read_lock_bh() when moving between buckets;
 * returns NULL (with the lock released) when the table is exhausted.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
319
/* Filtered iterator step: skip entries from other network namespaces or
 * stale generations, returning the next visible entry (or NULL).
 */
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
332
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900333static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700334{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900335 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336
337 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900338 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700339 --pos;
340 return pos ? NULL : r;
341}
342
/* seq_file .start: at offset 0 emit the header token after snapshotting the
 * current genid; otherwise resume at entry (*pos - 1) — the header consumed
 * position 0.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
351
352static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353{
Eric Dumazet29e75252008-01-31 17:05:09 -0800354 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700355
356 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900357 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700358 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900359 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700360 ++*pos;
361 return r;
362}
363
/* seq_file .stop: release the RCU BH lock held across iteration — but only
 * if we were actually positioned on an entry (not the header, not the end).
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
369
/* seq_file .show: print the fixed column header for the start token, else
 * one 127-character, space-padded line per cache entry. %n records the
 * printed length so the trailing seq_printf can pad the line to 127 cols.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			/* advmss + 40 reconstructs the MTU-ish figure shown */
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			r->dst.hh ? (r->dst.hh->hh_output ==
				     dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
402
/* seq_file iterator callbacks for /proc/net/rt_cache. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
409
/* open() for /proc/net/rt_cache: per-netns seq_file with iterator state. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
415
/* file_operations for /proc/net/rt_cache (netns-aware open/release). */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
423
424
/* seq_file .start for per-CPU stats: position 0 is the header token; other
 * positions map to the (pos-1)'th possible CPU's stat block. *pos is updated
 * to cpu+1 so .next resumes after the CPU just returned.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
440
/* seq_file .next for per-CPU stats: scan forward from *pos for the next
 * possible CPU; NULL ends the sequence.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}
454
/* seq_file .stop: nothing to release — the per-CPU walk takes no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
459
/* seq_file .show for /proc/net/stat/rt_cache: one header line, then one
 * hex-formatted line per CPU. The first column (entry count) is global, not
 * per-CPU, so it repeats on every row.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
493
/* seq_file iterator callbacks for /proc/net/stat/rt_cache. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
500
501
/* open() for /proc/net/stat/rt_cache; stats are global, so plain seq_open. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
506
/* file_operations for /proc/net/stat/rt_cache. */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
514
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800515#ifdef CONFIG_NET_CLS_ROUTE
/* Dump routing-class accounting: sum the 256 per-class counters across all
 * possible CPUs into a temporary array and write it out as raw binary.
 * Returns 0 on success, -ENOMEM if the scratch buffer cannot be allocated.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800539
/* open() for /proc/net/rt_acct: single-shot seq_file. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
544
/* file_operations for /proc/net/rt_acct (CONFIG_NET_CLS_ROUTE only). */
static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800552#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800553
/* Per-netns /proc setup: create /proc/net/rt_cache, /proc/net/stat/rt_cache
 * and (with CONFIG_NET_CLS_ROUTE) /proc/net/rt_acct. On failure, unwind the
 * entries created so far via the goto chain and return -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800584
/* Per-netns /proc teardown: mirror of ip_rt_do_proc_init. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_NET_CLS_ROUTE
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
593
/* pernet hooks so each network namespace gets its own proc entries. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
598
/* Register the pernet proc handlers at boot. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
603
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800604#else
/* !CONFIG_PROC_FS stub: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700609#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900610
/* Free a route-cache entry after an RCU BH grace period. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
615
/* Drop our reference on the entry, then free it after an RCU BH grace
 * period (same deferred free as rt_free).
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
621
/* True if this entry should be reclaimed aggressively: a broadcast or
 * multicast input route that is not alone in its hash chain.
 */
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggresively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
629
/* True if the entry carries state worth keeping: it was created by a
 * redirect / wants notification, or has an expiry timestamp set.
 */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}
635
636static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637{
638 unsigned long age;
639 int ret = 0;
640
Changli Gaod8d1f302010-06-10 23:31:35 -0700641 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700642 goto out;
643
644 ret = 1;
Changli Gaod8d1f302010-06-10 23:31:35 -0700645 if (rth->dst.expires &&
646 time_after_eq(jiffies, rth->dst.expires))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700647 goto out;
648
Changli Gaod8d1f302010-06-10 23:31:35 -0700649 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700650 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth)))
653 goto out;
654 ret = 1;
655out: return ret;
656}
657
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	/* Recently-used entries score higher: the age is inverted so a small
	 * age yields a large 30-bit counter value.
	 */
	u32 score = ~(jiffies - rt->dst.lastuse) & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	/* Output routes and unicast/non-local input routes get bit 30. */
	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
678
/* True while this netns is still allowed to cache routes: caching is turned
 * off once the emergency rebuild count exceeds the sysctl limit.
 */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
684
/* Compare only the fields that feed the hash function (daddr, saddr, iif);
 * XOR-and-OR form gives a branch-free equality test.
 */
static inline bool compare_hash_inputs(const struct flowi *fl1,
				       const struct flowi *fl2)
{
	return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->iif ^ fl2->iif)) == 0);
}
692
/* Full flow-key equality for cache lookup: addresses, mark, tos+scope (read
 * as a u16 pair), oif and iif must all match. Branch-free XOR/OR form.
 */
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
702
/* Two cache entries match only if their devices live in the same netns. */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
707
/* An entry whose genid differs from its netns' current genid is stale
 * (invalidated by a cache flush) and must be ignored/reclaimed.
 */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
712
/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable * tail;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		/* Lockless peek to skip empty buckets cheaply. */
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
		{
		struct rtable __rcu **prev;
		struct rtable *p;

		/* Re-read under the bucket lock for a stable view. */
		rth = rcu_dereference_protected(rt_hash_table[i].chain,
			lockdep_is_held(rt_hash_lock_addr(i)));

		/*
		 * With namespaces, only expired entries may be flushed;
		 * find the first non-expired entry (tail) and keep the
		 * chain from there.  Entries before it are released after
		 * the unlock; defer releasing the head of the list.
		 */
		for (tail = rth; tail;
		     tail = rcu_dereference_protected(tail->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i))))
			if (!rt_is_expired(tail))
				break;
		if (rth != tail)
			rt_hash_table[i].chain = tail;

		/* call rt_free on entries after the tail requiring flush */
		prev = &rt_hash_table[i].chain;
		for (p = rcu_dereference_protected(*prev,
				lockdep_is_held(rt_hash_lock_addr(i)));
		     p != NULL;
		     p = next) {
			next = rcu_dereference_protected(p->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));
			if (!rt_is_expired(p)) {
				prev = &p->dst.rt_next;
			} else {
				*prev = next;
				rt_free(p);
			}
		}
		}
#else
		/* Single namespace: detach the whole chain and free it all. */
		rth = rcu_dereference_protected(rt_hash_table[i].chain,
			lockdep_is_held(rt_hash_lock_addr(i)));
		rcu_assign_pointer(rt_hash_table[i].chain, NULL);
		tail = NULL;
#endif
		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free the detached prefix [rth, tail) outside the lock. */
		for (; rth != tail; rth = next) {
			next = rcu_dereference_protected(rth->dst.rt_next, 1);
			rt_free(rth);
		}
	}
}
779
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This to have an estimation of rt_chain_length_max
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
790
Eric Dumazet98376382010-03-08 03:20:00 +0000791/*
792 * Given a hash chain and an item in this hash chain,
793 * find if a previous entry has the same hash_inputs
794 * (but differs on tos, mark or oif)
795 * Returns 0 if an alias is found.
796 * Returns ONE if rth has no alias before itself.
797 */
798static int has_noalias(const struct rtable *head, const struct rtable *rth)
799{
800 const struct rtable *aux = head;
801
802 while (aux != rth) {
803 if (compare_hash_inputs(&aux->fl, &rth->fl))
804 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000805 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000806 }
807 return ONE;
808}
809
/*
 * Scan a portion of the hash table (sized by elapsed time since the
 * last scan) and evict expired/aged entries.  While walking, gather
 * per-bucket chain-length statistics (mean and standard deviation in
 * FRACT_BITS fixed point) and update rt_chain_length_max accordingly.
 * Runs in process context from rt_worker_func().
 */
static void rt_check_expire(void)
{
	static unsigned int rover;	/* resume position between calls */
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	/* Number of buckets to scan ~ (time since last run / gc timeout)
	 * scaled by table size, capped at one full pass. */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		/* Lockless empty-bucket check before taking the lock. */
		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					/* Keep entry; deeper entries must be
					 * "fresher" to survive (tmo halves). */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* rt_chain_length_max = max(elasticity, avg + 4*stddev) */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
890
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table,
 * then re-arm ourselves for the next ip_rt_gc_interval.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
900
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	/* +1 guarantees the generation always changes, even if shuffle == 0. */
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
914
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		/* Reschedule-friendly only when not called from softirq. */
		rt_do_flush(!in_softirq());
}
925
/* Flush previously invalidated (expired-generation) entries from the cache */
void rt_cache_flush_batch(void)
{
	rt_do_flush(!in_softirq());
}
931
/*
 * Triggered when a hash chain grows past rt_chain_length_max: warn
 * (rate-limited) and invalidate the namespace's whole cache so it is
 * rebuilt with a fresh generation id.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
938
/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	/* Persistent GC tuning state, shared across invocations. */
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;		/* bucket scan resume point */
	static int equilibrium;		/* target cache population */
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;			/* number of entries to reap */
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Accurate (slower) count now that we've committed to a GC pass. */
	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* One pass over all buckets, resuming at rover. */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Survivor; deeper entries face a
					 * halved timeout. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	/* Fast counter may be stale; confirm with the slow one. */
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Goal met: relax expiration strength for the next round. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out:	return 0;
}
1079
Eric Dumazet98376382010-03-08 03:20:00 +00001080/*
1081 * Returns number of entries in a hash chain that have different hash_inputs
1082 */
1083static int slow_chain_length(const struct rtable *head)
1084{
1085 int length = 0;
1086 const struct rtable *rth = head;
1087
1088 while (rth) {
1089 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001090 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001091 }
1092 return length >> FRACT_BITS;
1093}
1094
/*
 * Insert a freshly built route @rt into hash bucket @hash, or return an
 * existing equivalent entry.  On success the resulting route is handed
 * back either via *@rp (if non-NULL) or attached to @skb's dst.
 * Returns 0 on success, or a negative errno from neighbour binding.
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;
	/* Allow one GC-assisted retry, but only in process context. */
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return err;
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			/* Opportunistically reap stale-generation entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Duplicate found: drop the new route, hand back
			 * the cached one. */
			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->dst);
			return 0;
		}

		/* Track the lowest-scoring unreferenced entry as an
		 * eviction candidate. */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable entry and chain far too long: suspect a hash
		 * attack, rebuild the cache and retry with a new hash. */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/* Temporarily force the most aggressive GC
				 * settings, then restore them. */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->dst);
	return 0;
}
1288
/*
 * Attach an inet_peer entry (per-destination state) to @rt.
 * @create: if nonzero, allocate a peer when none exists yet.
 * cmpxchg() resolves a race with a concurrent binder: the loser
 * releases its reference and keeps the winner's peer.
 */
void rt_bind_peer(struct rtable *rt, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(rt->rt_dst, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
}
1298
/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	/* Serialize: the fallback id is global shared state. */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1318
/*
 * Choose the IP identification field for @iph.  Prefer the monotonic
 * per-destination counter held by the route's inet_peer; fall back to
 * the global randomized generator when no peer can be bound.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		/* Caller passed no dst — unexpected; log the call site. */
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001341
/*
 * Remove @rt from hash bucket @hash and drop the caller's reference.
 * While walking the chain under the bucket lock we also free any
 * entries whose generation shows they belong to a flushed cache.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1361
Eric Dumazeted7865a42010-06-07 21:49:44 -07001362/* called in rcu_read_lock() section */
Al Virof7655222006-09-26 21:25:43 -07001363void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1364 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001365{
1366 int i, k;
Eric Dumazeted7865a42010-06-07 21:49:44 -07001367 struct in_device *in_dev = __in_dev_get_rcu(dev);
Eric Dumazet1c317202010-10-25 21:02:07 +00001368 struct rtable *rth;
1369 struct rtable __rcu **rthp;
Al Virof7655222006-09-26 21:25:43 -07001370 __be32 skeys[2] = { saddr, 0 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07001371 int ikeys[2] = { dev->ifindex, 0 };
Tom Tucker8d717402006-07-30 20:43:36 -07001372 struct netevent_redirect netevent;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001373 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374
Linus Torvalds1da177e2005-04-16 15:20:36 -07001375 if (!in_dev)
1376 return;
1377
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001378 net = dev_net(dev);
Joe Perches9d4fb272009-11-23 10:41:23 -08001379 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1380 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1381 ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001382 goto reject_redirect;
1383
Neil Horman1080d702008-10-27 12:28:25 -07001384 if (!rt_caching(net))
1385 goto reject_redirect;
1386
Linus Torvalds1da177e2005-04-16 15:20:36 -07001387 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1388 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1389 goto reject_redirect;
1390 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1391 goto reject_redirect;
1392 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001393 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001394 goto reject_redirect;
1395 }
1396
1397 for (i = 0; i < 2; i++) {
1398 for (k = 0; k < 2; k++) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07001399 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001400 rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001401
Eric Dumazet1c317202010-10-25 21:02:07 +00001402 rthp = &rt_hash_table[hash].chain;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001403
Linus Torvalds1da177e2005-04-16 15:20:36 -07001404 while ((rth = rcu_dereference(*rthp)) != NULL) {
1405 struct rtable *rt;
1406
1407 if (rth->fl.fl4_dst != daddr ||
1408 rth->fl.fl4_src != skeys[i] ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001409 rth->fl.oif != ikeys[k] ||
David S. Millerc7537962010-11-11 17:07:48 -08001410 rt_is_input_route(rth) ||
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001411 rt_is_expired(rth) ||
Changli Gaod8d1f302010-06-10 23:31:35 -07001412 !net_eq(dev_net(rth->dst.dev), net)) {
1413 rthp = &rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001414 continue;
1415 }
1416
1417 if (rth->rt_dst != daddr ||
1418 rth->rt_src != saddr ||
Changli Gaod8d1f302010-06-10 23:31:35 -07001419 rth->dst.error ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001420 rth->rt_gateway != old_gw ||
Changli Gaod8d1f302010-06-10 23:31:35 -07001421 rth->dst.dev != dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001422 break;
1423
Changli Gaod8d1f302010-06-10 23:31:35 -07001424 dst_hold(&rth->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001425
1426 rt = dst_alloc(&ipv4_dst_ops);
1427 if (rt == NULL) {
1428 ip_rt_put(rth);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001429 return;
1430 }
1431
1432 /* Copy all the information. */
1433 *rt = *rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07001434 rt->dst.__use = 1;
1435 atomic_set(&rt->dst.__refcnt, 1);
1436 rt->dst.child = NULL;
1437 if (rt->dst.dev)
1438 dev_hold(rt->dst.dev);
Changli Gaod8d1f302010-06-10 23:31:35 -07001439 rt->dst.obsolete = -1;
1440 rt->dst.lastuse = jiffies;
1441 rt->dst.path = &rt->dst;
1442 rt->dst.neighbour = NULL;
1443 rt->dst.hh = NULL;
Alexey Dobriyandef8b4f2008-10-28 13:24:06 -07001444#ifdef CONFIG_XFRM
Changli Gaod8d1f302010-06-10 23:31:35 -07001445 rt->dst.xfrm = NULL;
Alexey Dobriyandef8b4f2008-10-28 13:24:06 -07001446#endif
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001447 rt->rt_genid = rt_genid(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001448 rt->rt_flags |= RTCF_REDIRECTED;
1449
1450 /* Gateway is different ... */
1451 rt->rt_gateway = new_gw;
1452
1453 /* Redirect received -> path was valid */
Changli Gaod8d1f302010-06-10 23:31:35 -07001454 dst_confirm(&rth->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001455
1456 if (rt->peer)
1457 atomic_inc(&rt->peer->refcnt);
1458
Changli Gaod8d1f302010-06-10 23:31:35 -07001459 if (arp_bind_neighbour(&rt->dst) ||
1460 !(rt->dst.neighbour->nud_state &
Linus Torvalds1da177e2005-04-16 15:20:36 -07001461 NUD_VALID)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001462 if (rt->dst.neighbour)
1463 neigh_event_send(rt->dst.neighbour, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001464 ip_rt_put(rth);
1465 rt_drop(rt);
1466 goto do_next;
1467 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001468
Changli Gaod8d1f302010-06-10 23:31:35 -07001469 netevent.old = &rth->dst;
1470 netevent.new = &rt->dst;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001471 call_netevent_notifiers(NETEVENT_REDIRECT,
1472 &netevent);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001473
1474 rt_del(hash, rth);
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00001475 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001476 ip_rt_put(rt);
1477 goto do_next;
1478 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479 do_next:
1480 ;
1481 }
1482 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001483 return;
1484
1485reject_redirect:
1486#ifdef CONFIG_IP_ROUTE_VERBOSE
1487 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07001488 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1489 " Advised path = %pI4 -> %pI4\n",
1490 &old_gw, dev->name, &new_gw,
1491 &saddr, &daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001492#endif
Eric Dumazeted7865a42010-06-07 21:49:44 -07001493 ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001494}
1495
1496static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1497{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001498 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001499 struct dst_entry *ret = dst;
1500
1501 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001502 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001503 ip_rt_put(rt);
1504 ret = NULL;
1505 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
Changli Gaod8d1f302010-06-10 23:31:35 -07001506 (rt->dst.expires &&
1507 time_after_eq(jiffies, rt->dst.expires))) {
Al Viro8c7bc842006-09-26 21:26:19 -07001508 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
Denis V. Lunevb00180d2008-07-05 19:04:09 -07001509 rt->fl.oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001510 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511#if RT_CACHE_DEBUG >= 1
Harvey Harrison673d57e2008-10-31 00:53:57 -07001512 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1513 &rt->rt_dst, rt->fl.fl4_tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001514#endif
1515 rt_del(hash, rt);
1516 ret = NULL;
1517 }
1518 }
1519 return ret;
1520}
1521
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start sending
 *	   redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
1537
/*
 * Decide whether to send an ICMP redirect for @skb and, if allowed by
 * the per-dst token bucket, emit it.  Implements the backoff algorithm
 * described above: up to ip_rt_redirect_number redirects with
 * exponentially growing spacing, reset after ip_rt_redirect_silence
 * of quiet time.  Rate state lives in the cached route itself.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	/* Only the in_dev lookup needs RCU protection; copy out what we
	 * need and drop the lock before touching rate state.
	 */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
		rt->dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
		rt->dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->dst.rate_last +
			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->dst.rate_last = jiffies;
		++rt->dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log (rate-limited) the moment we give up on this host. */
		if (log_martians &&
		    rt->dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1587
/*
 * dst.input handler for cache entries whose lookup ended in an error.
 * Translates the stored dst error into an ICMP "destination
 * unreachable" code and reports it to the sender, subject to a
 * token-bucket rate limit (ip_rt_error_burst / ip_rt_error_cost).
 * Always consumes the skb; returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		/* Nothing sensible to report: silently drop. */
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	/* Refill the per-dst token bucket by the elapsed jiffies, capped
	 * at ip_rt_error_burst, and send only if a full cost is available.
	 */
	now = jiffies;
	rt->dst.rate_tokens += now - rt->dst.rate_last;
	if (rt->dst.rate_tokens > ip_rt_error_burst)
		rt->dst.rate_tokens = ip_rt_error_burst;
	rt->dst.rate_last = now;
	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
		rt->dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001624
/*
 * MTU plateau table (descending), used by guess_mtu() when a router
 * sends "fragmentation needed" without a next-hop MTU.
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1632
Stephen Hemminger5969f712008-04-10 01:52:09 -07001633static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001634{
1635 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001636
Linus Torvalds1da177e2005-04-16 15:20:36 -07001637 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1638 if (old_mtu > mtu_plateau[i])
1639 return mtu_plateau[i];
1640 return 68;
1641}
1642
/*
 * Handle an incoming ICMP "fragmentation needed" for the flow described
 * by @iph arriving on @dev: walk the route cache for matching output
 * routes and lower their cached path MTU.  @new_mtu is the next-hop MTU
 * from the ICMP message (0 from pre-RFC1191 routers).  Returns the MTU
 * actually recorded, or @new_mtu if no cache entry matched.
 */
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int ikeys[2] = { dev->ifindex, 0 };
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	/* Try both the wildcard and specific source address / ifindex,
	 * since the offending route may have been cached either way.
	 */
	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->dst.rt_next)) {
				unsigned short mtu = new_mtu;

				/* Skip entries for other flows, input
				 * routes, locked-MTU entries, other
				 * namespaces and stale generations.
				 */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				/* No usable next-hop MTU in the ICMP:
				 * estimate one from the plateau table.
				 */
				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->dst)) {
					if (mtu < dst_mtu(&rth->dst)) {
						dst_confirm(&rth->dst);
						/* Clamp to the floor and
						 * lock so it cannot shrink
						 * further.
						 */
						if (mtu < ip_rt_min_pmtu) {
							u32 lock = dst_metric(&rth->dst,
									      RTAX_LOCK);
							mtu = ip_rt_min_pmtu;
							lock |= (1 << RTAX_MTU);
							dst_metric_set(&rth->dst, RTAX_LOCK,
								       lock);
						}
						dst_metric_set(&rth->dst, RTAX_MTU, mtu);
						dst_set_expires(&rth->dst,
								ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}
1709
1710static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1711{
Rami Rosen6d273f82008-08-06 02:33:49 -07001712 if (dst_mtu(dst) > mtu && mtu >= 68 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001713 !(dst_metric_locked(dst, RTAX_MTU))) {
1714 if (mtu < ip_rt_min_pmtu) {
David S. Millerdefb3512010-12-08 21:16:57 -08001715 u32 lock = dst_metric(dst, RTAX_LOCK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716 mtu = ip_rt_min_pmtu;
David S. Millerdefb3512010-12-08 21:16:57 -08001717 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001718 }
David S. Millerdefb3512010-12-08 21:16:57 -08001719 dst_metric_set(dst, RTAX_MTU, mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001721 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001722 }
1723}
1724
1725static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1726{
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001727 if (rt_is_expired((struct rtable *)dst))
1728 return NULL;
1729 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730}
1731
1732static void ipv4_dst_destroy(struct dst_entry *dst)
1733{
1734 struct rtable *rt = (struct rtable *) dst;
1735 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001736
1737 if (peer) {
1738 rt->peer = NULL;
1739 inet_putpeer(peer);
1740 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001741}
1742
Linus Torvalds1da177e2005-04-16 15:20:36 -07001743
1744static void ipv4_link_failure(struct sk_buff *skb)
1745{
1746 struct rtable *rt;
1747
1748 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1749
Eric Dumazet511c3f92009-06-02 05:14:27 +00001750 rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001751 if (rt)
Changli Gaod8d1f302010-06-10 23:31:35 -07001752 dst_set_expires(&rt->dst, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001753}
1754
1755static int ip_rt_bug(struct sk_buff *skb)
1756{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001757 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1758 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001759 skb->dev ? skb->dev->name : "?");
1760 kfree_skb(skb);
1761 return 0;
1762}
1763
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1772
/*
 * Write the 4-byte source address to use for @rt's outgoing interface
 * into @addr (which may be unaligned — hence the memcpy).  Output
 * routes already carry it; for input routes re-consult the FIB, falling
 * back to an address selection on the output device.
 */
void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt_is_output_route(rt))
		src = rt->rt_src;
	else {
		/* fib_lookup() results are only stable under RCU. */
		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
			src = FIB_RES_PREFSRC(res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1791
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's traffic-class id: each 16-bit half is
 * taken from @tag only if it is still unset, so an existing tag is
 * never overwritten.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 classid = rt->dst.tclassid;

	if (!(classid & 0xFFFF))
		classid |= tag & 0xFFFF;
	if (!(classid & 0xFFFF0000))
		classid |= tag & 0xFFFF0000;
	rt->dst.tclassid = classid;
}
#endif
1801
David S. Miller0dbaee32010-12-13 12:52:14 -08001802static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1803{
1804 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1805
1806 if (advmss == 0) {
1807 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1808 ip_rt_min_advmss);
1809 if (advmss > 65535 - 40)
1810 advmss = 65535 - 40;
1811 }
1812 return advmss;
1813}
1814
/*
 * Finish a freshly built cache entry from the FIB lookup result:
 * next-hop gateway, metrics (MTU, advmss), realm/classid tag and route
 * type.  @itag is the tclassid derived from source validation.
 */
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		dst_import_metrics(dst, fi->fib_metrics);
		if (fi->fib_mtu == 0) {
			/* No configured MTU: inherit the device's, but a
			 * locked metric through a gateway falls back to
			 * the classic 576-byte default.
			 */
			dst_metric_set(dst, RTAX_MTU, dst->dev->mtu);
			if (dst_metric_locked(dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    dst->dev->mtu > 576)
				dst_metric_set(dst, RTAX_MTU, 576);
		}
#ifdef CONFIG_NET_CLS_ROUTE
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		dst_metric_set(dst, RTAX_MTU, dst->dev->mtu);

	/* Sanity-clamp the metrics regardless of where they came from. */
	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
1851
/* called in rcu_read_lock() section */
/*
 * Build and intern a cache entry for a received multicast packet
 * (daddr is multicast).  @our says whether this host is a member of
 * the group, in which case the packet is also delivered locally.
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Source must be a plain unicast address and the frame IPv4. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* Zero source is only tolerated for link-local groups
		 * (e.g. a host still acquiring an address).
		 */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, 0);
		if (err < 0)
			goto e_err;
	}
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	/* Multicast routes must never be used for output. */
	rth->dst.output = ip_rt_bug;
	rth->dst.obsolete = -1;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= init_net.loopback_dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups on a multicast router go through mroute. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1933
1934
1935static void ip_handle_martian_source(struct net_device *dev,
1936 struct in_device *in_dev,
1937 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001938 __be32 daddr,
1939 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001940{
1941 RT_CACHE_STAT_INC(in_martian_src);
1942#ifdef CONFIG_IP_ROUTE_VERBOSE
1943 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1944 /*
1945 * RFC1812 recommendation, if source is martian,
1946 * the only hint is MAC header.
1947 */
Harvey Harrison673d57e2008-10-31 00:53:57 -07001948 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1949 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001950 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001951 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001952 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001953 printk(KERN_WARNING "ll header: ");
1954 for (i = 0; i < dev->hard_header_len; i++, p++) {
1955 printk("%02x", *p);
1956 if (i < (dev->hard_header_len - 1))
1957 printk(":");
1958 }
1959 printk("\n");
1960 }
1961 }
1962#endif
1963}
1964
/* called in rcu_read_lock() section */
/*
 * Build (but do not intern) a forwarding cache entry for a packet
 * received on in_dev->dev and routed out via *res.  On success, stores
 * the new entry in *result and returns 0; otherwise returns a negative
 * errno and leaves *result untouched.
 */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	/* Reverse-path check; also yields itag and the preferred
	 * destination to advertise back to the source.
	 */
	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* err > 0 means the source is reachable out a different device. */
	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->dst.dev	= (out_dev)->dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->dst.obsolete = -1;
	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002062
Stephen Hemminger5969f712008-04-10 01:52:09 -07002063static int ip_mkroute_input(struct sk_buff *skb,
2064 struct fib_result *res,
2065 const struct flowi *fl,
2066 struct in_device *in_dev,
2067 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002068{
Chuck Short7abaa272005-06-22 22:10:23 -07002069 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002070 int err;
2071 unsigned hash;
2072
2073#ifdef CONFIG_IP_ROUTE_MULTIPATH
2074 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2075 fib_select_multipath(fl, res);
2076#endif
2077
2078 /* create a routing cache entry */
2079 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2080 if (err)
2081 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002082
2083 /* put it into the cache */
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002084 hash = rt_hash(daddr, saddr, fl->iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002085 rt_genid(dev_net(rth->dst.dev)));
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00002086 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002087}
2088
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output
 *	routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */
2099
Al Viro9e12bb22006-09-26 21:25:20 -07002100static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002101 u8 tos, struct net_device *dev)
2102{
2103 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002104 struct in_device *in_dev = __in_dev_get_rcu(dev);
Changli Gao58116622010-11-12 18:43:55 +00002105 struct flowi fl = { .fl4_dst = daddr,
2106 .fl4_src = saddr,
2107 .fl4_tos = tos,
2108 .fl4_scope = RT_SCOPE_UNIVERSE,
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002109 .mark = skb->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002110 .iif = dev->ifindex };
2111 unsigned flags = 0;
2112 u32 itag = 0;
2113 struct rtable * rth;
2114 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002115 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002116 int err = -EINVAL;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002117 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002118
2119 /* IP on this device is disabled. */
2120
2121 if (!in_dev)
2122 goto out;
2123
2124 /* Check for the most weird martians, which can be not detected
2125 by fib_lookup.
2126 */
2127
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002128 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002129 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002130 goto martian_source;
2131
Andy Walls27a954b2010-10-17 15:11:22 +00002132 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133 goto brd_input;
2134
2135 /* Accept zero addresses only to limited broadcast;
2136 * I even do not know to fix it or not. Waiting for complains :-)
2137 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002138 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002139 goto martian_source;
2140
Andy Walls27a954b2010-10-17 15:11:22 +00002141 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002142 goto martian_destination;
2143
2144 /*
2145 * Now we are ready to route packet.
2146 */
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002147 err = fib_lookup(net, &fl, &res);
2148 if (err != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002149 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002150 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002151 goto no_route;
2152 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153
2154 RT_CACHE_STAT_INC(in_slow_tot);
2155
2156 if (res.type == RTN_BROADCAST)
2157 goto brd_input;
2158
2159 if (res.type == RTN_LOCAL) {
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002160 err = fib_validate_source(saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002161 net->loopback_dev->ifindex,
2162 dev, &spec_dst, &itag, skb->mark);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002163 if (err < 0)
2164 goto martian_source_keep_err;
2165 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002166 flags |= RTCF_DIRECTSRC;
2167 spec_dst = daddr;
2168 goto local_input;
2169 }
2170
2171 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002172 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002173 if (res.type != RTN_UNICAST)
2174 goto martian_destination;
2175
2176 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002177out: return err;
2178
2179brd_input:
2180 if (skb->protocol != htons(ETH_P_IP))
2181 goto e_inval;
2182
Joe Perchesf97c1e02007-12-16 13:45:43 -08002183 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002184 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2185 else {
2186 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
jamalb0c110c2009-10-18 02:12:33 +00002187 &itag, skb->mark);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002188 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002189 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002190 if (err)
2191 flags |= RTCF_DIRECTSRC;
2192 }
2193 flags |= RTCF_BROADCAST;
2194 res.type = RTN_BROADCAST;
2195 RT_CACHE_STAT_INC(in_brd);
2196
2197local_input:
2198 rth = dst_alloc(&ipv4_dst_ops);
2199 if (!rth)
2200 goto e_nobufs;
2201
Changli Gaod8d1f302010-06-10 23:31:35 -07002202 rth->dst.output= ip_rt_bug;
2203 rth->dst.obsolete = -1;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002204 rth->rt_genid = rt_genid(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205
Changli Gaod8d1f302010-06-10 23:31:35 -07002206 atomic_set(&rth->dst.__refcnt, 1);
2207 rth->dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002208 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Changli Gaod8d1f302010-06-10 23:31:35 -07002209 rth->dst.flags |= DST_NOPOLICY;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210 rth->fl.fl4_dst = daddr;
2211 rth->rt_dst = daddr;
2212 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002213 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002214 rth->fl.fl4_src = saddr;
2215 rth->rt_src = saddr;
2216#ifdef CONFIG_NET_CLS_ROUTE
Changli Gaod8d1f302010-06-10 23:31:35 -07002217 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002218#endif
2219 rth->rt_iif =
2220 rth->fl.iif = dev->ifindex;
Changli Gaod8d1f302010-06-10 23:31:35 -07002221 rth->dst.dev = net->loopback_dev;
2222 dev_hold(rth->dst.dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002223 rth->rt_gateway = daddr;
2224 rth->rt_spec_dst= spec_dst;
Changli Gaod8d1f302010-06-10 23:31:35 -07002225 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002226 rth->rt_flags = flags|RTCF_LOCAL;
2227 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002228 rth->dst.input= ip_error;
2229 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002230 rth->rt_flags &= ~RTCF_LOCAL;
2231 }
2232 rth->rt_type = res.type;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002233 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00002234 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002235 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002236
2237no_route:
2238 RT_CACHE_STAT_INC(in_no_route);
2239 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2240 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002241 if (err == -ESRCH)
2242 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002243 goto local_input;
2244
2245 /*
2246 * Do not cache martian addresses: they should be logged (RFC1812)
2247 */
2248martian_destination:
2249 RT_CACHE_STAT_INC(in_martian_dst);
2250#ifdef CONFIG_IP_ROUTE_VERBOSE
2251 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07002252 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2253 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002254#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002255
2256e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002257 err = -EHOSTUNREACH;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002258 goto out;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002259
Linus Torvalds1da177e2005-04-16 15:20:36 -07002260e_inval:
2261 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002262 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002263
2264e_nobufs:
2265 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002266 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002267
2268martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002269 err = -EINVAL;
2270martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002271 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002272 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002273}
2274
Eric Dumazet407eadd2010-05-10 11:32:55 +00002275int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2276 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002277{
2278 struct rtable * rth;
2279 unsigned hash;
2280 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002281 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002282 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002283
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002284 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002285
Eric Dumazet96d36222010-06-02 19:21:31 +00002286 rcu_read_lock();
2287
Neil Horman1080d702008-10-27 12:28:25 -07002288 if (!rt_caching(net))
2289 goto skip_cache;
2290
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002292 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002293
Linus Torvalds1da177e2005-04-16 15:20:36 -07002294 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002295 rth = rcu_dereference(rth->dst.rt_next)) {
Eric Dumazet0eae88f2010-04-20 19:06:52 -07002296 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2297 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
Stephen Hemmingerc0b8c322008-04-10 04:00:28 -07002298 (rth->fl.iif ^ iif) |
2299 rth->fl.oif |
2300 (rth->fl.fl4_tos ^ tos)) == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002301 rth->fl.mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002302 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002303 !rt_is_expired(rth)) {
Eric Dumazet407eadd2010-05-10 11:32:55 +00002304 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002305 dst_use_noref(&rth->dst, jiffies);
2306 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002307 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002308 dst_use(&rth->dst, jiffies);
2309 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002310 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002311 RT_CACHE_STAT_INC(in_hit);
2312 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002313 return 0;
2314 }
2315 RT_CACHE_STAT_INC(in_hlist_search);
2316 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317
Neil Horman1080d702008-10-27 12:28:25 -07002318skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002319 /* Multicast recognition logic is moved from route cache to here.
2320 The problem was that too many Ethernet cards have broken/missing
2321 hardware multicast filters :-( As result the host on multicasting
2322 network acquires a lot of useless route cache entries, sort of
2323 SDR messages from all the world. Now we try to get rid of them.
2324 Really, provided software IP multicast filter is organized
2325 reasonably (at least, hashed), it does not result in a slowdown
2326 comparing with route cache reject entries.
2327 Note, that multicast routers are not affected, because
2328 route cache entry is created eventually.
2329 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002330 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002331 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002332
Eric Dumazet96d36222010-06-02 19:21:31 +00002333 if (in_dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002334 int our = ip_check_mc(in_dev, daddr, saddr,
Eric Dumazet96d36222010-06-02 19:21:31 +00002335 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002336 if (our
2337#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002338 ||
2339 (!ipv4_is_local_multicast(daddr) &&
2340 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002341#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002342 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002343 int res = ip_route_input_mc(skb, daddr, saddr,
2344 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002345 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002346 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002347 }
2348 }
2349 rcu_read_unlock();
2350 return -EINVAL;
2351 }
Eric Dumazet96d36222010-06-02 19:21:31 +00002352 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2353 rcu_read_unlock();
2354 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002355}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002356EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002357
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002358/* called with rcu_read_lock() */
Stephen Hemminger5969f712008-04-10 01:52:09 -07002359static int __mkroute_output(struct rtable **result,
2360 struct fib_result *res,
2361 const struct flowi *fl,
2362 const struct flowi *oldflp,
2363 struct net_device *dev_out,
2364 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002365{
2366 struct rtable *rth;
2367 struct in_device *in_dev;
2368 u32 tos = RT_FL_TOS(oldflp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002369
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002370 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371 return -EINVAL;
2372
Andy Walls27a954b2010-10-17 15:11:22 +00002373 if (ipv4_is_lbcast(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374 res->type = RTN_BROADCAST;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002375 else if (ipv4_is_multicast(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002376 res->type = RTN_MULTICAST;
Andy Walls27a954b2010-10-17 15:11:22 +00002377 else if (ipv4_is_zeronet(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002378 return -EINVAL;
2379
2380 if (dev_out->flags & IFF_LOOPBACK)
2381 flags |= RTCF_LOCAL;
2382
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002383 in_dev = __in_dev_get_rcu(dev_out);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002384 if (!in_dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002385 return -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002386
Linus Torvalds1da177e2005-04-16 15:20:36 -07002387 if (res->type == RTN_BROADCAST) {
2388 flags |= RTCF_BROADCAST | RTCF_LOCAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002389 res->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002390 } else if (res->type == RTN_MULTICAST) {
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002391 flags |= RTCF_MULTICAST | RTCF_LOCAL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002392 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002393 oldflp->proto))
2394 flags &= ~RTCF_LOCAL;
2395 /* If multicast route do not exist use
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002396 * default one, but do not gateway in this case.
2397 * Yes, it is hack.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002398 */
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002399 if (res->fi && res->prefixlen < 4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002400 res->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002401 }
2402
2403
2404 rth = dst_alloc(&ipv4_dst_ops);
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002405 if (!rth)
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002406 return -ENOBUFS;
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002407
Changli Gaod8d1f302010-06-10 23:31:35 -07002408 atomic_set(&rth->dst.__refcnt, 1);
2409 rth->dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002410 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
Changli Gaod8d1f302010-06-10 23:31:35 -07002411 rth->dst.flags |= DST_NOXFRM;
Herbert Xu42f811b2007-06-04 23:34:44 -07002412 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Changli Gaod8d1f302010-06-10 23:31:35 -07002413 rth->dst.flags |= DST_NOPOLICY;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002414
2415 rth->fl.fl4_dst = oldflp->fl4_dst;
2416 rth->fl.fl4_tos = tos;
2417 rth->fl.fl4_src = oldflp->fl4_src;
2418 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002419 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002420 rth->rt_dst = fl->fl4_dst;
2421 rth->rt_src = fl->fl4_src;
2422 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002423 /* get references to the devices that are to be hold by the routing
Linus Torvalds1da177e2005-04-16 15:20:36 -07002424 cache entry */
Changli Gaod8d1f302010-06-10 23:31:35 -07002425 rth->dst.dev = dev_out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002426 dev_hold(dev_out);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002427 rth->rt_gateway = fl->fl4_dst;
2428 rth->rt_spec_dst= fl->fl4_src;
2429
Changli Gaod8d1f302010-06-10 23:31:35 -07002430 rth->dst.output=ip_output;
2431 rth->dst.obsolete = -1;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002432 rth->rt_genid = rt_genid(dev_net(dev_out));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002433
2434 RT_CACHE_STAT_INC(out_slow_tot);
2435
2436 if (flags & RTCF_LOCAL) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002437 rth->dst.input = ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002438 rth->rt_spec_dst = fl->fl4_dst;
2439 }
2440 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2441 rth->rt_spec_dst = fl->fl4_src;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002442 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002443 !(dev_out->flags & IFF_LOOPBACK)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002444 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002445 RT_CACHE_STAT_INC(out_slow_mc);
2446 }
2447#ifdef CONFIG_IP_MROUTE
2448 if (res->type == RTN_MULTICAST) {
2449 if (IN_DEV_MFORWARD(in_dev) &&
Joe Perchesf97c1e02007-12-16 13:45:43 -08002450 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002451 rth->dst.input = ip_mr_input;
2452 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002453 }
2454 }
2455#endif
2456 }
2457
2458 rt_set_nexthop(rth, res, 0);
2459
2460 rth->rt_flags = flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002461 *result = rth;
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002462 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002463}
2464
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002465/* called with rcu_read_lock() */
Stephen Hemminger5969f712008-04-10 01:52:09 -07002466static int ip_mkroute_output(struct rtable **rp,
2467 struct fib_result *res,
2468 const struct flowi *fl,
2469 const struct flowi *oldflp,
2470 struct net_device *dev_out,
2471 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002472{
Chuck Short7abaa272005-06-22 22:10:23 -07002473 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002474 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2475 unsigned hash;
2476 if (err == 0) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07002477 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002478 rt_genid(dev_net(dev_out)));
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00002479 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002480 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002481
Linus Torvalds1da177e2005-04-16 15:20:36 -07002482 return err;
2483}
2484
Linus Torvalds1da177e2005-04-16 15:20:36 -07002485/*
2486 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002487 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002488 */
2489
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002490static int ip_route_output_slow(struct net *net, struct rtable **rp,
2491 const struct flowi *oldflp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002492{
2493 u32 tos = RT_FL_TOS(oldflp);
Changli Gao58116622010-11-12 18:43:55 +00002494 struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2495 .fl4_src = oldflp->fl4_src,
2496 .fl4_tos = tos & IPTOS_RT_MASK,
2497 .fl4_scope = ((tos & RTO_ONLINK) ?
2498 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002499 .mark = oldflp->mark,
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002500 .iif = net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002501 .oif = oldflp->oif };
2502 struct fib_result res;
Eric Dumazet0197aa32010-09-30 03:33:58 +00002503 unsigned int flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002504 struct net_device *dev_out = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002505 int err;
2506
2507
2508 res.fi = NULL;
2509#ifdef CONFIG_IP_MULTIPLE_TABLES
2510 res.r = NULL;
2511#endif
2512
2513 if (oldflp->fl4_src) {
2514 err = -EINVAL;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002515 if (ipv4_is_multicast(oldflp->fl4_src) ||
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002516 ipv4_is_lbcast(oldflp->fl4_src) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002517 ipv4_is_zeronet(oldflp->fl4_src))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002518 goto out;
2519
Linus Torvalds1da177e2005-04-16 15:20:36 -07002520 /* I removed check for oif == dev_out->oif here.
2521 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002522 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2523 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002524 2. Moreover, we are allowed to send packets with saddr
2525 of another iface. --ANK
2526 */
2527
Joe Perches9d4fb272009-11-23 10:41:23 -08002528 if (oldflp->oif == 0 &&
2529 (ipv4_is_multicast(oldflp->fl4_dst) ||
Andy Walls27a954b2010-10-17 15:11:22 +00002530 ipv4_is_lbcast(oldflp->fl4_dst))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002531 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
Eric Dumazet0197aa32010-09-30 03:33:58 +00002532 dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002533 if (dev_out == NULL)
2534 goto out;
2535
Linus Torvalds1da177e2005-04-16 15:20:36 -07002536 /* Special hack: user can direct multicasts
2537 and limited broadcast via necessary interface
2538 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2539 This hack is not just for fun, it allows
2540 vic,vat and friends to work.
2541 They bind socket to loopback, set ttl to zero
2542 and expect that it will work.
2543 From the viewpoint of routing cache they are broken,
2544 because we are not allowed to build multicast path
2545 with loopback source addr (look, routing cache
2546 cannot know, that ttl is zero, so that packet
2547 will not leave this host and route is valid).
2548 Luckily, this hack is good workaround.
2549 */
2550
2551 fl.oif = dev_out->ifindex;
2552 goto make_route;
2553 }
Julian Anastasova210d012008-10-01 07:28:28 -07002554
2555 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2556 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
Eric Dumazet0197aa32010-09-30 03:33:58 +00002557 if (!__ip_dev_find(net, oldflp->fl4_src, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002558 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002559 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002560 }
2561
2562
2563 if (oldflp->oif) {
Eric Dumazet0197aa32010-09-30 03:33:58 +00002564 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002565 err = -ENODEV;
2566 if (dev_out == NULL)
2567 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002568
2569 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazet0197aa32010-09-30 03:33:58 +00002570 if (rcu_dereference(dev_out->ip_ptr) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002571 goto out; /* Wrong error code */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002572
Joe Perchesf97c1e02007-12-16 13:45:43 -08002573 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
Andy Walls27a954b2010-10-17 15:11:22 +00002574 ipv4_is_lbcast(oldflp->fl4_dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002575 if (!fl.fl4_src)
2576 fl.fl4_src = inet_select_addr(dev_out, 0,
2577 RT_SCOPE_LINK);
2578 goto make_route;
2579 }
2580 if (!fl.fl4_src) {
Joe Perchesf97c1e02007-12-16 13:45:43 -08002581 if (ipv4_is_multicast(oldflp->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002582 fl.fl4_src = inet_select_addr(dev_out, 0,
2583 fl.fl4_scope);
2584 else if (!oldflp->fl4_dst)
2585 fl.fl4_src = inet_select_addr(dev_out, 0,
2586 RT_SCOPE_HOST);
2587 }
2588 }
2589
2590 if (!fl.fl4_dst) {
2591 fl.fl4_dst = fl.fl4_src;
2592 if (!fl.fl4_dst)
2593 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002594 dev_out = net->loopback_dev;
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002595 fl.oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002596 res.type = RTN_LOCAL;
2597 flags |= RTCF_LOCAL;
2598 goto make_route;
2599 }
2600
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002601 if (fib_lookup(net, &fl, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602 res.fi = NULL;
2603 if (oldflp->oif) {
2604 /* Apparently, routing tables are wrong. Assume,
2605 that the destination is on link.
2606
2607 WHY? DW.
2608 Because we are allowed to send to iface
2609 even if it has NO routes and NO assigned
2610 addresses. When oif is specified, routing
2611 tables are looked up with only one purpose:
2612 to catch if destination is gatewayed, rather than
2613 direct. Moreover, if MSG_DONTROUTE is set,
2614 we send packet, ignoring both routing tables
2615 and ifaddr state. --ANK
2616
2617
2618 We could make it even if oif is unknown,
2619 likely IPv6, but we do not.
2620 */
2621
2622 if (fl.fl4_src == 0)
2623 fl.fl4_src = inet_select_addr(dev_out, 0,
2624 RT_SCOPE_LINK);
2625 res.type = RTN_UNICAST;
2626 goto make_route;
2627 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002628 err = -ENETUNREACH;
2629 goto out;
2630 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002631
2632 if (res.type == RTN_LOCAL) {
2633 if (!fl.fl4_src)
2634 fl.fl4_src = fl.fl4_dst;
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002635 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002636 fl.oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002637 res.fi = NULL;
2638 flags |= RTCF_LOCAL;
2639 goto make_route;
2640 }
2641
2642#ifdef CONFIG_IP_ROUTE_MULTIPATH
2643 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2644 fib_select_multipath(&fl, &res);
2645 else
2646#endif
2647 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002648 fib_select_default(net, &fl, &res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002649
2650 if (!fl.fl4_src)
2651 fl.fl4_src = FIB_RES_PREFSRC(res);
2652
Linus Torvalds1da177e2005-04-16 15:20:36 -07002653 dev_out = FIB_RES_DEV(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002654 fl.oif = dev_out->ifindex;
2655
2656
2657make_route:
2658 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2659
Linus Torvalds1da177e2005-04-16 15:20:36 -07002660out: return err;
2661}
2662
Denis V. Lunev611c1832008-01-22 22:06:48 -08002663int __ip_route_output_key(struct net *net, struct rtable **rp,
2664 const struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002665{
Eric Dumazet0197aa32010-09-30 03:33:58 +00002666 unsigned int hash;
2667 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002668 struct rtable *rth;
2669
Neil Horman1080d702008-10-27 12:28:25 -07002670 if (!rt_caching(net))
2671 goto slow_output;
2672
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002673 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002674
2675 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08002676 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002677 rth = rcu_dereference_bh(rth->dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002678 if (rth->fl.fl4_dst == flp->fl4_dst &&
2679 rth->fl.fl4_src == flp->fl4_src &&
David S. Millerc7537962010-11-11 17:07:48 -08002680 rt_is_output_route(rth) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002681 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002682 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002683 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002684 (IPTOS_RT_MASK | RTO_ONLINK)) &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002685 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002686 !rt_is_expired(rth)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002687 dst_use(&rth->dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002688 RT_CACHE_STAT_INC(out_hit);
2689 rcu_read_unlock_bh();
2690 *rp = rth;
2691 return 0;
2692 }
2693 RT_CACHE_STAT_INC(out_hlist_search);
2694 }
2695 rcu_read_unlock_bh();
2696
Neil Horman1080d702008-10-27 12:28:25 -07002697slow_output:
Eric Dumazet0197aa32010-09-30 03:33:58 +00002698 rcu_read_lock();
2699 res = ip_route_output_slow(net, rp, flp);
2700 rcu_read_unlock();
2701 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002702}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002703EXPORT_SYMBOL_GPL(__ip_route_output_key);
2704
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002705static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2706{
2707 return NULL;
2708}
2709
David S. Miller14e50e52007-05-24 18:17:54 -07002710static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2711{
2712}
2713
2714static struct dst_ops ipv4_dst_blackhole_ops = {
2715 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -08002716 .protocol = cpu_to_be16(ETH_P_IP),
David S. Miller14e50e52007-05-24 18:17:54 -07002717 .destroy = ipv4_dst_destroy,
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002718 .check = ipv4_blackhole_dst_check,
David S. Miller14e50e52007-05-24 18:17:54 -07002719 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
David S. Miller14e50e52007-05-24 18:17:54 -07002720};
2721
2722
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002723static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
David S. Miller14e50e52007-05-24 18:17:54 -07002724{
2725 struct rtable *ort = *rp;
2726 struct rtable *rt = (struct rtable *)
2727 dst_alloc(&ipv4_dst_blackhole_ops);
2728
2729 if (rt) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002730 struct dst_entry *new = &rt->dst;
David S. Miller14e50e52007-05-24 18:17:54 -07002731
2732 atomic_set(&new->__refcnt, 1);
2733 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002734 new->input = dst_discard;
2735 new->output = dst_discard;
David S. Millerdefb3512010-12-08 21:16:57 -08002736 dst_copy_metrics(new, &ort->dst);
David S. Miller14e50e52007-05-24 18:17:54 -07002737
Changli Gaod8d1f302010-06-10 23:31:35 -07002738 new->dev = ort->dst.dev;
David S. Miller14e50e52007-05-24 18:17:54 -07002739 if (new->dev)
2740 dev_hold(new->dev);
2741
2742 rt->fl = ort->fl;
2743
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002744 rt->rt_genid = rt_genid(net);
David S. Miller14e50e52007-05-24 18:17:54 -07002745 rt->rt_flags = ort->rt_flags;
2746 rt->rt_type = ort->rt_type;
2747 rt->rt_dst = ort->rt_dst;
2748 rt->rt_src = ort->rt_src;
2749 rt->rt_iif = ort->rt_iif;
2750 rt->rt_gateway = ort->rt_gateway;
2751 rt->rt_spec_dst = ort->rt_spec_dst;
2752 rt->peer = ort->peer;
2753 if (rt->peer)
2754 atomic_inc(&rt->peer->refcnt);
2755
2756 dst_free(new);
2757 }
2758
Changli Gaod8d1f302010-06-10 23:31:35 -07002759 dst_release(&(*rp)->dst);
David S. Miller14e50e52007-05-24 18:17:54 -07002760 *rp = rt;
Eric Dumazeta02cec22010-09-22 20:43:57 +00002761 return rt ? 0 : -ENOMEM;
David S. Miller14e50e52007-05-24 18:17:54 -07002762}
2763
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002764int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2765 struct sock *sk, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002766{
2767 int err;
2768
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002769 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002770 return err;
2771
2772 if (flp->proto) {
2773 if (!flp->fl4_src)
2774 flp->fl4_src = (*rp)->rt_src;
2775 if (!flp->fl4_dst)
2776 flp->fl4_dst = (*rp)->rt_dst;
Alexey Dobriyan52479b62008-11-25 17:35:18 -08002777 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
Herbert Xubb728452007-12-12 18:48:58 -08002778 flags ? XFRM_LOOKUP_WAIT : 0);
David S. Miller14e50e52007-05-24 18:17:54 -07002779 if (err == -EREMOTE)
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002780 err = ipv4_dst_blackhole(net, rp, flp);
David S. Miller14e50e52007-05-24 18:17:54 -07002781
2782 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002783 }
2784
2785 return 0;
2786}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002787EXPORT_SYMBOL_GPL(ip_route_output_flow);
2788
/*
 * Resolve an output route for @flp with no socket context and no xfrm
 * wait flags: thin wrapper around ip_route_output_flow().
 */
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
EXPORT_SYMBOL(ip_route_output_key);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794
/*
 * Build an RTM_NEWROUTE-style netlink message (type @event) in @skb
 * describing the route attached to the skb (skb_rtable()).
 *
 * @pid, @seq and @flags go into the netlink header.  @nowait is
 * forwarded to ipmr_get_route() for multicast input routes.
 *
 * Returns the value of nlmsg_end() on success, 0 when ipmr_get_route()
 * reported 0 in the !nowait case, or -EMSGSIZE when the skb ran out of
 * space (the partial message is cancelled first).
 *
 * Note: the NLA_PUT* macros jump to nla_put_failure when the skb has
 * no room left.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Every cache entry is reported as cloned. */
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	/* Preferred source: spec_dst for input routes, rt_src otherwise
	 * (only when it differs from the flow's source). */
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->fl.mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);

	error = rt->dst.error;
	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
	if (rt->peer) {
		/* IP id and TCP timestamp info come from the inet peer. */
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Forwarded multicast destinations are resolved through
		 * the multicast routing code instead of reporting RTA_IIF. */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2893
/*
 * RTM_GETROUTE handler: resolve a single route on behalf of userspace.
 *
 * When RTA_IIF is supplied, the request is treated as if a packet with
 * the given src/dst/tos/mark had arrived on that device: a dummy skb is
 * built and pushed through ip_route_input() (under local_bh_disable()).
 * Otherwise an output route is looked up with ip_route_output_key().
 * The resulting route is serialized with rt_fill_info() and unicast
 * back to the requesting pid.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* Absent attributes default to 0. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		/* A route may have been attached with an error set on it. */
		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi fl = {
			.fl4_dst = dst,
			.fl4_src = src,
			.fl4_tos = rtm->rtm_tos,
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.mark = mark,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2984
/*
 * Netlink dump callback: walk the route cache hash table and emit one
 * RTM_NEWROUTE message per live entry.
 *
 * cb->args[0] (hash bucket) and cb->args[1] (index within the chain)
 * carry the resume point between successive dump calls.  Each chain is
 * traversed under rcu_read_lock_bh(); the current entry is temporarily
 * attached to @skb with skb_dst_set_noref() so rt_fill_info() can read
 * it, then dropped again.  Returns skb->len as dump callbacks do.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			/* Skip entries from other namespaces, entries already
			 * dumped in a previous call, and expired entries. */
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* Out of room: record position and stop. */
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3026
/*
 * Multicast configuration changed on @in_dev: flush the route cache of
 * that device's network namespace (with no delay).
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3031
3032#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003033static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003034 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003035 size_t *lenp, loff_t *ppos)
3036{
3037 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003038 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003039 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003040 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003041
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003042 memcpy(&ctl, __ctl, sizeof(ctl));
3043 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003044 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003045
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003046 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003047 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003048 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003049 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003050
3051 return -EINVAL;
3052}
3053
/*
 * Tunables exposed under /proc/sys/net/ipv4/route/ (registered via
 * ipv4_skeleton).  All entries are plain ints; the *_jiffies handlers
 * convert between user-visible time units and jiffies.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as gc_min_interval, millisecond units. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003164
/* Placeholder (terminator-only) table used as an empty child. */
static struct ctl_table empty[1];

/*
 * Skeleton directories under net.ipv4, registered early in boot by
 * ip_static_sysctl_init(): "route" holds ipv4_route_table, "neigh" is
 * created empty.
 */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
3175
/* sysctl path components "net.ipv4" for the skeleton registration. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003181
/*
 * Per-namespace write-only trigger net.ipv4.route.flush; ->extra1 is
 * filled in with the owning struct net at registration time.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3191
/* sysctl path components "net.ipv4.route" for the flush table. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3198
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003199static __net_init int sysctl_route_net_init(struct net *net)
3200{
3201 struct ctl_table *tbl;
3202
3203 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003204 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003205 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3206 if (tbl == NULL)
3207 goto err_dup;
3208 }
3209 tbl[0].extra1 = net;
3210
3211 net->ipv4.route_hdr =
3212 register_net_sysctl_table(net, ipv4_route_path, tbl);
3213 if (net->ipv4.route_hdr == NULL)
3214 goto err_reg;
3215 return 0;
3216
3217err_reg:
3218 if (tbl != ipv4_route_flush_table)
3219 kfree(tbl);
3220err_dup:
3221 return -ENOMEM;
3222}
3223
3224static __net_exit void sysctl_route_net_exit(struct net *net)
3225{
3226 struct ctl_table *tbl;
3227
3228 tbl = net->ipv4.route_hdr->ctl_table_arg;
3229 unregister_net_sysctl_table(net->ipv4.route_hdr);
3230 BUG_ON(tbl == ipv4_route_flush_table);
3231 kfree(tbl);
3232}
3233
/* Per-namespace setup/teardown of the route flush sysctl. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003238#endif
3239
/*
 * Seed the per-namespace route cache generation id with random bytes;
 * runs for every new network namespace via pernet init.
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3250
3251
Linus Torvalds1da177e2005-04-16 15:20:36 -07003252#ifdef CONFIG_NET_CLS_ROUTE
Tejun Heo7d720c32010-02-16 15:20:26 +00003253struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003254#endif /* CONFIG_NET_CLS_ROUTE */
3255
/*
 * "rhash_entries=N" boot parameter: requested number of entries for the
 * IP route cache hash table (consumed by ip_rt_init()).  Returning 1
 * tells the boot code the option was handled.
 */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
3265
/*
 * Boot-time initialization of the IPv4 routing subsystem: slab caches,
 * dst entry counters, the route cache hash table, devinet/fib init,
 * the periodic expiry worker, proc files, xfrm hookup, the
 * RTM_GETROUTE netlink handler and the pernet sysctl/genid subsystems.
 * Panics on allocation failures that would leave routing unusable.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the regular rtable slab cache. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Size the route cache hash from rhash_entries= or total RAM;
	 * rt_hash_log/rt_hash_mask are filled in by the allocator. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* proc failure is non-fatal: routing works without the files. */
	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3329
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the net.ipv4 sysctl skeleton early in boot. */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif