blob: 75ef66f31832fe96ecfe019dd36e4351efb9d740 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090093#include <linux/slab.h>
Herbert Xu352e5122007-11-13 21:34:06 -080094#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020095#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070096#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700106#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700107#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
David S. Millere997d472011-08-03 20:50:44 -0700111#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700112
/* Reduce a flowi4 TOS to the bits relevant for route lookup; the mask
 * also passes RTO_ONLINK, which is carried in the same field.
 */
#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

/* Upper bound on any IPv4 MTU value handled here. */
#define IP_MAX_MTU	0xFFF0

/* Default route-cache garbage collection timeout (jiffies). */
#define RT_GC_TIMEOUT	(300*HZ)
119
/* Routing cache / ICMP rate-limit tunables.  Marked __read_mostly since
 * they are read on hot paths but written rarely (presumably adjustable
 * via the sysctl table later in this file — not visible in this chunk).
 */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
/* Silence period = redirect_load << (redirect_number + 1). */
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20; /* payload + IP hdr + TCP hdr */
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135/*
136 * Interface to generic destination cache.
137 */
138
139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800140static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
David S. Millerd33e4552010-12-14 13:01:14 -0800141static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700142static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700143static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144static void ipv4_link_failure(struct sk_buff *skb);
145static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800146static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700147
/* dst_ops->ifdown callback.  Intentionally empty: this flavor of the
 * IPv4 routing cache keeps no per-route state that must be torn down
 * when the underlying device goes away.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700152
/* Copy-on-write a route's metrics array.
 *
 * Called when a writable metrics block is needed while dst->_metrics
 * still points at read-only storage (@old).  The writable copy lives in
 * the inet_peer bound to the route: bind one if necessary, seed its
 * metrics from the old array on first use, then publish the new pointer
 * with cmpxchg().
 *
 * Returns the writable metrics array, or NULL when no peer could be
 * bound or a concurrent writer installed a read-only pointer first.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		/* First user of this peer's metrics: inherit current values. */
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			/* Lost the race: use whatever the winner installed,
			 * unless it is flagged read-only.
			 */
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			/* We won: the fib_info's metrics are no longer
			 * referenced through this dst, drop our hold on it.
			 */
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
187
/* Hooks connecting the IPv4 routing cache to the generic dst layer. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};
203
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Map TOS values to packet-scheduler priority bands.  Entries come in
 * pairs so the ECN variant of each value maps to the same band
 * (presumably indexed by tos >> 1 at the call sites — confirm there).
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
224
225
226/*
227 * Route cache.
228 */
229
230/* The locking scheme is rather straight forward:
231 *
232 * 1) Read-Copy Update protects the buckets of the central route hash.
233 * 2) Only writers remove entries, and they hold the lock
234 * as they look at rtable reference counts.
235 * 3) Only readers acquire references to rtable entries,
236 * they do so with atomic increments and with the
237 * lock held.
238 */
239
/* One hash bucket: the RCU-protected head of a chain of rtables. */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
Neil Horman1080d702008-10-27 12:28:25 -0700243
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700244#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
245 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700246/*
247 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
248 * The size of this table is a power of two and depends on the number of CPUS.
Ingo Molnar62051202006-07-03 00:24:59 -0700249 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700250 */
Ingo Molnar62051202006-07-03 00:24:59 -0700251#ifdef CONFIG_LOCKDEP
252# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700253#else
Ingo Molnar62051202006-07-03 00:24:59 -0700254# if NR_CPUS >= 32
255# define RT_HASH_LOCK_SZ 4096
256# elif NR_CPUS >= 16
257# define RT_HASH_LOCK_SZ 2048
258# elif NR_CPUS >= 8
259# define RT_HASH_LOCK_SZ 1024
260# elif NR_CPUS >= 4
261# define RT_HASH_LOCK_SZ 512
262# else
263# define RT_HASH_LOCK_SZ 256
264# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700265#endif
266
267static spinlock_t *rt_hash_locks;
268# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800269
/* Allocate and initialize the table of RT_HASH_LOCK_SZ spinlocks that
 * stripes writer locking across the route hash.  Boot-time only
 * (__init); allocation failure is fatal, hence the panic().
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700282#else
283# define rt_hash_lock_addr(slot) NULL
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800284
/* No lock table on !SMP/!lockdep builds (rt_hash_lock_addr is NULL). */
static inline void rt_hash_lock_init(void)
{
}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700288#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289
/* The central route-cache hash table, its mask (size - 1) and log2 size. */
static struct rt_hash_bucket *rt_hash_table __read_mostly;
static unsigned rt_hash_mask __read_mostly;
static unsigned int rt_hash_log __read_mostly;

/* Per-cpu cache statistics, incremented locklessly via this_cpu ops. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296
/* Hash a flow (daddr, saddr, interface index) into a bucket index.
 * @genid is mixed in so that bumping the generation id effectively
 * rehashes (invalidates) all previously cached entries.
 */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
304
/* Current route-cache generation id for a network namespace. */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
309
Linus Torvalds1da177e2005-04-16 15:20:36 -0700310#ifdef CONFIG_PROC_FS
/* Iterator state for /proc/net/rt_cache; seq_net_private must stay the
 * first member so the seq_open_net()/seq_file_net() helpers work.
 */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;	/* hash bucket currently being walked */
	int genid;	/* generation snapshot taken at seq start */
};
316
/* Find the first cached route visible to this seq_file: scan buckets
 * from the top of the table, skipping entries from other namespaces or
 * stale generations.  On a non-NULL return, rcu_read_lock_bh() is held
 * and must be released by the iteration's stop path.
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		/* Cheap unlocked peek to skip empty buckets. */
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;	/* BH-RCU lock stays held */
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
337
/* Advance to the next entry after @r, crossing bucket boundaries as
 * needed.  Drops rcu_read_lock_bh() while probing for the next
 * non-empty bucket and re-takes it before dereferencing the chain.
 * Returns NULL (lock released) when the table is exhausted.  Does no
 * namespace/genid filtering — that is the caller's job.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
355
/* Filtered successor: step with __rt_cache_get_next() until an entry
 * matching this seq_file's namespace and genid snapshot is found.
 */
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
368
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900369static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900371 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700372
373 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900374 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700375 --pos;
376 return pos ? NULL : r;
377}
378
/* seq_file ->start: position 0 yields the header token and snapshots
 * the current genid into the per-open iterator state; later positions
 * reuse the snapshot taken on the first start of this open.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
387
388static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
389{
Eric Dumazet29e75252008-01-31 17:05:09 -0800390 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700391
392 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900393 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700394 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900395 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700396 ++*pos;
397 return r;
398}
399
/* seq_file ->stop: release the BH-RCU lock that rt_cache_get_first()
 * left held, unless iteration never produced a real entry.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
405
/* seq_file ->show for /proc/net/rt_cache: emit the column header for
 * the start token, otherwise one formatted row per cached route.  The
 * "%n" captures the row length so it can be space-padded to a fixed
 * 127-character width.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			r->dst.hh ? (r->dst.hh->hh_output ==
				     dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
438
/* seq_file iterator for /proc/net/rt_cache. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
445
/* open() handler: per-netns seq_file with our iterator state attached. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
451
/* File operations for /proc/net/rt_cache. */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
459
460
/* seq_file ->start for the per-cpu statistics file.  Position 0 is the
 * header token; position n+1 encodes "cpu n", so resume by scanning
 * from *pos - 1 for the next possible CPU.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
476
/* seq_file ->next: advance to the next possible CPU's statistics,
 * keeping the "*pos == cpu + 1" encoding in sync.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}
490
/* seq_file ->stop: nothing was locked, nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
495
/* seq_file ->show: header line for the start token, otherwise one line
 * of counters per CPU.  The first column of every data line repeats the
 * global dst entry count (slow, exact read), not a per-cpu value.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
529
/* seq_file iterator for the per-cpu statistics file. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
536
537
/* open() handler for the per-cpu statistics file (no private state). */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
542
/* File operations for /proc/net/stat/rt_cache. */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
550
Patrick McHardyc7066f72011-01-14 13:36:42 +0100551#ifdef CONFIG_IP_ROUTE_CLASSID
/* Dump route-class accounting: sum the 256-entry per-cpu counter arrays
 * into a temporary buffer and write it out raw (binary, not text).
 * Note @i iterates CPUs, @j iterates the 256 accounting classes.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800575
/* open() handler for /proc/net/rt_acct (single-shot show). */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
580
/* File operations for /proc/net/rt_acct. */
static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800588#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800589
/* Per-netns proc setup: create /proc/net/rt_cache, the per-cpu stats
 * file under /proc/net/stat, and (with CONFIG_IP_ROUTE_CLASSID) the
 * rt_acct accounting file.  Unwinds already-created entries via the
 * goto ladder on failure; returns 0 or -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800620
/* Per-netns proc teardown; mirror of ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
629
/* Register the proc files for every network namespace. */
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
634
/* Hook the per-netns proc setup into the pernet subsystem. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
639
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800640#else
/* No-op stub when CONFIG_PROC_FS is disabled. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700645#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900646
/* Free a cache entry after a BH-RCU grace period, so concurrent lockless
 * readers traversing the chain can finish safely.
 */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
651
/* Like rt_free(), but also drops the caller's reference first. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
657
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	 * collide in the hash table with more useful entries.
	 */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
665
/* An entry is worth keeping if it was learned via a redirect, is
 * flagged for notification, or its peer still has a pending PMTU
 * expiry to track.
 */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
671
672static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
673{
674 unsigned long age;
675 int ret = 0;
676
Changli Gaod8d1f302010-06-10 23:31:35 -0700677 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700678 goto out;
679
Changli Gaod8d1f302010-06-10 23:31:35 -0700680 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700681 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
682 (age <= tmo2 && rt_valuable(rth)))
683 goto out;
684 ret = 1;
685out: return ret;
686}
687
688/* Bits of score are:
689 * 31: very valuable
690 * 30: not quite useless
691 * 29..0: usage counter
692 */
693static inline u32 rt_score(struct rtable *rt)
694{
Changli Gaod8d1f302010-06-10 23:31:35 -0700695 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700696
697 score = ~score & ~(3<<30);
698
699 if (rt_valuable(rt))
700 score |= (1<<31);
701
David S. Millerc7537962010-11-11 17:07:48 -0800702 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700703 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
704 score |= (1<<30);
705
706 return score;
707}
708
/* True while route caching is enabled for this namespace, i.e. the
 * emergency rebuild counter has not yet exceeded the sysctl limit.
 */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
714
David S. Miller5e2b61f2011-03-04 21:47:09 -0800715static inline bool compare_hash_inputs(const struct rtable *rt1,
716 const struct rtable *rt2)
Neil Horman1080d702008-10-27 12:28:25 -0700717{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800718 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
719 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
Julian Anastasov025fd912011-08-09 04:01:16 +0000720 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
Neil Horman1080d702008-10-27 12:28:25 -0700721}
722
/*
 * Full cache-key comparison: destination, source, fwmark, tos and both
 * interface indices must all match.  Written in branchless XOR/OR form:
 * each XOR is zero only when its two fields are equal, so OR-ing the
 * terms yields zero exactly when every field matches.  The __force
 * casts silence sparse about mixing __be32 with plain u32 arithmetic.
 */
static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
732
Denis V. Lunevb5921912008-01-22 23:50:25 -0800733static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
734{
Changli Gaod8d1f302010-06-10 23:31:35 -0700735 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800736}
737
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700738static inline int rt_is_expired(struct rtable *rth)
739{
Changli Gaod8d1f302010-06-10 23:31:35 -0700740 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700741}
742
/*
 * Perform a full scan of the hash table and free all entries belonging
 * to @net (or every entry when @net is NULL).
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		/* Only process context may sleep; softirqs must not. */
		if (process_context && need_resched())
			cond_resched();
		/* Lockless peek so empty buckets skip the lock entirely. */
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		/* Under the bucket lock, unlink matching entries onto a
		 * private list; they are freed after the lock is dropped
		 * to keep the critical section short.
		 */
		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			/* !net means "flush all namespaces". */
			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* rt_free() defers destruction until an RCU grace period,
		 * so concurrent lockless readers stay safe.
		 */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
793
Neil Horman1080d702008-10-27 12:28:25 -0700794/*
795 * While freeing expired entries, we compute average chain length
796 * and standard deviation, using fixed-point arithmetic.
797 * This to have an estimation of rt_chain_length_max
798 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
800 */
801
802#define FRACT_BITS 3
803#define ONE (1UL << FRACT_BITS)
804
Eric Dumazet98376382010-03-08 03:20:00 +0000805/*
806 * Given a hash chain and an item in this hash chain,
807 * find if a previous entry has the same hash_inputs
808 * (but differs on tos, mark or oif)
809 * Returns 0 if an alias is found.
810 * Returns ONE if rth has no alias before itself.
811 */
812static int has_noalias(const struct rtable *head, const struct rtable *rth)
813{
814 const struct rtable *aux = head;
815
816 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800817 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000818 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000819 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000820 }
821 return ONE;
822}
823
Eric Dumazet29e75252008-01-31 17:05:09 -0800824/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300825 * Perturbation of rt_genid by a small quantity [1..256]
Eric Dumazet29e75252008-01-31 17:05:09 -0800826 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
827 * many times (2^24) without giving recent rt_genid.
828 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700829 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700830static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700831{
Eric Dumazet29e75252008-01-31 17:05:09 -0800832 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700833
Eric Dumazet29e75252008-01-31 17:05:09 -0800834 get_random_bytes(&shuffle, sizeof(shuffle));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700835 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700836}
837
/*
 * Flush the routing cache for @net.
 * delay < 0 : invalidate cache only (fast: entries will be deleted
 *             lazily as they are found expired)
 * delay >= 0 : invalidate & flush cache synchronously (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
848
/* Flush previously invalidated entries from the cache for @net
 * (synchronous walk of the whole hash table).
 */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
854
/* Emergency response to an over-long hash chain: warn (rate-limited)
 * and invalidate the whole cache generation for @net.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
861
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */

/* Returns 1 on irrecoverable overflow, 0 otherwise.  Uses function-static
 * state (expire/last_gc/rover/equilibrium), so it is effectively global.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Re-read with the accurate (slow) counter before sizing the work. */
	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* Round-robin scan starting where the previous run stopped. */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Halve the timeout as we walk deeper
					 * into the chain: long chains age faster.
					 */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halfed.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	/* Fast counter may over-estimate; confirm with the slow one. */
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Relax the expiration strength again while load is low. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
994
Eric Dumazet98376382010-03-08 03:20:00 +0000995/*
996 * Returns number of entries in a hash chain that have different hash_inputs
997 */
998static int slow_chain_length(const struct rtable *head)
999{
1000 int length = 0;
1001 const struct rtable *rth = head;
1002
1003 while (rth) {
1004 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001005 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001006 }
1007 return length >> FRACT_BITS;
1008}
1009
/*
 * Insert @rt into hash chain @hash, or return an equivalent entry that
 * is already cached.  The returned route (possibly pre-existing) is
 * handed to the caller and, when @skb is given, attached via
 * skb_dst_set().  Returns ERR_PTR() when neighbour binding fails.
 * @ifindex is used to recompute the hash after an emergency rebuild.
 */
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;
	/* Retry GC-on-ENOBUFS only once, and only from process context. */
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		/* Opportunistically purge stale-generation entries. */
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Duplicate: drop the new entry, hand back the old. */
			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		/* Track the lowest-scoring unreferenced entry as the
		 * eviction candidate should the chain grow too long.
		 */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable entry: if even the de-aliased chain length
		 * exceeds the limit, trigger an emergency cache rebuild and
		 * retry with the new generation's hash.
		 */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				/* Temporarily force maximally aggressive GC. */
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1189
/* Global generation counter for inetpeer-derived route state; bumped
 * (e.g. by ip_rt_redirect()) to lazily invalidate the peer data cached
 * in every rtable via rt->rt_peer_genid.
 */
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
1196
/*
 * Attach the inet_peer entry for @daddr to @rt->peer.  @create is
 * forwarded to inet_getpeer_v4() — presumably whether to allocate a
 * peer when none exists (TODO confirm against inetpeer.c).  Racing
 * binders are resolved by cmpxchg: the loser drops its fresh reference.
 */
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	/* Only one CPU installs its peer; everyone else releases theirs. */
	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
1208
/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	/* Chain each new id off the previous one, keyed by destination,
	 * under a spinlock so the static state stays consistent.
	 */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1228
/*
 * Choose the IP identification field for @iph.  Prefer the per-peer
 * id generator attached to the route; fall back to the global
 * ip_select_fb_ident() when no peer can be bound (or no dst given).
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001251
/*
 * Remove @rt from hash bucket @hash and drop the caller's reference.
 * While walking the chain under the bucket lock we also purge any
 * entries from a stale generation.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1271
/* Process an ICMP redirect: validate the advertised gateway @new_gw and,
 * if acceptable, record it on the inet_peer for @daddr and bump the peer
 * generation so cached routes re-check it.  Called in an
 * rcu_read_lock() section.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject no-op redirects, disabled reception, and gateways that
	 * cannot be valid next hops (multicast/broadcast/zero addresses).
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	peer = inet_getpeer_v4(daddr, 1);
	if (peer) {
		peer->redirect_learned.a4 = new_gw;

		inet_putpeer(peer);

		/* Invalidate peer data cached in existing routes. */
		atomic_inc(&__rt_peer_genid);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
1319
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001320static bool peer_pmtu_expired(struct inet_peer *peer)
1321{
1322 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1323
1324 return orig &&
1325 time_after_eq(jiffies, orig) &&
1326 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1327}
1328
1329static bool peer_pmtu_cleaned(struct inet_peer *peer)
1330{
1331 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1332
1333 return orig &&
1334 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1335}
1336
/*
 * dst_ops->negative_advice callback: the caller reports that @dst gave
 * bad service.  Obsolete entries are released; redirected entries are
 * removed from the cache; otherwise an expired peer PMTU is rolled back
 * to its original value.  Returns the (possibly NULL) dst to keep using.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			/* Learned PMTU expired: restore the original MTU. */
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
1358
1359/*
1360 * Algorithm:
1361 * 1. The first ip_rt_redirect_number redirects are sent
1362 * with exponential backoff, then we stop sending them at all,
1363 * assuming that the host ignores our redirects.
1364 * 2. If we did not see packets requiring redirects
1365 * during ip_rt_redirect_silence, we assume that the host
1366 * forgot redirected route and start to send redirects again.
1367 *
1368 * This algorithm is much cheaper and more intelligent than dumb load limiting
1369 * in icmp.c.
1370 *
1371 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1372 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1373 */
1374
/* Send an ICMP host redirect for @skb, rate limited per destination.
 *
 * The counters implementing the algorithm described above live in the
 * route's inet_peer entry: rate_tokens counts redirects sent in the
 * current burst, rate_last is the time of the last event.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	/* Snapshot per-device settings under RCU; in_dev must not be
	 * dereferenced after rcu_read_unlock().
	 */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		/* No peer entry could be bound: send unthrottled. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.  The delay between redirects grows exponentially
	 * with the number already sent (ip_rt_redirect_load << tokens).
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log once, exactly when the host exhausts its quota. */
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1433
/* Input handler for routes carrying a pending error (dst.error): send
 * the matching ICMP destination-unreachable, rate limited by the
 * token-bucket state in the route's inet_peer, then drop the packet.
 * Always returns 0 (the skb is consumed).
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		/* No ICMP for these errors: just drop. */
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	/* Token bucket: tokens accrue with elapsed jiffies, capped at
	 * ip_rt_error_burst; each ICMP costs ip_rt_error_cost.  When
	 * no peer entry exists, the ICMP is sent unthrottled.
	 */
	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001481
1482/*
1483 * The last two values are not from the RFC but
1484 * are needed for AMPRnet AX.25 paths.
1485 */
1486
/* RFC 1191 MTU plateau table, in descending order. */
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1489
Stephen Hemminger5969f712008-04-10 01:52:09 -07001490static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001491{
1492 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001493
Linus Torvalds1da177e2005-04-16 15:20:36 -07001494 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1495 if (old_mtu > mtu_plateau[i])
1496 return mtu_plateau[i];
1497 return 68;
1498}
1499
/* Handle an incoming ICMP "fragmentation needed" message: record a
 * lowered path MTU towards iph->daddr in the inet_peer cache.
 *
 * @new_mtu is the next-hop MTU reported by the router (may be 0 when
 * the router predates RFC 1191).  Returns the MTU actually recorded,
 * or @new_mtu unchanged when nothing was updated.
 */
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		/* Reported MTU missing or implausible: estimate one. */
		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			/* Fall back to the next RFC 1191 plateau
			 * below the size that failed.
			 */
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			/* pmtu_expires == 0 means "nothing learned";
			 * never use 0 as an actual expiry time.
			 */
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
		}

		inet_putpeer(peer);

		/* Bump the generation so cached routes revalidate
		 * against this peer in ipv4_dst_check().
		 */
		atomic_inc(&__rt_peer_genid);
	}
	return est_mtu ? : new_mtu;
}
1543
/* Propagate the peer's learned path MTU into a dst's RTAX_MTU metric.
 *
 * While the learned value is still fresh it is installed (the
 * pre-PMTU metric is stashed in pmtu_orig first so it can be
 * restored).  Once expired, the cmpxchg lets exactly one caller win
 * the race to clear pmtu_expires and roll the metric back.
 */
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
1560
/* dst_ops update_pmtu handler: record a newly reported path MTU for
 * this route in its inet_peer entry and apply it to the dst metrics.
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		/* Only record a first-time or strictly smaller MTU. */
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			/* 0 means "nothing learned"; never use it as
			 * an actual expiry time.
			 */
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			/* Let other routes sharing this peer pick up
			 * the change via ipv4_dst_check().
			 */
			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
1591
David S. Millerf39925d2011-02-09 22:00:16 -08001592static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1593{
1594 struct rtable *rt = (struct rtable *) dst;
1595 __be32 orig_gw = rt->rt_gateway;
1596
1597 dst_confirm(&rt->dst);
1598
1599 neigh_release(rt->dst.neighbour);
1600 rt->dst.neighbour = NULL;
1601
1602 rt->rt_gateway = peer->redirect_learned.a4;
1603 if (arp_bind_neighbour(&rt->dst) ||
1604 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1605 if (rt->dst.neighbour)
1606 neigh_event_send(rt->dst.neighbour, NULL);
1607 rt->rt_gateway = orig_gw;
1608 return -EAGAIN;
1609 } else {
1610 rt->rt_flags |= RTCF_REDIRECTED;
1611 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1612 rt->dst.neighbour);
1613 }
1614 return 0;
1615}
1616
/* dst_ops check handler: revalidate a cached route before use.
 *
 * Returns NULL when the entry is stale (routing generation changed,
 * or a learned redirect could not be applied); otherwise syncs any
 * PMTU/redirect state learned via the inet_peer since the route was
 * last checked and returns the dst.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				/* Gateway switch failed: invalidate
				 * so the caller re-resolves the route.
				 */
				if (check_peer_redir(dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return dst;
}
1644
1645static void ipv4_dst_destroy(struct dst_entry *dst)
1646{
1647 struct rtable *rt = (struct rtable *) dst;
1648 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001649
David S. Miller62fa8a82011-01-26 20:51:05 -08001650 if (rt->fi) {
1651 fib_info_put(rt->fi);
1652 rt->fi = NULL;
1653 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001654 if (peer) {
1655 rt->peer = NULL;
1656 inet_putpeer(peer);
1657 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001658}
1659
Linus Torvalds1da177e2005-04-16 15:20:36 -07001660
1661static void ipv4_link_failure(struct sk_buff *skb)
1662{
1663 struct rtable *rt;
1664
1665 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1666
Eric Dumazet511c3f92009-06-02 05:14:27 +00001667 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001668 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1669 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001670}
1671
1672static int ip_rt_bug(struct sk_buff *skb)
1673{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001674 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1675 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001676 skb->dev ? skb->dev->name : "?");
1677 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001678 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001679 return 0;
1680}
1681
1682/*
1683 We do not cache source address of outgoing interface,
1684 because it is used only by IP RR, TS and SRR options,
1685 so that it out of fast path.
1686
1687 BTW remember: "addr" is allowed to be not aligned
1688 in IP options!
1689 */
1690
/* Copy into @addr the source address this host uses towards the
 * packet's destination (consumed by IP RR/TS/SRR option processing).
 * @addr may be unaligned — hence the trailing memcpy.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Forwarded packet: redo the fib lookup for this flow
		 * to find the preferred source address on the output
		 * device.
		 */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1722
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge a routing classid tag into the dst: the low and high 16-bit
 * halves are filled independently, and a half that is already set is
 * left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 *tclassid = &rt->dst.tclassid;

	if (!(*tclassid & 0xFFFF))
		*tclassid |= tag & 0xFFFF;
	if (!(*tclassid & 0xFFFF0000))
		*tclassid |= tag & 0xFFFF0000;
}
#endif
1732
David S. Miller0dbaee32010-12-13 12:52:14 -08001733static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1734{
1735 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1736
1737 if (advmss == 0) {
1738 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1739 ip_rt_min_advmss);
1740 if (advmss > 65535 - 40)
1741 advmss = 65535 - 40;
1742 }
1743 return advmss;
1744}
1745
David S. Millerd33e4552010-12-14 13:01:14 -08001746static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1747{
1748 unsigned int mtu = dst->dev->mtu;
1749
1750 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1751 const struct rtable *rt = (const struct rtable *) dst;
1752
1753 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1754 mtu = 576;
1755 }
1756
1757 if (mtu > IP_MAX_MTU)
1758 mtu = IP_MAX_MTU;
1759
1760 return mtu;
1761}
1762
/* Attach metrics to a new cached route.
 *
 * When an inet_peer exists (or must be created because the flow asks
 * for copy-on-write metrics), the peer's metric block is used so that
 * learned PMTU/redirect state is shared across routes to the same
 * host; otherwise the route points at the fib_info metrics, taking a
 * reference unless they are the shared all-defaults block.
 */
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		/* Freshly created peer: seed it from the fib metrics. */
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		/* Apply any PMTU/redirect state already learned. */
		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
1797
/* Fill in the nexthop-derived fields of a new cached route: gateway,
 * metrics, classid tag, and clamp MTU/ADVMSS to protocol limits.
 */
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		/* Use the fib gateway only when the nexthop is
		 * directly reachable (link scope).
		 */
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	/* Clamp to IPv4 wire-format limits. */
	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
1826
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001827static struct rtable *rt_dst_alloc(struct net_device *dev,
1828 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08001829{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001830 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1831 DST_HOST |
1832 (nopolicy ? DST_NOPOLICY : 0) |
1833 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08001834}
1835
/* called in rcu_read_lock() section */
/* Build and cache an input route for a multicast packet arriving on
 * @dev.  @our is nonzero when this host is a member of the group, in
 * which case the packet is also delivered locally.
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* A multicast/broadcast/loopback source, or a non-IP frame,
	 * can never legitimately ask for a multicast route.
	 */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sources are only valid for link-local groups
		 * (e.g. DHCP-style autoconfiguration traffic).
		 */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast is never forwarded via output; trap if it is. */
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups with multicast forwarding enabled go
	 * through the multicast routing engine instead.
	 */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1915
1916
/* Account and (optionally) log a packet whose source address failed
 * reverse-path validation ("martian source").  When martian logging
 * is enabled for the device, dump the link-layer header too — per
 * RFC 1812 that is the only clue to where the packet came from.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
1946
/* called in rcu_read_lock() section */
/* Build (but do not hash) a forwarding cache entry for a packet being
 * forwarded from @in_dev towards the device selected by @res.  On
 * success *@result holds the new rtable and 0 is returned; otherwise
 * a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* err > 0 means the source is reachable but not on the
	 * expected path — remember to consider a redirect.
	 */
	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif = in_dev->dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002039
/* Create and hash a forwarding cache entry for an input route.
 * Returns 0 on success (rt_intern_hash attaches the dst to the skb)
 * or a negative errno.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* Pick one nexthop when the fib entry offers several. */
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
2068
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output
 *	routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	Called with rcu_read_lock().
 */
2079
/*
 * Slow-path input route resolution.
 *
 * Classifies (saddr, daddr) for martian/broadcast cases, performs the FIB
 * lookup, and builds + caches a struct rtable for local delivery,
 * broadcast, or forwarding.  Attaches the result to @skb via
 * rt_intern_hash().
 *
 * Called with rcu_read_lock() held (see ip_route_input_common()).
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	/* Build the flow key used for the FIB lookup. */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		/* No route: only hosts that forward report "no route";
		 * non-forwarding hosts answer host-unreachable. */
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		/* Destination is one of our own addresses: validate the
		 * source via reverse-path lookup, then deliver locally. */
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Forwarding path: build and cache the input route. */
	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	/* Broadcast destination: only accepted for IPv4 ethertype. */
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Build a dst for local delivery (also used for the cached
	 * "unreachable" entry created by the no_route path below). */
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	/* Locally-delivered packets must never be transmitted. */
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		/* Cache a negative entry that generates ICMP errors. */
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	/* "keep_err" variant preserves the errno already set by
	 * fib_validate_source() instead of forcing -EINVAL. */
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2260
Eric Dumazet407eadd2010-05-10 11:32:55 +00002261int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2262 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002263{
2264 struct rtable * rth;
2265 unsigned hash;
2266 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002267 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002268 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002269
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002270 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002271
Eric Dumazet96d36222010-06-02 19:21:31 +00002272 rcu_read_lock();
2273
Neil Horman1080d702008-10-27 12:28:25 -07002274 if (!rt_caching(net))
2275 goto skip_cache;
2276
Linus Torvalds1da177e2005-04-16 15:20:36 -07002277 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002278 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002279
Linus Torvalds1da177e2005-04-16 15:20:36 -07002280 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002281 rth = rcu_dereference(rth->dst.rt_next)) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08002282 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2283 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
Julian Anastasov025fd912011-08-09 04:01:16 +00002284 (rth->rt_route_iif ^ iif) |
David S. Miller475949d2011-05-03 19:45:15 -07002285 (rth->rt_key_tos ^ tos)) == 0 &&
David S. Miller5e2b61f2011-03-04 21:47:09 -08002286 rth->rt_mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002287 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002288 !rt_is_expired(rth)) {
Eric Dumazet407eadd2010-05-10 11:32:55 +00002289 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002290 dst_use_noref(&rth->dst, jiffies);
2291 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002292 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002293 dst_use(&rth->dst, jiffies);
2294 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002295 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002296 RT_CACHE_STAT_INC(in_hit);
2297 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002298 return 0;
2299 }
2300 RT_CACHE_STAT_INC(in_hlist_search);
2301 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002302
Neil Horman1080d702008-10-27 12:28:25 -07002303skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002304 /* Multicast recognition logic is moved from route cache to here.
2305 The problem was that too many Ethernet cards have broken/missing
2306 hardware multicast filters :-( As result the host on multicasting
2307 network acquires a lot of useless route cache entries, sort of
2308 SDR messages from all the world. Now we try to get rid of them.
2309 Really, provided software IP multicast filter is organized
2310 reasonably (at least, hashed), it does not result in a slowdown
2311 comparing with route cache reject entries.
2312 Note, that multicast routers are not affected, because
2313 route cache entry is created eventually.
2314 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002315 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002316 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317
Eric Dumazet96d36222010-06-02 19:21:31 +00002318 if (in_dev) {
David S. Millerdbdd9a52011-03-10 16:34:38 -08002319 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2320 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002321 if (our
2322#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002323 ||
2324 (!ipv4_is_local_multicast(daddr) &&
2325 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002326#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002327 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002328 int res = ip_route_input_mc(skb, daddr, saddr,
2329 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002330 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002331 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002332 }
2333 }
2334 rcu_read_unlock();
2335 return -EINVAL;
2336 }
Eric Dumazet96d36222010-06-02 19:21:31 +00002337 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2338 rcu_read_unlock();
2339 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002341EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002342
/*
 * Build an output-route dst entry from a completed FIB lookup.
 *
 * Re-classifies @fl4->daddr (limited broadcast / multicast / zeronet),
 * allocates the rtable, fills in its cache keys from the caller's
 * original (pre-rewrite) daddr/saddr/oif, wires up the input/output
 * handlers for local, broadcast and multicast delivery, and finally
 * attaches next-hop data via rt_set_nexthop().
 *
 * Returns the new rtable or an ERR_PTR() on failure.
 * called with rcu_read_lock()
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	u32 tos = RT_FL_TOS(fl4);
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	/* A loopback source is only valid on the loopback device. */
	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;	/* broadcast routes carry no nexthop info */
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Not a member of the group: no local copy. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	/* Cache keys use the pre-resolution addresses/oif so that later
	 * lookups with the same original key hit this entry. */
	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
2443
/*
 * Major route resolver routine.
 * Acquires and releases rcu_read_lock() internally.
 */
2448
/*
 * Slow-path output route resolution.
 *
 * Resolves the output device and source address for @fl4 (possibly
 * rewriting fl4->saddr/daddr/flowi4_oif in place), performs the FIB
 * lookup, then builds and caches the rtable via __mkroute_output() and
 * rt_intern_hash().  The original daddr/saddr/oif are saved first and
 * used as the cache key.
 *
 * Returns the rtable or an ERR_PTR() on failure.
 */
static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	u32 tos	= RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	/* Preserve the caller's key: fl4 may be rewritten below. */
	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		/* The requested source must be a unicast address. */
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		/* Caller pinned the output device: resolve and vet it. */
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		/* No destination: route to ourselves via loopback. */
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		/* Destination is local: prefer the route's preferred
		 * source address, else mirror the destination. */
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	/* Default route with several candidates: pick one. */
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		/* Cache under the caller's original key. */
		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
2641
/*
 * Output route lookup entry point.
 *
 * Probes the route cache under rcu_read_lock_bh() for an entry matching
 * @flp4's dst/src/oif/mark/tos; on a hit it takes a reference, fills any
 * wildcard (zero) saddr/daddr in @flp4 from the cached route, and returns
 * it.  On a miss (or when caching is disabled) it falls back to
 * ip_route_output_slow().
 *
 * Returns the rtable or an ERR_PTR() from the slow path.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		/* Full key match: addresses, direction, oif, mark, and
		 * tos (ignoring bits outside IPTOS_RT_MASK|RTO_ONLINK). */
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			/* Report the resolved addresses back to the caller. */
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2681
/* dst_ops.check for blackhole routes: always report the entry as
 * invalid so no caller keeps relying on a cached blackhole dst. */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2686
/* dst_ops.default_mtu for blackhole routes: no usable MTU. */
static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}
2691
/* dst_ops.update_pmtu for blackhole routes: deliberately a no-op —
 * PMTU feedback is meaningless for a discard-only route. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2695
/* dst_ops.cow_metrics for blackhole routes: refuse to create writable
 * metrics; returning NULL leaves the metrics read-only. */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2701
/*
 * dst_ops for blackhole routes created by ipv4_blackhole_route() below:
 * check always fails, MTU is 0, PMTU updates are ignored and metrics
 * are never made writable.  destroy/default_advmss are shared with the
 * regular IPv4 dst ops.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.default_mtu		=	ipv4_blackhole_default_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
};
2712
/*
 * ipv4_blackhole_route - clone @dst_orig into a discard-only route.
 *
 * Allocates a new rtable backed by ipv4_dst_blackhole_ops, copies the
 * routing keys, metrics and cached data from the original, and wires
 * both input and output to dst_discard so every packet is dropped.
 * Shared objects (inet_peer, fib_info) get an extra reference.
 *
 * Consumes a reference on @dst_orig (dst_release() at the end) and
 * returns the new dst, or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Both directions discard: this is the blackhole. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the full lookup key so cache comparisons still work. */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		/* Hand the entry over to RCU-deferred freeing machinery;
		 * it stays usable through the returned reference. */
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2759
David S. Miller9d6ec932011-03-12 01:12:47 -05002760struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002761 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002762{
David S. Miller9d6ec932011-03-12 01:12:47 -05002763 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002764
David S. Millerb23dd4f2011-03-02 14:31:35 -08002765 if (IS_ERR(rt))
2766 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002767
David S. Miller56157872011-05-02 14:37:45 -07002768 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002769 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2770 flowi4_to_flowi(flp4),
2771 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002772
David S. Millerb23dd4f2011-03-02 14:31:35 -08002773 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002774}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002775EXPORT_SYMBOL_GPL(ip_route_output_flow);
2776
/*
 * rt_fill_info - serialize one cached route into an RTM_NEWROUTE
 * netlink message appended to @skb.
 *
 * @net:    namespace (used for multicast forwarding lookups)
 * @pid, @seq, @event, @flags: netlink message header fields
 * @nowait: passed to ipmr_get_route(); nonzero forbids blocking
 *
 * Returns the message length on success, 0 when a blocking multicast
 * resolution is in progress (!nowait, err == 0), or -EMSGSIZE when the
 * skb ran out of room (the partial message is cancelled).
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Cache entries carry only the upper flag bits; mark as cloned. */
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	/* Preferred source: spec_dst for input routes, otherwise the
	 * chosen source when it differs from the key's source. */
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		/* Cacheinfo comes from the shared inet_peer: IP id counter,
		 * cached TCP timestamp data and pending PMTU expiry. */
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires)
			expires -= jiffies;
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Non-local multicast with forwarding enabled: let the
		 * multicast router fill in its own route attributes. */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2880
/*
 * inet_rtm_getroute - handle an RTM_GETROUTE netlink request.
 *
 * Parses the request attributes (RTA_SRC/DST/IIF/OIF/MARK), performs
 * either an input-route lookup (when an input interface is given, using
 * a dummy skb fed through ip_route_input()) or an output lookup via
 * ip_route_output_key(), serializes the result with rt_fill_info() and
 * unicasts it back to the requester.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* All attributes are optional; absent ones default to zero. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		/* Input route: emulate reception on the given device. */
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* A "successful" lookup may still carry a route error. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		/* Output route: plain flow-key lookup. */
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	/* rtnl_unicast() consumes skb regardless of outcome. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2975
/*
 * ip_rt_dump - netlink dump callback for the routing cache.
 *
 * Walks every hash bucket and chain under rcu_read_lock_bh(), emitting
 * one RTM_NEWROUTE message per live entry belonging to the caller's
 * namespace.  Resumes from cb->args[0] (bucket) / cb->args[1] (index)
 * across dump invocations, and stores the position back before
 * returning so the next call continues where this one stopped.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			/* Skip foreign-namespace, already-dumped and stale
			 * (old generation) entries. */
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* Borrow the dst without a refcount; rt_fill_info()
			 * reads it via skb_rtable(skb). */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* Message didn't fit: stop and resume here. */
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3017
/* Flush the routing cache of the device's namespace when its multicast
 * configuration changes. */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3022
3023#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003024static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003025 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003026 size_t *lenp, loff_t *ppos)
3027{
3028 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003029 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003030 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003031 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003032
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003033 memcpy(&ctl, __ctl, sizeof(ctl));
3034 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003035 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003036
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003037 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003038 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003039 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003040 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003041
3042 return -EINVAL;
3043}
3044
/*
 * Global (not per-namespace) sysctl knobs under net.ipv4.route.
 * Plain integers use proc_dointvec; time values are stored in jiffies
 * and converted via the _jiffies / _ms_jiffies handlers.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, exposed in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003155
/* Empty terminator-only table used as a placeholder child directory. */
static struct ctl_table empty[1];

/* Skeleton registered early by ip_static_sysctl_init() so that the
 * net.ipv4.route and net.ipv4.neigh directories exist before the rest
 * of the ipv4 sysctl machinery comes up. */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

/* Mount point for the skeleton: /proc/sys/net/ipv4. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003172
/* Per-namespace write-only flush knob; sysctl_route_net_init() clones
 * this table for non-init namespaces and stores the netns in extra1. */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

/* Mount point for the flush knob: /proc/sys/net/ipv4/route. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3189
/*
 * Per-namespace sysctl setup: register net.ipv4.route/flush.
 *
 * The init namespace uses the static table directly; other namespaces
 * get their own kmemdup'd copy so each can carry its own extra1 (the
 * owning struct net, read back by ipv4_sysctl_rtcache_flush()).
 * Returns 0 on success or -ENOMEM (goto-based cleanup on failure).
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
3214
/*
 * Per-namespace sysctl teardown: unregister the table registered by
 * sysctl_route_net_init() and free the per-namespace copy.  The static
 * init-netns table must never reach kfree() — hence the BUG_ON.
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
3224
/* Pernet hooks for the per-namespace route sysctls above. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003229#endif
3230
/*
 * Seed the per-namespace route-cache generation id and the device
 * address generation id with random values at namespace creation.
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
3239
/* Pernet hook: seed generation ids for every new namespace. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3243
3244
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route classid accounting table; allocated in ip_rt_init()
 * (256 entries per cpu). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003248
/* Boot-time override for the route-cache hash size; 0 means "auto". */
static __initdata unsigned long rhash_entries;

/* Parse the "rhash_entries=" kernel command-line parameter.
 * Returns 1 when an argument was consumed, 0 otherwise. */
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
3258
/*
 * ip_rt_init - boot-time initialization of the IPv4 routing subsystem.
 *
 * Allocates the dst slab cache and entry counters, sizes and allocates
 * the route-cache hash table (honouring the rhash_entries= boot
 * parameter), initializes devinet/FIB/proc/xfrm, registers the
 * RTM_GETROUTE handler and the pernet sysctl/genid subsystems.
 * Unrecoverable allocation failures panic, matching early-boot policy.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the same slab as regular ones. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Hash sized from available memory (2^15 or 2^17 buckets max)
	 * unless overridden by rhash_entries=. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC thresholds scale with the hash size just chosen. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3314
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Registers the net.ipv4 sysctl skeleton early in boot, before the
 * full ipv4 sysctl setup runs (see comment above). */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif