blob: 36fe0540b1ce292d4b264efda38dc737e6531819 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090093#include <linux/slab.h>
Herbert Xu352e5122007-11-13 21:34:06 -080094#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020095#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070096#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700106#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700107#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
David S. Millere997d472011-08-03 20:50:44 -0700111#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700112
David S. Miller68a5e3d2011-03-11 20:07:33 -0500113#define RT_FL_TOS(oldflp4) \
114 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115
116#define IP_MAX_MTU 0xFFF0
117
118#define RT_GC_TIMEOUT (300*HZ)
119
Linus Torvalds1da177e2005-04-16 15:20:36 -0700120static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700121static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
122static int ip_rt_gc_interval __read_mostly = 60 * HZ;
123static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
124static int ip_rt_redirect_number __read_mostly = 9;
125static int ip_rt_redirect_load __read_mostly = HZ / 50;
126static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost __read_mostly = HZ;
128static int ip_rt_error_burst __read_mostly = 5 * HZ;
129static int ip_rt_gc_elasticity __read_mostly = 8;
130static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132static int ip_rt_min_advmss __read_mostly = 256;
Neil Horman1080d702008-10-27 12:28:25 -0700133static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700134
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135/*
136 * Interface to generic destination cache.
137 */
138
139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800140static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
David S. Millerd33e4552010-12-14 13:01:14 -0800141static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700142static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700143static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144static void ipv4_link_failure(struct sk_buff *skb);
145static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800146static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700147
/*
 * Device-down callback for IPv4 dst entries.  Intentionally empty:
 * the generic dst core already handles refcounting and teardown, so
 * there is no per-route state to release here.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700152
David S. Miller62fa8a82011-01-26 20:51:05 -0800153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154{
David S. Miller06582542011-01-27 14:58:42 -0800155 struct rtable *rt = (struct rtable *) dst;
156 struct inet_peer *peer;
157 u32 *p = NULL;
David S. Miller62fa8a82011-01-26 20:51:05 -0800158
David S. Miller06582542011-01-27 14:58:42 -0800159 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -0400160 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller06582542011-01-27 14:58:42 -0800161
162 peer = rt->peer;
163 if (peer) {
David S. Miller62fa8a82011-01-26 20:51:05 -0800164 u32 *old_p = __DST_METRICS_PTR(old);
165 unsigned long prev, new;
166
David S. Miller06582542011-01-27 14:58:42 -0800167 p = peer->metrics;
168 if (inet_metrics_new(peer))
169 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
David S. Miller62fa8a82011-01-26 20:51:05 -0800170
171 new = (unsigned long) p;
172 prev = cmpxchg(&dst->_metrics, old, new);
173
174 if (prev != old) {
David S. Miller62fa8a82011-01-26 20:51:05 -0800175 p = __DST_METRICS_PTR(prev);
176 if (prev & DST_METRICS_READ_ONLY)
177 p = NULL;
178 } else {
David S. Miller62fa8a82011-01-26 20:51:05 -0800179 if (rt->fi) {
180 fib_info_put(rt->fi);
181 rt->fi = NULL;
182 }
183 }
184 }
185 return p;
186}
187
Linus Torvalds1da177e2005-04-16 15:20:36 -0700188static struct dst_ops ipv4_dst_ops = {
189 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -0800190 .protocol = cpu_to_be16(ETH_P_IP),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700191 .gc = rt_garbage_collect,
192 .check = ipv4_dst_check,
David S. Miller0dbaee32010-12-13 12:52:14 -0800193 .default_advmss = ipv4_default_advmss,
David S. Millerd33e4552010-12-14 13:01:14 -0800194 .default_mtu = ipv4_default_mtu,
David S. Miller62fa8a82011-01-26 20:51:05 -0800195 .cow_metrics = ipv4_cow_metrics,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700196 .destroy = ipv4_dst_destroy,
197 .ifdown = ipv4_dst_ifdown,
198 .negative_advice = ipv4_negative_advice,
199 .link_failure = ipv4_link_failure,
200 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu1ac06e02008-05-20 14:32:14 -0700201 .local_out = __ip_local_out,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700202};
203
204#define ECN_OR_COST(class) TC_PRIO_##class
205
Philippe De Muyter4839c522007-07-09 15:32:57 -0700206const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700207 TC_PRIO_BESTEFFORT,
Dan Siemon4a2b9c32011-03-15 13:56:07 +0000208 ECN_OR_COST(BESTEFFORT),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700209 TC_PRIO_BESTEFFORT,
210 ECN_OR_COST(BESTEFFORT),
211 TC_PRIO_BULK,
212 ECN_OR_COST(BULK),
213 TC_PRIO_BULK,
214 ECN_OR_COST(BULK),
215 TC_PRIO_INTERACTIVE,
216 ECN_OR_COST(INTERACTIVE),
217 TC_PRIO_INTERACTIVE,
218 ECN_OR_COST(INTERACTIVE),
219 TC_PRIO_INTERACTIVE_BULK,
220 ECN_OR_COST(INTERACTIVE_BULK),
221 TC_PRIO_INTERACTIVE_BULK,
222 ECN_OR_COST(INTERACTIVE_BULK)
223};
224
225
226/*
227 * Route cache.
228 */
229
230/* The locking scheme is rather straight forward:
231 *
232 * 1) Read-Copy Update protects the buckets of the central route hash.
233 * 2) Only writers remove entries, and they hold the lock
234 * as they look at rtable reference counts.
235 * 3) Only readers acquire references to rtable entries,
236 * they do so with atomic increments and with the
237 * lock held.
238 */
239
240struct rt_hash_bucket {
Eric Dumazet1c317202010-10-25 21:02:07 +0000241 struct rtable __rcu *chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700242};
Neil Horman1080d702008-10-27 12:28:25 -0700243
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700244#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
245 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700246/*
247 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
248 * The size of this table is a power of two and depends on the number of CPUS.
Ingo Molnar62051202006-07-03 00:24:59 -0700249 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700250 */
Ingo Molnar62051202006-07-03 00:24:59 -0700251#ifdef CONFIG_LOCKDEP
252# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700253#else
Ingo Molnar62051202006-07-03 00:24:59 -0700254# if NR_CPUS >= 32
255# define RT_HASH_LOCK_SZ 4096
256# elif NR_CPUS >= 16
257# define RT_HASH_LOCK_SZ 2048
258# elif NR_CPUS >= 8
259# define RT_HASH_LOCK_SZ 1024
260# elif NR_CPUS >= 4
261# define RT_HASH_LOCK_SZ 512
262# else
263# define RT_HASH_LOCK_SZ 256
264# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700265#endif
266
267static spinlock_t *rt_hash_locks;
268# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800269
270static __init void rt_hash_lock_init(void)
271{
272 int i;
273
274 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
275 GFP_KERNEL);
276 if (!rt_hash_locks)
277 panic("IP: failed to allocate rt_hash_locks\n");
278
279 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
280 spin_lock_init(&rt_hash_locks[i]);
281}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700282#else
283# define rt_hash_lock_addr(slot) NULL
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800284
285static inline void rt_hash_lock_init(void)
286{
287}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700288#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700290static struct rt_hash_bucket *rt_hash_table __read_mostly;
291static unsigned rt_hash_mask __read_mostly;
292static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700293
Eric Dumazet2f970d82006-01-17 02:54:36 -0800294static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Eric Dumazet27f39c72010-05-19 22:07:23 +0000295#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700297static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700298 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700299{
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700300 return jhash_3words((__force u32)daddr, (__force u32)saddr,
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700301 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800302 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700303}
304
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700305static inline int rt_genid(struct net *net)
306{
307 return atomic_read(&net->ipv4.rt_genid);
308}
309
Linus Torvalds1da177e2005-04-16 15:20:36 -0700310#ifdef CONFIG_PROC_FS
311struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800312 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700313 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800314 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700315};
316
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900317static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700318{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900319 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700320 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700321
322 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
Eric Dumazet1c317202010-10-25 21:02:07 +0000323 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
Eric Dumazeta6272662008-08-28 01:11:25 -0700324 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700325 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -0800326 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Eric Dumazet29e75252008-01-31 17:05:09 -0800327 while (r) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700328 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800329 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800330 return r;
Changli Gaod8d1f302010-06-10 23:31:35 -0700331 r = rcu_dereference_bh(r->dst.rt_next);
Eric Dumazet29e75252008-01-31 17:05:09 -0800332 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700333 rcu_read_unlock_bh();
334 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800335 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336}
337
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900338static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800339 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700340{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900341 struct rt_cache_iter_state *st = seq->private;
Eric Dumazeta6272662008-08-28 01:11:25 -0700342
Eric Dumazet1c317202010-10-25 21:02:07 +0000343 r = rcu_dereference_bh(r->dst.rt_next);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700344 while (!r) {
345 rcu_read_unlock_bh();
Eric Dumazeta6272662008-08-28 01:11:25 -0700346 do {
347 if (--st->bucket < 0)
348 return NULL;
Eric Dumazet1c317202010-10-25 21:02:07 +0000349 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700350 rcu_read_lock_bh();
Eric Dumazet1c317202010-10-25 21:02:07 +0000351 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700352 }
Eric Dumazet1c317202010-10-25 21:02:07 +0000353 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354}
355
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900356static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800357 struct rtable *r)
358{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900359 struct rt_cache_iter_state *st = seq->private;
360 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700361 if (dev_net(r->dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800362 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800363 if (r->rt_genid == st->genid)
364 break;
365 }
366 return r;
367}
368
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900369static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900371 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700372
373 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900374 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700375 --pos;
376 return pos ? NULL : r;
377}
378
379static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
380{
Eric Dumazet29e75252008-01-31 17:05:09 -0800381 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800382 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900383 return rt_cache_get_idx(seq, *pos - 1);
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700384 st->genid = rt_genid(seq_file_net(seq));
Eric Dumazet29e75252008-01-31 17:05:09 -0800385 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700386}
387
388static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
389{
Eric Dumazet29e75252008-01-31 17:05:09 -0800390 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700391
392 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900393 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700394 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900395 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700396 ++*pos;
397 return r;
398}
399
400static void rt_cache_seq_stop(struct seq_file *seq, void *v)
401{
402 if (v && v != SEQ_START_TOKEN)
403 rcu_read_unlock_bh();
404}
405
406static int rt_cache_seq_show(struct seq_file *seq, void *v)
407{
408 if (v == SEQ_START_TOKEN)
409 seq_printf(seq, "%-127s\n",
410 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
411 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
412 "HHUptod\tSpecDst");
413 else {
414 struct rtable *r = v;
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700415 int len;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700416
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700417 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
418 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
Changli Gaod8d1f302010-06-10 23:31:35 -0700419 r->dst.dev ? r->dst.dev->name : "*",
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700420 (__force u32)r->rt_dst,
421 (__force u32)r->rt_gateway,
Changli Gaod8d1f302010-06-10 23:31:35 -0700422 r->rt_flags, atomic_read(&r->dst.__refcnt),
423 r->dst.__use, 0, (__force u32)r->rt_src,
David S. Miller0dbaee32010-12-13 12:52:14 -0800424 dst_metric_advmss(&r->dst) + 40,
Changli Gaod8d1f302010-06-10 23:31:35 -0700425 dst_metric(&r->dst, RTAX_WINDOW),
426 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
427 dst_metric(&r->dst, RTAX_RTTVAR)),
David S. Miller475949d2011-05-03 19:45:15 -0700428 r->rt_key_tos,
Changli Gaod8d1f302010-06-10 23:31:35 -0700429 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
430 r->dst.hh ? (r->dst.hh->hh_output ==
Linus Torvalds1da177e2005-04-16 15:20:36 -0700431 dev_queue_xmit) : 0,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700432 r->rt_spec_dst, &len);
433
434 seq_printf(seq, "%*s\n", 127 - len, "");
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900435 }
436 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700437}
438
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700439static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700440 .start = rt_cache_seq_start,
441 .next = rt_cache_seq_next,
442 .stop = rt_cache_seq_stop,
443 .show = rt_cache_seq_show,
444};
445
446static int rt_cache_seq_open(struct inode *inode, struct file *file)
447{
Denis V. Luneva75e9362008-02-28 20:50:55 -0800448 return seq_open_net(inode, file, &rt_cache_seq_ops,
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700449 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450}
451
Arjan van de Ven9a321442007-02-12 00:55:35 -0800452static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700453 .owner = THIS_MODULE,
454 .open = rt_cache_seq_open,
455 .read = seq_read,
456 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800457 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700458};
459
460
461static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
462{
463 int cpu;
464
465 if (*pos == 0)
466 return SEQ_START_TOKEN;
467
Rusty Russell0f231742008-12-29 12:23:42 +0000468 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700469 if (!cpu_possible(cpu))
470 continue;
471 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800472 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700473 }
474 return NULL;
475}
476
477static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
478{
479 int cpu;
480
Rusty Russell0f231742008-12-29 12:23:42 +0000481 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700482 if (!cpu_possible(cpu))
483 continue;
484 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800485 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486 }
487 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900488
Linus Torvalds1da177e2005-04-16 15:20:36 -0700489}
490
491static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
492{
493
494}
495
496static int rt_cpu_seq_show(struct seq_file *seq, void *v)
497{
498 struct rt_cache_stat *st = v;
499
500 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700501 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700502 return 0;
503 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900504
Linus Torvalds1da177e2005-04-16 15:20:36 -0700505 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
506 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
Eric Dumazetfc66f952010-10-08 06:37:34 +0000507 dst_entries_get_slow(&ipv4_dst_ops),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700508 st->in_hit,
509 st->in_slow_tot,
510 st->in_slow_mc,
511 st->in_no_route,
512 st->in_brd,
513 st->in_martian_dst,
514 st->in_martian_src,
515
516 st->out_hit,
517 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900518 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700519
520 st->gc_total,
521 st->gc_ignored,
522 st->gc_goal_miss,
523 st->gc_dst_overflow,
524 st->in_hlist_search,
525 st->out_hlist_search
526 );
527 return 0;
528}
529
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700530static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700531 .start = rt_cpu_seq_start,
532 .next = rt_cpu_seq_next,
533 .stop = rt_cpu_seq_stop,
534 .show = rt_cpu_seq_show,
535};
536
537
538static int rt_cpu_seq_open(struct inode *inode, struct file *file)
539{
540 return seq_open(file, &rt_cpu_seq_ops);
541}
542
Arjan van de Ven9a321442007-02-12 00:55:35 -0800543static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700544 .owner = THIS_MODULE,
545 .open = rt_cpu_seq_open,
546 .read = seq_read,
547 .llseek = seq_lseek,
548 .release = seq_release,
549};
550
Patrick McHardyc7066f72011-01-14 13:36:42 +0100551#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800552static int rt_acct_proc_show(struct seq_file *m, void *v)
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800553{
Alexey Dobriyana661c412009-11-25 15:40:35 -0800554 struct ip_rt_acct *dst, *src;
555 unsigned int i, j;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800556
Alexey Dobriyana661c412009-11-25 15:40:35 -0800557 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
558 if (!dst)
559 return -ENOMEM;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800560
Alexey Dobriyana661c412009-11-25 15:40:35 -0800561 for_each_possible_cpu(i) {
562 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
563 for (j = 0; j < 256; j++) {
564 dst[j].o_bytes += src[j].o_bytes;
565 dst[j].o_packets += src[j].o_packets;
566 dst[j].i_bytes += src[j].i_bytes;
567 dst[j].i_packets += src[j].i_packets;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800568 }
569 }
Alexey Dobriyana661c412009-11-25 15:40:35 -0800570
571 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
572 kfree(dst);
573 return 0;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800574}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800575
576static int rt_acct_proc_open(struct inode *inode, struct file *file)
577{
578 return single_open(file, rt_acct_proc_show, NULL);
579}
580
581static const struct file_operations rt_acct_proc_fops = {
582 .owner = THIS_MODULE,
583 .open = rt_acct_proc_open,
584 .read = seq_read,
585 .llseek = seq_lseek,
586 .release = single_release,
587};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800588#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800589
Denis V. Lunev73b38712008-02-28 20:51:18 -0800590static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800591{
592 struct proc_dir_entry *pde;
593
594 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
595 &rt_cache_seq_fops);
596 if (!pde)
597 goto err1;
598
Wang Chen77020722008-02-28 14:14:25 -0800599 pde = proc_create("rt_cache", S_IRUGO,
600 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800601 if (!pde)
602 goto err2;
603
Patrick McHardyc7066f72011-01-14 13:36:42 +0100604#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800605 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800606 if (!pde)
607 goto err3;
608#endif
609 return 0;
610
Patrick McHardyc7066f72011-01-14 13:36:42 +0100611#ifdef CONFIG_IP_ROUTE_CLASSID
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800612err3:
613 remove_proc_entry("rt_cache", net->proc_net_stat);
614#endif
615err2:
616 remove_proc_entry("rt_cache", net->proc_net);
617err1:
618 return -ENOMEM;
619}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800620
621static void __net_exit ip_rt_do_proc_exit(struct net *net)
622{
623 remove_proc_entry("rt_cache", net->proc_net_stat);
624 remove_proc_entry("rt_cache", net->proc_net);
Patrick McHardyc7066f72011-01-14 13:36:42 +0100625#ifdef CONFIG_IP_ROUTE_CLASSID
Denis V. Lunev73b38712008-02-28 20:51:18 -0800626 remove_proc_entry("rt_acct", net->proc_net);
Alexey Dobriyan0a931ac2010-01-17 03:32:50 +0000627#endif
Denis V. Lunev73b38712008-02-28 20:51:18 -0800628}
629
630static struct pernet_operations ip_rt_proc_ops __net_initdata = {
631 .init = ip_rt_do_proc_init,
632 .exit = ip_rt_do_proc_exit,
633};
634
635static int __init ip_rt_proc_init(void)
636{
637 return register_pernet_subsys(&ip_rt_proc_ops);
638}
639
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800640#else
/* !CONFIG_PROC_FS stub: nothing to register, always succeeds. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700645#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900646
Stephen Hemminger5969f712008-04-10 01:52:09 -0700647static inline void rt_free(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700648{
Changli Gaod8d1f302010-06-10 23:31:35 -0700649 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700650}
651
Stephen Hemminger5969f712008-04-10 01:52:09 -0700652static inline void rt_drop(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700653{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700654 ip_rt_put(rt);
Changli Gaod8d1f302010-06-10 23:31:35 -0700655 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700656}
657
Stephen Hemminger5969f712008-04-10 01:52:09 -0700658static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659{
660 /* Kill broadcast/multicast entries very aggresively, if they
661 collide in hash table with more useful entries */
662 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
David S. Millerc7537962010-11-11 17:07:48 -0800663 rt_is_input_route(rth) && rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700664}
665
Stephen Hemminger5969f712008-04-10 01:52:09 -0700666static inline int rt_valuable(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700667{
668 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
David S. Miller2c8cec52011-02-09 20:42:07 -0800669 (rth->peer && rth->peer->pmtu_expires);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700670}
671
672static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
673{
674 unsigned long age;
675 int ret = 0;
676
Changli Gaod8d1f302010-06-10 23:31:35 -0700677 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700678 goto out;
679
Changli Gaod8d1f302010-06-10 23:31:35 -0700680 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700681 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
682 (age <= tmo2 && rt_valuable(rth)))
683 goto out;
684 ret = 1;
685out: return ret;
686}
687
688/* Bits of score are:
689 * 31: very valuable
690 * 30: not quite useless
691 * 29..0: usage counter
692 */
693static inline u32 rt_score(struct rtable *rt)
694{
Changli Gaod8d1f302010-06-10 23:31:35 -0700695 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700696
697 score = ~score & ~(3<<30);
698
699 if (rt_valuable(rt))
700 score |= (1<<31);
701
David S. Millerc7537962010-11-11 17:07:48 -0800702 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700703 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
704 score |= (1<<30);
705
706 return score;
707}
708
Neil Horman1080d702008-10-27 12:28:25 -0700709static inline bool rt_caching(const struct net *net)
710{
711 return net->ipv4.current_rt_cache_rebuild_count <=
712 net->ipv4.sysctl_rt_cache_rebuild_count;
713}
714
David S. Miller5e2b61f2011-03-04 21:47:09 -0800715static inline bool compare_hash_inputs(const struct rtable *rt1,
716 const struct rtable *rt2)
Neil Horman1080d702008-10-27 12:28:25 -0700717{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800718 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
719 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
720 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
Neil Horman1080d702008-10-27 12:28:25 -0700721}
722
David S. Miller5e2b61f2011-03-04 21:47:09 -0800723static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700724{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800725 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
726 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
727 (rt1->rt_mark ^ rt2->rt_mark) |
David S. Miller475949d2011-05-03 19:45:15 -0700728 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
David S. Miller5e2b61f2011-03-04 21:47:09 -0800729 (rt1->rt_oif ^ rt2->rt_oif) |
730 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700731}
732
/* True when both cache entries belong to the same network namespace. */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
737
/* An entry is stale when its generation id no longer matches the current
 * generation of its namespace (bumped by rt_cache_invalidate()). */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
742
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800743/*
744 * Perform a full scan of hash table and free all entries.
745 * Can be called by a softirq or a process.
746 * In the later case, we want to be reschedule if necessary
747 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	/* Walk every bucket; for each one, unlink matching entries onto a
	 * private list under the per-bucket lock, then free them after
	 * the lock is dropped to keep lock hold times short. */
	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		/* In process context, yield the CPU between buckets. */
		if (process_context && need_resched())
			cond_resched();
		/* Lockless peek: skip empty buckets without taking the lock. */
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			/* net == NULL flushes everything; otherwise only
			 * entries belonging to that namespace are moved. */
			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free the detached entries outside the bucket lock. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
793
Neil Horman1080d702008-10-27 12:28:25 -0700794/*
795 * While freeing expired entries, we compute average chain length
796 * and standard deviation, using fixed-point arithmetic.
797 * This to have an estimation of rt_chain_length_max
798 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
800 */
801
802#define FRACT_BITS 3
803#define ONE (1UL << FRACT_BITS)
804
Eric Dumazet98376382010-03-08 03:20:00 +0000805/*
806 * Given a hash chain and an item in this hash chain,
807 * find if a previous entry has the same hash_inputs
808 * (but differs on tos, mark or oif)
809 * Returns 0 if an alias is found.
810 * Returns ONE if rth has no alias before itself.
811 */
812static int has_noalias(const struct rtable *head, const struct rtable *rth)
813{
814 const struct rtable *aux = head;
815
816 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800817 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000818 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000819 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000820 }
821 return ONE;
822}
823
Eric Dumazet29e75252008-01-31 17:05:09 -0800824/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300825 * Perturbation of rt_genid by a small quantity [1..256]
Eric Dumazet29e75252008-01-31 17:05:09 -0800826 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
827 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700829 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	/* Bump the namespace generation id by a random 1..256 so every
	 * existing entry fails the rt_is_expired() check; randomness
	 * avoids quickly cycling back to a recent genid (see above). */
	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
837
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800838/*
Eric Dumazet29e75252008-01-31 17:05:09 -0800839 * delay < 0 : invalidate cache (fast : entries will be deleted later)
840 * delay >= 0 : invalidate & flush cache (can be long)
841 */
void rt_cache_flush(struct net *net, int delay)
{
	/* Always invalidate (cheap: entries die lazily); additionally do a
	 * synchronous table walk when delay >= 0 (can be long). */
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
848
Eric W. Biedermana5ee1552009-11-29 15:45:58 +0000849/* Flush previous cache invalidated entries from the cache */
/* Flush previously invalidated entries from the cache; unlike
 * rt_cache_flush() this does not bump the generation id first. */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
854
/* Called when a hash chain exceeds rt_chain_length_max: warn (rate
 * limited) and invalidate the whole cache for this namespace. */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
861
Linus Torvalds1da177e2005-04-16 15:20:36 -0700862/*
863 Short description of GC goals.
864
865 We want to build algorithm, which will keep routing cache
866 at some equilibrium point, when number of aged off entries
867 is kept approximately equal to newly generated ones.
868
869 Current expiration strength is variable "expire".
870 We try to adjust it dynamically, so that if networking
871 is idle expires is large enough to keep enough of warm entries,
872 and when load increases it reduces to limit cache size.
873 */
874
/* dst_ops garbage collector for the IPv4 route cache.  Tries to evict
 * enough unreferenced entries to bring the cache back toward an
 * equilibrium size (see the description above).  Returns 0 on success
 * or when GC was skipped, 1 when the cache is hopelessly overflowing.
 * NOTE: the static locals (expire/last_gc/rover/equilibrium) carry GC
 * state across invocations and are not namespace-aware.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	/* Skip entirely if we ran recently and the table isn't full. */
	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Re-read with the accurate (slower) counter before sizing work. */
	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* Round-robin over buckets starting where the last GC run
		 * stopped ("rover"), so eviction pressure is spread. */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Each survivor halves the timeout for
					 * entries deeper in this chain. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Relax the expiry threshold again after a successful run. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
994
Eric Dumazet98376382010-03-08 03:20:00 +0000995/*
996 * Returns number of entries in a hash chain that have different hash_inputs
997 */
998static int slow_chain_length(const struct rtable *head)
999{
1000 int length = 0;
1001 const struct rtable *rth = head;
1002
1003 while (rth) {
1004 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001005 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001006 }
1007 return length >> FRACT_BITS;
1008}
1009
/* Insert @rt into hash bucket @hash, or return an existing entry that
 * matches the same keys.  On success returns the cached rtable (which
 * may be @rt itself or a pre-existing match, with @rt dropped); on
 * failure returns an ERR_PTR.  When @skb is non-NULL its dst is set to
 * the returned route.  @ifindex is used to re-hash after an emergency
 * rebuild changes the generation id.
 */
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;
	/* Only retry GC-assisted insertion from process context. */
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		/* Opportunistically prune stale entries while scanning. */
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Duplicate found: drop our new entry, hand back
			 * the existing one. */
			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		/* Track the lowest-score unreferenced entry as an eviction
		 * candidate in case the chain has grown too long. */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable candidate and the chain is overlong even
		 * when aliases are discounted: trigger an emergency
		 * rebuild and retry under the new generation's hash. */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/* Temporarily force aggressive GC settings,
				 * run GC, then restore and retry once. */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1189
David S. Miller6431cbc2011-02-07 20:38:06 -08001190static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1191
/* Snapshot of the global peer generation counter; entries whose
 * rt_peer_genid lags behind must refresh peer-derived state. */
static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
1196
/* Attach an inet_peer for @daddr to @rt.  @create selects whether a
 * missing peer entry should be allocated.  Safe against concurrent
 * binders: only the cmpxchg winner's peer is kept, a loser's reference
 * is returned.  The winner (or a lookup miss) also records the current
 * peer generation on the route. */
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	/* Lost the race: someone else installed a peer first; drop ours. */
	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
1208
1209/*
1210 * Peer allocation may fail only in serious out-of-memory conditions. However
1211 * we still can generate some output.
1212 * Random ID selection looks a bit dangerous because we have no chances to
1213 * select ID being unique in a reasonable period of time.
1214 * But broken packet identifier may be better than no packet at all.
1215 */
/* Fallback IP-ID generator used when no inet_peer is available: derive
 * the next id from a keyed hash of the previous value and the packet's
 * destination, serialised by a private spinlock (see comment above). */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1228
/* Fill in iph->id for an outgoing packet.  Prefers the per-destination
 * counter kept on the route's inet_peer (binding one on demand); falls
 * back to the global generator when no peer can be attached.  @more is
 * the number of additional ids to reserve (for fragmentation). */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		/* Called without a route — likely a caller bug; log it. */
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001250EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001251
/* Remove @rt from hash bucket @hash and release the caller's reference.
 * While scanning the chain, stale (expired-generation) entries are
 * pruned as well. */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1271
Eric Dumazeted7865a42010-06-07 21:49:44 -07001272/* called in rcu_read_lock() section */
/* Process an incoming ICMP redirect: validate that the advertised new
 * gateway is plausible for @dev, then record it on the destination's
 * inet_peer and bump the peer generation so cached routes re-check it.
 * Runs inside rcu_read_lock() (see comment above the declaration). */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject no-op redirects, redirects disabled by sysctl, and
	 * gateways that cannot be valid unicast next hops. */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* Non-shared media: new gateway must be on-link, and with
		 * secure redirects it must also be a known default gw. */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	peer = inet_getpeer_v4(daddr, 1);
	if (peer) {
		peer->redirect_learned.a4 = new_gw;

		inet_putpeer(peer);

		/* Invalidate peer-derived state on all cached routes. */
		atomic_inc(&__rt_peer_genid);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
1319
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001320static bool peer_pmtu_expired(struct inet_peer *peer)
1321{
1322 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1323
1324 return orig &&
1325 time_after_eq(jiffies, orig) &&
1326 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1327}
1328
1329static bool peer_pmtu_cleaned(struct inet_peer *peer)
1330{
1331 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1332
1333 return orig &&
1334 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1335}
1336
/* dst_ops->negative_advice callback: the caller reports this route is
 * misbehaving.  Obsolete routes and redirect-created entries are
 * dropped (returns NULL so the caller re-looks-up); otherwise an
 * expired learned PMTU is rolled back and the route is kept. */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			/* Unhash redirect-installed entries entirely. */
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			/* Learned PMTU expired: restore the original MTU. */
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
1358
1359/*
1360 * Algorithm:
1361 * 1. The first ip_rt_redirect_number redirects are sent
1362 * with exponential backoff, then we stop sending them at all,
1363 * assuming that the host ignores our redirects.
1364 * 2. If we did not see packets requiring redirects
1365 * during ip_rt_redirect_silence, we assume that the host
1366 * forgot redirected route and start to send redirects again.
1367 *
1368 * This algorithm is much cheaper and more intelligent than dumb load limiting
1369 * in icmp.c.
1370 *
1371 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1372 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1373 */
1374
/* Send an ICMP host redirect back toward the sender of @skb, rate
 * limited per destination through the route's inet_peer entry using
 * the exponential-backoff algorithm described in the comment above.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	/* Snapshot the per-device settings under RCU; the read side is
	 * dropped before any icmp_send() below.
	 */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		/* No peer entry could be bound: send without throttling. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set peer->rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.  Backoff doubles with each token consumed.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log exactly once, at the point where we give up on
		 * this host (token count just reached the limit).
		 */
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1433
/* dst input handler for error routes: translate rt->dst.error into an
 * ICMP destination-unreachable message, token-bucket rate limited via
 * the route's inet_peer when one is available, then drop the packet.
 * Always frees @skb and returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	/* Map the stored route error onto an ICMP code; anything we do
	 * not recognize is silently dropped.
	 */
	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	/* Token bucket: tokens accrue with elapsed jiffies, capped at
	 * ip_rt_error_burst; each ICMP sent costs ip_rt_error_cost.
	 * Without a peer entry we send unconditionally.
	 */
	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001481
1482/*
1483 * The last two values are not from the RFC but
1484 * are needed for AMPRnet AX.25 paths.
1485 */
1486
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001487static const unsigned short mtu_plateau[] =
Linus Torvalds1da177e2005-04-16 15:20:36 -07001488{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1489
Stephen Hemminger5969f712008-04-10 01:52:09 -07001490static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001491{
1492 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001493
Linus Torvalds1da177e2005-04-16 15:20:36 -07001494 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1495 if (old_mtu > mtu_plateau[i])
1496 return mtu_plateau[i];
1497 return 68;
1498}
1499
Eric Dumazetb71d1d42011-04-22 04:53:02 +00001500unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001501 unsigned short new_mtu,
1502 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001503{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001504 unsigned short old_mtu = ntohs(iph->tot_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001505 unsigned short est_mtu = 0;
David S. Miller2c8cec52011-02-09 20:42:07 -08001506 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507
David S. Miller2c8cec52011-02-09 20:42:07 -08001508 peer = inet_getpeer_v4(iph->daddr, 1);
1509 if (peer) {
1510 unsigned short mtu = new_mtu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511
David S. Miller2c8cec52011-02-09 20:42:07 -08001512 if (new_mtu < 68 || new_mtu >= old_mtu) {
1513 /* BSD 4.2 derived systems incorrectly adjust
1514 * tot_len by the IP header length, and report
1515 * a zero MTU in the ICMP message.
1516 */
1517 if (mtu == 0 &&
1518 old_mtu >= 68 + (iph->ihl << 2))
1519 old_mtu -= iph->ihl << 2;
1520 mtu = guess_mtu(old_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001521 }
David S. Miller2c8cec52011-02-09 20:42:07 -08001522
1523 if (mtu < ip_rt_min_pmtu)
1524 mtu = ip_rt_min_pmtu;
1525 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001526 unsigned long pmtu_expires;
1527
1528 pmtu_expires = jiffies + ip_rt_mtu_expires;
1529 if (!pmtu_expires)
1530 pmtu_expires = 1UL;
1531
David S. Miller2c8cec52011-02-09 20:42:07 -08001532 est_mtu = mtu;
1533 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001534 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001535 }
1536
1537 inet_putpeer(peer);
1538
1539 atomic_inc(&__rt_peer_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001540 }
1541 return est_mtu ? : new_mtu;
1542}
1543
David S. Miller2c8cec52011-02-09 20:42:07 -08001544static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1545{
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001546 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
David S. Miller2c8cec52011-02-09 20:42:07 -08001547
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001548 if (!expires)
1549 return;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001550 if (time_before(jiffies, expires)) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001551 u32 orig_dst_mtu = dst_mtu(dst);
1552 if (peer->pmtu_learned < orig_dst_mtu) {
1553 if (!peer->pmtu_orig)
1554 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1555 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1556 }
1557 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1558 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1559}
1560
Linus Torvalds1da177e2005-04-16 15:20:36 -07001561static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1562{
David S. Miller2c8cec52011-02-09 20:42:07 -08001563 struct rtable *rt = (struct rtable *) dst;
1564 struct inet_peer *peer;
1565
1566 dst_confirm(dst);
1567
1568 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001569 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller2c8cec52011-02-09 20:42:07 -08001570 peer = rt->peer;
1571 if (peer) {
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001572 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1573
David S. Miller2c8cec52011-02-09 20:42:07 -08001574 if (mtu < ip_rt_min_pmtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001575 mtu = ip_rt_min_pmtu;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001576 if (!pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001577
1578 pmtu_expires = jiffies + ip_rt_mtu_expires;
1579 if (!pmtu_expires)
1580 pmtu_expires = 1UL;
1581
David S. Miller2c8cec52011-02-09 20:42:07 -08001582 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001583 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001584
1585 atomic_inc(&__rt_peer_genid);
1586 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001587 }
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001588 check_peer_pmtu(dst, peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589 }
1590}
1591
David S. Millerf39925d2011-02-09 22:00:16 -08001592static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1593{
1594 struct rtable *rt = (struct rtable *) dst;
1595 __be32 orig_gw = rt->rt_gateway;
1596
1597 dst_confirm(&rt->dst);
1598
1599 neigh_release(rt->dst.neighbour);
1600 rt->dst.neighbour = NULL;
1601
1602 rt->rt_gateway = peer->redirect_learned.a4;
1603 if (arp_bind_neighbour(&rt->dst) ||
1604 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1605 if (rt->dst.neighbour)
1606 neigh_event_send(rt->dst.neighbour, NULL);
1607 rt->rt_gateway = orig_gw;
1608 return -EAGAIN;
1609 } else {
1610 rt->rt_flags |= RTCF_REDIRECTED;
1611 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1612 rt->dst.neighbour);
1613 }
1614 return 0;
1615}
1616
Linus Torvalds1da177e2005-04-16 15:20:36 -07001617static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1618{
David S. Miller6431cbc2011-02-07 20:38:06 -08001619 struct rtable *rt = (struct rtable *) dst;
1620
1621 if (rt_is_expired(rt))
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001622 return NULL;
David S. Miller6431cbc2011-02-07 20:38:06 -08001623 if (rt->rt_peer_genid != rt_peer_genid()) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001624 struct inet_peer *peer;
1625
David S. Miller6431cbc2011-02-07 20:38:06 -08001626 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001627 rt_bind_peer(rt, rt->rt_dst, 0);
David S. Miller6431cbc2011-02-07 20:38:06 -08001628
David S. Miller2c8cec52011-02-09 20:42:07 -08001629 peer = rt->peer;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001630 if (peer) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001631 check_peer_pmtu(dst, peer);
1632
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001633 if (peer->redirect_learned.a4 &&
1634 peer->redirect_learned.a4 != rt->rt_gateway) {
1635 if (check_peer_redir(dst, peer))
1636 return NULL;
1637 }
David S. Millerf39925d2011-02-09 22:00:16 -08001638 }
1639
David S. Miller6431cbc2011-02-07 20:38:06 -08001640 rt->rt_peer_genid = rt_peer_genid();
1641 }
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001642 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001643}
1644
1645static void ipv4_dst_destroy(struct dst_entry *dst)
1646{
1647 struct rtable *rt = (struct rtable *) dst;
1648 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001649
David S. Miller62fa8a82011-01-26 20:51:05 -08001650 if (rt->fi) {
1651 fib_info_put(rt->fi);
1652 rt->fi = NULL;
1653 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001654 if (peer) {
1655 rt->peer = NULL;
1656 inet_putpeer(peer);
1657 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001658}
1659
Linus Torvalds1da177e2005-04-16 15:20:36 -07001660
1661static void ipv4_link_failure(struct sk_buff *skb)
1662{
1663 struct rtable *rt;
1664
1665 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1666
Eric Dumazet511c3f92009-06-02 05:14:27 +00001667 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001668 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1669 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001670}
1671
1672static int ip_rt_bug(struct sk_buff *skb)
1673{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001674 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1675 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001676 skb->dev ? skb->dev->name : "?");
1677 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001678 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001679 return 0;
1680}
1681
1682/*
1683 We do not cache source address of outgoing interface,
1684 because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.
1686
1687 BTW remember: "addr" is allowed to be not aligned
1688 in IP options!
1689 */
1690
David S. Miller8e363602011-05-13 17:29:41 -04001691void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001692{
Al Viroa61ced52006-09-26 21:27:54 -07001693 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001694
David S. Millerc7537962010-11-11 17:07:48 -08001695 if (rt_is_output_route(rt))
David S. Millerc5be24f2011-05-13 18:01:21 -04001696 src = ip_hdr(skb)->saddr;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001697 else {
David S. Miller8e363602011-05-13 17:29:41 -04001698 struct fib_result res;
1699 struct flowi4 fl4;
1700 struct iphdr *iph;
1701
1702 iph = ip_hdr(skb);
1703
1704 memset(&fl4, 0, sizeof(fl4));
1705 fl4.daddr = iph->daddr;
1706 fl4.saddr = iph->saddr;
1707 fl4.flowi4_tos = iph->tos;
1708 fl4.flowi4_oif = rt->dst.dev->ifindex;
1709 fl4.flowi4_iif = skb->dev->ifindex;
1710 fl4.flowi4_mark = skb->mark;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001711
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001712 rcu_read_lock();
David S. Miller68a5e3d2011-03-11 20:07:33 -05001713 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
David S. Miller436c3b62011-03-24 17:42:21 -07001714 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001715 else
1716 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001717 RT_SCOPE_UNIVERSE);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001718 rcu_read_unlock();
1719 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720 memcpy(addr, &src, 4);
1721}
1722
Patrick McHardyc7066f72011-01-14 13:36:42 +01001723#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001724static void set_class_tag(struct rtable *rt, u32 tag)
1725{
Changli Gaod8d1f302010-06-10 23:31:35 -07001726 if (!(rt->dst.tclassid & 0xFFFF))
1727 rt->dst.tclassid |= tag & 0xFFFF;
1728 if (!(rt->dst.tclassid & 0xFFFF0000))
1729 rt->dst.tclassid |= tag & 0xFFFF0000;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730}
1731#endif
1732
David S. Miller0dbaee32010-12-13 12:52:14 -08001733static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1734{
1735 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1736
1737 if (advmss == 0) {
1738 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1739 ip_rt_min_advmss);
1740 if (advmss > 65535 - 40)
1741 advmss = 65535 - 40;
1742 }
1743 return advmss;
1744}
1745
David S. Millerd33e4552010-12-14 13:01:14 -08001746static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1747{
1748 unsigned int mtu = dst->dev->mtu;
1749
1750 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1751 const struct rtable *rt = (const struct rtable *) dst;
1752
1753 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1754 mtu = 576;
1755 }
1756
1757 if (mtu > IP_MAX_MTU)
1758 mtu = IP_MAX_MTU;
1759
1760 return mtu;
1761}
1762
David S. Miller813b3b52011-04-28 14:48:42 -07001763static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001764 struct fib_info *fi)
David S. Millera4daad62011-01-27 22:01:53 -08001765{
David S. Miller0131ba42011-02-04 14:37:30 -08001766 struct inet_peer *peer;
1767 int create = 0;
1768
1769 /* If a peer entry exists for this destination, we must hook
1770 * it up in order to get at cached metrics.
1771 */
David S. Miller813b3b52011-04-28 14:48:42 -07001772 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
David S. Miller0131ba42011-02-04 14:37:30 -08001773 create = 1;
1774
David S. Miller3c0afdc2011-03-04 21:26:07 -08001775 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
David S. Miller0131ba42011-02-04 14:37:30 -08001776 if (peer) {
David S. Miller3c0afdc2011-03-04 21:26:07 -08001777 rt->rt_peer_genid = rt_peer_genid();
David S. Miller0131ba42011-02-04 14:37:30 -08001778 if (inet_metrics_new(peer))
1779 memcpy(peer->metrics, fi->fib_metrics,
1780 sizeof(u32) * RTAX_MAX);
1781 dst_init_metrics(&rt->dst, peer->metrics, false);
David S. Miller2c8cec52011-02-09 20:42:07 -08001782
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001783 check_peer_pmtu(&rt->dst, peer);
David S. Millerf39925d2011-02-09 22:00:16 -08001784 if (peer->redirect_learned.a4 &&
1785 peer->redirect_learned.a4 != rt->rt_gateway) {
1786 rt->rt_gateway = peer->redirect_learned.a4;
1787 rt->rt_flags |= RTCF_REDIRECTED;
1788 }
David S. Miller0131ba42011-02-04 14:37:30 -08001789 } else {
David S. Millerb8dad612011-01-28 14:07:16 -08001790 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1791 rt->fi = fi;
1792 atomic_inc(&fi->fib_clntref);
1793 }
David S. Millera4daad62011-01-27 22:01:53 -08001794 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
David S. Millera4daad62011-01-27 22:01:53 -08001795 }
1796}
1797
David S. Miller813b3b52011-04-28 14:48:42 -07001798static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001799 const struct fib_result *res,
David S. Miller982721f2011-02-16 21:44:24 -08001800 struct fib_info *fi, u16 type, u32 itag)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001801{
David S. Millerdefb3512010-12-08 21:16:57 -08001802 struct dst_entry *dst = &rt->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001803
1804 if (fi) {
1805 if (FIB_RES_GW(*res) &&
1806 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1807 rt->rt_gateway = FIB_RES_GW(*res);
David S. Miller813b3b52011-04-28 14:48:42 -07001808 rt_init_metrics(rt, fl4, fi);
Patrick McHardyc7066f72011-01-14 13:36:42 +01001809#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Millerdefb3512010-12-08 21:16:57 -08001810 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001811#endif
David S. Millerd33e4552010-12-14 13:01:14 -08001812 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001813
David S. Millerdefb3512010-12-08 21:16:57 -08001814 if (dst_mtu(dst) > IP_MAX_MTU)
1815 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
David S. Miller0dbaee32010-12-13 12:52:14 -08001816 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
David S. Millerdefb3512010-12-08 21:16:57 -08001817 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001818
Patrick McHardyc7066f72011-01-14 13:36:42 +01001819#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820#ifdef CONFIG_IP_MULTIPLE_TABLES
1821 set_class_tag(rt, fib_rules_tclass(res));
1822#endif
1823 set_class_tag(rt, itag);
1824#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001825}
1826
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001827static struct rtable *rt_dst_alloc(struct net_device *dev,
1828 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08001829{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001830 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1831 DST_HOST |
1832 (nopolicy ? DST_NOPOLICY : 0) |
1833 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08001834}
1835
Eric Dumazet96d36222010-06-02 19:21:31 +00001836/* called in rcu_read_lock() section */
Al Viro9e12bb22006-09-26 21:25:20 -07001837static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001838 u8 tos, struct net_device *dev, int our)
1839{
Eric Dumazet96d36222010-06-02 19:21:31 +00001840 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001841 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001842 __be32 spec_dst;
Eric Dumazet96d36222010-06-02 19:21:31 +00001843 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001844 u32 itag = 0;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001845 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001846
1847 /* Primary sanity checks. */
1848
1849 if (in_dev == NULL)
1850 return -EINVAL;
1851
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001852 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001853 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001854 goto e_inval;
1855
Joe Perchesf97c1e02007-12-16 13:45:43 -08001856 if (ipv4_is_zeronet(saddr)) {
1857 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001858 goto e_inval;
1859 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001860 } else {
Michael Smith5c04c812011-04-07 04:51:50 +00001861 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1862 &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001863 if (err < 0)
1864 goto e_err;
1865 }
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001866 rth = rt_dst_alloc(init_net.loopback_dev,
1867 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001868 if (!rth)
1869 goto e_nobufs;
1870
Patrick McHardyc7066f72011-01-14 13:36:42 +01001871#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07001872 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873#endif
David S. Millercf911662011-04-28 14:31:47 -07001874 rth->dst.output = ip_rt_bug;
1875
1876 rth->rt_key_dst = daddr;
1877 rth->rt_key_src = saddr;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001878 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001879 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001880 rth->rt_type = RTN_MULTICAST;
David S. Miller475949d2011-05-03 19:45:15 -07001881 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07001882 rth->rt_dst = daddr;
1883 rth->rt_src = saddr;
1884 rth->rt_route_iif = dev->ifindex;
1885 rth->rt_iif = dev->ifindex;
1886 rth->rt_oif = 0;
1887 rth->rt_mark = skb->mark;
1888 rth->rt_gateway = daddr;
1889 rth->rt_spec_dst= spec_dst;
1890 rth->rt_peer_genid = 0;
1891 rth->peer = NULL;
1892 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001893 if (our) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001894 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001895 rth->rt_flags |= RTCF_LOCAL;
1896 }
1897
1898#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001899 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Changli Gaod8d1f302010-06-10 23:31:35 -07001900 rth->dst.input = ip_mr_input;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001901#endif
1902 RT_CACHE_STAT_INC(in_slow_mc);
1903
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001904 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
David S. Millerb23dd4f2011-03-02 14:31:35 -08001905 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
Eric Dumazet9aa3c942011-06-18 11:59:18 -07001906 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001907
1908e_nobufs:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001909 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001910e_inval:
Eric Dumazet96d36222010-06-02 19:21:31 +00001911 return -EINVAL;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001912e_err:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001913 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001914}
1915
1916
1917static void ip_handle_martian_source(struct net_device *dev,
1918 struct in_device *in_dev,
1919 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001920 __be32 daddr,
1921 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001922{
1923 RT_CACHE_STAT_INC(in_martian_src);
1924#ifdef CONFIG_IP_ROUTE_VERBOSE
1925 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1926 /*
1927 * RFC1812 recommendation, if source is martian,
1928 * the only hint is MAC header.
1929 */
Harvey Harrison673d57e2008-10-31 00:53:57 -07001930 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1931 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001932 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001933 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001934 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001935 printk(KERN_WARNING "ll header: ");
1936 for (i = 0; i < dev->hard_header_len; i++, p++) {
1937 printk("%02x", *p);
1938 if (i < (dev->hard_header_len - 1))
1939 printk(":");
1940 }
1941 printk("\n");
1942 }
1943 }
1944#endif
1945}
1946
Eric Dumazet47360222010-06-03 04:13:21 +00001947/* called in rcu_read_lock() section */
Stephen Hemminger5969f712008-04-10 01:52:09 -07001948static int __mkroute_input(struct sk_buff *skb,
David S. Miller982721f2011-02-16 21:44:24 -08001949 const struct fib_result *res,
Stephen Hemminger5969f712008-04-10 01:52:09 -07001950 struct in_device *in_dev,
1951 __be32 daddr, __be32 saddr, u32 tos,
1952 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001953{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001954 struct rtable *rth;
1955 int err;
1956 struct in_device *out_dev;
Eric Dumazet47360222010-06-03 04:13:21 +00001957 unsigned int flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001958 __be32 spec_dst;
1959 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001960
1961 /* get a working reference to the output device */
Eric Dumazet47360222010-06-03 04:13:21 +00001962 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001963 if (out_dev == NULL) {
1964 if (net_ratelimit())
1965 printk(KERN_CRIT "Bug in ip_route_input" \
1966 "_slow(). Please, report\n");
1967 return -EINVAL;
1968 }
1969
1970
Michael Smith5c04c812011-04-07 04:51:50 +00001971 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1972 in_dev->dev, &spec_dst, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001973 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001974 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001975 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001976
Linus Torvalds1da177e2005-04-16 15:20:36 -07001977 goto cleanup;
1978 }
1979
1980 if (err)
1981 flags |= RTCF_DIRECTSRC;
1982
Thomas Graf51b77ca2008-06-03 16:36:01 -07001983 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001984 (IN_DEV_SHARED_MEDIA(out_dev) ||
1985 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1986 flags |= RTCF_DOREDIRECT;
1987
1988 if (skb->protocol != htons(ETH_P_IP)) {
1989 /* Not IP (i.e. ARP). Do not create route, if it is
1990 * invalid for proxy arp. DNAT routes are always valid.
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00001991 *
1992 * Proxy arp feature have been extended to allow, ARP
1993 * replies back to the same interface, to support
1994 * Private VLAN switch technologies. See arp.c.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001995 */
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00001996 if (out_dev == in_dev &&
1997 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998 err = -EINVAL;
1999 goto cleanup;
2000 }
2001 }
2002
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002003 rth = rt_dst_alloc(out_dev->dev,
2004 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002005 IN_DEV_CONF_GET(out_dev, NOXFRM));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002006 if (!rth) {
2007 err = -ENOBUFS;
2008 goto cleanup;
2009 }
2010
David S. Miller5e2b61f2011-03-04 21:47:09 -08002011 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002012 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002013 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2014 rth->rt_flags = flags;
2015 rth->rt_type = res->type;
David S. Miller475949d2011-05-03 19:45:15 -07002016 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002017 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002018 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002019 rth->rt_route_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002020 rth->rt_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002021 rth->rt_oif = 0;
David S. Millercf911662011-04-28 14:31:47 -07002022 rth->rt_mark = skb->mark;
2023 rth->rt_gateway = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002024 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002025 rth->rt_peer_genid = 0;
2026 rth->peer = NULL;
2027 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002028
Changli Gaod8d1f302010-06-10 23:31:35 -07002029 rth->dst.input = ip_forward;
2030 rth->dst.output = ip_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002031
David S. Miller5e2b61f2011-03-04 21:47:09 -08002032 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002033
Linus Torvalds1da177e2005-04-16 15:20:36 -07002034 *result = rth;
2035 err = 0;
2036 cleanup:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002037 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002038}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002039
Stephen Hemminger5969f712008-04-10 01:52:09 -07002040static int ip_mkroute_input(struct sk_buff *skb,
2041 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002042 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002043 struct in_device *in_dev,
2044 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002045{
Chuck Short7abaa272005-06-22 22:10:23 -07002046 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002047 int err;
2048 unsigned hash;
2049
2050#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002051 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002052 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002053#endif
2054
2055 /* create a routing cache entry */
2056 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2057 if (err)
2058 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002059
2060 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002061 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002062 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002063 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002064 if (IS_ERR(rth))
2065 return PTR_ERR(rth);
2066 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002067}
2068
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output
 *	routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	called with rcu_read_lock()
 */
2079
/*
 * Slow-path input route resolution.  Validates the packet's addresses
 * (martian filtering), performs a full FIB lookup, and builds/caches a
 * routing cache entry for local delivery, broadcast, or forwarding.
 * Returns 0 on success or a negative errno.
 * Called with rcu_read_lock() held (see comment above).
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4 fl4;
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable * rth;
	unsigned hash;
	__be32 spec_dst;
	int err = -EINVAL;
	struct net * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	/* A multicast, limited-broadcast or loopback source is never valid. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		/* Reverse-path check the source before local delivery. */
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Unicast forwarding: build and cache the forward route. */
	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Local delivery (and the RTN_UNREACHABLE error route): the dst
	 * device is always the loopback device.
	 */
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	/* Locally destined packets must never be sent out again. */
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		/* No-route case reached via the no_route label below. */
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	/* -ESRCH from fib_lookup means "no route" — report net unreachable. */
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	/* Keeps the errno already in err (e.g. from fib_validate_source). */
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2260
/*
 * ip_route_input_common - main entry point for input route lookup.
 *
 * First tries the routing cache (unless caching is disabled for this
 * netns); on a hit the cached dst is attached to @skb and 0 is
 * returned.  Multicast destinations are recognised here and handed to
 * ip_route_input_mc(); everything else falls back to the slow path.
 *
 * @noref: when true, attach the dst without taking a reference
 *         (skb_dst_set_noref); otherwise take a normal dst reference.
 *
 * Returns 0 on success or a negative errno.
 */
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			   u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		/* Branch-free key compare: OR together XORs of the key
		 * fields; zero only when every field matches.  rt_oif must
		 * be 0 for an input route, hence the bare rth->rt_oif term.
		 */
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_iif ^ iif) |
		     rth->rt_oif |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002343
/*
 * Build a new output routing cache entry for the FIB lookup result
 * @res and flow key @fl4.  @orig_daddr/@orig_saddr/@orig_oif are the
 * caller's original (pre-rewrite) key values and become the cache key;
 * @dev_out is the chosen output device.  Returns the new rtable or an
 * ERR_PTR.  called with rcu_read_lock()
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	u32 tos = RT_FL_TOS(fl4);
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	/* A loopback source is only valid out of a loopback device. */
	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	/* Reclassify the route type from the destination address. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;	/* broadcast routes carry no fib_info */
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Only deliver locally if we are a member of the group. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	/* Cache key is the ORIGINAL flow key, not the rewritten one. */
	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;	/* 0 marks this as an output route */
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
2444
Linus Torvalds1da177e2005-04-16 15:20:36 -07002445/*
2446 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002447 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002448 */
2449
David S. Miller813b3b52011-04-28 14:48:42 -07002450static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002451{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002452 struct net_device *dev_out = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002453 u32 tos = RT_FL_TOS(fl4);
2454 unsigned int flags = 0;
2455 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002456 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002457 __be32 orig_daddr;
2458 __be32 orig_saddr;
2459 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002460
2461 res.fi = NULL;
2462#ifdef CONFIG_IP_MULTIPLE_TABLES
2463 res.r = NULL;
2464#endif
2465
David S. Miller813b3b52011-04-28 14:48:42 -07002466 orig_daddr = fl4->daddr;
2467 orig_saddr = fl4->saddr;
2468 orig_oif = fl4->flowi4_oif;
2469
2470 fl4->flowi4_iif = net->loopback_dev->ifindex;
2471 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2472 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2473 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002474
David S. Miller010c2702011-02-17 15:37:09 -08002475 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002476 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002477 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002478 if (ipv4_is_multicast(fl4->saddr) ||
2479 ipv4_is_lbcast(fl4->saddr) ||
2480 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002481 goto out;
2482
Linus Torvalds1da177e2005-04-16 15:20:36 -07002483 /* I removed check for oif == dev_out->oif here.
2484 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002485 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2486 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002487 2. Moreover, we are allowed to send packets with saddr
2488 of another iface. --ANK
2489 */
2490
David S. Miller813b3b52011-04-28 14:48:42 -07002491 if (fl4->flowi4_oif == 0 &&
2492 (ipv4_is_multicast(fl4->daddr) ||
2493 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002494 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002495 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002496 if (dev_out == NULL)
2497 goto out;
2498
Linus Torvalds1da177e2005-04-16 15:20:36 -07002499 /* Special hack: user can direct multicasts
2500 and limited broadcast via necessary interface
2501 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2502 This hack is not just for fun, it allows
2503 vic,vat and friends to work.
2504 They bind socket to loopback, set ttl to zero
2505 and expect that it will work.
2506 From the viewpoint of routing cache they are broken,
2507 because we are not allowed to build multicast path
2508 with loopback source addr (look, routing cache
2509 cannot know, that ttl is zero, so that packet
2510 will not leave this host and route is valid).
2511 Luckily, this hack is good workaround.
2512 */
2513
David S. Miller813b3b52011-04-28 14:48:42 -07002514 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002515 goto make_route;
2516 }
Julian Anastasova210d012008-10-01 07:28:28 -07002517
David S. Miller813b3b52011-04-28 14:48:42 -07002518 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002519 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002520 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002521 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002522 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002523 }
2524
2525
David S. Miller813b3b52011-04-28 14:48:42 -07002526 if (fl4->flowi4_oif) {
2527 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002528 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002529 if (dev_out == NULL)
2530 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002531
2532 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002533 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002534 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002535 goto out;
2536 }
David S. Miller813b3b52011-04-28 14:48:42 -07002537 if (ipv4_is_local_multicast(fl4->daddr) ||
2538 ipv4_is_lbcast(fl4->daddr)) {
2539 if (!fl4->saddr)
2540 fl4->saddr = inet_select_addr(dev_out, 0,
2541 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002542 goto make_route;
2543 }
David S. Miller813b3b52011-04-28 14:48:42 -07002544 if (fl4->saddr) {
2545 if (ipv4_is_multicast(fl4->daddr))
2546 fl4->saddr = inet_select_addr(dev_out, 0,
2547 fl4->flowi4_scope);
2548 else if (!fl4->daddr)
2549 fl4->saddr = inet_select_addr(dev_out, 0,
2550 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002551 }
2552 }
2553
David S. Miller813b3b52011-04-28 14:48:42 -07002554 if (!fl4->daddr) {
2555 fl4->daddr = fl4->saddr;
2556 if (!fl4->daddr)
2557 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002558 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002559 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002560 res.type = RTN_LOCAL;
2561 flags |= RTCF_LOCAL;
2562 goto make_route;
2563 }
2564
David S. Miller813b3b52011-04-28 14:48:42 -07002565 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002566 res.fi = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002567 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002568 /* Apparently, routing tables are wrong. Assume,
2569 that the destination is on link.
2570
2571 WHY? DW.
2572 Because we are allowed to send to iface
2573 even if it has NO routes and NO assigned
2574 addresses. When oif is specified, routing
2575 tables are looked up with only one purpose:
2576 to catch if destination is gatewayed, rather than
2577 direct. Moreover, if MSG_DONTROUTE is set,
2578 we send packet, ignoring both routing tables
2579 and ifaddr state. --ANK
2580
2581
2582 We could make it even if oif is unknown,
2583 likely IPv6, but we do not.
2584 */
2585
David S. Miller813b3b52011-04-28 14:48:42 -07002586 if (fl4->saddr == 0)
2587 fl4->saddr = inet_select_addr(dev_out, 0,
2588 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002589 res.type = RTN_UNICAST;
2590 goto make_route;
2591 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002592 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002593 goto out;
2594 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002595
2596 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002597 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002598 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002599 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002600 else
David S. Miller813b3b52011-04-28 14:48:42 -07002601 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002602 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002603 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002604 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002605 res.fi = NULL;
2606 flags |= RTCF_LOCAL;
2607 goto make_route;
2608 }
2609
2610#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002611 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002612 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002613 else
2614#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002615 if (!res.prefixlen &&
2616 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002617 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002618 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002619
David S. Miller813b3b52011-04-28 14:48:42 -07002620 if (!fl4->saddr)
2621 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002622
Linus Torvalds1da177e2005-04-16 15:20:36 -07002623 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002624 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002625
2626
2627make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002628 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2629 dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002630 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002631 unsigned int hash;
2632
David S. Miller813b3b52011-04-28 14:48:42 -07002633 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002634 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002635 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002636 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002637
David S. Miller010c2702011-02-17 15:37:09 -08002638out:
2639 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002640 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002641}
2642
/*
 * __ip_route_output_key - output route lookup entry point.
 *
 * Tries the routing cache first (unless caching is disabled for this
 * netns); on a hit the saddr/daddr that ip_route_output_slow() would
 * have filled in are copied back into @flp4.  Falls back to the slow
 * path on a miss.  Returns an rtable or an ERR_PTR.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			/* Report the resolved addresses back to the caller,
			 * as the slow path would have done.
			 */
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2682
/*
 * ->check handler for blackhole routes.  Always returns NULL so the
 * cached entry is treated as invalid and never revalidated for reuse.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2687
/* Report an MTU of 0 for blackhole routes; nothing is ever transmitted. */
static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}
2692
/* PMTU updates are meaningless on a blackhole route: deliberately a no-op. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2696
/*
 * Refuse copy-on-write of metrics for blackhole routes; returning NULL
 * tells the caller there is no writable metrics area.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2702
/*
 * dst_ops for IPv4 blackhole routes (built by ipv4_blackhole_route()).
 * Shares the regular destroy/default_advmss handlers with ipv4_dst_ops
 * but stubs out everything that would imply the route carries traffic.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_blackhole_dst_check,
	.default_mtu		= ipv4_blackhole_default_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
};
2713
/*
 * ipv4_blackhole_route - clone @dst_orig into a packet-discarding route.
 *
 * Allocates a new rtable mirroring the keys, metrics and reference-held
 * objects (device, inet_peer, fib_info) of @dst_orig, but with both
 * input and output handlers set to dst_discard so every packet routed
 * through it is silently dropped.
 *
 * Consumes the caller's reference on @dst_orig.  Returns the new
 * dst_entry or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Discard traffic in both directions. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the lookup keys so this clone matches the same flows. */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Share peer and fib_info with the original; grab our own refs. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		/* NOTE(review): dst_free() here appears to mark the entry so
		 * it is reclaimed once its last reference is dropped — confirm
		 * against dst core before touching this. */
		dst_free(new);
	}

	/* We own the caller's reference on dst_orig; release it. */
	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2760
David S. Miller9d6ec932011-03-12 01:12:47 -05002761struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002762 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002763{
David S. Miller9d6ec932011-03-12 01:12:47 -05002764 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002765
David S. Millerb23dd4f2011-03-02 14:31:35 -08002766 if (IS_ERR(rt))
2767 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002768
David S. Miller56157872011-05-02 14:37:45 -07002769 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002770 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2771 flowi4_to_flowi(flp4),
2772 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002773
David S. Millerb23dd4f2011-03-02 14:31:35 -08002774 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002775}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002776EXPORT_SYMBOL_GPL(ip_route_output_flow);
2777
/*
 * rt_fill_info - serialize one cached route into an RTM netlink message.
 *
 * Fills @skb (whose dst must already point at the rtable being dumped)
 * with an rtmsg header plus RTA_* attributes: keys, output device,
 * preferred source, gateway, metrics, mark, and the peer-derived
 * id/timestamp/PMTU-expiry cache info.
 *
 * Returns the nlmsg_end() length on success, -EMSGSIZE when the skb is
 * full (the partial message is cancelled), or 0 / a propagated error
 * from ipmr_get_route() for multicast input routes.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	/* NLA_PUT* macros jump to nla_put_failure when skb space runs out. */
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	/* Input routes report their spec_dst; output routes report the
	 * chosen source when it differs from the lookup key. */
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires)
			expires -= jiffies;	/* report time remaining */
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Forwarded multicast: ask the mroute code to fill in the
		 * real input interface; may defer (0) or fail. */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2881
/*
 * inet_rtm_getroute - RTM_GETROUTE handler: answer a userspace route query.
 *
 * Parses the request attributes (RTA_SRC/DST/IIF/OIF/MARK) and either
 * simulates packet *input* on the given interface (RTA_IIF present,
 * using a dummy ICMP header on a scratch skb) or performs a normal
 * *output* lookup, then replies to the requester with a unicast
 * RTM_NEWROUTE message built by rt_fill_info().
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		/* Input-path query: pretend the packet arrived on iif. */
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		/* NOTE(review): BHs are disabled around ip_route_input(),
		 * presumably to match the softirq context it normally runs
		 * in — confirm against the input path. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		/* Output-path query: ordinary route lookup. */
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2976
/*
 * ip_rt_dump - netlink dump callback: walk the route-cache hash table
 * and emit one RTM_NEWROUTE message per live entry in this namespace.
 *
 * Resumes from cb->args[0] (hash bucket) / cb->args[1] (index within
 * bucket) and stores the position back before returning, so a filled
 * skb can be continued by the next dump invocation.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* noref: the dst is only borrowed for the duration of
			 * rt_fill_info(), under rcu_read_lock_bh(). */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* skb full: record position and stop. */
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3018
/* A device's multicast configuration changed: flush the routing cache
 * for that device's namespace (delay 0 = immediate). */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3023
3024#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003025static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003026 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003027 size_t *lenp, loff_t *ppos)
3028{
3029 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003030 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003031 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003032 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003033
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003034 memcpy(&ctl, __ctl, sizeof(ctl));
3035 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003036 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003037
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003038 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003039 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003040 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003041 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003042
3043 return -EINVAL;
3044}
3045
/*
 * Global (not per-namespace) routing-cache tunables exposed under
 * /proc/sys/net/ipv4/route/.  All entries are plain ints; the
 * *_jiffies handlers convert user-visible seconds (or ms) to jiffies.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, but in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003156
/* Empty terminator-only table: makes "neigh" an initially empty directory. */
static struct ctl_table empty[1];

/*
 * Early-boot skeleton registered by ip_static_sysctl_init() so the
 * net.ipv4.route and net.ipv4.neigh directories exist before the real
 * per-subsystem tables come up.
 */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route", 
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh", 
	  .mode = 0555, .child = empty},
	{ }
};

/* sysctl path "net.ipv4" for the skeleton registration. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003173
/*
 * Per-namespace net.ipv4.route.flush entry: write-only trigger handled
 * by ipv4_sysctl_rtcache_flush().  ->data stays NULL (the handler
 * substitutes a scratch buffer) and ->extra1 receives the struct net
 * pointer at registration time.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

/* sysctl path "net.ipv4.route" under which the flush entry is placed. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3190
/*
 * Register the net.ipv4.route "flush" sysctl for a namespace.  The
 * template table is duplicated for non-initial namespaces so each copy
 * can carry its own ->extra1 back-pointer to the struct net; init_net
 * uses the static template directly.  Returns 0 or -ENOMEM.
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	/* Stash the namespace for ipv4_sysctl_rtcache_flush(). */
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	/* Only free what we duplicated, never the static template. */
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
3215
3216static __net_exit void sysctl_route_net_exit(struct net *net)
3217{
3218 struct ctl_table *tbl;
3219
3220 tbl = net->ipv4.route_hdr->ctl_table_arg;
3221 unregister_net_sysctl_table(net->ipv4.route_hdr);
3222 BUG_ON(tbl == ipv4_route_flush_table);
3223 kfree(tbl);
3224}
3225
/* Tie per-namespace route-sysctl setup/teardown into net-ns lifetime. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003230#endif
3231
/*
 * Seed the per-namespace route-cache generation id and the device
 * address generation id with random values when the namespace is
 * created.  Always succeeds.
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
3240
/* No ->exit: the random generation ids need no teardown. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3244
3245
Patrick McHardyc7066f72011-01-14 13:36:42 +01003246#ifdef CONFIG_IP_ROUTE_CLASSID
Tejun Heo7d720c32010-02-16 15:20:26 +00003247struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
Patrick McHardyc7066f72011-01-14 13:36:42 +01003248#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003249
3250static __initdata unsigned long rhash_entries;
3251static int __init set_rhash_entries(char *str)
3252{
3253 if (!str)
3254 return 0;
3255 rhash_entries = simple_strtoul(str, &str, 0);
3256 return 1;
3257}
3258__setup("rhash_entries=", set_rhash_entries);
3259
/*
 * ip_rt_init - boot-time initialization of the IPv4 routing subsystem:
 * per-CPU accounting, dst slab cache and entry counters, the
 * route-cache hash table, devinet/fib init, proc files, xfrm hookup,
 * the RTM_GETROUTE handler, and per-namespace sysctl/genid ops.
 * Panics on allocation failure since routing is essential.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* 256 accounting slots per CPU for route classification. */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole routes share the same slab as regular ones. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Hash table sized from rhash_entries= or total memory. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC threshold and cache size limit scale with the table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3315
Al Viroa1bc6eb2008-07-30 06:32:52 -04003316#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 *
 * Registers the net.ipv4.{route,neigh} sysctl skeleton early in boot,
 * before the real per-subsystem tables are available.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
Al Viroa1bc6eb2008-07-30 06:32:52 -04003325#endif