blob: 5c2847247f51ba059c3685b2e644cc8fbe4334e1 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090093#include <linux/slab.h>
Herbert Xu352e5122007-11-13 21:34:06 -080094#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020095#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070096#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700106#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700107#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
David Miller3769cff2011-07-11 22:44:24 +0000111#include <net/atmclip.h>
David S. Miller6e5714e2011-08-03 20:50:44 -0700112#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700113
/* Extract the route-lookup-relevant TOS bits (plus the RTO_ONLINK bit)
 * from a flowi4 key. */
#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

/* Upper bound for any MTU stored for a cached route. */
#define IP_MAX_MTU	0xFFF0

/* Default expiry used by the route-cache garbage collector. */
#define RT_GC_TIMEOUT (300*HZ)

/* Tunables (exported via sysctl elsewhere in this file); __read_mostly
 * keeps them off frequently-written cache lines. */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
/* Max chain length tolerated in one hash bucket before a rebuild is
 * considered. */
static int rt_chain_length_max __read_mostly	= 20;
/* Generation counter used to invalidate cached redirect information. */
static int redirect_genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136/*
137 * Interface to generic destination cache.
138 */
139
140static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800141static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
Steffen Klassertebb762f2011-11-23 02:12:51 +0000142static unsigned int ipv4_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700143static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145static void ipv4_link_failure(struct sk_buff *skb);
146static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800147static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148
/* ->ifdown hook for ipv4_dst_ops: intentionally a no-op.  Nothing
 * per-device needs tearing down here; cache invalidation is handled by
 * the generation-id mechanism instead. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153
/* ->cow_metrics hook: give this route a writable metrics array.
 *
 * The writable storage lives in the route's inet_peer entry.  The old
 * (read-only) metrics are copied into it, then dst->_metrics is switched
 * over with cmpxchg so that concurrent callers racing on the same dst
 * resolve to a single winner.
 *
 * Returns a pointer to the writable metrics, or NULL if no peer could be
 * bound or if the losing side of the race saw a still-read-only value.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);	/* may fail; checked below */

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		/* First user of this peer's metrics: seed from the old set. */
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			/* Lost the race: use whatever the winner installed,
			 * unless it is still marked read-only. */
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			/* Won the race: metrics no longer come from the
			 * fib_info, so drop our reference to it. */
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
188
David S. Millerd3aaeb32011-07-18 00:40:17 -0700189static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
190
/* Generic destination-cache operations for IPv4 routes. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
207
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Map the 4 TOS bits of an IP header to a traffic-control priority.
 * Entries alternate: even index = plain TOS value, odd index = same
 * class with the low (ECN/cost) bit set. */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
228
229
230/*
231 * Route cache.
232 */
233
234/* The locking scheme is rather straight forward:
235 *
236 * 1) Read-Copy Update protects the buckets of the central route hash.
237 * 2) Only writers remove entries, and they hold the lock
238 * as they look at rtable reference counts.
239 * 3) Only readers acquire references to rtable entries,
240 * they do so with atomic increments and with the
241 * lock held.
242 */
243
/* One bucket of the central route cache: an RCU-protected chain of
 * rtables (writers serialize on the shared spinlock table below). */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Map a bucket index to the spinlock it shares (power-of-two masking). */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialize the bucket spinlock table; called once during
 * boot, hence the panic() on allocation failure. */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP without lock debugging: bucket locking compiles away entirely. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700293
/* The central route-cache hash table, its mask (size - 1) and log2 size. */
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

/* Per-cpu cache statistics, shown via /proc/net/stat/rt_cache. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700300
/* Hash a route-cache key (dst, src, interface index) together with the
 * current generation id into a bucket index. */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
308
/* Current route-cache generation for this netns; entries with a stale
 * genid are treated as invalid. */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
313
Linus Torvalds1da177e2005-04-16 15:20:36 -0700314#ifdef CONFIG_PROC_FS
/* Iterator state for the /proc/net/rt_cache seq_file walk. */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;	/* current hash bucket; walked from rt_hash_mask down to 0 */
	int genid;	/* generation the walk started in; older entries skipped */
};
320
/* Find the first cache entry (for this netns and current genid),
 * scanning buckets from the top of the table downwards.
 *
 * On success, returns with rcu_read_lock_bh() held (released by
 * rt_cache_seq_stop() or when the walk advances off the entry). */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		/* Skip empty buckets without taking the RCU read lock. */
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;	/* RCU read lock stays held */
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;	/* NULL: no matching entry found */
}
341
/* Advance to the next cache entry after @r, crossing bucket boundaries
 * as needed (no netns/genid filtering here; see rt_cache_get_next()).
 *
 * Caller holds rcu_read_lock_bh(); it is dropped while searching for the
 * next non-empty bucket and re-taken before dereferencing its chain.
 * Returns NULL (with the RCU read lock released) when the table is
 * exhausted. */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
359
/* Advance to the next entry belonging to this seq_file's netns and the
 * generation captured at walk start; skips everything else. */
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
372
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900373static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700374{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900375 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700376
377 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900378 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379 --pos;
380 return pos ? NULL : r;
381}
382
/* seq_file ->start: emit the header token on the first call, otherwise
 * reposition to entry *pos - 1 (the -1 accounts for the header line).
 * The generation id is latched here so the whole walk sees one epoch. */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
391
/* seq_file ->next: step from the header token to the first entry, or
 * from one entry to the next. */
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}
403
/* seq_file ->stop: drop the RCU read lock taken by the get_first/next
 * helpers, unless the walk never reached a real entry. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
409
/* seq_file ->show: print the column header or one route-cache entry in
 * the historical /proc/net/rt_cache format.  Each line is padded to 127
 * characters (%n captures the length, the trailing printf pads). */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len;

		n = dst_get_neighbour(&r->dst);
		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,	/* HHRef: no longer tracked, kept for format compat */
			(n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
443
/* seq_file iterator for /proc/net/rt_cache. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
450
/* open() handler for /proc/net/rt_cache: per-netns seq_file with our
 * iterator state allocated alongside. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
456
/* File operations for /proc/net/rt_cache. */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
464
465
/* seq_file ->start for /proc/net/stat/rt_cache: position 0 is the
 * header token; positions >= 1 map to the (pos-1)'th possible CPU.
 * *pos is rewritten to cpu + 1 so ->next resumes after this CPU. */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
481
/* seq_file ->next: advance to the next possible CPU's statistics;
 * NULL terminates the walk. */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}
495
/* seq_file ->stop: nothing to release, the per-cpu data is static. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
500
/* seq_file ->show: print the header or one CPU's route-cache counters.
 * The first column is the global dst entry count, repeated per row. */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
534
/* seq_file iterator for /proc/net/stat/rt_cache. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
541
542
/* open() handler for /proc/net/stat/rt_cache (global stats, no netns
 * private state needed). */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
547
/* File operations for /proc/net/stat/rt_cache. */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
555
Patrick McHardyc7066f72011-01-14 13:36:42 +0100556#ifdef CONFIG_IP_ROUTE_CLASSID
/* Show handler for /proc/net/rt_acct: sum the per-cpu routing-class
 * accounting counters (256 realms) into a temporary array and emit it
 * as one binary record. */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800580
/* open() handler for /proc/net/rt_acct (single-shot seq_file). */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
585
/* File operations for /proc/net/rt_acct (CONFIG_IP_ROUTE_CLASSID only). */
static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800593#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800594
/* Per-netns proc setup: create /proc/net/rt_cache,
 * /proc/net/stat/rt_cache and (with classid support) /proc/net/rt_acct.
 * Uses goto-based unwind so earlier entries are removed on failure.
 * Returns 0 on success, -ENOMEM otherwise. */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800625
/* Per-netns proc teardown: mirror image of ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
634
/* Register the proc files above for every network namespace. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
639
/* Boot-time hookup of the per-netns proc operations. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
644
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800645#else
/* !CONFIG_PROC_FS stub: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700650#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900651
/* Free a cache entry after the current RCU-bh grace period, so readers
 * still traversing the chain never see freed memory. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
656
/* Drop our reference to @rt and schedule the RCU-deferred free. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
662
/* Kill broadcast/multicast entries very aggressively, if they
   collide in hash table with more useful entries.  Only input routes
   that are not alone in their bucket qualify. */
static inline int rt_fast_clean(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
670
/* An entry is worth keeping if it was learned via redirect, is being
 * watched (RTCF_NOTIFY), or carries live PMTU state on its peer. */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
676
677static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
678{
679 unsigned long age;
680 int ret = 0;
681
Changli Gaod8d1f302010-06-10 23:31:35 -0700682 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700683 goto out;
684
Changli Gaod8d1f302010-06-10 23:31:35 -0700685 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700686 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
687 (age <= tmo2 && rt_valuable(rth)))
688 goto out;
689 ret = 1;
690out: return ret;
691}
692
693/* Bits of score are:
694 * 31: very valuable
695 * 30: not quite useless
696 * 29..0: usage counter
697 */
698static inline u32 rt_score(struct rtable *rt)
699{
Changli Gaod8d1f302010-06-10 23:31:35 -0700700 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700701
702 score = ~score & ~(3<<30);
703
704 if (rt_valuable(rt))
705 score |= (1<<31);
706
David S. Millerc7537962010-11-11 17:07:48 -0800707 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700708 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
709 score |= (1<<30);
710
711 return score;
712}
713
Neil Horman1080d702008-10-27 12:28:25 -0700714static inline bool rt_caching(const struct net *net)
715{
716 return net->ipv4.current_rt_cache_rebuild_count <=
717 net->ipv4.sysctl_rt_cache_rebuild_count;
718}
719
David S. Miller5e2b61f2011-03-04 21:47:09 -0800720static inline bool compare_hash_inputs(const struct rtable *rt1,
721 const struct rtable *rt2)
Neil Horman1080d702008-10-27 12:28:25 -0700722{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800723 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
724 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000725 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
Neil Horman1080d702008-10-27 12:28:25 -0700726}
727
David S. Miller5e2b61f2011-03-04 21:47:09 -0800728static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700729{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800730 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
731 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
732 (rt1->rt_mark ^ rt2->rt_mark) |
David S. Miller475949d2011-05-03 19:45:15 -0700733 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
Julian Anastasovd547f722011-08-07 22:20:20 -0700734 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000735 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700736}
737
/* True when both cache entries' devices belong to the same network
 * namespace; keys alone are not unique across namespaces. */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
742
/* An entry whose generation id no longer matches its namespace's current
 * rt_genid was invalidated by rt_cache_invalidate() and must be dropped. */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
747
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		/* Be nice to other tasks when walking many buckets in
		 * process context. */
		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		/* Unlink matching entries onto a private list under the
		 * bucket lock; free them afterwards outside the lock. */
		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			/* net == NULL means flush every namespace. */
			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* rt_free() defers actual freeing past an RCU grace period,
		 * so concurrent lock-free readers stay safe. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
798
Neil Horman1080d702008-10-27 12:28:25 -0700799/*
800 * While freeing expired entries, we compute average chain length
801 * and standard deviation, using fixed-point arithmetic.
802 * This to have an estimation of rt_chain_length_max
803 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
805 */
806
807#define FRACT_BITS 3
808#define ONE (1UL << FRACT_BITS)
809
Eric Dumazet98376382010-03-08 03:20:00 +0000810/*
811 * Given a hash chain and an item in this hash chain,
812 * find if a previous entry has the same hash_inputs
813 * (but differs on tos, mark or oif)
814 * Returns 0 if an alias is found.
815 * Returns ONE if rth has no alias before itself.
816 */
817static int has_noalias(const struct rtable *head, const struct rtable *rth)
818{
819 const struct rtable *aux = head;
820
821 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800822 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000823 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000824 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000825 }
826 return ONE;
827}
828
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	/* Bump the namespace generation by a random 1..256; stale entries
	 * then fail the rt_is_expired() check and are reaped lazily. */
	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	/* Also invalidate any ICMP-redirect state learned on inet_peers. */
	redirect_genid++;
}
843
/*
 * delay < 0 : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay < 0)
		return;
	/* Synchronous flush; allowed to reschedule outside softirq. */
	rt_do_flush(net, !in_softirq());
}
854
/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	/* Walk the whole hash table and free entries for this namespace
	 * (all namespaces when net is NULL); may resched outside softirq. */
	rt_do_flush(net, !in_softirq());
}
860
/* Hash chains grew past rt_chain_length_max (possible hash-collision
 * attack or undersized table): invalidate the whole cache so it is
 * rebuilt under a fresh generation id. */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
867
/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	/* Persistent GC state: current expiration strength, time of the
	 * last pass, round-robin bucket cursor and the target cache size. */
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Use the accurate (slower) counter once we have decided to work. */
	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* Round-robin bucket scan resuming where the previous GC
		 * pass stopped (rover). */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Each survivor halves tmo, so deep
					 * chain tails expire more easily. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Relax expiration strength again after a successful pass. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
1000
Eric Dumazet98376382010-03-08 03:20:00 +00001001/*
1002 * Returns number of entries in a hash chain that have different hash_inputs
1003 */
1004static int slow_chain_length(const struct rtable *head)
1005{
1006 int length = 0;
1007 const struct rtable *rth = head;
1008
1009 while (rth) {
1010 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001011 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001012 }
1013 return length >> FRACT_BITS;
1014}
1015
David S. Millerd3aaeb32011-07-18 00:40:17 -07001016static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
David Miller3769cff2011-07-11 22:44:24 +00001017{
David Miller3769cff2011-07-11 22:44:24 +00001018 struct neigh_table *tbl = &arp_tbl;
David S. Millerd3aaeb32011-07-18 00:40:17 -07001019 static const __be32 inaddr_any = 0;
1020 struct net_device *dev = dst->dev;
1021 const __be32 *pkey = daddr;
David Miller3769cff2011-07-11 22:44:24 +00001022 struct neighbour *n;
1023
1024#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1025 if (dev->type == ARPHRD_ATM)
1026 tbl = clip_tbl_hook;
1027#endif
David Miller3769cff2011-07-11 22:44:24 +00001028 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
David S. Millerd3aaeb32011-07-18 00:40:17 -07001029 pkey = &inaddr_any;
1030
1031 n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1032 if (n)
1033 return n;
1034 return neigh_create(tbl, pkey, dev);
1035}
1036
1037static int rt_bind_neighbour(struct rtable *rt)
1038{
1039 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
David Miller3769cff2011-07-11 22:44:24 +00001040 if (IS_ERR(n))
1041 return PTR_ERR(n);
David S. Miller69cce1d2011-07-17 23:09:49 -07001042 dst_set_neighbour(&rt->dst, n);
David Miller3769cff2011-07-11 22:44:24 +00001043
1044 return 0;
1045}
1046
/* Insert rt into the routing cache bucket 'hash' (or return an existing
 * equivalent entry).  On success the returned rtable (which may be rt
 * itself or a pre-existing match) is also attached to skb when given;
 * on failure an ERR_PTR is returned and rt is dropped. */
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		/* Reap entries invalidated by a generation bump as we go. */
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Duplicate insert: keep the existing entry. */
			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		/* Track the lowest-scoring unreferenced entry as an
		 * eviction candidate in case the chain is too long. */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable candidate and the chain is overlong: force
		 * an emergency rebuild and retry with the new genid. */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/* Temporarily make GC maximally aggressive. */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1226
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

/* Global generation counter for inet_peer-derived metadata (PMTU,
 * redirects); cached routes snapshot it to notice later updates. */
static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
1233
/* Bind the inet_peer for daddr to rt.  'create' is passed through to
 * inet_getpeer_v4(), so the peer is only allocated when requested. */
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	/* cmpxchg guards against a concurrent binder; the loser drops its
	 * reference, the winner records the current peer generation. */
	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
1245
/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	/* Mix the previous id with the destination address so successive
	 * ids are spread across flows; the lock serializes updaters. */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1265
/* Fill in iph->id: prefer the per-destination counter kept on the route's
 * inet_peer, falling back to the global generator when no peer exists. */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	/* No peer available (OOM or NULL dst): use the global fallback. */
	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001288
/* Remove rt from hash bucket 'hash', dropping the caller's reference.
 * Expired entries met while walking the chain are reaped opportunistically. */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			/* Unlink and defer the free past an RCU grace period. */
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1308
/* Apply a gateway learned via ICMP redirect (stored on the peer) to this
 * route.  Returns 0 on success, -EAGAIN if the new gateway's neighbour is
 * not yet valid (resolution is kicked and the old gateway restored), or a
 * negative error from the neighbour lookup. */
static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	/* Tentatively switch to the redirect-learned gateway. */
	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!n || !(n->nud_state & NUD_VALID)) {
		/* New gateway not resolved yet: start resolution and keep
		 * using the original gateway for now. */
		if (n)
			neigh_event_send(n, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
	return 0;
}
1336
/* called in rcu_read_lock() section */
/* Handle an incoming ICMP redirect: validate the advertised gateway, then
 * update every matching cached route (and its inet_peer) to use new_gw. */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject obviously bogus or administratively forbidden redirects. */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	/* Cached routes may have been created with wildcarded source or
	 * output interface, so try all {saddr,0} x {ifindex,0} keys. */
	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					/* Record the new gateway on the peer and
					 * bump generations so other routes that
					 * share this peer re-validate too. */
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			" Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
1419
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001420static bool peer_pmtu_expired(struct inet_peer *peer)
1421{
1422 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1423
1424 return orig &&
1425 time_after_eq(jiffies, orig) &&
1426 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1427}
1428
1429static bool peer_pmtu_cleaned(struct inet_peer *peer)
1430{
1431 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1432
1433 return orig &&
1434 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1435}
1436
Linus Torvalds1da177e2005-04-16 15:20:36 -07001437static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1438{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001439 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001440 struct dst_entry *ret = dst;
1441
1442 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001443 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001444 ip_rt_put(rt);
1445 ret = NULL;
David S. Miller2c8cec52011-02-09 20:42:07 -08001446 } else if (rt->rt_flags & RTCF_REDIRECTED) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08001447 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1448 rt->rt_oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001449 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001450 rt_del(hash, rt);
1451 ret = NULL;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001452 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1453 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454 }
1455 }
1456 return ret;
1457}
1458
1459/*
1460 * Algorithm:
1461 * 1. The first ip_rt_redirect_number redirects are sent
1462 * with exponential backoff, then we stop sending them at all,
1463 * assuming that the host ignores our redirects.
1464 * 2. If we did not see packets requiring redirects
1465 * during ip_rt_redirect_silence, we assume that the host
1466 * forgot redirected route and start to send redirects again.
1467 *
1468 * This algorithm is much cheaper and more intelligent than dumb load limiting
1469 * in icmp.c.
1470 *
1471 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1472 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1473 */
1474
/*
 * Possibly send an ICMP host redirect back to the sender of @skb.
 * Rate limiting state (rate_tokens/rate_last) lives in the inet_peer
 * bound to the route, so the backoff described in the comment above is
 * applied per destination.  Without a peer entry a single redirect is
 * sent unconditionally.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	/* Only the in_dev lookup needs RCU protection; copy out the one
	 * flag used later so the lock can be dropped early. */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		/* No peer entry: cannot rate limit, send one and stop. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.  The delay doubles with each redirect sent
	 * (exponential backoff via the shift below).
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log once, exactly when the give-up threshold is hit. */
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1533
/*
 * dst->input handler for routes whose lookup resulted in an error
 * (rt->dst.error set).  Maps the error to an ICMP_DEST_UNREACH code and
 * sends it, rate limited by a token bucket kept in the route's
 * inet_peer.  Always consumes @skb; returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		/* No ICMP for these: just drop the packet. */
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	/* Token bucket: tokens accrue with elapsed jiffies, capped at
	 * ip_rt_error_burst; each ICMP sent costs ip_rt_error_cost.
	 * Without a peer entry we send unconditionally. */
	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001581
1582/*
1583 * The last two values are not from the RFC but
1584 * are needed for AMPRnet AX.25 paths.
1585 */
1586
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001587static const unsigned short mtu_plateau[] =
Linus Torvalds1da177e2005-04-16 15:20:36 -07001588{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1589
Stephen Hemminger5969f712008-04-10 01:52:09 -07001590static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001591{
1592 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001593
Linus Torvalds1da177e2005-04-16 15:20:36 -07001594 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1595 if (old_mtu > mtu_plateau[i])
1596 return mtu_plateau[i];
1597 return 68;
1598}
1599
Eric Dumazetb71d1d42011-04-22 04:53:02 +00001600unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001601 unsigned short new_mtu,
1602 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001603{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001604 unsigned short old_mtu = ntohs(iph->tot_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001605 unsigned short est_mtu = 0;
David S. Miller2c8cec52011-02-09 20:42:07 -08001606 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001607
David S. Miller2c8cec52011-02-09 20:42:07 -08001608 peer = inet_getpeer_v4(iph->daddr, 1);
1609 if (peer) {
1610 unsigned short mtu = new_mtu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001611
David S. Miller2c8cec52011-02-09 20:42:07 -08001612 if (new_mtu < 68 || new_mtu >= old_mtu) {
1613 /* BSD 4.2 derived systems incorrectly adjust
1614 * tot_len by the IP header length, and report
1615 * a zero MTU in the ICMP message.
1616 */
1617 if (mtu == 0 &&
1618 old_mtu >= 68 + (iph->ihl << 2))
1619 old_mtu -= iph->ihl << 2;
1620 mtu = guess_mtu(old_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001621 }
David S. Miller2c8cec52011-02-09 20:42:07 -08001622
1623 if (mtu < ip_rt_min_pmtu)
1624 mtu = ip_rt_min_pmtu;
1625 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001626 unsigned long pmtu_expires;
1627
1628 pmtu_expires = jiffies + ip_rt_mtu_expires;
1629 if (!pmtu_expires)
1630 pmtu_expires = 1UL;
1631
David S. Miller2c8cec52011-02-09 20:42:07 -08001632 est_mtu = mtu;
1633 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001634 peer->pmtu_expires = pmtu_expires;
Gao feng59445b62011-10-19 15:34:09 +00001635 atomic_inc(&__rt_peer_genid);
David S. Miller2c8cec52011-02-09 20:42:07 -08001636 }
1637
1638 inet_putpeer(peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001639 }
1640 return est_mtu ? : new_mtu;
1641}
1642
David S. Miller2c8cec52011-02-09 20:42:07 -08001643static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1644{
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001645 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
David S. Miller2c8cec52011-02-09 20:42:07 -08001646
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001647 if (!expires)
1648 return;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001649 if (time_before(jiffies, expires)) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001650 u32 orig_dst_mtu = dst_mtu(dst);
1651 if (peer->pmtu_learned < orig_dst_mtu) {
1652 if (!peer->pmtu_orig)
1653 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1654 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1655 }
1656 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1657 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1658}
1659
Linus Torvalds1da177e2005-04-16 15:20:36 -07001660static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1661{
David S. Miller2c8cec52011-02-09 20:42:07 -08001662 struct rtable *rt = (struct rtable *) dst;
1663 struct inet_peer *peer;
1664
1665 dst_confirm(dst);
1666
1667 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001668 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller2c8cec52011-02-09 20:42:07 -08001669 peer = rt->peer;
1670 if (peer) {
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001671 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1672
David S. Miller2c8cec52011-02-09 20:42:07 -08001673 if (mtu < ip_rt_min_pmtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001674 mtu = ip_rt_min_pmtu;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001675 if (!pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001676
1677 pmtu_expires = jiffies + ip_rt_mtu_expires;
1678 if (!pmtu_expires)
1679 pmtu_expires = 1UL;
1680
David S. Miller2c8cec52011-02-09 20:42:07 -08001681 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001682 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001683
1684 atomic_inc(&__rt_peer_genid);
1685 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001686 }
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001687 check_peer_pmtu(dst, peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001688 }
1689}
1690
David S. Millerf39925d2011-02-09 22:00:16 -08001691
Linus Torvalds1da177e2005-04-16 15:20:36 -07001692static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1693{
David S. Miller6431cbc2011-02-07 20:38:06 -08001694 struct rtable *rt = (struct rtable *) dst;
1695
1696 if (rt_is_expired(rt))
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001697 return NULL;
David S. Miller6431cbc2011-02-07 20:38:06 -08001698 if (rt->rt_peer_genid != rt_peer_genid()) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001699 struct inet_peer *peer;
1700
David S. Miller6431cbc2011-02-07 20:38:06 -08001701 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001702 rt_bind_peer(rt, rt->rt_dst, 0);
David S. Miller6431cbc2011-02-07 20:38:06 -08001703
David S. Miller2c8cec52011-02-09 20:42:07 -08001704 peer = rt->peer;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001705 if (peer) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001706 check_peer_pmtu(dst, peer);
1707
Eric Dumazetde68dca2011-11-26 12:13:44 +00001708 if (peer->redirect_genid != redirect_genid)
1709 peer->redirect_learned.a4 = 0;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001710 if (peer->redirect_learned.a4 &&
1711 peer->redirect_learned.a4 != rt->rt_gateway) {
1712 if (check_peer_redir(dst, peer))
1713 return NULL;
1714 }
David S. Millerf39925d2011-02-09 22:00:16 -08001715 }
1716
David S. Miller6431cbc2011-02-07 20:38:06 -08001717 rt->rt_peer_genid = rt_peer_genid();
1718 }
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001719 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720}
1721
1722static void ipv4_dst_destroy(struct dst_entry *dst)
1723{
1724 struct rtable *rt = (struct rtable *) dst;
1725 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001726
David S. Miller62fa8a82011-01-26 20:51:05 -08001727 if (rt->fi) {
1728 fib_info_put(rt->fi);
1729 rt->fi = NULL;
1730 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731 if (peer) {
1732 rt->peer = NULL;
1733 inet_putpeer(peer);
1734 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001735}
1736
Linus Torvalds1da177e2005-04-16 15:20:36 -07001737
1738static void ipv4_link_failure(struct sk_buff *skb)
1739{
1740 struct rtable *rt;
1741
1742 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1743
Eric Dumazet511c3f92009-06-02 05:14:27 +00001744 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001745 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1746 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001747}
1748
1749static int ip_rt_bug(struct sk_buff *skb)
1750{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001751 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1752 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001753 skb->dev ? skb->dev->name : "?");
1754 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001755 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001756 return 0;
1757}
1758
1759/*
1760 We do not cache source address of outgoing interface,
1761 because it is used only by IP RR, TS and SRR options,
   so that it stays out of the fast path.
1763
1764 BTW remember: "addr" is allowed to be not aligned
1765 in IP options!
1766 */
1767
David S. Miller8e363602011-05-13 17:29:41 -04001768void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001769{
Al Viroa61ced52006-09-26 21:27:54 -07001770 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001771
David S. Millerc7537962010-11-11 17:07:48 -08001772 if (rt_is_output_route(rt))
David S. Millerc5be24f2011-05-13 18:01:21 -04001773 src = ip_hdr(skb)->saddr;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001774 else {
David S. Miller8e363602011-05-13 17:29:41 -04001775 struct fib_result res;
1776 struct flowi4 fl4;
1777 struct iphdr *iph;
1778
1779 iph = ip_hdr(skb);
1780
1781 memset(&fl4, 0, sizeof(fl4));
1782 fl4.daddr = iph->daddr;
1783 fl4.saddr = iph->saddr;
Julian Anastasovb0fe4a32011-07-23 02:00:41 +00001784 fl4.flowi4_tos = RT_TOS(iph->tos);
David S. Miller8e363602011-05-13 17:29:41 -04001785 fl4.flowi4_oif = rt->dst.dev->ifindex;
1786 fl4.flowi4_iif = skb->dev->ifindex;
1787 fl4.flowi4_mark = skb->mark;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001788
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001789 rcu_read_lock();
David S. Miller68a5e3d2011-03-11 20:07:33 -05001790 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
David S. Miller436c3b62011-03-24 17:42:21 -07001791 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001792 else
1793 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001794 RT_SCOPE_UNIVERSE);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001795 rcu_read_unlock();
1796 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001797 memcpy(addr, &src, 4);
1798}
1799
Patrick McHardyc7066f72011-01-14 13:36:42 +01001800#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001801static void set_class_tag(struct rtable *rt, u32 tag)
1802{
Changli Gaod8d1f302010-06-10 23:31:35 -07001803 if (!(rt->dst.tclassid & 0xFFFF))
1804 rt->dst.tclassid |= tag & 0xFFFF;
1805 if (!(rt->dst.tclassid & 0xFFFF0000))
1806 rt->dst.tclassid |= tag & 0xFFFF0000;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001807}
1808#endif
1809
David S. Miller0dbaee32010-12-13 12:52:14 -08001810static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1811{
1812 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1813
1814 if (advmss == 0) {
1815 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1816 ip_rt_min_advmss);
1817 if (advmss > 65535 - 40)
1818 advmss = 65535 - 40;
1819 }
1820 return advmss;
1821}
1822
Steffen Klassertebb762f2011-11-23 02:12:51 +00001823static unsigned int ipv4_mtu(const struct dst_entry *dst)
David S. Millerd33e4552010-12-14 13:01:14 -08001824{
Steffen Klassert261663b2011-11-23 02:14:50 +00001825 const struct rtable *rt = (const struct rtable *) dst;
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001826 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1827
Steffen Klassert261663b2011-11-23 02:14:50 +00001828 if (mtu && rt_is_output_route(rt))
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001829 return mtu;
1830
1831 mtu = dst->dev->mtu;
David S. Millerd33e4552010-12-14 13:01:14 -08001832
1833 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
David S. Millerd33e4552010-12-14 13:01:14 -08001834
1835 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1836 mtu = 576;
1837 }
1838
1839 if (mtu > IP_MAX_MTU)
1840 mtu = IP_MAX_MTU;
1841
1842 return mtu;
1843}
1844
David S. Miller813b3b52011-04-28 14:48:42 -07001845static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001846 struct fib_info *fi)
David S. Millera4daad62011-01-27 22:01:53 -08001847{
David S. Miller0131ba452011-02-04 14:37:30 -08001848 struct inet_peer *peer;
1849 int create = 0;
1850
1851 /* If a peer entry exists for this destination, we must hook
1852 * it up in order to get at cached metrics.
1853 */
David S. Miller813b3b52011-04-28 14:48:42 -07001854 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
David S. Miller0131ba452011-02-04 14:37:30 -08001855 create = 1;
1856
David S. Miller3c0afdc2011-03-04 21:26:07 -08001857 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
David S. Miller0131ba452011-02-04 14:37:30 -08001858 if (peer) {
David S. Miller3c0afdc2011-03-04 21:26:07 -08001859 rt->rt_peer_genid = rt_peer_genid();
David S. Miller0131ba452011-02-04 14:37:30 -08001860 if (inet_metrics_new(peer))
1861 memcpy(peer->metrics, fi->fib_metrics,
1862 sizeof(u32) * RTAX_MAX);
1863 dst_init_metrics(&rt->dst, peer->metrics, false);
David S. Miller2c8cec52011-02-09 20:42:07 -08001864
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001865 check_peer_pmtu(&rt->dst, peer);
Eric Dumazetde68dca2011-11-26 12:13:44 +00001866 if (peer->redirect_genid != redirect_genid)
1867 peer->redirect_learned.a4 = 0;
David S. Millerf39925d2011-02-09 22:00:16 -08001868 if (peer->redirect_learned.a4 &&
1869 peer->redirect_learned.a4 != rt->rt_gateway) {
1870 rt->rt_gateway = peer->redirect_learned.a4;
1871 rt->rt_flags |= RTCF_REDIRECTED;
1872 }
David S. Miller0131ba452011-02-04 14:37:30 -08001873 } else {
David S. Millerb8dad612011-01-28 14:07:16 -08001874 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1875 rt->fi = fi;
1876 atomic_inc(&fi->fib_clntref);
1877 }
David S. Millera4daad62011-01-27 22:01:53 -08001878 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
David S. Millera4daad62011-01-27 22:01:53 -08001879 }
1880}
1881
David S. Miller813b3b52011-04-28 14:48:42 -07001882static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001883 const struct fib_result *res,
David S. Miller982721f2011-02-16 21:44:24 -08001884 struct fib_info *fi, u16 type, u32 itag)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001885{
David S. Millerdefb3512010-12-08 21:16:57 -08001886 struct dst_entry *dst = &rt->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001887
1888 if (fi) {
1889 if (FIB_RES_GW(*res) &&
1890 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1891 rt->rt_gateway = FIB_RES_GW(*res);
David S. Miller813b3b52011-04-28 14:48:42 -07001892 rt_init_metrics(rt, fl4, fi);
Patrick McHardyc7066f72011-01-14 13:36:42 +01001893#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Millerdefb3512010-12-08 21:16:57 -08001894 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001895#endif
David S. Millerd33e4552010-12-14 13:01:14 -08001896 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001897
David S. Millerdefb3512010-12-08 21:16:57 -08001898 if (dst_mtu(dst) > IP_MAX_MTU)
1899 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
David S. Miller0dbaee32010-12-13 12:52:14 -08001900 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
David S. Millerdefb3512010-12-08 21:16:57 -08001901 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001902
Patrick McHardyc7066f72011-01-14 13:36:42 +01001903#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001904#ifdef CONFIG_IP_MULTIPLE_TABLES
1905 set_class_tag(rt, fib_rules_tclass(res));
1906#endif
1907 set_class_tag(rt, itag);
1908#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001909}
1910
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001911static struct rtable *rt_dst_alloc(struct net_device *dev,
1912 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08001913{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001914 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1915 DST_HOST |
1916 (nopolicy ? DST_NOPOLICY : 0) |
1917 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08001918}
1919
Eric Dumazet96d36222010-06-02 19:21:31 +00001920/* called in rcu_read_lock() section */
Al Viro9e12bb22006-09-26 21:25:20 -07001921static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001922 u8 tos, struct net_device *dev, int our)
1923{
Eric Dumazet96d36222010-06-02 19:21:31 +00001924 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001925 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001926 __be32 spec_dst;
Eric Dumazet96d36222010-06-02 19:21:31 +00001927 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001928 u32 itag = 0;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001929 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001930
1931 /* Primary sanity checks. */
1932
1933 if (in_dev == NULL)
1934 return -EINVAL;
1935
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001936 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001937 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001938 goto e_inval;
1939
Joe Perchesf97c1e02007-12-16 13:45:43 -08001940 if (ipv4_is_zeronet(saddr)) {
1941 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001942 goto e_inval;
1943 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001944 } else {
Michael Smith5c04c812011-04-07 04:51:50 +00001945 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1946 &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001947 if (err < 0)
1948 goto e_err;
1949 }
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001950 rth = rt_dst_alloc(init_net.loopback_dev,
1951 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001952 if (!rth)
1953 goto e_nobufs;
1954
Patrick McHardyc7066f72011-01-14 13:36:42 +01001955#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07001956 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001957#endif
David S. Millercf911662011-04-28 14:31:47 -07001958 rth->dst.output = ip_rt_bug;
1959
1960 rth->rt_key_dst = daddr;
1961 rth->rt_key_src = saddr;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001962 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001963 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001964 rth->rt_type = RTN_MULTICAST;
David S. Miller475949d2011-05-03 19:45:15 -07001965 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07001966 rth->rt_dst = daddr;
1967 rth->rt_src = saddr;
1968 rth->rt_route_iif = dev->ifindex;
1969 rth->rt_iif = dev->ifindex;
1970 rth->rt_oif = 0;
1971 rth->rt_mark = skb->mark;
1972 rth->rt_gateway = daddr;
1973 rth->rt_spec_dst= spec_dst;
1974 rth->rt_peer_genid = 0;
1975 rth->peer = NULL;
1976 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001977 if (our) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001978 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001979 rth->rt_flags |= RTCF_LOCAL;
1980 }
1981
1982#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001983 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Changli Gaod8d1f302010-06-10 23:31:35 -07001984 rth->dst.input = ip_mr_input;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001985#endif
1986 RT_CACHE_STAT_INC(in_slow_mc);
1987
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001988 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
David S. Millerb23dd4f2011-03-02 14:31:35 -08001989 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
Eric Dumazet9aa3c942011-06-18 11:59:18 -07001990 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001991
1992e_nobufs:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001993 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001994e_inval:
Eric Dumazet96d36222010-06-02 19:21:31 +00001995 return -EINVAL;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001996e_err:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001997 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998}
1999
2000
2001static void ip_handle_martian_source(struct net_device *dev,
2002 struct in_device *in_dev,
2003 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07002004 __be32 daddr,
2005 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002006{
2007 RT_CACHE_STAT_INC(in_martian_src);
2008#ifdef CONFIG_IP_ROUTE_VERBOSE
2009 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2010 /*
2011 * RFC1812 recommendation, if source is martian,
2012 * the only hint is MAC header.
2013 */
Harvey Harrison673d57e2008-10-31 00:53:57 -07002014 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2015 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07002016 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002017 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07002018 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002019 printk(KERN_WARNING "ll header: ");
2020 for (i = 0; i < dev->hard_header_len; i++, p++) {
2021 printk("%02x", *p);
2022 if (i < (dev->hard_header_len - 1))
2023 printk(":");
2024 }
2025 printk("\n");
2026 }
2027 }
2028#endif
2029}
2030
/*
 * __mkroute_input - build a route cache entry for a packet being forwarded.
 *
 * Called in rcu_read_lock() section.
 *
 * Validates the source address against the FIB, then allocates and fills
 * an rtable whose dst.input is ip_forward and dst.output is ip_output.
 * On success returns 0 and stores the new entry in *result; otherwise
 * returns a negative errno and *result is left untouched.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		/* A unicast FIB result should always have an output device
		 * with IP enabled; hitting this path indicates a kernel bug.
		 */
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	/* err < 0: martian source; err > 0: source is directly reachable
	 * on the input interface (no gateway), see RTCF_DIRECTSRC below.
	 */
	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would go back out the interface it came in on and the
	 * sender is on-link: mark the route so an ICMP redirect is sent.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	/* rt_key_* fields are the route cache lookup key; rt_dst/rt_src
	 * mirror the packet addresses for a forwarded (non-NATed) flow.
	 */
	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif	= in_dev->dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	/* May replace rt_gateway with the FIB nexthop and attach metrics. */
	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002123
/*
 * ip_mkroute_input - create an input (forwarding) route and insert it
 * into the route cache.
 *
 * On success the new rtable has been handed to rt_intern_hash(), which
 * also attaches it to @skb; returns 0, or a negative errno on failure.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* Pick one nexthop when the matched route has several. */
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache; rt_intern_hash() consumes rth and returns
	 * either the inserted/existing entry or an ERR_PTR.
	 */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
2152
/*
 *	NOTE. We drop all the packets that has local source
 *	addresses, because every properly looped back packet
 *	must have correct destination already attached by output routine.
 *
 *	Such approach solves two big problems:
 *	1. Not simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% of guarantee.
 *	called with rcu_read_lock()
 */

/*
 * ip_route_input_slow - slow-path input route resolution (cache miss).
 *
 * Classifies the packet (martian / broadcast / local / forward) and builds
 * the corresponding route cache entry.  Forwarded packets go through
 * ip_mkroute_input(); broadcast and local packets get a loopback-device
 * rtable built here (local_input label).  Returns 0 on success or a
 * negative errno.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	/* 255.255.255.255 and the all-zero src/dst pair (e.g. DHCP
	 * discover) are limited broadcast.
	 */
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Local delivery (and the cached-unreachable case below): the
	 * route's device is the loopback device of this netns.
	 */
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;	/* local input routes must never be output */
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		/* Cache the failure: dst.error carries the positive errno
		 * (err is negative here, set at no_route).
		 */
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif
	/* fall through: martian destinations report -EHOSTUNREACH */

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
	/* fall through: keep err already set by fib_validate_source() callers */
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2344
/*
 * ip_route_input_common - input route lookup: route cache first, then
 * the slow path.
 *
 * @noref: when true, attach the cached dst to @skb without taking a
 * reference (dst_use_noref/skb_dst_set_noref) - caller must stay within
 * an RCU read-side section for the skb's lifetime in that mode.
 *
 * Returns 0 on success (skb->dst set) or a negative errno.
 */
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			   u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	/* Lockless walk of the hash chain under RCU.  The key compare
	 * ORs the XORs of all key fields so a match is a single branch.
	 */
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			/* our != 0: we are a member of this group on @dev */
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002426
/* called with rcu_read_lock() */
/*
 * __mkroute_output - build a route cache entry for locally generated
 * (output) traffic.
 *
 * @orig_daddr/@orig_saddr/@orig_oif are the caller's pre-resolution flow
 * key values and become the cache key (rt_key_*), while fl4 holds the
 * resolved addresses.  Returns the new rtable or an ERR_PTR.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	u32 tos = RT_FL_TOS(fl4);
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	/* Loopback source addresses may only leave via loopback. */
	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;	/* broadcast is never gatewayed */
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Drop RTCF_LOCAL if we are not a member of the group. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;	/* output routes have no input interface */
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark	= fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		/* Locally-delivered bcast/mcast leaving a real device
		 * needs the multicast output path (copies to loopback).
		 */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	/* Attach nexthop/metrics from the FIB result (fi may be NULL). */
	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
2527
/*
 * Major route resolver routine.
 *
 * Resolves an output flow (@fl4) to a device and route cache entry.
 * Mutates @fl4 in place (fills in saddr/daddr/oif as they are resolved);
 * the ORIGINAL daddr/saddr/oif are preserved first and used as the cache
 * key.  Returns the rtable or an ERR_PTR.
 *
 * NOTE(review): the old header said "called with rcu_read_lock()", but the
 * function takes rcu_read_lock() itself below - confirm callers' locking.
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	u32 tos	= RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	/* Preserve the caller's flow key before we start rewriting fl4. */
	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	/* RTO_ONLINK in tos restricts the lookup to on-link scope. */
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		/* A specified source must be a unicast host address. */
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		/* Link-scope multicast / limited broadcast bypass the FIB. */
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		/* No destination at all: route to ourselves via loopback. */
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	/* Default route selection only when no more specific route won,
	 * the table has several defaults, and oif is unconstrained.
	 */
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		/* The cache key uses the ORIGINAL flow values, matching
		 * what __ip_route_output_key hashes on lookup.
		 */
		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
2725
/*
 * Resolve an IPv4 output route for the flow described by @flp4.
 *
 * Fast path: hash (daddr, saddr, oif) and walk the matching routing
 * cache chain under rcu_read_lock_bh().  A hit must match the full
 * flow key (addresses, oif, mark, the routable TOS bits), be an
 * output route, belong to the same netns and not be expired.  On a
 * hit the cached source/destination are copied back into @flp4 when
 * the caller left them unset (wildcard lookup).
 *
 * Slow path (caching disabled or cache miss): fall through to
 * ip_route_output_slow(), which consults the FIB.
 *
 * Returns a referenced rtable or an ERR_PTR() on failure.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			/* Cache hit: take a reference, bump stats. */
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			/* Fill in addresses the caller left as wildcards. */
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2765
/*
 * ->check() hook for blackhole dsts: always return NULL.
 * NOTE(review): per the dst_ops ->check() convention this means the
 * entry is never revalidated as a reusable cache entry — confirm
 * against the generic dst code.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2770
Steffen Klassertebb762f2011-11-23 02:12:51 +00002771static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002772{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002773 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2774
2775 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002776}
2777
/* PMTU updates are deliberately ignored on a blackhole dst. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2781
/*
 * ->cow_metrics() hook for blackhole dsts: never hand out a writable
 * metrics array (returns NULL), so the metrics stay read-only.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2787
/*
 * dst_ops for blackhole routes created by ipv4_blackhole_route():
 * mostly no-op hooks, since these dsts discard traffic and take no
 * part in the routing cache.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family = AF_INET,
	.protocol = cpu_to_be16(ETH_P_IP),
	.destroy = ipv4_dst_destroy,
	.check = ipv4_blackhole_dst_check,
	.mtu = ipv4_blackhole_mtu,
	.default_advmss = ipv4_default_advmss,
	.update_pmtu = ipv4_rt_blackhole_update_pmtu,
	.cow_metrics = ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup = ipv4_neigh_lookup,
};
2799
/*
 * Clone @dst_orig into a "blackhole" rtable: an entry that carries the
 * same routing decision (flow key, gateway, flags, metrics, peer) but
 * whose input and output handlers simply discard packets.  Consumes a
 * reference on @dst_orig.
 *
 * Returns the new dst, or ERR_PTR(-ENOMEM) if allocation failed.
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Both directions drop packets on the floor. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the cached flow key from the original route. */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		/* ...and the routing result itself. */
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Share the inet_peer and fib_info, taking references. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2846
David S. Miller9d6ec932011-03-12 01:12:47 -05002847struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002848 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002849{
David S. Miller9d6ec932011-03-12 01:12:47 -05002850 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002851
David S. Millerb23dd4f2011-03-02 14:31:35 -08002852 if (IS_ERR(rt))
2853 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002854
David S. Miller56157872011-05-02 14:37:45 -07002855 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002856 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2857 flowi4_to_flowi(flp4),
2858 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002859
David S. Millerb23dd4f2011-03-02 14:31:35 -08002860 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002861}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002862EXPORT_SYMBOL_GPL(ip_route_output_flow);
2863
/*
 * Fill @skb with an rtnetlink route message describing the cached
 * route attached to the skb's dst.  @pid/@seq/@event/@flags form the
 * usual netlink envelope; @nowait controls whether the multicast
 * resolution path (ipmr_get_route) may block.
 *
 * Returns the message length on success, 0 when the entry is skipped
 * (pending multicast resolution), or -EMSGSIZE when the skb ran out
 * of room (the partial message is cancelled).
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* Fixed-size rtmsg header first. */
	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->rt_key_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		/* IP ID counter and TCP timestamp state live in the peer. */
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		/* Convert the absolute PMTU expiry into a remaining delta. */
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Forwarded multicast may need mroute state to answer. */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2971
/*
 * RTM_GETROUTE handler: resolve a single route for the dst/src/iif/
 * mark supplied in the request and unicast the answer (an
 * RTM_NEWROUTE message) back to the requester.
 *
 * When RTA_IIF is given the lookup simulates packet *input* on that
 * device by feeding a dummy skb to ip_route_input(); otherwise a
 * normal output lookup is done via ip_route_output_key().
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* Pull the lookup key out of the request attributes. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		skb->mark = mark;
		/* Input-path lookup runs with BHs off, like real RX. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	/* The skb now owns a reference to the resolved route. */
	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
3066
/*
 * Netlink dump of the routing cache.  cb->args[0] and cb->args[1]
 * carry the hash bucket and chain index to resume from between
 * successive dump calls.  Returns skb->len so the dump continues
 * until a pass completes without filling the skb.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			/* Skip other namespaces, already-dumped entries
			 * and expired cache entries. */
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* Borrow the entry as the skb's dst (no refcount)
			 * so rt_fill_info() can describe it. */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3108
/* Flush the netns routing cache when a device's multicast state changes. */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3113
3114#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003115static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003116 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003117 size_t *lenp, loff_t *ppos)
3118{
3119 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003120 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003121 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003122 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003123
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003124 memcpy(&ctl, __ctl, sizeof(ctl));
3125 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003126 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003127
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003128 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003129 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003130 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003131 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003132
3133 return -EINVAL;
3134}
3135
/*
 * Global (not per-netns) tunables under /proc/sys/net/ipv4/route/.
 * They steer the routing-cache garbage collector, ICMP redirect
 * rate limiting and PMTU behaviour; the backing ip_rt_* variables
 * are defined earlier in this file.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname = "gc_thresh",
		.data = &ipv4_dst_ops.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "max_size",
		.data = &ip_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname = "gc_min_interval",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_min_interval_ms",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_ms_jiffies,
	},
	{
		.procname = "gc_timeout",
		.data = &ip_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "redirect_load",
		.data = &ip_rt_redirect_load,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "redirect_number",
		.data = &ip_rt_redirect_number,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "redirect_silence",
		.data = &ip_rt_redirect_silence,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "error_cost",
		.data = &ip_rt_error_cost,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "error_burst",
		.data = &ip_rt_error_burst,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_elasticity",
		.data = &ip_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "mtu_expires",
		.data = &ip_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "min_pmtu",
		.data = &ip_rt_min_pmtu,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "min_adv_mss",
		.data = &ip_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003239
/* Placeholder table for the (initially empty) "neigh" directory. */
static struct ctl_table empty[1];

/*
 * Skeleton of the net.ipv4 sysctl tree: "route" exposes
 * ipv4_route_table above, "neigh" starts out empty and is populated
 * elsewhere.
 */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

/* Registration path (net/ipv4) for ipv4_skeleton. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003256
/*
 * Per-netns write-only "flush" entry; extra1 is pointed at the owning
 * netns before registration (see sysctl_route_net_init()).
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname = "flush",
		.maxlen = sizeof(int),
		.mode = 0200,
		.proc_handler = ipv4_sysctl_rtcache_flush,
	},
	{ },
};

/* Registration path (net/ipv4/route) for the per-netns flush entry. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3273
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003274static __net_init int sysctl_route_net_init(struct net *net)
3275{
3276 struct ctl_table *tbl;
3277
3278 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003279 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003280 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3281 if (tbl == NULL)
3282 goto err_dup;
3283 }
3284 tbl[0].extra1 = net;
3285
3286 net->ipv4.route_hdr =
3287 register_net_sysctl_table(net, ipv4_route_path, tbl);
3288 if (net->ipv4.route_hdr == NULL)
3289 goto err_reg;
3290 return 0;
3291
3292err_reg:
3293 if (tbl != ipv4_route_flush_table)
3294 kfree(tbl);
3295err_dup:
3296 return -ENOMEM;
3297}
3298
3299static __net_exit void sysctl_route_net_exit(struct net *net)
3300{
3301 struct ctl_table *tbl;
3302
3303 tbl = net->ipv4.route_hdr->ctl_table_arg;
3304 unregister_net_sysctl_table(net->ipv4.route_hdr);
3305 BUG_ON(tbl == ipv4_route_flush_table);
3306 kfree(tbl);
3307}
3308
/* Hook the per-netns route sysctl setup/teardown into netns lifetime. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003313#endif
3314
Neil Horman3ee94372010-05-08 01:57:52 -07003315static __net_init int rt_genid_init(struct net *net)
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003316{
Neil Horman3ee94372010-05-08 01:57:52 -07003317 get_random_bytes(&net->ipv4.rt_genid,
3318 sizeof(net->ipv4.rt_genid));
David S. Miller436c3b62011-03-24 17:42:21 -07003319 get_random_bytes(&net->ipv4.dev_addr_genid,
3320 sizeof(net->ipv4.dev_addr_genid));
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003321 return 0;
3322}
3323
Neil Horman3ee94372010-05-08 01:57:52 -07003324static __net_initdata struct pernet_operations rt_genid_ops = {
3325 .init = rt_genid_init,
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003326};
3327
3328
Patrick McHardyc7066f72011-01-14 13:36:42 +01003329#ifdef CONFIG_IP_ROUTE_CLASSID
Tejun Heo7d720c32010-02-16 15:20:26 +00003330struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
Patrick McHardyc7066f72011-01-14 13:36:42 +01003331#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003332
3333static __initdata unsigned long rhash_entries;
3334static int __init set_rhash_entries(char *str)
3335{
3336 if (!str)
3337 return 0;
3338 rhash_entries = simple_strtoul(str, &str, 0);
3339 return 1;
3340}
3341__setup("rhash_entries=", set_rhash_entries);
3342
/*
 * Boot-time initialization of the IPv4 routing subsystem: dst slab
 * caches, the route cache hash table, /proc entries, rtnetlink and
 * per-namespace hooks. Any allocation failure here is fatal (panic),
 * except proc file creation, which only logs an error.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* 256 accounting buckets per CPU for route classid statistics. */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts are rtables too; share the same slab cache. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/*
	 * Size the route cache hash from available memory (2^15 or 2^17
	 * buckets) unless overridden by the rhash_entries= boot option;
	 * the resulting log2/mask come back via rt_hash_log/rt_hash_mask.
	 */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* Derive GC threshold and max cache size from the table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* Non-fatal: routing still works without the /proc files. */
	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3398
Al Viroa1bc6eb2008-07-30 06:32:52 -04003399#ifdef CONFIG_SYSCTL
Al Viroeeb61f72008-07-27 08:59:33 +01003400/*
3401 * We really need to sanitize the damn ipv4 init order, then all
3402 * this nonsense will go away.
3403 */
3404void __init ip_static_sysctl_init(void)
3405{
Al Viro2f4520d2008-08-25 15:17:44 -07003406 register_sysctl_paths(ipv4_path, ipv4_skeleton);
Al Viroeeb61f72008-07-27 08:59:33 +01003407}
Al Viroa1bc6eb2008-07-30 06:32:52 -04003408#endif