/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
static int rt_chain_length_max __read_mostly = 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int ipv4_mtu(const struct dst_entry *dst);
static void ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family = AF_INET,
	.protocol = cpu_to_be16(ETH_P_IP),
	.gc = rt_garbage_collect,
	.check = ipv4_dst_check,
	.default_advmss = ipv4_default_advmss,
	.mtu = ipv4_mtu,
	.cow_metrics = ipv4_cow_metrics,
	.destroy = ipv4_dst_destroy,
	.ifdown = ipv4_dst_ifdown,
	.negative_advice = ipv4_negative_advice,
	.link_failure = ipv4_link_failure,
	.update_pmtu = ip_rt_update_pmtu,
	.local_out = __ip_local_out,
	.neigh_lookup = ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
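
/*
 * Illustrative sketch, not part of the original file: the table above is
 * indexed by the four TOS bits shifted right by one, which is what
 * rt_tos2priority() in <net/route.h> does.  E.g. IPTOS_LOWDELAY (0x10)
 * yields index 8, i.e. TC_PRIO_INTERACTIVE.
 */
static inline __u8 example_tos2prio(__u8 tos)
{
	/* IPTOS_TOS() masks out the precedence bits before indexing. */
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}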


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu *chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
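
/*
 * Illustrative sketch, not part of the original file: writers serialize
 * per-chain updates by taking the spinlock that a bucket index maps to;
 * several buckets share each of the RT_HASH_LOCK_SZ locks.
 */
static inline void example_with_bucket_locked(unsigned int slot)
{
	spin_lock_bh(rt_hash_lock_addr(slot));
	/* ... unlink or insert entries in that bucket's chain ... */
	spin_unlock_bh(rt_hash_lock_addr(slot));
}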

static struct rt_hash_bucket *rt_hash_table __read_mostly;
static unsigned rt_hash_mask __read_mostly;
static unsigned int rt_hash_log __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
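
/*
 * Illustrative sketch, not part of the original file: lookups mix both
 * addresses, the interface index and the per-namespace generation id
 * into the bucket index, so entries cached before a flush can never
 * match again.
 */
static inline unsigned int example_input_hash(struct net *net, __be32 daddr,
					      __be32 saddr, int iif)
{
	return rt_hash(daddr, saddr, iif, rt_genid(net));
}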

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos,
			   -1,
			   HHUptod,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start = rt_cache_seq_start,
	.next = rt_cache_seq_next,
	.stop = rt_cache_seq_stop,
	.show = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
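
/*
 * Example usage (values invented for illustration): the seq_file above
 * backs /proc/net/rt_cache, printing one line per cached route:
 *
 *	$ cat /proc/net/rt_cache
 *	Iface	Destination	Gateway 	Flags	RefCnt	Use	...
 *	eth0	0101A8C0	0101A8C0	...
 *
 * The header emitted by rt_cache_seq_show() lists the real columns;
 * addresses are printed as raw hex words.
 */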


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start = rt_cpu_seq_start,
	.next = rt_cpu_seq_next,
	.stop = rt_cpu_seq_stop,
	.show = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
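
/*
 * Illustrative sketch, not part of the original file: the low bits hold
 * the bitwise complement of the entry's age, so a recently used route
 * outscores an old one, and bits 31/30 lift valuable entries above it.
 * The insertion path further down evicts the candidate with the
 * *lowest* score.
 */
static inline int example_better_victim(struct rtable *a, struct rtable *b)
{
	/* True if a is at least as good an eviction victim as b. */
	return rt_score(a) <= rt_score(b);
}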

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_uid ^ rt2->rt_uid) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
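
/*
 * Worked example, not part of the original file: with FRACT_BITS == 3,
 * lengths are kept in units of 1/8, so a chain of 5 distinct entries
 * accumulates 5 * ONE == 40.  rt_check_expire() below converts
 * avg + 4*sd back to whole entries with >> FRACT_BITS.
 */
static inline unsigned long example_fixpoint_len(unsigned long entries)
{
	return entries << FRACT_BITS;	/* 5 -> 40, i.e. 5.0 in 1/8 units */
}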

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	inetpeer_invalidate_tree(AF_INET);
}
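
/*
 * Worked example (invented numbers, not part of the original file): if
 * rt_genid(net) was 1000 when an entry was cached, rth->rt_genid == 1000.
 * A later flush adds a random 1..256, making rt_genid(net) e.g. 1137;
 * rt_is_expired(rth) then returns true and the entry is reaped lazily,
 * e.g. by rt_check_expire() above.
 */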

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		pr_warn("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle, expire stays large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */
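
/*
 * Worked example (invented numbers, not part of the original file): with
 * rt_hash_log == 17 and ip_rt_gc_elasticity == 8 the soft limit below is
 * 8 << 17 == 1048576 entries; with 1053576 entries cached, the first
 * pass computes goal = 5000, i.e. it tries to expire about that many
 * entries in this run.
 */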

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire was reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		pr_warn("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;

	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;
	else if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
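
/*
 * Illustration (invented addresses, not part of the original file): the
 * neighbour key chosen above is the gateway for a gatewayed route, the
 * destination itself for an on-link route, and 0.0.0.0 on loopback or
 * pointopoint devices:
 *
 *	10.0.0.5 via 192.168.0.1  -> ARP entry keyed by 192.168.0.1
 *	192.168.0.7 on-link       -> ARP entry keyed by 192.168.0.7
 *	dev ppp0 / lo             -> single neighbour keyed by 0.0.0.0
 */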

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
1146
David S. Millerb23dd4f2011-03-02 14:31:35 -08001147static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1148 struct sk_buff *skb, int ifindex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001149{
Eric Dumazet1c317202010-10-25 21:02:07 +00001150 struct rtable *rth, *cand;
1151 struct rtable __rcu **rthp, **candp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001152 unsigned long now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001153 u32 min_score;
1154 int chain_length;
1155 int attempts = !in_softirq();
1156
1157restart:
1158 chain_length = 0;
1159 min_score = ~(u32)0;
1160 cand = NULL;
1161 candp = NULL;
1162 now = jiffies;
1163
Changli Gaod8d1f302010-06-10 23:31:35 -07001164 if (!rt_caching(dev_net(rt->dst.dev))) {
Neil Horman73e42892009-06-20 01:15:16 -07001165 /*
1166 * If we're not caching, just tell the caller we
1167 * were successful and don't touch the route. The
1168 * caller hold the sole reference to the cache entry, and
1169 * it will be released when the caller is done with it.
1170 * If we drop it here, the callers have no way to resolve routes
1171 * when we're not caching. Instead, just point *rp at rt, so
1172 * the caller gets a single use out of the route
Neil Hormanb6280b42009-06-22 10:18:53 +00001173 * Note that we do rt_free on this new route entry, so that
1174 * once its refcount hits zero, we are still able to reap it
1175 * (Thanks Alexey)
Eric Dumazet27b75c92010-10-15 05:44:11 +00001176 * Note: To avoid expensive rcu stuff for this uncached dst,
1177 * we set DST_NOCACHE so that dst_release() can free dst without
1178 * waiting a grace period.
Neil Horman73e42892009-06-20 01:15:16 -07001179 */
Neil Hormanb6280b42009-06-22 10:18:53 +00001180
Eric Dumazetc7d44262010-10-03 22:17:54 -07001181 rt->dst.flags |= DST_NOCACHE;
David S. Millerc7537962010-11-11 17:07:48 -08001182 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
David Miller3769cff2011-07-11 22:44:24 +00001183 int err = rt_bind_neighbour(rt);
Neil Hormanb6280b42009-06-22 10:18:53 +00001184 if (err) {
1185 if (net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001186 pr_warn("Neighbour table failure & not caching routes\n");
Eric Dumazet27b75c92010-10-15 05:44:11 +00001187 ip_rt_put(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001188 return ERR_PTR(err);
Neil Hormanb6280b42009-06-22 10:18:53 +00001189 }
1190 }
1191
Neil Hormanb6280b42009-06-22 10:18:53 +00001192 goto skip_hashing;
Neil Horman1080d702008-10-27 12:28:25 -07001193 }
1194
Linus Torvalds1da177e2005-04-16 15:20:36 -07001195 rthp = &rt_hash_table[hash].chain;
1196
Eric Dumazet22c047c2005-07-05 14:55:24 -07001197 spin_lock_bh(rt_hash_lock_addr(hash));
Eric Dumazet1c317202010-10-25 21:02:07 +00001198 while ((rth = rcu_dereference_protected(*rthp,
1199 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001200 if (rt_is_expired(rth)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001201 *rthp = rth->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001202 rt_free(rth);
1203 continue;
1204 }
David S. Miller5e2b61f2011-03-04 21:47:09 -08001205 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001206 /* Put it first */
Changli Gaod8d1f302010-06-10 23:31:35 -07001207 *rthp = rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001208 /*
1209 * Since lookup is lockfree, the deletion
1210 * must be visible to another weakly ordered CPU before
1211 * the insertion at the start of the hash chain.
1212 */
Changli Gaod8d1f302010-06-10 23:31:35 -07001213 rcu_assign_pointer(rth->dst.rt_next,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001214 rt_hash_table[hash].chain);
1215 /*
1216 * Since lookup is lockfree, the update writes
1217 * must be ordered for consistency on SMP.
1218 */
1219 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1220
Changli Gaod8d1f302010-06-10 23:31:35 -07001221 dst_use(&rth->dst, now);
Eric Dumazet22c047c2005-07-05 14:55:24 -07001222 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001223
1224 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001225 if (skb)
Changli Gaod8d1f302010-06-10 23:31:35 -07001226 skb_dst_set(skb, &rth->dst);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001227 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001228 }
1229
Changli Gaod8d1f302010-06-10 23:31:35 -07001230 if (!atomic_read(&rth->dst.__refcnt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001231 u32 score = rt_score(rth);
1232
1233 if (score <= min_score) {
1234 cand = rth;
1235 candp = rthp;
1236 min_score = score;
1237 }
1238 }
1239
1240 chain_length++;
1241
Changli Gaod8d1f302010-06-10 23:31:35 -07001242 rthp = &rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001243 }
1244
1245 if (cand) {
1246 /* ip_rt_gc_elasticity used to be average length of chain
1247 * length, when exceeded gc becomes really aggressive.
1248 *
1249 * The second limit is less certain. At the moment it allows
1250 * only 2 entries per bucket. We will see.
1251 */
1252 if (chain_length > ip_rt_gc_elasticity) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001253 *candp = cand->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001254 rt_free(cand);
1255 }
Neil Horman1080d702008-10-27 12:28:25 -07001256 } else {
Eric Dumazet98376382010-03-08 03:20:00 +00001257 if (chain_length > rt_chain_length_max &&
1258 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001259 struct net *net = dev_net(rt->dst.dev);
Neil Horman1080d702008-10-27 12:28:25 -07001260 int num = ++net->ipv4.current_rt_cache_rebuild_count;
Pavel Emelyanovb35ecb52010-03-24 07:43:17 +00001261 if (!rt_caching(net)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00001262 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
Changli Gaod8d1f302010-06-10 23:31:35 -07001263 rt->dst.dev->name, num);
Neil Horman1080d702008-10-27 12:28:25 -07001264 }
Pavel Emelyanovb35ecb52010-03-24 07:43:17 +00001265 rt_emergency_hash_rebuild(net);
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00001266 spin_unlock_bh(rt_hash_lock_addr(hash));
1267
David S. Miller5e2b61f2011-03-04 21:47:09 -08001268 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00001269 ifindex, rt_genid(net));
1270 goto restart;
Neil Horman1080d702008-10-27 12:28:25 -07001271 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001272 }
1273
 1274 /* Try to bind the route to an ARP entry only if it is an
 1275 output route or on the unicast forwarding path.
1276 */
David S. Millerc7537962010-11-11 17:07:48 -08001277 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
David Miller3769cff2011-07-11 22:44:24 +00001278 int err = rt_bind_neighbour(rt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001279 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -07001280 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001281
1282 if (err != -ENOBUFS) {
1283 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001284 return ERR_PTR(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001285 }
1286
 1287 /* The neighbour tables are full and nothing
 1288 can be released. Try to shrink the route cache;
 1289 it most likely holds some neighbour records.
1290 */
1291 if (attempts-- > 0) {
1292 int saved_elasticity = ip_rt_gc_elasticity;
1293 int saved_int = ip_rt_gc_min_interval;
1294 ip_rt_gc_elasticity = 1;
1295 ip_rt_gc_min_interval = 0;
Daniel Lezcano569d3642008-01-18 03:56:57 -08001296 rt_garbage_collect(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001297 ip_rt_gc_min_interval = saved_int;
1298 ip_rt_gc_elasticity = saved_elasticity;
1299 goto restart;
1300 }
1301
1302 if (net_ratelimit())
Joe Perchesafd465032012-03-12 07:03:32 +00001303 pr_warn("Neighbour table overflow\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07001304 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001305 return ERR_PTR(-ENOBUFS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001306 }
1307 }
1308
Changli Gaod8d1f302010-06-10 23:31:35 -07001309 rt->dst.rt_next = rt_hash_table[hash].chain;
Neil Horman1080d702008-10-27 12:28:25 -07001310
Eric Dumazet00269b52008-10-16 14:18:29 -07001311 /*
1312 * Since lookup is lockfree, we must make sure
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001313 * previous writes to rt are committed to memory
Eric Dumazet00269b52008-10-16 14:18:29 -07001314 * before making rt visible to other CPUs.
1315 */
Eric Dumazet1ddbcb02009-05-19 20:14:28 +00001316 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
Neil Horman1080d702008-10-27 12:28:25 -07001317
Eric Dumazet22c047c2005-07-05 14:55:24 -07001318 spin_unlock_bh(rt_hash_lock_addr(hash));
Neil Horman73e42892009-06-20 01:15:16 -07001319
Neil Hormanb6280b42009-06-22 10:18:53 +00001320skip_hashing:
David S. Millerb23dd4f2011-03-02 14:31:35 -08001321 if (skb)
Changli Gaod8d1f302010-06-10 23:31:35 -07001322 skb_dst_set(skb, &rt->dst);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001323 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001324}
1325
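/*
 * Editor's sketch of the lock-free reader that the ordering above
 * protects.  rcu_assign_pointer() on the insert side pairs with
 * rcu_dereference() here, so a reader never sees a chain pointer
 * before the entry's fields are committed.  The key comparison is
 * elided and the helper name is hypothetical; the real reader is
 * ip_route_input_common() below.
 */
static inline struct rtable *rt_cache_peek_sketch(unsigned int hash)
{
	struct rtable *rth;

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		/* compare_keys()/rt_is_expired() checks belong here */
		dst_use(&rth->dst, jiffies);	/* hold before unlocking */
		break;
	}
	rcu_read_unlock();
	return rth;
}
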
David S. Miller6431cbc2011-02-07 20:38:06 -08001326static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1327
1328static u32 rt_peer_genid(void)
1329{
1330 return atomic_read(&__rt_peer_genid);
1331}
1332
David S. Millera48eff12011-05-18 18:42:43 -04001333void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001334{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001335 struct inet_peer *peer;
1336
David S. Millera48eff12011-05-18 18:42:43 -04001337 peer = inet_getpeer_v4(daddr, create);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001338
Eric Dumazet49e8ab02010-08-19 06:10:45 +00001339 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001340 inet_putpeer(peer);
David S. Miller6431cbc2011-02-07 20:38:06 -08001341 else
1342 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001343}
1344
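/*
 * Editor's sketch: rt_bind_peer() publishes the peer with cmpxchg(),
 * so concurrent callers race without a lock -- exactly one caller
 * installs its peer and every loser returns the reference it took
 * via inet_putpeer().  The idiom in isolation (hypothetical helper):
 */
static inline bool publish_peer_once(struct inet_peer **slot,
				     struct inet_peer *peer)
{
	/* Succeeds for exactly one caller while *slot is still NULL. */
	return cmpxchg(slot, NULL, peer) == NULL;
}
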
1345/*
 1346 * Peer allocation may fail only under serious out-of-memory conditions;
 1347 * even then we can still generate some output.
 1348 * Random ID selection looks a bit dangerous because we have no chance of
 1349 * selecting an ID that stays unique for a reasonable period of time.
 1350 * But a broken packet identifier may be better than no packet at all.
1351 */
1352static void ip_select_fb_ident(struct iphdr *iph)
1353{
1354 static DEFINE_SPINLOCK(ip_fb_id_lock);
1355 static u32 ip_fallback_id;
1356 u32 salt;
1357
1358 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001359 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001360 iph->id = htons(salt & 0xFFFF);
1361 ip_fallback_id = salt;
1362 spin_unlock_bh(&ip_fb_id_lock);
1363}
1364
1365void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1366{
1367 struct rtable *rt = (struct rtable *) dst;
1368
Eric Dumazete688a602011-12-22 04:15:53 +00001369 if (rt && !(rt->dst.flags & DST_NOPEER)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001370 if (rt->peer == NULL)
David S. Millera48eff12011-05-18 18:42:43 -04001371 rt_bind_peer(rt, rt->rt_dst, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001372
 1373 /* Once a peer is attached to a destination, it is never detached,
 1374 so we need not grab a lock to dereference it.
1375 */
1376 if (rt->peer) {
1377 iph->id = htons(inet_getid(rt->peer, more));
1378 return;
1379 }
Eric Dumazete688a602011-12-22 04:15:53 +00001380 } else if (!rt)
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001381 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001382 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001383
1384 ip_select_fb_ident(iph);
1385}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001386EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001387
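/*
 * Editor's note: the fast path above draws the IP ID from a per-peer
 * counter advanced by inet_getid(), giving each destination its own
 * 16-bit ID sequence; only when no peer can be bound does it fall
 * back to ip_select_fb_ident(), whose chained keyed hash cannot
 * guarantee uniqueness within a reassembly lifetime -- the trade-off
 * the comment before ip_select_fb_ident() concedes.  A minimal usage
 * sketch (values illustrative only):
 *
 *	__ip_select_ident(ip_hdr(skb), &rt->dst, 0);
 *	// 'more' > 0 reserves additional IDs, e.g. for GSO segments.
 */
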
1388static void rt_del(unsigned hash, struct rtable *rt)
1389{
Eric Dumazet1c317202010-10-25 21:02:07 +00001390 struct rtable __rcu **rthp;
1391 struct rtable *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001392
Eric Dumazet29e75252008-01-31 17:05:09 -08001393 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001394 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395 ip_rt_put(rt);
Eric Dumazet1c317202010-10-25 21:02:07 +00001396 while ((aux = rcu_dereference_protected(*rthp,
1397 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001398 if (aux == rt || rt_is_expired(aux)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001399 *rthp = aux->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001400 rt_free(aux);
1401 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001402 }
Changli Gaod8d1f302010-06-10 23:31:35 -07001403 rthp = &aux->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001404 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001405 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001406}
1407
David S. Millerde398fb2011-12-05 13:21:42 -05001408static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001409{
1410 struct rtable *rt = (struct rtable *) dst;
1411 __be32 orig_gw = rt->rt_gateway;
1412 struct neighbour *n, *old_n;
1413
1414 dst_confirm(&rt->dst);
1415
1416 rt->rt_gateway = peer->redirect_learned.a4;
1417
1418 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
David S. Millerde398fb2011-12-05 13:21:42 -05001419 if (IS_ERR(n)) {
1420 rt->rt_gateway = orig_gw;
1421 return;
1422 }
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001423 old_n = xchg(&rt->dst._neighbour, n);
1424 if (old_n)
1425 neigh_release(old_n);
David S. Millerde398fb2011-12-05 13:21:42 -05001426 if (!(n->nud_state & NUD_VALID)) {
1427 neigh_event_send(n, NULL);
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001428 } else {
1429 rt->rt_flags |= RTCF_REDIRECTED;
1430 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1431 }
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001432}
1433
Eric Dumazeted7865a42010-06-07 21:49:44 -07001434/* called in rcu_read_lock() section */
Al Virof7655222006-09-26 21:25:43 -07001435void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1436 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001437{
Flavio Leitner7cc91502011-10-24 02:56:38 -04001438 int s, i;
Eric Dumazeted7865a42010-06-07 21:49:44 -07001439 struct in_device *in_dev = __in_dev_get_rcu(dev);
Flavio Leitner7cc91502011-10-24 02:56:38 -04001440 __be32 skeys[2] = { saddr, 0 };
1441 int ikeys[2] = { dev->ifindex, 0 };
David S. Millerf39925d2011-02-09 22:00:16 -08001442 struct inet_peer *peer;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001443 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001444
Linus Torvalds1da177e2005-04-16 15:20:36 -07001445 if (!in_dev)
1446 return;
1447
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001448 net = dev_net(dev);
Joe Perches9d4fb272009-11-23 10:41:23 -08001449 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1450 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1451 ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001452 goto reject_redirect;
1453
1454 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1455 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1456 goto reject_redirect;
1457 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1458 goto reject_redirect;
1459 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001460 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001461 goto reject_redirect;
1462 }
1463
Flavio Leitner7cc91502011-10-24 02:56:38 -04001464 for (s = 0; s < 2; s++) {
1465 for (i = 0; i < 2; i++) {
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001466 unsigned int hash;
1467 struct rtable __rcu **rthp;
1468 struct rtable *rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001469
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001470 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1471
1472 rthp = &rt_hash_table[hash].chain;
1473
1474 while ((rt = rcu_dereference(*rthp)) != NULL) {
1475 rthp = &rt->dst.rt_next;
1476
1477 if (rt->rt_key_dst != daddr ||
1478 rt->rt_key_src != skeys[s] ||
1479 rt->rt_oif != ikeys[i] ||
1480 rt_is_input_route(rt) ||
1481 rt_is_expired(rt) ||
1482 !net_eq(dev_net(rt->dst.dev), net) ||
1483 rt->dst.error ||
1484 rt->dst.dev != dev ||
1485 rt->rt_gateway != old_gw)
1486 continue;
1487
1488 if (!rt->peer)
1489 rt_bind_peer(rt, rt->rt_dst, 1);
1490
1491 peer = rt->peer;
1492 if (peer) {
Steffen Klassertac3f48d2012-03-06 21:21:10 +00001493 if (peer->redirect_learned.a4 != new_gw) {
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001494 peer->redirect_learned.a4 = new_gw;
1495 atomic_inc(&__rt_peer_genid);
1496 }
1497 check_peer_redir(&rt->dst, peer);
1498 }
Flavio Leitner7cc91502011-10-24 02:56:38 -04001499 }
Flavio Leitner7cc91502011-10-24 02:56:38 -04001500 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001501 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001502 return;
1503
1504reject_redirect:
1505#ifdef CONFIG_IP_ROUTE_VERBOSE
1506 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001507 pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
Harvey Harrison673d57e2008-10-31 00:53:57 -07001508 " Advised path = %pI4 -> %pI4\n",
Joe Perches058bd4d2012-03-11 18:36:11 +00001509 &old_gw, dev->name, &new_gw,
1510 &saddr, &daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511#endif
Eric Dumazeted7865a42010-06-07 21:49:44 -07001512 ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001513}
1514
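/*
 * Editor's note: the nested loops above probe all four cache key
 * combinations -- (saddr, ifindex), (saddr, 0), (0, ifindex) and
 * (0, 0) -- because cached output routes may have been created from
 * lookups that left the source address or the output interface
 * unspecified, and every matching variant must learn the new gateway.
 */
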
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001515static bool peer_pmtu_expired(struct inet_peer *peer)
1516{
1517 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1518
1519 return orig &&
1520 time_after_eq(jiffies, orig) &&
1521 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1522}
1523
1524static bool peer_pmtu_cleaned(struct inet_peer *peer)
1525{
1526 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1527
1528 return orig &&
1529 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1530}
1531
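/*
 * Editor's note: both helpers above use cmpxchg() as a one-shot
 * claim.  Of all CPUs that observe the same non-zero pmtu_expires
 * value, exactly one swaps it to 0 and wins the right to restore
 * peer->pmtu_orig; the rest see the cmpxchg fail and do nothing.
 * The idiom in isolation (hypothetical helper name):
 */
static inline bool pmtu_claim_once(unsigned long *stamp)
{
	unsigned long orig = ACCESS_ONCE(*stamp);

	return orig && cmpxchg(stamp, orig, 0) == orig;
}
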
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1533{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001534 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535 struct dst_entry *ret = dst;
1536
1537 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001538 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539 ip_rt_put(rt);
1540 ret = NULL;
David S. Miller2c8cec52011-02-09 20:42:07 -08001541 } else if (rt->rt_flags & RTCF_REDIRECTED) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08001542 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1543 rt->rt_oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001544 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545 rt_del(hash, rt);
1546 ret = NULL;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001547 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1548 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549 }
1550 }
1551 return ret;
1552}
1553
1554/*
1555 * Algorithm:
1556 * 1. The first ip_rt_redirect_number redirects are sent
 1557 * with exponential backoff, then we stop sending them altogether,
 1558 * assuming that the host ignores our redirects.
 1559 * 2. If we did not see packets requiring redirects
 1560 * during ip_rt_redirect_silence, we assume that the host
 1561 * has forgotten the redirected route and start sending redirects again.
1562 *
1563 * This algorithm is much cheaper and more intelligent than dumb load limiting
1564 * in icmp.c.
1565 *
1566 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1567 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1568 */
1569
1570void ip_rt_send_redirect(struct sk_buff *skb)
1571{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001572 struct rtable *rt = skb_rtable(skb);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001573 struct in_device *in_dev;
David S. Miller92d86822011-02-04 15:55:25 -08001574 struct inet_peer *peer;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001575 int log_martians;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001576
Eric Dumazet30038fc2009-08-28 23:52:01 -07001577 rcu_read_lock();
Changli Gaod8d1f302010-06-10 23:31:35 -07001578 in_dev = __in_dev_get_rcu(rt->dst.dev);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001579 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1580 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001581 return;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001582 }
1583 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1584 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001585
David S. Miller92d86822011-02-04 15:55:25 -08001586 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001587 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001588 peer = rt->peer;
1589 if (!peer) {
1590 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1591 return;
1592 }
1593
Linus Torvalds1da177e2005-04-16 15:20:36 -07001594 /* No redirected packets during ip_rt_redirect_silence;
1595 * reset the algorithm.
1596 */
David S. Miller92d86822011-02-04 15:55:25 -08001597 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1598 peer->rate_tokens = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001599
 1600 /* Too many ignored redirects; do not send anything.
Changli Gaod8d1f302010-06-10 23:31:35 -07001601 * Set dst.rate_last to the last seen redirected packet.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001602 */
David S. Miller92d86822011-02-04 15:55:25 -08001603 if (peer->rate_tokens >= ip_rt_redirect_number) {
1604 peer->rate_last = jiffies;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001605 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001606 }
1607
1608 /* Check for load limit; set rate_last to the latest sent
1609 * redirect.
1610 */
David S. Miller92d86822011-02-04 15:55:25 -08001611 if (peer->rate_tokens == 0 ||
Li Yewang14fb8a72006-12-18 00:26:35 -08001612 time_after(jiffies,
David S. Miller92d86822011-02-04 15:55:25 -08001613 (peer->rate_last +
1614 (ip_rt_redirect_load << peer->rate_tokens)))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001615 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
David S. Miller92d86822011-02-04 15:55:25 -08001616 peer->rate_last = jiffies;
1617 ++peer->rate_tokens;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001618#ifdef CONFIG_IP_ROUTE_VERBOSE
Eric Dumazet30038fc2009-08-28 23:52:01 -07001619 if (log_martians &&
David S. Miller92d86822011-02-04 15:55:25 -08001620 peer->rate_tokens == ip_rt_redirect_number &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001621 net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001622 pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1623 &ip_hdr(skb)->saddr, rt->rt_iif,
Harvey Harrison673d57e2008-10-31 00:53:57 -07001624 &rt->rt_dst, &rt->rt_gateway);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001625#endif
1626 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001627}
1628
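/*
 * Editor's note, a worked example of the backoff above: after the
 * k-th redirect is sent, the next one is withheld until jiffies pass
 * rate_last + (ip_rt_redirect_load << k), so the spacing doubles with
 * every redirect.  Once rate_tokens reaches ip_rt_redirect_number,
 * nothing is sent at all; only ip_rt_redirect_silence jiffies with no
 * redirect-worthy traffic reset rate_tokens to 0 and re-arm the
 * sequence.
 */
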
1629static int ip_error(struct sk_buff *skb)
1630{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001631 struct rtable *rt = skb_rtable(skb);
David S. Miller92d86822011-02-04 15:55:25 -08001632 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001633 unsigned long now;
David S. Miller92d86822011-02-04 15:55:25 -08001634 bool send;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001635 int code;
1636
Changli Gaod8d1f302010-06-10 23:31:35 -07001637 switch (rt->dst.error) {
Joe Perches4500ebf2011-07-01 09:43:07 +00001638 case EINVAL:
1639 default:
1640 goto out;
1641 case EHOSTUNREACH:
1642 code = ICMP_HOST_UNREACH;
1643 break;
1644 case ENETUNREACH:
1645 code = ICMP_NET_UNREACH;
1646 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1647 IPSTATS_MIB_INNOROUTES);
1648 break;
1649 case EACCES:
1650 code = ICMP_PKT_FILTERED;
1651 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001652 }
1653
David S. Miller92d86822011-02-04 15:55:25 -08001654 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001655 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001656 peer = rt->peer;
1657
1658 send = true;
1659 if (peer) {
1660 now = jiffies;
1661 peer->rate_tokens += now - peer->rate_last;
1662 if (peer->rate_tokens > ip_rt_error_burst)
1663 peer->rate_tokens = ip_rt_error_burst;
1664 peer->rate_last = now;
1665 if (peer->rate_tokens >= ip_rt_error_cost)
1666 peer->rate_tokens -= ip_rt_error_cost;
1667 else
1668 send = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001669 }
David S. Miller92d86822011-02-04 15:55:25 -08001670 if (send)
1671 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001672
1673out: kfree_skb(skb);
1674 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001675}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001676
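/*
 * Editor's note: the peer logic above is a token bucket.  Tokens
 * accrue at one per elapsed jiffy since rate_last, capped at
 * ip_rt_error_burst; each ICMP error sent costs ip_rt_error_cost
 * tokens, and when the bucket cannot cover that cost the error is
 * suppressed while the packet is still freed and accounted.
 */
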
1677/*
1678 * The last two values are not from the RFC but
1679 * are needed for AMPRnet AX.25 paths.
1680 */
1681
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001682static const unsigned short mtu_plateau[] =
Linus Torvalds1da177e2005-04-16 15:20:36 -07001683{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1684
Stephen Hemminger5969f712008-04-10 01:52:09 -07001685static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001686{
1687 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001688
Linus Torvalds1da177e2005-04-16 15:20:36 -07001689 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1690 if (old_mtu > mtu_plateau[i])
1691 return mtu_plateau[i];
1692 return 68;
1693}
1694
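/*
 * Editor's note, worked examples of the plateau search above:
 * guess_mtu(1500) returns 1492 (the first plateau strictly below the
 * old MTU), guess_mtu(576) returns 296, and anything at or below 128
 * falls through to the minimum of 68.
 */
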
Eric Dumazetb71d1d42011-04-22 04:53:02 +00001695unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001696 unsigned short new_mtu,
1697 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699 unsigned short old_mtu = ntohs(iph->tot_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700 unsigned short est_mtu = 0;
David S. Miller2c8cec52011-02-09 20:42:07 -08001701 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001702
David S. Miller2c8cec52011-02-09 20:42:07 -08001703 peer = inet_getpeer_v4(iph->daddr, 1);
1704 if (peer) {
1705 unsigned short mtu = new_mtu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001706
David S. Miller2c8cec52011-02-09 20:42:07 -08001707 if (new_mtu < 68 || new_mtu >= old_mtu) {
1708 /* BSD 4.2 derived systems incorrectly adjust
1709 * tot_len by the IP header length, and report
1710 * a zero MTU in the ICMP message.
1711 */
1712 if (mtu == 0 &&
1713 old_mtu >= 68 + (iph->ihl << 2))
1714 old_mtu -= iph->ihl << 2;
1715 mtu = guess_mtu(old_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716 }
David S. Miller2c8cec52011-02-09 20:42:07 -08001717
1718 if (mtu < ip_rt_min_pmtu)
1719 mtu = ip_rt_min_pmtu;
1720 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001721 unsigned long pmtu_expires;
1722
1723 pmtu_expires = jiffies + ip_rt_mtu_expires;
1724 if (!pmtu_expires)
1725 pmtu_expires = 1UL;
1726
David S. Miller2c8cec52011-02-09 20:42:07 -08001727 est_mtu = mtu;
1728 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001729 peer->pmtu_expires = pmtu_expires;
Gao feng59445b62011-10-19 15:34:09 +00001730 atomic_inc(&__rt_peer_genid);
David S. Miller2c8cec52011-02-09 20:42:07 -08001731 }
1732
1733 inet_putpeer(peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001734 }
1735 return est_mtu ? : new_mtu;
1736}
1737
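/*
 * Editor's note, a worked example of the BSD 4.2 workaround above: a
 * 576-byte datagram with a 20-byte header (ihl == 5) bounced by such
 * a system arrives with new_mtu == 0, so old_mtu is first reduced to
 * 556 and guess_mtu(556) then yields the 296 plateau; the result is
 * still raised to ip_rt_min_pmtu when it falls below that floor.
 */
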
David S. Miller2c8cec52011-02-09 20:42:07 -08001738static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1739{
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001740 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
David S. Miller2c8cec52011-02-09 20:42:07 -08001741
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001742 if (!expires)
1743 return;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001744 if (time_before(jiffies, expires)) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001745 u32 orig_dst_mtu = dst_mtu(dst);
1746 if (peer->pmtu_learned < orig_dst_mtu) {
1747 if (!peer->pmtu_orig)
1748 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1749 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1750 }
1751 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1752 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1753}
1754
Linus Torvalds1da177e2005-04-16 15:20:36 -07001755static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1756{
David S. Miller2c8cec52011-02-09 20:42:07 -08001757 struct rtable *rt = (struct rtable *) dst;
1758 struct inet_peer *peer;
1759
1760 dst_confirm(dst);
1761
1762 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001763 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller2c8cec52011-02-09 20:42:07 -08001764 peer = rt->peer;
1765 if (peer) {
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001766 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1767
David S. Miller2c8cec52011-02-09 20:42:07 -08001768 if (mtu < ip_rt_min_pmtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001769 mtu = ip_rt_min_pmtu;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001770 if (!pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001771
1772 pmtu_expires = jiffies + ip_rt_mtu_expires;
1773 if (!pmtu_expires)
1774 pmtu_expires = 1UL;
1775
David S. Miller2c8cec52011-02-09 20:42:07 -08001776 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001777 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001778
1779 atomic_inc(&__rt_peer_genid);
1780 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001781 }
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001782 check_peer_pmtu(dst, peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001783 }
1784}
1785
David S. Millerf39925d2011-02-09 22:00:16 -08001786
David S. Millerde398fb2011-12-05 13:21:42 -05001787static void ipv4_validate_peer(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001788{
David S. Miller6431cbc2011-02-07 20:38:06 -08001789 if (rt->rt_peer_genid != rt_peer_genid()) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001790 struct inet_peer *peer;
1791
David S. Miller6431cbc2011-02-07 20:38:06 -08001792 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001793 rt_bind_peer(rt, rt->rt_dst, 0);
David S. Miller6431cbc2011-02-07 20:38:06 -08001794
David S. Miller2c8cec52011-02-09 20:42:07 -08001795 peer = rt->peer;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001796 if (peer) {
David S. Millerefbc3682011-12-01 13:38:59 -05001797 check_peer_pmtu(&rt->dst, peer);
David S. Miller2c8cec52011-02-09 20:42:07 -08001798
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001799 if (peer->redirect_learned.a4 &&
David S. Millerde398fb2011-12-05 13:21:42 -05001800 peer->redirect_learned.a4 != rt->rt_gateway)
1801 check_peer_redir(&rt->dst, peer);
David S. Millerf39925d2011-02-09 22:00:16 -08001802 }
1803
David S. Miller6431cbc2011-02-07 20:38:06 -08001804 rt->rt_peer_genid = rt_peer_genid();
1805 }
David S. Millerefbc3682011-12-01 13:38:59 -05001806}
1807
1808static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1809{
1810 struct rtable *rt = (struct rtable *) dst;
1811
1812 if (rt_is_expired(rt))
1813 return NULL;
David S. Millerde398fb2011-12-05 13:21:42 -05001814 ipv4_validate_peer(rt);
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001815 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816}
1817
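/*
 * Editor's sketch of how the ->check contract above is consumed by a
 * caller holding a cached dst: a NULL return means "discard and do a
 * fresh route lookup".  Hypothetical helper; the core provides the
 * real thing via dst_check()/sk_dst_check():
 */
static inline struct dst_entry *dst_revalidate_sketch(struct dst_entry *dst,
						      u32 cookie)
{
	if (dst && dst->obsolete)
		dst = dst->ops->check(dst, cookie);	/* may be NULL */
	return dst;
}
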
1818static void ipv4_dst_destroy(struct dst_entry *dst)
1819{
1820 struct rtable *rt = (struct rtable *) dst;
1821 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822
David S. Miller62fa8a82011-01-26 20:51:05 -08001823 if (rt->fi) {
1824 fib_info_put(rt->fi);
1825 rt->fi = NULL;
1826 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001827 if (peer) {
1828 rt->peer = NULL;
1829 inet_putpeer(peer);
1830 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001831}
1832
Linus Torvalds1da177e2005-04-16 15:20:36 -07001833
1834static void ipv4_link_failure(struct sk_buff *skb)
1835{
1836 struct rtable *rt;
1837
1838 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1839
Eric Dumazet511c3f92009-06-02 05:14:27 +00001840 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001841 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1842 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843}
1844
1845static int ip_rt_bug(struct sk_buff *skb)
1846{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001847 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1848 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001849 skb->dev ? skb->dev->name : "?");
1850 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001851 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001852 return 0;
1853}
1854
1855/*
 1856 We do not cache the source address of the outgoing interface,
 1857 because it is used only by the IP RR, TS and SRR options,
 1858 so it stays out of the fast path.
 1859
 1860 BTW remember: "addr" is allowed to be unaligned
 1861 in IP options!
1862 */
1863
David S. Miller8e363602011-05-13 17:29:41 -04001864void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001865{
Al Viroa61ced52006-09-26 21:27:54 -07001866 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001867
David S. Millerc7537962010-11-11 17:07:48 -08001868 if (rt_is_output_route(rt))
David S. Millerc5be24f2011-05-13 18:01:21 -04001869 src = ip_hdr(skb)->saddr;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001870 else {
David S. Miller8e363602011-05-13 17:29:41 -04001871 struct fib_result res;
1872 struct flowi4 fl4;
1873 struct iphdr *iph;
1874
1875 iph = ip_hdr(skb);
1876
1877 memset(&fl4, 0, sizeof(fl4));
1878 fl4.daddr = iph->daddr;
1879 fl4.saddr = iph->saddr;
Julian Anastasovb0fe4a32011-07-23 02:00:41 +00001880 fl4.flowi4_tos = RT_TOS(iph->tos);
David S. Miller8e363602011-05-13 17:29:41 -04001881 fl4.flowi4_oif = rt->dst.dev->ifindex;
1882 fl4.flowi4_iif = skb->dev->ifindex;
1883 fl4.flowi4_mark = skb->mark;
Lorenzo Colitti462ce7c2014-03-31 16:23:51 +09001884 fl4.flowi4_uid = skb->sk ? sock_i_uid(skb->sk) : 0;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001885
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001886 rcu_read_lock();
David S. Miller68a5e3d2011-03-11 20:07:33 -05001887 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
David S. Miller436c3b62011-03-24 17:42:21 -07001888 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001889 else
1890 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001891 RT_SCOPE_UNIVERSE);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001892 rcu_read_unlock();
1893 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001894 memcpy(addr, &src, 4);
1895}
1896
Patrick McHardyc7066f72011-01-14 13:36:42 +01001897#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001898static void set_class_tag(struct rtable *rt, u32 tag)
1899{
Changli Gaod8d1f302010-06-10 23:31:35 -07001900 if (!(rt->dst.tclassid & 0xFFFF))
1901 rt->dst.tclassid |= tag & 0xFFFF;
1902 if (!(rt->dst.tclassid & 0xFFFF0000))
1903 rt->dst.tclassid |= tag & 0xFFFF0000;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001904}
1905#endif
1906
David S. Miller0dbaee32010-12-13 12:52:14 -08001907static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1908{
1909 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1910
1911 if (advmss == 0) {
1912 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1913 ip_rt_min_advmss);
1914 if (advmss > 65535 - 40)
1915 advmss = 65535 - 40;
1916 }
1917 return advmss;
1918}
1919
Steffen Klassertebb762f2011-11-23 02:12:51 +00001920static unsigned int ipv4_mtu(const struct dst_entry *dst)
David S. Millerd33e4552010-12-14 13:01:14 -08001921{
Steffen Klassert261663b2011-11-23 02:14:50 +00001922 const struct rtable *rt = (const struct rtable *) dst;
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001923 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1924
Steffen Klassert261663b2011-11-23 02:14:50 +00001925 if (mtu && rt_is_output_route(rt))
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001926 return mtu;
1927
1928 mtu = dst->dev->mtu;
David S. Millerd33e4552010-12-14 13:01:14 -08001929
1930 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
David S. Millerd33e4552010-12-14 13:01:14 -08001931
1932 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1933 mtu = 576;
1934 }
1935
1936 if (mtu > IP_MAX_MTU)
1937 mtu = IP_MAX_MTU;
1938
1939 return mtu;
1940}
1941
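/*
 * Editor's note on the 576 clamp above: when no explicit MTU metric
 * applies and the metric is locked (i.e. path MTU discovery must not
 * update it), a destination that sits behind a gateway
 * (rt_gateway != rt_dst) cannot trust the local device MTU, so the
 * code falls back to 576 -- the classical datagram size every IPv4
 * host is required to accept.
 */
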
David S. Miller813b3b52011-04-28 14:48:42 -07001942static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001943 struct fib_info *fi)
David S. Millera4daad62011-01-27 22:01:53 -08001944{
David S. Miller0131ba42011-02-04 14:37:30 -08001945 struct inet_peer *peer;
1946 int create = 0;
1947
1948 /* If a peer entry exists for this destination, we must hook
1949 * it up in order to get at cached metrics.
1950 */
David S. Miller813b3b52011-04-28 14:48:42 -07001951 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
David S. Miller0131ba42011-02-04 14:37:30 -08001952 create = 1;
1953
David S. Miller3c0afdc2011-03-04 21:26:07 -08001954 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
David S. Miller0131ba42011-02-04 14:37:30 -08001955 if (peer) {
David S. Miller3c0afdc2011-03-04 21:26:07 -08001956 rt->rt_peer_genid = rt_peer_genid();
David S. Miller0131ba42011-02-04 14:37:30 -08001957 if (inet_metrics_new(peer))
1958 memcpy(peer->metrics, fi->fib_metrics,
1959 sizeof(u32) * RTAX_MAX);
1960 dst_init_metrics(&rt->dst, peer->metrics, false);
David S. Miller2c8cec52011-02-09 20:42:07 -08001961
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001962 check_peer_pmtu(&rt->dst, peer);
Steffen Klassertac3f48d2012-03-06 21:21:10 +00001963
David S. Millerf39925d2011-02-09 22:00:16 -08001964 if (peer->redirect_learned.a4 &&
1965 peer->redirect_learned.a4 != rt->rt_gateway) {
1966 rt->rt_gateway = peer->redirect_learned.a4;
1967 rt->rt_flags |= RTCF_REDIRECTED;
1968 }
David S. Miller0131ba42011-02-04 14:37:30 -08001969 } else {
David S. Millerb8dad612011-01-28 14:07:16 -08001970 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1971 rt->fi = fi;
1972 atomic_inc(&fi->fib_clntref);
1973 }
David S. Millera4daad62011-01-27 22:01:53 -08001974 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
David S. Millera4daad62011-01-27 22:01:53 -08001975 }
1976}
1977
David S. Miller813b3b52011-04-28 14:48:42 -07001978static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001979 const struct fib_result *res,
David S. Miller982721f2011-02-16 21:44:24 -08001980 struct fib_info *fi, u16 type, u32 itag)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001981{
David S. Millerdefb3512010-12-08 21:16:57 -08001982 struct dst_entry *dst = &rt->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001983
1984 if (fi) {
1985 if (FIB_RES_GW(*res) &&
1986 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1987 rt->rt_gateway = FIB_RES_GW(*res);
David S. Miller813b3b52011-04-28 14:48:42 -07001988 rt_init_metrics(rt, fl4, fi);
Patrick McHardyc7066f72011-01-14 13:36:42 +01001989#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Millerdefb3512010-12-08 21:16:57 -08001990 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001991#endif
David S. Millerd33e4552010-12-14 13:01:14 -08001992 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001993
David S. Millerdefb3512010-12-08 21:16:57 -08001994 if (dst_mtu(dst) > IP_MAX_MTU)
1995 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
David S. Miller0dbaee32010-12-13 12:52:14 -08001996 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
David S. Millerdefb3512010-12-08 21:16:57 -08001997 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998
Patrick McHardyc7066f72011-01-14 13:36:42 +01001999#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07002000#ifdef CONFIG_IP_MULTIPLE_TABLES
2001 set_class_tag(rt, fib_rules_tclass(res));
2002#endif
2003 set_class_tag(rt, itag);
2004#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002005}
2006
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002007static struct rtable *rt_dst_alloc(struct net_device *dev,
2008 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08002009{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002010 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2011 DST_HOST |
2012 (nopolicy ? DST_NOPOLICY : 0) |
2013 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08002014}
2015
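/*
 * Editor's note: a typical call, as the input paths below show,
 * derives both policy bits from the devices involved:
 *
 *	rth = rt_dst_alloc(out_dev->dev,
 *			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
 *			   IN_DEV_CONF_GET(out_dev, NOXFRM));
 *
 * Assuming this era's dst_alloc() signature, the 1 and -1 above are
 * the initial reference count and initial ->obsolete value; DST_HOST
 * marks the entry as a per-destination (host) route.
 */
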
Eric Dumazet96d36222010-06-02 19:21:31 +00002016/* called in rcu_read_lock() section */
Al Viro9e12bb22006-09-26 21:25:20 -07002017static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002018 u8 tos, struct net_device *dev, int our)
2019{
Eric Dumazet96d36222010-06-02 19:21:31 +00002020 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002021 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07002022 __be32 spec_dst;
Eric Dumazet96d36222010-06-02 19:21:31 +00002023 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002024 u32 itag = 0;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002025 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002026
2027 /* Primary sanity checks. */
2028
2029 if (in_dev == NULL)
2030 return -EINVAL;
2031
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002032 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002033 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002034 goto e_inval;
2035
Joe Perchesf97c1e02007-12-16 13:45:43 -08002036 if (ipv4_is_zeronet(saddr)) {
2037 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002038 goto e_inval;
2039 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002040 } else {
Michael Smith5c04c812011-04-07 04:51:50 +00002041 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2042 &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002043 if (err < 0)
2044 goto e_err;
2045 }
Benjamin LaHaise4e7b2f12012-03-27 15:55:32 +00002046 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002047 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002048 if (!rth)
2049 goto e_nobufs;
2050
Patrick McHardyc7066f72011-01-14 13:36:42 +01002051#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002052 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002053#endif
David S. Millercf911662011-04-28 14:31:47 -07002054 rth->dst.output = ip_rt_bug;
2055
2056 rth->rt_key_dst = daddr;
2057 rth->rt_key_src = saddr;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002058 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002059 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08002060 rth->rt_type = RTN_MULTICAST;
David S. Miller475949d2011-05-03 19:45:15 -07002061 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002062 rth->rt_dst = daddr;
2063 rth->rt_src = saddr;
2064 rth->rt_route_iif = dev->ifindex;
2065 rth->rt_iif = dev->ifindex;
2066 rth->rt_oif = 0;
2067 rth->rt_mark = skb->mark;
Lorenzo Colitti462ce7c2014-03-31 16:23:51 +09002068 rth->rt_uid = 0;
David S. Millercf911662011-04-28 14:31:47 -07002069 rth->rt_gateway = daddr;
2070 rth->rt_spec_dst= spec_dst;
2071 rth->rt_peer_genid = 0;
2072 rth->peer = NULL;
2073 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002074 if (our) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002075 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002076 rth->rt_flags |= RTCF_LOCAL;
2077 }
2078
2079#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08002080 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Changli Gaod8d1f302010-06-10 23:31:35 -07002081 rth->dst.input = ip_mr_input;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002082#endif
2083 RT_CACHE_STAT_INC(in_slow_mc);
2084
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002085 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
David S. Millerb23dd4f2011-03-02 14:31:35 -08002086 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
Eric Dumazet9aa3c942011-06-18 11:59:18 -07002087 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002088
2089e_nobufs:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002090 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002091e_inval:
Eric Dumazet96d36222010-06-02 19:21:31 +00002092 return -EINVAL;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002093e_err:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002094 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002095}
2096
2097
2098static void ip_handle_martian_source(struct net_device *dev,
2099 struct in_device *in_dev,
2100 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07002101 __be32 daddr,
2102 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002103{
2104 RT_CACHE_STAT_INC(in_martian_src);
2105#ifdef CONFIG_IP_ROUTE_VERBOSE
2106 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2107 /*
 2108 * RFC1812 recommendation: if the source is martian,
 2109 * the only hint we can log is the MAC header.
2110 */
Joe Perches058bd4d2012-03-11 18:36:11 +00002111 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002112 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07002113 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00002114 print_hex_dump(KERN_WARNING, "ll header: ",
2115 DUMP_PREFIX_OFFSET, 16, 1,
2116 skb_mac_header(skb),
2117 dev->hard_header_len, true);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002118 }
2119 }
2120#endif
2121}
2122
Eric Dumazet47360222010-06-03 04:13:21 +00002123/* called in rcu_read_lock() section */
Stephen Hemminger5969f712008-04-10 01:52:09 -07002124static int __mkroute_input(struct sk_buff *skb,
David S. Miller982721f2011-02-16 21:44:24 -08002125 const struct fib_result *res,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002126 struct in_device *in_dev,
2127 __be32 daddr, __be32 saddr, u32 tos,
2128 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002129{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002130 struct rtable *rth;
2131 int err;
2132 struct in_device *out_dev;
Eric Dumazet47360222010-06-03 04:13:21 +00002133 unsigned int flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07002134 __be32 spec_dst;
2135 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002136
2137 /* get a working reference to the output device */
Eric Dumazet47360222010-06-03 04:13:21 +00002138 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002139 if (out_dev == NULL) {
2140 if (net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00002141 pr_crit("Bug in ip_route_input_slow(). Please report.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002142 return -EINVAL;
2143 }
2144
2145
Michael Smith5c04c812011-04-07 04:51:50 +00002146 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2147 in_dev->dev, &spec_dst, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002148 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002149 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002150 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002151
Linus Torvalds1da177e2005-04-16 15:20:36 -07002152 goto cleanup;
2153 }
2154
2155 if (err)
2156 flags |= RTCF_DIRECTSRC;
2157
Thomas Graf51b77ca2008-06-03 16:36:01 -07002158 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002159 (IN_DEV_SHARED_MEDIA(out_dev) ||
2160 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2161 flags |= RTCF_DOREDIRECT;
2162
2163 if (skb->protocol != htons(ETH_P_IP)) {
 2164 /* Not IP (i.e. ARP). Do not create a route if it is
 2165 * invalid for proxy arp. DNAT routes are always valid.
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002166 *
 2167 * The proxy arp feature has been extended to allow ARP
 2168 * replies back out of the same interface, to support
 2169 * Private VLAN switch technologies. See arp.c.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002170 */
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002171 if (out_dev == in_dev &&
2172 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002173 err = -EINVAL;
2174 goto cleanup;
2175 }
2176 }
2177
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002178 rth = rt_dst_alloc(out_dev->dev,
2179 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002180 IN_DEV_CONF_GET(out_dev, NOXFRM));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002181 if (!rth) {
2182 err = -ENOBUFS;
2183 goto cleanup;
2184 }
2185
David S. Miller5e2b61f2011-03-04 21:47:09 -08002186 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002187 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002188 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2189 rth->rt_flags = flags;
2190 rth->rt_type = res->type;
David S. Miller475949d2011-05-03 19:45:15 -07002191 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002192 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002193 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002194 rth->rt_route_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002195 rth->rt_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002196 rth->rt_oif = 0;
David S. Millercf911662011-04-28 14:31:47 -07002197 rth->rt_mark = skb->mark;
Lorenzo Colitti462ce7c2014-03-31 16:23:51 +09002198 rth->rt_uid = 0;
David S. Millercf911662011-04-28 14:31:47 -07002199 rth->rt_gateway = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002200 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002201 rth->rt_peer_genid = 0;
2202 rth->peer = NULL;
2203 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002204
Changli Gaod8d1f302010-06-10 23:31:35 -07002205 rth->dst.input = ip_forward;
2206 rth->dst.output = ip_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207
David S. Miller5e2b61f2011-03-04 21:47:09 -08002208 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002209
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210 *result = rth;
2211 err = 0;
2212 cleanup:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002213 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002214}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002215
Stephen Hemminger5969f712008-04-10 01:52:09 -07002216static int ip_mkroute_input(struct sk_buff *skb,
2217 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002218 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002219 struct in_device *in_dev,
2220 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002221{
Chuck Short7abaa272005-06-22 22:10:23 -07002222 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002223 int err;
2224 unsigned hash;
2225
2226#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002227 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002228 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002229#endif
2230
2231 /* create a routing cache entry */
2232 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2233 if (err)
2234 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002235
2236 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002237 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002238 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002239 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002240 if (IS_ERR(rth))
2241 return PTR_ERR(rth);
2242 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002243}
2244
Linus Torvalds1da177e2005-04-16 15:20:36 -07002245/*
 2246 * NOTE. We drop all packets that have a local source
 2247 * address, because every properly looped-back packet
 2248 * must already have the correct destination attached by the output routine.
 2249 *
 2250 * This approach solves two big problems:
 2251 * 1. Non-simplex devices are handled properly.
 2252 * 2. IP spoofing attempts are filtered with a 100% guarantee.
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002253 * Called with rcu_read_lock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07002254 */
2255
Al Viro9e12bb22006-09-26 21:25:20 -07002256static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002257 u8 tos, struct net_device *dev)
2258{
2259 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002260 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05002261 struct flowi4 fl4;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002262 unsigned flags = 0;
2263 u32 itag = 0;
2264 struct rtable * rth;
2265 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002266 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002267 int err = -EINVAL;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002268 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002269
2270 /* IP on this device is disabled. */
2271
2272 if (!in_dev)
2273 goto out;
2274
 2275 /* Check for the weirdest martians, which cannot be detected
 2276 by fib_lookup.
2277 */
2278
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002279 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002280 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002281 goto martian_source;
2282
Andy Walls27a954b2010-10-17 15:11:22 +00002283 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002284 goto brd_input;
2285
 2286 /* Accept zero addresses only for limited broadcast;
 2287 * I do not even know whether to fix this or not. Waiting for complaints :-)
2288 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002289 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002290 goto martian_source;
2291
Andy Walls27a954b2010-10-17 15:11:22 +00002292 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002293 goto martian_destination;
2294
2295 /*
2296 * Now we are ready to route packet.
2297 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002298 fl4.flowi4_oif = 0;
2299 fl4.flowi4_iif = dev->ifindex;
2300 fl4.flowi4_mark = skb->mark;
2301 fl4.flowi4_tos = tos;
2302 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2303 fl4.daddr = daddr;
2304 fl4.saddr = saddr;
2305 err = fib_lookup(net, &fl4, &res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002306 if (err != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002307 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002308 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002309 goto no_route;
2310 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002311
2312 RT_CACHE_STAT_INC(in_slow_tot);
2313
2314 if (res.type == RTN_BROADCAST)
2315 goto brd_input;
2316
2317 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002318 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002319 net->loopback_dev->ifindex,
Michael Smith5c04c812011-04-07 04:51:50 +00002320 dev, &spec_dst, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002321 if (err < 0)
2322 goto martian_source_keep_err;
2323 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002324 flags |= RTCF_DIRECTSRC;
2325 spec_dst = daddr;
2326 goto local_input;
2327 }
2328
2329 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002330 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002331 if (res.type != RTN_UNICAST)
2332 goto martian_destination;
2333
David S. Miller68a5e3d2011-03-11 20:07:33 -05002334 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002335out: return err;
2336
2337brd_input:
2338 if (skb->protocol != htons(ETH_P_IP))
2339 goto e_inval;
2340
Joe Perchesf97c1e02007-12-16 13:45:43 -08002341 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002342 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2343 else {
Michael Smith5c04c812011-04-07 04:51:50 +00002344 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2345 &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002346 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002347 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002348 if (err)
2349 flags |= RTCF_DIRECTSRC;
2350 }
2351 flags |= RTCF_BROADCAST;
2352 res.type = RTN_BROADCAST;
2353 RT_CACHE_STAT_INC(in_brd);
2354
2355local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002356 rth = rt_dst_alloc(net->loopback_dev,
2357 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002358 if (!rth)
2359 goto e_nobufs;
2360
David S. Millercf911662011-04-28 14:31:47 -07002361 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002362 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002363#ifdef CONFIG_IP_ROUTE_CLASSID
2364 rth->dst.tclassid = itag;
2365#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002366
David S. Miller5e2b61f2011-03-04 21:47:09 -08002367 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002368 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002369 rth->rt_genid = rt_genid(net);
2370 rth->rt_flags = flags|RTCF_LOCAL;
2371 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002372 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002373 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374 rth->rt_src = saddr;
Patrick McHardyc7066f72011-01-14 13:36:42 +01002375#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002376 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002377#endif
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002378 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002379 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002380 rth->rt_oif = 0;
2381 rth->rt_mark = skb->mark;
Lorenzo Colitti462ce7c2014-03-31 16:23:51 +09002382 rth->rt_uid = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002383 rth->rt_gateway = daddr;
2384 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002385 rth->rt_peer_genid = 0;
2386 rth->peer = NULL;
2387 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002388 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002389 rth->dst.input= ip_error;
2390 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002391 rth->rt_flags &= ~RTCF_LOCAL;
2392 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002393 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2394 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002395 err = 0;
2396 if (IS_ERR(rth))
2397 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002398 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002399
2400no_route:
2401 RT_CACHE_STAT_INC(in_no_route);
2402 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2403 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002404 if (err == -ESRCH)
2405 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002406 goto local_input;
2407
2408 /*
2409 * Do not cache martian addresses: they should be logged (RFC1812)
2410 */
2411martian_destination:
2412 RT_CACHE_STAT_INC(in_martian_dst);
2413#ifdef CONFIG_IP_ROUTE_VERBOSE
2414 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00002415 pr_warn("martian destination %pI4 from %pI4, dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002416 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002417#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002418
2419e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002420 err = -EHOSTUNREACH;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002421 goto out;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002422
Linus Torvalds1da177e2005-04-16 15:20:36 -07002423e_inval:
2424 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002425 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002426
2427e_nobufs:
2428 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002429 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002430
2431martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002432 err = -EINVAL;
2433martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002434 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002435 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002436}
2437
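/*
 * Editor's summary of the slow input path above, for orientation:
 *
 * 1. Reject the martians fib_lookup() cannot catch (multicast,
 *    limited-broadcast or loopback source; zeronet or loopback
 *    destination).
 * 2. fib_lookup() classifies the packet: RTN_BROADCAST and RTN_LOCAL
 *    are delivered locally, RTN_UNICAST is forwarded via
 *    ip_mkroute_input(), anything else is a martian destination.
 * 3. Whatever route is built is inserted into the cache with
 *    rt_intern_hash() before the verdict is returned.
 */
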
Eric Dumazet407eadd2010-05-10 11:32:55 +00002438int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2439 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002440{
2441	struct rtable *rth;
2442	unsigned int hash;
2443 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002444 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002445 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002446
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002447 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002448
Eric Dumazet96d36222010-06-02 19:21:31 +00002449 rcu_read_lock();
2450
Neil Horman1080d702008-10-27 12:28:25 -07002451 if (!rt_caching(net))
2452 goto skip_cache;
2453
Linus Torvalds1da177e2005-04-16 15:20:36 -07002454 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002455 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002456
Linus Torvalds1da177e2005-04-16 15:20:36 -07002457 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002458 rth = rcu_dereference(rth->dst.rt_next)) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08002459 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2460 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
Julian Anastasov97a80412011-08-09 04:01:16 +00002461 (rth->rt_route_iif ^ iif) |
David S. Miller475949d2011-05-03 19:45:15 -07002462 (rth->rt_key_tos ^ tos)) == 0 &&
David S. Miller5e2b61f2011-03-04 21:47:09 -08002463 rth->rt_mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002464 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002465 !rt_is_expired(rth)) {
David S. Millerde398fb2011-12-05 13:21:42 -05002466 ipv4_validate_peer(rth);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002467 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002468 dst_use_noref(&rth->dst, jiffies);
2469 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002470 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002471 dst_use(&rth->dst, jiffies);
2472 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002473 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002474 RT_CACHE_STAT_INC(in_hit);
2475 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002476 return 0;
2477 }
2478 RT_CACHE_STAT_INC(in_hlist_search);
2479 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002480
Neil Horman1080d702008-10-27 12:28:25 -07002481skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002482	/* Multicast recognition logic is moved from the route cache to here.
2483	   The problem was that too many Ethernet cards have broken/missing
2484	   hardware multicast filters :-( As a result, a host on a multicast
2485	   network acquires a lot of useless route cache entries, e.g. from
2486	   SDR announcements from all over the world. Now we try to get rid
2487	   of them. Really, provided the software IP multicast filter is
2488	   organized reasonably (at least, hashed), it does not result in a
2489	   slowdown compared with route cache reject entries.
2490	   Note that multicast routers are not affected, because a route
2491	   cache entry is created eventually.
2492	 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002493 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002494 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002495
Eric Dumazet96d36222010-06-02 19:21:31 +00002496 if (in_dev) {
David S. Millerdbdd9a52011-03-10 16:34:38 -08002497 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2498 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002499 if (our
2500#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002501 ||
2502 (!ipv4_is_local_multicast(daddr) &&
2503 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002504#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002505 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002506 int res = ip_route_input_mc(skb, daddr, saddr,
2507 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002508 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002509 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002510 }
2511 }
2512 rcu_read_unlock();
2513 return -EINVAL;
2514 }
Eric Dumazet96d36222010-06-02 19:21:31 +00002515 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2516 rcu_read_unlock();
2517 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002518}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002519EXPORT_SYMBOL(ip_route_input_common);
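/*
 * Illustrative sketch (editor's addition, not part of the original
 * file): how a caller typically resolves the input route for a
 * received skb via the ip_route_input() wrapper, the same helper
 * inet_rtm_getroute() uses further down. The function name here is
 * made up.
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		const struct iphdr *iph = ip_hdr(skb);
 *		int err;
 *
 *		err = ip_route_input(skb, iph->daddr, iph->saddr,
 *				     iph->tos, dev);
 *		if (err)
 *			return err;
 *		return 0;	-- skb_dst(skb) now holds the route
 *	}
 */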
Linus Torvalds1da177e2005-04-16 15:20:36 -07002520
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002521/* called with rcu_read_lock() */
David S. Miller982721f2011-02-16 21:44:24 -08002522static struct rtable *__mkroute_output(const struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002523 const struct flowi4 *fl4,
David S. Miller813b3b52011-04-28 14:48:42 -07002524 __be32 orig_daddr, __be32 orig_saddr,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002525 int orig_oif, __u8 orig_rtos,
2526 struct net_device *dev_out,
David S. Miller5ada5522011-02-17 15:29:00 -08002527 unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002528{
David S. Miller982721f2011-02-16 21:44:24 -08002529 struct fib_info *fi = res->fi;
David S. Miller5ada5522011-02-17 15:29:00 -08002530 struct in_device *in_dev;
David S. Miller982721f2011-02-16 21:44:24 -08002531 u16 type = res->type;
David S. Miller5ada5522011-02-17 15:29:00 -08002532 struct rtable *rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002533
David S. Miller68a5e3d2011-03-11 20:07:33 -05002534 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
David S. Miller5ada5522011-02-17 15:29:00 -08002535 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002536
David S. Miller68a5e3d2011-03-11 20:07:33 -05002537 if (ipv4_is_lbcast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002538 type = RTN_BROADCAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002539 else if (ipv4_is_multicast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002540 type = RTN_MULTICAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002541 else if (ipv4_is_zeronet(fl4->daddr))
David S. Miller5ada5522011-02-17 15:29:00 -08002542 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002543
2544 if (dev_out->flags & IFF_LOOPBACK)
2545 flags |= RTCF_LOCAL;
2546
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002547 in_dev = __in_dev_get_rcu(dev_out);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002548 if (!in_dev)
David S. Miller5ada5522011-02-17 15:29:00 -08002549 return ERR_PTR(-EINVAL);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002550
David S. Miller982721f2011-02-16 21:44:24 -08002551 if (type == RTN_BROADCAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002552 flags |= RTCF_BROADCAST | RTCF_LOCAL;
David S. Miller982721f2011-02-16 21:44:24 -08002553 fi = NULL;
2554 } else if (type == RTN_MULTICAST) {
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002555 flags |= RTCF_MULTICAST | RTCF_LOCAL;
David S. Miller813b3b52011-04-28 14:48:42 -07002556 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2557 fl4->flowi4_proto))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002558 flags &= ~RTCF_LOCAL;
2559		/* If the multicast route does not exist, use the
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002560		 * default one, but do not gateway in this case.
2561		 * Yes, it is a hack.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002562 */
David S. Miller982721f2011-02-16 21:44:24 -08002563 if (fi && res->prefixlen < 4)
2564 fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002565 }
2566
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002567 rth = rt_dst_alloc(dev_out,
2568 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002569 IN_DEV_CONF_GET(in_dev, NOXFRM));
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002570 if (!rth)
David S. Miller5ada5522011-02-17 15:29:00 -08002571 return ERR_PTR(-ENOBUFS);
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002572
David S. Millercf911662011-04-28 14:31:47 -07002573 rth->dst.output = ip_output;
2574
David S. Miller813b3b52011-04-28 14:48:42 -07002575 rth->rt_key_dst = orig_daddr;
2576 rth->rt_key_src = orig_saddr;
David S. Millercf911662011-04-28 14:31:47 -07002577 rth->rt_genid = rt_genid(dev_net(dev_out));
2578 rth->rt_flags = flags;
2579 rth->rt_type = type;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002580 rth->rt_key_tos = orig_rtos;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002581 rth->rt_dst = fl4->daddr;
2582 rth->rt_src = fl4->saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002583 rth->rt_route_iif = 0;
David S. Miller813b3b52011-04-28 14:48:42 -07002584 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2585 rth->rt_oif = orig_oif;
2586 rth->rt_mark = fl4->flowi4_mark;
Lorenzo Colitti462ce7c2014-03-31 16:23:51 +09002587 rth->rt_uid = fl4->flowi4_uid;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002588 rth->rt_gateway = fl4->daddr;
2589	rth->rt_spec_dst = fl4->saddr;
David S. Millercf911662011-04-28 14:31:47 -07002590 rth->rt_peer_genid = 0;
2591 rth->peer = NULL;
2592 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002593
2594 RT_CACHE_STAT_INC(out_slow_tot);
2595
2596 if (flags & RTCF_LOCAL) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002597 rth->dst.input = ip_local_deliver;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002598 rth->rt_spec_dst = fl4->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002599 }
2600 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
David S. Miller68a5e3d2011-03-11 20:07:33 -05002601 rth->rt_spec_dst = fl4->saddr;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002602 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002603 !(dev_out->flags & IFF_LOOPBACK)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002604 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002605 RT_CACHE_STAT_INC(out_slow_mc);
2606 }
2607#ifdef CONFIG_IP_MROUTE
David S. Miller982721f2011-02-16 21:44:24 -08002608 if (type == RTN_MULTICAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002609 if (IN_DEV_MFORWARD(in_dev) &&
David S. Miller813b3b52011-04-28 14:48:42 -07002610 !ipv4_is_local_multicast(fl4->daddr)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002611 rth->dst.input = ip_mr_input;
2612 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002613 }
2614 }
2615#endif
2616 }
2617
David S. Miller813b3b52011-04-28 14:48:42 -07002618 rt_set_nexthop(rth, fl4, res, fi, type, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002619
David S. Miller5ada5522011-02-17 15:29:00 -08002620 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621}
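/*
 * Editor's note, summarizing the handler wiring above: dst.output
 * defaults to ip_output(); RTCF_LOCAL routes get dst.input =
 * ip_local_deliver(); locally originated multicast/broadcast goes
 * out via ip_mc_output(), and ip_mr_input() takes over only when
 * multicast forwarding is configured for the destination.
 */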
2622
Linus Torvalds1da177e2005-04-16 15:20:36 -07002623/*
2624 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002625 * Called with rcu_read_lock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07002626 */
2627
David S. Miller813b3b52011-04-28 14:48:42 -07002628static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002629{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002630 struct net_device *dev_out = NULL;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002631 __u8 tos = RT_FL_TOS(fl4);
David S. Miller813b3b52011-04-28 14:48:42 -07002632 unsigned int flags = 0;
2633 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002634 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002635 __be32 orig_daddr;
2636 __be32 orig_saddr;
2637 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002638
2639 res.fi = NULL;
2640#ifdef CONFIG_IP_MULTIPLE_TABLES
2641 res.r = NULL;
2642#endif
2643
David S. Miller813b3b52011-04-28 14:48:42 -07002644 orig_daddr = fl4->daddr;
2645 orig_saddr = fl4->saddr;
2646 orig_oif = fl4->flowi4_oif;
2647
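	/*
	 * Editor's note: the flow key is canonicalized before the lookup --
	 * iif is pinned to the loopback device, the tos is masked down to
	 * the bits routing actually uses, and the legacy RTO_ONLINK bit
	 * hidden in the tos is translated into link scope.
	 */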
2648 fl4->flowi4_iif = net->loopback_dev->ifindex;
2649 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2650 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2651 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002652
David S. Miller010c2702011-02-17 15:37:09 -08002653 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002654 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002655 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002656 if (ipv4_is_multicast(fl4->saddr) ||
2657 ipv4_is_lbcast(fl4->saddr) ||
2658 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002659 goto out;
2660
Linus Torvalds1da177e2005-04-16 15:20:36 -07002661		/* I removed the check for oif == dev_out->oif here.
2662		   It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002663		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2664		      is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002665		   2. Moreover, we are allowed to send packets with a saddr
2666		      of another iface. --ANK
2667 */
2668
David S. Miller813b3b52011-04-28 14:48:42 -07002669 if (fl4->flowi4_oif == 0 &&
2670 (ipv4_is_multicast(fl4->daddr) ||
2671 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002672 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002673 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002674 if (dev_out == NULL)
2675 goto out;
2676
Linus Torvalds1da177e2005-04-16 15:20:36 -07002677			/* Special hack: the user can direct multicasts
2678			   and limited broadcast via the necessary interface
2679			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2680			   This hack is not just for fun, it allows
2681			   vic, vat and friends to work.
2682			   They bind a socket to loopback, set the ttl to zero
2683			   and expect that it will work.
2684			   From the viewpoint of the routing cache they are broken,
2685			   because we are not allowed to build a multicast path
2686			   with a loopback source addr (look, the routing cache
2687			   cannot know that the ttl is zero, so the packet
2688			   will not leave this host and the route is valid).
2689			   Luckily, this hack is a good workaround.
2690 */
2691
David S. Miller813b3b52011-04-28 14:48:42 -07002692 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002693 goto make_route;
2694 }
Julian Anastasova210d012008-10-01 07:28:28 -07002695
David S. Miller813b3b52011-04-28 14:48:42 -07002696 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002697 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002698 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002699 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002700 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002701 }
2702
2703
David S. Miller813b3b52011-04-28 14:48:42 -07002704 if (fl4->flowi4_oif) {
2705 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002706 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002707 if (dev_out == NULL)
2708 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002709
2710 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002711 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002712 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002713 goto out;
2714 }
David S. Miller813b3b52011-04-28 14:48:42 -07002715 if (ipv4_is_local_multicast(fl4->daddr) ||
2716 ipv4_is_lbcast(fl4->daddr)) {
2717 if (!fl4->saddr)
2718 fl4->saddr = inet_select_addr(dev_out, 0,
2719 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002720 goto make_route;
2721 }
David S. Miller813b3b52011-04-28 14:48:42 -07002722 if (fl4->saddr) {
2723 if (ipv4_is_multicast(fl4->daddr))
2724 fl4->saddr = inet_select_addr(dev_out, 0,
2725 fl4->flowi4_scope);
2726 else if (!fl4->daddr)
2727 fl4->saddr = inet_select_addr(dev_out, 0,
2728 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002729 }
2730 }
2731
David S. Miller813b3b52011-04-28 14:48:42 -07002732 if (!fl4->daddr) {
2733 fl4->daddr = fl4->saddr;
2734 if (!fl4->daddr)
2735 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002736 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002737 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002738 res.type = RTN_LOCAL;
2739 flags |= RTCF_LOCAL;
2740 goto make_route;
2741 }
2742
David S. Miller813b3b52011-04-28 14:48:42 -07002743 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002744 res.fi = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002745 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002746			/* Apparently, the routing tables are wrong. Assume
2747			   that the destination is on-link.
2748
2749			   WHY? DW.
2750			   Because we are allowed to send to an iface
2751			   even if it has NO routes and NO assigned
2752			   addresses. When oif is specified, the routing
2753			   tables are looked up with only one purpose:
2754			   to catch whether the destination is gatewayed
2755			   rather than direct. Moreover, if MSG_DONTROUTE
2756			   is set, we send the packet, ignoring both the
2757			   routing tables and the ifaddr state. --ANK
2758
2759
2760			   We could do this even when oif is unknown,
2761			   as IPv6 likely does, but we do not.
2762 */
2763
David S. Miller813b3b52011-04-28 14:48:42 -07002764 if (fl4->saddr == 0)
2765 fl4->saddr = inet_select_addr(dev_out, 0,
2766 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002767 res.type = RTN_UNICAST;
2768 goto make_route;
2769 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002770 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002771 goto out;
2772 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002773
2774 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002775 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002776 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002777 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002778 else
David S. Miller813b3b52011-04-28 14:48:42 -07002779 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002780 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002781 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002782 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002783 res.fi = NULL;
2784 flags |= RTCF_LOCAL;
2785 goto make_route;
2786 }
2787
2788#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002789 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002790 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002791 else
2792#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002793 if (!res.prefixlen &&
2794 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002795 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002796 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002797
David S. Miller813b3b52011-04-28 14:48:42 -07002798 if (!fl4->saddr)
2799 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002800
Linus Torvalds1da177e2005-04-16 15:20:36 -07002801 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002802 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002803
2804
2805make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002806 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002807 tos, dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002808 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002809 unsigned int hash;
2810
David S. Miller813b3b52011-04-28 14:48:42 -07002811 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002812 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002813 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002814 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002815
David S. Miller010c2702011-02-17 15:37:09 -08002816out:
2817 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002818 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002819}
2820
David S. Miller813b3b52011-04-28 14:48:42 -07002821struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002822{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002823 struct rtable *rth;
David S. Miller010c2702011-02-17 15:37:09 -08002824 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002825
Neil Horman1080d702008-10-27 12:28:25 -07002826 if (!rt_caching(net))
2827 goto slow_output;
2828
David S. Miller9d6ec932011-03-12 01:12:47 -05002829 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002830
2831 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08002832 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002833 rth = rcu_dereference_bh(rth->dst.rt_next)) {
David S. Miller9d6ec932011-03-12 01:12:47 -05002834 if (rth->rt_key_dst == flp4->daddr &&
2835 rth->rt_key_src == flp4->saddr &&
David S. Millerc7537962010-11-11 17:07:48 -08002836 rt_is_output_route(rth) &&
David S. Miller9d6ec932011-03-12 01:12:47 -05002837 rth->rt_oif == flp4->flowi4_oif &&
2838 rth->rt_mark == flp4->flowi4_mark &&
Lorenzo Colitti462ce7c2014-03-31 16:23:51 +09002839 rth->rt_uid == flp4->flowi4_uid &&
David S. Miller475949d2011-05-03 19:45:15 -07002840 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002841 (IPTOS_RT_MASK | RTO_ONLINK)) &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002842 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002843 !rt_is_expired(rth)) {
David S. Millerde398fb2011-12-05 13:21:42 -05002844 ipv4_validate_peer(rth);
Changli Gaod8d1f302010-06-10 23:31:35 -07002845 dst_use(&rth->dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002846 RT_CACHE_STAT_INC(out_hit);
2847 rcu_read_unlock_bh();
David S. Miller56157872011-05-02 14:37:45 -07002848 if (!flp4->saddr)
2849 flp4->saddr = rth->rt_src;
2850 if (!flp4->daddr)
2851 flp4->daddr = rth->rt_dst;
David S. Millerb23dd4f2011-03-02 14:31:35 -08002852 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002853 }
2854 RT_CACHE_STAT_INC(out_hlist_search);
2855 }
2856 rcu_read_unlock_bh();
2857
Neil Horman1080d702008-10-27 12:28:25 -07002858slow_output:
David S. Miller9d6ec932011-03-12 01:12:47 -05002859 return ip_route_output_slow(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002860}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002861EXPORT_SYMBOL_GPL(__ip_route_output_key);
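/*
 * Illustrative sketch (editor's addition): a minimal caller of the
 * API exported above. Field values and names are made up; a real
 * caller fills the flowi4 from its socket or packet state.
 *
 *	static int example_output(struct net *net, __be32 daddr)
 *	{
 *		struct flowi4 fl4 = {
 *			.daddr = daddr,
 *		};
 *		struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *		if (IS_ERR(rt))
 *			return PTR_ERR(rt);
 *		ip_rt_put(rt);	-- release the route reference
 *		return 0;
 *	}
 */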
2862
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002863static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2864{
2865 return NULL;
2866}
2867
Steffen Klassertebb762f2011-11-23 02:12:51 +00002868static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002869{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002870 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2871
2872 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002873}
2874
David S. Miller14e50e52007-05-24 18:17:54 -07002875static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2876{
2877}
2878
Held Bernhard0972ddb2011-04-24 22:07:32 +00002879static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2880 unsigned long old)
2881{
2882 return NULL;
2883}
2884
David S. Miller14e50e52007-05-24 18:17:54 -07002885static struct dst_ops ipv4_dst_blackhole_ops = {
2886 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -08002887 .protocol = cpu_to_be16(ETH_P_IP),
David S. Miller14e50e52007-05-24 18:17:54 -07002888 .destroy = ipv4_dst_destroy,
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002889 .check = ipv4_blackhole_dst_check,
Steffen Klassertebb762f2011-11-23 02:12:51 +00002890 .mtu = ipv4_blackhole_mtu,
Eric Dumazet214f45c2011-02-18 11:39:01 -08002891 .default_advmss = ipv4_default_advmss,
David S. Miller14e50e52007-05-24 18:17:54 -07002892 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
Held Bernhard0972ddb2011-04-24 22:07:32 +00002893 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
David S. Millerd3aaeb32011-07-18 00:40:17 -07002894 .neigh_lookup = ipv4_neigh_lookup,
David S. Miller14e50e52007-05-24 18:17:54 -07002895};
2896
David S. Miller2774c132011-03-01 14:59:04 -08002897struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
David S. Miller14e50e52007-05-24 18:17:54 -07002898{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002899 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
David S. Miller2774c132011-03-01 14:59:04 -08002900 struct rtable *ort = (struct rtable *) dst_orig;
David S. Miller14e50e52007-05-24 18:17:54 -07002901
2902 if (rt) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002903 struct dst_entry *new = &rt->dst;
David S. Miller14e50e52007-05-24 18:17:54 -07002904
David S. Miller14e50e52007-05-24 18:17:54 -07002905 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002906 new->input = dst_discard;
2907 new->output = dst_discard;
David S. Millerdefb3512010-12-08 21:16:57 -08002908 dst_copy_metrics(new, &ort->dst);
David S. Miller14e50e52007-05-24 18:17:54 -07002909
Changli Gaod8d1f302010-06-10 23:31:35 -07002910 new->dev = ort->dst.dev;
David S. Miller14e50e52007-05-24 18:17:54 -07002911 if (new->dev)
2912 dev_hold(new->dev);
2913
David S. Miller5e2b61f2011-03-04 21:47:09 -08002914 rt->rt_key_dst = ort->rt_key_dst;
2915 rt->rt_key_src = ort->rt_key_src;
David S. Miller475949d2011-05-03 19:45:15 -07002916 rt->rt_key_tos = ort->rt_key_tos;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002917 rt->rt_route_iif = ort->rt_route_iif;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002918 rt->rt_iif = ort->rt_iif;
2919 rt->rt_oif = ort->rt_oif;
2920 rt->rt_mark = ort->rt_mark;
Lorenzo Colitti462ce7c2014-03-31 16:23:51 +09002921 rt->rt_uid = ort->rt_uid;
David S. Miller14e50e52007-05-24 18:17:54 -07002922
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002923 rt->rt_genid = rt_genid(net);
David S. Miller14e50e52007-05-24 18:17:54 -07002924 rt->rt_flags = ort->rt_flags;
2925 rt->rt_type = ort->rt_type;
2926 rt->rt_dst = ort->rt_dst;
2927 rt->rt_src = ort->rt_src;
David S. Miller14e50e52007-05-24 18:17:54 -07002928 rt->rt_gateway = ort->rt_gateway;
2929 rt->rt_spec_dst = ort->rt_spec_dst;
2930 rt->peer = ort->peer;
2931 if (rt->peer)
2932 atomic_inc(&rt->peer->refcnt);
David S. Miller62fa8a82011-01-26 20:51:05 -08002933 rt->fi = ort->fi;
2934 if (rt->fi)
2935 atomic_inc(&rt->fi->fib_clntref);
David S. Miller14e50e52007-05-24 18:17:54 -07002936
2937 dst_free(new);
2938 }
2939
David S. Miller2774c132011-03-01 14:59:04 -08002940 dst_release(dst_orig);
2941
2942 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
David S. Miller14e50e52007-05-24 18:17:54 -07002943}
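/*
 * Editor's note (assumption about the caller, not stated in this
 * file): this blackhole dst is handed out by the xfrm layer for
 * non-blocking lookups whose IPsec state is not yet resolved; both
 * handlers are dst_discard(), so traffic queued on it is silently
 * dropped rather than stalling the caller.
 */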
2944
David S. Miller9d6ec932011-03-12 01:12:47 -05002945struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002946 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002947{
David S. Miller9d6ec932011-03-12 01:12:47 -05002948 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002949
David S. Millerb23dd4f2011-03-02 14:31:35 -08002950 if (IS_ERR(rt))
2951 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002952
David S. Miller56157872011-05-02 14:37:45 -07002953 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002954 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2955 flowi4_to_flowi(flp4),
2956 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002957
David S. Millerb23dd4f2011-03-02 14:31:35 -08002958 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002959}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002960EXPORT_SYMBOL_GPL(ip_route_output_flow);
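/*
 * Editor's note: ip_route_output_flow() is the transform-aware
 * variant -- when the flow names a protocol, the plain route is
 * passed through xfrm_lookup(), so callers transparently receive
 * an IPsec-wrapped dst whenever a matching policy exists.
 */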
2961
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002962static int rt_fill_info(struct net *net,
2963 struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002964 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002965{
Eric Dumazet511c3f92009-06-02 05:14:27 +00002966 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002967 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002968 struct nlmsghdr *nlh;
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00002969 unsigned long expires = 0;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00002970 const struct inet_peer *peer = rt->peer;
Thomas Grafe3703b32006-11-27 09:27:07 -08002971 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002972
2973 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2974 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002975 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002976
2977 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002978 r->rtm_family = AF_INET;
2979 r->rtm_dst_len = 32;
2980 r->rtm_src_len = 0;
David S. Miller475949d2011-05-03 19:45:15 -07002981 r->rtm_tos = rt->rt_key_tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002982 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002983 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002984 r->rtm_type = rt->rt_type;
2985 r->rtm_scope = RT_SCOPE_UNIVERSE;
2986 r->rtm_protocol = RTPROT_UNSPEC;
2987 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2988 if (rt->rt_flags & RTCF_NOTIFY)
2989 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002990
Al Viro17fb2c62006-09-26 22:15:25 -07002991 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002992
David S. Miller5e2b61f2011-03-04 21:47:09 -08002993 if (rt->rt_key_src) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002994 r->rtm_src_len = 32;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002995 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002996 }
Changli Gaod8d1f302010-06-10 23:31:35 -07002997 if (rt->dst.dev)
2998 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
Patrick McHardyc7066f72011-01-14 13:36:42 +01002999#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07003000 if (rt->dst.tclassid)
3001 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003002#endif
David S. Millerc7537962010-11-11 17:07:48 -08003003 if (rt_is_input_route(rt))
Al Viro17fb2c62006-09-26 22:15:25 -07003004 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
David S. Miller5e2b61f2011-03-04 21:47:09 -08003005 else if (rt->rt_src != rt->rt_key_src)
Al Viro17fb2c62006-09-26 22:15:25 -07003006 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07003007
Linus Torvalds1da177e2005-04-16 15:20:36 -07003008 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07003009 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07003010
David S. Millerdefb3512010-12-08 21:16:57 -08003011 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07003012 goto nla_put_failure;
3013
David S. Miller5e2b61f2011-03-04 21:47:09 -08003014 if (rt->rt_mark)
3015 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
Eric Dumazet963bfee2010-07-20 22:03:14 +00003016
Lorenzo Colitti462ce7c2014-03-31 16:23:51 +09003017 if (rt->rt_uid != (uid_t) -1)
3018 NLA_PUT_BE32(skb, RTA_UID, rt->rt_uid);
3019
Changli Gaod8d1f302010-06-10 23:31:35 -07003020 error = rt->dst.error;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003021 if (peer) {
Eric Dumazet317fe0e2010-06-16 04:52:13 +00003022 inet_peer_refcheck(rt->peer);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003023 id = atomic_read(&peer->ip_id_count) & 0xffff;
3024 if (peer->tcp_ts_stamp) {
3025 ts = peer->tcp_ts;
3026 tsage = get_seconds() - peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003027 }
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003028 expires = ACCESS_ONCE(peer->pmtu_expires);
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00003029 if (expires) {
3030 if (time_before(jiffies, expires))
3031 expires -= jiffies;
3032 else
3033 expires = 0;
3034 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003035 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07003036
David S. Millerc7537962010-11-11 17:07:48 -08003037 if (rt_is_input_route(rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003038#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07003039 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003040
Joe Perchesf97c1e02007-12-16 13:45:43 -08003041 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003042 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
David S. Miller9a1b9492011-05-04 12:18:54 -07003043 int err = ipmr_get_route(net, skb,
3044 rt->rt_src, rt->rt_dst,
3045 r, nowait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003046 if (err <= 0) {
3047 if (!nowait) {
3048 if (err == 0)
3049 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07003050 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003051 } else {
3052 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07003053 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08003054 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003055 }
3056 }
3057 } else
3058#endif
David S. Miller5e2b61f2011-03-04 21:47:09 -08003059 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003060 }
3061
Changli Gaod8d1f302010-06-10 23:31:35 -07003062 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
Thomas Grafe3703b32006-11-27 09:27:07 -08003063 expires, error) < 0)
3064 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003065
Thomas Grafbe403ea2006-08-17 18:15:17 -07003066 return nlmsg_end(skb, nlh);
3067
3068nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08003069 nlmsg_cancel(skb, nlh);
3070 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003071}
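/*
 * Editor's note: rt_fill_info() follows the standard netlink fill
 * pattern -- each NLA_PUT* jumps to nla_put_failure when the skb
 * runs out of tailroom, nlmsg_cancel() rolls back the partially
 * built message, and -EMSGSIZE tells the caller to retry with a
 * larger buffer.
 */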
3072
Thomas Graf63f34442007-03-22 11:55:17 -07003073static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003074{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09003075 struct net *net = sock_net(in_skb->sk);
Thomas Grafd889ce32006-08-17 18:15:44 -07003076 struct rtmsg *rtm;
3077 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003078 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07003079 __be32 dst = 0;
3080 __be32 src = 0;
3081 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07003082 int err;
Eric Dumazet963bfee2010-07-20 22:03:14 +00003083 int mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003084 struct sk_buff *skb;
3085
Thomas Grafd889ce32006-08-17 18:15:44 -07003086 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3087 if (err < 0)
3088 goto errout;
3089
3090 rtm = nlmsg_data(nlh);
3091
Linus Torvalds1da177e2005-04-16 15:20:36 -07003092 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07003093 if (skb == NULL) {
3094 err = -ENOBUFS;
3095 goto errout;
3096 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003097
3098 /* Reserve room for dummy headers, this skb can pass
3099 through good chunk of routing engine.
3100 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07003101 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003102 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07003103
3104 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07003105 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003106 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3107
Al Viro17fb2c62006-09-26 22:15:25 -07003108 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3109 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07003110 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Eric Dumazet963bfee2010-07-20 22:03:14 +00003111 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003112
3113 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07003114 struct net_device *dev;
3115
Denis V. Lunev19375042008-02-28 20:52:04 -08003116 dev = __dev_get_by_index(net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07003117 if (dev == NULL) {
3118 err = -ENODEV;
3119 goto errout_free;
3120 }
3121
Linus Torvalds1da177e2005-04-16 15:20:36 -07003122 skb->protocol = htons(ETH_P_IP);
3123 skb->dev = dev;
Eric Dumazet963bfee2010-07-20 22:03:14 +00003124 skb->mark = mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003125 local_bh_disable();
3126 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3127 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07003128
Eric Dumazet511c3f92009-06-02 05:14:27 +00003129 rt = skb_rtable(skb);
Changli Gaod8d1f302010-06-10 23:31:35 -07003130 if (err == 0 && rt->dst.error)
3131 err = -rt->dst.error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003132 } else {
David S. Miller68a5e3d2011-03-11 20:07:33 -05003133 struct flowi4 fl4 = {
3134 .daddr = dst,
3135 .saddr = src,
3136 .flowi4_tos = rtm->rtm_tos,
3137 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3138 .flowi4_mark = mark,
Lorenzo Colitti462ce7c2014-03-31 16:23:51 +09003139 .flowi4_uid = tb[RTA_UID] ? nla_get_u32(tb[RTA_UID]) : current_uid(),
Thomas Grafd889ce32006-08-17 18:15:44 -07003140 };
David S. Miller9d6ec932011-03-12 01:12:47 -05003141 rt = ip_route_output_key(net, &fl4);
David S. Millerb23dd4f2011-03-02 14:31:35 -08003142
3143 err = 0;
3144 if (IS_ERR(rt))
3145 err = PTR_ERR(rt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003146 }
Thomas Grafd889ce32006-08-17 18:15:44 -07003147
Linus Torvalds1da177e2005-04-16 15:20:36 -07003148 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07003149 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003150
Changli Gaod8d1f302010-06-10 23:31:35 -07003151 skb_dst_set(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003152 if (rtm->rtm_flags & RTM_F_NOTIFY)
3153 rt->rt_flags |= RTCF_NOTIFY;
3154
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003155 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Denis V. Lunev19375042008-02-28 20:52:04 -08003156 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07003157 if (err <= 0)
3158 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003159
Denis V. Lunev19375042008-02-28 20:52:04 -08003160 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07003161errout:
Thomas Graf2942e902006-08-15 00:30:25 -07003162 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003163
Thomas Grafd889ce32006-08-17 18:15:44 -07003164errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003165 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07003166 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003167}
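/*
 * Illustrative usage (editor's addition): inet_rtm_getroute() is
 * what answers "ip route get" from iproute2, e.g.
 *
 *	$ ip route get 192.0.2.1 from 198.51.100.7 iif eth0
 *
 * arrives as an RTM_GETROUTE message whose RTA_DST/RTA_SRC/RTA_IIF
 * attributes map onto the fields parsed above; the reply is a
 * single RTM_NEWROUTE built by rt_fill_info().
 */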
3168
3169int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3170{
3171 struct rtable *rt;
3172 int h, s_h;
3173 int idx, s_idx;
Denis V. Lunev19375042008-02-28 20:52:04 -08003174 struct net *net;
3175
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09003176 net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003177
3178 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08003179 if (s_h < 0)
3180 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003181 s_idx = idx = cb->args[1];
Eric Dumazeta6272662008-08-28 01:11:25 -07003182 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3183 if (!rt_hash_table[h].chain)
3184 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003185 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08003186 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
Changli Gaod8d1f302010-06-10 23:31:35 -07003187 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3188 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003189 continue;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07003190 if (rt_is_expired(rt))
Eric Dumazet29e75252008-01-31 17:05:09 -08003191 continue;
Changli Gaod8d1f302010-06-10 23:31:35 -07003192 skb_dst_set_noref(skb, &rt->dst);
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003193 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003194 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07003195 1, NLM_F_MULTI) <= 0) {
Eric Dumazetadf30902009-06-02 05:19:30 +00003196 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003197 rcu_read_unlock_bh();
3198 goto done;
3199 }
Eric Dumazetadf30902009-06-02 05:19:30 +00003200 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003201 }
3202 rcu_read_unlock_bh();
3203 }
3204
3205done:
3206 cb->args[0] = h;
3207 cb->args[1] = idx;
3208 return skb->len;
3209}
3210
3211void ip_rt_multicast_event(struct in_device *in_dev)
3212{
Denis V. Lunev76e6ebf2008-07-05 19:00:44 -07003213 rt_cache_flush(dev_net(in_dev->dev), 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003214}
3215
3216#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003217static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003218 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003219 size_t *lenp, loff_t *ppos)
3220{
3221 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003222 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003223 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003224 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003225
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003226 memcpy(&ctl, __ctl, sizeof(ctl));
3227 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003228 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003229
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003230 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003231 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003232 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003233 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003234
3235 return -EINVAL;
3236}
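/*
 * Illustrative usage (editor's addition): the handler above backs
 * the write-only flush knob, so e.g.
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * copies the written value into flush_delay and passes it to
 * rt_cache_flush() for that netns; reads return -EINVAL, matching
 * the 0200 mode in ipv4_route_flush_table below.
 */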
3237
Al Viroeeb61f72008-07-27 08:59:33 +01003238static ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003239 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003240 .procname = "gc_thresh",
3241 .data = &ipv4_dst_ops.gc_thresh,
3242 .maxlen = sizeof(int),
3243 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003244 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003245 },
3246 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003247 .procname = "max_size",
3248 .data = &ip_rt_max_size,
3249 .maxlen = sizeof(int),
3250 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003251 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003252 },
3253 {
3254 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003255
Linus Torvalds1da177e2005-04-16 15:20:36 -07003256 .procname = "gc_min_interval",
3257 .data = &ip_rt_gc_min_interval,
3258 .maxlen = sizeof(int),
3259 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003260 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003261 },
3262 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003263 .procname = "gc_min_interval_ms",
3264 .data = &ip_rt_gc_min_interval,
3265 .maxlen = sizeof(int),
3266 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003267 .proc_handler = proc_dointvec_ms_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003268 },
3269 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003270 .procname = "gc_timeout",
3271 .data = &ip_rt_gc_timeout,
3272 .maxlen = sizeof(int),
3273 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003274 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003275 },
3276 {
Eric Dumazet9f28a2f2011-12-21 15:47:16 -05003277 .procname = "gc_interval",
3278 .data = &ip_rt_gc_interval,
3279 .maxlen = sizeof(int),
3280 .mode = 0644,
3281 .proc_handler = proc_dointvec_jiffies,
3282 },
3283 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003284 .procname = "redirect_load",
3285 .data = &ip_rt_redirect_load,
3286 .maxlen = sizeof(int),
3287 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003288 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003289 },
3290 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003291 .procname = "redirect_number",
3292 .data = &ip_rt_redirect_number,
3293 .maxlen = sizeof(int),
3294 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003295 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003296 },
3297 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003298 .procname = "redirect_silence",
3299 .data = &ip_rt_redirect_silence,
3300 .maxlen = sizeof(int),
3301 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003302 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003303 },
3304 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003305 .procname = "error_cost",
3306 .data = &ip_rt_error_cost,
3307 .maxlen = sizeof(int),
3308 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003309 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003310 },
3311 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003312 .procname = "error_burst",
3313 .data = &ip_rt_error_burst,
3314 .maxlen = sizeof(int),
3315 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003316 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003317 },
3318 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003319 .procname = "gc_elasticity",
3320 .data = &ip_rt_gc_elasticity,
3321 .maxlen = sizeof(int),
3322 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003323 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003324 },
3325 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003326 .procname = "mtu_expires",
3327 .data = &ip_rt_mtu_expires,
3328 .maxlen = sizeof(int),
3329 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003330 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003331 },
3332 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003333 .procname = "min_pmtu",
3334 .data = &ip_rt_min_pmtu,
3335 .maxlen = sizeof(int),
3336 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003337 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003338 },
3339 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003340 .procname = "min_adv_mss",
3341 .data = &ip_rt_min_advmss,
3342 .maxlen = sizeof(int),
3343 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003344 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003345 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003346 { }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003347};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003348
Al Viro2f4520d2008-08-25 15:17:44 -07003349static struct ctl_table empty[1];
3350
3351static struct ctl_table ipv4_skeleton[] =
3352{
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003353 { .procname = "route",
Hugh Dickinsd994af02008-08-27 02:35:18 -07003354 .mode = 0555, .child = ipv4_route_table},
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003355 { .procname = "neigh",
Hugh Dickinsd994af02008-08-27 02:35:18 -07003356 .mode = 0555, .child = empty},
Al Viro2f4520d2008-08-25 15:17:44 -07003357 { }
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003358};
3359
Al Viro2f4520d2008-08-25 15:17:44 -07003360static __net_initdata struct ctl_path ipv4_path[] = {
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003361 { .procname = "net", },
3362 { .procname = "ipv4", },
Al Viro2f4520d2008-08-25 15:17:44 -07003363 { },
3364};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003365
3366static struct ctl_table ipv4_route_flush_table[] = {
3367 {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003368 .procname = "flush",
3369 .maxlen = sizeof(int),
3370 .mode = 0200,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003371 .proc_handler = ipv4_sysctl_rtcache_flush,
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003372 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003373 { },
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003374};
3375
Al Viro2f4520d2008-08-25 15:17:44 -07003376static __net_initdata struct ctl_path ipv4_route_path[] = {
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003377 { .procname = "net", },
3378 { .procname = "ipv4", },
3379 { .procname = "route", },
Al Viro2f4520d2008-08-25 15:17:44 -07003380 { },
3381};
3382
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003383static __net_init int sysctl_route_net_init(struct net *net)
3384{
3385 struct ctl_table *tbl;
3386
3387 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003388 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003389 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3390 if (tbl == NULL)
3391 goto err_dup;
3392 }
3393 tbl[0].extra1 = net;
3394
3395 net->ipv4.route_hdr =
3396 register_net_sysctl_table(net, ipv4_route_path, tbl);
3397 if (net->ipv4.route_hdr == NULL)
3398 goto err_reg;
3399 return 0;
3400
3401err_reg:
3402 if (tbl != ipv4_route_flush_table)
3403 kfree(tbl);
3404err_dup:
3405 return -ENOMEM;
3406}
3407
3408static __net_exit void sysctl_route_net_exit(struct net *net)
3409{
3410 struct ctl_table *tbl;
3411
3412 tbl = net->ipv4.route_hdr->ctl_table_arg;
3413 unregister_net_sysctl_table(net->ipv4.route_hdr);
3414 BUG_ON(tbl == ipv4_route_flush_table);
3415 kfree(tbl);
3416}
3417
3418static __net_initdata struct pernet_operations sysctl_route_ops = {
3419 .init = sysctl_route_net_init,
3420 .exit = sysctl_route_net_exit,
3421};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003422#endif
3423
Neil Horman3ee94372010-05-08 01:57:52 -07003424static __net_init int rt_genid_init(struct net *net)
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003425{
Neil Horman3ee94372010-05-08 01:57:52 -07003426 get_random_bytes(&net->ipv4.rt_genid,
3427 sizeof(net->ipv4.rt_genid));
David S. Miller436c3b62011-03-24 17:42:21 -07003428 get_random_bytes(&net->ipv4.dev_addr_genid,
3429 sizeof(net->ipv4.dev_addr_genid));
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003430 return 0;
3431}
3432
Neil Horman3ee94372010-05-08 01:57:52 -07003433static __net_initdata struct pernet_operations rt_genid_ops = {
3434 .init = rt_genid_init,
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003435};
3436
3437
Patrick McHardyc7066f72011-01-14 13:36:42 +01003438#ifdef CONFIG_IP_ROUTE_CLASSID
Tejun Heo7d720c32010-02-16 15:20:26 +00003439struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
Patrick McHardyc7066f72011-01-14 13:36:42 +01003440#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003441
3442static __initdata unsigned long rhash_entries;
3443static int __init set_rhash_entries(char *str)
3444{
3445 if (!str)
3446 return 0;
3447 rhash_entries = simple_strtoul(str, &str, 0);
3448 return 1;
3449}
3450__setup("rhash_entries=", set_rhash_entries);
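/*
 * Illustrative usage (editor's addition): rhash_entries= is a boot
 * parameter, e.g.
 *
 *	linux ... rhash_entries=131072
 *
 * overriding the route-cache hash sizing heuristic passed to
 * alloc_large_system_hash() in ip_rt_init() below.
 */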
3451
3452int __init ip_rt_init(void)
3453{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003454 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003455
Patrick McHardyc7066f72011-01-14 13:36:42 +01003456#ifdef CONFIG_IP_ROUTE_CLASSID
Ingo Molnar0dcec8c2009-02-25 14:07:33 +01003457 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003458 if (!ip_rt_acct)
3459 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003460#endif
3461
Alexey Dobriyane5d679f2006-08-26 19:25:52 -07003462 ipv4_dst_ops.kmem_cachep =
3463 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003464 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003465
David S. Miller14e50e52007-05-24 18:17:54 -07003466 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3467
Eric Dumazetfc66f952010-10-08 06:37:34 +00003468 if (dst_entries_init(&ipv4_dst_ops) < 0)
3469 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3470
3471 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3472 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3473
Eric Dumazet424c4b72005-07-05 14:58:19 -07003474 rt_hash_table = (struct rt_hash_bucket *)
3475 alloc_large_system_hash("IP route cache",
3476 sizeof(struct rt_hash_bucket),
3477 rhash_entries,
Jan Beulich44813742009-09-21 17:03:05 -07003478 (totalram_pages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003479 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003480 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003481 &rt_hash_log,
3482 &rt_hash_mask,
Anton Blanchardc9503e02009-04-27 05:42:24 -07003483 rhash_entries ? 0 : 512 * 1024);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003484 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3485 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003486
3487 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3488 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3489
Linus Torvalds1da177e2005-04-16 15:20:36 -07003490 devinet_init();
3491 ip_fib_init();
3492
Eric Dumazet9f28a2f2011-12-21 15:47:16 -05003493 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3494 expires_ljiffies = jiffies;
3495 schedule_delayed_work(&expires_work,
3496 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3497
Denis V. Lunev73b38712008-02-28 20:51:18 -08003498 if (ip_rt_proc_init())
Joe Perches058bd4d2012-03-11 18:36:11 +00003499 pr_err("Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003500#ifdef CONFIG_XFRM
3501 xfrm_init();
Neil Hormana33bc5c2009-07-30 18:52:15 -07003502 xfrm4_init(ip_rt_max_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003503#endif
Greg Rosec7ac8672011-06-10 01:27:09 +00003504 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
Thomas Graf63f34442007-03-22 11:55:17 -07003505
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003506#ifdef CONFIG_SYSCTL
3507 register_pernet_subsys(&sysctl_route_ops);
3508#endif
Neil Horman3ee94372010-05-08 01:57:52 -07003509 register_pernet_subsys(&rt_genid_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003510 return rc;
3511}
3512
Al Viroa1bc6eb2008-07-30 06:32:52 -04003513#ifdef CONFIG_SYSCTL
Al Viroeeb61f72008-07-27 08:59:33 +01003514/*
3515 * We really need to sanitize the damn ipv4 init order, then all
3516 * this nonsense will go away.
3517 */
3518void __init ip_static_sysctl_init(void)
3519{
Al Viro2f4520d2008-08-25 15:17:44 -07003520 register_sysctl_paths(ipv4_path, ipv4_skeleton);
Al Viroeeb61f72008-07-27 08:59:33 +01003521}
Al Viroa1bc6eb2008-07-30 06:32:52 -04003522#endif