/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readibility.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
44
45#include <asm/uaccess.h>
46#include <asm/system.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070050#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
Al Viroa1f8e7f2006-10-19 16:08:53 -040053#include <linux/highmem.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090054#include <linux/slab.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070055
56#include <linux/socket.h>
57#include <linux/sockios.h>
58#include <linux/in.h>
59#include <linux/inet.h>
60#include <linux/netdevice.h>
61#include <linux/etherdevice.h>
62#include <linux/proc_fs.h>
63#include <linux/stat.h>
64#include <linux/init.h>
65
66#include <net/snmp.h>
67#include <net/ip.h>
68#include <net/protocol.h>
69#include <net/route.h>
Patrick McHardycfacb052006-01-08 22:36:54 -080070#include <net/xfrm.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/skbuff.h>
72#include <net/sock.h>
73#include <net/arp.h>
74#include <net/icmp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070075#include <net/checksum.h>
76#include <net/inetpeer.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070077#include <linux/igmp.h>
78#include <linux/netfilter_ipv4.h>
79#include <linux/netfilter_bridge.h>
80#include <linux/mroute.h>
81#include <linux/netlink.h>
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -070082#include <linux/tcp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083
/* Default TTL for locally generated IP packets (net.ipv4.ip_default_ttl). */
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	/* The check field must be zero while the sum is computed. */
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
/*
 * Finalize the IP header (total length and checksum) of a locally
 * generated packet and run it through the NF_INET_LOCAL_OUT hook.
 * The return value comes from nf_hook(); 1 means the caller should
 * continue transmission via dst_output().
 */
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}
104
/*
 * Send a locally generated packet: run the LOCAL_OUT netfilter hook
 * and, when the hook lets the packet through (returns 1), hand it to
 * dst_output().  Returns the hook's error code otherwise.
 */
int ip_local_out(struct sk_buff *skb)
{
	int rc = __ip_local_out(skb);

	if (likely(rc == 1))
		rc = dst_output(skb);

	return rc;
}
EXPORT_SYMBOL_GPL(ip_local_out);
116
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	/* Strip the link-layer header so the skb starts at the IP header. */
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	/* Locally generated data: checksum verification is unnecessary. */
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	/* Re-inject into the local receive path. */
	netif_rx_ni(newskb);
	return 0;
}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{
131 int ttl = inet->uc_ttl;
132
133 if (ttl < 0)
David S. Miller323e1262010-12-12 21:55:08 -0800134 ttl = ip4_dst_hoplimit(dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135 return ttl;
136}
137
/*
 *		Add an ip header to a skbuff and send it out.
 *
 * Builds a fresh IPv4 header (version 4, base ihl 5, plus any options
 * from @opt) in front of the payload already in @skb, fills the
 * addresses from @saddr/@daddr (honouring source routing), and sends
 * the packet via ip_local_out().  The skb must already be routed
 * (skb_rtable() valid).
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	/* With strict/loose source routing the first hop replaces daddr. */
	iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
Linus Torvalds1da177e2005-04-16 15:20:36 -0700179static inline int ip_finish_output2(struct sk_buff *skb)
180{
Eric Dumazetadf30902009-06-02 05:19:30 +0000181 struct dst_entry *dst = skb_dst(skb);
Mitsuru Chinen80787eb2007-04-30 00:48:20 -0700182 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183 struct net_device *dev = dst->dev;
Chuck Leverc2636b42007-10-23 21:07:32 -0700184 unsigned int hh_len = LL_RESERVED_SPACE(dev);
Eric Dumazet8a533662012-02-09 16:13:19 -0500185 struct neighbour *neigh;
186 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700187
Neil Hormanedf391f2009-04-27 02:45:02 -0700188 if (rt->rt_type == RTN_MULTICAST) {
189 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
190 } else if (rt->rt_type == RTN_BROADCAST)
191 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
Mitsuru Chinen80787eb2007-04-30 00:48:20 -0700192
Linus Torvalds1da177e2005-04-16 15:20:36 -0700193 /* Be paranoid, rather than too clever. */
Stephen Hemminger3b04ddd2007-10-09 01:40:57 -0700194 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700195 struct sk_buff *skb2;
196
197 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
198 if (skb2 == NULL) {
199 kfree_skb(skb);
200 return -ENOMEM;
201 }
202 if (skb->sk)
203 skb_set_owner_w(skb2, skb->sk);
204 kfree_skb(skb);
205 skb = skb2;
206 }
207
Eric Dumazet8a533662012-02-09 16:13:19 -0500208 rcu_read_lock();
209 if (dst->hh) {
210 int res = neigh_hh_output(dst->hh, skb);
211
212 rcu_read_unlock();
213 return res;
214 } else {
215 neigh = dst_get_neighbour(dst);
216 if (neigh) {
217 res = neigh->output(skb);
218
219 rcu_read_unlock();
220 return res;
221 }
222 rcu_read_unlock();
223 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700224
225 if (net_ratelimit())
226 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
227 kfree_skb(skb);
228 return -EINVAL;
229}
230
John Heffner628a5c52007-04-20 15:53:27 -0700231static inline int ip_skb_dst_mtu(struct sk_buff *skb)
232{
233 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
234
235 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
Eric Dumazetadf30902009-06-02 05:19:30 +0000236 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
John Heffner628a5c52007-04-20 15:53:27 -0700237}
238
/*
 * POST_ROUTING continuation: re-route packets that picked up an xfrm
 * policy after SNAT, fragment oversized non-GSO packets, and hand the
 * rest to ip_finish_output2().
 */
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	/* GSO packets exceed the MTU by design; hardware segments them. */
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}
253
/*
 * Output path for multicast (and broadcast) packets: loop a clone back
 * to local listeners where required, drop TTL-0 multicasts at the host
 * boundary, then send the original through POST_ROUTING to the wire.
 */
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
314
315int ip_output(struct sk_buff *skb)
316{
Eric Dumazetadf30902009-06-02 05:19:30 +0000317 struct net_device *dev = skb_dst(skb)->dev;
Patrick McHardy1bd9bef2006-01-05 12:20:59 -0800318
Neil Hormanedf391f2009-04-27 02:45:02 -0700319 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700320
Patrick McHardy1bd9bef2006-01-05 12:20:59 -0800321 skb->dev = dev;
322 skb->protocol = htons(ETH_P_IP);
323
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100324 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900325 ip_finish_output,
Patrick McHardy48d5cad2006-02-15 15:10:22 -0800326 !(IPCB(skb)->flags & IPSKB_REROUTED));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700327}
328
/*
 * Queue a packet for transmission on a connected socket: find (or
 * reuse) a route, build the IP header including any IP options, and
 * send via ip_local_out().  Runs under rcu_read_lock() to protect the
 * socket's cached ip_options and the noref dst attached to the skb.
 * Returns 0/positive on success, -EHOSTUNREACH when no route exists.
 */
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		/* Cache the new route on the socket. */
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	/* Strict source routing requires the next hop to be the gateway. */
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	/* Write version(4), ihl(5) and tos as one 16-bit store. */
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;
	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	/* Reserve one IP id per GSO segment. */
	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700417
418
/*
 * Copy per-packet metadata from @from to @to so that each fragment
 * carries the same routing, netfilter and QoS state as the original.
 */
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}
445
/*
 *	This IP datagram is too large to be sent in one piece. Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 *
 * Two strategies: when the skb already carries a well-formed frag_list
 * the fragments are sent in place (fast path); otherwise the payload is
 * copied block by block into freshly allocated skbs (slow path).  Each
 * fragment is emitted through @output.  The original skb is consumed.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	/* DF set and fragmentation not locally overridden: report and drop. */
	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				/* Transfer socket accounting to each fragment. */
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* Output failed: release any fragments not yet sent. */
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* Undo the ownership transfer done above before copying. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;		/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);
716
Linus Torvalds1da177e2005-04-16 15:20:36 -0700717int
718ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
719{
720 struct iovec *iov = from;
721
Patrick McHardy84fa7932006-08-29 16:44:56 -0700722 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700723 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
724 return -EFAULT;
725 } else {
Al Viro44bb9362006-11-14 21:36:14 -0800726 __wsum csum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700727 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
728 return -EFAULT;
729 skb->csum = csum_block_add(skb->csum, csum, odd);
730 }
731 return 0;
732}
Eric Dumazet4bc2f182010-07-09 21:22:10 +0000733EXPORT_SYMBOL(ip_generic_getfrag);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700734
Al Viro44bb9362006-11-14 21:36:14 -0800735static inline __wsum
Linus Torvalds1da177e2005-04-16 15:20:36 -0700736csum_page(struct page *page, int offset, int copy)
737{
738 char *kaddr;
Al Viro44bb9362006-11-14 21:36:14 -0800739 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700740 kaddr = kmap(page);
741 csum = csum_partial(kaddr + offset, copy, 0);
742 kunmap(page);
743 return csum;
744}
745
Adrian Bunk4b30b1c2005-11-29 16:27:20 -0800746static inline int ip_ufo_append_data(struct sock *sk,
Herbert Xu1470ddf2011-03-01 02:36:47 +0000747 struct sk_buff_head *queue,
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700748 int getfrag(void *from, char *to, int offset, int len,
749 int odd, struct sk_buff *skb),
750 void *from, int length, int hh_len, int fragheaderlen,
Bill Sommerfeld43392672011-07-19 15:22:33 +0000751 int transhdrlen, int maxfraglen, unsigned int flags)
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700752{
753 struct sk_buff *skb;
754 int err;
755
756 /* There is support for UDP fragmentation offload by network
757 * device, so create one single skb packet containing complete
758 * udp datagram
759 */
Herbert Xu1470ddf2011-03-01 02:36:47 +0000760 if ((skb = skb_peek_tail(queue)) == NULL) {
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700761 skb = sock_alloc_send_skb(sk,
762 hh_len + fragheaderlen + transhdrlen + 20,
763 (flags & MSG_DONTWAIT), &err);
764
765 if (skb == NULL)
766 return err;
767
768 /* reserve space for Hardware header */
769 skb_reserve(skb, hh_len);
770
771 /* create space for UDP/IP header */
Jianjun Kongd9319102008-11-03 00:23:42 -0800772 skb_put(skb, fragheaderlen + transhdrlen);
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700773
774 /* initialize network header pointer */
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -0700775 skb_reset_network_header(skb);
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700776
777 /* initialize protocol header pointer */
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -0700778 skb->transport_header = skb->network_header + fragheaderlen;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700779
Patrick McHardy84fa7932006-08-29 16:44:56 -0700780 skb->ip_summed = CHECKSUM_PARTIAL;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700781 skb->csum = 0;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700782
Kostya Bbe9164e2008-04-29 22:36:30 -0700783 /* specify the length of each IP datagram fragment */
Bill Sommerfeld43392672011-07-19 15:22:33 +0000784 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
Herbert Xuf83ef8c2006-06-30 13:37:03 -0700785 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
Herbert Xu1470ddf2011-03-01 02:36:47 +0000786 __skb_queue_tail(queue, skb);
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700787 }
Kostya Bbe9164e2008-04-29 22:36:30 -0700788
789 return skb_append_datato_frags(sk, skb, getfrag, from,
790 (length - transhdrlen));
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700791}
792
/*
 * __ip_append_data - append user data to the pending-fragment queue.
 *
 * Builds a chain of skbs on @queue, each sized so that prepending an IP
 * header later yields a ready-to-send IP fragment (see the loop comment
 * below).  On UFO-capable devices the whole UDP datagram is handed off
 * as one large skb via ip_ufo_append_data() instead.
 *
 * Returns 0 on success or a negative errno; on failure the bytes that
 * could not be queued are subtracted from cork->length again.
 */
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	/* Extension (e.g. IPsec transform) header space is only charged
	 * for the first skb of the datagram.
	 */
	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* Fragment payload must be a multiple of 8 bytes (IP frag_off unit). */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* The IPv4 total-length field is 16 bits: refuse to grow past 64KiB. */
	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			/* fraggap: bytes beyond the 8-byte-aligned fragment
			 * boundary that must migrate to the new skb.
			 */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Follow-up fragments: only a soft wmem
				 * limit (2 * sndbuf) is applied.
				 */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 * Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				/* Move the overhang from the previous skb and
				 * fix both skbs' software checksums.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* Transport header, ext header and HW-csum eligibility
			 * apply to the first fragment only.
			 */
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter/gather: copy into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter/gather: append into page fragments,
			 * reusing the cork's partially-filled page.
			 */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1047
Herbert Xu1470ddf2011-03-01 02:36:47 +00001048static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1049 struct ipcm_cookie *ipc, struct rtable **rtp)
1050{
1051 struct inet_sock *inet = inet_sk(sk);
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001052 struct ip_options_rcu *opt;
Herbert Xu1470ddf2011-03-01 02:36:47 +00001053 struct rtable *rt;
1054
1055 /*
1056 * setup for corking.
1057 */
1058 opt = ipc->opt;
1059 if (opt) {
1060 if (cork->opt == NULL) {
1061 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1062 sk->sk_allocation);
1063 if (unlikely(cork->opt == NULL))
1064 return -ENOBUFS;
1065 }
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001066 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
Herbert Xu1470ddf2011-03-01 02:36:47 +00001067 cork->flags |= IPCORK_OPT;
1068 cork->addr = ipc->addr;
1069 }
1070 rt = *rtp;
1071 if (unlikely(!rt))
1072 return -EFAULT;
1073 /*
1074 * We steal reference to this route, caller should not release it
1075 */
1076 *rtp = NULL;
1077 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
Steffen Klassert353e5c92011-06-22 01:05:37 +00001078 rt->dst.dev->mtu : dst_mtu(&rt->dst);
Herbert Xu1470ddf2011-03-01 02:36:47 +00001079 cork->dst = &rt->dst;
1080 cork->length = 0;
1081 cork->tx_flags = ipc->tx_flags;
1082 cork->page = NULL;
1083 cork->off = 0;
1084
1085 return 0;
1086}
1087
1088/*
1089 * ip_append_data() and ip_append_page() can make one large IP datagram
1090 * from many pieces of data. Each pieces will be holded on the socket
1091 * until ip_push_pending_frames() is called. Each piece can be a page
1092 * or non-page data.
1093 *
1094 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1095 * this interface potentially.
1096 *
1097 * LATER: length must be adjusted by pad at tail, when it is required.
1098 */
David S. Millerf5fca602011-05-08 17:24:10 -07001099int ip_append_data(struct sock *sk, struct flowi4 *fl4,
Herbert Xu1470ddf2011-03-01 02:36:47 +00001100 int getfrag(void *from, char *to, int offset, int len,
1101 int odd, struct sk_buff *skb),
1102 void *from, int length, int transhdrlen,
1103 struct ipcm_cookie *ipc, struct rtable **rtp,
1104 unsigned int flags)
1105{
1106 struct inet_sock *inet = inet_sk(sk);
1107 int err;
1108
1109 if (flags&MSG_PROBE)
1110 return 0;
1111
1112 if (skb_queue_empty(&sk->sk_write_queue)) {
David S. Millerbdc712b2011-05-06 15:02:07 -07001113 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
Herbert Xu1470ddf2011-03-01 02:36:47 +00001114 if (err)
1115 return err;
1116 } else {
1117 transhdrlen = 0;
1118 }
1119
David S. Millerf5fca602011-05-08 17:24:10 -07001120 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
Herbert Xu1470ddf2011-03-01 02:36:47 +00001121 from, length, transhdrlen, flags);
1122}
1123
/*
 * ip_append_page - zero-copy companion of ip_append_data(): link @size
 * bytes of @page into the tail of an already-corked datagram.
 *
 * Requires a preceding ip_append_data() call (the write queue must not
 * be empty) and a device with scatter/gather support.  Returns 0 or a
 * negative errno; on failure the unsent bytes are subtracted from
 * cork->length again.
 */
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	/* Raw sockets building their own IP header cannot use this path. */
	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	/* Page frags require scatter/gather support on the device. */
	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* Fragment payload must be a multiple of 8 bytes (IP frag_off unit). */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* The IPv4 total-length field is 16 bits: refuse to grow past 64KiB. */
	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	/* UFO device: let the hardware do the UDP fragmentation. */
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		/* With GSO the single skb can absorb everything. */
		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			/* Current fragment is full: start a new header-only
			 * skb and migrate the overhang (fraggap) into it.
			 */
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 * Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		/* Extend the previous page frag if contiguous, else add a
		 * new frag slot holding an extra reference on @page.
		 */
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1268
Herbert Xu1470ddf2011-03-01 02:36:47 +00001269static void ip_cork_release(struct inet_cork *cork)
Pavel Emelyanov429f08e2007-11-05 21:03:24 -08001270{
Herbert Xu1470ddf2011-03-01 02:36:47 +00001271 cork->flags &= ~IPCORK_OPT;
1272 kfree(cork->opt);
1273 cork->opt = NULL;
1274 dst_release(cork->dst);
1275 cork->dst = NULL;
Pavel Emelyanov429f08e2007-11-05 21:03:24 -08001276}
1277
/*
 * Combined all pending IP fragments on the socket as one IP datagram
 * and push them out.
 */
/*
 * __ip_make_skb - collapse every skb on @queue into one datagram and
 * fill in its IPv4 header.
 *
 * The remaining queue members become the frag_list of the first skb,
 * the cork's route reference is transferred to the skb (no refcount
 * churn), and the cork is released.  Returns the finished skb, or NULL
 * if nothing was queued.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining fragments onto frag_list; ownership moves
	 * to the head skb, so clear each fragment's sk/destructor.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter, what transforms
	 * how transforms change size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	/* Build the IPv4 header in place; ihl counts 32-bit words,
	 * 5 == 20-byte base header, options are added below.
	 */
	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}
1371
1372int ip_send_skb(struct sk_buff *skb)
1373{
1374 struct net *net = sock_net(skb->sk);
1375 int err;
1376
Herbert Xuc439cb22008-01-11 19:14:00 -08001377 err = ip_local_out(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001378 if (err) {
1379 if (err > 0)
Eric Dumazet6ce9e7b2009-09-02 18:05:33 -07001380 err = net_xmit_errno(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381 if (err)
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001382 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001383 }
1384
Linus Torvalds1da177e2005-04-16 15:20:36 -07001385 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001386}
1387
/*
 * Finalize and transmit the datagram pending on the socket.
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Nothing queued (or finish failed): not an error here. */
	if (!skb)
		return 0;

	/* Netfilter gets whole the not fragmented skb. */
	return ip_send_skb(skb);
}
1399
Linus Torvalds1da177e2005-04-16 15:20:36 -07001400/*
1401 * Throw away all pending data on the socket.
1402 */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001403static void __ip_flush_pending_frames(struct sock *sk,
1404 struct sk_buff_head *queue,
1405 struct inet_cork *cork)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001406{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001407 struct sk_buff *skb;
1408
Herbert Xu1470ddf2011-03-01 02:36:47 +00001409 while ((skb = __skb_dequeue_tail(queue)) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001410 kfree_skb(skb);
1411
Herbert Xu1470ddf2011-03-01 02:36:47 +00001412 ip_cork_release(cork);
1413}
1414
1415void ip_flush_pending_frames(struct sock *sk)
1416{
David S. Millerbdc712b2011-05-06 15:02:07 -07001417 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001418}
1419
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001420struct sk_buff *ip_make_skb(struct sock *sk,
David S. Miller77968b72011-05-08 17:12:19 -07001421 struct flowi4 *fl4,
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001422 int getfrag(void *from, char *to, int offset,
1423 int len, int odd, struct sk_buff *skb),
1424 void *from, int length, int transhdrlen,
1425 struct ipcm_cookie *ipc, struct rtable **rtp,
1426 unsigned int flags)
1427{
David S. Millerb80d7222011-05-06 15:06:01 -07001428 struct inet_cork cork;
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001429 struct sk_buff_head queue;
1430 int err;
1431
1432 if (flags & MSG_PROBE)
1433 return NULL;
1434
1435 __skb_queue_head_init(&queue);
1436
David S. Millerb80d7222011-05-06 15:06:01 -07001437 cork.flags = 0;
1438 cork.addr = 0;
David S. Miller70652722011-05-06 16:01:15 -07001439 cork.opt = NULL;
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001440 err = ip_setup_cork(sk, &cork, ipc, rtp);
1441 if (err)
1442 return ERR_PTR(err);
1443
David S. Millerf5fca602011-05-08 17:24:10 -07001444 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001445 from, length, transhdrlen, flags);
1446 if (err) {
1447 __ip_flush_pending_frames(sk, &queue, &cork);
1448 return ERR_PTR(err);
1449 }
1450
David S. Miller77968b72011-05-08 17:12:19 -07001451 return __ip_make_skb(sk, fl4, &queue, &cork);
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001452}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001453
1454/*
1455 * Fetch data from kernel space and fill in checksum if needed.
1456 */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001457static int ip_reply_glue_bits(void *dptr, char *to, int offset,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001458 int len, int odd, struct sk_buff *skb)
1459{
Al Viro50842052006-11-14 21:36:34 -08001460 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001461
1462 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1463 skb->csum = csum_block_add(skb->csum, csum, odd);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001464 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001465}
1466
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001467/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001468 * Generic function to send a packet as reply to another packet.
1469 * Used to send TCP resets so far. ICMP should use this function too.
1470 *
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001471 * Should run single threaded per socket because it uses the sock
Linus Torvalds1da177e2005-04-16 15:20:36 -07001472 * structure to pass arguments.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001473 */
David S. Miller0a5ebb82011-05-09 13:22:43 -07001474void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1475 struct ip_reply_arg *arg, unsigned int len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001476{
1477 struct inet_sock *inet = inet_sk(sk);
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001478 struct ip_options_data replyopts;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479 struct ipcm_cookie ipc;
David S. Miller77968b72011-05-08 17:12:19 -07001480 struct flowi4 fl4;
Eric Dumazet511c3f92009-06-02 05:14:27 +00001481 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001482
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001483 if (ip_options_echo(&replyopts.opt.opt, skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001484 return;
1485
David S. Miller0a5ebb82011-05-09 13:22:43 -07001486 ipc.addr = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001487 ipc.opt = NULL;
Oliver Hartkopp2244d072010-08-17 08:59:14 +00001488 ipc.tx_flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001489
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001490 if (replyopts.opt.opt.optlen) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001491 ipc.opt = &replyopts.opt;
1492
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001493 if (replyopts.opt.opt.srr)
1494 daddr = replyopts.opt.opt.faddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001495 }
1496
David S. Miller77968b72011-05-08 17:12:19 -07001497 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1498 RT_TOS(ip_hdr(skb)->tos),
1499 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1500 ip_reply_arg_flowi_flags(arg),
1501 daddr, rt->rt_spec_dst,
1502 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1503 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1504 rt = ip_route_output_key(sock_net(sk), &fl4);
1505 if (IS_ERR(rt))
1506 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507
1508 /* And let IP do all the hard work.
1509
1510 This chunk is not reenterable, hence spinlock.
1511 Note that it uses the fact, that this function is called
1512 with locally disabled BH and that sk cannot be already spinlocked.
1513 */
1514 bh_lock_sock(sk);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001515 inet->tos = ip_hdr(skb)->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001516 sk->sk_priority = skb->priority;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001517 sk->sk_protocol = ip_hdr(skb)->protocol;
Patrick McHardyf0e48db2007-06-04 21:32:46 -07001518 sk->sk_bound_dev_if = arg->bound_dev_if;
David S. Millerf5fca602011-05-08 17:24:10 -07001519 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
Eric Dumazet2e77d892008-11-24 15:52:46 -08001520 &ipc, &rt, MSG_DONTWAIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001521 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1522 if (arg->csumoffset >= 0)
Arnaldo Carvalho de Melo9c702202007-04-25 18:04:18 -07001523 *((__sum16 *)skb_transport_header(skb) +
1524 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1525 arg->csum));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001526 skb->ip_summed = CHECKSUM_NONE;
David S. Miller77968b72011-05-08 17:12:19 -07001527 ip_push_pending_frames(sk, &fl4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001528 }
1529
1530 bh_unlock_sock(sk);
1531
1532 ip_rt_put(rt);
1533}
1534
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535void __init ip_init(void)
1536{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001537 ip_rt_init();
1538 inet_initpeers();
1539
1540#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1541 igmp_mc_proc_init();
1542#endif
1543}