blob: 658396d4d67e781179fc0c43c0ceeb6c1798963e [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090023 * Bradford Johnson: Fix faulty handling of some frames when
Linus Torvalds1da177e2005-04-16 15:20:36 -070024 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090034 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
36 * and more readability.
Linus Torvalds1da177e2005-04-16 15:20:36 -070037 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070046#include <linux/module.h>
47#include <linux/types.h>
48#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070049#include <linux/mm.h>
50#include <linux/string.h>
51#include <linux/errno.h>
Al Viroa1f8e7f2006-10-19 16:08:53 -040052#include <linux/highmem.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090053#include <linux/slab.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070054
55#include <linux/socket.h>
56#include <linux/sockios.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/etherdevice.h>
61#include <linux/proc_fs.h>
62#include <linux/stat.h>
63#include <linux/init.h>
64
65#include <net/snmp.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <net/route.h>
Patrick McHardycfacb052006-01-08 22:36:54 -080069#include <net/xfrm.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070070#include <linux/skbuff.h>
71#include <net/sock.h>
72#include <net/arp.h>
73#include <net/icmp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074#include <net/checksum.h>
75#include <net/inetpeer.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070076#include <linux/igmp.h>
77#include <linux/netfilter_ipv4.h>
78#include <linux/netfilter_bridge.h>
79#include <linux/mroute.h>
80#include <linux/netlink.h>
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -070081#include <linux/tcp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070082
/* Default TTL for locally generated IPv4 packets (tunable via sysctl). */
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
Linus Torvalds1da177e2005-04-16 15:20:36 -070085
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	/* The checksum field must be zero while the sum is computed. */
	iph->check = 0;
	/* ihl is in 32-bit words, which is the unit ip_fast_csum() expects. */
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
Linus Torvalds1da177e2005-04-16 15:20:36 -070093
/*
 * Finalize the IP header of a locally generated packet (total length and
 * header checksum) and run it through the netfilter LOCAL_OUT hook with
 * dst_output as the okfn.  A return value of 1 means the hook accepted
 * the packet without consuming it and the caller must transmit it
 * (see ip_local_out()); other values are netfilter/errno results.
 */
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}
103
/*
 * Send a locally generated packet: pass it through the LOCAL_OUT
 * netfilter hook and, when accepted (return value 1), hand it on to
 * dst_output() for actual transmission.
 */
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
115
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	/* Rewind skb->data back to the network header. */
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	/* A locally looped copy never needs checksum verification. */
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	/* Take a real reference on the dst before handing off to rx path. */
	skb_dst_force(newskb);
	netif_rx_ni(newskb);
	return 0;
}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{
131 int ttl = inet->uc_ttl;
132
133 if (ttl < 0)
David S. Miller323e1262010-12-12 21:55:08 -0800134 ttl = ip4_dst_hoplimit(dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135 return ttl;
136}
137
/*
 *		Add an ip header to a skbuff and send it out.
 *
 * The caller must already have attached a route to the skb (skb_rtable).
 * saddr/daddr are the addresses placed in the header; with a strict
 * source-route option the wire destination becomes the first hop.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	/* With source routing (srr) the next hop, not daddr, goes on the wire. */
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(skb, sk);

	if (opt && opt->opt.optlen) {
		/* Options extend the header; ihl counts 32-bit words. */
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
/*
 * Final transmit step: account multicast/broadcast output, make sure the
 * skb has enough headroom for the link-layer header, then emit it via
 * the route's neighbour entry.  Consumes the skb on every path.
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		/* Reallocate with room for the hardware header. */
		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		/* Keep socket memory accounting attached to the new copy. */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	/* The neighbour reference is only valid under RCU. */
	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();

	/* No neighbour entry: nothing to resolve the L2 address with. */
	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}
222
John Heffner628a5c52007-04-20 15:53:27 -0700223static inline int ip_skb_dst_mtu(struct sk_buff *skb)
224{
225 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
226
227 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
Eric Dumazetadf30902009-06-02 05:19:30 +0000228 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
John Heffner628a5c52007-04-20 15:53:27 -0700229}
230
/*
 * Post-routing finisher: re-route packets that picked up an xfrm policy
 * after SNAT, fragment oversized non-GSO packets, and hand the rest to
 * ip_finish_output2().
 */
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	/* GSO skbs are segmented later, so only fragment real packets. */
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}
245
/*
 * Output path for multicast (and locally broadcast) packets: loop a copy
 * back to local listeners where required, enforce the TTL-0 host
 * boundary, then send the original through POST_ROUTING.
 */
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		/* Broadcasts also get a looped-back copy for local delivery. */
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
306
/*
 * Standard unicast output entry point (dst->output): account the packet,
 * stamp the outgoing device/protocol and run POST_ROUTING, skipping the
 * hook for packets already marked IPSKB_REROUTED.
 */
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
320
/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *	  iph->saddr = fl4->saddr;
 *	  iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	/* The single memcpy is only valid if daddr immediately follows
	 * saddr in struct flowi4 (and likewise in struct iphdr); enforce
	 * the flowi4 layout at compile time.
	 */
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}
334
/*
 * Queue a packet from a connected transport socket (e.g. via TCP) for
 * output: (re)validate or create the cached route, build the IP header
 * from socket state and the flow, and push the packet into the local
 * output path.  Returns 0/positive on success, -EHOSTUNREACH when no
 * route can be found.
 */
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		/* Cache the new route on the socket. */
		sk_setup_caps(sk, &rt->dst);
	}
	/* No extra ref needed; the rcu read section protects the dst. */
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	/* Strict source routing requires the route to match the next hop. */
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	/* Write version (4), ihl (5) and tos with a single 16-bit store. */
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	/* GSO packets consume one IP id per segment. */
	ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700422
423
/*
 * Propagate routing, netfilter and QoS metadata from an original skb
 * onto a freshly built fragment so it is handled like the original.
 */
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Replace any dst on the fragment with the original's. */
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}
450
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	/* DF set and fragmentation not locally overridden: report the
	 * needed MTU to the sender via ICMP and drop.
	 */
	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				/* Each fragment now owns its own share of
				 * the socket's write memory.
				 */
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* Output failed part-way: free the remaining fragments. */
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* Undo the ownership transfer done on the fragments that
		 * were already walked before falling back to copying.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);
721
Linus Torvalds1da177e2005-04-16 15:20:36 -0700722int
723ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
724{
725 struct iovec *iov = from;
726
Patrick McHardy84fa7932006-08-29 16:44:56 -0700727 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700728 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
729 return -EFAULT;
730 } else {
Al Viro44bb9362006-11-14 21:36:14 -0800731 __wsum csum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700732 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
733 return -EFAULT;
734 skb->csum = csum_block_add(skb->csum, csum, odd);
735 }
736 return 0;
737}
Eric Dumazet4bc2f182010-07-09 21:22:10 +0000738EXPORT_SYMBOL(ip_generic_getfrag);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700739
Al Viro44bb9362006-11-14 21:36:14 -0800740static inline __wsum
Linus Torvalds1da177e2005-04-16 15:20:36 -0700741csum_page(struct page *page, int offset, int copy)
742{
743 char *kaddr;
Al Viro44bb9362006-11-14 21:36:14 -0800744 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700745 kaddr = kmap(page);
746 csum = csum_partial(kaddr + offset, copy, 0);
747 kunmap(page);
748 return csum;
749}
750
/*
 * UDP fragmentation offload (UFO) path: instead of chaining one skb per
 * IP fragment, keep a single oversized skb on @queue and let the device
 * segment it at transmit time.  Called from __ip_append_data() when the
 * output device advertises NETIF_F_UFO.  Returns 0 on success or a
 * negative errno from the allocation/copy helpers.
 */
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		/* First chunk: allocate the head skb with room for the
		 * link-layer, IP and transport headers plus a little slack.
		 */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		/* Checksum will be completed by hardware. */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	/* Payload (everything past the transport header) is appended to
	 * the single queued skb as page fragments.
	 */
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
797
/*
 * Core worker behind ip_append_data() and ip_make_skb(): append @length
 * bytes, pulled in piecewise via @getfrag, onto the fragment queue
 * @queue, creating MTU-sized skbs as needed so that each queued skb is
 * one ready-to-send IP fragment (header space reserved, payload filled).
 *
 * @transhdrlen is non-zero only on the first call for a datagram and
 * reserves room for the transport header in the first fragment.  @cork
 * carries the per-datagram state (route, copied options, fragment size,
 * accumulated length).  Returns 0 or a negative errno; on error the
 * un-appended bytes are subtracted back out of cork->length and
 * OUTDISCARDS is bumped.
 */
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	/* Extra header room (dst.header_len, e.g. for xfrm transforms) is
	 * only accounted while building the first skb of the datagram.
	 */
	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* maxfraglen: fragment payload boundary, rounded down to the
	 * 8-byte granularity required by the IP fragment-offset field.
	 */
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* The IPv4 total-length field is 16 bits wide. */
	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	/* UFO fast path: for UDP on a UFO-capable device, queue one large
	 * skb and let the hardware segment it (see ip_ufo_append_data()).
	 */
	if (((length > mtu) || (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			/* fraggap: bytes past the fragment boundary in the
			 * previous skb; they migrate into the new skb so the
			 * previous fragment ends exactly at maxfraglen.
			 */
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Non-first fragments: allocate without
				 * blocking, bounded to twice the send buffer.
				 */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 * Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			/* Move the overhanging tail of the previous skb into
			 * this one, keeping both partial checksums correct.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			/* First-fragment-only state is now consumed. */
			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter/gather: append into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter/gather: append into page fragments,
			 * reusing the cork's current page while it has room.
			 */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1053
/*
 * Initialize the cork state for a new pending datagram: copy the caller's
 * IP options (if any) into cork-owned storage, record the fragment size
 * to build against, and steal the route reference from @rtp (*rtp is set
 * to NULL on success so the caller must not release it).  Returns 0,
 * -ENOBUFS if the options buffer cannot be allocated, or -EFAULT if no
 * route was supplied.
 */
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			/* 40 == maximum size of the IP options area
			 * (60-byte header minus the 20-byte fixed part).
			 */
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	/* With IP_PMTUDISC_PROBE, fragment against the device MTU rather
	 * than the (possibly smaller) cached path MTU.
	 */
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}
1093
1094/*
1095 * ip_append_data() and ip_append_page() can make one large IP datagram
 * from many pieces of data. Each piece will be held on the socket
1097 * until ip_push_pending_frames() is called. Each piece can be a page
1098 * or non-page data.
1099 *
1100 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1101 * this interface potentially.
1102 *
1103 * LATER: length must be adjusted by pad at tail, when it is required.
1104 */
David S. Millerf5fca602011-05-08 17:24:10 -07001105int ip_append_data(struct sock *sk, struct flowi4 *fl4,
Herbert Xu1470ddf2011-03-01 02:36:47 +00001106 int getfrag(void *from, char *to, int offset, int len,
1107 int odd, struct sk_buff *skb),
1108 void *from, int length, int transhdrlen,
1109 struct ipcm_cookie *ipc, struct rtable **rtp,
1110 unsigned int flags)
1111{
1112 struct inet_sock *inet = inet_sk(sk);
1113 int err;
1114
1115 if (flags&MSG_PROBE)
1116 return 0;
1117
1118 if (skb_queue_empty(&sk->sk_write_queue)) {
David S. Millerbdc712b2011-05-06 15:02:07 -07001119 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
Herbert Xu1470ddf2011-03-01 02:36:47 +00001120 if (err)
1121 return err;
1122 } else {
1123 transhdrlen = 0;
1124 }
1125
David S. Millerf5fca602011-05-08 17:24:10 -07001126 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
Herbert Xu1470ddf2011-03-01 02:36:47 +00001127 from, length, transhdrlen, flags);
1128}
1129
/*
 * Zero-copy companion to ip_append_data(): append @size bytes of @page
 * (starting at @offset) to the datagram already corked on @sk, referencing
 * the page from skb fragments instead of copying.  Requires an SG-capable
 * device and a non-empty write queue (ip_append_data() must have been
 * called first).  Returns 0 or a negative errno; on error the un-appended
 * bytes are subtracted back out of cork->length.
 */
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	/* Raw header-included sockets build their own headers. */
	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	/* Fragment boundary rounded to the 8-byte IP offset granularity. */
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* The IPv4 total-length field is 16 bits wide. */
	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	/* Oversized UDP on a UFO device: mark the tail skb for hardware
	 * segmentation instead of splitting it here.
	 */
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		/* GSO skbs can absorb everything; the device segments. */
		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			/* Current fragment is full: start a new skb,
			 * pulling any bytes past the fragment boundary
			 * (fraggap) over from the previous one.
			 */
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 * Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overhang and keep both partial
				 * checksums consistent.
				 */
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		/* Extend the last fragment if it is contiguous with this
		 * chunk, otherwise add a new page fragment (taking a page
		 * reference).
		 */
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		/* Software checksum must cover the referenced page data. */
		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1274
Herbert Xu1470ddf2011-03-01 02:36:47 +00001275static void ip_cork_release(struct inet_cork *cork)
Pavel Emelyanov429f08e2007-11-05 21:03:24 -08001276{
Herbert Xu1470ddf2011-03-01 02:36:47 +00001277 cork->flags &= ~IPCORK_OPT;
1278 kfree(cork->opt);
1279 cork->opt = NULL;
1280 dst_release(cork->dst);
1281 cork->dst = NULL;
Pavel Emelyanov429f08e2007-11-05 21:03:24 -08001282}
1283
Linus Torvalds1da177e2005-04-16 15:20:36 -07001284/*
1285 * Combined all pending IP fragments on the socket as one IP datagram
1286 * and push them out.
1287 */
/*
 * Collapse every skb queued on @queue into one IP datagram: the first skb
 * becomes the head and all later fragments are chained onto its frag_list,
 * then the IP header (DF handling, TTL, identification, options,
 * addresses) is written.  Consumes the cork: the route reference moves
 * onto the skb and the remaining cork state is released.  Returns the
 * finished skb, or NULL if the queue was empty.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		/* Chain each further fragment onto the head's frag_list,
		 * accounting its size and detaching it from the socket.
		 */
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO),
	 * allow this locally generated frame to be fragmented, no matter
	 * how later transforms change the packet size.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	/* Fill in the IP header; ihl is fixed up below if options follow. */
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(skb, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			       skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}
1376
1377int ip_send_skb(struct sk_buff *skb)
1378{
1379 struct net *net = sock_net(skb->sk);
1380 int err;
1381
Herbert Xuc439cb22008-01-11 19:14:00 -08001382 err = ip_local_out(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001383 if (err) {
1384 if (err > 0)
Eric Dumazet6ce9e7b2009-09-02 18:05:33 -07001385 err = net_xmit_errno(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001386 if (err)
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001387 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001388 }
1389
Linus Torvalds1da177e2005-04-16 15:20:36 -07001390 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001391}
1392
/*
 * Combine everything pending on @sk into a single datagram and send it.
 * Returns 0 when nothing was queued, otherwise the ip_send_skb() result.
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter sees the whole, not yet fragmented datagram. */
	return skb ? ip_send_skb(skb) : 0;
}
1404
Linus Torvalds1da177e2005-04-16 15:20:36 -07001405/*
1406 * Throw away all pending data on the socket.
1407 */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001408static void __ip_flush_pending_frames(struct sock *sk,
1409 struct sk_buff_head *queue,
1410 struct inet_cork *cork)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001411{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001412 struct sk_buff *skb;
1413
Herbert Xu1470ddf2011-03-01 02:36:47 +00001414 while ((skb = __skb_dequeue_tail(queue)) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001415 kfree_skb(skb);
1416
Herbert Xu1470ddf2011-03-01 02:36:47 +00001417 ip_cork_release(cork);
1418}
1419
1420void ip_flush_pending_frames(struct sock *sk)
1421{
David S. Millerbdc712b2011-05-06 15:02:07 -07001422 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001423}
1424
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001425struct sk_buff *ip_make_skb(struct sock *sk,
David S. Miller77968b72011-05-08 17:12:19 -07001426 struct flowi4 *fl4,
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001427 int getfrag(void *from, char *to, int offset,
1428 int len, int odd, struct sk_buff *skb),
1429 void *from, int length, int transhdrlen,
1430 struct ipcm_cookie *ipc, struct rtable **rtp,
1431 unsigned int flags)
1432{
David S. Millerb80d7222011-05-06 15:06:01 -07001433 struct inet_cork cork;
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001434 struct sk_buff_head queue;
1435 int err;
1436
1437 if (flags & MSG_PROBE)
1438 return NULL;
1439
1440 __skb_queue_head_init(&queue);
1441
David S. Millerb80d7222011-05-06 15:06:01 -07001442 cork.flags = 0;
1443 cork.addr = 0;
David S. Miller70652722011-05-06 16:01:15 -07001444 cork.opt = NULL;
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001445 err = ip_setup_cork(sk, &cork, ipc, rtp);
1446 if (err)
1447 return ERR_PTR(err);
1448
David S. Millerf5fca602011-05-08 17:24:10 -07001449 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001450 from, length, transhdrlen, flags);
1451 if (err) {
1452 __ip_flush_pending_frames(sk, &queue, &cork);
1453 return ERR_PTR(err);
1454 }
1455
David S. Miller77968b72011-05-08 17:12:19 -07001456 return __ip_make_skb(sk, fl4, &queue, &cork);
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001457}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001458
1459/*
1460 * Fetch data from kernel space and fill in checksum if needed.
1461 */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001462static int ip_reply_glue_bits(void *dptr, char *to, int offset,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001463 int len, int odd, struct sk_buff *skb)
1464{
Al Viro50842052006-11-14 21:36:34 -08001465 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001466
1467 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1468 skb->csum = csum_block_add(skb->csum, csum, odd);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001469 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001470}
1471
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
		   const struct ip_reply_arg *arg, unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	/* Mirror the incoming packet's IP options back at the sender;
	 * bail out if they cannot be echoed (e.g. malformed options). */
	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		/* With source routing echoed, the reply must be routed to
		 * the first hop of the reversed route, not the final
		 * destination. */
		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	/* Build the reply flow keyed off the original packet: swap the
	 * transport ports and address the sender, keeping dev/mark/tos/uid
	 * from the caller-supplied reply args. NOTE(review): tcp_hdr() here
	 * assumes a TCP transport header — consistent with the "TCP resets"
	 * usage noted above; confirm before reusing for ICMP. */
	flowi4_init_output(&fl4, arg->bound_dev_if,
			   IP4_REPLY_MARK(sock_net(sk), skb->mark),
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, rt->rt_spec_dst,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
			   arg->uid);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reenterable, hence spinlock.
	   Note that it uses the fact, that this function is called
	   with locally disabled BH and that sk cannot be already spinlocked.
	 */
	bh_lock_sock(sk);
	/* The shared control socket is temporarily reconfigured to match the
	 * packet being answered; safe only because of the lock above. */
	inet->tos = arg->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	/* If a frame was queued, patch in the transport checksum (partial
	 * csum accumulated by ip_reply_glue_bits folded with arg->csum)
	 * and push it out. */
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	/* Drop the reference taken by ip_route_output_key(). */
	ip_rt_put(rt);
}
1541
/*
 *	IPv4 layer boot-time initialisation: bring up the routing cache and
 *	the inet peer cache, then (when multicast and procfs are both
 *	configured) register the IGMP /proc entries.
 */
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}