1/*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9 *
10 * Based on linux/net/ipv4/ip_output.c
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Changes:
18 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
19 * extension headers are implemented.
20 * route changes now work.
21 * ip6_forward does not confuse sniffers.
22 * etc.
23 *
24 * H. von Brand : Added missing #include <linux/string.h>
25 * Imran Patel : frag id should be in NBO
26 * Kazunori MIYAZAWA @USAGI
27 * : add ip6_append_data and related functions
28 * for datagram xmit
29 */
30
31#include <linux/config.h>
32#include <linux/errno.h>
33#include <linux/types.h>
34#include <linux/string.h>
35#include <linux/socket.h>
36#include <linux/net.h>
37#include <linux/netdevice.h>
38#include <linux/if_arp.h>
39#include <linux/in6.h>
40#include <linux/tcp.h>
41#include <linux/route.h>
42
43#include <linux/netfilter.h>
44#include <linux/netfilter_ipv6.h>
45
46#include <net/sock.h>
47#include <net/snmp.h>
48
49#include <net/ipv6.h>
50#include <net/ndisc.h>
51#include <net/protocol.h>
52#include <net/ip6_route.h>
53#include <net/addrconf.h>
54#include <net/rawv6.h>
55#include <net/icmp.h>
56#include <net/xfrm.h>
57#include <net/checksum.h>
58
59static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
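/*
 * Pick the Identification value for a Fragment header: a single global
 * counter, protected by a spinlock, that skips zero when it wraps.
 */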
61static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62{
63 static u32 ipv6_fragmentation_id = 1;
64 static DEFINE_SPINLOCK(ip6_id_lock);
65
66 spin_lock_bh(&ip6_id_lock);
67 fhdr->identification = htonl(ipv6_fragmentation_id);
68 if (++ipv6_fragmentation_id == 0)
69 ipv6_fragmentation_id = 1;
70 spin_unlock_bh(&ip6_id_lock);
71}
72
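/*
 * Final hop towards the device: use the cached hardware header (hh_cache)
 * when the destination has one, otherwise go through the neighbour's
 * output routine; with neither, the packet cannot be sent and is dropped.
 */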
73static inline int ip6_output_finish(struct sk_buff *skb)
74{
75
76 struct dst_entry *dst = skb->dst;
77 struct hh_cache *hh = dst->hh;
78
79 if (hh) {
80 int hh_alen;
81
82 read_lock_bh(&hh->hh_lock);
83 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 read_unlock_bh(&hh->hh_lock);
86 skb_push(skb, hh->hh_len);
87 return hh->hh_output(skb);
88 } else if (dst->neighbour)
89 return dst->neighbour->output(skb);
90
91 IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92 kfree_skb(skb);
93 return -EINVAL;
94
95}
96
97/* dev_loopback_xmit for use with netfilter. */
98static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99{
100 newskb->mac.raw = newskb->data;
101 __skb_pull(newskb, newskb->nh.raw - newskb->data);
102 newskb->pkt_type = PACKET_LOOPBACK;
103 newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 BUG_TRAP(newskb->dst);
105
106 netif_rx(newskb);
107 return 0;
108}
109
110
111static int ip6_output2(struct sk_buff *skb)
112{
113 struct dst_entry *dst = skb->dst;
114 struct net_device *dev = dst->dev;
115
116 skb->protocol = htons(ETH_P_IPV6);
117 skb->dev = dev;
118
119 if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121
122 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 &skb->nh.ipv6h->saddr)) {
125 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126
127 /* Do not check for IFF_ALLMULTI; multicast routing
128 is not supported in any case.
129 */
130 if (newskb)
131 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132 newskb->dev,
133 ip6_dev_loopback_xmit);
134
135 if (skb->nh.ipv6h->hop_limit == 0) {
136 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137 kfree_skb(skb);
138 return 0;
139 }
140 }
141
142 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143 }
144
145	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, ip6_output_finish);
146}
147
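/*
 * dst->output() for IPv6 routes normally points here: fragment the packet
 * if it is larger than the route MTU (or if the route demands fragmentation
 * of every packet), otherwise pass it straight to ip6_output2().
 */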
148int ip6_output(struct sk_buff *skb)
149{
150 if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
151 return ip6_fragment(skb, ip6_output2);
152 else
153 return ip6_output2(skb);
154}
155
156#ifdef CONFIG_NETFILTER
157int ip6_route_me_harder(struct sk_buff *skb)
158{
159 struct ipv6hdr *iph = skb->nh.ipv6h;
160 struct dst_entry *dst;
161 struct flowi fl = {
162 .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
163 .nl_u =
164 { .ip6_u =
165 { .daddr = iph->daddr,
166 .saddr = iph->saddr, } },
167 .proto = iph->nexthdr,
168 };
169
170 dst = ip6_route_output(skb->sk, &fl);
171
172 if (dst->error) {
173 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
174 LIMIT_NETDEBUG(
175 printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
176 dst_release(dst);
177 return -EINVAL;
178 }
179
180 /* Drop old route. */
181 dst_release(skb->dst);
182
183 skb->dst = dst;
184 return 0;
185}
186#endif
187
188static inline int ip6_maybe_reroute(struct sk_buff *skb)
189{
190#ifdef CONFIG_NETFILTER
191 if (skb->nfcache & NFC_ALTERED){
192 if (ip6_route_me_harder(skb) != 0){
193 kfree_skb(skb);
194 return -EINVAL;
195 }
196 }
197#endif /* CONFIG_NETFILTER */
198 return dst_output(skb);
199}
200
201/*
202 * xmit an sk_buff (used by TCP)
203 */
204
205int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
206 struct ipv6_txoptions *opt, int ipfragok)
207{
208 struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
209 struct in6_addr *first_hop = &fl->fl6_dst;
210 struct dst_entry *dst = skb->dst;
211 struct ipv6hdr *hdr;
212 u8 proto = fl->proto;
213 int seg_len = skb->len;
214 int hlimit;
215 u32 mtu;
216
217 if (opt) {
218 int head_room;
219
220		/* First: exthdrs may take lots of space (~8K for now);
221		   MAX_HEADER is not enough.
222 */
223 head_room = opt->opt_nflen + opt->opt_flen;
224 seg_len += head_room;
225 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
226
227 if (skb_headroom(skb) < head_room) {
228 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
229 kfree_skb(skb);
230 skb = skb2;
231 if (skb == NULL) {
232 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
233 return -ENOBUFS;
234 }
235 if (sk)
236 skb_set_owner_w(skb, sk);
237 }
238 if (opt->opt_flen)
239 ipv6_push_frag_opts(skb, opt, &proto);
240 if (opt->opt_nflen)
241 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
242 }
243
244 hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
245
246 /*
247 * Fill in the IPv6 header
248 */
249
250 *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
251 hlimit = -1;
252 if (np)
253 hlimit = np->hop_limit;
254 if (hlimit < 0)
255 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
256 if (hlimit < 0)
257 hlimit = ipv6_get_hoplimit(dst->dev);
258
259 hdr->payload_len = htons(seg_len);
260 hdr->nexthdr = proto;
261 hdr->hop_limit = hlimit;
262
263 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
264 ipv6_addr_copy(&hdr->daddr, first_hop);
265
266 mtu = dst_mtu(dst);
267 if ((skb->len <= mtu) || ipfragok) {
268 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
269 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
270 }
271
272 if (net_ratelimit())
273 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
274 skb->dev = dst->dev;
275 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
276 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
277 kfree_skb(skb);
278 return -EMSGSIZE;
279}
280
281/*
282 *	To avoid extra problems, ND packets are sent through this
283 *	routine. It is code duplication, but I really want to avoid
284 *	extra checks, since ipv6_build_header is used by TCP (which
285 *	is performance critical for us).
286 */
287
288int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
289 struct in6_addr *saddr, struct in6_addr *daddr,
290 int proto, int len)
291{
292 struct ipv6_pinfo *np = inet6_sk(sk);
293 struct ipv6hdr *hdr;
294 int totlen;
295
296 skb->protocol = htons(ETH_P_IPV6);
297 skb->dev = dev;
298
299 totlen = len + sizeof(struct ipv6hdr);
300
301 hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
302 skb->nh.ipv6h = hdr;
303
304 *(u32*)hdr = htonl(0x60000000);
305
306 hdr->payload_len = htons(len);
307 hdr->nexthdr = proto;
308 hdr->hop_limit = np->hop_limit;
309
310 ipv6_addr_copy(&hdr->saddr, saddr);
311 ipv6_addr_copy(&hdr->daddr, daddr);
312
313 return 0;
314}
315
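/*
 * Hand packets that carry a Router Alert option to every raw socket that
 * registered for this alert value with IPV6_ROUTER_ALERT.  Returns 1 if
 * the skb was consumed by a listener, 0 if the caller still owns it.
 */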
316static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
317{
318 struct ip6_ra_chain *ra;
319 struct sock *last = NULL;
320
321 read_lock(&ip6_ra_lock);
322 for (ra = ip6_ra_chain; ra; ra = ra->next) {
323 struct sock *sk = ra->sk;
324 if (sk && ra->sel == sel) {
325 if (last) {
326 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
327 if (skb2)
328 rawv6_rcv(last, skb2);
329 }
330 last = sk;
331 }
332 }
333
334 if (last) {
335 rawv6_rcv(last, skb);
336 read_unlock(&ip6_ra_lock);
337 return 1;
338 }
339 read_unlock(&ip6_ra_lock);
340 return 0;
341}
342
343static inline int ip6_forward_finish(struct sk_buff *skb)
344{
345 return dst_output(skb);
346}
347
348int ip6_forward(struct sk_buff *skb)
349{
350 struct dst_entry *dst = skb->dst;
351 struct ipv6hdr *hdr = skb->nh.ipv6h;
352 struct inet6_skb_parm *opt = IP6CB(skb);
353
354 if (ipv6_devconf.forwarding == 0)
355 goto error;
356
357 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
358 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
359 goto drop;
360 }
361
362 skb->ip_summed = CHECKSUM_NONE;
363
364 /*
365	 *	We DO NOT do any processing on
366	 *	RA packets; we push them up to user level AS IS,
367	 *	without any guarantee that the application will be
368	 *	able to interpret them. The reason is that we
369	 *	cannot do anything clever here.
370	 *
371	 *	We are not the end node, so if the packet contains
372	 *	AH/ESP we cannot do anything with it.
373	 *	Defragmentation would also be a mistake; RA packets
374	 *	cannot be fragmented, because there is no guarantee
375	 *	that different fragments will follow one path. --ANK
376 */
377 if (opt->ra) {
378 u8 *ptr = skb->nh.raw + opt->ra;
379 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
380 return 0;
381 }
382
383 /*
384 * check and decrement ttl
385 */
386 if (hdr->hop_limit <= 1) {
387 /* Force OUTPUT device used as source address */
388 skb->dev = dst->dev;
389 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
390 0, skb->dev);
391
392 kfree_skb(skb);
393 return -ETIMEDOUT;
394 }
395
396 if (!xfrm6_route_forward(skb)) {
397 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
398 goto drop;
399 }
400 dst = skb->dst;
401
402 /* IPv6 specs say nothing about it, but it is clear that we cannot
403 send redirects to source routed frames.
404 */
405 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
406 struct in6_addr *target = NULL;
407 struct rt6_info *rt;
408 struct neighbour *n = dst->neighbour;
409
410 /*
411 * incoming and outgoing devices are the same
412 * send a redirect.
413 */
414
415 rt = (struct rt6_info *) dst;
416 if ((rt->rt6i_flags & RTF_GATEWAY))
417 target = (struct in6_addr*)&n->primary_key;
418 else
419 target = &hdr->daddr;
420
421 /* Limit redirects both by destination (here)
422 and by source (inside ndisc_send_redirect)
423 */
424 if (xrlim_allow(dst, 1*HZ))
425 ndisc_send_redirect(skb, n, target);
426 } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
427 |IPV6_ADDR_LINKLOCAL)) {
428 /* This check is security critical. */
429 goto error;
430 }
431
432 if (skb->len > dst_mtu(dst)) {
433 /* Again, force OUTPUT device used as source address */
434 skb->dev = dst->dev;
435 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
436 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
437 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
438 kfree_skb(skb);
439 return -EMSGSIZE;
440 }
441
442 if (skb_cow(skb, dst->dev->hard_header_len)) {
443 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
444 goto drop;
445 }
446
447 hdr = skb->nh.ipv6h;
448
449	/* Decrementing the hop limit is delayed until after the skb COW */
450
451 hdr->hop_limit--;
452
453 IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
454	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
455
456error:
457 IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
458drop:
459 kfree_skb(skb);
460 return -EINVAL;
461}
462
463static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
464{
465 to->pkt_type = from->pkt_type;
466 to->priority = from->priority;
467 to->protocol = from->protocol;
468 to->security = from->security;
469 dst_release(to->dst);
470 to->dst = dst_clone(from->dst);
471 to->dev = from->dev;
472
473#ifdef CONFIG_NET_SCHED
474 to->tc_index = from->tc_index;
475#endif
476#ifdef CONFIG_NETFILTER
477 to->nfmark = from->nfmark;
478	/* Connection association is the same as the pre-frag packet */
479 to->nfct = from->nfct;
480 nf_conntrack_get(to->nfct);
481 to->nfctinfo = from->nfctinfo;
482#ifdef CONFIG_BRIDGE_NETFILTER
483 nf_bridge_put(to->nf_bridge);
484 to->nf_bridge = from->nf_bridge;
485 nf_bridge_get(to->nf_bridge);
486#endif
487#endif
488}
489
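/*
 * Find where a Fragment header has to be inserted: skip over the
 * unfragmentable extension headers - Hop-by-Hop, Routing, and any
 * Destination Options header that precedes a Routing header.
 * Returns the offset of the fragmentable part and leaves *nexthdr
 * pointing at the Next Header field that must become NEXTHDR_FRAGMENT.
 */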
490int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
491{
492 u16 offset = sizeof(struct ipv6hdr);
493 struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
494 unsigned int packet_len = skb->tail - skb->nh.raw;
495 int found_rhdr = 0;
496 *nexthdr = &skb->nh.ipv6h->nexthdr;
497
498 while (offset + 1 <= packet_len) {
499
500 switch (**nexthdr) {
501
502 case NEXTHDR_HOP:
503 case NEXTHDR_ROUTING:
504 case NEXTHDR_DEST:
505 if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
506 if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
507 offset += ipv6_optlen(exthdr);
508 *nexthdr = &exthdr->nexthdr;
509 exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
510 break;
511 default :
512 return offset;
513 }
514 }
515
516 return offset;
517}
518
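/*
 * Two strategies: a fast path that reuses an existing frag_list (each
 * chunk already has the right size and headroom, so only a Fragment
 * header is prepended per chunk), and a slow path that allocates fresh
 * skbs and copies at most mtu bytes of payload into each of them.
 */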
519static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
520{
521 struct net_device *dev;
522 struct sk_buff *frag;
523 struct rt6_info *rt = (struct rt6_info*)skb->dst;
524 struct ipv6hdr *tmp_hdr;
525 struct frag_hdr *fh;
526 unsigned int mtu, hlen, left, len;
527 u32 frag_id = 0;
528 int ptr, offset = 0, err=0;
529 u8 *prevhdr, nexthdr = 0;
530
531 dev = rt->u.dst.dev;
532 hlen = ip6_find_1stfragopt(skb, &prevhdr);
533 nexthdr = *prevhdr;
534
535 mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
536
537 if (skb_shinfo(skb)->frag_list) {
538 int first_len = skb_pagelen(skb);
539
540 if (first_len - hlen > mtu ||
541 ((first_len - hlen) & 7) ||
542 skb_cloned(skb))
543 goto slow_path;
544
545 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
546 /* Correct geometry. */
547 if (frag->len > mtu ||
548 ((frag->len & 7) && frag->next) ||
549 skb_headroom(frag) < hlen)
550 goto slow_path;
551
552		/* Partially cloned skb? */
553 if (skb_shared(frag))
554 goto slow_path;
555
556 BUG_ON(frag->sk);
557 if (skb->sk) {
558 sock_hold(skb->sk);
559 frag->sk = skb->sk;
560 frag->destructor = sock_wfree;
561 skb->truesize -= frag->truesize;
562 }
563	}
564
565 err = 0;
566 offset = 0;
567 frag = skb_shinfo(skb)->frag_list;
568 skb_shinfo(skb)->frag_list = NULL;
569 /* BUILD HEADER */
570
571 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
572 if (!tmp_hdr) {
573 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
574 return -ENOMEM;
575 }
576
577 *prevhdr = NEXTHDR_FRAGMENT;
578 memcpy(tmp_hdr, skb->nh.raw, hlen);
579 __skb_pull(skb, hlen);
580 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
581 skb->nh.raw = __skb_push(skb, hlen);
582 memcpy(skb->nh.raw, tmp_hdr, hlen);
583
584 ipv6_select_ident(skb, fh);
585 fh->nexthdr = nexthdr;
586 fh->reserved = 0;
587 fh->frag_off = htons(IP6_MF);
588 frag_id = fh->identification;
589
590 first_len = skb_pagelen(skb);
591 skb->data_len = first_len - skb_headlen(skb);
592 skb->len = first_len;
593 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
594
595
596 for (;;) {
597			/* Prepare the header of the next frame,
598			 * before the previous one goes out. */
599 if (frag) {
600 frag->ip_summed = CHECKSUM_NONE;
601 frag->h.raw = frag->data;
602 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
603 frag->nh.raw = __skb_push(frag, hlen);
604 memcpy(frag->nh.raw, tmp_hdr, hlen);
605 offset += skb->len - hlen - sizeof(struct frag_hdr);
606 fh->nexthdr = nexthdr;
607 fh->reserved = 0;
608 fh->frag_off = htons(offset);
609 if (frag->next != NULL)
610 fh->frag_off |= htons(IP6_MF);
611 fh->identification = frag_id;
612 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
613 ip6_copy_metadata(frag, skb);
614 }
615
616 err = output(skb);
617 if (err || !frag)
618 break;
619
620 skb = frag;
621 frag = skb->next;
622 skb->next = NULL;
623 }
624
625 if (tmp_hdr)
626 kfree(tmp_hdr);
627
628 if (err == 0) {
629 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
630 return 0;
631 }
632
633 while (frag) {
634 skb = frag->next;
635 kfree_skb(frag);
636 frag = skb;
637 }
638
639 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
640 return err;
641 }
642
643slow_path:
644 left = skb->len - hlen; /* Space per frame */
645 ptr = hlen; /* Where to start from */
646
647 /*
648 * Fragment the datagram.
649 */
650
651 *prevhdr = NEXTHDR_FRAGMENT;
652
653 /*
654 * Keep copying data until we run out.
655 */
656 while(left > 0) {
657 len = left;
658 /* IF: it doesn't fit, use 'mtu' - the data space left */
659 if (len > mtu)
660 len = mtu;
661		/* IF: we are not sending up to and including the packet end
662 then align the next start on an eight byte boundary */
663 if (len < left) {
664 len &= ~7;
665 }
666 /*
667 * Allocate buffer.
668 */
669
670 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
671 NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
672 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
673 err = -ENOMEM;
674 goto fail;
675 }
676
677 /*
678 * Set up data on packet
679 */
680
681 ip6_copy_metadata(frag, skb);
682 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
683 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
684 frag->nh.raw = frag->data;
685 fh = (struct frag_hdr*)(frag->data + hlen);
686 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
687
688 /*
689 * Charge the memory for the fragment to any owner
690 * it might possess
691 */
692 if (skb->sk)
693 skb_set_owner_w(frag, skb->sk);
694
695 /*
696 * Copy the packet header into the new buffer.
697 */
698 memcpy(frag->nh.raw, skb->data, hlen);
699
700 /*
701 * Build fragment header.
702 */
703 fh->nexthdr = nexthdr;
704 fh->reserved = 0;
705		if (!frag_id) {
706 ipv6_select_ident(skb, fh);
707 frag_id = fh->identification;
708 } else
709 fh->identification = frag_id;
710
711 /*
712 * Copy a block of the IP datagram.
713 */
714 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
715 BUG();
716 left -= len;
717
718 fh->frag_off = htons(offset);
719 if (left > 0)
720 fh->frag_off |= htons(IP6_MF);
721 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
722
723 ptr += len;
724 offset += len;
725
726 /*
727 * Put this fragment into the sending queue.
728 */
729
730 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
731
732 err = output(frag);
733 if (err)
734 goto fail;
735 }
736 kfree_skb(skb);
737 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
738 return err;
739
740fail:
741 kfree_skb(skb);
742 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
743 return err;
744}
745
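/*
 * Resolve the route for a flow: reuse the socket's cached dst when it is
 * still valid for this destination/oif, otherwise do a fresh routing
 * lookup, and pick a source address if the caller left fl6_src unset.
 */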
746int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
747{
748 int err = 0;
749
750 *dst = NULL;
751 if (sk) {
752 struct ipv6_pinfo *np = inet6_sk(sk);
753
754 *dst = sk_dst_check(sk, np->dst_cookie);
755 if (*dst) {
756 struct rt6_info *rt = (struct rt6_info*)*dst;
757
758			/* Yes, checking route validity in the unconnected
759			   case is not very simple. Take into account that
760			   we do not support routing by source, TOS,
761			   and MSG_DONTROUTE		--ANK (980726)
762
763			   1. If the route was a host route, check that the
764			      cached destination is current.
765			      If it is a network route, we can still
766			      check its validity using a saved pointer
767			      to the last used address: daddr_cache.
768			      We do not want to save the whole address now
769			      (because the main consumer of this service
770			       is TCP, which does not have this problem),
771			      so this trick works only on connected
772			      sockets.
773			   2. oif should also be the same.
774 */
775
776 if (((rt->rt6i_dst.plen != 128 ||
777 !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
778 && (np->daddr_cache == NULL ||
779 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
780 || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
781 dst_release(*dst);
782 *dst = NULL;
783 }
784 }
785 }
786
787 if (*dst == NULL)
788 *dst = ip6_route_output(sk, fl);
789
790 if ((err = (*dst)->error))
791 goto out_err_release;
792
793 if (ipv6_addr_any(&fl->fl6_src)) {
794 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
795
796 if (err) {
797#if IP6_DEBUG >= 2
798 printk(KERN_DEBUG "ip6_dst_lookup: "
799 "no available source address\n");
800#endif
801 goto out_err_release;
802 }
803 }
804
805 return 0;
806
807out_err_release:
808 dst_release(*dst);
809 *dst = NULL;
810 return err;
811}
812
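/*
 * Append data to the socket's write queue (corked sends).  The queued
 * skbs are laid out so that ip6_push_pending_frames() can later prepend
 * the extension headers and IPv6 header and transmit them as a single,
 * possibly fragmented, datagram.
 */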
813int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
814 void *from, int length, int transhdrlen,
815 int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
816 unsigned int flags)
817{
818 struct inet_sock *inet = inet_sk(sk);
819 struct ipv6_pinfo *np = inet6_sk(sk);
820 struct sk_buff *skb;
821 unsigned int maxfraglen, fragheaderlen;
822 int exthdrlen;
823 int hh_len;
824 int mtu;
825 int copy;
826 int err;
827 int offset = 0;
828 int csummode = CHECKSUM_NONE;
829
830 if (flags&MSG_PROBE)
831 return 0;
832 if (skb_queue_empty(&sk->sk_write_queue)) {
833 /*
834 * setup for corking
835 */
836 if (opt) {
837 if (np->cork.opt == NULL) {
838 np->cork.opt = kmalloc(opt->tot_len,
839 sk->sk_allocation);
840 if (unlikely(np->cork.opt == NULL))
841 return -ENOBUFS;
842 } else if (np->cork.opt->tot_len < opt->tot_len) {
843 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
844 return -EINVAL;
845 }
846 memcpy(np->cork.opt, opt, opt->tot_len);
847 inet->cork.flags |= IPCORK_OPT;
848 /* need source address above miyazawa*/
849 }
850 dst_hold(&rt->u.dst);
851 np->cork.rt = rt;
852 inet->cork.fl = *fl;
853 np->cork.hop_limit = hlimit;
854 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
855 if (dst_allfrag(rt->u.dst.path))
856 inet->cork.flags |= IPCORK_ALLFRAG;
857 inet->cork.length = 0;
858 sk->sk_sndmsg_page = NULL;
859 sk->sk_sndmsg_off = 0;
860 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
861 length += exthdrlen;
862 transhdrlen += exthdrlen;
863 } else {
864 rt = np->cork.rt;
865 fl = &inet->cork.fl;
866 if (inet->cork.flags & IPCORK_OPT)
867 opt = np->cork.opt;
868 transhdrlen = 0;
869 exthdrlen = 0;
870 mtu = inet->cork.fragsize;
871 }
872
873 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
874
875 fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
876 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
877
878 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
879 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
880 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
881 return -EMSGSIZE;
882 }
883 }
884
885 /*
886 * Let's try using as much space as possible.
887 * Use MTU if total length of the message fits into the MTU.
888 * Otherwise, we need to reserve fragment header and
889	 * fragment alignment (= 8-15 octets, in total).
890	 *
891	 * Note that we may need to "move" the data from the tail
892	 * of the buffer to the new fragment when we split
893 * the message.
894 *
895 * FIXME: It may be fragmented into multiple chunks
896 * at once if non-fragmentable extension headers
897 * are too large.
898 * --yoshfuji
899 */
900
901 inet->cork.length += length;
902
903 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
904 goto alloc_new_skb;
905
906 while (length > 0) {
907 /* Check if the remaining data fits into current packet. */
908 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
909 if (copy < length)
910 copy = maxfraglen - skb->len;
911
912 if (copy <= 0) {
913 char *data;
914 unsigned int datalen;
915 unsigned int fraglen;
916 unsigned int fraggap;
917 unsigned int alloclen;
918 struct sk_buff *skb_prev;
919alloc_new_skb:
920 skb_prev = skb;
921
922 /* There's no room in the current skb */
923 if (skb_prev)
924 fraggap = skb_prev->len - maxfraglen;
925 else
926 fraggap = 0;
927
928 /*
929 * If remaining data exceeds the mtu,
930 * we know we need more fragment(s).
931 */
932 datalen = length + fraggap;
933 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
934 datalen = maxfraglen - fragheaderlen;
935
936 fraglen = datalen + fragheaderlen;
937 if ((flags & MSG_MORE) &&
938 !(rt->u.dst.dev->features&NETIF_F_SG))
939 alloclen = mtu;
940 else
941 alloclen = datalen + fragheaderlen;
942
943 /*
944 * The last fragment gets additional space at tail.
945	 * Note: we overallocate on fragments with MSG_MORE
946 * because we have no idea if we're the last one.
947 */
948 if (datalen == length + fraggap)
949 alloclen += rt->u.dst.trailer_len;
950
951 /*
952 * We just reserve space for fragment header.
953 * Note: this may be overallocation if the message
954 * (without MSG_MORE) fits into the MTU.
955 */
956 alloclen += sizeof(struct frag_hdr);
957
958 if (transhdrlen) {
959 skb = sock_alloc_send_skb(sk,
960 alloclen + hh_len,
961 (flags & MSG_DONTWAIT), &err);
962 } else {
963 skb = NULL;
964 if (atomic_read(&sk->sk_wmem_alloc) <=
965 2 * sk->sk_sndbuf)
966 skb = sock_wmalloc(sk,
967 alloclen + hh_len, 1,
968 sk->sk_allocation);
969 if (unlikely(skb == NULL))
970 err = -ENOBUFS;
971 }
972 if (skb == NULL)
973 goto error;
974 /*
975 * Fill in the control structures
976 */
977 skb->ip_summed = csummode;
978 skb->csum = 0;
979 /* reserve for fragmentation */
980 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
981
982 /*
983 * Find where to start putting bytes
984 */
985 data = skb_put(skb, fraglen);
986 skb->nh.raw = data + exthdrlen;
987 data += fragheaderlen;
988 skb->h.raw = data + exthdrlen;
989
990 if (fraggap) {
991 skb->csum = skb_copy_and_csum_bits(
992 skb_prev, maxfraglen,
993 data + transhdrlen, fraggap, 0);
994 skb_prev->csum = csum_sub(skb_prev->csum,
995 skb->csum);
996 data += fraggap;
997 skb_trim(skb_prev, maxfraglen);
998 }
999 copy = datalen - transhdrlen - fraggap;
1000 if (copy < 0) {
1001 err = -EINVAL;
1002 kfree_skb(skb);
1003 goto error;
1004 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1005 err = -EFAULT;
1006 kfree_skb(skb);
1007 goto error;
1008 }
1009
1010 offset += copy;
1011 length -= datalen - fraggap;
1012 transhdrlen = 0;
1013 exthdrlen = 0;
1014 csummode = CHECKSUM_NONE;
1015
1016 /*
1017 * Put the packet on the pending queue
1018 */
1019 __skb_queue_tail(&sk->sk_write_queue, skb);
1020 continue;
1021 }
1022
1023 if (copy > length)
1024 copy = length;
1025
1026 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1027 unsigned int off;
1028
1029 off = skb->len;
1030 if (getfrag(from, skb_put(skb, copy),
1031 offset, copy, off, skb) < 0) {
1032 __skb_trim(skb, off);
1033 err = -EFAULT;
1034 goto error;
1035 }
1036 } else {
1037 int i = skb_shinfo(skb)->nr_frags;
1038 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1039 struct page *page = sk->sk_sndmsg_page;
1040 int off = sk->sk_sndmsg_off;
1041 unsigned int left;
1042
1043 if (page && (left = PAGE_SIZE - off) > 0) {
1044 if (copy >= left)
1045 copy = left;
1046 if (page != frag->page) {
1047 if (i == MAX_SKB_FRAGS) {
1048 err = -EMSGSIZE;
1049 goto error;
1050 }
1051 get_page(page);
1052 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1053 frag = &skb_shinfo(skb)->frags[i];
1054 }
1055 } else if(i < MAX_SKB_FRAGS) {
1056 if (copy > PAGE_SIZE)
1057 copy = PAGE_SIZE;
1058 page = alloc_pages(sk->sk_allocation, 0);
1059 if (page == NULL) {
1060 err = -ENOMEM;
1061 goto error;
1062 }
1063 sk->sk_sndmsg_page = page;
1064 sk->sk_sndmsg_off = 0;
1065
1066 skb_fill_page_desc(skb, i, page, 0, 0);
1067 frag = &skb_shinfo(skb)->frags[i];
1068 skb->truesize += PAGE_SIZE;
1069 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1070 } else {
1071 err = -EMSGSIZE;
1072 goto error;
1073 }
1074 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1075 err = -EFAULT;
1076 goto error;
1077 }
1078 sk->sk_sndmsg_off += copy;
1079 frag->size += copy;
1080 skb->len += copy;
1081 skb->data_len += copy;
1082 }
1083 offset += copy;
1084 length -= copy;
1085 }
1086 return 0;
1087error:
1088 inet->cork.length -= length;
1089 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1090 return err;
1091}
1092
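/*
 * Turn the write queue built by ip6_append_data() into one packet: the
 * tail skbs become the head skb's frag_list, the options and IPv6 header
 * are pushed on, and the result goes out through NF_IP6_LOCAL_OUT.
 */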
1093int ip6_push_pending_frames(struct sock *sk)
1094{
1095 struct sk_buff *skb, *tmp_skb;
1096 struct sk_buff **tail_skb;
1097 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1098 struct inet_sock *inet = inet_sk(sk);
1099 struct ipv6_pinfo *np = inet6_sk(sk);
1100 struct ipv6hdr *hdr;
1101 struct ipv6_txoptions *opt = np->cork.opt;
1102 struct rt6_info *rt = np->cork.rt;
1103 struct flowi *fl = &inet->cork.fl;
1104 unsigned char proto = fl->proto;
1105 int err = 0;
1106
1107 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1108 goto out;
1109 tail_skb = &(skb_shinfo(skb)->frag_list);
1110
1111 /* move skb->data to ip header from ext header */
1112 if (skb->data < skb->nh.raw)
1113 __skb_pull(skb, skb->nh.raw - skb->data);
1114 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1115 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1116 *tail_skb = tmp_skb;
1117 tail_skb = &(tmp_skb->next);
1118 skb->len += tmp_skb->len;
1119 skb->data_len += tmp_skb->len;
1120		skb->truesize += tmp_skb->truesize;
1121 __sock_put(tmp_skb->sk);
1122 tmp_skb->destructor = NULL;
1123 tmp_skb->sk = NULL;
1124	}
1125
1126 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1127 __skb_pull(skb, skb->h.raw - skb->nh.raw);
1128 if (opt && opt->opt_flen)
1129 ipv6_push_frag_opts(skb, opt, &proto);
1130 if (opt && opt->opt_nflen)
1131 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1132
1133 skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1134
1135 *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
1136
1137 if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1138 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1139 else
1140 hdr->payload_len = 0;
1141 hdr->hop_limit = np->cork.hop_limit;
1142 hdr->nexthdr = proto;
1143 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1144 ipv6_addr_copy(&hdr->daddr, final_dst);
1145
1146 skb->dst = dst_clone(&rt->u.dst);
1147 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1148 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1149 if (err) {
1150 if (err > 0)
1151			err = np->recverr ? net_xmit_errno(err) : 0;
1152		if (err)
1153 goto error;
1154 }
1155
1156out:
1157 inet->cork.flags &= ~IPCORK_OPT;
1158 if (np->cork.opt) {
1159 kfree(np->cork.opt);
1160 np->cork.opt = NULL;
1161 }
1162 if (np->cork.rt) {
1163 dst_release(&np->cork.rt->u.dst);
1164 np->cork.rt = NULL;
1165 inet->cork.flags &= ~IPCORK_ALLFRAG;
1166 }
1167 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1168 return err;
1169error:
1170 goto out;
1171}
1172
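/*
 * Abort a corked send: free everything still sitting on the write queue
 * and clear the cork state (options, cached route, flow).
 */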
1173void ip6_flush_pending_frames(struct sock *sk)
1174{
1175 struct inet_sock *inet = inet_sk(sk);
1176 struct ipv6_pinfo *np = inet6_sk(sk);
1177 struct sk_buff *skb;
1178
1179 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1180 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1181 kfree_skb(skb);
1182 }
1183
1184 inet->cork.flags &= ~IPCORK_OPT;
1185
1186 if (np->cork.opt) {
1187 kfree(np->cork.opt);
1188 np->cork.opt = NULL;
1189 }
1190 if (np->cork.rt) {
1191 dst_release(&np->cork.rt->u.dst);
1192 np->cork.rt = NULL;
1193 inet->cork.flags &= ~IPCORK_ALLFRAG;
1194 }
1195 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1196}