/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
	.port_rover	= 1024 - 1,
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };

static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
{
	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)
					break;
			}
		}
	}
	return node != NULL;
}

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct inet_bind_hashbucket *head;
	struct hlist_node *node;
	struct inet_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_hashinfo.portalloc_lock);
		if (tcp_hashinfo.port_rover < low)
			rover = low;
		else
			rover = tcp_hashinfo.port_rover;
		do {
			rover++;
			if (rover > high)
				rover = low;
			head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_hashinfo.port_rover = rover;
		spin_unlock(&tcp_hashinfo.portalloc_lock);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (unlikely(remaining <= 0))
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
		head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!inet_sk(sk)->bind_hash)
		inet_bind_hash(sk, tb, snum);
	BUG_TRAP(inet_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}

static void tcp_v4_hash(struct sock *sk)
{
	inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
	inet_unhash(&tcp_hashinfo, sk);
}

/* Don't inline this cruft.  There are some nice properties to exploit here.
 * The BSD API does not allow a listening TCP to specify the remote port
 * nor the remote address for the connection.  So always assume those are
 * both wildcarded during the search since they can never be otherwise.
 */
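/* A note on the scoring used in __tcp_v4_lookup_listener() below: every
 * candidate bound to the right port starts at 1 if it is a plain PF_INET
 * socket, gains 2 for a matching bound local address and another 2 for a
 * matching bound device.  A score of 5 is therefore an exact match and ends
 * the walk immediately; otherwise the highest-scoring listener found wins.
 */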
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700247static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
248 const u32 daddr,
249 const unsigned short hnum,
250 const int dif)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700251{
252 struct sock *result = NULL, *sk;
253 struct hlist_node *node;
254 int score, hiscore;
255
256 hiscore=-1;
257 sk_for_each(sk, node, head) {
258 struct inet_sock *inet = inet_sk(sk);
259
260 if (inet->num == hnum && !ipv6_only_sock(sk)) {
261 __u32 rcv_saddr = inet->rcv_saddr;
262
263 score = (sk->sk_family == PF_INET ? 1 : 0);
264 if (rcv_saddr) {
265 if (rcv_saddr != daddr)
266 continue;
267 score+=2;
268 }
269 if (sk->sk_bound_dev_if) {
270 if (sk->sk_bound_dev_if != dif)
271 continue;
272 score+=2;
273 }
274 if (score == 5)
275 return sk;
276 if (score > hiscore) {
277 hiscore = score;
278 result = sk;
279 }
280 }
281 }
282 return result;
283}
284
285/* Optimize the common listener case. */
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700286static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
287 const unsigned short hnum,
288 const int dif)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289{
290 struct sock *sk = NULL;
291 struct hlist_head *head;
292
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700293 read_lock(&tcp_hashinfo.lhash_lock);
294 head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700295 if (!hlist_empty(head)) {
296 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
297
298 if (inet->num == hnum && !sk->sk_node.next &&
299 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
300 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
301 !sk->sk_bound_dev_if)
302 goto sherry_cache;
303 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
304 }
305 if (sk) {
306sherry_cache:
307 sock_hold(sk);
308 }
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700309 read_unlock(&tcp_hashinfo.lhash_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700310 return sk;
311}
312
313/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
314 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
315 *
316 * Local BH must be disabled here.
317 */
318
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700319static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
320 const u16 sport,
321 const u32 daddr,
322 const u16 hnum,
323 const int dif)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700324{
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700325 struct inet_ehash_bucket *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700326 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
327 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
328 struct sock *sk;
329 struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700333 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
334 head = &tcp_hashinfo.ehash[hash];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335 read_lock(&head->lock);
336 sk_for_each(sk, node, &head->chain) {
337 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
338 goto hit; /* You sunk my battleship! */
339 }
340
341 /* Must check for a TIME_WAIT'er before going to listener hash. */
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700342 sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700343 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
344 goto hit;
345 }
346 sk = NULL;
347out:
348 read_unlock(&head->lock);
349 return sk;
350hit:
351 sock_hold(sk);
352 goto out;
353}
354
355static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
356 u32 daddr, u16 hnum, int dif)
357{
358 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
359 daddr, hnum, dif);
360
361 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
362}
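
/* Lookup order used above: established (and TIME_WAIT) sockets are checked
 * first via __tcp_v4_lookup_established(), and only if nothing matches do we
 * fall back to the listening hash, since an established or TIME_WAIT entry
 * matches the full four-tuple while listeners may be wildcarded.
 */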
363
364inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
365 u16 dport, int dif)
366{
367 struct sock *sk;
368
369 local_bh_disable();
370 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
371 local_bh_enable();
372
373 return sk;
374}
375
376EXPORT_SYMBOL_GPL(tcp_v4_lookup);
377
378static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
379{
380 return secure_tcp_sequence_number(skb->nh.iph->daddr,
381 skb->nh.iph->saddr,
382 skb->h.th->dest,
383 skb->h.th->source);
384}
385
386/* called with local bh disabled */
387static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
388 struct tcp_tw_bucket **twp)
389{
390 struct inet_sock *inet = inet_sk(sk);
391 u32 daddr = inet->rcv_saddr;
392 u32 saddr = inet->daddr;
393 int dif = sk->sk_bound_dev_if;
394 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
395 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700396 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
397 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700398 struct sock *sk2;
399 struct hlist_node *node;
400 struct tcp_tw_bucket *tw;
401
402 write_lock(&head->lock);
403
404 /* Check TIME-WAIT sockets first. */
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700405 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700406 tw = (struct tcp_tw_bucket *)sk2;
407
408 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
409 struct tcp_sock *tp = tcp_sk(sk);
410
			/* With PAWS, this is safe from the viewpoint
			   of data integrity.  Even without PAWS it is
			   safe provided the sequence spaces do not
			   overlap, i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's: only the
			   timestamp cache is held not per host, but per
			   port pair, and the TW bucket is used as the
			   state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
425 if (tw->tw_ts_recent_stamp &&
426 (!twp || (sysctl_tcp_tw_reuse &&
427 xtime.tv_sec -
428 tw->tw_ts_recent_stamp > 1))) {
429 if ((tp->write_seq =
430 tw->tw_snd_nxt + 65535 + 2) == 0)
431 tp->write_seq = 1;
432 tp->rx_opt.ts_recent = tw->tw_ts_recent;
433 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
434 sock_hold(sk2);
435 goto unique;
436 } else
437 goto not_unique;
438 }
439 }
440 tw = NULL;
441
442 /* And established part... */
443 sk_for_each(sk2, node, &head->chain) {
444 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
445 goto not_unique;
446 }
447
unique:
	/* Must record num and sport now.  Otherwise we will see
	 * a socket with a funny identity in the hash table. */
451 inet->num = lport;
452 inet->sport = htons(lport);
453 sk->sk_hashent = hash;
454 BUG_TRAP(sk_unhashed(sk));
455 __sk_add_node(sk, &head->chain);
456 sock_prot_inc_use(sk->sk_prot);
457 write_unlock(&head->lock);
458
459 if (twp) {
460 *twp = tw;
461 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
462 } else if (tw) {
463 /* Silly. Should hash-dance instead... */
464 tcp_tw_deschedule(tw);
465 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
466
467 tcp_tw_put(tw);
468 }
469
470 return 0;
471
472not_unique:
473 write_unlock(&head->lock);
474 return -EADDRNOTAVAIL;
475}
476
477static inline u32 connect_port_offset(const struct sock *sk)
478{
479 const struct inet_sock *inet = inet_sk(sk);
480
481 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
482 inet->dport);
483}
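
/* The offset above is derived from (local address, remote address, remote
 * port), so connections to different destinations start their ephemeral
 * port search at different, hard-to-guess points; the static 'hint' in
 * tcp_v4_hash_connect() then advances the starting point as ports get used.
 */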
484
485/*
486 * Bind a port for a connect operation and hash it.
487 */
488static inline int tcp_v4_hash_connect(struct sock *sk)
489{
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700490 const unsigned short snum = inet_sk(sk)->num;
491 struct inet_bind_hashbucket *head;
492 struct inet_bind_bucket *tb;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700493 int ret;
494
495 if (!snum) {
496 int low = sysctl_local_port_range[0];
497 int high = sysctl_local_port_range[1];
498 int range = high - low;
499 int i;
500 int port;
501 static u32 hint;
502 u32 offset = hint + connect_port_offset(sk);
503 struct hlist_node *node;
504 struct tcp_tw_bucket *tw = NULL;
505
506 local_bh_disable();
507 for (i = 1; i <= range; i++) {
508 port = low + (i + offset) % range;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700509 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700510 spin_lock(&head->lock);
511
512 /* Does not bother with rcv_saddr checks,
513 * because the established check is already
514 * unique enough.
515 */
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700516 inet_bind_bucket_for_each(tb, node, &head->chain) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700517 if (tb->port == port) {
518 BUG_TRAP(!hlist_empty(&tb->owners));
519 if (tb->fastreuse >= 0)
520 goto next_port;
521 if (!__tcp_v4_check_established(sk,
522 port,
523 &tw))
524 goto ok;
525 goto next_port;
526 }
527 }
528
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700529 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700530 if (!tb) {
531 spin_unlock(&head->lock);
532 break;
533 }
534 tb->fastreuse = -1;
535 goto ok;
536
537 next_port:
538 spin_unlock(&head->lock);
539 }
540 local_bh_enable();
541
542 return -EADDRNOTAVAIL;
543
544ok:
545 hint += i;
546
547 /* Head lock still held and bh's disabled */
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -0700548 inet_bind_hash(sk, tb, port);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700549 if (sk_unhashed(sk)) {
550 inet_sk(sk)->sport = htons(port);
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -0700551 __inet_hash(&tcp_hashinfo, sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700552 }
553 spin_unlock(&head->lock);
554
555 if (tw) {
556 tcp_tw_deschedule(tw);
557 tcp_tw_put(tw);
558 }
559
560 ret = 0;
561 goto out;
562 }
563
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700564 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
Arnaldo Carvalho de Meloa55ebcc2005-08-09 20:01:14 -0700565 tb = inet_sk(sk)->bind_hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700566 spin_lock_bh(&head->lock);
567 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -0700568 __inet_hash(&tcp_hashinfo, sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700569 spin_unlock_bh(&head->lock);
570 return 0;
571 } else {
572 spin_unlock(&head->lock);
573 /* No definite answer... Walk to established hash table */
574 ret = __tcp_v4_check_established(sk, snum, NULL);
575out:
576 local_bh_enable();
577 return ret;
578 }
579}
580
581/* This will initiate an outgoing connection. */
582int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
583{
584 struct inet_sock *inet = inet_sk(sk);
585 struct tcp_sock *tp = tcp_sk(sk);
586 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
587 struct rtable *rt;
588 u32 daddr, nexthop;
589 int tmp;
590 int err;
591
592 if (addr_len < sizeof(struct sockaddr_in))
593 return -EINVAL;
594
595 if (usin->sin_family != AF_INET)
596 return -EAFNOSUPPORT;
597
598 nexthop = daddr = usin->sin_addr.s_addr;
599 if (inet->opt && inet->opt->srr) {
600 if (!daddr)
601 return -EINVAL;
602 nexthop = inet->opt->faddr;
603 }
604
605 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
606 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
607 IPPROTO_TCP,
608 inet->sport, usin->sin_port, sk);
609 if (tmp < 0)
610 return tmp;
611
612 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
613 ip_rt_put(rt);
614 return -ENETUNREACH;
615 }
616
617 if (!inet->opt || !inet->opt->srr)
618 daddr = rt->rt_dst;
619
620 if (!inet->saddr)
621 inet->saddr = rt->rt_src;
622 inet->rcv_saddr = inet->saddr;
623
624 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
625 /* Reset inherited state */
626 tp->rx_opt.ts_recent = 0;
627 tp->rx_opt.ts_recent_stamp = 0;
628 tp->write_seq = 0;
629 }
630
631 if (sysctl_tcp_tw_recycle &&
632 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
633 struct inet_peer *peer = rt_get_peer(rt);
634
635 /* VJ's idea. We save last timestamp seen from
636 * the destination in peer table, when entering state TIME-WAIT
637 * and initialize rx_opt.ts_recent from it, when trying new connection.
638 */
639
640 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
641 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
642 tp->rx_opt.ts_recent = peer->tcp_ts;
643 }
644 }
645
646 inet->dport = usin->sin_port;
647 inet->daddr = daddr;
648
649 tp->ext_header_len = 0;
650 if (inet->opt)
651 tp->ext_header_len = inet->opt->optlen;
652
653 tp->rx_opt.mss_clamp = 536;
654
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
660 tcp_set_state(sk, TCP_SYN_SENT);
661 err = tcp_v4_hash_connect(sk);
662 if (err)
663 goto failure;
664
665 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
666 if (err)
667 goto failure;
668
669 /* OK, now commit destination to socket. */
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -0700670 sk_setup_caps(sk, &rt->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700671
672 if (!tp->write_seq)
673 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
674 inet->daddr,
675 inet->sport,
676 usin->sin_port);
677
678 inet->id = tp->write_seq ^ jiffies;
679
680 err = tcp_connect(sk);
681 rt = NULL;
682 if (err)
683 goto failure;
684
685 return 0;
686
687failure:
688 /* This unhashes the socket and releases the local port, if necessary. */
689 tcp_set_state(sk, TCP_CLOSE);
690 ip_rt_put(rt);
691 sk->sk_route_caps = 0;
692 inet->dport = 0;
693 return err;
694}
695
696static __inline__ int tcp_v4_iif(struct sk_buff *skb)
697{
698 return ((struct rtable *)skb->dst)->rt_iif;
699}
700
701static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
702{
703 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
704}
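
/* The SYN queue hashed above is private to each listening socket: requests
 * are bucketed by a jhash of (remote address, remote port) mixed with the
 * listener's random hash_rnd and masked down to TCP_SYNQ_HSIZE buckets,
 * which helps keep the bucket choice hard to predict from outside.
 */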
705
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700706static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
707 struct request_sock ***prevp,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700708 __u16 rport,
709 __u32 raddr, __u32 laddr)
710{
Arnaldo Carvalho de Melo2ad69c52005-06-18 22:48:55 -0700711 struct listen_sock *lopt = tp->accept_queue.listen_opt;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700712 struct request_sock *req, **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700713
714 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
715 (req = *prev) != NULL;
716 prev = &req->dl_next) {
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700717 const struct inet_request_sock *ireq = inet_rsk(req);
718
719 if (ireq->rmt_port == rport &&
720 ireq->rmt_addr == raddr &&
721 ireq->loc_addr == laddr &&
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700722 TCP_INET_FAMILY(req->rsk_ops->family)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700723 BUG_TRAP(!req->sk);
724 *prevp = prev;
725 break;
726 }
727 }
728
729 return req;
730}
731
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700732static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700733{
734 struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo2ad69c52005-06-18 22:48:55 -0700735 struct listen_sock *lopt = tp->accept_queue.listen_opt;
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700736 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700737
Arnaldo Carvalho de Melo0e875062005-06-18 22:47:59 -0700738 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700739 tcp_synq_added(sk);
740}
741
742
743/*
744 * This routine does path mtu discovery as defined in RFC1191.
745 */
746static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
747 u32 mtu)
748{
749 struct dst_entry *dst;
750 struct inet_sock *inet = inet_sk(sk);
751 struct tcp_sock *tp = tcp_sk(sk);
752
	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
757 if (sk->sk_state == TCP_LISTEN)
758 return;
759
	/* We don't check in the dst entry if pmtu discovery is forbidden
	 * on this route.  We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
766 if ((dst = __sk_dst_check(sk, 0)) == NULL)
767 return;
768
769 dst->ops->update_pmtu(dst, mtu);
770
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection will not be able to recover.
	 */
774 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
775 sk->sk_err_soft = EMSGSIZE;
776
777 mtu = dst_mtu(dst);
778
779 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
780 tp->pmtu_cookie > mtu) {
781 tcp_sync_mss(sk, mtu);
782
783 /* Resend the TCP packet because it's
784 * clear that the old packet has been
785 * dropped. This is the new "fast" path mtu
786 * discovery.
787 */
788 tcp_simple_retransmit(sk);
789 } /* else let the usual retransmit timer handle it */
790}
791
792/*
793 * This routine is called by the ICMP module when it gets some
794 * sort of error condition. If err < 0 then the socket should
795 * be closed and the error returned to the user. If err > 0
796 * it's just the icmp type << 8 | icmp code. After adjustment
797 * header points to the first 8 bytes of the tcp header. We need
798 * to find the appropriate port.
799 *
800 * The locking strategy used here is very "optimistic". When
801 * someone else accesses the socket the ICMP is just dropped
802 * and for some paths there is no check at all.
803 * A more general error queue to queue errors for later handling
804 * is probably better.
805 *
806 */
807
808void tcp_v4_err(struct sk_buff *skb, u32 info)
809{
810 struct iphdr *iph = (struct iphdr *)skb->data;
811 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
812 struct tcp_sock *tp;
813 struct inet_sock *inet;
814 int type = skb->h.icmph->type;
815 int code = skb->h.icmph->code;
816 struct sock *sk;
817 __u32 seq;
818 int err;
819
820 if (skb->len < (iph->ihl << 2) + 8) {
821 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
822 return;
823 }
824
825 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
826 th->source, tcp_v4_iif(skb));
827 if (!sk) {
828 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
829 return;
830 }
831 if (sk->sk_state == TCP_TIME_WAIT) {
832 tcp_tw_put((struct tcp_tw_bucket *)sk);
833 return;
834 }
835
836 bh_lock_sock(sk);
837 /* If too many ICMPs get dropped on busy
838 * servers this needs to be solved differently.
839 */
840 if (sock_owned_by_user(sk))
841 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
842
843 if (sk->sk_state == TCP_CLOSE)
844 goto out;
845
846 tp = tcp_sk(sk);
847 seq = ntohl(th->seq);
848 if (sk->sk_state != TCP_LISTEN &&
849 !between(seq, tp->snd_una, tp->snd_nxt)) {
850 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
851 goto out;
852 }
853
854 switch (type) {
855 case ICMP_SOURCE_QUENCH:
856 /* Just silently ignore these. */
857 goto out;
858 case ICMP_PARAMETERPROB:
859 err = EPROTO;
860 break;
861 case ICMP_DEST_UNREACH:
862 if (code > NR_ICMP_UNREACH)
863 goto out;
864
865 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
866 if (!sock_owned_by_user(sk))
867 do_pmtu_discovery(sk, iph, info);
868 goto out;
869 }
870
871 err = icmp_err_convert[code].errno;
872 break;
873 case ICMP_TIME_EXCEEDED:
874 err = EHOSTUNREACH;
875 break;
876 default:
877 goto out;
878 }
879
880 switch (sk->sk_state) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700881 struct request_sock *req, **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700882 case TCP_LISTEN:
883 if (sock_owned_by_user(sk))
884 goto out;
885
886 req = tcp_v4_search_req(tp, &prev, th->dest,
887 iph->daddr, iph->saddr);
888 if (!req)
889 goto out;
890
891 /* ICMPs are not backlogged, hence we cannot get
892 an established socket here.
893 */
894 BUG_TRAP(!req->sk);
895
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700896 if (seq != tcp_rsk(req)->snt_isn) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700897 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
898 goto out;
899 }
900
901 /*
902 * Still in SYN_RECV, just remove it silently.
903 * There is no good way to pass the error to the newly
904 * created socket, and POSIX does not want network
905 * errors returned from accept().
906 */
907 tcp_synq_drop(sk, req, prev);
908 goto out;
909
910 case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, e.g., if SYNs crossed.
			     */
914 if (!sock_owned_by_user(sk)) {
915 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
916 sk->sk_err = err;
917
918 sk->sk_error_report(sk);
919
920 tcp_done(sk);
921 } else {
922 sk->sk_err_soft = err;
923 }
924 goto out;
925 }
926
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
	 * considered hard errors (well, FRAG_FAILED too, but it is obsoleted
	 * by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable and
	 * broken firewalls sit in every dark corner sending random errors as
	 * ordered by their masters, even these two messages have finally lost
	 * their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 *				--ANK (980905)
	 */
942
943 inet = inet_sk(sk);
944 if (!sock_owned_by_user(sk) && inet->recverr) {
945 sk->sk_err = err;
946 sk->sk_error_report(sk);
947 } else { /* Only an error on timeout */
948 sk->sk_err_soft = err;
949 }
950
951out:
952 bh_unlock_sock(sk);
953 sock_put(sk);
954}
955
956/* This routine computes an IPv4 TCP checksum. */
957void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
958 struct sk_buff *skb)
959{
960 struct inet_sock *inet = inet_sk(sk);
961
962 if (skb->ip_summed == CHECKSUM_HW) {
963 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
964 skb->csum = offsetof(struct tcphdr, check);
965 } else {
966 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
967 csum_partial((char *)th,
968 th->doff << 2,
969 skb->csum));
970 }
971}
972
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's
 *		TCP.  So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation.  We do not implement it in any case.
 */
985
986static void tcp_v4_send_reset(struct sk_buff *skb)
987{
988 struct tcphdr *th = skb->h.th;
989 struct tcphdr rth;
990 struct ip_reply_arg arg;
991
992 /* Never send a reset in response to a reset. */
993 if (th->rst)
994 return;
995
996 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
997 return;
998
999 /* Swap the send and the receive. */
1000 memset(&rth, 0, sizeof(struct tcphdr));
1001 rth.dest = th->source;
1002 rth.source = th->dest;
1003 rth.doff = sizeof(struct tcphdr) / 4;
1004 rth.rst = 1;
1005
1006 if (th->ack) {
1007 rth.seq = th->ack_seq;
1008 } else {
1009 rth.ack = 1;
1010 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1011 skb->len - (th->doff << 2));
1012 }
1013
1014 memset(&arg, 0, sizeof arg);
1015 arg.iov[0].iov_base = (unsigned char *)&rth;
1016 arg.iov[0].iov_len = sizeof rth;
1017 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1018 skb->nh.iph->saddr, /*XXX*/
1019 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1020 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1021
1022 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1023
1024 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1025 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1026}
1027
/* The code that follows, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly.  What can I do?
 */
1031
1032static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1033 u32 win, u32 ts)
1034{
1035 struct tcphdr *th = skb->h.th;
1036 struct {
1037 struct tcphdr th;
1038 u32 tsopt[3];
1039 } rep;
1040 struct ip_reply_arg arg;
1041
1042 memset(&rep.th, 0, sizeof(struct tcphdr));
1043 memset(&arg, 0, sizeof arg);
1044
1045 arg.iov[0].iov_base = (unsigned char *)&rep;
1046 arg.iov[0].iov_len = sizeof(rep.th);
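	/* If we are echoing a timestamp, the reply carries a 12-byte option
	 * block: two NOP pad bytes, the TIMESTAMP kind/length byte pair, then
	 * TSval (the current tcp_time_stamp) and TSecr (the peer's value).
	 */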
1047 if (ts) {
1048 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1049 (TCPOPT_TIMESTAMP << 8) |
1050 TCPOLEN_TIMESTAMP);
1051 rep.tsopt[1] = htonl(tcp_time_stamp);
1052 rep.tsopt[2] = htonl(ts);
1053 arg.iov[0].iov_len = sizeof(rep);
1054 }
1055
1056 /* Swap the send and the receive. */
1057 rep.th.dest = th->source;
1058 rep.th.source = th->dest;
1059 rep.th.doff = arg.iov[0].iov_len / 4;
1060 rep.th.seq = htonl(seq);
1061 rep.th.ack_seq = htonl(ack);
1062 rep.th.ack = 1;
1063 rep.th.window = htons(win);
1064
1065 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1066 skb->nh.iph->saddr, /*XXX*/
1067 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1068 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1069
1070 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1071
1072 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1073}
1074
1075static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1076{
1077 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1078
1079 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1080 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1081
1082 tcp_tw_put(tw);
1083}
1084
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001085static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001086{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001087 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001088 req->ts_recent);
1089}
1090
1091static struct dst_entry* tcp_v4_route_req(struct sock *sk,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001092 struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001093{
1094 struct rtable *rt;
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001095 const struct inet_request_sock *ireq = inet_rsk(req);
1096 struct ip_options *opt = inet_rsk(req)->opt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001097 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1098 .nl_u = { .ip4_u =
1099 { .daddr = ((opt && opt->srr) ?
1100 opt->faddr :
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001101 ireq->rmt_addr),
1102 .saddr = ireq->loc_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001103 .tos = RT_CONN_FLAGS(sk) } },
1104 .proto = IPPROTO_TCP,
1105 .uli_u = { .ports =
1106 { .sport = inet_sk(sk)->sport,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001107 .dport = ireq->rmt_port } } };
Linus Torvalds1da177e2005-04-16 15:20:36 -07001108
1109 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1110 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1111 return NULL;
1112 }
1113 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1114 ip_rt_put(rt);
1115 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1116 return NULL;
1117 }
1118 return &rt->u.dst;
1119}
1120
1121/*
1122 * Send a SYN-ACK after having received an ACK.
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001123 * This still operates on a request_sock only, not on a big
Linus Torvalds1da177e2005-04-16 15:20:36 -07001124 * socket.
1125 */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001126static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001127 struct dst_entry *dst)
1128{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001129 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001130 int err = -1;
1131 struct sk_buff * skb;
1132
1133 /* First, grab a route. */
1134 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1135 goto out;
1136
1137 skb = tcp_make_synack(sk, dst, req);
1138
1139 if (skb) {
1140 struct tcphdr *th = skb->h.th;
1141
1142 th->check = tcp_v4_check(th, skb->len,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001143 ireq->loc_addr,
1144 ireq->rmt_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001145 csum_partial((char *)th, skb->len,
1146 skb->csum));
1147
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001148 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1149 ireq->rmt_addr,
1150 ireq->opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001151 if (err == NET_XMIT_CN)
1152 err = 0;
1153 }
1154
1155out:
1156 dst_release(dst);
1157 return err;
1158}
1159
1160/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001161 * IPv4 request_sock destructor.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001162 */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001163static void tcp_v4_reqsk_destructor(struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001164{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001165 if (inet_rsk(req)->opt)
1166 kfree(inet_rsk(req)->opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001167}
1168
1169static inline void syn_flood_warning(struct sk_buff *skb)
1170{
1171 static unsigned long warntime;
1172
1173 if (time_after(jiffies, (warntime + HZ * 60))) {
1174 warntime = jiffies;
1175 printk(KERN_INFO
1176 "possible SYN flooding on port %d. Sending cookies.\n",
1177 ntohs(skb->h.th->dest));
1178 }
1179}
1180
1181/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001182 * Save and compile IPv4 options into the request_sock if needed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001183 */
1184static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1185 struct sk_buff *skb)
1186{
1187 struct ip_options *opt = &(IPCB(skb)->opt);
1188 struct ip_options *dopt = NULL;
1189
1190 if (opt && opt->optlen) {
1191 int opt_size = optlength(opt);
1192 dopt = kmalloc(opt_size, GFP_ATOMIC);
1193 if (dopt) {
1194 if (ip_options_echo(dopt, skb)) {
1195 kfree(dopt);
1196 dopt = NULL;
1197 }
1198 }
1199 }
1200 return dopt;
1201}
1202
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001203struct request_sock_ops tcp_request_sock_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001204 .family = PF_INET,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001205 .obj_size = sizeof(struct tcp_request_sock),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001206 .rtx_syn_ack = tcp_v4_send_synack,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001207 .send_ack = tcp_v4_reqsk_send_ack,
1208 .destructor = tcp_v4_reqsk_destructor,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001209 .send_reset = tcp_v4_send_reset,
1210};
1211
1212int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1213{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001214 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001215 struct tcp_options_received tmp_opt;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001216 struct request_sock *req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001217 __u32 saddr = skb->nh.iph->saddr;
1218 __u32 daddr = skb->nh.iph->daddr;
1219 __u32 isn = TCP_SKB_CB(skb)->when;
1220 struct dst_entry *dst = NULL;
1221#ifdef CONFIG_SYN_COOKIES
1222 int want_cookie = 0;
1223#else
1224#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1225#endif
1226
	/* Never answer SYNs sent to broadcast or multicast addresses */
1228 if (((struct rtable *)skb->dst)->rt_flags &
1229 (RTCF_BROADCAST | RTCF_MULTICAST))
1230 goto drop;
1231
	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and the peer is
	 * evidently a real one.
	 */
1236 if (tcp_synq_is_full(sk) && !isn) {
1237#ifdef CONFIG_SYN_COOKIES
1238 if (sysctl_tcp_syncookies) {
1239 want_cookie = 1;
1240 } else
1241#endif
1242 goto drop;
1243 }
1244
1245 /* Accept backlog is full. If we have already queued enough
1246 * of warm entries in syn queue, drop request. It is better than
1247 * clogging syn queue with openreqs with exponentially increasing
1248 * timeout.
1249 */
1250 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1251 goto drop;
1252
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001253 req = reqsk_alloc(&tcp_request_sock_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001254 if (!req)
1255 goto drop;
1256
1257 tcp_clear_options(&tmp_opt);
1258 tmp_opt.mss_clamp = 536;
1259 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1260
1261 tcp_parse_options(skb, &tmp_opt, 0);
1262
1263 if (want_cookie) {
1264 tcp_clear_options(&tmp_opt);
1265 tmp_opt.saw_tstamp = 0;
1266 }
1267
	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers
		 * carrying information of interest only to Windows users)
		 * do not send their timestamp in the SYN.  It is an easy
		 * case: we simply do not advertise TS support.
		 */
1274 tmp_opt.saw_tstamp = 0;
1275 tmp_opt.tstamp_ok = 0;
1276 }
1277 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1278
1279 tcp_openreq_init(req, &tmp_opt, skb);
1280
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001281 ireq = inet_rsk(req);
1282 ireq->loc_addr = daddr;
1283 ireq->rmt_addr = saddr;
1284 ireq->opt = tcp_v4_save_options(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001285 if (!want_cookie)
1286 TCP_ECN_create_request(req, skb->h.th);
1287
1288 if (want_cookie) {
1289#ifdef CONFIG_SYN_COOKIES
1290 syn_flood_warning(skb);
1291#endif
1292 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1293 } else if (!isn) {
1294 struct inet_peer *peer = NULL;
1295
1296 /* VJ's idea. We save last timestamp seen
1297 * from the destination in peer table, when entering
1298 * state TIME-WAIT, and check against it before
1299 * accepting new connection request.
1300 *
1301 * If "isn" is not zero, this request hit alive
1302 * timewait bucket, so that all the necessary checks
1303 * are made in the function processing timewait state.
1304 */
1305 if (tmp_opt.saw_tstamp &&
1306 sysctl_tcp_tw_recycle &&
1307 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1308 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1309 peer->v4daddr == saddr) {
1310 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1311 (s32)(peer->tcp_ts - req->ts_recent) >
1312 TCP_PAWS_WINDOW) {
1313 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1314 dst_release(dst);
1315 goto drop_and_free;
1316 }
1317 }
1318 /* Kill the following clause, if you dislike this way. */
1319 else if (!sysctl_tcp_syncookies &&
1320 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1321 (sysctl_max_syn_backlog >> 2)) &&
1322 (!peer || !peer->tcp_ts_stamp) &&
1323 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies, the last quarter of the
			 * backlog is filled only with destinations
			 * proven to be alive.  It means that we continue
			 * to communicate with destinations already
			 * remembered at the moment the synflood started.
			 */
Heikki Orsilaca933452005-08-08 14:26:52 -07001331 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1332 "request from %u.%u."
1333 "%u.%u/%u\n",
1334 NIPQUAD(saddr),
1335 ntohs(skb->h.th->source)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001336 dst_release(dst);
1337 goto drop_and_free;
1338 }
1339
1340 isn = tcp_v4_init_sequence(sk, skb);
1341 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001342 tcp_rsk(req)->snt_isn = isn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001343
1344 if (tcp_v4_send_synack(sk, req, dst))
1345 goto drop_and_free;
1346
1347 if (want_cookie) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001348 reqsk_free(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001349 } else {
1350 tcp_v4_synq_add(sk, req);
1351 }
1352 return 0;
1353
1354drop_and_free:
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001355 reqsk_free(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001356drop:
1357 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1358 return 0;
1359}
1360
1361
1362/*
1363 * The three way handshake has completed - we got a valid synack -
1364 * now create the new socket.
1365 */
1366struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001367 struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001368 struct dst_entry *dst)
1369{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001370 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001371 struct inet_sock *newinet;
1372 struct tcp_sock *newtp;
1373 struct sock *newsk;
1374
1375 if (sk_acceptq_is_full(sk))
1376 goto exit_overflow;
1377
1378 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1379 goto exit;
1380
1381 newsk = tcp_create_openreq_child(sk, req, skb);
1382 if (!newsk)
1383 goto exit;
1384
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -07001385 sk_setup_caps(newsk, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001386
1387 newtp = tcp_sk(newsk);
1388 newinet = inet_sk(newsk);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001389 ireq = inet_rsk(req);
1390 newinet->daddr = ireq->rmt_addr;
1391 newinet->rcv_saddr = ireq->loc_addr;
1392 newinet->saddr = ireq->loc_addr;
1393 newinet->opt = ireq->opt;
1394 ireq->opt = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395 newinet->mc_index = tcp_v4_iif(skb);
1396 newinet->mc_ttl = skb->nh.iph->ttl;
1397 newtp->ext_header_len = 0;
1398 if (newinet->opt)
1399 newtp->ext_header_len = newinet->opt->optlen;
1400 newinet->id = newtp->write_seq ^ jiffies;
1401
1402 tcp_sync_mss(newsk, dst_mtu(dst));
1403 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1404 tcp_initialize_rcv_mss(newsk);
1405
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001406 __inet_hash(&tcp_hashinfo, newsk, 0);
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -07001407 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001408
1409 return newsk;
1410
1411exit_overflow:
1412 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1413exit:
1414 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1415 dst_release(dst);
1416 return NULL;
1417}
1418
1419static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1420{
1421 struct tcphdr *th = skb->h.th;
1422 struct iphdr *iph = skb->nh.iph;
1423 struct tcp_sock *tp = tcp_sk(sk);
1424 struct sock *nsk;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001425 struct request_sock **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001426 /* Find possible connection requests. */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001427 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001428 iph->saddr, iph->daddr);
1429 if (req)
1430 return tcp_check_req(sk, skb, req, prev);
1431
1432 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1433 th->source,
1434 skb->nh.iph->daddr,
1435 ntohs(th->dest),
1436 tcp_v4_iif(skb));
1437
1438 if (nsk) {
1439 if (nsk->sk_state != TCP_TIME_WAIT) {
1440 bh_lock_sock(nsk);
1441 return nsk;
1442 }
1443 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1444 return NULL;
1445 }
1446
1447#ifdef CONFIG_SYN_COOKIES
1448 if (!th->rst && !th->syn && th->ack)
1449 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1450#endif
1451 return sk;
1452}
1453
1454static int tcp_v4_checksum_init(struct sk_buff *skb)
1455{
1456 if (skb->ip_summed == CHECKSUM_HW) {
1457 skb->ip_summed = CHECKSUM_UNNECESSARY;
1458 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1459 skb->nh.iph->daddr, skb->csum))
1460 return 0;
1461
Heikki Orsilaca933452005-08-08 14:26:52 -07001462 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001463 skb->ip_summed = CHECKSUM_NONE;
1464 }
1465 if (skb->len <= 76) {
1466 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1467 skb->nh.iph->daddr,
1468 skb_checksum(skb, 0, skb->len, 0)))
1469 return -1;
1470 skb->ip_summed = CHECKSUM_UNNECESSARY;
1471 } else {
1472 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1473 skb->nh.iph->saddr,
1474 skb->nh.iph->daddr, 0);
1475 }
1476 return 0;
1477}
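
/* Checksum strategy above: if the NIC claims a valid hardware checksum we
 * only verify the pseudo-header and mark the skb CHECKSUM_UNNECESSARY; short
 * segments (<= 76 bytes) are fully checksummed right away; for anything
 * larger we precompute the pseudo-header sum into skb->csum and defer the
 * full verification to tcp_checksum_complete() later in the receive path.
 */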
1478
1479
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
1488int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1489{
1490 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1491 TCP_CHECK_TIMER(sk);
1492 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1493 goto reset;
1494 TCP_CHECK_TIMER(sk);
1495 return 0;
1496 }
1497
1498 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1499 goto csum_err;
1500
1501 if (sk->sk_state == TCP_LISTEN) {
1502 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1503 if (!nsk)
1504 goto discard;
1505
1506 if (nsk != sk) {
1507 if (tcp_child_process(sk, nsk, skb))
1508 goto reset;
1509 return 0;
1510 }
1511 }
1512
1513 TCP_CHECK_TIMER(sk);
1514 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1515 goto reset;
1516 TCP_CHECK_TIMER(sk);
1517 return 0;
1518
1519reset:
1520 tcp_v4_send_reset(skb);
1521discard:
1522 kfree_skb(skb);
1523 /* Be careful here. If this function gets more complicated and
1524 * gcc suffers from register pressure on the x86, sk (in %ebx)
1525 * might be destroyed here. This current version compiles correctly,
1526 * but you have been warned.
1527 */
1528 return 0;
1529
1530csum_err:
1531 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1532 goto discard;
1533}
1534
1535/*
1536 * From tcp_input.c
1537 */
1538
1539int tcp_v4_rcv(struct sk_buff *skb)
1540{
1541 struct tcphdr *th;
1542 struct sock *sk;
1543 int ret;
1544
1545 if (skb->pkt_type != PACKET_HOST)
1546 goto discard_it;
1547
1548 /* Count it even if it's bad */
1549 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1550
1551 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1552 goto discard_it;
1553
1554 th = skb->h.th;
1555
1556 if (th->doff < sizeof(struct tcphdr) / 4)
1557 goto bad_packet;
1558 if (!pskb_may_pull(skb, th->doff * 4))
1559 goto discard_it;
1560
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
1565 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1566 tcp_v4_checksum_init(skb) < 0))
1567 goto bad_packet;
1568
1569 th = skb->h.th;
1570 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1571 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1572 skb->len - th->doff * 4);
1573 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1574 TCP_SKB_CB(skb)->when = 0;
1575 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1576 TCP_SKB_CB(skb)->sacked = 0;
1577
1578 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1579 skb->nh.iph->daddr, ntohs(th->dest),
1580 tcp_v4_iif(skb));
1581
1582 if (!sk)
1583 goto no_tcp_socket;
1584
1585process:
1586 if (sk->sk_state == TCP_TIME_WAIT)
1587 goto do_time_wait;
1588
1589 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1590 goto discard_and_relse;
1591
1592 if (sk_filter(sk, skb, 0))
1593 goto discard_and_relse;
1594
1595 skb->dev = NULL;
1596
1597 bh_lock_sock(sk);
1598 ret = 0;
1599 if (!sock_owned_by_user(sk)) {
1600 if (!tcp_prequeue(sk, skb))
1601 ret = tcp_v4_do_rcv(sk, skb);
1602 } else
1603 sk_add_backlog(sk, skb);
1604 bh_unlock_sock(sk);
1605
1606 sock_put(sk);
1607
1608 return ret;
1609
1610no_tcp_socket:
1611 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1612 goto discard_it;
1613
1614 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1615bad_packet:
1616 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1617 } else {
1618 tcp_v4_send_reset(skb);
1619 }
1620
1621discard_it:
1622 /* Discard frame. */
1623 kfree_skb(skb);
1624 return 0;
1625
1626discard_and_relse:
1627 sock_put(sk);
1628 goto discard_it;
1629
1630do_time_wait:
1631 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1632 tcp_tw_put((struct tcp_tw_bucket *) sk);
1633 goto discard_it;
1634 }
1635
1636 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1637 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1638 tcp_tw_put((struct tcp_tw_bucket *) sk);
1639 goto discard_it;
1640 }
1641 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1642 skb, th, skb->len)) {
1643 case TCP_TW_SYN: {
1644 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1645 ntohs(th->dest),
1646 tcp_v4_iif(skb));
1647 if (sk2) {
1648 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1649 tcp_tw_put((struct tcp_tw_bucket *)sk);
1650 sk = sk2;
1651 goto process;
1652 }
1653 /* Fall through to ACK */
1654 }
1655 case TCP_TW_ACK:
1656 tcp_v4_timewait_ack(sk, skb);
1657 break;
1658 case TCP_TW_RST:
1659 goto no_tcp_socket;
1660 case TCP_TW_SUCCESS:;
1661 }
1662 goto discard_it;
1663}
1664
Linus Torvalds1da177e2005-04-16 15:20:36 -07001665static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1666{
1667 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1668 struct inet_sock *inet = inet_sk(sk);
1669
1670 sin->sin_family = AF_INET;
1671 sin->sin_addr.s_addr = inet->daddr;
1672 sin->sin_port = inet->dport;
1673}
1674
1675/* VJ's idea. Save last timestamp seen from this destination
1676 * and hold it at least for normal timewait interval to use for duplicate
1677 * segment detection in subsequent connections, before they enter synchronized
1678 * state.
1679 */
1680
1681int tcp_v4_remember_stamp(struct sock *sk)
1682{
1683 struct inet_sock *inet = inet_sk(sk);
1684 struct tcp_sock *tp = tcp_sk(sk);
1685 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1686 struct inet_peer *peer = NULL;
1687 int release_it = 0;
1688
1689 if (!rt || rt->rt_dst != inet->daddr) {
1690 peer = inet_getpeer(inet->daddr, 1);
1691 release_it = 1;
1692 } else {
1693 if (!rt->peer)
1694 rt_bind_peer(rt, 1);
1695 peer = rt->peer;
1696 }
1697
1698 if (peer) {
1699 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1700 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1701 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1702 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1703 peer->tcp_ts = tp->rx_opt.ts_recent;
1704 }
1705 if (release_it)
1706 inet_putpeer(peer);
1707 return 1;
1708 }
1709
1710 return 0;
1711}
1712
int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
			peer->tcp_ts = tw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

struct tcp_func ipv4_specific = {
	.queue_xmit	= ip_queue_xmit,
	.send_check	= tcp_v4_send_check,
	.rebuild_header	= inet_sk_rebuild_header,
	.conn_request	= tcp_v4_conn_request,
	.syn_recv_sock	= tcp_v4_syn_recv_sock,
	.remember_stamp	= tcp_v4_remember_stamp,
	.net_header_len	= sizeof(struct iphdr),
	.setsockopt	= ip_setsockopt,
	.getsockopt	= ip_getsockopt,
	.addr2sockaddr	= v4_addr2sockaddr,
	.sockaddr_len	= sizeof(struct sockaddr_in),
};

/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto  = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	tp->ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(tp);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean up the prequeue; it should already be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_sk(sk)->bind_hash)
		inet_put_port(&tcp_hashinfo, sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
	       list_entry(head->first, struct tcp_tw_bucket, tw_node);
}

static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

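/* Walk the listening sockets.  For every listener of the requested
 * family we also descend into its SYN table (accept_queue.listen_opt)
 * so embryonic request_socks are reported, then move on to the next
 * chain of listening_hash[].
 */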
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_sock *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
	} else {
		tp = tcp_sk(sk);
		read_lock_bh(&tp->accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&tp->accept_queue))
			goto start_req;
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&tp->accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

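/* The established hash is split in two halves: buckets [0, ehash_size)
 * hold full sockets, while the corresponding TIME_WAIT buckets live at
 * [ehash_size, 2 * ehash_size), so each bucket is scanned together with
 * its TIME_WAIT twin under the same chain lock.
 */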
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
			if (tw->tw_family != st->family) {
				continue;
			}
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state* st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

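/* seq_file hooks.  st->state records which table the iterator is
 * currently in (LISTENING, OPENREQ, ESTABLISHED or TIME_WAIT) so that
 * tcp_seq_next() can resume the walk and tcp_seq_stop() can release
 * whatever lock the current position still holds.
 */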
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state* st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state* st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family		= afinfo->family;
	s->seq_ops.start	= tcp_seq_start;
	s->seq_ops.next		= tcp_seq_next;
	s->seq_ops.show		= afinfo->seq_show;
	s->seq_ops.stop		= tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

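/* Register a per-family TCP /proc entry: point the caller's seq_fops
 * at the generic open/read/llseek/release handlers above and create
 * the file under /proc/net with the afinfo stored as its data.
 */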
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}

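/* Formatters for one row of the /proc/net/tcp dump: an embryonic
 * SYN_RECV request_sock, a full (listening or established) socket and
 * a TIME_WAIT bucket, respectively, all emitting the same fixed-width
 * column layout.
 */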
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 char *tmpbuf, int i, int uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

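	/* Encode which timer is pending the way /proc/net/tcp reports it:
	 * 1 retransmit, 2 keepalive (sk_timer), 4 zero window probe,
	 * 0 none; TIME_WAIT entries report 3 from get_timewait4_sock().
	 */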
	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}

static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state* st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}

static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= tcp_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.rsk_prot		= &tcp_request_sock_ops,
};


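/* Create the kernel-internal TCP control socket used to send resets
 * and TIME_WAIT ACKs on behalf of connections that have no full
 * socket of their own.
 */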
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even see it;
	 * we do not wish this socket to see incoming packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
