Blame - net/ipv4/tcp_ipv4.c - android_kernel_htc_msm8960

blob: 3ac6659869c41eb824bed89dcda58de5ae44d14d [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
				9	*
				10	* IPv4 specific functions
				11	*
				12	*
				13	* code split from:
				14	* linux/ipv4/tcp.c
				15	* linux/ipv4/tcp_input.c
				16	* linux/ipv4/tcp_output.c
				17	*
				18	* See tcp.c for author information
				19	*
				20	* This program is free software; you can redistribute it and/or
				21	* modify it under the terms of the GNU General Public License
				22	* as published by the Free Software Foundation; either version
				23	* 2 of the License, or (at your option) any later version.
				24	*/
				25
				26	/*
				27	* Changes:
				28	* David S. Miller : New socket lookup architecture.
				29	* This code is dedicated to John Dyson.
				30	* David S. Miller : Change semantics of established hash,
				31	* half is devoted to TIME_WAIT sockets
				32	* and the rest go in the other half.
				33	* Andi Kleen : Add support for syncookies and fixed
				34	* some bugs: ip options weren't passed to
				35	* the TCP layer, missed a check for an
				36	* ACK bit.
				37	* Andi Kleen : Implemented fast path mtu discovery.
				38	* Fixed many serious bugs in the
				39	* open_request handling and moved
				40	* most of it into the af independent code.
				41	* Added tail drop and some other bugfixes.
				42	* Added new listen semantics.
				43	* Mike McLagan : Routing by source
				44	* Juan Jose Ciarlante: ip_dynaddr bits
				45	* Andi Kleen: various fixes.
				46	* Vitaly E. Lavrov : Transparent proxy revived after year
				47	* coma.
				48	* Andi Kleen : Fix new listen.
				49	* Andi Kleen : Fix accept error reporting.
				50	* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
				51	* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
				52	* a single port at the same time.
				53	*/
				54
				55	#include <linux/config.h>
				56
				57	#include <linux/types.h>
				58	#include <linux/fcntl.h>
				59	#include <linux/module.h>
				60	#include <linux/random.h>
				61	#include <linux/cache.h>
				62	#include <linux/jhash.h>
				63	#include <linux/init.h>
				64	#include <linux/times.h>
				65
				66	#include <net/icmp.h>
				67	#include <net/tcp.h>
				68	#include <net/ipv6.h>
				69	#include <net/inet_common.h>
				70	#include <net/xfrm.h>
				71
				72	#include <linux/inet.h>
				73	#include <linux/ipv6.h>
				74	#include <linux/stddef.h>
				75	#include <linux/proc_fs.h>
				76	#include <linux/seq_file.h>
				77
				78	extern int sysctl_ip_dynaddr;
				79	int sysctl_tcp_tw_reuse;
				80	int sysctl_tcp_low_latency;
				81
				82	/* Check TCP sequence numbers in ICMP packets. */
				83	#define ICMP_MIN_LENGTH 8
				84
				85	/* Socket used for sending RSTs */
				86	static struct socket *tcp_socket;
				87
				88	void tcp_v4_send_check(struct sock sk, struct tcphdr th, int len,
				89	struct sk_buff *skb);
				90
				91	struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
				92	.__tcp_lhash_lock = RW_LOCK_UNLOCKED,
				93	.__tcp_lhash_users = ATOMIC_INIT(0),
				94	.__tcp_lhash_wait
				95	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
				96	.__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
				97	};
				98
				99	/*
				100	* This array holds the first and last local port number.
				101	* For high-usage systems, use sysctl to change this to
				102	* 32768-61000
				103	*/
				104	int sysctl_local_port_range[2] = { 1024, 4999 };
				105	int tcp_port_rover = 1024 - 1;
				106
				107	static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				108	__u32 faddr, __u16 fport)
				109	{
				110	int h = (laddr ^ lport) ^ (faddr ^ fport);
				111	h ^= h >> 16;
				112	h ^= h >> 8;
				113	return h & (tcp_ehash_size - 1);
				114	}
				115
				116	static __inline__ int tcp_sk_hashfn(struct sock *sk)
				117	{
				118	struct inet_sock *inet = inet_sk(sk);
				119	__u32 laddr = inet->rcv_saddr;
				120	__u16 lport = inet->num;
				121	__u32 faddr = inet->daddr;
				122	__u16 fport = inet->dport;
				123
				124	return tcp_hashfn(laddr, lport, faddr, fport);
				125	}
				126
				127	/* Allocate and initialize a new TCP local port bind bucket.
				128	* The bindhash mutex for snum's hash chain must be held here.
				129	*/
				130	struct tcp_bind_bucket tcp_bucket_create(struct tcp_bind_hashbucket head,
				131	unsigned short snum)
				132	{
				133	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
				134	SLAB_ATOMIC);
				135	if (tb) {
				136	tb->port = snum;
				137	tb->fastreuse = 0;
				138	INIT_HLIST_HEAD(&tb->owners);
				139	hlist_add_head(&tb->node, &head->chain);
				140	}
				141	return tb;
				142	}
				143
				144	/* Caller must hold hashbucket lock for this tb with local BH disabled */
				145	void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
				146	{
				147	if (hlist_empty(&tb->owners)) {
				148	__hlist_del(&tb->node);
				149	kmem_cache_free(tcp_bucket_cachep, tb);
				150	}
				151	}
				152
				153	/* Caller must disable local BH processing. */
				154	static __inline__ void __tcp_inherit_port(struct sock sk, struct sock child)
				155	{
				156	struct tcp_bind_hashbucket *head =
				157	&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
				158	struct tcp_bind_bucket *tb;
				159
				160	spin_lock(&head->lock);
				161	tb = tcp_sk(sk)->bind_hash;
				162	sk_add_bind_node(child, &tb->owners);
				163	tcp_sk(child)->bind_hash = tb;
				164	spin_unlock(&head->lock);
				165	}
				166
				167	inline void tcp_inherit_port(struct sock sk, struct sock child)
				168	{
				169	local_bh_disable();
				170	__tcp_inherit_port(sk, child);
				171	local_bh_enable();
				172	}
				173
				174	void tcp_bind_hash(struct sock sk, struct tcp_bind_bucket tb,
				175	unsigned short snum)
				176	{
				177	inet_sk(sk)->num = snum;
				178	sk_add_bind_node(sk, &tb->owners);
				179	tcp_sk(sk)->bind_hash = tb;
				180	}
				181
				182	static inline int tcp_bind_conflict(struct sock sk, struct tcp_bind_bucket tb)
				183	{
				184	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
				185	struct sock *sk2;
				186	struct hlist_node *node;
				187	int reuse = sk->sk_reuse;
				188
				189	sk_for_each_bound(sk2, node, &tb->owners) {
				190	if (sk != sk2 &&
				191	!tcp_v6_ipv6only(sk2) &&
				192	(!sk->sk_bound_dev_if \|\|
				193	!sk2->sk_bound_dev_if \|\|
				194	sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
				195	if (!reuse \|\| !sk2->sk_reuse \|\|
				196	sk2->sk_state == TCP_LISTEN) {
				197	const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
				198	if (!sk2_rcv_saddr \|\| !sk_rcv_saddr \|\|
				199	sk2_rcv_saddr == sk_rcv_saddr)
				200	break;
				201	}
				202	}
				203	}
				204	return node != NULL;
				205	}
				206
				207	/* Obtain a reference to a local port for the given sock,
				208	* if snum is zero it means select any available local port.
				209	*/
				210	static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
				211	{
				212	struct tcp_bind_hashbucket *head;
				213	struct hlist_node *node;
				214	struct tcp_bind_bucket *tb;
				215	int ret;
				216
				217	local_bh_disable();
				218	if (!snum) {
				219	int low = sysctl_local_port_range[0];
				220	int high = sysctl_local_port_range[1];
				221	int remaining = (high - low) + 1;
				222	int rover;
				223
				224	spin_lock(&tcp_portalloc_lock);
				225	rover = tcp_port_rover;
				226	do {
				227	rover++;
				228	if (rover < low \|\| rover > high)
				229	rover = low;
				230	head = &tcp_bhash[tcp_bhashfn(rover)];
				231	spin_lock(&head->lock);
				232	tb_for_each(tb, node, &head->chain)
				233	if (tb->port == rover)
				234	goto next;
				235	break;
				236	next:
				237	spin_unlock(&head->lock);
				238	} while (--remaining > 0);
				239	tcp_port_rover = rover;
				240	spin_unlock(&tcp_portalloc_lock);
				241
				242	/* Exhausted local port range during search? */
				243	ret = 1;
				244	if (remaining <= 0)
				245	goto fail;
				246
				247	/* OK, here is the one we will use. HEAD is
				248	* non-NULL and we hold it's mutex.
				249	*/
				250	snum = rover;
				251	} else {
				252	head = &tcp_bhash[tcp_bhashfn(snum)];
				253	spin_lock(&head->lock);
				254	tb_for_each(tb, node, &head->chain)
				255	if (tb->port == snum)
				256	goto tb_found;
				257	}
				258	tb = NULL;
				259	goto tb_not_found;
				260	tb_found:
				261	if (!hlist_empty(&tb->owners)) {
				262	if (sk->sk_reuse > 1)
				263	goto success;
				264	if (tb->fastreuse > 0 &&
				265	sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
				266	goto success;
				267	} else {
				268	ret = 1;
				269	if (tcp_bind_conflict(sk, tb))
				270	goto fail_unlock;
				271	}
				272	}
				273	tb_not_found:
				274	ret = 1;
				275	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
				276	goto fail_unlock;
				277	if (hlist_empty(&tb->owners)) {
				278	if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
				279	tb->fastreuse = 1;
				280	else
				281	tb->fastreuse = 0;
				282	} else if (tb->fastreuse &&
				283	(!sk->sk_reuse \|\| sk->sk_state == TCP_LISTEN))
				284	tb->fastreuse = 0;
				285	success:
				286	if (!tcp_sk(sk)->bind_hash)
				287	tcp_bind_hash(sk, tb, snum);
				288	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
				289	ret = 0;
				290
				291	fail_unlock:
				292	spin_unlock(&head->lock);
				293	fail:
				294	local_bh_enable();
				295	return ret;
				296	}
				297
				298	/* Get rid of any references to a local port held by the
				299	* given sock.
				300	*/
				301	static void __tcp_put_port(struct sock *sk)
				302	{
				303	struct inet_sock *inet = inet_sk(sk);
				304	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
				305	struct tcp_bind_bucket *tb;
				306
				307	spin_lock(&head->lock);
				308	tb = tcp_sk(sk)->bind_hash;
				309	__sk_del_bind_node(sk);
				310	tcp_sk(sk)->bind_hash = NULL;
				311	inet->num = 0;
				312	tcp_bucket_destroy(tb);
				313	spin_unlock(&head->lock);
				314	}
				315
				316	void tcp_put_port(struct sock *sk)
				317	{
				318	local_bh_disable();
				319	__tcp_put_port(sk);
				320	local_bh_enable();
				321	}
				322
				323	/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
				324	* Look, when several writers sleep and reader wakes them up, all but one
				325	* immediately hit write lock and grab all the cpus. Exclusive sleep solves
				326	* this, _but_ remember, it adds useless work on UP machines (wake up each
				327	* exclusive lock release). It should be ifdefed really.
				328	*/
				329
				330	void tcp_listen_wlock(void)
				331	{
				332	write_lock(&tcp_lhash_lock);
				333
				334	if (atomic_read(&tcp_lhash_users)) {
				335	DEFINE_WAIT(wait);
				336
				337	for (;;) {
				338	prepare_to_wait_exclusive(&tcp_lhash_wait,
				339	&wait, TASK_UNINTERRUPTIBLE);
				340	if (!atomic_read(&tcp_lhash_users))
				341	break;
				342	write_unlock_bh(&tcp_lhash_lock);
				343	schedule();
				344	write_lock_bh(&tcp_lhash_lock);
				345	}
				346
				347	finish_wait(&tcp_lhash_wait, &wait);
				348	}
				349	}
				350
				351	static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
				352	{
				353	struct hlist_head *list;
				354	rwlock_t *lock;
				355
				356	BUG_TRAP(sk_unhashed(sk));
				357	if (listen_possible && sk->sk_state == TCP_LISTEN) {
				358	list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
				359	lock = &tcp_lhash_lock;
				360	tcp_listen_wlock();
				361	} else {
				362	list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
				363	lock = &tcp_ehash[sk->sk_hashent].lock;
				364	write_lock(lock);
				365	}
				366	__sk_add_node(sk, list);
				367	sock_prot_inc_use(sk->sk_prot);
				368	write_unlock(lock);
				369	if (listen_possible && sk->sk_state == TCP_LISTEN)
				370	wake_up(&tcp_lhash_wait);
				371	}
				372
				373	static void tcp_v4_hash(struct sock *sk)
				374	{
				375	if (sk->sk_state != TCP_CLOSE) {
				376	local_bh_disable();
				377	__tcp_v4_hash(sk, 1);
				378	local_bh_enable();
				379	}
				380	}
				381
				382	void tcp_unhash(struct sock *sk)
				383	{
				384	rwlock_t *lock;
				385
				386	if (sk_unhashed(sk))
				387	goto ende;
				388
				389	if (sk->sk_state == TCP_LISTEN) {
				390	local_bh_disable();
				391	tcp_listen_wlock();
				392	lock = &tcp_lhash_lock;
				393	} else {
				394	struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
				395	lock = &head->lock;
				396	write_lock_bh(&head->lock);
				397	}
				398
				399	if (__sk_del_node_init(sk))
				400	sock_prot_dec_use(sk->sk_prot);
				401	write_unlock_bh(lock);
				402
				403	ende:
				404	if (sk->sk_state == TCP_LISTEN)
				405	wake_up(&tcp_lhash_wait);
				406	}
				407
				408	/* Don't inline this cruft. Here are some nice properties to
				409	* exploit here. The BSD API does not allow a listening TCP
				410	* to specify the remote port nor the remote address for the
				411	* connection. So always assume those are both wildcarded
				412	* during the search since they can never be otherwise.
				413	*/
				414	static struct sock __tcp_v4_lookup_listener(struct hlist_head head, u32 daddr,
				415	unsigned short hnum, int dif)
				416	{
				417	struct sock result = NULL, sk;
				418	struct hlist_node *node;
				419	int score, hiscore;
				420
				421	hiscore=-1;
				422	sk_for_each(sk, node, head) {
				423	struct inet_sock *inet = inet_sk(sk);
				424
				425	if (inet->num == hnum && !ipv6_only_sock(sk)) {
				426	__u32 rcv_saddr = inet->rcv_saddr;
				427
				428	score = (sk->sk_family == PF_INET ? 1 : 0);
				429	if (rcv_saddr) {
				430	if (rcv_saddr != daddr)
				431	continue;
				432	score+=2;
				433	}
				434	if (sk->sk_bound_dev_if) {
				435	if (sk->sk_bound_dev_if != dif)
				436	continue;
				437	score+=2;
				438	}
				439	if (score == 5)
				440	return sk;
				441	if (score > hiscore) {
				442	hiscore = score;
				443	result = sk;
				444	}
				445	}
				446	}
				447	return result;
				448	}
				449
				450	/* Optimize the common listener case. */
				451	static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
				452	unsigned short hnum, int dif)
				453	{
				454	struct sock *sk = NULL;
				455	struct hlist_head *head;
				456
				457	read_lock(&tcp_lhash_lock);
				458	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
				459	if (!hlist_empty(head)) {
				460	struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
				461
				462	if (inet->num == hnum && !sk->sk_node.next &&
				463	(!inet->rcv_saddr \|\| inet->rcv_saddr == daddr) &&
				464	(sk->sk_family == PF_INET \|\| !ipv6_only_sock(sk)) &&
				465	!sk->sk_bound_dev_if)
				466	goto sherry_cache;
				467	sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
				468	}
				469	if (sk) {
				470	sherry_cache:
				471	sock_hold(sk);
				472	}
				473	read_unlock(&tcp_lhash_lock);
				474	return sk;
				475	}
				476
				477	/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
				478	* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
				479	*
				480	* Local BH must be disabled here.
				481	*/
				482
				483	static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
				484	u32 daddr, u16 hnum,
				485	int dif)
				486	{
				487	struct tcp_ehash_bucket *head;
				488	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
				489	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
				490	struct sock *sk;
				491	struct hlist_node *node;
				492	/* Optimize here for direct hit, only listening connections can
				493	* have wildcards anyways.
				494	*/
				495	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
				496	head = &tcp_ehash[hash];
				497	read_lock(&head->lock);
				498	sk_for_each(sk, node, &head->chain) {
				499	if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
				500	goto hit; /* You sunk my battleship! */
				501	}
				502
				503	/* Must check for a TIME_WAIT'er before going to listener hash. */
				504	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
				505	if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
				506	goto hit;
				507	}
				508	sk = NULL;
				509	out:
				510	read_unlock(&head->lock);
				511	return sk;
				512	hit:
				513	sock_hold(sk);
				514	goto out;
				515	}
				516
				517	static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
				518	u32 daddr, u16 hnum, int dif)
				519	{
				520	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
				521	daddr, hnum, dif);
				522
				523	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
				524	}
				525
				526	inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				527	u16 dport, int dif)
				528	{
				529	struct sock *sk;
				530
				531	local_bh_disable();
				532	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
				533	local_bh_enable();
				534
				535	return sk;
				536	}
				537
				538	EXPORT_SYMBOL_GPL(tcp_v4_lookup);
				539
				540	static inline __u32 tcp_v4_init_sequence(struct sock sk, struct sk_buff skb)
				541	{
				542	return secure_tcp_sequence_number(skb->nh.iph->daddr,
				543	skb->nh.iph->saddr,
				544	skb->h.th->dest,
				545	skb->h.th->source);
				546	}
				547
				548	/* called with local bh disabled */
				549	static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				550	struct tcp_tw_bucket **twp)
				551	{
				552	struct inet_sock *inet = inet_sk(sk);
				553	u32 daddr = inet->rcv_saddr;
				554	u32 saddr = inet->daddr;
				555	int dif = sk->sk_bound_dev_if;
				556	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
				557	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
				558	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
				559	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
				560	struct sock *sk2;
				561	struct hlist_node *node;
				562	struct tcp_tw_bucket *tw;
				563
				564	write_lock(&head->lock);
				565
				566	/* Check TIME-WAIT sockets first. */
				567	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
				568	tw = (struct tcp_tw_bucket *)sk2;
				569
				570	if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
				571	struct tcp_sock *tp = tcp_sk(sk);
				572
				573	/* With PAWS, it is safe from the viewpoint
				574	of data integrity. Even without PAWS it
				575	is safe provided sequence spaces do not
				576	overlap i.e. at data rates <= 80Mbit/sec.
				577
				578	Actually, the idea is close to VJ's one,
				579	only timestamp cache is held not per host,
				580	but per port pair and TW bucket is used
				581	as state holder.
				582
				583	If TW bucket has been already destroyed we
				584	fall back to VJ's scheme and use initial
				585	timestamp retrieved from peer table.
				586	*/
				587	if (tw->tw_ts_recent_stamp &&
				588	(!twp \|\| (sysctl_tcp_tw_reuse &&
				589	xtime.tv_sec -
				590	tw->tw_ts_recent_stamp > 1))) {
				591	if ((tp->write_seq =
				592	tw->tw_snd_nxt + 65535 + 2) == 0)
				593	tp->write_seq = 1;
				594	tp->rx_opt.ts_recent = tw->tw_ts_recent;
				595	tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
				596	sock_hold(sk2);
				597	goto unique;
				598	} else
				599	goto not_unique;
				600	}
				601	}
				602	tw = NULL;
				603
				604	/* And established part... */
				605	sk_for_each(sk2, node, &head->chain) {
				606	if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
				607	goto not_unique;
				608	}
				609
				610	unique:
				611	/* Must record num and sport now. Otherwise we will see
				612	* in hash table socket with a funny identity. */
				613	inet->num = lport;
				614	inet->sport = htons(lport);
				615	sk->sk_hashent = hash;
				616	BUG_TRAP(sk_unhashed(sk));
				617	__sk_add_node(sk, &head->chain);
				618	sock_prot_inc_use(sk->sk_prot);
				619	write_unlock(&head->lock);
				620
				621	if (twp) {
				622	*twp = tw;
				623	NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
				624	} else if (tw) {
				625	/* Silly. Should hash-dance instead... */
				626	tcp_tw_deschedule(tw);
				627	NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
				628
				629	tcp_tw_put(tw);
				630	}
				631
				632	return 0;
				633
				634	not_unique:
				635	write_unlock(&head->lock);
				636	return -EADDRNOTAVAIL;
				637	}
				638
				639	static inline u32 connect_port_offset(const struct sock *sk)
				640	{
				641	const struct inet_sock *inet = inet_sk(sk);
				642
				643	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
				644	inet->dport);
				645	}
				646
				647	/*
				648	* Bind a port for a connect operation and hash it.
				649	*/
				650	static inline int tcp_v4_hash_connect(struct sock *sk)
				651	{
				652	unsigned short snum = inet_sk(sk)->num;
				653	struct tcp_bind_hashbucket *head;
				654	struct tcp_bind_bucket *tb;
				655	int ret;
				656
				657	if (!snum) {
				658	int low = sysctl_local_port_range[0];
				659	int high = sysctl_local_port_range[1];
				660	int range = high - low;
				661	int i;
				662	int port;
				663	static u32 hint;
				664	u32 offset = hint + connect_port_offset(sk);
				665	struct hlist_node *node;
				666	struct tcp_tw_bucket *tw = NULL;
				667
				668	local_bh_disable();
				669	for (i = 1; i <= range; i++) {
				670	port = low + (i + offset) % range;
				671	head = &tcp_bhash[tcp_bhashfn(port)];
				672	spin_lock(&head->lock);
				673
				674	/* Does not bother with rcv_saddr checks,
				675	* because the established check is already
				676	* unique enough.
				677	*/
				678	tb_for_each(tb, node, &head->chain) {
				679	if (tb->port == port) {
				680	BUG_TRAP(!hlist_empty(&tb->owners));
				681	if (tb->fastreuse >= 0)
				682	goto next_port;
				683	if (!__tcp_v4_check_established(sk,
				684	port,
				685	&tw))
				686	goto ok;
				687	goto next_port;
				688	}
				689	}
				690
				691	tb = tcp_bucket_create(head, port);
				692	if (!tb) {
				693	spin_unlock(&head->lock);
				694	break;
				695	}
				696	tb->fastreuse = -1;
				697	goto ok;
				698
				699	next_port:
				700	spin_unlock(&head->lock);
				701	}
				702	local_bh_enable();
				703
				704	return -EADDRNOTAVAIL;
				705
				706	ok:
				707	hint += i;
				708
				709	/* Head lock still held and bh's disabled */
				710	tcp_bind_hash(sk, tb, port);
				711	if (sk_unhashed(sk)) {
				712	inet_sk(sk)->sport = htons(port);
				713	__tcp_v4_hash(sk, 0);
				714	}
				715	spin_unlock(&head->lock);
				716
				717	if (tw) {
				718	tcp_tw_deschedule(tw);
				719	tcp_tw_put(tw);
				720	}
				721
				722	ret = 0;
				723	goto out;
				724	}
				725
				726	head = &tcp_bhash[tcp_bhashfn(snum)];
				727	tb = tcp_sk(sk)->bind_hash;
				728	spin_lock_bh(&head->lock);
				729	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
				730	__tcp_v4_hash(sk, 0);
				731	spin_unlock_bh(&head->lock);
				732	return 0;
				733	} else {
				734	spin_unlock(&head->lock);
				735	/* No definite answer... Walk to established hash table */
				736	ret = __tcp_v4_check_established(sk, snum, NULL);
				737	out:
				738	local_bh_enable();
				739	return ret;
				740	}
				741	}
				742
				743	/* This will initiate an outgoing connection. */
				744	int tcp_v4_connect(struct sock sk, struct sockaddr uaddr, int addr_len)
				745	{
				746	struct inet_sock *inet = inet_sk(sk);
				747	struct tcp_sock *tp = tcp_sk(sk);
				748	struct sockaddr_in usin = (struct sockaddr_in )uaddr;
				749	struct rtable *rt;
				750	u32 daddr, nexthop;
				751	int tmp;
				752	int err;
				753
				754	if (addr_len < sizeof(struct sockaddr_in))
				755	return -EINVAL;
				756
				757	if (usin->sin_family != AF_INET)
				758	return -EAFNOSUPPORT;
				759
				760	nexthop = daddr = usin->sin_addr.s_addr;
				761	if (inet->opt && inet->opt->srr) {
				762	if (!daddr)
				763	return -EINVAL;
				764	nexthop = inet->opt->faddr;
				765	}
				766
				767	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
				768	RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
				769	IPPROTO_TCP,
				770	inet->sport, usin->sin_port, sk);
				771	if (tmp < 0)
				772	return tmp;
				773
				774	if (rt->rt_flags & (RTCF_MULTICAST \| RTCF_BROADCAST)) {
				775	ip_rt_put(rt);
				776	return -ENETUNREACH;
				777	}
				778
				779	if (!inet->opt \|\| !inet->opt->srr)
				780	daddr = rt->rt_dst;
				781
				782	if (!inet->saddr)
				783	inet->saddr = rt->rt_src;
				784	inet->rcv_saddr = inet->saddr;
				785
				786	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
				787	/* Reset inherited state */
				788	tp->rx_opt.ts_recent = 0;
				789	tp->rx_opt.ts_recent_stamp = 0;
				790	tp->write_seq = 0;
				791	}
				792
				793	if (sysctl_tcp_tw_recycle &&
				794	!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
				795	struct inet_peer *peer = rt_get_peer(rt);
				796
				797	/* VJ's idea. We save last timestamp seen from
				798	* the destination in peer table, when entering state TIME-WAIT
				799	* and initialize rx_opt.ts_recent from it, when trying new connection.
				800	*/
				801
				802	if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
				803	tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				804	tp->rx_opt.ts_recent = peer->tcp_ts;
				805	}
				806	}
				807
				808	inet->dport = usin->sin_port;
				809	inet->daddr = daddr;
				810
				811	tp->ext_header_len = 0;
				812	if (inet->opt)
				813	tp->ext_header_len = inet->opt->optlen;
				814
				815	tp->rx_opt.mss_clamp = 536;
				816
				817	/* Socket identity is still unknown (sport may be zero).
				818	* However we set state to SYN-SENT and not releasing socket
				819	* lock select source port, enter ourselves into the hash tables and
				820	* complete initialization after this.
				821	*/
				822	tcp_set_state(sk, TCP_SYN_SENT);
				823	err = tcp_v4_hash_connect(sk);
				824	if (err)
				825	goto failure;
				826
				827	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
				828	if (err)
				829	goto failure;
				830
				831	/* OK, now commit destination to socket. */
				832	__sk_dst_set(sk, &rt->u.dst);
				833	tcp_v4_setup_caps(sk, &rt->u.dst);
				834
				835	if (!tp->write_seq)
				836	tp->write_seq = secure_tcp_sequence_number(inet->saddr,
				837	inet->daddr,
				838	inet->sport,
				839	usin->sin_port);
				840
				841	inet->id = tp->write_seq ^ jiffies;
				842
				843	err = tcp_connect(sk);
				844	rt = NULL;
				845	if (err)
				846	goto failure;
				847
				848	return 0;
				849
				850	failure:
				851	/* This unhashes the socket and releases the local port, if necessary. */
				852	tcp_set_state(sk, TCP_CLOSE);
				853	ip_rt_put(rt);
				854	sk->sk_route_caps = 0;
				855	inet->dport = 0;
				856	return err;
				857	}
				858
				859	static __inline__ int tcp_v4_iif(struct sk_buff *skb)
				860	{
				861	return ((struct rtable *)skb->dst)->rt_iif;
				862	}
				863
				864	static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
				865	{
				866	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
				867	}
				868
				869	static struct open_request tcp_v4_search_req(struct tcp_sock tp,
				870	struct open_request ***prevp,
				871	__u16 rport,
				872	__u32 raddr, __u32 laddr)
				873	{
				874	struct tcp_listen_opt *lopt = tp->listen_opt;
				875	struct open_request req, *prev;
				876
				877	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
				878	(req = *prev) != NULL;
				879	prev = &req->dl_next) {
				880	if (req->rmt_port == rport &&
				881	req->af.v4_req.rmt_addr == raddr &&
				882	req->af.v4_req.loc_addr == laddr &&
				883	TCP_INET_FAMILY(req->class->family)) {
				884	BUG_TRAP(!req->sk);
				885	*prevp = prev;
				886	break;
				887	}
				888	}
				889
				890	return req;
				891	}
				892
				893	static void tcp_v4_synq_add(struct sock sk, struct open_request req)
				894	{
				895	struct tcp_sock *tp = tcp_sk(sk);
				896	struct tcp_listen_opt *lopt = tp->listen_opt;
				897	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
				898
				899	req->expires = jiffies + TCP_TIMEOUT_INIT;
				900	req->retrans = 0;
				901	req->sk = NULL;
				902	req->dl_next = lopt->syn_table[h];
				903
				904	write_lock(&tp->syn_wait_lock);
				905	lopt->syn_table[h] = req;
				906	write_unlock(&tp->syn_wait_lock);
				907
				908	tcp_synq_added(sk);
				909	}
				910
				911
				912	/*
				913	* This routine does path mtu discovery as defined in RFC1191.
				914	*/
				915	static inline void do_pmtu_discovery(struct sock sk, struct iphdr iph,
				916	u32 mtu)
				917	{
				918	struct dst_entry *dst;
				919	struct inet_sock *inet = inet_sk(sk);
				920	struct tcp_sock *tp = tcp_sk(sk);
				921
				922	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
				923	* send out by Linux are always <576bytes so they should go through
				924	* unfragmented).
				925	*/
				926	if (sk->sk_state == TCP_LISTEN)
				927	return;
				928
				929	/* We don't check in the destentry if pmtu discovery is forbidden
				930	* on this route. We just assume that no packet_to_big packets
				931	* are send back when pmtu discovery is not active.
				932	* There is a small race when the user changes this flag in the
				933	* route, but I think that's acceptable.
				934	*/
				935	if ((dst = __sk_dst_check(sk, 0)) == NULL)
				936	return;
				937
				938	dst->ops->update_pmtu(dst, mtu);
				939
				940	/* Something is about to be wrong... Remember soft error
				941	* for the case, if this connection will not able to recover.
				942	*/
				943	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
				944	sk->sk_err_soft = EMSGSIZE;
				945
				946	mtu = dst_mtu(dst);
				947
				948	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
				949	tp->pmtu_cookie > mtu) {
				950	tcp_sync_mss(sk, mtu);
				951
				952	/* Resend the TCP packet because it's
				953	* clear that the old packet has been
				954	* dropped. This is the new "fast" path mtu
				955	* discovery.
				956	*/
				957	tcp_simple_retransmit(sk);
				958	} /* else let the usual retransmit timer handle it */
				959	}
				960
				961	/*
				962	* This routine is called by the ICMP module when it gets some
				963	* sort of error condition. If err < 0 then the socket should
				964	* be closed and the error returned to the user. If err > 0
				965	* it's just the icmp type << 8 \| icmp code. After adjustment
				966	* header points to the first 8 bytes of the tcp header. We need
				967	* to find the appropriate port.
				968	*
				969	* The locking strategy used here is very "optimistic". When
				970	* someone else accesses the socket the ICMP is just dropped
				971	* and for some paths there is no check at all.
				972	* A more general error queue to queue errors for later handling
				973	* is probably better.
				974	*
				975	*/
				976
				977	void tcp_v4_err(struct sk_buff *skb, u32 info)
				978	{
				979	struct iphdr iph = (struct iphdr )skb->data;
				980	struct tcphdr th = (struct tcphdr )(skb->data + (iph->ihl << 2));
				981	struct tcp_sock *tp;
				982	struct inet_sock *inet;
				983	int type = skb->h.icmph->type;
				984	int code = skb->h.icmph->code;
				985	struct sock *sk;
				986	__u32 seq;
				987	int err;
				988
				989	if (skb->len < (iph->ihl << 2) + 8) {
				990	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
				991	return;
				992	}
				993
				994	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
				995	th->source, tcp_v4_iif(skb));
				996	if (!sk) {
				997	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
				998	return;
				999	}
				1000	if (sk->sk_state == TCP_TIME_WAIT) {
				1001	tcp_tw_put((struct tcp_tw_bucket *)sk);
				1002	return;
				1003	}
				1004
				1005	bh_lock_sock(sk);
				1006	/* If too many ICMPs get dropped on busy
				1007	* servers this needs to be solved differently.
				1008	*/
				1009	if (sock_owned_by_user(sk))
				1010	NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
				1011
				1012	if (sk->sk_state == TCP_CLOSE)
				1013	goto out;
				1014
				1015	tp = tcp_sk(sk);
				1016	seq = ntohl(th->seq);
				1017	if (sk->sk_state != TCP_LISTEN &&
				1018	!between(seq, tp->snd_una, tp->snd_nxt)) {
				1019	NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
				1020	goto out;
				1021	}
				1022
				1023	switch (type) {
				1024	case ICMP_SOURCE_QUENCH:
				1025	/* Just silently ignore these. */
				1026	goto out;
				1027	case ICMP_PARAMETERPROB:
				1028	err = EPROTO;
				1029	break;
				1030	case ICMP_DEST_UNREACH:
				1031	if (code > NR_ICMP_UNREACH)
				1032	goto out;
				1033
				1034	if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
				1035	if (!sock_owned_by_user(sk))
				1036	do_pmtu_discovery(sk, iph, info);
				1037	goto out;
				1038	}
				1039
				1040	err = icmp_err_convert[code].errno;
				1041	break;
				1042	case ICMP_TIME_EXCEEDED:
				1043	err = EHOSTUNREACH;
				1044	break;
				1045	default:
				1046	goto out;
				1047	}
				1048
				1049	switch (sk->sk_state) {
				1050	struct open_request req, *prev;
				1051	case TCP_LISTEN:
				1052	if (sock_owned_by_user(sk))
				1053	goto out;
				1054
				1055	req = tcp_v4_search_req(tp, &prev, th->dest,
				1056	iph->daddr, iph->saddr);
				1057	if (!req)
				1058	goto out;
				1059
				1060	/* ICMPs are not backlogged, hence we cannot get
				1061	an established socket here.
				1062	*/
				1063	BUG_TRAP(!req->sk);
				1064
				1065	if (seq != req->snt_isn) {
				1066	NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
				1067	goto out;
				1068	}
				1069
				1070	/*
				1071	* Still in SYN_RECV, just remove it silently.
				1072	* There is no good way to pass the error to the newly
				1073	* created socket, and POSIX does not want network
				1074	* errors returned from accept().
				1075	*/
				1076	tcp_synq_drop(sk, req, prev);
				1077	goto out;
				1078
				1079	case TCP_SYN_SENT:
				1080	case TCP_SYN_RECV: /* Cannot happen.
				1081	It can f.e. if SYNs crossed.
				1082	*/
				1083	if (!sock_owned_by_user(sk)) {
				1084	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
				1085	sk->sk_err = err;
				1086
				1087	sk->sk_error_report(sk);
				1088
				1089	tcp_done(sk);
				1090	} else {
				1091	sk->sk_err_soft = err;
				1092	}
				1093	goto out;
				1094	}
				1095
				1096	/* If we've already connected we will keep trying
				1097	* until we time out, or the user gives up.
				1098	*
				1099	* rfc1122 4.2.3.9 allows to consider as hard errors
				1100	* only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
				1101	* but it is obsoleted by pmtu discovery).
				1102	*
				1103	* Note, that in modern internet, where routing is unreliable
				1104	* and in each dark corner broken firewalls sit, sending random
				1105	* errors ordered by their masters even this two messages finally lose
				1106	* their original sense (even Linux sends invalid PORT_UNREACHs)
				1107	*
				1108	* Now we are in compliance with RFCs.
				1109	* --ANK (980905)
				1110	*/
				1111
				1112	inet = inet_sk(sk);
				1113	if (!sock_owned_by_user(sk) && inet->recverr) {
				1114	sk->sk_err = err;
				1115	sk->sk_error_report(sk);
				1116	} else { /* Only an error on timeout */
				1117	sk->sk_err_soft = err;
				1118	}
				1119
				1120	out:
				1121	bh_unlock_sock(sk);
				1122	sock_put(sk);
				1123	}
				1124
				1125	/* This routine computes an IPv4 TCP checksum. */
				1126	void tcp_v4_send_check(struct sock sk, struct tcphdr th, int len,
				1127	struct sk_buff *skb)
				1128	{
				1129	struct inet_sock *inet = inet_sk(sk);
				1130
				1131	if (skb->ip_summed == CHECKSUM_HW) {
				1132	th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
				1133	skb->csum = offsetof(struct tcphdr, check);
				1134	} else {
				1135	th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
				1136	csum_partial((char *)th,
				1137	th->doff << 2,
				1138	skb->csum));
				1139	}
				1140	}
				1141
				1142	/*
				1143	* This routine will send an RST to the other tcp.
				1144	*
				1145	* Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
				1146	* for reset.
				1147	* Answer: if a packet caused RST, it is not for a socket
				1148	* existing in our system, if it is matched to a socket,
				1149	* it is just duplicate segment or bug in other side's TCP.
				1150	* So that we build reply only basing on parameters
				1151	* arrived with segment.
				1152	* Exception: precedence violation. We do not implement it in any case.
				1153	*/
				1154
				1155	static void tcp_v4_send_reset(struct sk_buff *skb)
				1156	{
				1157	struct tcphdr *th = skb->h.th;
				1158	struct tcphdr rth;
				1159	struct ip_reply_arg arg;
				1160
				1161	/* Never send a reset in response to a reset. */
				1162	if (th->rst)
				1163	return;
				1164
				1165	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
				1166	return;
				1167
				1168	/* Swap the send and the receive. */
				1169	memset(&rth, 0, sizeof(struct tcphdr));
				1170	rth.dest = th->source;
				1171	rth.source = th->dest;
				1172	rth.doff = sizeof(struct tcphdr) / 4;
				1173	rth.rst = 1;
				1174
				1175	if (th->ack) {
				1176	rth.seq = th->ack_seq;
				1177	} else {
				1178	rth.ack = 1;
				1179	rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				1180	skb->len - (th->doff << 2));
				1181	}
				1182
				1183	memset(&arg, 0, sizeof arg);
				1184	arg.iov[0].iov_base = (unsigned char *)&rth;
				1185	arg.iov[0].iov_len = sizeof rth;
				1186	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				1187	skb->nh.iph->saddr, /XXX/
				1188	sizeof(struct tcphdr), IPPROTO_TCP, 0);
				1189	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
				1190
				1191	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
				1192
				1193	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
				1194	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
				1195	}
				1196
				1197	/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
				1198	outside socket context is ugly, certainly. What can I do?
				1199	*/
				1200
				1201	static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
				1202	u32 win, u32 ts)
				1203	{
				1204	struct tcphdr *th = skb->h.th;
				1205	struct {
				1206	struct tcphdr th;
				1207	u32 tsopt[3];
				1208	} rep;
				1209	struct ip_reply_arg arg;
				1210
				1211	memset(&rep.th, 0, sizeof(struct tcphdr));
				1212	memset(&arg, 0, sizeof arg);
				1213
				1214	arg.iov[0].iov_base = (unsigned char *)&rep;
				1215	arg.iov[0].iov_len = sizeof(rep.th);
				1216	if (ts) {
				1217	rep.tsopt[0] = htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				1218	(TCPOPT_TIMESTAMP << 8) \|
				1219	TCPOLEN_TIMESTAMP);
				1220	rep.tsopt[1] = htonl(tcp_time_stamp);
				1221	rep.tsopt[2] = htonl(ts);
				1222	arg.iov[0].iov_len = sizeof(rep);
				1223	}
				1224
				1225	/* Swap the send and the receive. */
				1226	rep.th.dest = th->source;
				1227	rep.th.source = th->dest;
				1228	rep.th.doff = arg.iov[0].iov_len / 4;
				1229	rep.th.seq = htonl(seq);
				1230	rep.th.ack_seq = htonl(ack);
				1231	rep.th.ack = 1;
				1232	rep.th.window = htons(win);
				1233
				1234	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				1235	skb->nh.iph->saddr, /XXX/
				1236	arg.iov[0].iov_len, IPPROTO_TCP, 0);
				1237	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
				1238
				1239	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
				1240
				1241	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
				1242	}
				1243
				1244	static void tcp_v4_timewait_ack(struct sock sk, struct sk_buff skb)
				1245	{
				1246	struct tcp_tw_bucket tw = (struct tcp_tw_bucket )sk;
				1247
				1248	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
				1249	tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
				1250
				1251	tcp_tw_put(tw);
				1252	}
				1253
				1254	static void tcp_v4_or_send_ack(struct sk_buff skb, struct open_request req)
				1255	{
				1256	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
				1257	req->ts_recent);
				1258	}
				1259
				1260	static struct dst_entry* tcp_v4_route_req(struct sock *sk,
				1261	struct open_request *req)
				1262	{
				1263	struct rtable *rt;
				1264	struct ip_options *opt = req->af.v4_req.opt;
				1265	struct flowi fl = { .oif = sk->sk_bound_dev_if,
				1266	.nl_u = { .ip4_u =
				1267	{ .daddr = ((opt && opt->srr) ?
				1268	opt->faddr :
				1269	req->af.v4_req.rmt_addr),
				1270	.saddr = req->af.v4_req.loc_addr,
				1271	.tos = RT_CONN_FLAGS(sk) } },
				1272	.proto = IPPROTO_TCP,
				1273	.uli_u = { .ports =
				1274	{ .sport = inet_sk(sk)->sport,
				1275	.dport = req->rmt_port } } };
				1276
				1277	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
				1278	IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
				1279	return NULL;
				1280	}
				1281	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
				1282	ip_rt_put(rt);
				1283	IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
				1284	return NULL;
				1285	}
				1286	return &rt->u.dst;
				1287	}
				1288
				1289	/*
				1290	* Send a SYN-ACK after having received a SYN.
				1291	* This still operates on a open_request only, not on a big
				1292	* socket.
				1293	*/
				1294	static int tcp_v4_send_synack(struct sock sk, struct open_request req,
				1295	struct dst_entry *dst)
				1296	{
				1297	int err = -1;
				1298	struct sk_buff * skb;
				1299
				1300	/* First, grab a route. */
				1301	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
				1302	goto out;
				1303
				1304	skb = tcp_make_synack(sk, dst, req);
				1305
				1306	if (skb) {
				1307	struct tcphdr *th = skb->h.th;
				1308
				1309	th->check = tcp_v4_check(th, skb->len,
				1310	req->af.v4_req.loc_addr,
				1311	req->af.v4_req.rmt_addr,
				1312	csum_partial((char *)th, skb->len,
				1313	skb->csum));
				1314
				1315	err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
				1316	req->af.v4_req.rmt_addr,
				1317	req->af.v4_req.opt);
				1318	if (err == NET_XMIT_CN)
				1319	err = 0;
				1320	}
				1321
				1322	out:
				1323	dst_release(dst);
				1324	return err;
				1325	}
				1326
				1327	/*
				1328	* IPv4 open_request destructor.
				1329	*/
				1330	static void tcp_v4_or_free(struct open_request *req)
				1331	{
				1332	if (req->af.v4_req.opt)
				1333	kfree(req->af.v4_req.opt);
				1334	}
				1335
				1336	static inline void syn_flood_warning(struct sk_buff *skb)
				1337	{
				1338	static unsigned long warntime;
				1339
				1340	if (time_after(jiffies, (warntime + HZ * 60))) {
				1341	warntime = jiffies;
				1342	printk(KERN_INFO
				1343	"possible SYN flooding on port %d. Sending cookies.\n",
				1344	ntohs(skb->h.th->dest));
				1345	}
				1346	}
				1347
				1348	/*
				1349	* Save and compile IPv4 options into the open_request if needed.
				1350	*/
				1351	static inline struct ip_options tcp_v4_save_options(struct sock sk,
				1352	struct sk_buff *skb)
				1353	{
				1354	struct ip_options *opt = &(IPCB(skb)->opt);
				1355	struct ip_options *dopt = NULL;
				1356
				1357	if (opt && opt->optlen) {
				1358	int opt_size = optlength(opt);
				1359	dopt = kmalloc(opt_size, GFP_ATOMIC);
				1360	if (dopt) {
				1361	if (ip_options_echo(dopt, skb)) {
				1362	kfree(dopt);
				1363	dopt = NULL;
				1364	}
				1365	}
				1366	}
				1367	return dopt;
				1368	}
				1369
				1370	/*
				1371	* Maximum number of SYN_RECV sockets in queue per LISTEN socket.
				1372	* One SYN_RECV socket costs about 80bytes on a 32bit machine.
				1373	* It would be better to replace it with a global counter for all sockets
				1374	* but then some measure against one socket starving all other sockets
				1375	* would be needed.
				1376	*
				1377	* It was 128 by default. Experiments with real servers show, that
				1378	* it is absolutely not enough even at 100conn/sec. 256 cures most
				1379	* of problems. This value is adjusted to 128 for very small machines
				1380	* (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
				1381	* Further increasing requires to change hash table size.
				1382	*/
				1383	int sysctl_max_syn_backlog = 256;
				1384
				1385	struct or_calltable or_ipv4 = {
				1386	.family = PF_INET,
				1387	.rtx_syn_ack = tcp_v4_send_synack,
				1388	.send_ack = tcp_v4_or_send_ack,
				1389	.destructor = tcp_v4_or_free,
				1390	.send_reset = tcp_v4_send_reset,
				1391	};
				1392
				1393	int tcp_v4_conn_request(struct sock sk, struct sk_buff skb)
				1394	{
				1395	struct tcp_options_received tmp_opt;
				1396	struct open_request *req;
				1397	__u32 saddr = skb->nh.iph->saddr;
				1398	__u32 daddr = skb->nh.iph->daddr;
				1399	__u32 isn = TCP_SKB_CB(skb)->when;
				1400	struct dst_entry *dst = NULL;
				1401	#ifdef CONFIG_SYN_COOKIES
				1402	int want_cookie = 0;
				1403	#else
				1404	#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
				1405	#endif
				1406
				1407	/* Never answer to SYNs send to broadcast or multicast */
				1408	if (((struct rtable *)skb->dst)->rt_flags &
				1409	(RTCF_BROADCAST \| RTCF_MULTICAST))
				1410	goto drop;
				1411
				1412	/* TW buckets are converted to open requests without
				1413	* limitations, they conserve resources and peer is
				1414	* evidently real one.
				1415	*/
				1416	if (tcp_synq_is_full(sk) && !isn) {
				1417	#ifdef CONFIG_SYN_COOKIES
				1418	if (sysctl_tcp_syncookies) {
				1419	want_cookie = 1;
				1420	} else
				1421	#endif
				1422	goto drop;
				1423	}
				1424
				1425	/* Accept backlog is full. If we have already queued enough
				1426	* of warm entries in syn queue, drop request. It is better than
				1427	* clogging syn queue with openreqs with exponentially increasing
				1428	* timeout.
				1429	*/
				1430	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
				1431	goto drop;
				1432
				1433	req = tcp_openreq_alloc();
				1434	if (!req)
				1435	goto drop;
				1436
				1437	tcp_clear_options(&tmp_opt);
				1438	tmp_opt.mss_clamp = 536;
				1439	tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
				1440
				1441	tcp_parse_options(skb, &tmp_opt, 0);
				1442
				1443	if (want_cookie) {
				1444	tcp_clear_options(&tmp_opt);
				1445	tmp_opt.saw_tstamp = 0;
				1446	}
				1447
				1448	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
				1449	/* Some OSes (unknown ones, but I see them on web server, which
				1450	* contains information interesting only for windows'
				1451	* users) do not send their stamp in SYN. It is easy case.
				1452	* We simply do not advertise TS support.
				1453	*/
				1454	tmp_opt.saw_tstamp = 0;
				1455	tmp_opt.tstamp_ok = 0;
				1456	}
				1457	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
				1458
				1459	tcp_openreq_init(req, &tmp_opt, skb);
				1460
				1461	req->af.v4_req.loc_addr = daddr;
				1462	req->af.v4_req.rmt_addr = saddr;
				1463	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
				1464	req->class = &or_ipv4;
				1465	if (!want_cookie)
				1466	TCP_ECN_create_request(req, skb->h.th);
				1467
				1468	if (want_cookie) {
				1469	#ifdef CONFIG_SYN_COOKIES
				1470	syn_flood_warning(skb);
				1471	#endif
				1472	isn = cookie_v4_init_sequence(sk, skb, &req->mss);
				1473	} else if (!isn) {
				1474	struct inet_peer *peer = NULL;
				1475
				1476	/* VJ's idea. We save last timestamp seen
				1477	* from the destination in peer table, when entering
				1478	* state TIME-WAIT, and check against it before
				1479	* accepting new connection request.
				1480	*
				1481	* If "isn" is not zero, this request hit alive
				1482	* timewait bucket, so that all the necessary checks
				1483	* are made in the function processing timewait state.
				1484	*/
				1485	if (tmp_opt.saw_tstamp &&
				1486	sysctl_tcp_tw_recycle &&
				1487	(dst = tcp_v4_route_req(sk, req)) != NULL &&
				1488	(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
				1489	peer->v4daddr == saddr) {
				1490	if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
				1491	(s32)(peer->tcp_ts - req->ts_recent) >
				1492	TCP_PAWS_WINDOW) {
				1493	NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				1494	dst_release(dst);
				1495	goto drop_and_free;
				1496	}
				1497	}
				1498	/* Kill the following clause, if you dislike this way. */
				1499	else if (!sysctl_tcp_syncookies &&
				1500	(sysctl_max_syn_backlog - tcp_synq_len(sk) <
				1501	(sysctl_max_syn_backlog >> 2)) &&
				1502	(!peer \|\| !peer->tcp_ts_stamp) &&
				1503	(!dst \|\| !dst_metric(dst, RTAX_RTT))) {
				1504	/* Without syncookies last quarter of
				1505	* backlog is filled with destinations,
				1506	* proven to be alive.
				1507	* It means that we continue to communicate
				1508	* to destinations, already remembered
				1509	* to the moment of synflood.
				1510	*/
				1511	NETDEBUG(if (net_ratelimit()) \
				1512	printk(KERN_DEBUG "TCP: drop open "
				1513	"request from %u.%u."
				1514	"%u.%u/%u\n", \
				1515	NIPQUAD(saddr),
				1516	ntohs(skb->h.th->source)));
				1517	dst_release(dst);
				1518	goto drop_and_free;
				1519	}
				1520
				1521	isn = tcp_v4_init_sequence(sk, skb);
				1522	}
				1523	req->snt_isn = isn;
				1524
				1525	if (tcp_v4_send_synack(sk, req, dst))
				1526	goto drop_and_free;
				1527
				1528	if (want_cookie) {
				1529	tcp_openreq_free(req);
				1530	} else {
				1531	tcp_v4_synq_add(sk, req);
				1532	}
				1533	return 0;
				1534
				1535	drop_and_free:
				1536	tcp_openreq_free(req);
				1537	drop:
				1538	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
				1539	return 0;
				1540	}
				1541
				1542
				1543	/*
				1544	* The three way handshake has completed - we got a valid synack -
				1545	* now create the new socket.
				1546	*/
				1547	struct sock tcp_v4_syn_recv_sock(struct sock sk, struct sk_buff *skb,
				1548	struct open_request *req,
				1549	struct dst_entry *dst)
				1550	{
				1551	struct inet_sock *newinet;
				1552	struct tcp_sock *newtp;
				1553	struct sock *newsk;
				1554
				1555	if (sk_acceptq_is_full(sk))
				1556	goto exit_overflow;
				1557
				1558	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
				1559	goto exit;
				1560
				1561	newsk = tcp_create_openreq_child(sk, req, skb);
				1562	if (!newsk)
				1563	goto exit;
				1564
				1565	newsk->sk_dst_cache = dst;
				1566	tcp_v4_setup_caps(newsk, dst);
				1567
				1568	newtp = tcp_sk(newsk);
				1569	newinet = inet_sk(newsk);
				1570	newinet->daddr = req->af.v4_req.rmt_addr;
				1571	newinet->rcv_saddr = req->af.v4_req.loc_addr;
				1572	newinet->saddr = req->af.v4_req.loc_addr;
				1573	newinet->opt = req->af.v4_req.opt;
				1574	req->af.v4_req.opt = NULL;
				1575	newinet->mc_index = tcp_v4_iif(skb);
				1576	newinet->mc_ttl = skb->nh.iph->ttl;
				1577	newtp->ext_header_len = 0;
				1578	if (newinet->opt)
				1579	newtp->ext_header_len = newinet->opt->optlen;
				1580	newinet->id = newtp->write_seq ^ jiffies;
				1581
				1582	tcp_sync_mss(newsk, dst_mtu(dst));
				1583	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
				1584	tcp_initialize_rcv_mss(newsk);
				1585
				1586	__tcp_v4_hash(newsk, 0);
				1587	__tcp_inherit_port(sk, newsk);
				1588
				1589	return newsk;
				1590
				1591	exit_overflow:
				1592	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
				1593	exit:
				1594	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
				1595	dst_release(dst);
				1596	return NULL;
				1597	}
				1598
				1599	static struct sock tcp_v4_hnd_req(struct sock sk, struct sk_buff *skb)
				1600	{
				1601	struct tcphdr *th = skb->h.th;
				1602	struct iphdr *iph = skb->nh.iph;
				1603	struct tcp_sock *tp = tcp_sk(sk);
				1604	struct sock *nsk;
				1605	struct open_request **prev;
				1606	/* Find possible connection requests. */
				1607	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
				1608	iph->saddr, iph->daddr);
				1609	if (req)
				1610	return tcp_check_req(sk, skb, req, prev);
				1611
				1612	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
				1613	th->source,
				1614	skb->nh.iph->daddr,
				1615	ntohs(th->dest),
				1616	tcp_v4_iif(skb));
				1617
				1618	if (nsk) {
				1619	if (nsk->sk_state != TCP_TIME_WAIT) {
				1620	bh_lock_sock(nsk);
				1621	return nsk;
				1622	}
				1623	tcp_tw_put((struct tcp_tw_bucket *)nsk);
				1624	return NULL;
				1625	}
				1626
				1627	#ifdef CONFIG_SYN_COOKIES
				1628	if (!th->rst && !th->syn && th->ack)
				1629	sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
				1630	#endif
				1631	return sk;
				1632	}
				1633
				1634	static int tcp_v4_checksum_init(struct sk_buff *skb)
				1635	{
				1636	if (skb->ip_summed == CHECKSUM_HW) {
				1637	skb->ip_summed = CHECKSUM_UNNECESSARY;
				1638	if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				1639	skb->nh.iph->daddr, skb->csum))
				1640	return 0;
				1641
				1642	NETDEBUG(if (net_ratelimit())
				1643	printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
				1644	skb->ip_summed = CHECKSUM_NONE;
				1645	}
				1646	if (skb->len <= 76) {
				1647	if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				1648	skb->nh.iph->daddr,
				1649	skb_checksum(skb, 0, skb->len, 0)))
				1650	return -1;
				1651	skb->ip_summed = CHECKSUM_UNNECESSARY;
				1652	} else {
				1653	skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
				1654	skb->nh.iph->saddr,
				1655	skb->nh.iph->daddr, 0);
				1656	}
				1657	return 0;
				1658	}
				1659
				1660
				1661	/* The socket must have its spinlock held when we get
				1662	* here.
				1663	*
				1664	* We have a potential double-lock case here, so even when
				1665	* doing backlog processing we use the BH locking scheme.
				1666	* This is because we cannot sleep with the original spinlock
				1667	* held.
				1668	*/
				1669	int tcp_v4_do_rcv(struct sock sk, struct sk_buff skb)
				1670	{
				1671	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
				1672	TCP_CHECK_TIMER(sk);
				1673	if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
				1674	goto reset;
				1675	TCP_CHECK_TIMER(sk);
				1676	return 0;
				1677	}
				1678
				1679	if (skb->len < (skb->h.th->doff << 2) \|\| tcp_checksum_complete(skb))
				1680	goto csum_err;
				1681
				1682	if (sk->sk_state == TCP_LISTEN) {
				1683	struct sock *nsk = tcp_v4_hnd_req(sk, skb);
				1684	if (!nsk)
				1685	goto discard;
				1686
				1687	if (nsk != sk) {
				1688	if (tcp_child_process(sk, nsk, skb))
				1689	goto reset;
				1690	return 0;
				1691	}
				1692	}
				1693
				1694	TCP_CHECK_TIMER(sk);
				1695	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
				1696	goto reset;
				1697	TCP_CHECK_TIMER(sk);
				1698	return 0;
				1699
				1700	reset:
				1701	tcp_v4_send_reset(skb);
				1702	discard:
				1703	kfree_skb(skb);
				1704	/* Be careful here. If this function gets more complicated and
				1705	* gcc suffers from register pressure on the x86, sk (in %ebx)
				1706	* might be destroyed here. This current version compiles correctly,
				1707	* but you have been warned.
				1708	*/
				1709	return 0;
				1710
				1711	csum_err:
				1712	TCP_INC_STATS_BH(TCP_MIB_INERRS);
				1713	goto discard;
				1714	}
				1715
				1716	/*
				1717	* From tcp_input.c
				1718	*/
				1719
				1720	int tcp_v4_rcv(struct sk_buff *skb)
				1721	{
				1722	struct tcphdr *th;
				1723	struct sock *sk;
				1724	int ret;
				1725
				1726	if (skb->pkt_type != PACKET_HOST)
				1727	goto discard_it;
				1728
				1729	/* Count it even if it's bad */
				1730	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
				1731
				1732	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
				1733	goto discard_it;
				1734
				1735	th = skb->h.th;
				1736
				1737	if (th->doff < sizeof(struct tcphdr) / 4)
				1738	goto bad_packet;
				1739	if (!pskb_may_pull(skb, th->doff * 4))
				1740	goto discard_it;
				1741
				1742	/* An explanation is required here, I think.
				1743	* Packet length and doff are validated by header prediction,
				1744	* provided case of th->doff==0 is elimineted.
				1745	* So, we defer the checks. */
				1746	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
				1747	tcp_v4_checksum_init(skb) < 0))
				1748	goto bad_packet;
				1749
				1750	th = skb->h.th;
				1751	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
				1752	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				1753	skb->len - th->doff * 4);
				1754	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
				1755	TCP_SKB_CB(skb)->when = 0;
				1756	TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
				1757	TCP_SKB_CB(skb)->sacked = 0;
				1758
				1759	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
				1760	skb->nh.iph->daddr, ntohs(th->dest),
				1761	tcp_v4_iif(skb));
				1762
				1763	if (!sk)
				1764	goto no_tcp_socket;
				1765
				1766	process:
				1767	if (sk->sk_state == TCP_TIME_WAIT)
				1768	goto do_time_wait;
				1769
				1770	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
				1771	goto discard_and_relse;
				1772
				1773	if (sk_filter(sk, skb, 0))
				1774	goto discard_and_relse;
				1775
				1776	skb->dev = NULL;
				1777
				1778	bh_lock_sock(sk);
				1779	ret = 0;
				1780	if (!sock_owned_by_user(sk)) {
				1781	if (!tcp_prequeue(sk, skb))
				1782	ret = tcp_v4_do_rcv(sk, skb);
				1783	} else
				1784	sk_add_backlog(sk, skb);
				1785	bh_unlock_sock(sk);
				1786
				1787	sock_put(sk);
				1788
				1789	return ret;
				1790
				1791	no_tcp_socket:
				1792	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
				1793	goto discard_it;
				1794
				1795	if (skb->len < (th->doff << 2) \|\| tcp_checksum_complete(skb)) {
				1796	bad_packet:
				1797	TCP_INC_STATS_BH(TCP_MIB_INERRS);
				1798	} else {
				1799	tcp_v4_send_reset(skb);
				1800	}
				1801
				1802	discard_it:
				1803	/* Discard frame. */
				1804	kfree_skb(skb);
				1805	return 0;
				1806
				1807	discard_and_relse:
				1808	sock_put(sk);
				1809	goto discard_it;
				1810
				1811	do_time_wait:
				1812	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				1813	tcp_tw_put((struct tcp_tw_bucket *) sk);
				1814	goto discard_it;
				1815	}
				1816
				1817	if (skb->len < (th->doff << 2) \|\| tcp_checksum_complete(skb)) {
				1818	TCP_INC_STATS_BH(TCP_MIB_INERRS);
				1819	tcp_tw_put((struct tcp_tw_bucket *) sk);
				1820	goto discard_it;
				1821	}
				1822	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
				1823	skb, th, skb->len)) {
				1824	case TCP_TW_SYN: {
				1825	struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
				1826	ntohs(th->dest),
				1827	tcp_v4_iif(skb));
				1828	if (sk2) {
				1829	tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
				1830	tcp_tw_put((struct tcp_tw_bucket *)sk);
				1831	sk = sk2;
				1832	goto process;
				1833	}
				1834	/* Fall through to ACK */
				1835	}
				1836	case TCP_TW_ACK:
				1837	tcp_v4_timewait_ack(sk, skb);
				1838	break;
				1839	case TCP_TW_RST:
				1840	goto no_tcp_socket;
				1841	case TCP_TW_SUCCESS:;
				1842	}
				1843	goto discard_it;
				1844	}
				1845
				1846	/* With per-bucket locks this operation is not-atomic, so that
				1847	* this version is not worse.
				1848	*/
				1849	static void __tcp_v4_rehash(struct sock *sk)
				1850	{
				1851	sk->sk_prot->unhash(sk);
				1852	sk->sk_prot->hash(sk);
				1853	}
				1854
				1855	static int tcp_v4_reselect_saddr(struct sock *sk)
				1856	{
				1857	struct inet_sock *inet = inet_sk(sk);
				1858	int err;
				1859	struct rtable *rt;
				1860	__u32 old_saddr = inet->saddr;
				1861	__u32 new_saddr;
				1862	__u32 daddr = inet->daddr;
				1863
				1864	if (inet->opt && inet->opt->srr)
				1865	daddr = inet->opt->faddr;
				1866
				1867	/* Query new route. */
				1868	err = ip_route_connect(&rt, daddr, 0,
				1869	RT_CONN_FLAGS(sk),
				1870	sk->sk_bound_dev_if,
				1871	IPPROTO_TCP,
				1872	inet->sport, inet->dport, sk);
				1873	if (err)
				1874	return err;
				1875
				1876	__sk_dst_set(sk, &rt->u.dst);
				1877	tcp_v4_setup_caps(sk, &rt->u.dst);
				1878
				1879	new_saddr = rt->rt_src;
				1880
				1881	if (new_saddr == old_saddr)
				1882	return 0;
				1883
				1884	if (sysctl_ip_dynaddr > 1) {
				1885	printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				1886	"saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
				1887	NIPQUAD(old_saddr),
				1888	NIPQUAD(new_saddr));
				1889	}
				1890
				1891	inet->saddr = new_saddr;
				1892	inet->rcv_saddr = new_saddr;
				1893
				1894	/* XXX The only one ugly spot where we need to
				1895	* XXX really change the sockets identity after
				1896	* XXX it has entered the hashes. -DaveM
				1897	*
				1898	* Besides that, it does not check for connection
				1899	* uniqueness. Wait for troubles.
				1900	*/
				1901	__tcp_v4_rehash(sk);
				1902	return 0;
				1903	}
				1904
				1905	int tcp_v4_rebuild_header(struct sock *sk)
				1906	{
				1907	struct inet_sock *inet = inet_sk(sk);
				1908	struct rtable rt = (struct rtable )__sk_dst_check(sk, 0);
				1909	u32 daddr;
				1910	int err;
				1911
				1912	/* Route is OK, nothing to do. */
				1913	if (rt)
				1914	return 0;
				1915
				1916	/* Reroute. */
				1917	daddr = inet->daddr;
				1918	if (inet->opt && inet->opt->srr)
				1919	daddr = inet->opt->faddr;
				1920
				1921	{
				1922	struct flowi fl = { .oif = sk->sk_bound_dev_if,
				1923	.nl_u = { .ip4_u =
				1924	{ .daddr = daddr,
				1925	.saddr = inet->saddr,
				1926	.tos = RT_CONN_FLAGS(sk) } },
				1927	.proto = IPPROTO_TCP,
				1928	.uli_u = { .ports =
				1929	{ .sport = inet->sport,
				1930	.dport = inet->dport } } };
				1931
				1932	err = ip_route_output_flow(&rt, &fl, sk, 0);
				1933	}
				1934	if (!err) {
				1935	__sk_dst_set(sk, &rt->u.dst);
				1936	tcp_v4_setup_caps(sk, &rt->u.dst);
				1937	return 0;
				1938	}
				1939
				1940	/* Routing failed... */
				1941	sk->sk_route_caps = 0;
				1942
				1943	if (!sysctl_ip_dynaddr \|\|
				1944	sk->sk_state != TCP_SYN_SENT \|\|
				1945	(sk->sk_userlocks & SOCK_BINDADDR_LOCK) \|\|
				1946	(err = tcp_v4_reselect_saddr(sk)) != 0)
				1947	sk->sk_err_soft = -err;
				1948
				1949	return err;
				1950	}
				1951
				1952	static void v4_addr2sockaddr(struct sock sk, struct sockaddr uaddr)
				1953	{
				1954	struct sockaddr_in sin = (struct sockaddr_in ) uaddr;
				1955	struct inet_sock *inet = inet_sk(sk);
				1956
				1957	sin->sin_family = AF_INET;
				1958	sin->sin_addr.s_addr = inet->daddr;
				1959	sin->sin_port = inet->dport;
				1960	}
				1961
				1962	/* VJ's idea. Save last timestamp seen from this destination
				1963	* and hold it at least for normal timewait interval to use for duplicate
				1964	* segment detection in subsequent connections, before they enter synchronized
				1965	* state.
				1966	*/
				1967
				1968	int tcp_v4_remember_stamp(struct sock *sk)
				1969	{
				1970	struct inet_sock *inet = inet_sk(sk);
				1971	struct tcp_sock *tp = tcp_sk(sk);
				1972	struct rtable rt = (struct rtable )__sk_dst_get(sk);
				1973	struct inet_peer *peer = NULL;
				1974	int release_it = 0;
				1975
				1976	if (!rt \|\| rt->rt_dst != inet->daddr) {
				1977	peer = inet_getpeer(inet->daddr, 1);
				1978	release_it = 1;
				1979	} else {
				1980	if (!rt->peer)
				1981	rt_bind_peer(rt, 1);
				1982	peer = rt->peer;
				1983	}
				1984
				1985	if (peer) {
				1986	if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 \|\|
				1987	(peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
				1988	peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
				1989	peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
				1990	peer->tcp_ts = tp->rx_opt.ts_recent;
				1991	}
				1992	if (release_it)
				1993	inet_putpeer(peer);
				1994	return 1;
				1995	}
				1996
				1997	return 0;
				1998	}
				1999
				2000	int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
				2001	{
				2002	struct inet_peer *peer = NULL;
				2003
				2004	peer = inet_getpeer(tw->tw_daddr, 1);
				2005
				2006	if (peer) {
				2007	if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 \|\|
				2008	(peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
				2009	peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
				2010	peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
				2011	peer->tcp_ts = tw->tw_ts_recent;
				2012	}
				2013	inet_putpeer(peer);
				2014	return 1;
				2015	}
				2016
				2017	return 0;
				2018	}
				2019
				2020	struct tcp_func ipv4_specific = {
				2021	.queue_xmit = ip_queue_xmit,
				2022	.send_check = tcp_v4_send_check,
				2023	.rebuild_header = tcp_v4_rebuild_header,
				2024	.conn_request = tcp_v4_conn_request,
				2025	.syn_recv_sock = tcp_v4_syn_recv_sock,
				2026	.remember_stamp = tcp_v4_remember_stamp,
				2027	.net_header_len = sizeof(struct iphdr),
				2028	.setsockopt = ip_setsockopt,
				2029	.getsockopt = ip_getsockopt,
				2030	.addr2sockaddr = v4_addr2sockaddr,
				2031	.sockaddr_len = sizeof(struct sockaddr_in),
				2032	};
				2033
				2034	/* NOTE: A lot of things set to zero explicitly by call to
				2035	* sk_alloc() so need not be done here.
				2036	*/
				2037	static int tcp_v4_init_sock(struct sock *sk)
				2038	{
				2039	struct tcp_sock *tp = tcp_sk(sk);
				2040
				2041	skb_queue_head_init(&tp->out_of_order_queue);
				2042	tcp_init_xmit_timers(sk);
				2043	tcp_prequeue_init(tp);
				2044
				2045	tp->rto = TCP_TIMEOUT_INIT;
				2046	tp->mdev = TCP_TIMEOUT_INIT;
				2047
				2048	/* So many TCP implementations out there (incorrectly) count the
				2049	* initial SYN frame in their delayed-ACK and congestion control
				2050	* algorithms that we must have the following bandaid to talk
				2051	* efficiently to them. -DaveM
				2052	*/
				2053	tp->snd_cwnd = 2;
				2054
				2055	/* See draft-stevens-tcpca-spec-01 for discussion of the
				2056	* initialization of these values.
				2057	*/
				2058	tp->snd_ssthresh = 0x7fffffff; /* Infinity */
				2059	tp->snd_cwnd_clamp = ~0;
				2060	tp->mss_cache_std = tp->mss_cache = 536;
				2061
				2062	tp->reordering = sysctl_tcp_reordering;
				2063
				2064	sk->sk_state = TCP_CLOSE;
				2065
				2066	sk->sk_write_space = sk_stream_write_space;
				2067	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
				2068
				2069	tp->af_specific = &ipv4_specific;
				2070
				2071	sk->sk_sndbuf = sysctl_tcp_wmem[1];
				2072	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
				2073
				2074	atomic_inc(&tcp_sockets_allocated);
				2075
				2076	return 0;
				2077	}
				2078
				2079	int tcp_v4_destroy_sock(struct sock *sk)
				2080	{
				2081	struct tcp_sock *tp = tcp_sk(sk);
				2082
				2083	tcp_clear_xmit_timers(sk);
				2084
				2085	/* Cleanup up the write buffer. */
				2086	sk_stream_writequeue_purge(sk);
				2087
				2088	/* Cleans up our, hopefully empty, out_of_order_queue. */
				2089	__skb_queue_purge(&tp->out_of_order_queue);
				2090
				2091	/* Clean prequeue, it must be empty really */
				2092	__skb_queue_purge(&tp->ucopy.prequeue);
				2093
				2094	/* Clean up a referenced TCP bind bucket. */
				2095	if (tp->bind_hash)
				2096	tcp_put_port(sk);
				2097
				2098	/*
				2099	* If sendmsg cached page exists, toss it.
				2100	*/
				2101	if (sk->sk_sndmsg_page) {
				2102	__free_page(sk->sk_sndmsg_page);
				2103	sk->sk_sndmsg_page = NULL;
				2104	}
				2105
				2106	atomic_dec(&tcp_sockets_allocated);
				2107
				2108	return 0;
				2109	}
				2110
				2111	EXPORT_SYMBOL(tcp_v4_destroy_sock);
				2112
				2113	#ifdef CONFIG_PROC_FS
				2114	/* Proc filesystem TCP sock list dumping. */
				2115
				2116	static inline struct tcp_tw_bucket tw_head(struct hlist_head head)
				2117	{
				2118	return hlist_empty(head) ? NULL :
				2119	list_entry(head->first, struct tcp_tw_bucket, tw_node);
				2120	}
				2121
				2122	static inline struct tcp_tw_bucket tw_next(struct tcp_tw_bucket tw)
				2123	{
				2124	return tw->tw_node.next ?
				2125	hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
				2126	}
				2127
				2128	static void listening_get_next(struct seq_file seq, void *cur)
				2129	{
				2130	struct tcp_sock *tp;
				2131	struct hlist_node *node;
				2132	struct sock *sk = cur;
				2133	struct tcp_iter_state* st = seq->private;
				2134
				2135	if (!sk) {
				2136	st->bucket = 0;
				2137	sk = sk_head(&tcp_listening_hash[0]);
				2138	goto get_sk;
				2139	}
				2140
				2141	++st->num;
				2142
				2143	if (st->state == TCP_SEQ_STATE_OPENREQ) {
				2144	struct open_request *req = cur;
				2145
				2146	tp = tcp_sk(st->syn_wait_sk);
				2147	req = req->dl_next;
				2148	while (1) {
				2149	while (req) {
				2150	if (req->class->family == st->family) {
				2151	cur = req;
				2152	goto out;
				2153	}
				2154	req = req->dl_next;
				2155	}
				2156	if (++st->sbucket >= TCP_SYNQ_HSIZE)
				2157	break;
				2158	get_req:
				2159	req = tp->listen_opt->syn_table[st->sbucket];
				2160	}
				2161	sk = sk_next(st->syn_wait_sk);
				2162	st->state = TCP_SEQ_STATE_LISTENING;
				2163	read_unlock_bh(&tp->syn_wait_lock);
				2164	} else {
				2165	tp = tcp_sk(sk);
				2166	read_lock_bh(&tp->syn_wait_lock);
				2167	if (tp->listen_opt && tp->listen_opt->qlen)
				2168	goto start_req;
				2169	read_unlock_bh(&tp->syn_wait_lock);
				2170	sk = sk_next(sk);
				2171	}
				2172	get_sk:
				2173	sk_for_each_from(sk, node) {
				2174	if (sk->sk_family == st->family) {
				2175	cur = sk;
				2176	goto out;
				2177	}
				2178	tp = tcp_sk(sk);
				2179	read_lock_bh(&tp->syn_wait_lock);
				2180	if (tp->listen_opt && tp->listen_opt->qlen) {
				2181	start_req:
				2182	st->uid = sock_i_uid(sk);
				2183	st->syn_wait_sk = sk;
				2184	st->state = TCP_SEQ_STATE_OPENREQ;
				2185	st->sbucket = 0;
				2186	goto get_req;
				2187	}
				2188	read_unlock_bh(&tp->syn_wait_lock);
				2189	}
				2190	if (++st->bucket < TCP_LHTABLE_SIZE) {
				2191	sk = sk_head(&tcp_listening_hash[st->bucket]);
				2192	goto get_sk;
				2193	}
				2194	cur = NULL;
				2195	out:
				2196	return cur;
				2197	}
				2198
				2199	static void listening_get_idx(struct seq_file seq, loff_t *pos)
				2200	{
				2201	void *rc = listening_get_next(seq, NULL);
				2202
				2203	while (rc && *pos) {
				2204	rc = listening_get_next(seq, rc);
				2205	--*pos;
				2206	}
				2207	return rc;
				2208	}
				2209
				2210	static void established_get_first(struct seq_file seq)
				2211	{
				2212	struct tcp_iter_state* st = seq->private;
				2213	void *rc = NULL;
				2214
				2215	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
				2216	struct sock *sk;
				2217	struct hlist_node *node;
				2218	struct tcp_tw_bucket *tw;
				2219
				2220	/* We can reschedule _before_ having picked the target: */
				2221	cond_resched_softirq();
				2222
				2223	read_lock(&tcp_ehash[st->bucket].lock);
				2224	sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
				2225	if (sk->sk_family != st->family) {
				2226	continue;
				2227	}
				2228	rc = sk;
				2229	goto out;
				2230	}
				2231	st->state = TCP_SEQ_STATE_TIME_WAIT;
				2232	tw_for_each(tw, node,
				2233	&tcp_ehash[st->bucket + tcp_ehash_size].chain) {
				2234	if (tw->tw_family != st->family) {
				2235	continue;
				2236	}
				2237	rc = tw;
				2238	goto out;
				2239	}
				2240	read_unlock(&tcp_ehash[st->bucket].lock);
				2241	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2242	}
				2243	out:
				2244	return rc;
				2245	}
				2246
				2247	static void established_get_next(struct seq_file seq, void *cur)
				2248	{
				2249	struct sock *sk = cur;
				2250	struct tcp_tw_bucket *tw;
				2251	struct hlist_node *node;
				2252	struct tcp_iter_state* st = seq->private;
				2253
				2254	++st->num;
				2255
				2256	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
				2257	tw = cur;
				2258	tw = tw_next(tw);
				2259	get_tw:
				2260	while (tw && tw->tw_family != st->family) {
				2261	tw = tw_next(tw);
				2262	}
				2263	if (tw) {
				2264	cur = tw;
				2265	goto out;
				2266	}
				2267	read_unlock(&tcp_ehash[st->bucket].lock);
				2268	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2269
				2270	/* We can reschedule between buckets: */
				2271	cond_resched_softirq();
				2272
				2273	if (++st->bucket < tcp_ehash_size) {
				2274	read_lock(&tcp_ehash[st->bucket].lock);
				2275	sk = sk_head(&tcp_ehash[st->bucket].chain);
				2276	} else {
				2277	cur = NULL;
				2278	goto out;
				2279	}
				2280	} else
				2281	sk = sk_next(sk);
				2282
				2283	sk_for_each_from(sk, node) {
				2284	if (sk->sk_family == st->family)
				2285	goto found;
				2286	}
				2287
				2288	st->state = TCP_SEQ_STATE_TIME_WAIT;
				2289	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
				2290	goto get_tw;
				2291	found:
				2292	cur = sk;
				2293	out:
				2294	return cur;
				2295	}
				2296
				2297	static void established_get_idx(struct seq_file seq, loff_t pos)
				2298	{
				2299	void *rc = established_get_first(seq);
				2300
				2301	while (rc && pos) {
				2302	rc = established_get_next(seq, rc);
				2303	--pos;
				2304	}
				2305	return rc;
				2306	}
				2307
				2308	static void tcp_get_idx(struct seq_file seq, loff_t pos)
				2309	{
				2310	void *rc;
				2311	struct tcp_iter_state* st = seq->private;
				2312
				2313	tcp_listen_lock();
				2314	st->state = TCP_SEQ_STATE_LISTENING;
				2315	rc = listening_get_idx(seq, &pos);
				2316
				2317	if (!rc) {
				2318	tcp_listen_unlock();
				2319	local_bh_disable();
				2320	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2321	rc = established_get_idx(seq, pos);
				2322	}
				2323
				2324	return rc;
				2325	}
				2326
				2327	static void tcp_seq_start(struct seq_file seq, loff_t *pos)
				2328	{
				2329	struct tcp_iter_state* st = seq->private;
				2330	st->state = TCP_SEQ_STATE_LISTENING;
				2331	st->num = 0;
				2332	return pos ? tcp_get_idx(seq, pos - 1) : SEQ_START_TOKEN;
				2333	}
				2334
				2335	static void tcp_seq_next(struct seq_file seq, void v, loff_t pos)
				2336	{
				2337	void *rc = NULL;
				2338	struct tcp_iter_state* st;
				2339
				2340	if (v == SEQ_START_TOKEN) {
				2341	rc = tcp_get_idx(seq, 0);
				2342	goto out;
				2343	}
				2344	st = seq->private;
				2345
				2346	switch (st->state) {
				2347	case TCP_SEQ_STATE_OPENREQ:
				2348	case TCP_SEQ_STATE_LISTENING:
				2349	rc = listening_get_next(seq, v);
				2350	if (!rc) {
				2351	tcp_listen_unlock();
				2352	local_bh_disable();
				2353	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2354	rc = established_get_first(seq);
				2355	}
				2356	break;
				2357	case TCP_SEQ_STATE_ESTABLISHED:
				2358	case TCP_SEQ_STATE_TIME_WAIT:
				2359	rc = established_get_next(seq, v);
				2360	break;
				2361	}
				2362	out:
				2363	++*pos;
				2364	return rc;
				2365	}
				2366
				2367	static void tcp_seq_stop(struct seq_file seq, void v)
				2368	{
				2369	struct tcp_iter_state* st = seq->private;
				2370
				2371	switch (st->state) {
				2372	case TCP_SEQ_STATE_OPENREQ:
				2373	if (v) {
				2374	struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
				2375	read_unlock_bh(&tp->syn_wait_lock);
				2376	}
				2377	case TCP_SEQ_STATE_LISTENING:
				2378	if (v != SEQ_START_TOKEN)
				2379	tcp_listen_unlock();
				2380	break;
				2381	case TCP_SEQ_STATE_TIME_WAIT:
				2382	case TCP_SEQ_STATE_ESTABLISHED:
				2383	if (v)
				2384	read_unlock(&tcp_ehash[st->bucket].lock);
				2385	local_bh_enable();
				2386	break;
				2387	}
				2388	}
				2389
				2390	static int tcp_seq_open(struct inode inode, struct file file)
				2391	{
				2392	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
				2393	struct seq_file *seq;
				2394	struct tcp_iter_state *s;
				2395	int rc;
				2396
				2397	if (unlikely(afinfo == NULL))
				2398	return -EINVAL;
				2399
				2400	s = kmalloc(sizeof(*s), GFP_KERNEL);
				2401	if (!s)
				2402	return -ENOMEM;
				2403	memset(s, 0, sizeof(*s));
				2404	s->family = afinfo->family;
				2405	s->seq_ops.start = tcp_seq_start;
				2406	s->seq_ops.next = tcp_seq_next;
				2407	s->seq_ops.show = afinfo->seq_show;
				2408	s->seq_ops.stop = tcp_seq_stop;
				2409
				2410	rc = seq_open(file, &s->seq_ops);
				2411	if (rc)
				2412	goto out_kfree;
				2413	seq = file->private_data;
				2414	seq->private = s;
				2415	out:
				2416	return rc;
				2417	out_kfree:
				2418	kfree(s);
				2419	goto out;
				2420	}
				2421
				2422	int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
				2423	{
				2424	int rc = 0;
				2425	struct proc_dir_entry *p;
				2426
				2427	if (!afinfo)
				2428	return -EINVAL;
				2429	afinfo->seq_fops->owner = afinfo->owner;
				2430	afinfo->seq_fops->open = tcp_seq_open;
				2431	afinfo->seq_fops->read = seq_read;
				2432	afinfo->seq_fops->llseek = seq_lseek;
				2433	afinfo->seq_fops->release = seq_release_private;
				2434
				2435	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
				2436	if (p)
				2437	p->data = afinfo;
				2438	else
				2439	rc = -ENOMEM;
				2440	return rc;
				2441	}
				2442
				2443	void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
				2444	{
				2445	if (!afinfo)
				2446	return;
				2447	proc_net_remove(afinfo->name);
				2448	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
				2449	}
				2450
				2451	static void get_openreq4(struct sock sk, struct open_request req,
				2452	char *tmpbuf, int i, int uid)
				2453	{
				2454	int ttd = req->expires - jiffies;
				2455
				2456	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
				2457	" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
				2458	i,
				2459	req->af.v4_req.loc_addr,
				2460	ntohs(inet_sk(sk)->sport),
				2461	req->af.v4_req.rmt_addr,
				2462	ntohs(req->rmt_port),
				2463	TCP_SYN_RECV,
				2464	0, 0, /* could print option size, but that is af dependent. */
				2465	1, /* timers active (only the expire timer) */
				2466	jiffies_to_clock_t(ttd),
				2467	req->retrans,
				2468	uid,
				2469	0, /* non standard timer */
				2470	0, /* open_requests have no inode */
				2471	atomic_read(&sk->sk_refcnt),
				2472	req);
				2473	}
				2474
				2475	static void get_tcp4_sock(struct sock sp, char tmpbuf, int i)
				2476	{
				2477	int timer_active;
				2478	unsigned long timer_expires;
				2479	struct tcp_sock *tp = tcp_sk(sp);
				2480	struct inet_sock *inet = inet_sk(sp);
				2481	unsigned int dest = inet->daddr;
				2482	unsigned int src = inet->rcv_saddr;
				2483	__u16 destp = ntohs(inet->dport);
				2484	__u16 srcp = ntohs(inet->sport);
				2485
				2486	if (tp->pending == TCP_TIME_RETRANS) {
				2487	timer_active = 1;
				2488	timer_expires = tp->timeout;
				2489	} else if (tp->pending == TCP_TIME_PROBE0) {
				2490	timer_active = 4;
				2491	timer_expires = tp->timeout;
				2492	} else if (timer_pending(&sp->sk_timer)) {
				2493	timer_active = 2;
				2494	timer_expires = sp->sk_timer.expires;
				2495	} else {
				2496	timer_active = 0;
				2497	timer_expires = jiffies;
				2498	}
				2499
				2500	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
				2501	"%08X %5d %8d %lu %d %p %u %u %u %u %d",
				2502	i, src, srcp, dest, destp, sp->sk_state,
				2503	tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
				2504	timer_active,
				2505	jiffies_to_clock_t(timer_expires - jiffies),
				2506	tp->retransmits,
				2507	sock_i_uid(sp),
				2508	tp->probes_out,
				2509	sock_i_ino(sp),
				2510	atomic_read(&sp->sk_refcnt), sp,
				2511	tp->rto, tp->ack.ato, (tp->ack.quick << 1) \| tp->ack.pingpong,
				2512	tp->snd_cwnd,
				2513	tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
				2514	}
				2515
				2516	static void get_timewait4_sock(struct tcp_tw_bucket tw, char tmpbuf, int i)
				2517	{
				2518	unsigned int dest, src;
				2519	__u16 destp, srcp;
				2520	int ttd = tw->tw_ttd - jiffies;
				2521
				2522	if (ttd < 0)
				2523	ttd = 0;
				2524
				2525	dest = tw->tw_daddr;
				2526	src = tw->tw_rcv_saddr;
				2527	destp = ntohs(tw->tw_dport);
				2528	srcp = ntohs(tw->tw_sport);
				2529
				2530	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
				2531	" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
				2532	i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
				2533	3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
				2534	atomic_read(&tw->tw_refcnt), tw);
				2535	}
				2536
				2537	#define TMPSZ 150
				2538
				2539	static int tcp4_seq_show(struct seq_file seq, void v)
				2540	{
				2541	struct tcp_iter_state* st;
				2542	char tmpbuf[TMPSZ + 1];
				2543
				2544	if (v == SEQ_START_TOKEN) {
				2545	seq_printf(seq, "%-*s\n", TMPSZ - 1,
				2546	" sl local_address rem_address st tx_queue "
				2547	"rx_queue tr tm->when retrnsmt uid timeout "
				2548	"inode");
				2549	goto out;
				2550	}
				2551	st = seq->private;
				2552
				2553	switch (st->state) {
				2554	case TCP_SEQ_STATE_LISTENING:
				2555	case TCP_SEQ_STATE_ESTABLISHED:
				2556	get_tcp4_sock(v, tmpbuf, st->num);
				2557	break;
				2558	case TCP_SEQ_STATE_OPENREQ:
				2559	get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
				2560	break;
				2561	case TCP_SEQ_STATE_TIME_WAIT:
				2562	get_timewait4_sock(v, tmpbuf, st->num);
				2563	break;
				2564	}
				2565	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
				2566	out:
				2567	return 0;
				2568	}
				2569
				2570	static struct file_operations tcp4_seq_fops;
				2571	static struct tcp_seq_afinfo tcp4_seq_afinfo = {
				2572	.owner = THIS_MODULE,
				2573	.name = "tcp",
				2574	.family = AF_INET,
				2575	.seq_show = tcp4_seq_show,
				2576	.seq_fops = &tcp4_seq_fops,
				2577	};
				2578
				2579	int __init tcp4_proc_init(void)
				2580	{
				2581	return tcp_proc_register(&tcp4_seq_afinfo);
				2582	}
				2583
				2584	void tcp4_proc_exit(void)
				2585	{
				2586	tcp_proc_unregister(&tcp4_seq_afinfo);
				2587	}
				2588	#endif /* CONFIG_PROC_FS */
				2589
				2590	struct proto tcp_prot = {
				2591	.name = "TCP",
				2592	.owner = THIS_MODULE,
				2593	.close = tcp_close,
				2594	.connect = tcp_v4_connect,
				2595	.disconnect = tcp_disconnect,
				2596	.accept = tcp_accept,
				2597	.ioctl = tcp_ioctl,
				2598	.init = tcp_v4_init_sock,
				2599	.destroy = tcp_v4_destroy_sock,
				2600	.shutdown = tcp_shutdown,
				2601	.setsockopt = tcp_setsockopt,
				2602	.getsockopt = tcp_getsockopt,
				2603	.sendmsg = tcp_sendmsg,
				2604	.recvmsg = tcp_recvmsg,
				2605	.backlog_rcv = tcp_v4_do_rcv,
				2606	.hash = tcp_v4_hash,
				2607	.unhash = tcp_unhash,
				2608	.get_port = tcp_v4_get_port,
				2609	.enter_memory_pressure = tcp_enter_memory_pressure,
				2610	.sockets_allocated = &tcp_sockets_allocated,
				2611	.memory_allocated = &tcp_memory_allocated,
				2612	.memory_pressure = &tcp_memory_pressure,
				2613	.sysctl_mem = sysctl_tcp_mem,
				2614	.sysctl_wmem = sysctl_tcp_wmem,
				2615	.sysctl_rmem = sysctl_tcp_rmem,
				2616	.max_header = MAX_TCP_HEADER,
				2617	.obj_size = sizeof(struct tcp_sock),
				2618	};
				2619
				2620
				2621
				2622	void __init tcp_v4_init(struct net_proto_family *ops)
				2623	{
				2624	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
				2625	if (err < 0)
				2626	panic("Failed to create the TCP control socket.\n");
				2627	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
				2628	inet_sk(tcp_socket->sk)->uc_ttl = -1;
				2629
				2630	/* Unhash it so that IP input processing does not even
				2631	* see it, we do not wish this socket to see incoming
				2632	* packets.
				2633	*/
				2634	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
				2635	}
				2636
				2637	EXPORT_SYMBOL(ipv4_specific);
				2638	EXPORT_SYMBOL(tcp_bind_hash);
				2639	EXPORT_SYMBOL(tcp_bucket_create);
				2640	EXPORT_SYMBOL(tcp_hashinfo);
				2641	EXPORT_SYMBOL(tcp_inherit_port);
				2642	EXPORT_SYMBOL(tcp_listen_wlock);
				2643	EXPORT_SYMBOL(tcp_port_rover);
				2644	EXPORT_SYMBOL(tcp_prot);
				2645	EXPORT_SYMBOL(tcp_put_port);
				2646	EXPORT_SYMBOL(tcp_unhash);
				2647	EXPORT_SYMBOL(tcp_v4_conn_request);
				2648	EXPORT_SYMBOL(tcp_v4_connect);
				2649	EXPORT_SYMBOL(tcp_v4_do_rcv);
				2650	EXPORT_SYMBOL(tcp_v4_rebuild_header);
				2651	EXPORT_SYMBOL(tcp_v4_remember_stamp);
				2652	EXPORT_SYMBOL(tcp_v4_send_check);
				2653	EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
				2654
				2655	#ifdef CONFIG_PROC_FS
				2656	EXPORT_SYMBOL(tcp_proc_register);
				2657	EXPORT_SYMBOL(tcp_proc_unregister);
				2658	#endif
				2659	EXPORT_SYMBOL(sysctl_local_port_range);
				2660	EXPORT_SYMBOL(sysctl_max_syn_backlog);
				2661	EXPORT_SYMBOL(sysctl_tcp_low_latency);
				2662	EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
				2663