Blame - net/ipv4/tcp_minisocks.c - android_kernel_oneplus_msm8996

blob: fd70509f0d53df8e39b63efbbbfeb360e34dbf2e [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
				9	*
				10	* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
				11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				12	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				13	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				14	* Florian La Roche, <flla@stud.uni-sb.de>
				15	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				16	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				17	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				18	* Matthew Dillon, <dillon@apollo.west.oic.com>
				19	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				20	* Jorge Cwik, <jorge@laser.satlink.net>
				21	*/
				22
				23	#include <linux/config.h>
				24	#include <linux/mm.h>
				25	#include <linux/module.h>
				26	#include <linux/sysctl.h>
				27	#include <linux/workqueue.h>
				28	#include <net/tcp.h>
				29	#include <net/inet_common.h>
				30	#include <net/xfrm.h>
				31
				/*
				 * SYNC_INIT is the initial value given to sysctl_tcp_syncookies below:
				 * 0 when CONFIG_SYSCTL is defined, 1 otherwise.
				 */
				32	#ifdef CONFIG_SYSCTL
				33	#define SYNC_INIT 0 /* let the user enable it */
				34	#else
				35	#define SYNC_INIT 1
				36	#endif
				37
				/*
				 * When non-zero (and a timestamp was recorded), this file asks the
				 * address family to remember the peer timestamp and may use a
				 * shorter TIME-WAIT timeout.
				 */
				38	int sysctl_tcp_tw_recycle;
				/* tcp_time_wait() allocates a new bucket only while tcp_tw_count is below this. */
				39	int sysctl_tcp_max_tw_buckets = NR_FILE*2;
				40
				41	int sysctl_tcp_syncookies = SYNC_INIT;
				/* NOTE(review): not referenced in the visible part of this file; presumably read elsewhere - confirm. */
				42	int sysctl_tcp_abort_on_overflow;
				43
				/* Forward declaration; the definition appears later in this file. */
				44	static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
				45
				46	static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
				47	{
				48	if (seq == s_win)
				49	return 1;
				50	if (after(end_seq, s_win) && before(seq, e_win))
				51	return 1;
				52	return (seq == e_win && seq == end_seq);
				53	}
				54
				55	/* New-style handling of TIME_WAIT sockets. */
				56
				/*
				 * Number of TIME-WAIT buckets currently linked on one of the death
				 * lists.  The updates visible in this file are made with
				 * tw_death_lock held; tcp_time_wait() reads it without that lock.
				 */
				57	int tcp_tw_count;
				58
				59
				60	/* Must be called with locally disabled BHs. */
				61	static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
				62	{
				63	struct tcp_ehash_bucket *ehead;
				64	struct tcp_bind_hashbucket *bhead;
				65	struct tcp_bind_bucket *tb;
				66
				67	/* Unlink from established hashes. */
				68	ehead = &tcp_ehash[tw->tw_hashent];
				69	write_lock(&ehead->lock);
				70	if (hlist_unhashed(&tw->tw_node)) {
				71	write_unlock(&ehead->lock);
				72	return;
				73	}
				74	__hlist_del(&tw->tw_node);
				75	sk_node_init(&tw->tw_node);
				76	write_unlock(&ehead->lock);
				77
				78	/* Disassociate with bind bucket. */
				79	bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
				80	spin_lock(&bhead->lock);
				81	tb = tw->tw_tb;
				82	__hlist_del(&tw->tw_bind_node);
				83	tw->tw_tb = NULL;
				84	tcp_bucket_destroy(tb);
				85	spin_unlock(&bhead->lock);
				86
				87	#ifdef INET_REFCNT_DEBUG
				88	if (atomic_read(&tw->tw_refcnt) != 1) {
				89	printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
				90	atomic_read(&tw->tw_refcnt));
				91	}
				92	#endif
				93	tcp_tw_put(tw);
				94	}
				95
				96	/*
				97	* * Main purpose of TIME-WAIT state is to close connection gracefully,
				98	* when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
				99	* (and, probably, tail of data) and one or more our ACKs are lost.
				100	* * What is TIME-WAIT timeout? It is associated with maximal packet
				101	* lifetime in the internet, which results in wrong conclusion, that
				102	* it is set to catch "old duplicate segments" wandering out of their path.
				103	* It is not quite correct. This timeout is calculated so that it exceeds
				104	* maximal retransmission timeout enough to allow to lose one (or more)
				105	* segments sent by peer and our ACKs. This time may be calculated from RTO.
				106	* * When TIME-WAIT socket receives RST, it means that another end
				107	* finally closed and we are allowed to kill TIME-WAIT too.
				108	* * Second purpose of TIME-WAIT is catching old duplicate segments.
				109	* Well, certainly it is pure paranoia, but if we load TIME-WAIT
				110	* with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
				111	* * If we invented some more clever way to catch duplicates
				112	* (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
				113	*
				114	* The algorithm below is based on FORMAL INTERPRETATION of RFCs.
				115	* When you compare it to RFCs, please, read section SEGMENT ARRIVES
				116	* from the very beginning.
				117	*
				118	* NOTE. With recycling (and later with fin-wait-2) TW bucket
				119	* is _not_ stateless. It means, that strictly speaking we must
				120	* spinlock it. I do not want! Well, probability of misbehaviour
				121	* is ridiculously low and, seems, we could use some mb() tricks
				122	* to avoid misread sequence numbers, states etc. --ANK
				123	*/
				124	enum tcp_tw_status
				125	tcp_timewait_state_process(struct tcp_tw_bucket tw, struct sk_buff skb,
				126	struct tcphdr *th, unsigned len)
				127	{
				128	struct tcp_options_received tmp_opt;
				129	int paws_reject = 0;
				130
				131	tmp_opt.saw_tstamp = 0;
				132	if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
				133	tcp_parse_options(skb, &tmp_opt, 0);
				134
				135	if (tmp_opt.saw_tstamp) {
				136	tmp_opt.ts_recent = tw->tw_ts_recent;
				137	tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
				138	paws_reject = tcp_paws_check(&tmp_opt, th->rst);
				139	}
				140	}
				141
				142	if (tw->tw_substate == TCP_FIN_WAIT2) {
				143	/* Just repeat all the checks of tcp_rcv_state_process() */
				144
				145	/* Out of window, send ACK */
				146	if (paws_reject \|\|
				147	!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				148	tw->tw_rcv_nxt,
				149	tw->tw_rcv_nxt + tw->tw_rcv_wnd))
				150	return TCP_TW_ACK;
				151
				152	if (th->rst)
				153	goto kill;
				154
				155	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
				156	goto kill_with_rst;
				157
				158	/* Dup ACK? */
				159	if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) \|\|
				160	TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
				161	tcp_tw_put(tw);
				162	return TCP_TW_SUCCESS;
				163	}
				164
				165	/* New data or FIN. If new data arrive after half-duplex close,
				166	* reset.
				167	*/
				168	if (!th->fin \|\|
				169	TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
				170	kill_with_rst:
				171	tcp_tw_deschedule(tw);
				172	tcp_tw_put(tw);
				173	return TCP_TW_RST;
				174	}
				175
				176	/* FIN arrived, enter true time-wait state. */
				177	tw->tw_substate = TCP_TIME_WAIT;
				178	tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
				179	if (tmp_opt.saw_tstamp) {
				180	tw->tw_ts_recent_stamp = xtime.tv_sec;
				181	tw->tw_ts_recent = tmp_opt.rcv_tsval;
				182	}
				183
				184	/* I am shamed, but failed to make it more elegant.
				185	* Yes, it is direct reference to IP, which is impossible
				186	* to generalize to IPv6. Taking into account that IPv6
				187	* do not undertsnad recycling in any case, it not
				188	* a big problem in practice. --ANK */
				189	if (tw->tw_family == AF_INET &&
				190	sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
				191	tcp_v4_tw_remember_stamp(tw))
				192	tcp_tw_schedule(tw, tw->tw_timeout);
				193	else
				194	tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
				195	return TCP_TW_ACK;
				196	}
				197
				198	/*
				199	* Now real TIME-WAIT state.
				200	*
				201	* RFC 1122:
				202	* "When a connection is [...] on TIME-WAIT state [...]
				203	* [a TCP] MAY accept a new SYN from the remote TCP to
				204	* reopen the connection directly, if it:
				205	*
				206	* (1) assigns its initial sequence number for the new
				207	* connection to be larger than the largest sequence
				208	* number it used on the previous connection incarnation,
				209	* and
				210	*
				211	* (2) returns to TIME-WAIT state if the SYN turns out
				212	* to be an old duplicate".
				213	*/
				214
				215	if (!paws_reject &&
				216	(TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
				217	(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq \|\| th->rst))) {
				218	/* In window segment, it may be only reset or bare ack. */
				219
				220	if (th->rst) {
				221	/* This is TIME_WAIT assasination, in two flavors.
				222	* Oh well... nobody has a sufficient solution to this
				223	* protocol bug yet.
				224	*/
				225	if (sysctl_tcp_rfc1337 == 0) {
				226	kill:
				227	tcp_tw_deschedule(tw);
				228	tcp_tw_put(tw);
				229	return TCP_TW_SUCCESS;
				230	}
				231	}
				232	tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
				233
				234	if (tmp_opt.saw_tstamp) {
				235	tw->tw_ts_recent = tmp_opt.rcv_tsval;
				236	tw->tw_ts_recent_stamp = xtime.tv_sec;
				237	}
				238
				239	tcp_tw_put(tw);
				240	return TCP_TW_SUCCESS;
				241	}
				242
				243	/* Out of window segment.
				244
				245	All the segments are ACKed immediately.
				246
				247	The only exception is new SYN. We accept it, if it is
				248	not old duplicate and we are not in danger to be killed
				249	by delayed old duplicates. RFC check is that it has
				250	newer sequence number works at rates <40Mbit/sec.
				251	However, if paws works, it is reliable AND even more,
				252	we even may relax silly seq space cutoff.
				253
				254	RED-PEN: we violate main RFC requirement, if this SYN will appear
				255	old duplicate (i.e. we receive RST in reply to SYN-ACK),
				256	we must return socket to time-wait state. It is not good,
				257	but not fatal yet.
				258	*/
				259
				260	if (th->syn && !th->rst && !th->ack && !paws_reject &&
				261	(after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) \|\|
				262	(tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
				263	u32 isn = tw->tw_snd_nxt + 65535 + 2;
				264	if (isn == 0)
				265	isn++;
				266	TCP_SKB_CB(skb)->when = isn;
				267	return TCP_TW_SYN;
				268	}
				269
				270	if (paws_reject)
				271	NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
				272
				273	if(!th->rst) {
				274	/* In this case we must reset the TIMEWAIT timer.
				275	*
				276	* If it is ACKless SYN it may be both old duplicate
				277	* and new good SYN with random sequence number <rcv_nxt.
				278	* Do not reschedule in the last case.
				279	*/
				280	if (paws_reject \|\| th->ack)
				281	tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
				282
				283	/* Send ACK. Note, we do not put the bucket,
				284	* it will be released by caller.
				285	*/
				286	return TCP_TW_ACK;
				287	}
				288	tcp_tw_put(tw);
				289	return TCP_TW_SUCCESS;
				290	}
				291
				292	/* Enter the time wait state. This is called with locally disabled BH.
				293	* Essentially we whip up a timewait bucket, copy the
				294	* relevant info into it from the SK, and mess with hash chains
				295	* and list linkage.
				296	*/
				297	static void __tcp_tw_hashdance(struct sock sk, struct tcp_tw_bucket tw)
				298	{
				299	struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
				300	struct tcp_bind_hashbucket *bhead;
				301
				302	/* Step 1: Put TW into bind hash. Original socket stays there too.
				303	Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
				304	binding cache, even if it is closed.
				305	*/
				306	bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
				307	spin_lock(&bhead->lock);
				308	tw->tw_tb = tcp_sk(sk)->bind_hash;
				309	BUG_TRAP(tcp_sk(sk)->bind_hash);
				310	tw_add_bind_node(tw, &tw->tw_tb->owners);
				311	spin_unlock(&bhead->lock);
				312
				313	write_lock(&ehead->lock);
				314
				315	/* Step 2: Remove SK from established hash. */
				316	if (__sk_del_node_init(sk))
				317	sock_prot_dec_use(sk->sk_prot);
				318
				319	/* Step 3: Hash TW into TIMEWAIT half of established hash table. */
				320	tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
				321	atomic_inc(&tw->tw_refcnt);
				322
				323	write_unlock(&ehead->lock);
				324	}
				325
				326	/*
				327	* Move a socket to time-wait or dead fin-wait-2 state.
				328	*/
				329	void tcp_time_wait(struct sock *sk, int state, int timeo)
				330	{
				331	struct tcp_tw_bucket *tw = NULL;
				332	struct tcp_sock *tp = tcp_sk(sk);
				333	int recycle_ok = 0;
				334
				335	if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
				336	recycle_ok = tp->af_specific->remember_stamp(sk);
				337
				338	if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
				339	tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
				340
				341	if(tw != NULL) {
				342	struct inet_sock *inet = inet_sk(sk);
				343	int rto = (tp->rto<<2) - (tp->rto>>1);
				344
				345	/* Give us an identity. */
				346	tw->tw_daddr = inet->daddr;
				347	tw->tw_rcv_saddr = inet->rcv_saddr;
				348	tw->tw_bound_dev_if = sk->sk_bound_dev_if;
				349	tw->tw_num = inet->num;
				350	tw->tw_state = TCP_TIME_WAIT;
				351	tw->tw_substate = state;
				352	tw->tw_sport = inet->sport;
				353	tw->tw_dport = inet->dport;
				354	tw->tw_family = sk->sk_family;
				355	tw->tw_reuse = sk->sk_reuse;
				356	tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
				357	atomic_set(&tw->tw_refcnt, 1);
				358
				359	tw->tw_hashent = sk->sk_hashent;
				360	tw->tw_rcv_nxt = tp->rcv_nxt;
				361	tw->tw_snd_nxt = tp->snd_nxt;
				362	tw->tw_rcv_wnd = tcp_receive_window(tp);
				363	tw->tw_ts_recent = tp->rx_opt.ts_recent;
				364	tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
				365	tw_dead_node_init(tw);
				366
				367	#if defined(CONFIG_IPV6) \|\| defined(CONFIG_IPV6_MODULE)
				368	if (tw->tw_family == PF_INET6) {
				369	struct ipv6_pinfo *np = inet6_sk(sk);
				370
				371	ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
				372	ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
				373	tw->tw_v6_ipv6only = np->ipv6only;
				374	} else {
				375	memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
				376	memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
				377	tw->tw_v6_ipv6only = 0;
				378	}
				379	#endif
				380	/* Linkage updates. */
				381	__tcp_tw_hashdance(sk, tw);
				382
				383	/* Get the TIME_WAIT timeout firing. */
				384	if (timeo < rto)
				385	timeo = rto;
				386
				387	if (recycle_ok) {
				388	tw->tw_timeout = rto;
				389	} else {
				390	tw->tw_timeout = TCP_TIMEWAIT_LEN;
				391	if (state == TCP_TIME_WAIT)
				392	timeo = TCP_TIMEWAIT_LEN;
				393	}
				394
				395	tcp_tw_schedule(tw, timeo);
				396	tcp_tw_put(tw);
				397	} else {
				398	/* Sorry, if we're out of memory, just CLOSE this
				399	* socket up. We've got bigger problems than
				400	* non-graceful socket closings.
				401	*/
				402	if (net_ratelimit())
				403	printk(KERN_INFO "TCP: time wait bucket table overflow\n");
				404	}
				405
				406	tcp_update_metrics(sk);
				407	tcp_done(sk);
				408	}
				409
				410	/* Kill off TIME_WAIT sockets once their lifetime has expired. */
				/* Index of the tcp_tw_death_row[] slot that tcp_twkill() will reap next. */
				411	static int tcp_tw_death_row_slot;
				412
				413	static void tcp_twkill(unsigned long);
				414
				415	/* TIME_WAIT reaping mechanism. */
				416	#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
				/* Interval between two runs of the slow reaper timer: one slot per period. */
				417	#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
				418
				/* Quota handed to tcp_do_twkill_work() for one pass over a slot. */
				419	#define TCP_TWKILL_QUOTA 100
				420
				/* Slow-timer death lists, one per slot; accessed under tw_death_lock. */
				421	static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
				422	static DEFINE_SPINLOCK(tw_death_lock);
				423	static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
				424	static void twkill_work(void *);
				425	static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
				/* Bitmask of slots that tcp_twkill() could not finish and left to twkill_work(). */
				426	static u32 twkill_thread_slots;
				427
				428	/* Returns non-zero if quota exceeded. */
				/*
				 * Reap buckets from tcp_tw_death_row[slot].
				 *
				 * Both callers in this file invoke it with tw_death_lock held; the lock
				 * is dropped around tcp_timewait_kill() for each bucket and re-taken
				 * afterwards, so it is held again on return.
				 *
				 * Returns 1 when the loop stopped because more than @quota buckets were
				 * killed, 0 when the slot was emptied.  tcp_tw_count and the TIMEWAITED
				 * statistic are adjusted by the number of buckets killed.
				 */
				429	static int tcp_do_twkill_work(int slot, unsigned int quota)
				430	{
				431	struct tcp_tw_bucket *tw;
				432	struct hlist_node *node;
				433	unsigned int killed;
				434	int ret;
				435
				436	/* NOTE: compare this to previous version where lock
				437	* was released after detaching chain. It was racy,
				438	* because tw buckets are scheduled in not serialized context
				439	* in 2.3 (with netfilter), and with softnet it is common, because
				440	* soft irqs are not sequenced.
				441	*/
				442	killed = 0;
				443	ret = 0;
				444	rescan:
				445	tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
				/* Unlink under the lock, then kill and put with the lock released. */
				446	__tw_del_dead_node(tw);
				447	spin_unlock(&tw_death_lock);
				448	tcp_timewait_kill(tw);
				449	tcp_tw_put(tw);
				450	killed++;
				451	spin_lock(&tw_death_lock);
				/* Strictly greater: the pass stops after quota + 1 kills. */
				452	if (killed > quota) {
				453	ret = 1;
				454	break;
				455	}
				456
				457	/* While we dropped tw_death_lock, another cpu may have
				458	* killed off the next TW bucket in the list, therefore
				459	* do a fresh re-read of the hlist head node with the
				460	* lock reacquired. We still use the hlist traversal
				461	* macro in order to get the prefetches.
				462	*/
				463	goto rescan;
				464	}
				465
				466	tcp_tw_count -= killed;
				467	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
				468
				469	return ret;
				470	}
				471
				472	static void tcp_twkill(unsigned long dummy)
				473	{
				474	int need_timer, ret;
				475
				476	spin_lock(&tw_death_lock);
				477
				478	if (tcp_tw_count == 0)
				479	goto out;
				480
				481	need_timer = 0;
				482	ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
				483	if (ret) {
				484	twkill_thread_slots \|= (1 << tcp_tw_death_row_slot);
				485	mb();
				486	schedule_work(&tcp_twkill_work);
				487	need_timer = 1;
				488	} else {
				489	/* We purged the entire slot, anything left? */
				490	if (tcp_tw_count)
				491	need_timer = 1;
				492	}
				493	tcp_tw_death_row_slot =
				494	((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
				495	if (need_timer)
				496	mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
				497	out:
				498	spin_unlock(&tw_death_lock);
				499	}
				500
				501	extern void twkill_slots_invalid(void);
				502
				503	static void twkill_work(void *dummy)
				504	{
				505	int i;
				506
				507	if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
				508	twkill_slots_invalid();
				509
				510	while (twkill_thread_slots) {
				511	spin_lock_bh(&tw_death_lock);
				512	for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
				513	if (!(twkill_thread_slots & (1 << i)))
				514	continue;
				515
				516	while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
				517	if (need_resched()) {
				518	spin_unlock_bh(&tw_death_lock);
				519	schedule();
				520	spin_lock_bh(&tw_death_lock);
				521	}
				522	}
				523
				524	twkill_thread_slots &= ~(1 << i);
				525	}
				526	spin_unlock_bh(&tw_death_lock);
				527	}
				528	}
				529
				530	/* These are always called from BH context. See callers in
				531	* tcp_input.c to verify this.
				532	*/
				533
				534	/* This is for handling early-kills of TIME_WAIT sockets. */
				535	void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
				536	{
				537	spin_lock(&tw_death_lock);
				538	if (tw_del_dead_node(tw)) {
				539	tcp_tw_put(tw);
				540	if (--tcp_tw_count == 0)
				541	del_timer(&tcp_tw_timer);
				542	}
				543	spin_unlock(&tw_death_lock);
				544	tcp_timewait_kill(tw);
				545	}
				546
				547	/* Short-time timewait calendar */
				548
				/* Current slot of the short-time calendar; negative while the calendar is idle. */
				549	static int tcp_twcal_hand = -1;
				/* jiffies value associated with the slot tcp_twcal_hand points at. */
				550	static int tcp_twcal_jiffie;
				551	static void tcp_twcal_tick(unsigned long);
				552	static struct timer_list tcp_twcal_timer =
				553	TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
				/* Calendar slots holding buckets with short timeouts; accessed under tw_death_lock. */
				554	static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
				555
				/*
				 * (Re)arm the death timer of a TIME-WAIT bucket.
				 *
				 * @tw:    bucket to schedule
				 * @timeo: timeout in jiffies
				 *
				 * Under tw_death_lock: if the bucket is already on a death list it is
				 * unlinked first, otherwise a reference is taken for the new linkage.
				 * Short timeouts go to the tcp_twcal_row[] calendar, longer ones to the
				 * tcp_tw_death_row[] slow-timer slots.  The slow timer is armed when the
				 * bucket count goes from 0 to 1.
				 */
				556	static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
				557	{
				558	struct hlist_head *list;
				559	int slot;
				560
				561	/* timeout := RTO * 3.5
				562	*
				563	* 3.5 = 1+2+0.5 to wait for two retransmits.
				564	*
				565	* RATIONALE: if FIN arrived and we entered TIME-WAIT state,
				566	* our ACK acking that FIN can be lost. If N subsequent retransmitted
				567	* FINs (or previous segments) are lost (probability of such event
				568	* is p^(N+1), where p is probability to lose single packet and
				569	* time to detect the loss is about RTO*(2^N - 1) with exponential
				570	* backoff). Normal timewait length is calculated so, that we
				571	* waited at least for one retransmitted FIN (maximal RTO is 120sec).
				572	* [ BTW Linux. following BSD, violates this requirement waiting
				573	* only for 60sec, we should wait at least for 240 secs.
				574	* Well, 240 consumes too much of resources 8)
				575	* ]
				576	* This interval is not reduced to catch old duplicate and
				577	* responses to our wandering segments living for two MSLs.
				578	* However, if we use PAWS to detect
				579	* old duplicates, we can reduce the interval to bounds required
				580	* by RTO, rather than MSL. So, if peer understands PAWS, we
				581	* kill tw bucket after 3.5*RTO (it is important that this number
				582	* is greater than TS tick!) and detect old duplicates with help
				583	* of PAWS.
				584	*/
				/* Round timeo up to a whole number of calendar ticks. */
				585	slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
				586
				587	spin_lock(&tw_death_lock);
				588
				589	/* Unlink it, if it was scheduled */
				590	if (tw_del_dead_node(tw))
				591	tcp_tw_count--;
				592	else
				593	atomic_inc(&tw->tw_refcnt);
				594
				595	if (slot >= TCP_TW_RECYCLE_SLOTS) {
				596	/* Schedule to slow timer */
				597	if (timeo >= TCP_TIMEWAIT_LEN) {
				598	slot = TCP_TWKILL_SLOTS-1;
				599	} else {
				600	slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
				601	if (slot >= TCP_TWKILL_SLOTS)
				602	slot = TCP_TWKILL_SLOTS-1;
				603	}
				604	tw->tw_ttd = jiffies + timeo;
				/* Slot is relative to the one the slow reaper handles next. */
				605	slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
				606	list = &tcp_tw_death_row[slot];
				607	} else {
				608	tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
				609
				610	if (tcp_twcal_hand < 0) {
				/* Calendar was idle: start it at hand 0, so slot needs no offset. */
				611	tcp_twcal_hand = 0;
				612	tcp_twcal_jiffie = jiffies;
				613	tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
				614	add_timer(&tcp_twcal_timer);
				615	} else {
				/* Pull the calendar timer forward if this bucket expires earlier. */
				616	if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
				617	mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
				618	slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
				619	}
				620	list = &tcp_twcal_row[slot];
				621	}
				622
				623	hlist_add_head(&tw->tw_death_node, list);
				624
				/* First scheduled bucket: start the slow reaper timer. */
				625	if (tcp_tw_count++ == 0)
				626	mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
				627	spin_unlock(&tw_death_lock);
				628	}
				629
				630	void tcp_twcal_tick(unsigned long dummy)
				631	{
				632	int n, slot;
				633	unsigned long j;
				634	unsigned long now = jiffies;
				635	int killed = 0;
				636	int adv = 0;
				637
				638	spin_lock(&tw_death_lock);
				639	if (tcp_twcal_hand < 0)
				640	goto out;
				641
				642	slot = tcp_twcal_hand;
				643	j = tcp_twcal_jiffie;
				644
				645	for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
				646	if (time_before_eq(j, now)) {
				647	struct hlist_node node, safe;
				648	struct tcp_tw_bucket *tw;
				649
				650	tw_for_each_inmate_safe(tw, node, safe,
				651	&tcp_twcal_row[slot]) {
				652	__tw_del_dead_node(tw);
				653	tcp_timewait_kill(tw);
				654	tcp_tw_put(tw);
				655	killed++;
				656	}
				657	} else {
				658	if (!adv) {
				659	adv = 1;
				660	tcp_twcal_jiffie = j;
				661	tcp_twcal_hand = slot;
				662	}
				663
				664	if (!hlist_empty(&tcp_twcal_row[slot])) {
				665	mod_timer(&tcp_twcal_timer, j);
				666	goto out;
				667	}
				668	}
				669	j += (1<<TCP_TW_RECYCLE_TICK);
				670	slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
				671	}
				672	tcp_twcal_hand = -1;
				673
				674	out:
				675	if ((tcp_tw_count -= killed) == 0)
				676	del_timer(&tcp_tw_timer);
				677	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
				678	spin_unlock(&tw_death_lock);
				679	}
				680
				681	/* This is not only more efficient than what we used to do, it eliminates
				682	* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
				683	*
				684	* Actually, we could save lots of memory writes here. tp of listening
				685	* socket contains all necessary default parameters.
				686	*/
				687	struct sock tcp_create_openreq_child(struct sock sk, struct open_request req, struct sk_buff skb)
				688	{
				689	/* allocate the newsk from the same slab of the master sock,
				690	* if not, at sk_free time we'll try to free it from the wrong
				691	* slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */
				692	struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
				693
				694	if(newsk != NULL) {
				695	struct tcp_sock *newtp;
				696	struct sk_filter *filter;
				697
				698	memcpy(newsk, sk, sizeof(struct tcp_sock));
				699	newsk->sk_state = TCP_SYN_RECV;
				700
				701	/* SANITY */
				702	sk_node_init(&newsk->sk_node);
				703	tcp_sk(newsk)->bind_hash = NULL;
				704
				705	/* Clone the TCP header template */
				706	inet_sk(newsk)->dport = req->rmt_port;
				707
				708	sock_lock_init(newsk);
				709	bh_lock_sock(newsk);
				710
				711	rwlock_init(&newsk->sk_dst_lock);
				712	atomic_set(&newsk->sk_rmem_alloc, 0);
				713	skb_queue_head_init(&newsk->sk_receive_queue);
				714	atomic_set(&newsk->sk_wmem_alloc, 0);
				715	skb_queue_head_init(&newsk->sk_write_queue);
				716	atomic_set(&newsk->sk_omem_alloc, 0);
				717	newsk->sk_wmem_queued = 0;
				718	newsk->sk_forward_alloc = 0;
				719
				720	sock_reset_flag(newsk, SOCK_DONE);
				721	newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
				722	newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
				723	newsk->sk_send_head = NULL;
				724	rwlock_init(&newsk->sk_callback_lock);
				725	skb_queue_head_init(&newsk->sk_error_queue);
				726	newsk->sk_write_space = sk_stream_write_space;
				727
				728	if ((filter = newsk->sk_filter) != NULL)
				729	sk_filter_charge(newsk, filter);
				730
				731	if (unlikely(xfrm_sk_clone_policy(newsk))) {
				732	/* It is still raw copy of parent, so invalidate
				733	* destructor and make plain sk_free() */
				734	newsk->sk_destruct = NULL;
				735	sk_free(newsk);
				736	return NULL;
				737	}
				738
				739	/* Now setup tcp_sock */
				740	newtp = tcp_sk(newsk);
				741	newtp->pred_flags = 0;
				742	newtp->rcv_nxt = req->rcv_isn + 1;
				743	newtp->snd_nxt = req->snt_isn + 1;
				744	newtp->snd_una = req->snt_isn + 1;
				745	newtp->snd_sml = req->snt_isn + 1;
				746
				747	tcp_prequeue_init(newtp);
				748
				749	tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);
				750
				751	newtp->retransmits = 0;
				752	newtp->backoff = 0;
				753	newtp->srtt = 0;
				754	newtp->mdev = TCP_TIMEOUT_INIT;
				755	newtp->rto = TCP_TIMEOUT_INIT;
				756
				757	newtp->packets_out = 0;
				758	newtp->left_out = 0;
				759	newtp->retrans_out = 0;
				760	newtp->sacked_out = 0;
				761	newtp->fackets_out = 0;
				762	newtp->snd_ssthresh = 0x7fffffff;
				763
				764	/* So many TCP implementations out there (incorrectly) count the
				765	* initial SYN frame in their delayed-ACK and congestion control
				766	* algorithms that we must have the following bandaid to talk
				767	* efficiently to them. -DaveM
				768	*/
				769	newtp->snd_cwnd = 2;
				770	newtp->snd_cwnd_cnt = 0;
				771
				772	newtp->frto_counter = 0;
				773	newtp->frto_highmark = 0;
				774
				775	tcp_set_ca_state(newtp, TCP_CA_Open);
				776	tcp_init_xmit_timers(newsk);
				777	skb_queue_head_init(&newtp->out_of_order_queue);
				778	newtp->rcv_wup = req->rcv_isn + 1;
				779	newtp->write_seq = req->snt_isn + 1;
				780	newtp->pushed_seq = newtp->write_seq;
				781	newtp->copied_seq = req->rcv_isn + 1;
				782
				783	newtp->rx_opt.saw_tstamp = 0;
				784
				785	newtp->rx_opt.dsack = 0;
				786	newtp->rx_opt.eff_sacks = 0;
				787
				788	newtp->probes_out = 0;
				789	newtp->rx_opt.num_sacks = 0;
				790	newtp->urg_data = 0;
				791	newtp->listen_opt = NULL;
				792	newtp->accept_queue = newtp->accept_queue_tail = NULL;
				793	/* Deinitialize syn_wait_lock to trap illegal accesses. */
				794	memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
				795
				796	/* Back to base struct sock members. */
				797	newsk->sk_err = 0;
				798	newsk->sk_priority = 0;
				799	atomic_set(&newsk->sk_refcnt, 2);
				800	#ifdef INET_REFCNT_DEBUG
				801	atomic_inc(&inet_sock_nr);
				802	#endif
				803	atomic_inc(&tcp_sockets_allocated);
				804
				805	if (sock_flag(newsk, SOCK_KEEPOPEN))
				806	tcp_reset_keepalive_timer(newsk,
				807	keepalive_time_when(newtp));
				808	newsk->sk_socket = NULL;
				809	newsk->sk_sleep = NULL;
				810
				811	newtp->rx_opt.tstamp_ok = req->tstamp_ok;
				812	if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) {
				813	if (sysctl_tcp_fack)
				814	newtp->rx_opt.sack_ok \|= 2;
				815	}
				816	newtp->window_clamp = req->window_clamp;
				817	newtp->rcv_ssthresh = req->rcv_wnd;
				818	newtp->rcv_wnd = req->rcv_wnd;
				819	newtp->rx_opt.wscale_ok = req->wscale_ok;
				820	if (newtp->rx_opt.wscale_ok) {
				821	newtp->rx_opt.snd_wscale = req->snd_wscale;
				822	newtp->rx_opt.rcv_wscale = req->rcv_wscale;
				823	} else {
				824	newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
				825	newtp->window_clamp = min(newtp->window_clamp, 65535U);
				826	}
				827	newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
				828	newtp->max_window = newtp->snd_wnd;
				829
				830	if (newtp->rx_opt.tstamp_ok) {
				831	newtp->rx_opt.ts_recent = req->ts_recent;
				832	newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
				833	newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				834	} else {
				835	newtp->rx_opt.ts_recent_stamp = 0;
				836	newtp->tcp_header_len = sizeof(struct tcphdr);
				837	}
				838	if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
				839	newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
				840	newtp->rx_opt.mss_clamp = req->mss;
				841	TCP_ECN_openreq_child(newtp, req);
				842	if (newtp->ecn_flags&TCP_ECN_OK)
				843	sock_set_flag(newsk, SOCK_NO_LARGESEND);
				844
				845	tcp_ca_init(newtp);
				846
				847	TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
				848	}
				849	return newsk;
				850	}
				851
				852	/*
				853	* Process an incoming packet for SYN_RECV sockets represented
				854	* as an open_request.
				855	*/
				856
				857	struct sock tcp_check_req(struct sock sk,struct sk_buff *skb,
				858	struct open_request *req,
				859	struct open_request **prev)
				860	{
				861	struct tcphdr *th = skb->h.th;
				862	struct tcp_sock *tp = tcp_sk(sk);
				863	u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST\|TCP_FLAG_SYN\|TCP_FLAG_ACK);
				864	int paws_reject = 0;
				865	struct tcp_options_received tmp_opt;
				866	struct sock *child;
				867
				868	tmp_opt.saw_tstamp = 0;
				869	if (th->doff > (sizeof(struct tcphdr)>>2)) {
				870	tcp_parse_options(skb, &tmp_opt, 0);
				871
				872	if (tmp_opt.saw_tstamp) {
				873	tmp_opt.ts_recent = req->ts_recent;
				874	/* We do not store true stamp, but it is not required,
				875	* it can be estimated (approximately)
				876	* from another data.
				877	*/
				878	tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
				879	paws_reject = tcp_paws_check(&tmp_opt, th->rst);
				880	}
				881	}
				882
				883	/* Check for pure retransmitted SYN. */
				884	if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
				885	flg == TCP_FLAG_SYN &&
				886	!paws_reject) {
				887	/*
				888	* RFC793 draws (Incorrectly! It was fixed in RFC1122)
				889	* this case on figure 6 and figure 8, but formal
				890	* protocol description says NOTHING.
				891	* To be more exact, it says that we should send ACK,
				892	* because this segment (at least, if it has no data)
				893	* is out of window.
				894	*
				895	* CONCLUSION: RFC793 (even with RFC1122) DOES NOT
				896	* describe SYN-RECV state. All the description
				897	* is wrong, we cannot believe to it and should
				898	* rely only on common sense and implementation
				899	* experience.
				900	*
				901	* Enforce "SYN-ACK" according to figure 8, figure 6
				902	* of RFC793, fixed by RFC1122.
				903	*/
				904	req->class->rtx_syn_ack(sk, req, NULL);
				905	return NULL;
				906	}
				907
				908	/* Further reproduces section "SEGMENT ARRIVES"
				909	for state SYN-RECEIVED of RFC793.
				910	It is broken, however, it does not work only
				911	when SYNs are crossed.
				912
				913	You would think that SYN crossing is impossible here, since
				914	we should have a SYN_SENT socket (from connect()) on our end,
				915	but this is not true if the crossed SYNs were sent to both
				916	ends by a malicious third party. We must defend against this,
				917	and to do that we first verify the ACK (as per RFC793, page
				918	36) and reset if it is invalid. Is this a true full defense?
				919	To convince ourselves, let us consider a way in which the ACK
				920	test can still pass in this 'malicious crossed SYNs' case.
				921	Malicious sender sends identical SYNs (and thus identical sequence
				922	numbers) to both A and B:
				923
				924	A: gets SYN, seq=7
				925	B: gets SYN, seq=7
				926
				927	By our good fortune, both A and B select the same initial
				928	send sequence number of seven :-)
				929
				930	A: sends SYN\|ACK, seq=7, ack_seq=8
				931	B: sends SYN\|ACK, seq=7, ack_seq=8
				932
				933	So we are now A eating this SYN\|ACK, ACK test passes. So
				934	does sequence test, SYN is truncated, and thus we consider
				935	it a bare ACK.
				936
				937	If tp->defer_accept, we silently drop this bare ACK. Otherwise,
				938	we create an established connection. Both ends (listening sockets)
				939	accept the new incoming connection and try to talk to each other. 8-)
				940
				941	Note: This case is both harmless, and rare. Possibility is about the
				942	same as us discovering intelligent life on another plant tomorrow.
				943
				944	But generally, we should (RFC lies!) to accept ACK
				945	from SYNACK both here and in tcp_rcv_state_process().
				946	tcp_rcv_state_process() does not, hence, we do not too.
				947
				948	Note that the case is absolutely generic:
				949	we cannot optimize anything here without
				950	violating protocol. All the checks must be made
				951	before attempt to create socket.
				952	*/
				953
				954	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
				955	* and the incoming segment acknowledges something not yet
				956	* sent (the segment carries an unaccaptable ACK) ...
				957	* a reset is sent."
				958	*
				959	* Invalid ACK: reset will be sent by listening socket
				960	*/
				961	if ((flg & TCP_FLAG_ACK) &&
				962	(TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1))
				963	return sk;
				964
				965	/* Also, it would be not so bad idea to check rcv_tsecr, which
				966	* is essentially ACK extension and too early or too late values
				967	* should cause reset in unsynchronized states.
				968	*/
				969
				970	/* RFC793: "first check sequence number". */
				971
				972	if (paws_reject \|\| !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				973	req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
				974	/* Out of window: send ACK and drop. */
				975	if (!(flg & TCP_FLAG_RST))
				976	req->class->send_ack(skb, req);
				977	if (paws_reject)
				978	NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
				979	return NULL;
				980	}
				981
				982	/* In sequence, PAWS is OK. */
				983
				984	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
				985	req->ts_recent = tmp_opt.rcv_tsval;
				986
				987	if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
				988	/* Truncate SYN, it is out of window starting
				989	at req->rcv_isn+1. */
				990	flg &= ~TCP_FLAG_SYN;
				991	}
				992
				993	/* RFC793: "second check the RST bit" and
				994	* "fourth, check the SYN bit"
				995	*/
				996	if (flg & (TCP_FLAG_RST\|TCP_FLAG_SYN))
				997	goto embryonic_reset;
				998
				999	/* ACK sequence verified above, just make sure ACK is
				1000	* set. If ACK not set, just silently drop the packet.
				1001	*/
				1002	if (!(flg & TCP_FLAG_ACK))
				1003	return NULL;
				1004
				1005	/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
				1006	if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
				1007	req->acked = 1;
				1008	return NULL;
				1009	}
				1010
				1011	/* OK, ACK is valid, create big socket and
				1012	* feed this segment to it. It will repeat all
				1013	* the tests. THIS SEGMENT MUST MOVE SOCKET TO
				1014	* ESTABLISHED STATE. If it will be dropped after
				1015	* socket is created, wait for troubles.
				1016	*/
				1017	child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
				1018	if (child == NULL)
				1019	goto listen_overflow;
				1020
				1021	tcp_synq_unlink(tp, req, prev);
				1022	tcp_synq_removed(sk, req);
				1023
				1024	tcp_acceptq_queue(sk, req, child);
				1025	return child;
				1026
				1027	listen_overflow:
				1028	if (!sysctl_tcp_abort_on_overflow) {
				1029	req->acked = 1;
				1030	return NULL;
				1031	}
				1032
				1033	embryonic_reset:
				1034	NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
				1035	if (!(flg & TCP_FLAG_RST))
				1036	req->class->send_reset(skb);
				1037
				1038	tcp_synq_drop(sk, req, prev);
				1039	return NULL;
				1040	}
				1041
				1042	/*
				1043	* Queue segment on the new socket if the new socket is active,
				1044	* otherwise we just shortcircuit this and continue with
				1045	* the new socket.
				1046	*/
				1047
				1048	int tcp_child_process(struct sock parent, struct sock child,
				1049	struct sk_buff *skb)
				1050	{
				1051	int ret = 0;
				1052	int state = child->sk_state;
				1053
				1054	if (!sock_owned_by_user(child)) {
				1055	ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
				1056
				1057	/* Wakeup parent, send SIGIO */
				1058	if (state == TCP_SYN_RECV && child->sk_state != state)
				1059	parent->sk_data_ready(parent, 0);
				1060	} else {
				1061	/* Alas, it is possible again, because we do lookup
				1062	* in main socket hash table and lock on listening
				1063	* socket does not protect us more.
				1064	*/
				1065	sk_add_backlog(child, skb);
				1066	}
				1067
				1068	bh_unlock_sock(child);
				1069	sock_put(child);
				1070	return ret;
				1071	}
				1072
				/* Export the minisocket entry points. tcp_check_req and
				 * tcp_child_process are defined above; the remaining symbols are
				 * presumably defined earlier in this file (not visible here).
				 */
				1073	EXPORT_SYMBOL(tcp_check_req);
				1074	EXPORT_SYMBOL(tcp_child_process);
				1075	EXPORT_SYMBOL(tcp_create_openreq_child);
				1076	EXPORT_SYMBOL(tcp_timewait_state_process);
				1077	EXPORT_SYMBOL(tcp_tw_deschedule);