/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */
|  | 15 |  | 
|  | 16 | #include <linux/config.h> | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame] | 17 | #include <linux/module.h> | 
| Arnaldo Carvalho de Melo | a7f5e7f | 2005-12-13 23:25:31 -0800 | [diff] [blame] | 18 | #include <linux/random.h> | 
| Arnaldo Carvalho de Melo | f3f05f7 | 2005-08-09 20:08:09 -0700 | [diff] [blame] | 19 | #include <linux/sched.h> | 
| Arnaldo Carvalho de Melo | 77d8bf9 | 2005-08-09 20:00:51 -0700 | [diff] [blame] | 20 | #include <linux/slab.h> | 
| Arnaldo Carvalho de Melo | f3f05f7 | 2005-08-09 20:08:09 -0700 | [diff] [blame] | 21 | #include <linux/wait.h> | 
| Arnaldo Carvalho de Melo | 77d8bf9 | 2005-08-09 20:00:51 -0700 | [diff] [blame] | 22 |  | 
| Arnaldo Carvalho de Melo | 463c84b | 2005-08-09 20:10:42 -0700 | [diff] [blame] | 23 | #include <net/inet_connection_sock.h> | 
| Arnaldo Carvalho de Melo | 77d8bf9 | 2005-08-09 20:00:51 -0700 | [diff] [blame] | 24 | #include <net/inet_hashtables.h> | 
| Arnaldo Carvalho de Melo | a7f5e7f | 2005-12-13 23:25:31 -0800 | [diff] [blame] | 25 | #include <net/ip.h> | 
| Arnaldo Carvalho de Melo | 77d8bf9 | 2005-08-09 20:00:51 -0700 | [diff] [blame] | 26 |  | 
|  | 27 | /* | 
|  | 28 | * Allocate and initialize a new local port bind bucket. | 
|  | 29 | * The bindhash mutex for snum's hash chain must be held here. | 
|  | 30 | */ | 
|  | 31 | struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, | 
|  | 32 | struct inet_bind_hashbucket *head, | 
|  | 33 | const unsigned short snum) | 
|  | 34 | { | 
|  | 35 | struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); | 
|  | 36 |  | 
|  | 37 | if (tb != NULL) { | 
|  | 38 | tb->port      = snum; | 
|  | 39 | tb->fastreuse = 0; | 
|  | 40 | INIT_HLIST_HEAD(&tb->owners); | 
|  | 41 | hlist_add_head(&tb->node, &head->chain); | 
|  | 42 | } | 
|  | 43 | return tb; | 
|  | 44 | } | 
|  | 45 |  | 
|  | 46 | EXPORT_SYMBOL(inet_bind_bucket_create); | 
|  | 47 |  | 
|  | 48 | /* | 
|  | 49 | * Caller must hold hashbucket lock for this tb with local BH disabled | 
|  | 50 | */ | 
|  | 51 | void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) | 
|  | 52 | { | 
|  | 53 | if (hlist_empty(&tb->owners)) { | 
|  | 54 | __hlist_del(&tb->node); | 
|  | 55 | kmem_cache_free(cachep, tb); | 
|  | 56 | } | 
|  | 57 | } | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame] | 58 |  | 
|  | 59 | void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, | 
|  | 60 | const unsigned short snum) | 
|  | 61 | { | 
| Arnaldo Carvalho de Melo | 463c84b | 2005-08-09 20:10:42 -0700 | [diff] [blame] | 62 | inet_sk(sk)->num = snum; | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame] | 63 | sk_add_bind_node(sk, &tb->owners); | 
| Arnaldo Carvalho de Melo | 463c84b | 2005-08-09 20:10:42 -0700 | [diff] [blame] | 64 | inet_csk(sk)->icsk_bind_hash = tb; | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame] | 65 | } | 
|  | 66 |  | 
|  | 67 | EXPORT_SYMBOL(inet_bind_hash); | 
|  | 68 |  | 
|  | 69 | /* | 
|  | 70 | * Get rid of any references to a local port held by the given sock. | 
|  | 71 | */ | 
|  | 72 | static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) | 
|  | 73 | { | 
| Arnaldo Carvalho de Melo | 463c84b | 2005-08-09 20:10:42 -0700 | [diff] [blame] | 74 | const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame] | 75 | struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; | 
|  | 76 | struct inet_bind_bucket *tb; | 
|  | 77 |  | 
|  | 78 | spin_lock(&head->lock); | 
| Arnaldo Carvalho de Melo | 463c84b | 2005-08-09 20:10:42 -0700 | [diff] [blame] | 79 | tb = inet_csk(sk)->icsk_bind_hash; | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame] | 80 | __sk_del_bind_node(sk); | 
| Arnaldo Carvalho de Melo | 463c84b | 2005-08-09 20:10:42 -0700 | [diff] [blame] | 81 | inet_csk(sk)->icsk_bind_hash = NULL; | 
|  | 82 | inet_sk(sk)->num = 0; | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame] | 83 | inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); | 
|  | 84 | spin_unlock(&head->lock); | 
|  | 85 | } | 
|  | 86 |  | 
/*
 * Release the local port held by @sk.  Bottom halves are disabled for
 * the duration of __inet_put_port(), which takes the bind hash chain
 * lock with a plain spin_lock().
 */
void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(hashinfo, sk);
	local_bh_enable();
}

EXPORT_SYMBOL(inet_put_port);
| Arnaldo Carvalho de Melo | f3f05f7 | 2005-08-09 20:08:09 -0700 | [diff] [blame] | 95 |  | 
|  | 96 | /* | 
|  | 97 | * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. | 
|  | 98 | * Look, when several writers sleep and reader wakes them up, all but one | 
|  | 99 | * immediately hit write lock and grab all the cpus. Exclusive sleep solves | 
|  | 100 | * this, _but_ remember, it adds useless work on UP machines (wake up each | 
|  | 101 | * exclusive lock release). It should be ifdefed really. | 
|  | 102 | */ | 
|  | 103 | void inet_listen_wlock(struct inet_hashinfo *hashinfo) | 
|  | 104 | { | 
|  | 105 | write_lock(&hashinfo->lhash_lock); | 
|  | 106 |  | 
|  | 107 | if (atomic_read(&hashinfo->lhash_users)) { | 
|  | 108 | DEFINE_WAIT(wait); | 
|  | 109 |  | 
|  | 110 | for (;;) { | 
|  | 111 | prepare_to_wait_exclusive(&hashinfo->lhash_wait, | 
|  | 112 | &wait, TASK_UNINTERRUPTIBLE); | 
|  | 113 | if (!atomic_read(&hashinfo->lhash_users)) | 
|  | 114 | break; | 
|  | 115 | write_unlock_bh(&hashinfo->lhash_lock); | 
|  | 116 | schedule(); | 
|  | 117 | write_lock_bh(&hashinfo->lhash_lock); | 
|  | 118 | } | 
|  | 119 |  | 
|  | 120 | finish_wait(&hashinfo->lhash_wait, &wait); | 
|  | 121 | } | 
|  | 122 | } | 
|  | 123 |  | 
|  | 124 | EXPORT_SYMBOL(inet_listen_wlock); | 
| Arnaldo Carvalho de Melo | 33b6223 | 2005-08-09 20:09:06 -0700 | [diff] [blame] | 125 |  | 
|  | 126 | /* | 
|  | 127 | * Don't inline this cruft. Here are some nice properties to exploit here. The | 
|  | 128 | * BSD API does not allow a listening sock to specify the remote port nor the | 
|  | 129 | * remote address for the connection. So always assume those are both | 
|  | 130 | * wildcarded during the search since they can never be otherwise. | 
|  | 131 | */ | 
|  | 132 | struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, | 
|  | 133 | const unsigned short hnum, const int dif) | 
|  | 134 | { | 
|  | 135 | struct sock *result = NULL, *sk; | 
|  | 136 | const struct hlist_node *node; | 
|  | 137 | int hiscore = -1; | 
|  | 138 |  | 
|  | 139 | sk_for_each(sk, node, head) { | 
|  | 140 | const struct inet_sock *inet = inet_sk(sk); | 
|  | 141 |  | 
|  | 142 | if (inet->num == hnum && !ipv6_only_sock(sk)) { | 
|  | 143 | const __u32 rcv_saddr = inet->rcv_saddr; | 
|  | 144 | int score = sk->sk_family == PF_INET ? 1 : 0; | 
|  | 145 |  | 
|  | 146 | if (rcv_saddr) { | 
|  | 147 | if (rcv_saddr != daddr) | 
|  | 148 | continue; | 
|  | 149 | score += 2; | 
|  | 150 | } | 
|  | 151 | if (sk->sk_bound_dev_if) { | 
|  | 152 | if (sk->sk_bound_dev_if != dif) | 
|  | 153 | continue; | 
|  | 154 | score += 2; | 
|  | 155 | } | 
|  | 156 | if (score == 5) | 
|  | 157 | return sk; | 
|  | 158 | if (score > hiscore) { | 
|  | 159 | hiscore	= score; | 
|  | 160 | result	= sk; | 
|  | 161 | } | 
|  | 162 | } | 
|  | 163 | } | 
|  | 164 | return result; | 
|  | 165 | } | 
| Arnaldo Carvalho de Melo | e48c414 | 2005-08-09 20:09:46 -0700 | [diff] [blame] | 166 |  | 
|  | 167 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); | 
| Arnaldo Carvalho de Melo | a7f5e7f | 2005-12-13 23:25:31 -0800 | [diff] [blame] | 168 |  | 
|  | 169 | /* called with local bh disabled */ | 
|  | 170 | static int __inet_check_established(struct inet_timewait_death_row *death_row, | 
|  | 171 | struct sock *sk, __u16 lport, | 
|  | 172 | struct inet_timewait_sock **twp) | 
|  | 173 | { | 
|  | 174 | struct inet_hashinfo *hinfo = death_row->hashinfo; | 
|  | 175 | struct inet_sock *inet = inet_sk(sk); | 
|  | 176 | u32 daddr = inet->rcv_saddr; | 
|  | 177 | u32 saddr = inet->daddr; | 
|  | 178 | int dif = sk->sk_bound_dev_if; | 
|  | 179 | INET_ADDR_COOKIE(acookie, saddr, daddr) | 
|  | 180 | const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); | 
|  | 181 | unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); | 
|  | 182 | struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); | 
|  | 183 | struct sock *sk2; | 
|  | 184 | const struct hlist_node *node; | 
|  | 185 | struct inet_timewait_sock *tw; | 
|  | 186 |  | 
|  | 187 | prefetch(head->chain.first); | 
|  | 188 | write_lock(&head->lock); | 
|  | 189 |  | 
|  | 190 | /* Check TIME-WAIT sockets first. */ | 
|  | 191 | sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { | 
|  | 192 | tw = inet_twsk(sk2); | 
|  | 193 |  | 
|  | 194 | if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { | 
|  | 195 | if (twsk_unique(sk, sk2, twp)) | 
|  | 196 | goto unique; | 
|  | 197 | else | 
|  | 198 | goto not_unique; | 
|  | 199 | } | 
|  | 200 | } | 
|  | 201 | tw = NULL; | 
|  | 202 |  | 
|  | 203 | /* And established part... */ | 
|  | 204 | sk_for_each(sk2, node, &head->chain) { | 
|  | 205 | if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) | 
|  | 206 | goto not_unique; | 
|  | 207 | } | 
|  | 208 |  | 
|  | 209 | unique: | 
|  | 210 | /* Must record num and sport now. Otherwise we will see | 
|  | 211 | * in hash table socket with a funny identity. */ | 
|  | 212 | inet->num = lport; | 
|  | 213 | inet->sport = htons(lport); | 
|  | 214 | sk->sk_hash = hash; | 
|  | 215 | BUG_TRAP(sk_unhashed(sk)); | 
|  | 216 | __sk_add_node(sk, &head->chain); | 
|  | 217 | sock_prot_inc_use(sk->sk_prot); | 
|  | 218 | write_unlock(&head->lock); | 
|  | 219 |  | 
|  | 220 | if (twp) { | 
|  | 221 | *twp = tw; | 
|  | 222 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | 
|  | 223 | } else if (tw) { | 
|  | 224 | /* Silly. Should hash-dance instead... */ | 
|  | 225 | inet_twsk_deschedule(tw, death_row); | 
|  | 226 | NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); | 
|  | 227 |  | 
|  | 228 | inet_twsk_put(tw); | 
|  | 229 | } | 
|  | 230 |  | 
|  | 231 | return 0; | 
|  | 232 |  | 
|  | 233 | not_unique: | 
|  | 234 | write_unlock(&head->lock); | 
|  | 235 | return -EADDRNOTAVAIL; | 
|  | 236 | } | 
|  | 237 |  | 
|  | 238 | static inline u32 inet_sk_port_offset(const struct sock *sk) | 
|  | 239 | { | 
|  | 240 | const struct inet_sock *inet = inet_sk(sk); | 
|  | 241 | return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, | 
|  | 242 | inet->dport); | 
|  | 243 | } | 
|  | 244 |  | 
|  | 245 | /* | 
|  | 246 | * Bind a port for a connect operation and hash it. | 
|  | 247 | */ | 
|  | 248 | int inet_hash_connect(struct inet_timewait_death_row *death_row, | 
|  | 249 | struct sock *sk) | 
|  | 250 | { | 
|  | 251 | struct inet_hashinfo *hinfo = death_row->hashinfo; | 
|  | 252 | const unsigned short snum = inet_sk(sk)->num; | 
|  | 253 | struct inet_bind_hashbucket *head; | 
|  | 254 | struct inet_bind_bucket *tb; | 
|  | 255 | int ret; | 
|  | 256 |  | 
|  | 257 | if (!snum) { | 
|  | 258 | int low = sysctl_local_port_range[0]; | 
|  | 259 | int high = sysctl_local_port_range[1]; | 
|  | 260 | int range = high - low; | 
|  | 261 | int i; | 
|  | 262 | int port; | 
|  | 263 | static u32 hint; | 
|  | 264 | u32 offset = hint + inet_sk_port_offset(sk); | 
|  | 265 | struct hlist_node *node; | 
|  | 266 | struct inet_timewait_sock *tw = NULL; | 
|  | 267 |  | 
|  | 268 | local_bh_disable(); | 
|  | 269 | for (i = 1; i <= range; i++) { | 
|  | 270 | port = low + (i + offset) % range; | 
|  | 271 | head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; | 
|  | 272 | spin_lock(&head->lock); | 
|  | 273 |  | 
|  | 274 | /* Does not bother with rcv_saddr checks, | 
|  | 275 | * because the established check is already | 
|  | 276 | * unique enough. | 
|  | 277 | */ | 
|  | 278 | inet_bind_bucket_for_each(tb, node, &head->chain) { | 
|  | 279 | if (tb->port == port) { | 
|  | 280 | BUG_TRAP(!hlist_empty(&tb->owners)); | 
|  | 281 | if (tb->fastreuse >= 0) | 
|  | 282 | goto next_port; | 
|  | 283 | if (!__inet_check_established(death_row, | 
|  | 284 | sk, port, | 
|  | 285 | &tw)) | 
|  | 286 | goto ok; | 
|  | 287 | goto next_port; | 
|  | 288 | } | 
|  | 289 | } | 
|  | 290 |  | 
|  | 291 | tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); | 
|  | 292 | if (!tb) { | 
|  | 293 | spin_unlock(&head->lock); | 
|  | 294 | break; | 
|  | 295 | } | 
|  | 296 | tb->fastreuse = -1; | 
|  | 297 | goto ok; | 
|  | 298 |  | 
|  | 299 | next_port: | 
|  | 300 | spin_unlock(&head->lock); | 
|  | 301 | } | 
|  | 302 | local_bh_enable(); | 
|  | 303 |  | 
|  | 304 | return -EADDRNOTAVAIL; | 
|  | 305 |  | 
|  | 306 | ok: | 
|  | 307 | hint += i; | 
|  | 308 |  | 
|  | 309 | /* Head lock still held and bh's disabled */ | 
|  | 310 | inet_bind_hash(sk, tb, port); | 
|  | 311 | if (sk_unhashed(sk)) { | 
|  | 312 | inet_sk(sk)->sport = htons(port); | 
|  | 313 | __inet_hash(hinfo, sk, 0); | 
|  | 314 | } | 
|  | 315 | spin_unlock(&head->lock); | 
|  | 316 |  | 
|  | 317 | if (tw) { | 
|  | 318 | inet_twsk_deschedule(tw, death_row);; | 
|  | 319 | inet_twsk_put(tw); | 
|  | 320 | } | 
|  | 321 |  | 
|  | 322 | ret = 0; | 
|  | 323 | goto out; | 
|  | 324 | } | 
|  | 325 |  | 
|  | 326 | head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; | 
|  | 327 | tb  = inet_csk(sk)->icsk_bind_hash; | 
|  | 328 | spin_lock_bh(&head->lock); | 
|  | 329 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { | 
|  | 330 | __inet_hash(hinfo, sk, 0); | 
|  | 331 | spin_unlock_bh(&head->lock); | 
|  | 332 | return 0; | 
|  | 333 | } else { | 
|  | 334 | spin_unlock(&head->lock); | 
|  | 335 | /* No definite answer... Walk to established hash table */ | 
|  | 336 | ret = __inet_check_established(death_row, sk, snum, NULL); | 
|  | 337 | out: | 
|  | 338 | local_bh_enable(); | 
|  | 339 | return ret; | 
|  | 340 | } | 
|  | 341 | } | 
|  | 342 |  | 
|  | 343 | EXPORT_SYMBOL_GPL(inet_hash_connect); |