/*
 * linux/net/sunrpc/svcsock.c
 *
 * These are the RPC server socket internals.
 *
 * The server scheduling algorithm does not always distribute the load
 * evenly when servicing a single client. May need to modify the
 * svc_sock_enqueue procedure...
 *
 * TCP support is largely untested and may be a little slow. The problem
 * is that we currently do two separate recvfrom's, one for the 4-byte
 * record length, and the second for the actual record. This could possibly
 * be improved by always reading a minimum size of around 100 bytes and
 * tucking any superfluous bytes away in a temporary store. Still, that
 * leaves write requests out in the rain. An alternative may be to peek at
 * the first skb in the queue, and if it matches the next TCP sequence
 * number, to extract the record marker. Yuck.
 *
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */

#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/udp.h>
#include <linux/tcp.h>
#include <linux/unistd.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>

#include <linux/sunrpc/types.h>
#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/svcsock.h>
#include <linux/sunrpc/stats.h>
/* SMP locking strategy:
 *
 *	svc_serv->sv_lock protects most of the fields for that service.
 *
 *	Some flags can be set to certain values at any time
 *	providing that certain rules are followed:
 *
 *	SK_BUSY  can be set to 0 at any time.
 *		svc_sock_enqueue must be called afterwards
 *	SK_CONN, SK_DATA can be set or cleared at any time.
 *		after a set, svc_sock_enqueue must be called.
 *		after a clear, the socket must be read/accepted;
 *		 if this succeeds, it must be set again.
 *	SK_CLOSE can be set at any time. It is never cleared.
 *
 */

#define RPCDBG_FACILITY	RPCDBG_SVCSOCK


static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
					 int *errp, int pmap_reg);
static void		svc_udp_data_ready(struct sock *, int);
static int		svc_udp_recvfrom(struct svc_rqst *);
static int		svc_udp_sendto(struct svc_rqst *);

static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
static int svc_deferred_recv(struct svc_rqst *rqstp);
static struct cache_deferred_req *svc_defer(struct cache_req *req);

/*
 * Queue up an idle server thread.  Must have serv->sv_lock held.
 * Note: this is really a stack rather than a queue, so that we only
 * use as many different threads as we need, and the rest don't pollute
 * the cache.
 */
static inline void
svc_serv_enqueue(struct svc_serv *serv, struct svc_rqst *rqstp)
{
	list_add(&rqstp->rq_list, &serv->sv_threads);
}

/*
 * Dequeue an nfsd thread.  Must have serv->sv_lock held.
 */
static inline void
svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp)
{
	list_del(&rqstp->rq_list);
}

/*
 * Release an skbuff after use
 */
static inline void
svc_release_skb(struct svc_rqst *rqstp)
{
	struct sk_buff *skb = rqstp->rq_skbuff;
	struct svc_deferred_req *dr = rqstp->rq_deferred;

	if (skb) {
		rqstp->rq_skbuff = NULL;

		dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
		skb_free_datagram(rqstp->rq_sock->sk_sk, skb);
	}
	if (dr) {
		rqstp->rq_deferred = NULL;
		kfree(dr);
	}
}

/*
 * Any space to write?
 */
static inline unsigned long
svc_sock_wspace(struct svc_sock *svsk)
{
	int wspace;

	if (svsk->sk_sock->type == SOCK_STREAM)
		wspace = sk_stream_wspace(svsk->sk_sk);
	else
		wspace = sock_wspace(svsk->sk_sk);

	return wspace;
}

/*
 * Queue up a socket with data pending. If there are idle nfsd
 * processes, wake 'em up.
 *
 */
static void
svc_sock_enqueue(struct svc_sock *svsk)
{
	struct svc_serv	*serv = svsk->sk_server;
	struct svc_rqst	*rqstp;

	if (!(svsk->sk_flags &
	      ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
		return;
	if (test_bit(SK_DEAD, &svsk->sk_flags))
		return;

	spin_lock_bh(&serv->sv_lock);

	if (!list_empty(&serv->sv_threads) &&
	    !list_empty(&serv->sv_sockets))
		printk(KERN_ERR
			"svc_sock_enqueue: threads and sockets both waiting??\n");

	if (test_bit(SK_DEAD, &svsk->sk_flags)) {
		/* Don't enqueue dead sockets */
		dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk);
		goto out_unlock;
	}

	if (test_bit(SK_BUSY, &svsk->sk_flags)) {
		/* Don't enqueue socket while daemon is receiving */
		dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
		goto out_unlock;
	}

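	/* Heuristic, from the check below: the socket is only handed to
	 * a thread while the available write space exceeds twice the
	 * worst-case reply requirement (bytes already reserved plus one
	 * more full-sized request).  For example, with sv_bufsz = 4096
	 * and one request in flight (sk_reserved = 4096), enqueueing
	 * needs at least (4096 + 4096) * 2 = 16384 bytes of send space.
	 */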
	set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
	if (((svsk->sk_reserved + serv->sv_bufsz)*2
	     > svc_sock_wspace(svsk))
	    && !test_bit(SK_CLOSE, &svsk->sk_flags)
	    && !test_bit(SK_CONN, &svsk->sk_flags)) {
		/* Don't enqueue while not enough space for reply */
		dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
			svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz,
			svc_sock_wspace(svsk));
		goto out_unlock;
	}
	clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);

	/* Mark socket as busy. It will remain in this state until the
	 * server has processed all pending data and put the socket back
	 * on the idle list.
	 */
	set_bit(SK_BUSY, &svsk->sk_flags);

	if (!list_empty(&serv->sv_threads)) {
		rqstp = list_entry(serv->sv_threads.next,
				   struct svc_rqst,
				   rq_list);
		dprintk("svc: socket %p served by daemon %p\n",
			svsk->sk_sk, rqstp);
		svc_serv_dequeue(serv, rqstp);
		if (rqstp->rq_sock)
			printk(KERN_ERR
				"svc_sock_enqueue: server %p, rq_sock=%p!\n",
				rqstp, rqstp->rq_sock);
		rqstp->rq_sock = svsk;
		svsk->sk_inuse++;
		rqstp->rq_reserved = serv->sv_bufsz;
		svsk->sk_reserved += rqstp->rq_reserved;
		wake_up(&rqstp->rq_wait);
	} else {
		dprintk("svc: socket %p put into queue\n", svsk->sk_sk);
		list_add_tail(&svsk->sk_ready, &serv->sv_sockets);
	}

out_unlock:
	spin_unlock_bh(&serv->sv_lock);
}

/*
 * Dequeue the first socket.  Must be called with the serv->sv_lock held.
 */
static inline struct svc_sock *
svc_sock_dequeue(struct svc_serv *serv)
{
	struct svc_sock	*svsk;

	if (list_empty(&serv->sv_sockets))
		return NULL;

	svsk = list_entry(serv->sv_sockets.next,
			  struct svc_sock, sk_ready);
	list_del_init(&svsk->sk_ready);

	dprintk("svc: socket %p dequeued, inuse=%d\n",
		svsk->sk_sk, svsk->sk_inuse);

	return svsk;
}

/*
 * Having read something from a socket, check whether it
 * needs to be re-enqueued.
 * Note: SK_DATA only gets cleared when a read-attempt finds
 * no (or insufficient) data.
 */
static inline void
svc_sock_received(struct svc_sock *svsk)
{
	clear_bit(SK_BUSY, &svsk->sk_flags);
	svc_sock_enqueue(svsk);
}


/**
 * svc_reserve - change the space reserved for the reply to a request.
 * @rqstp:  The request in question
 * @space: new max space to reserve
 *
 * Each request reserves some space on the output queue of the socket
 * to make sure the reply fits.  This function reduces that reserved
 * space to be the amount of space used already, plus @space.
 *
 */
void svc_reserve(struct svc_rqst *rqstp, int space)
{
	space += rqstp->rq_res.head[0].iov_len;

	if (space < rqstp->rq_reserved) {
		struct svc_sock *svsk = rqstp->rq_sock;
		spin_lock_bh(&svsk->sk_server->sv_lock);
		svsk->sk_reserved -= (rqstp->rq_reserved - space);
		rqstp->rq_reserved = space;
		spin_unlock_bh(&svsk->sk_server->sv_lock);

		svc_sock_enqueue(svsk);
	}
}

/*
 * Release a socket after use.
 */
static inline void
svc_sock_put(struct svc_sock *svsk)
{
	struct svc_serv *serv = svsk->sk_server;

	spin_lock_bh(&serv->sv_lock);
	if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) {
		spin_unlock_bh(&serv->sv_lock);
		dprintk("svc: releasing dead socket\n");
		sock_release(svsk->sk_sock);
		kfree(svsk);
	}
	else
		spin_unlock_bh(&serv->sv_lock);
}

static void
svc_sock_release(struct svc_rqst *rqstp)
{
	struct svc_sock	*svsk = rqstp->rq_sock;

	svc_release_skb(rqstp);

	svc_free_allpages(rqstp);
	rqstp->rq_res.page_len = 0;
	rqstp->rq_res.page_base = 0;

	/* Reset response buffer and release
	 * the reservation.
	 * But first, check that enough space was reserved
	 * for the reply, otherwise we have a bug!
	 */
	if (rqstp->rq_res.len > rqstp->rq_reserved)
		printk(KERN_ERR "RPC request reserved %d but used %d\n",
		       rqstp->rq_reserved,
		       rqstp->rq_res.len);

	rqstp->rq_res.head[0].iov_len = 0;
	svc_reserve(rqstp, 0);
	rqstp->rq_sock = NULL;

	svc_sock_put(svsk);
}

/*
 * External function to wake up a server waiting for data
 */
void
svc_wake_up(struct svc_serv *serv)
{
	struct svc_rqst	*rqstp;

	spin_lock_bh(&serv->sv_lock);
	if (!list_empty(&serv->sv_threads)) {
		rqstp = list_entry(serv->sv_threads.next,
				   struct svc_rqst,
				   rq_list);
		dprintk("svc: daemon %p woken up.\n", rqstp);
		/*
		svc_serv_dequeue(serv, rqstp);
		rqstp->rq_sock = NULL;
		 */
		wake_up(&rqstp->rq_wait);
	}
	spin_unlock_bh(&serv->sv_lock);
}

/*
 * Generic sendto routine
 */
static int
svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
{
	struct svc_sock	*svsk = rqstp->rq_sock;
	struct socket	*sock = svsk->sk_sock;
	int		slen;
	char		buffer[CMSG_SPACE(sizeof(struct in_pktinfo))];
	struct cmsghdr *cmh = (struct cmsghdr *)buffer;
	struct in_pktinfo *pki = (struct in_pktinfo *)CMSG_DATA(cmh);
	int		len = 0;
	int		result;
	int		size;
	struct page	**ppage = xdr->pages;
	size_t		base = xdr->page_base;
	unsigned int	pglen = xdr->page_len;
	unsigned int	flags = MSG_MORE;

	slen = xdr->len;

	if (rqstp->rq_prot == IPPROTO_UDP) {
		/* set the source and destination */
		struct msghdr	msg;
		msg.msg_name    = &rqstp->rq_addr;
		msg.msg_namelen = sizeof(rqstp->rq_addr);
		msg.msg_iov     = NULL;
		msg.msg_iovlen  = 0;
		msg.msg_flags	= MSG_MORE;

		msg.msg_control = cmh;
		msg.msg_controllen = sizeof(buffer);
		cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
		cmh->cmsg_level = SOL_IP;
		cmh->cmsg_type = IP_PKTINFO;
		pki->ipi_ifindex = 0;
		pki->ipi_spec_dst.s_addr = rqstp->rq_daddr;

		if (sock_sendmsg(sock, &msg, 0) < 0)
			goto out;
	}

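	/* Each chunk below is pushed with MSG_MORE except the final one,
	 * which lets the transport coalesce head, page data and tail
	 * rather than flushing after every sendpage call.
	 */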
	/* send head */
	if (slen == xdr->head[0].iov_len)
		flags = 0;
	len = sock->ops->sendpage(sock, rqstp->rq_respages[0], 0, xdr->head[0].iov_len, flags);
	if (len != xdr->head[0].iov_len)
		goto out;
	slen -= xdr->head[0].iov_len;
	if (slen == 0)
		goto out;

	/* send page data */
	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
	while (pglen > 0) {
		if (slen == size)
			flags = 0;
		result = sock->ops->sendpage(sock, *ppage, base, size, flags);
		if (result > 0)
			len += result;
		if (result != size)
			goto out;
		slen -= size;
		pglen -= size;
		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
		base = 0;
		ppage++;
	}
	/* send tail */
	if (xdr->tail[0].iov_len) {
		result = sock->ops->sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage],
					     ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1),
					     xdr->tail[0].iov_len, 0);

		if (result > 0)
			len += result;
	}
out:
	dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %x)\n",
			rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len,
		rqstp->rq_addr.sin_addr.s_addr);

	return len;
}

/*
 * Check input queue length
 */
static int
svc_recv_available(struct svc_sock *svsk)
{
	mm_segment_t	oldfs;
	struct socket	*sock = svsk->sk_sock;
	int		avail, err;

	oldfs = get_fs(); set_fs(KERNEL_DS);
	err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail);
	set_fs(oldfs);

	return (err >= 0)? avail : err;
}

/*
 * Generic recvfrom routine.
 */
static int
svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen)
{
	struct msghdr	msg;
	struct socket	*sock;
	int		len, alen;

	rqstp->rq_addrlen = sizeof(rqstp->rq_addr);
	sock = rqstp->rq_sock->sk_sock;

	msg.msg_name    = &rqstp->rq_addr;
	msg.msg_namelen = sizeof(rqstp->rq_addr);
	msg.msg_control = NULL;
	msg.msg_controllen = 0;

	msg.msg_flags	= MSG_DONTWAIT;

	len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT);

	/* sock_recvmsg doesn't fill in the name/namelen, so we must
	 * do it ourselves.  Possibly we should cache this in the
	 * svc_sock structure at accept time.  FIXME
	 */
	alen = sizeof(rqstp->rq_addr);
	sock->ops->getname(sock, (struct sockaddr *)&rqstp->rq_addr, &alen, 1);

	dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
		rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, len);

	return len;
}

/*
 * Set socket snd and rcv buffer lengths
 */
static inline void
svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
{
#if 0
	mm_segment_t	oldfs;
	oldfs = get_fs(); set_fs(KERNEL_DS);
	sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
			(char*)&snd, sizeof(snd));
	sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
			(char*)&rcv, sizeof(rcv));
#else
	/* sock_setsockopt limits use to sysctl_?mem_max,
	 * which isn't acceptable.  Until that is made conditional
	 * on not having CAP_SYS_RESOURCE or similar, we go direct...
	 * DaveM said I could!
	 */
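	/* The doubling below appears to mirror what sock_setsockopt()
	 * itself does for SO_SNDBUF/SO_RCVBUF, where the requested value
	 * is doubled to leave headroom for sk_buff bookkeeping overhead.
	 */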
	lock_sock(sock->sk);
	sock->sk->sk_sndbuf = snd * 2;
	sock->sk->sk_rcvbuf = rcv * 2;
	sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
	release_sock(sock->sk);
#endif
}

/*
 * INET callback when data has been received on the socket.
 */
static void
svc_udp_data_ready(struct sock *sk, int count)
{
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;

	if (svsk) {
		dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
			svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags));
		set_bit(SK_DATA, &svsk->sk_flags);
		svc_sock_enqueue(svsk);
	}
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
}

/*
 * INET callback when space is newly available on the socket.
 */
static void
svc_write_space(struct sock *sk)
{
	struct svc_sock	*svsk = (struct svc_sock *)(sk->sk_user_data);

	if (svsk) {
		dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
			svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags));
		svc_sock_enqueue(svsk);
	}

	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
		dprintk("RPC svc_write_space: someone sleeping on %p\n",
		       svsk);
		wake_up_interruptible(sk->sk_sleep);
	}
}

/*
 * Receive a datagram from a UDP socket.
 */
static int
svc_udp_recvfrom(struct svc_rqst *rqstp)
{
	struct svc_sock	*svsk = rqstp->rq_sock;
	struct svc_serv	*serv = svsk->sk_server;
	struct sk_buff	*skb;
	int		err, len;

	if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
	    /* udp sockets need large rcvbuf as all pending
	     * requests are still in that buffer.  sndbuf must
	     * also be large enough that there is enough space
	     * for one reply per thread.
	     */
	    svc_sock_setbufsize(svsk->sk_sock,
				(serv->sv_nrthreads+3) * serv->sv_bufsz,
				(serv->sv_nrthreads+3) * serv->sv_bufsz);

	if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) {
		svc_sock_received(svsk);
		return svc_deferred_recv(rqstp);
	}

	clear_bit(SK_DATA, &svsk->sk_flags);
	while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) {
		if (err == -EAGAIN) {
			svc_sock_received(svsk);
			return err;
		}
		/* possibly an icmp error */
		dprintk("svc: recvfrom returned error %d\n", -err);
	}
	if (skb->tstamp.off_sec == 0) {
		struct timeval tv;

		tv.tv_sec = xtime.tv_sec;
		tv.tv_usec = xtime.tv_nsec / NSEC_PER_USEC;
		skb_set_timestamp(skb, &tv);
		/* Don't enable netstamp, sunrpc doesn't
		   need that much accuracy */
	}
	skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp);
	set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */

	/*
	 * Maybe more packets - kick another thread ASAP.
	 */
	svc_sock_received(svsk);

	len  = skb->len - sizeof(struct udphdr);
	rqstp->rq_arg.len = len;

	rqstp->rq_prot        = IPPROTO_UDP;

	/* Get sender address */
	rqstp->rq_addr.sin_family = AF_INET;
	rqstp->rq_addr.sin_port = skb->h.uh->source;
	rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr;
	rqstp->rq_daddr = skb->nh.iph->daddr;

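	/* A nonlinear (fragmented) skb has to be copied into the request
	 * pages, with the UDP checksum verified during the copy; a linear
	 * skb can be used in place once its checksum has been verified,
	 * with the result cached via CHECKSUM_UNNECESSARY.
	 */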
	if (skb_is_nonlinear(skb)) {
		/* we have to copy */
		local_bh_disable();
		if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
			local_bh_enable();
			/* checksum error */
			skb_free_datagram(svsk->sk_sk, skb);
			return 0;
		}
		local_bh_enable();
		skb_free_datagram(svsk->sk_sk, skb);
	} else {
		/* we can use it in-place */
		rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
		rqstp->rq_arg.head[0].iov_len = len;
		if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
			if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
				skb_free_datagram(svsk->sk_sk, skb);
				return 0;
			}
			skb->ip_summed = CHECKSUM_UNNECESSARY;
		}
		rqstp->rq_skbuff = skb;
	}

	rqstp->rq_arg.page_base = 0;
	if (len <= rqstp->rq_arg.head[0].iov_len) {
		rqstp->rq_arg.head[0].iov_len = len;
		rqstp->rq_arg.page_len = 0;
	} else {
		rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
		rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1) / PAGE_SIZE;
	}

	if (serv->sv_stats)
		serv->sv_stats->netudpcnt++;

	return len;
}

static int
svc_udp_sendto(struct svc_rqst *rqstp)
{
	int		error;

	error = svc_sendto(rqstp, &rqstp->rq_res);
	if (error == -ECONNREFUSED)
		/* ICMP error on earlier request. */
		error = svc_sendto(rqstp, &rqstp->rq_res);

	return error;
}

static void
svc_udp_init(struct svc_sock *svsk)
{
	svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
	svsk->sk_sk->sk_write_space = svc_write_space;
	svsk->sk_recvfrom = svc_udp_recvfrom;
	svsk->sk_sendto = svc_udp_sendto;

	/* initial setting must provide enough space to
	 * receive and respond to one request.
	 * svc_udp_recvfrom will re-adjust if necessary.
	 */
	svc_sock_setbufsize(svsk->sk_sock,
			    3 * svsk->sk_server->sv_bufsz,
			    3 * svsk->sk_server->sv_bufsz);

	set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */
	set_bit(SK_CHNGBUF, &svsk->sk_flags);
}

/*
 * A data_ready event on a listening socket means there's a connection
 * pending. Do not use state_change as a substitute for it.
 */
static void
svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
{
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;

	dprintk("svc: socket %p TCP (listen) state change %d\n",
		sk, sk->sk_state);

	/*
	 * This callback may be called twice when a new connection
	 * is established as a child socket inherits everything
	 * from a parent LISTEN socket.
	 * 1) data_ready method of the parent socket will be called
	 *    when one of the child sockets becomes ESTABLISHED.
	 * 2) data_ready method of the child socket may be called
	 *    when it receives data before the socket is accepted.
	 * In case of 2, we should ignore it silently.
	 */
	if (sk->sk_state == TCP_LISTEN) {
		if (svsk) {
			set_bit(SK_CONN, &svsk->sk_flags);
			svc_sock_enqueue(svsk);
		} else
			printk("svc: socket %p: no user data\n", sk);
	}

	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_all(sk->sk_sleep);
}

/*
 * A state change on a connected socket means it's dying or dead.
 */
static void
svc_tcp_state_change(struct sock *sk)
{
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;

	dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
		sk, sk->sk_state, sk->sk_user_data);

	if (!svsk)
		printk("svc: socket %p: no user data\n", sk);
	else {
		set_bit(SK_CLOSE, &svsk->sk_flags);
		svc_sock_enqueue(svsk);
	}
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_all(sk->sk_sleep);
}

static void
svc_tcp_data_ready(struct sock *sk, int count)
{
	struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;

	dprintk("svc: socket %p TCP data ready (svsk %p)\n",
		sk, sk->sk_user_data);
	if (svsk) {
		set_bit(SK_DATA, &svsk->sk_flags);
		svc_sock_enqueue(svsk);
	}
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
}

/*
 * Accept a TCP connection
 */
static void
svc_tcp_accept(struct svc_sock *svsk)
{
	struct sockaddr_in sin;
	struct svc_serv	*serv = svsk->sk_server;
	struct socket	*sock = svsk->sk_sock;
	struct socket	*newsock;
	struct proto_ops *ops;
	struct svc_sock	*newsvsk;
	int		err, slen;

	dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
	if (!sock)
		return;

	err = sock_create_lite(PF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock);
	if (err) {
		if (err == -ENOMEM)
			printk(KERN_WARNING "%s: no more sockets!\n",
			       serv->sv_name);
		return;
	}

	dprintk("svc: tcp_accept %p allocated\n", newsock);
	newsock->ops = ops = sock->ops;

	clear_bit(SK_CONN, &svsk->sk_flags);
	if ((err = ops->accept(sock, newsock, O_NONBLOCK)) < 0) {
		if (err != -EAGAIN && net_ratelimit())
			printk(KERN_WARNING "%s: accept failed (err %d)!\n",
				   serv->sv_name, -err);
		goto failed;		/* aborted connection or whatever */
	}
	set_bit(SK_CONN, &svsk->sk_flags);
	svc_sock_enqueue(svsk);

	slen = sizeof(sin);
	err = ops->getname(newsock, (struct sockaddr *) &sin, &slen, 1);
	if (err < 0) {
		if (net_ratelimit())
			printk(KERN_WARNING "%s: peername failed (err %d)!\n",
				   serv->sv_name, -err);
		goto failed;		/* aborted connection or whatever */
	}

	/* Ideally, we would want to reject connections from unauthorized
	 * hosts here, but when we get encryption, the IP of the host won't
	 * tell us anything. For now just warn about unprivileged connections.
	 */
	if (ntohs(sin.sin_port) >= 1024) {
		dprintk(KERN_WARNING
			"%s: connect from unprivileged port: %u.%u.%u.%u:%d\n",
			serv->sv_name,
			NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
	}

	dprintk("%s: connect from %u.%u.%u.%u:%04x\n", serv->sv_name,
			NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));

	/* make sure that a write doesn't block forever when
	 * low on memory
	 */
	newsock->sk->sk_sndtimeo = HZ*30;

	if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 0)))
		goto failed;

	/* make sure that we don't have too many active connections.
	 * If we have, something must be dropped.
	 *
	 * There's no point in trying to do random drop here for
	 * DoS prevention. The NFS client does 1 reconnect in 15
	 * seconds. An attacker can easily beat that.
	 *
	 * The only somewhat efficient mechanism would be to drop
	 * old connections from the same IP first. But right now
	 * we don't even record the client IP in svc_sock.
	 */
	if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
		struct svc_sock *svsk = NULL;
		spin_lock_bh(&serv->sv_lock);
		if (!list_empty(&serv->sv_tempsocks)) {
			if (net_ratelimit()) {
				/* Try to help the admin */
				printk(KERN_NOTICE "%s: too many open TCP "
					"sockets, consider increasing the "
					"number of nfsd threads\n",
						   serv->sv_name);
				printk(KERN_NOTICE "%s: last TCP connect from "
					"%u.%u.%u.%u:%d\n",
					serv->sv_name,
					NIPQUAD(sin.sin_addr.s_addr),
					ntohs(sin.sin_port));
			}
			/*
			 * Always select the oldest socket. It's not fair,
			 * but so is life
			 */
			svsk = list_entry(serv->sv_tempsocks.prev,
					  struct svc_sock,
					  sk_list);
			set_bit(SK_CLOSE, &svsk->sk_flags);
			svsk->sk_inuse++;
		}
		spin_unlock_bh(&serv->sv_lock);

		if (svsk) {
			svc_sock_enqueue(svsk);
			svc_sock_put(svsk);
		}
	}

	if (serv->sv_stats)
		serv->sv_stats->nettcpconn++;

	return;

failed:
	sock_release(newsock);
	return;
}

/*
 * Receive data from a TCP socket.
 */
static int
svc_tcp_recvfrom(struct svc_rqst *rqstp)
{
	struct svc_sock	*svsk = rqstp->rq_sock;
	struct svc_serv	*serv = svsk->sk_server;
	int		len;
	struct kvec vec[RPCSVC_MAXPAGES];
	int pnum, vlen;

	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
		svsk, test_bit(SK_DATA, &svsk->sk_flags),
		test_bit(SK_CONN, &svsk->sk_flags),
		test_bit(SK_CLOSE, &svsk->sk_flags));

	if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) {
		svc_sock_received(svsk);
		return svc_deferred_recv(rqstp);
	}

	if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
		svc_delete_socket(svsk);
		return 0;
	}

	if (test_bit(SK_CONN, &svsk->sk_flags)) {
		svc_tcp_accept(svsk);
		svc_sock_received(svsk);
		return 0;
	}

	if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
		/* sndbuf needs to have room for one request
		 * per thread, otherwise we can stall even when the
		 * network isn't a bottleneck.
		 * rcvbuf just needs to be able to hold a few requests.
		 * Normally they will be removed from the queue
		 * as soon as a complete request arrives.
		 */
		svc_sock_setbufsize(svsk->sk_sock,
				    (serv->sv_nrthreads+3) * serv->sv_bufsz,
				    3 * serv->sv_bufsz);

	clear_bit(SK_DATA, &svsk->sk_flags);

	/* Receive data. If we haven't got the record length yet, get
	 * the next four bytes. Otherwise try to gobble up as much as
	 * possible up to the complete record length.
	 */
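	/* RPC over TCP frames each record with a 4-byte marker in network
	 * byte order: the top bit flags the final fragment and the low
	 * 31 bits give the fragment length (RFC 1831 record marking).
	 * For example, a complete 100-byte record is preceded by the
	 * marker 0x80000064.
	 */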
	if (svsk->sk_tcplen < 4) {
		unsigned long	want = 4 - svsk->sk_tcplen;
		struct kvec	iov;

		iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
		iov.iov_len  = want;
		if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
			goto error;
		svsk->sk_tcplen += len;

		if (len < want) {
			dprintk("svc: short recvfrom while reading record length (%d of %lu)\n",
			        len, want);
			svc_sock_received(svsk);
			return -EAGAIN; /* record header not complete */
		}

		svsk->sk_reclen = ntohl(svsk->sk_reclen);
		if (!(svsk->sk_reclen & 0x80000000)) {
			/* FIXME: technically, a record can be fragmented,
			 *  and non-terminal fragments will not have the top
			 *  bit set in the fragment length header.
			 *  But apparently no known nfs clients send fragmented
			 *  records. */
			printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (non-terminal)\n",
			       (unsigned long) svsk->sk_reclen);
			goto err_delete;
		}
		svsk->sk_reclen &= 0x7fffffff;
		dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen);
		if (svsk->sk_reclen > serv->sv_bufsz) {
			printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n",
			       (unsigned long) svsk->sk_reclen);
			goto err_delete;
		}
	}

	/* Check whether enough data is available */
	len = svc_recv_available(svsk);
	if (len < 0)
		goto error;

	if (len < svsk->sk_reclen) {
		dprintk("svc: incomplete TCP record (%d of %d)\n",
			len, svsk->sk_reclen);
		svc_sock_received(svsk);
		return -EAGAIN;	/* record not complete */
	}
	len = svsk->sk_reclen;
	set_bit(SK_DATA, &svsk->sk_flags);

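	/* Build a kvec that covers the whole record: the head iovec
	 * first, then whole argument pages until at least len bytes
	 * are mapped.
	 */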
	vec[0] = rqstp->rq_arg.head[0];
	vlen = PAGE_SIZE;
	pnum = 1;
	while (vlen < len) {
		vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]);
		vec[pnum].iov_len = PAGE_SIZE;
		pnum++;
		vlen += PAGE_SIZE;
	}

	/* Now receive data */
	len = svc_recvfrom(rqstp, vec, pnum, len);
	if (len < 0)
		goto error;

	dprintk("svc: TCP complete record (%d bytes)\n", len);
	rqstp->rq_arg.len = len;
	rqstp->rq_arg.page_base = 0;
	if (len <= rqstp->rq_arg.head[0].iov_len) {
		rqstp->rq_arg.head[0].iov_len = len;
		rqstp->rq_arg.page_len = 0;
	} else {
		rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
	}

	rqstp->rq_skbuff      = NULL;
	rqstp->rq_prot	      = IPPROTO_TCP;

	/* Reset TCP read info */
	svsk->sk_reclen = 0;
	svsk->sk_tcplen = 0;

	svc_sock_received(svsk);
	if (serv->sv_stats)
		serv->sv_stats->nettcpcnt++;

	return len;

 err_delete:
	svc_delete_socket(svsk);
	return -EAGAIN;

 error:
	if (len == -EAGAIN) {
		dprintk("RPC: TCP recvfrom got EAGAIN\n");
		svc_sock_received(svsk);
	} else {
		printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
					svsk->sk_server->sv_name, -len);
		svc_sock_received(svsk);
	}

	return len;
}

/*
 * Send out data on TCP socket.
 */
static int
svc_tcp_sendto(struct svc_rqst *rqstp)
{
	struct xdr_buf	*xbufp = &rqstp->rq_res;
	int sent;
	u32 reclen;

	/* Set up the first element of the reply kvec.
	 * Any other kvecs that may be in use have been taken
	 * care of by the server implementation itself.
	 */
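	/* The record marker counts only the bytes that follow it (hence
	 * the - 4), and the top bit marks the reply as a single, final
	 * fragment.
	 */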
	reclen = htonl(0x80000000|((xbufp->len) - 4));
	memcpy(xbufp->head[0].iov_base, &reclen, 4);

	if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags))
		return -ENOTCONN;

	sent = svc_sendto(rqstp, &rqstp->rq_res);
	if (sent != xbufp->len) {
		printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
		       rqstp->rq_sock->sk_server->sv_name,
		       (sent<0)?"got error":"sent only",
		       sent, xbufp->len);
		svc_delete_socket(rqstp->rq_sock);
		sent = -EAGAIN;
	}
	return sent;
}

static void
svc_tcp_init(struct svc_sock *svsk)
{
	struct sock	*sk = svsk->sk_sk;
	struct tcp_sock *tp = tcp_sk(sk);

	svsk->sk_recvfrom = svc_tcp_recvfrom;
	svsk->sk_sendto = svc_tcp_sendto;

	if (sk->sk_state == TCP_LISTEN) {
		dprintk("setting up TCP socket for listening\n");
		sk->sk_data_ready = svc_tcp_listen_data_ready;
		set_bit(SK_CONN, &svsk->sk_flags);
	} else {
		dprintk("setting up TCP socket for reading\n");
		sk->sk_state_change = svc_tcp_state_change;
		sk->sk_data_ready = svc_tcp_data_ready;
		sk->sk_write_space = svc_write_space;

		svsk->sk_reclen = 0;
		svsk->sk_tcplen = 0;

		tp->nonagle = 1;        /* disable Nagle's algorithm */

		/* initial setting must provide enough space to
		 * receive and respond to one request.
		 * svc_tcp_recvfrom will re-adjust if necessary.
		 */
		svc_sock_setbufsize(svsk->sk_sock,
				    3 * svsk->sk_server->sv_bufsz,
				    3 * svsk->sk_server->sv_bufsz);

		set_bit(SK_CHNGBUF, &svsk->sk_flags);
		set_bit(SK_DATA, &svsk->sk_flags);
		if (sk->sk_state != TCP_ESTABLISHED)
			set_bit(SK_CLOSE, &svsk->sk_flags);
	}
}

void
svc_sock_update_bufs(struct svc_serv *serv)
{
	/*
	 * The number of server threads has changed. Update
	 * rcvbuf and sndbuf accordingly on all sockets
	 */
	struct list_head *le;

	spin_lock_bh(&serv->sv_lock);
	list_for_each(le, &serv->sv_permsocks) {
		struct svc_sock *svsk =
			list_entry(le, struct svc_sock, sk_list);
		set_bit(SK_CHNGBUF, &svsk->sk_flags);
	}
	list_for_each(le, &serv->sv_tempsocks) {
		struct svc_sock *svsk =
			list_entry(le, struct svc_sock, sk_list);
		set_bit(SK_CHNGBUF, &svsk->sk_flags);
	}
	spin_unlock_bh(&serv->sv_lock);
}

/*
 * Receive the next request on any socket.
 */
int
svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
{
	struct svc_sock		*svsk = NULL;
	int			len;
	int			pages;
	struct xdr_buf		*arg;
	DECLARE_WAITQUEUE(wait, current);

	dprintk("svc: server %p waiting for data (to = %ld)\n",
		rqstp, timeout);

	if (rqstp->rq_sock)
		printk(KERN_ERR
			"svc_recv: service %p, socket not NULL!\n",
			 rqstp);
	if (waitqueue_active(&rqstp->rq_wait))
		printk(KERN_ERR
			"svc_recv: service %p, wait queue active!\n",
			 rqstp);

	/* Initialize the buffers */
	/* first reclaim pages that were moved to response list */
	svc_pushback_allpages(rqstp);

	/* now allocate needed pages.  If we get a failure, sleep briefly */
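	/* One page for the argument head, enough pages for sv_bufsz
	 * bytes of data (rounded up), plus one kept in reserve so the
	 * response always has at least one page to start in.
	 */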
	pages = 2 + (serv->sv_bufsz + PAGE_SIZE - 1) / PAGE_SIZE;
	while (rqstp->rq_arghi < pages) {
		struct page *p = alloc_page(GFP_KERNEL);
		if (!p) {
			schedule_timeout_uninterruptible(msecs_to_jiffies(500));
			continue;
		}
		rqstp->rq_argpages[rqstp->rq_arghi++] = p;
	}

	/* Make arg->head point to first page and arg->pages point to rest */
	arg = &rqstp->rq_arg;
	arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]);
	arg->head[0].iov_len = PAGE_SIZE;
	rqstp->rq_argused = 1;
	arg->pages = rqstp->rq_argpages + 1;
	arg->page_base = 0;
	/* save at least one page for response */
	arg->page_len = (pages-2)*PAGE_SIZE;
	arg->len = (pages-1)*PAGE_SIZE;
	arg->tail[0].iov_len = 0;

	try_to_freeze();
	if (signalled())
		return -EINTR;

	spin_lock_bh(&serv->sv_lock);
	if (!list_empty(&serv->sv_tempsocks)) {
		svsk = list_entry(serv->sv_tempsocks.next,
				  struct svc_sock, sk_list);
		/* apparently the "standard" is that clients close
		 * idle connections after 5 minutes, servers after
		 * 6 minutes
		 *   http://www.connectathon.org/talks96/nfstcp.pdf
		 */
		if (get_seconds() - svsk->sk_lastrecv < 6*60
		    || test_bit(SK_BUSY, &svsk->sk_flags))
			svsk = NULL;
	}
	if (svsk) {
		set_bit(SK_BUSY, &svsk->sk_flags);
		set_bit(SK_CLOSE, &svsk->sk_flags);
		rqstp->rq_sock = svsk;
		svsk->sk_inuse++;
	} else if ((svsk = svc_sock_dequeue(serv)) != NULL) {
		rqstp->rq_sock = svsk;
		svsk->sk_inuse++;
		rqstp->rq_reserved = serv->sv_bufsz;
		svsk->sk_reserved += rqstp->rq_reserved;
	} else {
		/* No data pending. Go to sleep */
		svc_serv_enqueue(serv, rqstp);

		/*
		 * We have to be able to interrupt this wait
		 * to bring down the daemons ...
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&rqstp->rq_wait, &wait);
		spin_unlock_bh(&serv->sv_lock);

		schedule_timeout(timeout);

		try_to_freeze();

		spin_lock_bh(&serv->sv_lock);
		remove_wait_queue(&rqstp->rq_wait, &wait);

		if (!(svsk = rqstp->rq_sock)) {
			svc_serv_dequeue(serv, rqstp);
			spin_unlock_bh(&serv->sv_lock);
			dprintk("svc: server %p, no data yet\n", rqstp);
			return signalled()? -EINTR : -EAGAIN;
		}
	}
	spin_unlock_bh(&serv->sv_lock);

	dprintk("svc: server %p, socket %p, inuse=%d\n",
		 rqstp, svsk, svsk->sk_inuse);
	len = svsk->sk_recvfrom(rqstp);
	dprintk("svc: got len=%d\n", len);

	/* No data, incomplete (TCP) read, or accept() */
	if (len == 0 || len == -EAGAIN) {
		rqstp->rq_res.len = 0;
		svc_sock_release(rqstp);
		return -EAGAIN;
	}
	svsk->sk_lastrecv = get_seconds();
	if (test_bit(SK_TEMP, &svsk->sk_flags)) {
		/* push active sockets to end of list */
		spin_lock_bh(&serv->sv_lock);
		if (!list_empty(&svsk->sk_list))
			list_move_tail(&svsk->sk_list, &serv->sv_tempsocks);
		spin_unlock_bh(&serv->sv_lock);
	}

	rqstp->rq_secure  = ntohs(rqstp->rq_addr.sin_port) < 1024;
	rqstp->rq_chandle.defer = svc_defer;

	if (serv->sv_stats)
		serv->sv_stats->netcnt++;
	return len;
}

/*
 * Drop request
 */
void
svc_drop(struct svc_rqst *rqstp)
{
	dprintk("svc: socket %p dropped request\n", rqstp->rq_sock);
	svc_sock_release(rqstp);
}

/*
 * Return reply to client.
 */
int
svc_send(struct svc_rqst *rqstp)
{
	struct svc_sock	*svsk;
	int		len;
	struct xdr_buf	*xb;

	if ((svsk = rqstp->rq_sock) == NULL) {
		printk(KERN_WARNING "NULL socket pointer in %s:%d\n",
				__FILE__, __LINE__);
		return -EFAULT;
	}

	/* release the receive skb before sending the reply */
	svc_release_skb(rqstp);

	/* calculate over-all length */
	xb = &rqstp->rq_res;
	xb->len = xb->head[0].iov_len +
		xb->page_len +
		xb->tail[0].iov_len;

	/* Grab svsk->sk_sem to serialize outgoing data. */
	down(&svsk->sk_sem);
	if (test_bit(SK_DEAD, &svsk->sk_flags))
		len = -ENOTCONN;
	else
		len = svsk->sk_sendto(rqstp);
	up(&svsk->sk_sem);
	svc_sock_release(rqstp);

	if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
		return 0;
	return len;
}

/*
 * Initialize socket for RPC use and create svc_sock struct
 * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
 */
static struct svc_sock *
svc_setup_socket(struct svc_serv *serv, struct socket *sock,
					int *errp, int pmap_register)
{
	struct svc_sock	*svsk;
	struct sock	*inet;

	dprintk("svc: svc_setup_socket %p\n", sock);
	if (!(svsk = kmalloc(sizeof(*svsk), GFP_KERNEL))) {
		*errp = -ENOMEM;
		return NULL;
	}
	memset(svsk, 0, sizeof(*svsk));

	inet = sock->sk;

	/* Register socket with portmapper */
	if (*errp >= 0 && pmap_register)
		*errp = svc_register(serv, inet->sk_protocol,
				     ntohs(inet_sk(inet)->sport));

	if (*errp < 0) {
		kfree(svsk);
		return NULL;
	}

	set_bit(SK_BUSY, &svsk->sk_flags);
	inet->sk_user_data = svsk;
	svsk->sk_sock = sock;
	svsk->sk_sk = inet;
	svsk->sk_ostate = inet->sk_state_change;
	svsk->sk_odata = inet->sk_data_ready;
	svsk->sk_owspace = inet->sk_write_space;
	svsk->sk_server = serv;
	svsk->sk_lastrecv = get_seconds();
	INIT_LIST_HEAD(&svsk->sk_deferred);
	INIT_LIST_HEAD(&svsk->sk_ready);
	sema_init(&svsk->sk_sem, 1);

	/* Initialize the socket */
	if (sock->type == SOCK_DGRAM)
		svc_udp_init(svsk);
	else
		svc_tcp_init(svsk);

	spin_lock_bh(&serv->sv_lock);
	if (!pmap_register) {
		set_bit(SK_TEMP, &svsk->sk_flags);
		list_add(&svsk->sk_list, &serv->sv_tempsocks);
		serv->sv_tmpcnt++;
	} else {
		clear_bit(SK_TEMP, &svsk->sk_flags);
		list_add(&svsk->sk_list, &serv->sv_permsocks);
	}
	spin_unlock_bh(&serv->sv_lock);

	dprintk("svc: svc_setup_socket created %p (inet %p)\n",
				svsk, svsk->sk_sk);

	clear_bit(SK_BUSY, &svsk->sk_flags);
	svc_sock_enqueue(svsk);
	return svsk;
}

/*
 * Create socket for RPC service.
 */
static int
svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin)
{
	struct svc_sock	*svsk;
	struct socket	*sock;
	int		error;
	int		type;

	dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n",
				serv->sv_program->pg_name, protocol,
				NIPQUAD(sin->sin_addr.s_addr),
				ntohs(sin->sin_port));

	if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
		printk(KERN_WARNING "svc: only UDP and TCP "
				"sockets supported\n");
		return -EINVAL;
	}
	type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;

	if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0)
		return error;

	if (sin != NULL) {
		if (type == SOCK_STREAM)
			sock->sk->sk_reuse = 1; /* allow address reuse */
		error = sock->ops->bind(sock, (struct sockaddr *) sin,
						sizeof(*sin));
		if (error < 0)
			goto bummer;
	}

	if (protocol == IPPROTO_TCP) {
		if ((error = sock->ops->listen(sock, 64)) < 0)
			goto bummer;
	}

	if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL)
		return 0;

bummer:
	dprintk("svc: svc_create_socket error = %d\n", -error);
	sock_release(sock);
	return error;
}

/*
 * Remove a dead socket
 */
void
svc_delete_socket(struct svc_sock *svsk)
{
	struct svc_serv	*serv;
	struct sock	*sk;

	dprintk("svc: svc_delete_socket(%p)\n", svsk);

	serv = svsk->sk_server;
	sk = svsk->sk_sk;

	sk->sk_state_change = svsk->sk_ostate;
	sk->sk_data_ready = svsk->sk_odata;
	sk->sk_write_space = svsk->sk_owspace;

	spin_lock_bh(&serv->sv_lock);

	list_del_init(&svsk->sk_list);
	list_del_init(&svsk->sk_ready);
	if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags))
		if (test_bit(SK_TEMP, &svsk->sk_flags))
			serv->sv_tmpcnt--;

	if (!svsk->sk_inuse) {
		spin_unlock_bh(&serv->sv_lock);
		sock_release(svsk->sk_sock);
		kfree(svsk);
	} else {
		spin_unlock_bh(&serv->sv_lock);
		dprintk(KERN_NOTICE "svc: server socket destroy delayed\n");
		/* svsk->sk_server = NULL; */
	}
}

/*
 * Make a socket for nfsd and lockd
 */
int
svc_makesock(struct svc_serv *serv, int protocol, unsigned short port)
{
	struct sockaddr_in	sin;

	dprintk("svc: creating socket proto = %d\n", protocol);
	sin.sin_family      = AF_INET;
	sin.sin_addr.s_addr = INADDR_ANY;
	sin.sin_port        = htons(port);
	return svc_create_socket(serv, protocol, &sin);
}

/*
 * Handle defer and revisit of requests
 */
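/*
 * Overview, as implemented below: when a cache lookup would block,
 * svc_defer() snapshots the request (address, protocol, undecoded
 * argument bytes) into a svc_deferred_req and parks it.  Once the
 * cache entry is ready, svc_revisit() moves the record onto the
 * socket's sk_deferred list and sets SK_DEFERRED, so the next
 * svc_recv() on that socket replays it through svc_deferred_recv()
 * instead of reading from the network.
 */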

static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
{
	struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle);
	struct svc_serv *serv = dreq->owner;
	struct svc_sock *svsk;

	if (too_many) {
		svc_sock_put(dr->svsk);
		kfree(dr);
		return;
	}
	dprintk("revisit queued\n");
	svsk = dr->svsk;
	dr->svsk = NULL;
	spin_lock_bh(&serv->sv_lock);
	list_add(&dr->handle.recent, &svsk->sk_deferred);
	spin_unlock_bh(&serv->sv_lock);
	set_bit(SK_DEFERRED, &svsk->sk_flags);
	svc_sock_enqueue(svsk);
	svc_sock_put(svsk);
}

static struct cache_deferred_req *
svc_defer(struct cache_req *req)
{
	struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
	int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
	struct svc_deferred_req *dr;

	if (rqstp->rq_arg.page_len)
		return NULL; /* if more than a page, give up FIXME */
	if (rqstp->rq_deferred) {
		dr = rqstp->rq_deferred;
		rqstp->rq_deferred = NULL;
	} else {
		int skip  = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
		/* FIXME maybe discard if size too large */
		dr = kmalloc(size, GFP_KERNEL);
		if (dr == NULL)
			return NULL;

		dr->handle.owner = rqstp->rq_server;
		dr->prot = rqstp->rq_prot;
		dr->addr = rqstp->rq_addr;
		dr->argslen = rqstp->rq_arg.len >> 2;
		memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
	}
	spin_lock_bh(&rqstp->rq_server->sv_lock);
	rqstp->rq_sock->sk_inuse++;
	dr->svsk = rqstp->rq_sock;
	spin_unlock_bh(&rqstp->rq_server->sv_lock);

	dr->handle.revisit = svc_revisit;
	return &dr->handle;
}

/*
 * recv data from a deferred request into an active one
 */
static int svc_deferred_recv(struct svc_rqst *rqstp)
{
	struct svc_deferred_req *dr = rqstp->rq_deferred;

	rqstp->rq_arg.head[0].iov_base = dr->args;
	rqstp->rq_arg.head[0].iov_len = dr->argslen<<2;
	rqstp->rq_arg.page_len = 0;
	rqstp->rq_arg.len = dr->argslen<<2;
	rqstp->rq_prot        = dr->prot;
	rqstp->rq_addr        = dr->addr;
	return dr->argslen<<2;
}


static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
{
	struct svc_deferred_req *dr = NULL;
	struct svc_serv	*serv = svsk->sk_server;

	if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
		return NULL;
	spin_lock_bh(&serv->sv_lock);
	clear_bit(SK_DEFERRED, &svsk->sk_flags);
	if (!list_empty(&svsk->sk_deferred)) {
		dr = list_entry(svsk->sk_deferred.next,
				struct svc_deferred_req,
				handle.recent);
		list_del_init(&dr->handle.recent);
		set_bit(SK_DEFERRED, &svsk->sk_flags);
	}
	spin_unlock_bh(&serv->sv_lock);
	return dr;
}