Chris Metcalfe3d62d72012-06-07 10:45:02 +00001/*
2 * Copyright 2012 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/moduleparam.h>
18#include <linux/sched.h>
19#include <linux/kernel.h> /* printk() */
20#include <linux/slab.h> /* kmalloc() */
21#include <linux/errno.h> /* error codes */
22#include <linux/types.h> /* size_t */
23#include <linux/interrupt.h>
24#include <linux/in.h>
25#include <linux/irq.h>
26#include <linux/netdevice.h> /* struct device, and other headers */
27#include <linux/etherdevice.h> /* eth_type_trans */
28#include <linux/skbuff.h>
29#include <linux/ioctl.h>
30#include <linux/cdev.h>
31#include <linux/hugetlb.h>
32#include <linux/in6.h>
33#include <linux/timer.h>
34#include <linux/hrtimer.h>
35#include <linux/ktime.h>
36#include <linux/io.h>
37#include <linux/ctype.h>
38#include <linux/ip.h>
39#include <linux/tcp.h>
40
41#include <asm/checksum.h>
42#include <asm/homecache.h>
43#include <gxio/mpipe.h>
44#include <arch/sim.h>
45
46/* Default transmit lockup timeout period, in jiffies. */
47#define TILE_NET_TIMEOUT (5 * HZ)
48
49/* The maximum number of distinct channels (idesc.channel is 5 bits). */
50#define TILE_NET_CHANNELS 32
51
52/* Maximum number of idescs to handle per "poll". */
53#define TILE_NET_BATCH 128
54
55/* Maximum number of packets to handle per "poll". */
56#define TILE_NET_WEIGHT 64
57
58/* Number of entries in each iqueue. */
59#define IQUEUE_ENTRIES 512
60
61/* Number of entries in each equeue. */
62#define EQUEUE_ENTRIES 2048
63
64/* Total header bytes per equeue slot. Must be big enough for 2 bytes
65 * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
66 * 60 bytes of actual TCP header. We round up to align to cache lines.
67 */
68#define HEADER_BYTES 128
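/* Sanity arithmetic (an illustration, not part of the original comment):
 * tso_headers_prepare() copies NET_IP_ALIGN plus the full Ethernet + IP +
 * TCP header into each slot, e.g. 2 + 14 + 20 + 60 = 96 bytes for IPv4
 * with a maximal TCP header, leaving headroom within the 128-byte slot,
 * which is a multiple of the cache line size.
 */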
69
70/* Maximum completions per cpu per device (must be a power of two).
71 * ISSUE: What is the right number here? If this is too small, then
72 * egress might block waiting for free space in a completions array.
73 * ISSUE: At the least, allocate these only for initialized echannels.
74 */
75#define TILE_NET_MAX_COMPS 64
76
77#define MAX_FRAGS (MAX_SKB_FRAGS + 1)
78
Chris Metcalf2628e8a2013-08-01 11:36:42 -040079/* The "kinds" of buffer stacks (small/large/jumbo). */
80#define MAX_KINDS 3
81
Chris Metcalfe3d62d72012-06-07 10:45:02 +000082/* Size of completions data to allocate.
83 * ISSUE: Probably more than needed since we don't use all the channels.
84 */
85#define COMPS_SIZE (TILE_NET_CHANNELS * sizeof(struct tile_net_comps))
86
87/* Size of NotifRing data to allocate. */
88#define NOTIF_RING_SIZE (IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t))
89
90/* Timeout to wake the per-device TX timer after we stop the queue.
91 * We don't want the timeout too short (adds overhead, and might end
92 * up causing stop/wake/stop/wake cycles) or too long (affects performance).
 93 * For the 10 Gb NIC, 30 usec covers roughly 25 full-size (1500-byte) packets.
94 */
95#define TX_TIMER_DELAY_USEC 30
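/* Back-of-envelope check: a 1500-byte frame is 12,000 bits, i.e. about
 * 1.2 usec of wire time at 10 Gb/s, so a 30 usec delay corresponds to
 * roughly 25 full-size frames.
 */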
96
97/* Timeout to wake the per-cpu egress timer to free completions. */
98#define EGRESS_TIMER_DELAY_USEC 1000
99
100MODULE_AUTHOR("Tilera Corporation");
101MODULE_LICENSE("GPL");
102
103/* A "packet fragment" (a chunk of memory). */
104struct frag {
105 void *buf;
106 size_t length;
107};
108
109/* A single completion. */
110struct tile_net_comp {
111 /* The "complete_count" when the completion will be complete. */
112 s64 when;
113 /* The buffer to be freed when the completion is complete. */
114 struct sk_buff *skb;
115};
116
117/* The completions for a given cpu and echannel. */
118struct tile_net_comps {
119 /* The completions. */
120 struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
121 /* The number of completions used. */
122 unsigned long comp_next;
123 /* The number of completions freed. */
124 unsigned long comp_last;
125};
126
127/* The transmit wake timer for a given cpu and echannel. */
128struct tile_net_tx_wake {
Chris Metcalf9b4c3412012-07-01 14:43:47 -0400129 int tx_queue_idx;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000130 struct hrtimer timer;
131 struct net_device *dev;
132};
133
134/* Info for a specific cpu. */
135struct tile_net_info {
136 /* The NAPI struct. */
137 struct napi_struct napi;
138 /* Packet queue. */
139 gxio_mpipe_iqueue_t iqueue;
140 /* Our cpu. */
141 int my_cpu;
142 /* True if iqueue is valid. */
143 bool has_iqueue;
144 /* NAPI flags. */
145 bool napi_added;
146 bool napi_enabled;
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400147 /* Number of buffers (by kind) which must still be provided. */
148 unsigned int num_needed_buffers[MAX_KINDS];
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000149 /* A timer for handling egress completions. */
150 struct hrtimer egress_timer;
151 /* True if "egress_timer" is scheduled. */
152 bool egress_timer_scheduled;
153 /* Comps for each egress channel. */
154 struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
155 /* Transmit wake timer for each egress channel. */
156 struct tile_net_tx_wake tx_wake[TILE_NET_CHANNELS];
157};
158
159/* Info for egress on a particular egress channel. */
160struct tile_net_egress {
161 /* The "equeue". */
162 gxio_mpipe_equeue_t *equeue;
163 /* The headers for TSO. */
164 unsigned char *headers;
165};
166
167/* Info for a specific device. */
168struct tile_net_priv {
169 /* Our network device. */
170 struct net_device *dev;
171 /* The primary link. */
172 gxio_mpipe_link_t link;
173 /* The primary channel, if open, else -1. */
174 int channel;
175 /* The "loopify" egress link, if needed. */
176 gxio_mpipe_link_t loopify_link;
177 /* The "loopify" egress channel, if open, else -1. */
178 int loopify_channel;
179 /* The egress channel (channel or loopify_channel). */
180 int echannel;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000181};
182
183/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
184static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
185
186/* Devices currently associated with each channel.
187 * NOTE: The array entry can become NULL after ifconfig down, but
188 * we do not free the underlying net_device structures, so it is
189 * safe to use a pointer after reading it from this array.
190 */
191static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
192
193/* A mutex for "tile_net_devs_for_channel". */
194static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
195
196/* The per-cpu info. */
197static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
198
199/* The "context" for all devices. */
200static gxio_mpipe_context_t context;
201
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400202/* The buffer size enums for each buffer stack.
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000203 * See arch/tile/include/gxio/mpipe.h for the set of possible values.
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400204 * We avoid the "10384" size because it can induce "false chaining"
205 * on "cut-through" jumbo packets.
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000206 */
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400207static gxio_mpipe_buffer_size_enum_t buffer_size_enums[MAX_KINDS] = {
208 GXIO_MPIPE_BUFFER_SIZE_128,
209 GXIO_MPIPE_BUFFER_SIZE_1664,
210 GXIO_MPIPE_BUFFER_SIZE_16384
211};
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000212
213/* The actual memory allocated for the buffer stacks. */
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400214static void *buffer_stack_vas[MAX_KINDS];
215
216/* The amount of memory allocated for each buffer stack. */
217static size_t buffer_stack_bytes[MAX_KINDS];
218
219/* The first buffer stack index (small = +0, large = +1, jumbo = +2). */
220static int first_buffer_stack = -1;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000221
222/* The buckets. */
223static int first_bucket = -1;
224static int num_buckets = 1;
225
226/* The ingress irq. */
227static int ingress_irq = -1;
228
229/* Text value of tile_net.cpus if passed as a module parameter. */
230static char *network_cpus_string;
231
232/* The actual cpus in "network_cpus". */
233static struct cpumask network_cpus_map;
234
235/* If "loopify=LINK" was specified, this is "LINK". */
236static char *loopify_link_name;
237
238/* If "tile_net.custom" was specified, this is non-NULL. */
239static char *custom_str;
240
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400241/* If "tile_net.jumbo=NUM" was specified, this is "NUM". */
242static uint jumbo_num;
243
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000244/* The "tile_net.cpus" argument specifies the cpus that are dedicated
245 * to handle ingress packets.
246 *
247 * The parameter should be in the form "tile_net.cpus=m-n[,x-y]", where
248 * m, n, x, y are integer numbers that represent the cpus that can be
249 * neither a dedicated cpu nor a dataplane cpu.
250 */
251static bool network_cpus_init(void)
252{
253 char buf[1024];
254 int rc;
255
256 if (network_cpus_string == NULL)
257 return false;
258
259 rc = cpulist_parse_crop(network_cpus_string, &network_cpus_map);
260 if (rc != 0) {
261 pr_warn("tile_net.cpus=%s: malformed cpu list\n",
262 network_cpus_string);
263 return false;
264 }
265
266 /* Remove dedicated cpus. */
267 cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
268
269 if (cpumask_empty(&network_cpus_map)) {
270 pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
271 network_cpus_string);
272 return false;
273 }
274
275 cpulist_scnprintf(buf, sizeof(buf), &network_cpus_map);
276 pr_info("Linux network CPUs: %s\n", buf);
277 return true;
278}
279
280module_param_named(cpus, network_cpus_string, charp, 0444);
281MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
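/* Example (illustrative values): booting with "tile_net.cpus=1-3,7"
 * restricts ingress iqueues, NotifRing interrupts, and NAPI processing
 * to cpus 1, 2, 3 and 7.
 */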
282
283/* The "tile_net.loopify=LINK" argument causes the named device to
284 * actually use "loop0" for ingress, and "loop1" for egress. This
285 * allows an app to sit between the actual link and linux, passing
286 * (some) packets along to linux, and forwarding (some) packets sent
287 * out by linux.
288 */
289module_param_named(loopify, loopify_link_name, charp, 0444);
290MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
291
292/* The "tile_net.custom" argument causes us to ignore the "conventional"
293 * classifier metadata, in particular, the "l2_offset".
294 */
295module_param_named(custom, custom_str, charp, 0444);
296MODULE_PARM_DESC(custom, "indicates a (heavily) customized classifier");
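/* Example (illustrative value): with "tile_net.custom=1" (any value works),
 * tile_net_handle_packet() uses an l2_offset of 0 instead of asking the
 * classifier via gxio_mpipe_idesc_get_l2_offset().
 */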
297
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400298/* The "tile_net.jumbo" argument causes us to support "jumbo" packets,
299 * and to allocate the given number of "jumbo" buffers.
300 */
301module_param_named(jumbo, jumbo_num, uint, 0444);
302MODULE_PARM_DESC(jumbo, "the number of buffers to support jumbo packets");
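/* Example (illustrative value): "tile_net.jumbo=128" allocates 128 jumbo
 * buffers at init time and enables jumbo receive on each link as it is
 * opened.
 */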
303
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000304/* Atomically update a statistics field.
305 * Note that on TILE-Gx, this operation is fire-and-forget on the
306 * issuing core (single-cycle dispatch) and takes only a few cycles
307 * longer than a regular store when the request reaches the home cache.
308 * No expensive bus management overhead is required.
309 */
310static void tile_net_stats_add(unsigned long value, unsigned long *field)
311{
312 BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(unsigned long));
313 atomic_long_add(value, (atomic_long_t *)field);
314}
315
316/* Allocate and push a buffer. */
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400317static bool tile_net_provide_buffer(int kind)
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000318{
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400319 gxio_mpipe_buffer_size_enum_t bse = buffer_size_enums[kind];
320 size_t bs = gxio_mpipe_buffer_size_enum_to_buffer_size(bse);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000321 const unsigned long buffer_alignment = 128;
322 struct sk_buff *skb;
323 int len;
324
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400325 len = sizeof(struct sk_buff **) + buffer_alignment + bs;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000326 skb = dev_alloc_skb(len);
327 if (skb == NULL)
328 return false;
329
330 /* Make room for a back-pointer to 'skb' and guarantee alignment. */
331 skb_reserve(skb, sizeof(struct sk_buff **));
332 skb_reserve(skb, -(long)skb->data & (buffer_alignment - 1));
333
334 /* Save a back-pointer to 'skb'. */
335 *(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
336
337 /* Make sure "skb" and the back-pointer have been flushed. */
338 wmb();
339
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400340 gxio_mpipe_push_buffer(&context, first_buffer_stack + kind,
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000341 (void *)va_to_tile_io_addr(skb->data));
342
343 return true;
344}
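/* Note on the layout set up above: the VA pushed to mPIPE is skb->data,
 * which is 128-byte aligned, and the sk_buff back-pointer sits in the
 * sizeof(struct sk_buff **) bytes immediately below it; mpipe_buf_to_skb()
 * depends on exactly this layout to recover the skb on ingress.
 */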
345
346/* Convert a raw mpipe buffer to its matching skb pointer. */
347static struct sk_buff *mpipe_buf_to_skb(void *va)
348{
349 /* Acquire the associated "skb". */
350 struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
351 struct sk_buff *skb = *skb_ptr;
352
353 /* Paranoia. */
354 if (skb->data != va) {
355 /* Panic here since there's a reasonable chance
356 * that corrupt buffers means generic memory
357 * corruption, with unpredictable system effects.
358 */
359 panic("Corrupt linux buffer! va=%p, skb=%p, skb->data=%p",
360 va, skb, skb->data);
361 }
362
363 return skb;
364}
365
366static void tile_net_pop_all_buffers(int stack)
367{
368 for (;;) {
369 tile_io_addr_t addr =
370 (tile_io_addr_t)gxio_mpipe_pop_buffer(&context, stack);
371 if (addr == 0)
372 break;
373 dev_kfree_skb_irq(mpipe_buf_to_skb(tile_io_addr_to_va(addr)));
374 }
375}
376
377/* Provide linux buffers to mPIPE. */
378static void tile_net_provide_needed_buffers(void)
379{
380 struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400381 int kind;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000382
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400383 for (kind = 0; kind < MAX_KINDS; kind++) {
384 while (info->num_needed_buffers[kind] != 0) {
385 if (!tile_net_provide_buffer(kind)) {
386 /* Add info to the allocation failure dump. */
387 pr_notice("Tile %d still needs some buffers\n",
388 info->my_cpu);
389 return;
390 }
391 info->num_needed_buffers[kind]--;
392 }
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000393 }
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000394}
395
396static inline bool filter_packet(struct net_device *dev, void *buf)
397{
398 /* Filter packets received before we're up. */
399 if (dev == NULL || !(dev->flags & IFF_UP))
400 return true;
401
402 /* Filter out packets that aren't for us. */
403 if (!(dev->flags & IFF_PROMISC) &&
404 !is_multicast_ether_addr(buf) &&
405 compare_ether_addr(dev->dev_addr, buf) != 0)
406 return true;
407
408 return false;
409}
410
411static void tile_net_receive_skb(struct net_device *dev, struct sk_buff *skb,
412 gxio_mpipe_idesc_t *idesc, unsigned long len)
413{
414 struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000415
416 /* Encode the actual packet length. */
417 skb_put(skb, len);
418
419 skb->protocol = eth_type_trans(skb, dev);
420
421 /* Acknowledge "good" hardware checksums. */
422 if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
423 skb->ip_summed = CHECKSUM_UNNECESSARY;
424
Chris Metcalf6ab4ae92013-08-01 11:36:42 -0400425 napi_gro_receive(&info->napi, skb);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000426
427 /* Update stats. */
Chris Metcalfad018182013-08-01 11:36:42 -0400428 tile_net_stats_add(1, &dev->stats.rx_packets);
429 tile_net_stats_add(len, &dev->stats.rx_bytes);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000430
431 /* Need a new buffer. */
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400432 if (idesc->size == buffer_size_enums[0])
433 info->num_needed_buffers[0]++;
434 else if (idesc->size == buffer_size_enums[1])
435 info->num_needed_buffers[1]++;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000436 else
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400437 info->num_needed_buffers[2]++;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000438}
439
440/* Handle a packet. Return true if "processed", false if "filtered". */
441static bool tile_net_handle_packet(gxio_mpipe_idesc_t *idesc)
442{
443 struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
444 struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
445 uint8_t l2_offset;
446 void *va;
447 void *buf;
448 unsigned long len;
449 bool filter;
450
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400451 /* Drop packets for which no buffer was available (which can
452 * happen under heavy load), or for which the me/tr/ce flags
453 * are set (which can happen for jumbo cut-through packets,
454 * or with a customized classifier).
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000455 */
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400456 if (idesc->be || idesc->me || idesc->tr || idesc->ce) {
457 if (dev)
Chris Metcalfad018182013-08-01 11:36:42 -0400458 tile_net_stats_add(1, &dev->stats.rx_errors);
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400459 goto drop;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000460 }
461
462 /* Get the "l2_offset", if allowed. */
463 l2_offset = custom_str ? 0 : gxio_mpipe_idesc_get_l2_offset(idesc);
464
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400465 /* Get the VA (including NET_IP_ALIGN bytes of "headroom"). */
466 va = tile_io_addr_to_va((unsigned long)idesc->va);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000467
468 /* Get the actual packet start/length. */
469 buf = va + l2_offset;
470 len = idesc->l2_size - l2_offset;
471
472 /* Point "va" at the raw buffer. */
473 va -= NET_IP_ALIGN;
474
475 filter = filter_packet(dev, buf);
476 if (filter) {
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400477 if (dev)
Chris Metcalfad018182013-08-01 11:36:42 -0400478 tile_net_stats_add(1, &dev->stats.rx_dropped);
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400479drop:
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000480 gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
481 } else {
482 struct sk_buff *skb = mpipe_buf_to_skb(va);
483
484 /* Skip headroom, and any custom header. */
485 skb_reserve(skb, NET_IP_ALIGN + l2_offset);
486
487 tile_net_receive_skb(dev, skb, idesc, len);
488 }
489
490 gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
491 return !filter;
492}
493
494/* Handle some packets for the current CPU.
495 *
496 * This function handles up to TILE_NET_BATCH idescs per call.
497 *
498 * ISSUE: Since we do not provide new buffers until this function is
499 * complete, we must initially provide enough buffers for each network
500 * cpu to fill its iqueue and also its batched idescs.
501 *
502 * ISSUE: The "rotting packet" race condition occurs if a packet
503 * arrives after the queue appears to be empty, and before the
504 * hypervisor interrupt is re-enabled.
505 */
506static int tile_net_poll(struct napi_struct *napi, int budget)
507{
508 struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
509 unsigned int work = 0;
510 gxio_mpipe_idesc_t *idesc;
511 int i, n;
512
513 /* Process packets. */
514 while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
515 for (i = 0; i < n; i++) {
516 if (i == TILE_NET_BATCH)
517 goto done;
518 if (tile_net_handle_packet(idesc + i)) {
519 if (++work >= budget)
520 goto done;
521 }
522 }
523 }
524
525 /* There are no packets left. */
526 napi_complete(&info->napi);
527
528 /* Re-enable hypervisor interrupts. */
529 gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
530
531 /* HACK: Avoid the "rotting packet" problem. */
532 if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
533 napi_schedule(&info->napi);
534
535 /* ISSUE: Handle completions? */
536
537done:
538 tile_net_provide_needed_buffers();
539
540 return work;
541}
542
543/* Handle an ingress interrupt on the current cpu. */
544static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
545{
546 struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
547 napi_schedule(&info->napi);
548 return IRQ_HANDLED;
549}
550
551/* Free some completions. This must be called with interrupts blocked. */
552static int tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
553 struct tile_net_comps *comps,
554 int limit, bool force_update)
555{
556 int n = 0;
557 while (comps->comp_last < comps->comp_next) {
558 unsigned int cid = comps->comp_last % TILE_NET_MAX_COMPS;
559 struct tile_net_comp *comp = &comps->comp_queue[cid];
560 if (!gxio_mpipe_equeue_is_complete(equeue, comp->when,
561 force_update || n == 0))
562 break;
563 dev_kfree_skb_irq(comp->skb);
564 comps->comp_last++;
565 if (++n == limit)
566 break;
567 }
568 return n;
569}
570
571/* Add a completion. This must be called with interrupts blocked.
572 * tile_net_equeue_try_reserve() will have ensured a free completion entry.
573 */
574static void add_comp(gxio_mpipe_equeue_t *equeue,
575 struct tile_net_comps *comps,
576 uint64_t when, struct sk_buff *skb)
577{
578 int cid = comps->comp_next % TILE_NET_MAX_COMPS;
579 comps->comp_queue[cid].when = when;
580 comps->comp_queue[cid].skb = skb;
581 comps->comp_next++;
582}
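/* Typical egress sequence (see tile_net_tx_tso() below): reserve slots via
 * tile_net_equeue_try_reserve(), post the edescs with
 * gxio_mpipe_equeue_put_at(), then call add_comp() with "when" set to the
 * last slot used; once gxio_mpipe_equeue_is_complete() reports that slot
 * done, tile_net_free_comps() frees the skb.
 */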
583
Chris Metcalf9b4c3412012-07-01 14:43:47 -0400584static void tile_net_schedule_tx_wake_timer(struct net_device *dev,
585 int tx_queue_idx)
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000586{
Chris Metcalf9b4c3412012-07-01 14:43:47 -0400587 struct tile_net_info *info = &per_cpu(per_cpu_info, tx_queue_idx);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000588 struct tile_net_priv *priv = netdev_priv(dev);
Chris Metcalf9b4c3412012-07-01 14:43:47 -0400589 struct tile_net_tx_wake *tx_wake = &info->tx_wake[priv->echannel];
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000590
Chris Metcalf9b4c3412012-07-01 14:43:47 -0400591 hrtimer_start(&tx_wake->timer,
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000592 ktime_set(0, TX_TIMER_DELAY_USEC * 1000UL),
593 HRTIMER_MODE_REL_PINNED);
594}
595
596static enum hrtimer_restart tile_net_handle_tx_wake_timer(struct hrtimer *t)
597{
598 struct tile_net_tx_wake *tx_wake =
599 container_of(t, struct tile_net_tx_wake, timer);
Chris Metcalf9b4c3412012-07-01 14:43:47 -0400600 netif_wake_subqueue(tx_wake->dev, tx_wake->tx_queue_idx);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000601 return HRTIMER_NORESTART;
602}
603
604/* Make sure the egress timer is scheduled. */
605static void tile_net_schedule_egress_timer(void)
606{
607 struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
608
609 if (!info->egress_timer_scheduled) {
610 hrtimer_start(&info->egress_timer,
611 ktime_set(0, EGRESS_TIMER_DELAY_USEC * 1000UL),
612 HRTIMER_MODE_REL_PINNED);
613 info->egress_timer_scheduled = true;
614 }
615}
616
617/* The "function" for "info->egress_timer".
618 *
619 * This timer will reschedule itself as long as there are any pending
620 * completions expected for this tile.
621 */
622static enum hrtimer_restart tile_net_handle_egress_timer(struct hrtimer *t)
623{
624 struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
625 unsigned long irqflags;
626 bool pending = false;
627 int i;
628
629 local_irq_save(irqflags);
630
631 /* The timer is no longer scheduled. */
632 info->egress_timer_scheduled = false;
633
634 /* Free all possible comps for this tile. */
635 for (i = 0; i < TILE_NET_CHANNELS; i++) {
636 struct tile_net_egress *egress = &egress_for_echannel[i];
637 struct tile_net_comps *comps = info->comps_for_echannel[i];
638 if (comps->comp_last >= comps->comp_next)
639 continue;
640 tile_net_free_comps(egress->equeue, comps, -1, true);
641 pending = pending || (comps->comp_last < comps->comp_next);
642 }
643
644 /* Reschedule timer if needed. */
645 if (pending)
646 tile_net_schedule_egress_timer();
647
648 local_irq_restore(irqflags);
649
650 return HRTIMER_NORESTART;
651}
652
Chris Metcalf5e7a54a2013-08-01 11:36:42 -0400653/* Helper function for "tile_net_update()". */
654static void manage_ingress_irq(void *enable)
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000655{
Chris Metcalf5e7a54a2013-08-01 11:36:42 -0400656 if (enable)
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000657 enable_percpu_irq(ingress_irq, 0);
Chris Metcalf5e7a54a2013-08-01 11:36:42 -0400658 else
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000659 disable_percpu_irq(ingress_irq);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000660}
661
662/* Helper function for tile_net_open() and tile_net_stop().
663 * Always called under tile_net_devs_for_channel_mutex.
664 */
665static int tile_net_update(struct net_device *dev)
666{
667 static gxio_mpipe_rules_t rules; /* too big to fit on the stack */
668 bool saw_channel = false;
669 int channel;
670 int rc;
671 int cpu;
672
673 gxio_mpipe_rules_init(&rules, &context);
674
675 for (channel = 0; channel < TILE_NET_CHANNELS; channel++) {
676 if (tile_net_devs_for_channel[channel] == NULL)
677 continue;
678 if (!saw_channel) {
679 saw_channel = true;
680 gxio_mpipe_rules_begin(&rules, first_bucket,
681 num_buckets, NULL);
682 gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
683 }
684 gxio_mpipe_rules_add_channel(&rules, channel);
685 }
686
687 /* NOTE: This can fail if there is no classifier.
688 * ISSUE: Can anything else cause it to fail?
689 */
690 rc = gxio_mpipe_rules_commit(&rules);
691 if (rc != 0) {
692 netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", rc);
693 return -EIO;
694 }
695
Chris Metcalf5e7a54a2013-08-01 11:36:42 -0400696 /* Update all cpus, sequentially (to protect "netif_napi_add()").
697 * We use on_each_cpu to handle the IPI mask or unmask.
698 */
699 if (!saw_channel)
700 on_each_cpu(manage_ingress_irq, (void *)0, 1);
701 for_each_online_cpu(cpu) {
702 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
703 if (!info->has_iqueue)
704 continue;
705 if (saw_channel) {
706 if (!info->napi_added) {
707 netif_napi_add(dev, &info->napi,
708 tile_net_poll, TILE_NET_WEIGHT);
709 info->napi_added = true;
710 }
711 if (!info->napi_enabled) {
712 napi_enable(&info->napi);
713 info->napi_enabled = true;
714 }
715 } else {
716 if (info->napi_enabled) {
717 napi_disable(&info->napi);
718 info->napi_enabled = false;
719 }
720 /* FIXME: Drain the iqueue. */
721 }
722 }
723 if (saw_channel)
724 on_each_cpu(manage_ingress_irq, (void *)1, 1);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000725
726 /* HACK: Allow packets to flow in the simulator. */
727 if (saw_channel)
728 sim_enable_mpipe_links(0, -1);
729
730 return 0;
731}
732
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400733/* Initialize a buffer stack. */
734static int create_buffer_stack(struct net_device *dev,
735 int kind, size_t num_buffers)
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000736{
737 pte_t hash_pte = pte_set_home((pte_t) { 0 }, PAGE_HOME_HASH);
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400738 size_t needed = gxio_mpipe_calc_buffer_stack_bytes(num_buffers);
739 int stack_idx = first_buffer_stack + kind;
740 void *va;
741 int i, rc;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000742
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400743 /* Round up to 64KB and then use alloc_pages_exact() so we get the
744 * required 64KB alignment.
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000745 */
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400746 buffer_stack_bytes[kind] = ALIGN(needed, 64 * 1024);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000747
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400748 va = alloc_pages_exact(buffer_stack_bytes[kind], GFP_KERNEL);
749 if (va == NULL) {
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000750 netdev_err(dev,
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400751 "Could not alloc %zd bytes for buffer stack %d\n",
752 buffer_stack_bytes[kind], kind);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000753 return -ENOMEM;
754 }
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400755
756 /* Initialize the buffer stack. */
757 rc = gxio_mpipe_init_buffer_stack(&context, stack_idx,
758 buffer_size_enums[kind],
759 va, buffer_stack_bytes[kind], 0);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000760 if (rc != 0) {
761 netdev_err(dev, "gxio_mpipe_init_buffer_stack: %d\n", rc);
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400762 free_pages_exact(va, buffer_stack_bytes[kind]);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000763 return rc;
764 }
765
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400766 buffer_stack_vas[kind] = va;
767
768 rc = gxio_mpipe_register_client_memory(&context, stack_idx,
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000769 hash_pte, 0);
770 if (rc != 0) {
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400771 netdev_err(dev, "gxio_mpipe_register_client_memory: %d\n", rc);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000772 return rc;
773 }
774
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400775 /* Provide initial buffers. */
776 for (i = 0; i < num_buffers; i++) {
777 if (!tile_net_provide_buffer(kind)) {
778 netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
779 return -ENOMEM;
780 }
781 }
782
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000783 return 0;
784}
785
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400786/* Allocate and initialize mpipe buffer stacks, and register them in
787 * the mPIPE TLBs, for small, large, and (possibly) jumbo packet sizes.
788 * This routine supports tile_net_init_mpipe(), below.
789 */
790static int init_buffer_stacks(struct net_device *dev,
791 int network_cpus_count)
792{
793 int num_kinds = MAX_KINDS - (jumbo_num == 0);
794 size_t num_buffers;
795 int rc;
796
797 /* Allocate the buffer stacks. */
798 rc = gxio_mpipe_alloc_buffer_stacks(&context, num_kinds, 0, 0);
799 if (rc < 0) {
800 netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks: %d\n", rc);
801 return rc;
802 }
803 first_buffer_stack = rc;
804
805 /* Enough small/large buffers to (normally) avoid buffer errors. */
806 num_buffers =
807 network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
808
 809 /* Allocate the small buffer stack. */
810 if (rc >= 0)
811 rc = create_buffer_stack(dev, 0, num_buffers);
812
813 /* Allocate the large buffer stack. */
814 if (rc >= 0)
815 rc = create_buffer_stack(dev, 1, num_buffers);
816
817 /* Allocate the jumbo buffer stack if needed. */
818 if (rc >= 0 && jumbo_num != 0)
819 rc = create_buffer_stack(dev, 2, jumbo_num);
820
821 return rc;
822}
823
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000824/* Allocate per-cpu resources (memory for completions and idescs).
825 * This routine supports tile_net_init_mpipe(), below.
826 */
827static int alloc_percpu_mpipe_resources(struct net_device *dev,
828 int cpu, int ring)
829{
830 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
831 int order, i, rc;
832 struct page *page;
833 void *addr;
834
835 /* Allocate the "comps". */
836 order = get_order(COMPS_SIZE);
837 page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
838 if (page == NULL) {
839 netdev_err(dev, "Failed to alloc %zd bytes comps memory\n",
840 COMPS_SIZE);
841 return -ENOMEM;
842 }
843 addr = pfn_to_kaddr(page_to_pfn(page));
844 memset(addr, 0, COMPS_SIZE);
845 for (i = 0; i < TILE_NET_CHANNELS; i++)
846 info->comps_for_echannel[i] =
847 addr + i * sizeof(struct tile_net_comps);
848
849 /* If this is a network cpu, create an iqueue. */
850 if (cpu_isset(cpu, network_cpus_map)) {
851 order = get_order(NOTIF_RING_SIZE);
852 page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
853 if (page == NULL) {
854 netdev_err(dev,
855 "Failed to alloc %zd bytes iqueue memory\n",
856 NOTIF_RING_SIZE);
857 return -ENOMEM;
858 }
859 addr = pfn_to_kaddr(page_to_pfn(page));
860 rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, ring++,
861 addr, NOTIF_RING_SIZE, 0);
862 if (rc < 0) {
863 netdev_err(dev,
864 "gxio_mpipe_iqueue_init failed: %d\n", rc);
865 return rc;
866 }
867 info->has_iqueue = true;
868 }
869
870 return ring;
871}
872
873/* Initialize NotifGroup and buckets.
874 * This routine supports tile_net_init_mpipe(), below.
875 */
876static int init_notif_group_and_buckets(struct net_device *dev,
877 int ring, int network_cpus_count)
878{
879 int group, rc;
880
881 /* Allocate one NotifGroup. */
882 rc = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
883 if (rc < 0) {
884 netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
885 rc);
886 return rc;
887 }
888 group = rc;
889
890 /* Initialize global num_buckets value. */
891 if (network_cpus_count > 4)
892 num_buckets = 256;
893 else if (network_cpus_count > 1)
894 num_buckets = 16;
895
896 /* Allocate some buckets, and set global first_bucket value. */
897 rc = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
898 if (rc < 0) {
899 netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n", rc);
900 return rc;
901 }
902 first_bucket = rc;
903
904 /* Init group and buckets. */
905 rc = gxio_mpipe_init_notif_group_and_buckets(
906 &context, group, ring, network_cpus_count,
907 first_bucket, num_buckets,
908 GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
909 if (rc != 0) {
910 netdev_err(
911 dev,
912 "gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
913 rc);
914 return rc;
915 }
916
917 return 0;
918}
919
920/* Create an irq and register it, then activate the irq and request
921 * interrupts on all cores. Note that "ingress_irq" being initialized
922 * is how we know not to call tile_net_init_mpipe() again.
923 * This routine supports tile_net_init_mpipe(), below.
924 */
925static int tile_net_setup_interrupts(struct net_device *dev)
926{
927 int cpu, rc;
928
929 rc = create_irq();
930 if (rc < 0) {
931 netdev_err(dev, "create_irq failed: %d\n", rc);
932 return rc;
933 }
934 ingress_irq = rc;
935 tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
936 rc = request_irq(ingress_irq, tile_net_handle_ingress_irq,
Simon Marchi6fc4adc2012-11-15 18:13:19 +0000937 0, "tile_net", NULL);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000938 if (rc != 0) {
939 netdev_err(dev, "request_irq failed: %d\n", rc);
940 destroy_irq(ingress_irq);
941 ingress_irq = -1;
942 return rc;
943 }
944
945 for_each_online_cpu(cpu) {
946 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
947 if (info->has_iqueue) {
948 gxio_mpipe_request_notif_ring_interrupt(
949 &context, cpu_x(cpu), cpu_y(cpu),
Chris Metcalfc5399142013-05-02 15:29:04 -0400950 KERNEL_PL, ingress_irq, info->iqueue.ring);
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000951 }
952 }
953
954 return 0;
955}
956
957/* Undo any state set up partially by a failed call to tile_net_init_mpipe. */
958static void tile_net_init_mpipe_fail(void)
959{
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400960 int kind, cpu;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000961
962 /* Do cleanups that require the mpipe context first. */
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400963 for (kind = 0; kind < MAX_KINDS; kind++) {
964 if (buffer_stack_vas[kind] != NULL) {
965 tile_net_pop_all_buffers(first_buffer_stack + kind);
966 }
967 }
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000968
969 /* Destroy mpipe context so the hardware no longer owns any memory. */
970 gxio_mpipe_destroy(&context);
971
972 for_each_online_cpu(cpu) {
973 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
974 free_pages((unsigned long)(info->comps_for_echannel[0]),
975 get_order(COMPS_SIZE));
976 info->comps_for_echannel[0] = NULL;
977 free_pages((unsigned long)(info->iqueue.idescs),
978 get_order(NOTIF_RING_SIZE));
979 info->iqueue.idescs = NULL;
980 }
981
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400982 for (kind = 0; kind < MAX_KINDS; kind++) {
983 if (buffer_stack_vas[kind] != NULL) {
984 free_pages_exact(buffer_stack_vas[kind],
985 buffer_stack_bytes[kind]);
986 buffer_stack_vas[kind] = NULL;
987 }
988 }
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000989
Chris Metcalf2628e8a2013-08-01 11:36:42 -0400990 first_buffer_stack = -1;
Chris Metcalfe3d62d72012-06-07 10:45:02 +0000991 first_bucket = -1;
992}
993
994/* The first time any tilegx network device is opened, we initialize
995 * the global mpipe state. If this step fails, we fail to open the
996 * device, but if it succeeds, we never need to do it again, and since
997 * tile_net can't be unloaded, we never undo it.
998 *
999 * Note that some resources in this path (buffer stack indices,
1000 * bindings from init_buffer_stack, etc.) are hypervisor resources
1001 * that are freed implicitly by gxio_mpipe_destroy().
1002 */
1003static int tile_net_init_mpipe(struct net_device *dev)
1004{
Chris Metcalf2628e8a2013-08-01 11:36:42 -04001005 int rc;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001006 int cpu;
1007 int first_ring, ring;
1008 int network_cpus_count = cpus_weight(network_cpus_map);
1009
1010 if (!hash_default) {
1011 netdev_err(dev, "Networking requires hash_default!\n");
1012 return -EIO;
1013 }
1014
1015 rc = gxio_mpipe_init(&context, 0);
1016 if (rc != 0) {
1017 netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
1018 return -EIO;
1019 }
1020
1021 /* Set up the buffer stacks. */
Chris Metcalf2628e8a2013-08-01 11:36:42 -04001022 rc = init_buffer_stacks(dev, network_cpus_count);
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001023 if (rc != 0)
1024 goto fail;
1025
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001026 /* Allocate one NotifRing for each network cpu. */
1027 rc = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count, 0, 0);
1028 if (rc < 0) {
1029 netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n",
1030 rc);
1031 goto fail;
1032 }
1033
1034 /* Init NotifRings per-cpu. */
1035 first_ring = rc;
1036 ring = first_ring;
1037 for_each_online_cpu(cpu) {
1038 rc = alloc_percpu_mpipe_resources(dev, cpu, ring);
1039 if (rc < 0)
1040 goto fail;
1041 ring = rc;
1042 }
1043
1044 /* Initialize NotifGroup and buckets. */
1045 rc = init_notif_group_and_buckets(dev, first_ring, network_cpus_count);
1046 if (rc != 0)
1047 goto fail;
1048
1049 /* Create and enable interrupts. */
1050 rc = tile_net_setup_interrupts(dev);
1051 if (rc != 0)
1052 goto fail;
1053
1054 return 0;
1055
1056fail:
1057 tile_net_init_mpipe_fail();
1058 return rc;
1059}
1060
1061/* Create persistent egress info for a given egress channel.
1062 * Note that this may be shared between, say, "gbe0" and "xgbe0".
1063 * ISSUE: Defer header allocation until TSO is actually needed?
1064 */
1065static int tile_net_init_egress(struct net_device *dev, int echannel)
1066{
Chris Metcalf2628e8a2013-08-01 11:36:42 -04001067 static int ering = -1;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001068 struct page *headers_page, *edescs_page, *equeue_page;
1069 gxio_mpipe_edesc_t *edescs;
1070 gxio_mpipe_equeue_t *equeue;
1071 unsigned char *headers;
1072 int headers_order, edescs_order, equeue_order;
1073 size_t edescs_size;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001074 int rc = -ENOMEM;
1075
1076 /* Only initialize once. */
1077 if (egress_for_echannel[echannel].equeue != NULL)
1078 return 0;
1079
1080 /* Allocate memory for the "headers". */
1081 headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
1082 headers_page = alloc_pages(GFP_KERNEL, headers_order);
1083 if (headers_page == NULL) {
1084 netdev_warn(dev,
1085 "Could not alloc %zd bytes for TSO headers.\n",
1086 PAGE_SIZE << headers_order);
1087 goto fail;
1088 }
1089 headers = pfn_to_kaddr(page_to_pfn(headers_page));
1090
1091 /* Allocate memory for the "edescs". */
1092 edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
1093 edescs_order = get_order(edescs_size);
1094 edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
1095 if (edescs_page == NULL) {
1096 netdev_warn(dev,
1097 "Could not alloc %zd bytes for eDMA ring.\n",
1098 edescs_size);
1099 goto fail_headers;
1100 }
1101 edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
1102
1103 /* Allocate memory for the "equeue". */
1104 equeue_order = get_order(sizeof(*equeue));
1105 equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
1106 if (equeue_page == NULL) {
1107 netdev_warn(dev,
1108 "Could not alloc %zd bytes for equeue info.\n",
1109 PAGE_SIZE << equeue_order);
1110 goto fail_edescs;
1111 }
1112 equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
1113
Chris Metcalf2628e8a2013-08-01 11:36:42 -04001114 /* Allocate an edma ring (using a one entry "free list"). */
1115 if (ering < 0) {
1116 rc = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
1117 if (rc < 0) {
1118 netdev_warn(dev, "gxio_mpipe_alloc_edma_rings: %d\n",
1119 rc);
1120 goto fail_equeue;
1121 }
1122 ering = rc;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001123 }
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001124
1125 /* Initialize the equeue. */
Chris Metcalf2628e8a2013-08-01 11:36:42 -04001126 rc = gxio_mpipe_equeue_init(equeue, &context, ering, echannel,
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001127 edescs, edescs_size, 0);
1128 if (rc != 0) {
1129 netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
1130 goto fail_equeue;
1131 }
1132
Chris Metcalf2628e8a2013-08-01 11:36:42 -04001133 /* Don't reuse the ering later. */
1134 ering = -1;
1135
1136 if (jumbo_num != 0) {
1137 /* Make sure "jumbo" packets can be egressed safely. */
1138 if (gxio_mpipe_equeue_set_snf_size(equeue, 10368) < 0) {
1139 /* ISSUE: There is no "gxio_mpipe_equeue_destroy()". */
1140 netdev_warn(dev, "Jumbo packets may not be egressed"
1141 " properly on channel %d\n", echannel);
1142 }
1143 }
1144
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001145 /* Done. */
1146 egress_for_echannel[echannel].equeue = equeue;
1147 egress_for_echannel[echannel].headers = headers;
1148 return 0;
1149
1150fail_equeue:
1151 __free_pages(equeue_page, equeue_order);
1152
1153fail_edescs:
1154 __free_pages(edescs_page, edescs_order);
1155
1156fail_headers:
1157 __free_pages(headers_page, headers_order);
1158
1159fail:
1160 return rc;
1161}
1162
1163/* Return channel number for a newly-opened link. */
1164static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
1165 const char *link_name)
1166{
1167 int rc = gxio_mpipe_link_open(link, &context, link_name, 0);
1168 if (rc < 0) {
1169 netdev_err(dev, "Failed to open '%s'\n", link_name);
1170 return rc;
1171 }
Chris Metcalf2628e8a2013-08-01 11:36:42 -04001172 if (jumbo_num != 0) {
1173 u32 attr = GXIO_MPIPE_LINK_RECEIVE_JUMBO;
1174 rc = gxio_mpipe_link_set_attr(link, attr, 1);
1175 if (rc != 0) {
1176 netdev_err(dev,
1177 "Cannot receive jumbo packets on '%s'\n",
1178 link_name);
1179 gxio_mpipe_link_close(link);
1180 return rc;
1181 }
1182 }
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001183 rc = gxio_mpipe_link_channel(link);
1184 if (rc < 0 || rc >= TILE_NET_CHANNELS) {
1185 netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", rc);
1186 gxio_mpipe_link_close(link);
1187 return -EINVAL;
1188 }
1189 return rc;
1190}
1191
1192/* Help the kernel activate the given network interface. */
1193static int tile_net_open(struct net_device *dev)
1194{
1195 struct tile_net_priv *priv = netdev_priv(dev);
1196 int cpu, rc;
1197
1198 mutex_lock(&tile_net_devs_for_channel_mutex);
1199
1200 /* Do one-time initialization the first time any device is opened. */
1201 if (ingress_irq < 0) {
1202 rc = tile_net_init_mpipe(dev);
1203 if (rc != 0)
1204 goto fail;
1205 }
1206
1207 /* Determine if this is the "loopify" device. */
1208 if (unlikely((loopify_link_name != NULL) &&
1209 !strcmp(dev->name, loopify_link_name))) {
1210 rc = tile_net_link_open(dev, &priv->link, "loop0");
1211 if (rc < 0)
1212 goto fail;
1213 priv->channel = rc;
1214 rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
1215 if (rc < 0)
1216 goto fail;
1217 priv->loopify_channel = rc;
1218 priv->echannel = rc;
1219 } else {
1220 rc = tile_net_link_open(dev, &priv->link, dev->name);
1221 if (rc < 0)
1222 goto fail;
1223 priv->channel = rc;
1224 priv->echannel = rc;
1225 }
1226
1227 /* Initialize egress info (if needed). Once ever, per echannel. */
1228 rc = tile_net_init_egress(dev, priv->echannel);
1229 if (rc != 0)
1230 goto fail;
1231
1232 tile_net_devs_for_channel[priv->channel] = dev;
1233
1234 rc = tile_net_update(dev);
1235 if (rc != 0)
1236 goto fail;
1237
1238 mutex_unlock(&tile_net_devs_for_channel_mutex);
1239
1240 /* Initialize the transmit wake timer for this device for each cpu. */
1241 for_each_online_cpu(cpu) {
1242 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
1243 struct tile_net_tx_wake *tx_wake =
1244 &info->tx_wake[priv->echannel];
1245
1246 hrtimer_init(&tx_wake->timer, CLOCK_MONOTONIC,
1247 HRTIMER_MODE_REL);
Chris Metcalf9b4c3412012-07-01 14:43:47 -04001248 tx_wake->tx_queue_idx = cpu;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001249 tx_wake->timer.function = tile_net_handle_tx_wake_timer;
1250 tx_wake->dev = dev;
1251 }
1252
1253 for_each_online_cpu(cpu)
1254 netif_start_subqueue(dev, cpu);
1255 netif_carrier_on(dev);
1256 return 0;
1257
1258fail:
1259 if (priv->loopify_channel >= 0) {
1260 if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
1261 netdev_warn(dev, "Failed to close loopify link!\n");
1262 priv->loopify_channel = -1;
1263 }
1264 if (priv->channel >= 0) {
1265 tile_net_devs_for_channel[priv->channel] = NULL;
1266 if (gxio_mpipe_link_close(&priv->link) != 0)
1267 netdev_warn(dev, "Failed to close link!\n");
1268 priv->channel = -1;
1269 }
1270 priv->echannel = -1;
1271 mutex_unlock(&tile_net_devs_for_channel_mutex);
1272
1273 /* Don't return raw gxio error codes to generic Linux. */
1274 return (rc > -512) ? rc : -EIO;
1275}
1276
1277/* Help the kernel deactivate the given network interface. */
1278static int tile_net_stop(struct net_device *dev)
1279{
1280 struct tile_net_priv *priv = netdev_priv(dev);
1281 int cpu;
1282
1283 for_each_online_cpu(cpu) {
1284 struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
1285 struct tile_net_tx_wake *tx_wake =
1286 &info->tx_wake[priv->echannel];
1287
1288 hrtimer_cancel(&tx_wake->timer);
1289 netif_stop_subqueue(dev, cpu);
1290 }
1291
1292 mutex_lock(&tile_net_devs_for_channel_mutex);
1293 tile_net_devs_for_channel[priv->channel] = NULL;
1294 (void)tile_net_update(dev);
1295 if (priv->loopify_channel >= 0) {
1296 if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
1297 netdev_warn(dev, "Failed to close loopify link!\n");
1298 priv->loopify_channel = -1;
1299 }
1300 if (priv->channel >= 0) {
1301 if (gxio_mpipe_link_close(&priv->link) != 0)
1302 netdev_warn(dev, "Failed to close link!\n");
1303 priv->channel = -1;
1304 }
1305 priv->echannel = -1;
1306 mutex_unlock(&tile_net_devs_for_channel_mutex);
1307
1308 return 0;
1309}
1310
1311/* Determine the VA for a fragment. */
1312static inline void *tile_net_frag_buf(skb_frag_t *f)
1313{
1314 unsigned long pfn = page_to_pfn(skb_frag_page(f));
1315 return pfn_to_kaddr(pfn) + f->page_offset;
1316}
1317
1318/* Acquire a completion entry and an egress slot, or if we can't,
1319 * stop the queue and schedule the tx_wake timer.
1320 */
1321static s64 tile_net_equeue_try_reserve(struct net_device *dev,
Chris Metcalf9b4c3412012-07-01 14:43:47 -04001322 int tx_queue_idx,
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001323 struct tile_net_comps *comps,
1324 gxio_mpipe_equeue_t *equeue,
1325 int num_edescs)
1326{
1327 /* Try to acquire a completion entry. */
1328 if (comps->comp_next - comps->comp_last < TILE_NET_MAX_COMPS - 1 ||
1329 tile_net_free_comps(equeue, comps, 32, false) != 0) {
1330
1331 /* Try to acquire an egress slot. */
1332 s64 slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
1333 if (slot >= 0)
1334 return slot;
1335
1336 /* Freeing some completions gives the equeue time to drain. */
1337 tile_net_free_comps(equeue, comps, TILE_NET_MAX_COMPS, false);
1338
1339 slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
1340 if (slot >= 0)
1341 return slot;
1342 }
1343
1344 /* Still nothing; give up and stop the queue for a short while. */
Chris Metcalf9b4c3412012-07-01 14:43:47 -04001345 netif_stop_subqueue(dev, tx_queue_idx);
1346 tile_net_schedule_tx_wake_timer(dev, tx_queue_idx);
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001347 return -1;
1348}
1349
1350/* Determine how many edesc's are needed for TSO.
1351 *
1352 * Sometimes, if "sendfile()" requires copying, we will be called with
1353 * "data" containing the header and payload, with "frags" being empty.
1354 * Sometimes, for example when using NFS over TCP, a single segment can
1355 * span 3 fragments. This requires special care.
1356 */
1357static int tso_count_edescs(struct sk_buff *skb)
1358{
1359 struct skb_shared_info *sh = skb_shinfo(skb);
Chris Metcalf83885462012-07-11 14:08:21 -04001360 unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001361 unsigned int data_len = skb->len - sh_len;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001362 unsigned int p_len = sh->gso_size;
1363 long f_id = -1; /* id of the current fragment */
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001364 long f_size = skb_headlen(skb) - sh_len; /* current fragment size */
1365 long f_used = 0; /* bytes used from the current fragment */
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001366 long n; /* size of the current piece of payload */
1367 int num_edescs = 0;
1368 int segment;
1369
1370 for (segment = 0; segment < sh->gso_segs; segment++) {
1371
1372 unsigned int p_used = 0;
1373
1374 /* One edesc for header and for each piece of the payload. */
1375 for (num_edescs++; p_used < p_len; num_edescs++) {
1376
1377 /* Advance as needed. */
1378 while (f_used >= f_size) {
1379 f_id++;
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001380 f_size = skb_frag_size(&sh->frags[f_id]);
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001381 f_used = 0;
1382 }
1383
1384 /* Use bytes from the current fragment. */
1385 n = p_len - p_used;
1386 if (n > f_size - f_used)
1387 n = f_size - f_used;
1388 f_used += n;
1389 p_used += n;
1390 }
1391
1392 /* The last segment may be less than gso_size. */
1393 data_len -= p_len;
1394 if (data_len < p_len)
1395 p_len = data_len;
1396 }
1397
1398 return num_edescs;
1399}
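/* Worked example (illustrative numbers): a TSO skb with gso_size 1460,
 * gso_segs 3, and all payload in a single fragment needs one edesc for
 * each segment's copied headers plus one for its payload slice, so the
 * loop above returns 6; a slice that straddles a fragment boundary would
 * add one more edesc for that segment.
 */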
1400
1401/* Prepare modified copies of the skbuff headers.
1402 * FIXME: add support for IPv6.
1403 */
1404static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers,
1405 s64 slot)
1406{
1407 struct skb_shared_info *sh = skb_shinfo(skb);
1408 struct iphdr *ih;
1409 struct tcphdr *th;
Chris Metcalf83885462012-07-11 14:08:21 -04001410 unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001411 unsigned int data_len = skb->len - sh_len;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001412 unsigned char *data = skb->data;
Chris Metcalf83885462012-07-11 14:08:21 -04001413 unsigned int ih_off, th_off, p_len;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001414 unsigned int isum_seed, tsum_seed, id, seq;
1415 long f_id = -1; /* id of the current fragment */
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001416 long f_size = skb_headlen(skb) - sh_len; /* current fragment size */
1417 long f_used = 0; /* bytes used from the current fragment */
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001418 long n; /* size of the current piece of payload */
1419 int segment;
1420
1421 /* Locate original headers and compute various lengths. */
1422 ih = ip_hdr(skb);
1423 th = tcp_hdr(skb);
1424 ih_off = skb_network_offset(skb);
1425 th_off = skb_transport_offset(skb);
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001426 p_len = sh->gso_size;
1427
1428 /* Set up seed values for IP and TCP csum and initialize id and seq. */
1429 isum_seed = ((0xFFFF - ih->check) +
1430 (0xFFFF - ih->tot_len) +
1431 (0xFFFF - ih->id));
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001432 tsum_seed = th->check + (0xFFFF ^ htons(skb->len));
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001433 id = ntohs(ih->id);
1434 seq = ntohl(th->seq);
1435
1436 /* Prepare all the headers. */
1437 for (segment = 0; segment < sh->gso_segs; segment++) {
1438 unsigned char *buf;
1439 unsigned int p_used = 0;
1440
1441 /* Copy to the header memory for this segment. */
1442 buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
1443 NET_IP_ALIGN;
1444 memcpy(buf, data, sh_len);
1445
1446 /* Update copied ip header. */
1447 ih = (struct iphdr *)(buf + ih_off);
1448 ih->tot_len = htons(sh_len + p_len - ih_off);
1449 ih->id = htons(id);
1450 ih->check = csum_long(isum_seed + ih->tot_len +
1451 ih->id) ^ 0xffff;
1452
1453 /* Update copied tcp header. */
1454 th = (struct tcphdr *)(buf + th_off);
1455 th->seq = htonl(seq);
1456 th->check = csum_long(tsum_seed + htons(sh_len + p_len));
1457 if (segment != sh->gso_segs - 1) {
1458 th->fin = 0;
1459 th->psh = 0;
1460 }
1461
1462 /* Skip past the header. */
1463 slot++;
1464
1465 /* Skip past the payload. */
1466 while (p_used < p_len) {
1467
1468 /* Advance as needed. */
1469 while (f_used >= f_size) {
1470 f_id++;
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001471 f_size = skb_frag_size(&sh->frags[f_id]);
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001472 f_used = 0;
1473 }
1474
1475 /* Use bytes from the current fragment. */
1476 n = p_len - p_used;
1477 if (n > f_size - f_used)
1478 n = f_size - f_used;
1479 f_used += n;
1480 p_used += n;
1481
1482 slot++;
1483 }
1484
1485 id++;
1486 seq += p_len;
1487
1488 /* The last segment may be less than gso_size. */
1489 data_len -= p_len;
1490 if (data_len < p_len)
1491 p_len = data_len;
1492 }
1493
1494 /* Flush the headers so they are ready for hardware DMA. */
1495 wmb();
1496}
1497
1498/* Pass all the data to mpipe for egress. */
1499static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
1500 struct sk_buff *skb, unsigned char *headers, s64 slot)
1501{
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001502 struct skb_shared_info *sh = skb_shinfo(skb);
Chris Metcalf83885462012-07-11 14:08:21 -04001503 unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001504 unsigned int data_len = skb->len - sh_len;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001505 unsigned int p_len = sh->gso_size;
1506 gxio_mpipe_edesc_t edesc_head = { { 0 } };
1507 gxio_mpipe_edesc_t edesc_body = { { 0 } };
1508 long f_id = -1; /* id of the current fragment */
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001509 long f_size = skb_headlen(skb) - sh_len; /* current fragment size */
1510 long f_used = 0; /* bytes used from the current fragment */
1511 void *f_data = skb->data + sh_len;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001512 long n; /* size of the current piece of payload */
1513 unsigned long tx_packets = 0, tx_bytes = 0;
Chris Metcalf83885462012-07-11 14:08:21 -04001514 unsigned int csum_start;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001515 int segment;
1516
1517 /* Prepare to egress the headers: set up header edesc. */
1518 csum_start = skb_checksum_start_offset(skb);
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001519 edesc_head.csum = 1;
1520 edesc_head.csum_start = csum_start;
1521 edesc_head.csum_dest = csum_start + skb->csum_offset;
1522 edesc_head.xfer_size = sh_len;
1523
1524 /* This is only used to specify the TLB. */
Chris Metcalf2628e8a2013-08-01 11:36:42 -04001525 edesc_head.stack_idx = first_buffer_stack;
1526 edesc_body.stack_idx = first_buffer_stack;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001527
1528 /* Egress all the edescs. */
1529 for (segment = 0; segment < sh->gso_segs; segment++) {
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001530 unsigned char *buf;
1531 unsigned int p_used = 0;
1532
1533 /* Egress the header. */
1534 buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
1535 NET_IP_ALIGN;
1536 edesc_head.va = va_to_tile_io_addr(buf);
1537 gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
1538 slot++;
1539
1540 /* Egress the payload. */
1541 while (p_used < p_len) {
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001542 void *va;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001543
1544 /* Advance as needed. */
1545 while (f_used >= f_size) {
1546 f_id++;
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001547 f_size = skb_frag_size(&sh->frags[f_id]);
Chris Metcalf83885462012-07-11 14:08:21 -04001548 f_data = tile_net_frag_buf(&sh->frags[f_id]);
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001549 f_used = 0;
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001550 }
1551
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001552 va = f_data + f_used;
1553
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001554 /* Use bytes from the current fragment. */
1555 n = p_len - p_used;
1556 if (n > f_size - f_used)
1557 n = f_size - f_used;
1558 f_used += n;
1559 p_used += n;
1560
1561 /* Egress a piece of the payload. */
Chris Metcalf3da3fff2012-10-25 07:25:20 +00001562 edesc_body.va = va_to_tile_io_addr(va);
Chris Metcalfe3d62d72012-06-07 10:45:02 +00001563 edesc_body.xfer_size = n;
			edesc_body.bound = !(p_used < p_len);
			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
			slot++;
		}

		tx_packets++;
		tx_bytes += sh_len + p_len;

		/* The last segment may be less than gso_size. */
		data_len -= p_len;
		if (data_len < p_len)
			p_len = data_len;
	}

	/* Update stats. */
	tile_net_stats_add(tx_packets, &dev->stats.tx_packets);
	tile_net_stats_add(tx_bytes, &dev->stats.tx_bytes);
}

/* Do "TSO" handling for egress.
 *
 * Normally drivers set NETIF_F_TSO only to support hardware TSO;
 * otherwise the stack uses scatter-gather to implement GSO in software.
 * In our testing, enabling GSO support (via NETIF_F_SG) drops network
 * performance to around 7.5 Gbps on the 10G interfaces, although it
 * also drops cpu utilization to under 8%. Implementing "TSO" in the
 * driver brings performance back up to line rate, while dropping cpu
 * usage even further, to less than 4%. In practice, profiling of GSO
 * shows that skb_segment() is what causes the performance overhead;
 * the driver benefits from using preallocated memory to duplicate the
 * TCP/IP headers.
 */
static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
{
	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
	struct tile_net_priv *priv = netdev_priv(dev);
	int channel = priv->echannel;
	struct tile_net_egress *egress = &egress_for_echannel[channel];
	struct tile_net_comps *comps = info->comps_for_echannel[channel];
	gxio_mpipe_equeue_t *equeue = egress->equeue;
	unsigned long irqflags;
	int num_edescs;
	s64 slot;

	/* Determine how many mpipe edescs are needed. */
	num_edescs = tso_count_edescs(skb);

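	/* Reserve with interrupts off, so the per-cpu completion state
	 * cannot be touched by the egress timer while we hold it.
	 */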
	local_irq_save(irqflags);

	/* Try to acquire a completion entry and an egress slot. */
	slot = tile_net_equeue_try_reserve(dev, skb->queue_mapping, comps,
					   equeue, num_edescs);
	if (slot < 0) {
		local_irq_restore(irqflags);
		return NETDEV_TX_BUSY;
	}

	/* Set up copies of header data properly. */
	tso_headers_prepare(skb, egress->headers, slot);

	/* Actually pass the data to the network hardware. */
	tso_egress(dev, equeue, skb, egress->headers, slot);

	/* Add a completion record. */
	add_comp(equeue, comps, slot + num_edescs - 1, skb);

	local_irq_restore(irqflags);

	/* Make sure the egress timer is scheduled. */
	tile_net_schedule_egress_timer();

	return NETDEV_TX_OK;
}

/* Analyze the body and frags for a transmit request. */
static unsigned int tile_net_tx_frags(struct frag *frags,
				      struct sk_buff *skb,
				      void *b_data, unsigned int b_len)
{
	unsigned int i, n = 0;

	struct skb_shared_info *sh = skb_shinfo(skb);

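	/* The linear data, if any, comes first. */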
	if (b_len != 0) {
		frags[n].buf = b_data;
		frags[n++].length = b_len;
	}

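	/* Then one entry per page fragment. */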
	for (i = 0; i < sh->nr_frags; i++) {
		skb_frag_t *f = &sh->frags[i];
		frags[n].buf = tile_net_frag_buf(f);
		frags[n++].length = skb_frag_size(f);
	}

	return n;
}

/* Help the kernel transmit a packet. */
static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
{
	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
	struct tile_net_priv *priv = netdev_priv(dev);
	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
	gxio_mpipe_equeue_t *equeue = egress->equeue;
	struct tile_net_comps *comps =
		info->comps_for_echannel[priv->echannel];
	unsigned int len = skb->len;
	unsigned char *data = skb->data;
	unsigned int num_edescs;
	struct frag frags[MAX_FRAGS];
	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
	unsigned long irqflags;
	gxio_mpipe_edesc_t edesc = { { 0 } };
	unsigned int i;
	s64 slot;

	if (skb_is_gso(skb))
		return tile_net_tx_tso(skb, dev);

	num_edescs = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));

	/* This is only used to specify the TLB. */
	edesc.stack_idx = first_buffer_stack;

	/* Prepare the edescs. */
	for (i = 0; i < num_edescs; i++) {
		edesc.xfer_size = frags[i].length;
		edesc.va = va_to_tile_io_addr(frags[i].buf);
		edescs[i] = edesc;
	}

	/* Mark the final edesc. */
	edescs[num_edescs - 1].bound = 1;

	/* Add checksum info to the initial edesc, if needed. */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		unsigned int csum_start = skb_checksum_start_offset(skb);
		edescs[0].csum = 1;
		edescs[0].csum_start = csum_start;
		edescs[0].csum_dest = csum_start + skb->csum_offset;
	}

	local_irq_save(irqflags);

	/* Try to acquire a completion entry and an egress slot. */
	slot = tile_net_equeue_try_reserve(dev, skb->queue_mapping, comps,
					   equeue, num_edescs);
	if (slot < 0) {
		local_irq_restore(irqflags);
		return NETDEV_TX_BUSY;
	}

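	/* Copy the prepared edescs into the reserved equeue slots. */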
	for (i = 0; i < num_edescs; i++)
		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot++);

	/* Add a completion record. */
	add_comp(equeue, comps, slot - 1, skb);

	/* NOTE: Use ETH_ZLEN for short packets (e.g. 42 < 60). */
	tile_net_stats_add(1, &dev->stats.tx_packets);
	tile_net_stats_add(max_t(unsigned int, len, ETH_ZLEN),
			   &dev->stats.tx_bytes);

	local_irq_restore(irqflags);

	/* Make sure the egress timer is scheduled. */
	tile_net_schedule_egress_timer();

	return NETDEV_TX_OK;
}

/* Return the id of the subqueue to use on this core (one subqueue per core). */
static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb)
{
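	/* The device was created with one tx subqueue per cpu
	 * (see alloc_netdev_mqs() in tile_net_dev_init() below).
	 */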
	return smp_processor_id();
}

/* Deal with a transmit timeout. */
static void tile_net_tx_timeout(struct net_device *dev)
{
	int cpu;

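	/* Wake every per-cpu subqueue that may have been stopped. */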
	for_each_online_cpu(cpu)
		netif_wake_subqueue(dev, cpu);
}

/* Ioctl commands. */
static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
	return -EOPNOTSUPP;
}

/* Change the MTU. */
static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
{
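	/* 68 is the minimum MTU required by IPv4; allow jumbo frames up
	 * to 9000 bytes only if jumbo receive buffers were configured.
	 */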
	if (new_mtu < 68)
		return -EINVAL;
	if (new_mtu > ((jumbo_num != 0) ? 9000 : 1500))
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* Change the Ethernet address of the NIC.
 *
 * The hypervisor driver does not support changing the MAC address.
 * However, the hardware does not do anything with the MAC address, so
 * the address which gets used on outgoing packets, and which is
 * accepted on incoming packets, is completely up to us.
 *
 * Returns 0 on success, negative on failure.
 */
static int tile_net_set_mac_address(struct net_device *dev, void *p)
{
	struct sockaddr *addr = p;

	if (!is_valid_ether_addr(addr->sa_data))
		return -EINVAL;
	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
	return 0;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
/* Polling 'interrupt' - used by things like netconsole to send skbs
 * without having to re-enable interrupts. It's not called while
 * the interrupt routine is executing.
 */
static void tile_net_netpoll(struct net_device *dev)
{
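	/* Briefly mask the ingress irq and run the handler by hand. */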
	disable_percpu_irq(ingress_irq);
	tile_net_handle_ingress_irq(ingress_irq, NULL);
	enable_percpu_irq(ingress_irq, 0);
}
#endif

static const struct net_device_ops tile_net_ops = {
	.ndo_open = tile_net_open,
	.ndo_stop = tile_net_stop,
	.ndo_start_xmit = tile_net_tx,
	.ndo_select_queue = tile_net_select_queue,
	.ndo_do_ioctl = tile_net_ioctl,
	.ndo_change_mtu = tile_net_change_mtu,
	.ndo_tx_timeout = tile_net_tx_timeout,
	.ndo_set_mac_address = tile_net_set_mac_address,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = tile_net_netpoll,
#endif
};

/* The setup function.
 *
 * This uses ether_setup() to assign various fields in dev, including
 * setting IFF_BROADCAST and IFF_MULTICAST, then sets some extra fields.
 */
static void tile_net_setup(struct net_device *dev)
{
	netdev_features_t features = 0;

	ether_setup(dev);
	dev->netdev_ops = &tile_net_ops;
	dev->watchdog_timeo = TILE_NET_TIMEOUT;
	dev->mtu = 1500;

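	/* Checksum offload and scatter/gather are handled via the egress
	 * descriptors; TSO is implemented in the driver itself (see
	 * tile_net_tx_tso() above).
	 */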
	features |= NETIF_F_LLTX;
	features |= NETIF_F_HW_CSUM;
	features |= NETIF_F_SG;
	features |= NETIF_F_TSO;

	dev->hw_features |= features;
	dev->vlan_features |= features;
	dev->features |= features;
}

/* Allocate the device structure, register the device, and obtain the
 * MAC address from the hypervisor.
 */
static void tile_net_dev_init(const char *name, const uint8_t *mac)
{
	int ret;
	int i;
	int nz_addr = 0;
	struct net_device *dev;
	struct tile_net_priv *priv;

	/* HACK: Ignore "loop" links. */
	if (strncmp(name, "loop", 4) == 0)
		return;

	/* Allocate the device structure. Normally, "name" is a
	 * template, instantiated by register_netdev(), but not for us.
	 */
	dev = alloc_netdev_mqs(sizeof(*priv), name, tile_net_setup,
			       NR_CPUS, 1);
	if (!dev) {
		pr_err("alloc_netdev_mqs(%s) failed\n", name);
		return;
	}

	/* Initialize "priv". */
	priv = netdev_priv(dev);
	memset(priv, 0, sizeof(*priv));
	priv->dev = dev;
	priv->channel = -1;
	priv->loopify_channel = -1;
	priv->echannel = -1;

	/* Get the MAC address and set it in the device struct; this must
	 * be done before the device is opened. If the MAC is all zeroes,
	 * we use a random address, since we're probably on the simulator.
	 */
	for (i = 0; i < 6; i++)
		nz_addr |= mac[i];

	if (nz_addr) {
		memcpy(dev->dev_addr, mac, 6);
		dev->addr_len = 6;
	} else {
		eth_hw_addr_random(dev);
	}

	/* Register the network device. */
	ret = register_netdev(dev);
	if (ret) {
		netdev_err(dev, "register_netdev failed %d\n", ret);
		free_netdev(dev);
		return;
	}
}

/* Per-cpu module initialization. */
static void tile_net_init_module_percpu(void *unused)
{
	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
	int my_cpu = smp_processor_id();

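	/* No ingress queue on this cpu yet; one is set up when a device
	 * is brought up.
	 */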
	info->has_iqueue = false;

	info->my_cpu = my_cpu;

	/* Initialize the egress timer. */
	hrtimer_init(&info->egress_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	info->egress_timer.function = tile_net_handle_egress_timer;
}

/* Module initialization. */
static int __init tile_net_init_module(void)
{
	int i;
	char name[GXIO_MPIPE_LINK_NAME_LEN];
	uint8_t mac[6];

	pr_info("Tilera Network Driver\n");

	mutex_init(&tile_net_devs_for_channel_mutex);

	/* Initialize each CPU. */
	on_each_cpu(tile_net_init_module_percpu, NULL, 1);

	/* Find out what devices we have, and initialize them. */
	for (i = 0; gxio_mpipe_link_enumerate_mac(i, name, mac) >= 0; i++)
		tile_net_dev_init(name, mac);

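	/* If no explicit set of network cpus was configured, allow
	 * ingress on any online cpu.
	 */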
	if (!network_cpus_init())
		network_cpus_map = *cpu_online_mask;

	return 0;
}

module_init(tile_net_init_module);