diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index c251cca295c..d4e39ff1545 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -25,6 +25,14 @@ menuconfig NETDEVICES # that for each of the symbols. if NETDEVICES +config NETDEVICES_MULTIQUEUE + bool "Netdevice multiple hardware queue support" + ---help--- + Say Y here if you want to allow the network stack to use multiple + hardware TX queues on an ethernet device. + + Most people will say N here. + config IFB tristate "Intermediate Functional Block support" depends on NET_CLS_ACT diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index f48eb89efd0..6cdb97365e4 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -39,7 +39,8 @@ extern void eth_header_cache_update(struct hh_cache *hh, struct net_device *dev extern int eth_header_cache(struct neighbour *neigh, struct hh_cache *hh); -extern struct net_device *alloc_etherdev(int sizeof_priv); +extern struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count); +#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) /** * is_zero_ether_addr - Determine if give Ethernet address is all zeros. diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2c0cc19edfb..9817821729c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -108,6 +108,14 @@ struct wireless_dev; #define MAX_HEADER (LL_MAX_HEADER + 48) #endif +struct net_device_subqueue +{ + /* Give a control state for each queue. This struct may contain + * per-queue locks in the future. + */ + unsigned long state; +}; + /* * Network device statistics. Akin to the 2.0 ether stats but * with byte counters. @@ -331,6 +339,7 @@ struct net_device #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */ #define NETIF_F_GSO 2048 /* Enable software GSO. 
*/ #define NETIF_F_LLTX 4096 /* LockLess TX */ +#define NETIF_F_MULTI_QUEUE 16384 /* Has multiple TX/RX queues */ /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 @@ -557,6 +566,10 @@ struct net_device /* rtnetlink link ops */ const struct rtnl_link_ops *rtnl_link_ops; + + /* The TX queue control structures */ + unsigned int egress_subqueue_count; + struct net_device_subqueue egress_subqueue[0]; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -565,9 +578,7 @@ struct net_device static inline void *netdev_priv(const struct net_device *dev) { - return (char *)dev + ((sizeof(struct net_device) - + NETDEV_ALIGN_CONST) - & ~NETDEV_ALIGN_CONST); + return dev->priv; } #define SET_MODULE_OWNER(dev) do { } while (0) @@ -719,6 +730,62 @@ static inline int netif_running(const struct net_device *dev) return test_bit(__LINK_STATE_START, &dev->state); } +/* + * Routines to manage the subqueues on a device. We only need start + * stop, and a check if it's stopped. All other device management is + * done at the overall netdevice level. + * Also test the device if we're multiqueue. 
+ */ +static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index) +{ +#ifdef CONFIG_NETDEVICES_MULTIQUEUE + clear_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state); +#endif +} + +static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index) +{ +#ifdef CONFIG_NETDEVICES_MULTIQUEUE +#ifdef CONFIG_NETPOLL_TRAP + if (netpoll_trap()) + return; +#endif + set_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state); +#endif +} + +static inline int netif_subqueue_stopped(const struct net_device *dev, + u16 queue_index) +{ +#ifdef CONFIG_NETDEVICES_MULTIQUEUE + return test_bit(__LINK_STATE_XOFF, + &dev->egress_subqueue[queue_index].state); +#else + return 0; +#endif +} + +static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) +{ +#ifdef CONFIG_NETDEVICES_MULTIQUEUE +#ifdef CONFIG_NETPOLL_TRAP + if (netpoll_trap()) + return; +#endif + if (test_and_clear_bit(__LINK_STATE_XOFF, + &dev->egress_subqueue[queue_index].state)) + __netif_schedule(dev); +#endif +} + +static inline int netif_is_multiqueue(const struct net_device *dev) +{ +#ifdef CONFIG_NETDEVICES_MULTIQUEUE + return (!!(NETIF_F_MULTI_QUEUE & dev->features)); +#else + return 0; +#endif +} /* Use this variant when it is known for sure that it * is executing from interrupt context. 
@@ -1009,8 +1076,11 @@ static inline void netif_tx_disable(struct net_device *dev) extern void ether_setup(struct net_device *dev); /* Support for loadable net-drivers */ -extern struct net_device *alloc_netdev(int sizeof_priv, const char *name, - void (*setup)(struct net_device *)); +extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), + unsigned int queue_count); +#define alloc_netdev(sizeof_priv, name, setup) \ + alloc_netdev_mq(sizeof_priv, name, setup, 1) extern int register_netdev(struct net_device *dev); extern void unregister_netdev(struct net_device *dev); /* Functions used for secondary unicast and multicast support */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 881fe80f01d..2d6a14f5f2f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -196,7 +196,6 @@ typedef unsigned char *sk_buff_data_t; * @sk: Socket we are owned by * @tstamp: Time we arrived * @dev: Device we arrived on/are leaving by - * @iif: ifindex of device we arrived on * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header @@ -231,6 +230,8 @@ typedef unsigned char *sk_buff_data_t; * @nfctinfo: Relationship of this skb to the connection * @nfct_reasm: netfilter conntrack re-assembly pointer * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c + * @iif: ifindex of device we arrived on + * @queue_mapping: Queue mapping for multiqueue devices * @tc_index: Traffic control index * @tc_verd: traffic control verdict * @dma_cookie: a cookie to one of several possible DMA operations @@ -246,8 +247,6 @@ struct sk_buff { struct sock *sk; ktime_t tstamp; struct net_device *dev; - int iif; - /* 4 byte hole on 64 bit*/ struct dst_entry *dst; struct sec_path *sp; @@ -290,12 +289,18 @@ struct sk_buff { #ifdef CONFIG_BRIDGE_NETFILTER struct nf_bridge_info *nf_bridge; #endif + + int iif; + __u16 queue_mapping; + #ifdef CONFIG_NET_SCHED 
__u16 tc_index; /* traffic control index */ #ifdef CONFIG_NET_CLS_ACT __u16 tc_verd; /* traffic control verdict */ #endif #endif + /* 2 byte hole */ + #ifdef CONFIG_NET_DMA dma_cookie_t dma_cookie; #endif @@ -1725,6 +1730,20 @@ static inline void skb_init_secmark(struct sk_buff *skb) { } #endif +static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) +{ +#ifdef CONFIG_NETDEVICES_MULTIQUEUE + skb->queue_mapping = queue_mapping; +#endif +} + +static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from) +{ +#ifdef CONFIG_NETDEVICES_MULTIQUEUE + to->queue_mapping = from->queue_mapping; +#endif +} + static inline int skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; diff --git a/net/core/dev.c b/net/core/dev.c index 6dce9d2d46f..7ddf66d0ad5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1429,7 +1429,9 @@ gso: skb->next = nskb; return rc; } - if (unlikely(netif_queue_stopped(dev) && skb->next)) + if (unlikely((netif_queue_stopped(dev) || + netif_subqueue_stopped(dev, skb->queue_mapping)) && + skb->next)) return NETDEV_TX_BUSY; } while (skb->next); @@ -1547,6 +1549,8 @@ gso: spin_lock(&dev->queue_lock); q = dev->qdisc; if (q->enqueue) { + /* reset queue_mapping to zero */ + skb->queue_mapping = 0; rc = q->enqueue(skb, q); qdisc_run(dev); spin_unlock(&dev->queue_lock); @@ -1576,7 +1580,8 @@ gso: HARD_TX_LOCK(dev, cpu); - if (!netif_queue_stopped(dev)) { + if (!netif_queue_stopped(dev) && + !netif_subqueue_stopped(dev, skb->queue_mapping)) { rc = 0; if (!dev_hard_start_xmit(skb, dev)) { HARD_TX_UNLOCK(dev); @@ -3539,16 +3544,18 @@ static struct net_device_stats *internal_stats(struct net_device *dev) } /** - * alloc_netdev - allocate network device + * alloc_netdev_mq - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @setup: callback to initialize device + * @queue_count: the number of subqueues to allocate * * Allocates 
a struct net_device with private data area for driver use - * and performs basic initialization. + * and performs basic initialization. Also allocates subqueue structs + * for each queue on the device at the end of the netdevice. */ -struct net_device *alloc_netdev(int sizeof_priv, const char *name, - void (*setup)(struct net_device *)) +struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), unsigned int queue_count) { void *p; struct net_device *dev; @@ -3557,7 +3564,9 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name, BUG_ON(strlen(name) >= sizeof(dev->name)); /* ensure 32-byte alignment of both the device and private area */ - alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; + alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST + + (sizeof(struct net_device_subqueue) * queue_count)) & + ~NETDEV_ALIGN_CONST; alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; p = kzalloc(alloc_size, GFP_KERNEL); @@ -3570,15 +3579,22 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name, (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); dev->padded = (char *)dev - (char *)p; - if (sizeof_priv) - dev->priv = netdev_priv(dev); + if (sizeof_priv) { + dev->priv = ((char *)dev + + ((sizeof(struct net_device) + + (sizeof(struct net_device_subqueue) * + queue_count) + NETDEV_ALIGN_CONST) + & ~NETDEV_ALIGN_CONST)); + } + + dev->egress_subqueue_count = queue_count; dev->get_stats = internal_stats; setup(dev); strcpy(dev->name, name); return dev; } -EXPORT_SYMBOL(alloc_netdev); +EXPORT_SYMBOL(alloc_netdev_mq); /** * free_netdev - free network device diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a0efdd7a6b3..4b06d193637 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -66,8 +66,9 @@ static void queue_process(struct work_struct *work) local_irq_save(flags); netif_tx_lock(dev); - if (netif_queue_stopped(dev) || - dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) { + 
if ((netif_queue_stopped(dev) || + netif_subqueue_stopped(dev, skb->queue_mapping)) || + dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); netif_tx_unlock(dev); local_irq_restore(flags); @@ -254,7 +255,8 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (netif_tx_trylock(dev)) { - if (!netif_queue_stopped(dev)) + if (!netif_queue_stopped(dev) && + !netif_subqueue_stopped(dev, skb->queue_mapping)) status = dev->hard_start_xmit(skb, dev); netif_tx_unlock(dev); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9cd3a1cb60e..dffe067e7a7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3139,7 +3139,9 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) } } - if (netif_queue_stopped(odev) || need_resched()) { + if ((netif_queue_stopped(odev) || + netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping)) || + need_resched()) { idle_start = getCurUs(); if (!netif_running(odev)) { @@ -3154,7 +3156,8 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->idle_acc += getCurUs() - idle_start; - if (netif_queue_stopped(odev)) { + if (netif_queue_stopped(odev) || + netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping)) { pkt_dev->next_tx_us = getCurUs(); /* TODO */ pkt_dev->next_tx_ns = 0; goto out; /* Try the next interface */ @@ -3181,7 +3184,8 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) } netif_tx_lock_bh(odev); - if (!netif_queue_stopped(odev)) { + if (!netif_queue_stopped(odev) && + !netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping)) { atomic_inc(&(pkt_dev->skb->users)); retry_now: diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c989c3a0f90..6a41b96b3d3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -419,6 +419,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n->nohdr = 0; C(pkt_type); C(ip_summed); + 
skb_copy_queue_mapping(n, skb); C(priority); #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) C(ipvs_property); @@ -460,6 +461,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif new->sk = NULL; new->dev = old->dev; + skb_copy_queue_mapping(new, old); new->priority = old->priority; new->protocol = old->protocol; new->dst = dst_clone(old->dst); @@ -1932,6 +1934,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) tail = nskb; nskb->dev = skb->dev; + skb_copy_queue_mapping(nskb, skb); nskb->priority = skb->priority; nskb->protocol = skb->protocol; nskb->dst = dst_clone(skb->dst); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 0ac2524f3b6..1387e5411f7 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -316,9 +316,10 @@ void ether_setup(struct net_device *dev) EXPORT_SYMBOL(ether_setup); /** - * alloc_etherdev - Allocates and sets up an Ethernet device + * alloc_etherdev_mq - Allocates and sets up an Ethernet device * @sizeof_priv: Size of additional driver-private structure to be allocated * for this Ethernet device + * @queue_count: The number of queues this device has. * * Fill in the fields of the device structure with Ethernet-generic * values. Basically does everything except registering the device. @@ -328,8 +329,8 @@ EXPORT_SYMBOL(ether_setup); * this private data area. 
*/ -struct net_device *alloc_etherdev(int sizeof_priv) +struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count) { - return alloc_netdev(sizeof_priv, "eth%d", ether_setup); + return alloc_netdev_mq(sizeof_priv, "eth%d", ether_setup, queue_count); } -EXPORT_SYMBOL(alloc_etherdev); +EXPORT_SYMBOL(alloc_etherdev_mq); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index f05ad9a30b4..dfe7e452098 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -277,6 +277,7 @@ static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev) int busy; int nores; int len = skb->len; + int subq = skb->queue_mapping; struct sk_buff *skb_res = NULL; start = master->slaves; @@ -293,7 +294,9 @@ restart: if (slave->qdisc_sleeping != q) continue; - if (netif_queue_stopped(slave) || ! netif_running(slave)) { + if (netif_queue_stopped(slave) || + netif_subqueue_stopped(slave, subq) || + !netif_running(slave)) { busy = 1; continue; } @@ -302,6 +305,7 @@ restart: case 0: if (netif_tx_trylock(slave)) { if (!netif_queue_stopped(slave) && + !netif_subqueue_stopped(slave, subq) && slave->hard_start_xmit(skb, slave) == 0) { netif_tx_unlock(slave); master->slaves = NEXT_SLAVE(q);