mirror of
https://github.com/adulau/aha.git
synced 2025-01-01 05:36:24 +00:00
[NET]: Size listen hash tables using backlog hint
We currently allocate a fixed-size hash table (TCP_SYNQ_HSIZE=512 slots) for each LISTEN socket, regardless of various parameters (the listen backlog, for example). On x86_64, this means order-1 allocations (which might fail), even for 'small' sockets expecting few connections. Conversely, a huge server wanting a backlog of 50000 is slowed down a bit because of this fixed limit. This patch makes the sizing of the listen hash table a dynamic parameter, depending on: - the net.core.somaxconn tunable (default is 128) - the net.ipv4.tcp_max_syn_backlog tunable (default: 256, 1024 or 128) - the backlog value given by the user application (2nd parameter of listen()). For large allocations (bigger than PAGE_SIZE), we use vmalloc() instead of kmalloc(). We still limit memory allocation with the two existing tunables (somaxconn & tcp_max_syn_backlog). So for standard setups, this patch actually reduces RAM usage. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
3c62f75aac
commit
72a3effaf6
9 changed files with 38 additions and 24 deletions
|
@ -28,8 +28,8 @@ struct proto;
|
||||||
|
|
||||||
struct request_sock_ops {
|
struct request_sock_ops {
|
||||||
int family;
|
int family;
|
||||||
kmem_cache_t *slab;
|
|
||||||
int obj_size;
|
int obj_size;
|
||||||
|
kmem_cache_t *slab;
|
||||||
int (*rtx_syn_ack)(struct sock *sk,
|
int (*rtx_syn_ack)(struct sock *sk,
|
||||||
struct request_sock *req,
|
struct request_sock *req,
|
||||||
struct dst_entry *dst);
|
struct dst_entry *dst);
|
||||||
|
@ -51,13 +51,13 @@ struct request_sock {
|
||||||
u32 rcv_wnd; /* rcv_wnd offered first time */
|
u32 rcv_wnd; /* rcv_wnd offered first time */
|
||||||
u32 ts_recent;
|
u32 ts_recent;
|
||||||
unsigned long expires;
|
unsigned long expires;
|
||||||
struct request_sock_ops *rsk_ops;
|
const struct request_sock_ops *rsk_ops;
|
||||||
struct sock *sk;
|
struct sock *sk;
|
||||||
u32 secid;
|
u32 secid;
|
||||||
u32 peer_secid;
|
u32 peer_secid;
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops)
|
static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
|
||||||
{
|
{
|
||||||
struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);
|
struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);
|
||||||
|
|
||||||
|
@ -121,7 +121,7 @@ struct request_sock_queue {
|
||||||
};
|
};
|
||||||
|
|
||||||
extern int reqsk_queue_alloc(struct request_sock_queue *queue,
|
extern int reqsk_queue_alloc(struct request_sock_queue *queue,
|
||||||
const int nr_table_entries);
|
unsigned int nr_table_entries);
|
||||||
|
|
||||||
static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue)
|
static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue)
|
||||||
{
|
{
|
||||||
|
|
|
@ -138,7 +138,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
|
||||||
#define MAX_TCP_SYNCNT 127
|
#define MAX_TCP_SYNCNT 127
|
||||||
|
|
||||||
#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
|
#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
|
||||||
#define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */
|
|
||||||
|
|
||||||
#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
|
#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
|
||||||
#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
|
#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
#include <linux/random.h>
|
#include <linux/random.h>
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
#include <linux/string.h>
|
#include <linux/string.h>
|
||||||
|
#include <linux/vmalloc.h>
|
||||||
|
|
||||||
#include <net/request_sock.h>
|
#include <net/request_sock.h>
|
||||||
|
|
||||||
|
@ -29,22 +30,31 @@
|
||||||
* it is absolutely not enough even at 100conn/sec. 256 cures most
|
* it is absolutely not enough even at 100conn/sec. 256 cures most
|
||||||
* of problems. This value is adjusted to 128 for very small machines
|
* of problems. This value is adjusted to 128 for very small machines
|
||||||
* (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
|
* (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
|
||||||
* Further increasing requires to change hash table size.
|
* Note: Don't forget somaxconn, which may limit the backlog too.
|
||||||
*/
|
*/
|
||||||
int sysctl_max_syn_backlog = 256;
|
int sysctl_max_syn_backlog = 256;
|
||||||
|
|
||||||
int reqsk_queue_alloc(struct request_sock_queue *queue,
|
int reqsk_queue_alloc(struct request_sock_queue *queue,
|
||||||
const int nr_table_entries)
|
unsigned int nr_table_entries)
|
||||||
{
|
{
|
||||||
const int lopt_size = sizeof(struct listen_sock) +
|
size_t lopt_size = sizeof(struct listen_sock);
|
||||||
nr_table_entries * sizeof(struct request_sock *);
|
struct listen_sock *lopt;
|
||||||
struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL);
|
|
||||||
|
|
||||||
|
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
|
||||||
|
nr_table_entries = max_t(u32, nr_table_entries, 8);
|
||||||
|
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
|
||||||
|
lopt_size += nr_table_entries * sizeof(struct request_sock *);
|
||||||
|
if (lopt_size > PAGE_SIZE)
|
||||||
|
lopt = __vmalloc(lopt_size,
|
||||||
|
GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
|
||||||
|
PAGE_KERNEL);
|
||||||
|
else
|
||||||
|
lopt = kzalloc(lopt_size, GFP_KERNEL);
|
||||||
if (lopt == NULL)
|
if (lopt == NULL)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
for (lopt->max_qlen_log = 6;
|
for (lopt->max_qlen_log = 3;
|
||||||
(1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
|
(1 << lopt->max_qlen_log) < nr_table_entries;
|
||||||
lopt->max_qlen_log++);
|
lopt->max_qlen_log++);
|
||||||
|
|
||||||
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
|
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
|
||||||
|
@ -65,9 +75,11 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
|
||||||
{
|
{
|
||||||
/* make all the listen_opt local to us */
|
/* make all the listen_opt local to us */
|
||||||
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
|
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
|
||||||
|
size_t lopt_size = sizeof(struct listen_sock) +
|
||||||
|
lopt->nr_table_entries * sizeof(struct request_sock *);
|
||||||
|
|
||||||
if (lopt->qlen != 0) {
|
if (lopt->qlen != 0) {
|
||||||
int i;
|
unsigned int i;
|
||||||
|
|
||||||
for (i = 0; i < lopt->nr_table_entries; i++) {
|
for (i = 0; i < lopt->nr_table_entries; i++) {
|
||||||
struct request_sock *req;
|
struct request_sock *req;
|
||||||
|
@ -81,6 +93,9 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
|
||||||
}
|
}
|
||||||
|
|
||||||
BUG_TRAP(lopt->qlen == 0);
|
BUG_TRAP(lopt->qlen == 0);
|
||||||
|
if (lopt_size > PAGE_SIZE)
|
||||||
|
vfree(lopt);
|
||||||
|
else
|
||||||
kfree(lopt);
|
kfree(lopt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1022,7 +1022,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req)
|
||||||
kfree(inet_rsk(req)->opt);
|
kfree(inet_rsk(req)->opt);
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct request_sock_ops dccp_request_sock_ops = {
|
static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
|
||||||
.family = PF_INET,
|
.family = PF_INET,
|
||||||
.obj_size = sizeof(struct dccp_request_sock),
|
.obj_size = sizeof(struct dccp_request_sock),
|
||||||
.rtx_syn_ack = dccp_v4_send_response,
|
.rtx_syn_ack = dccp_v4_send_response,
|
||||||
|
|
|
@ -262,12 +262,12 @@ int dccp_destroy_sock(struct sock *sk)
|
||||||
|
|
||||||
EXPORT_SYMBOL_GPL(dccp_destroy_sock);
|
EXPORT_SYMBOL_GPL(dccp_destroy_sock);
|
||||||
|
|
||||||
static inline int dccp_listen_start(struct sock *sk)
|
static inline int dccp_listen_start(struct sock *sk, int backlog)
|
||||||
{
|
{
|
||||||
struct dccp_sock *dp = dccp_sk(sk);
|
struct dccp_sock *dp = dccp_sk(sk);
|
||||||
|
|
||||||
dp->dccps_role = DCCP_ROLE_LISTEN;
|
dp->dccps_role = DCCP_ROLE_LISTEN;
|
||||||
return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
|
return inet_csk_listen_start(sk, backlog);
|
||||||
}
|
}
|
||||||
|
|
||||||
int dccp_disconnect(struct sock *sk, int flags)
|
int dccp_disconnect(struct sock *sk, int flags)
|
||||||
|
@ -788,7 +788,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
|
||||||
* FIXME: here it probably should be sk->sk_prot->listen_start
|
* FIXME: here it probably should be sk->sk_prot->listen_start
|
||||||
* see tcp_listen_start
|
* see tcp_listen_start
|
||||||
*/
|
*/
|
||||||
err = dccp_listen_start(sk);
|
err = dccp_listen_start(sk, backlog);
|
||||||
if (err)
|
if (err)
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
|
@ -204,7 +204,7 @@ int inet_listen(struct socket *sock, int backlog)
|
||||||
* we can only allow the backlog to be adjusted.
|
* we can only allow the backlog to be adjusted.
|
||||||
*/
|
*/
|
||||||
if (old_state != TCP_LISTEN) {
|
if (old_state != TCP_LISTEN) {
|
||||||
err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
|
err = inet_csk_listen_start(sk, backlog);
|
||||||
if (err)
|
if (err)
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
|
@ -343,7 +343,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
|
||||||
EXPORT_SYMBOL_GPL(inet_csk_route_req);
|
EXPORT_SYMBOL_GPL(inet_csk_route_req);
|
||||||
|
|
||||||
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
|
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
|
||||||
const u32 rnd, const u16 synq_hsize)
|
const u32 rnd, const u32 synq_hsize)
|
||||||
{
|
{
|
||||||
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
|
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
|
||||||
}
|
}
|
||||||
|
|
|
@ -715,7 +715,7 @@ static struct ip_options *tcp_v4_save_options(struct sock *sk,
|
||||||
return dopt;
|
return dopt;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct request_sock_ops tcp_request_sock_ops = {
|
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
|
||||||
.family = PF_INET,
|
.family = PF_INET,
|
||||||
.obj_size = sizeof(struct tcp_request_sock),
|
.obj_size = sizeof(struct tcp_request_sock),
|
||||||
.rtx_syn_ack = tcp_v4_send_synack,
|
.rtx_syn_ack = tcp_v4_send_synack,
|
||||||
|
@ -1395,7 +1395,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
|
||||||
}
|
}
|
||||||
req = req->dl_next;
|
req = req->dl_next;
|
||||||
}
|
}
|
||||||
if (++st->sbucket >= TCP_SYNQ_HSIZE)
|
if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
|
||||||
break;
|
break;
|
||||||
get_req:
|
get_req:
|
||||||
req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
|
req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
|
||||||
|
|
|
@ -526,7 +526,7 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req)
|
||||||
kfree_skb(inet6_rsk(req)->pktopts);
|
kfree_skb(inet6_rsk(req)->pktopts);
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct request_sock_ops tcp6_request_sock_ops = {
|
static struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
|
||||||
.family = AF_INET6,
|
.family = AF_INET6,
|
||||||
.obj_size = sizeof(struct tcp6_request_sock),
|
.obj_size = sizeof(struct tcp6_request_sock),
|
||||||
.rtx_syn_ack = tcp_v6_send_synack,
|
.rtx_syn_ack = tcp_v6_send_synack,
|
||||||
|
|
Loading…
Reference in a new issue