Skip to content

Commit da5e363

Browse files
Tom Herbert authored and davem330 committed
soreuseport: TCP/IPv4 implementation
Allow multiple listener sockets to bind to the same port. Motivation for soreuseport would be something like a web server binding to port 80 running with multiple threads, where each thread might have its own listener socket. This could be done as an alternative to other models: 1) have one listener thread which dispatches completed connections to workers. 2) accept on a single listener socket from multiple threads. In case #1 the listener thread can easily become the bottleneck with a high connection turn-over rate. In case #2, the proportion of connections accepted per thread tends to be uneven under high connection load (assuming a simple event loop: while (1) { accept(); process() }); wakeup does not promote fairness among the sockets. We have seen the disproportion to be as high as a 3:1 ratio between the thread accepting the most connections and the one accepting the fewest. With so_reuseport the distribution is uniform. Signed-off-by: Tom Herbert <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 055dc21 commit da5e363

File tree

5 files changed

+73
-21
lines changed

5 files changed

+73
-21
lines changed

include/net/inet_hashtables.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,9 @@ struct inet_bind_bucket {
8181
struct net *ib_net;
8282
#endif
8383
unsigned short port;
84-
signed short fastreuse;
84+
signed char fastreuse;
85+
signed char fastreuseport;
86+
kuid_t fastuid;
8587
int num_owners;
8688
struct hlist_node node;
8789
struct hlist_head owners;
@@ -257,15 +259,19 @@ extern void inet_unhash(struct sock *sk);
257259

258260
extern struct sock *__inet_lookup_listener(struct net *net,
259261
struct inet_hashinfo *hashinfo,
262+
const __be32 saddr,
263+
const __be16 sport,
260264
const __be32 daddr,
261265
const unsigned short hnum,
262266
const int dif);
263267

264268
static inline struct sock *inet_lookup_listener(struct net *net,
265269
struct inet_hashinfo *hashinfo,
270+
__be32 saddr, __be16 sport,
266271
__be32 daddr, __be16 dport, int dif)
267272
{
268-
return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif);
273+
return __inet_lookup_listener(net, hashinfo, saddr, sport,
274+
daddr, ntohs(dport), dif);
269275
}
270276

271277
/* Socket demux engine toys. */
@@ -358,7 +364,8 @@ static inline struct sock *__inet_lookup(struct net *net,
358364
struct sock *sk = __inet_lookup_established(net, hashinfo,
359365
saddr, sport, daddr, hnum, dif);
360366

361-
return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
367+
return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
368+
daddr, hnum, dif);
362369
}
363370

364371
static inline struct sock *inet_lookup(struct net *net,

include/net/netfilter/nf_tproxy_core.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
8282
break;
8383
case NFT_LOOKUP_LISTENER:
8484
sk = inet_lookup_listener(net, &tcp_hashinfo,
85+
saddr, sport,
8586
daddr, dport,
8687
in->ifindex);
8788

net/ipv4/inet_connection_sock.c

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
5959
struct sock *sk2;
6060
struct hlist_node *node;
6161
int reuse = sk->sk_reuse;
62+
int reuseport = sk->sk_reuseport;
63+
kuid_t uid = sock_i_uid((struct sock *)sk);
6264

6365
/*
6466
* Unlike other sk lookup places we do not check
@@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
7375
(!sk->sk_bound_dev_if ||
7476
!sk2->sk_bound_dev_if ||
7577
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
76-
if (!reuse || !sk2->sk_reuse ||
77-
sk2->sk_state == TCP_LISTEN) {
78+
if ((!reuse || !sk2->sk_reuse ||
79+
sk2->sk_state == TCP_LISTEN) &&
80+
(!reuseport || !sk2->sk_reuseport ||
81+
(sk2->sk_state != TCP_TIME_WAIT &&
82+
!uid_eq(uid, sock_i_uid(sk2))))) {
7883
const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
7984
if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
8085
sk2_rcv_saddr == sk_rcv_saddr(sk))
@@ -106,6 +111,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
106111
int ret, attempts = 5;
107112
struct net *net = sock_net(sk);
108113
int smallest_size = -1, smallest_rover;
114+
kuid_t uid = sock_i_uid(sk);
109115

110116
local_bh_disable();
111117
if (!snum) {
@@ -125,9 +131,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
125131
spin_lock(&head->lock);
126132
inet_bind_bucket_for_each(tb, node, &head->chain)
127133
if (net_eq(ib_net(tb), net) && tb->port == rover) {
128-
if (tb->fastreuse > 0 &&
129-
sk->sk_reuse &&
130-
sk->sk_state != TCP_LISTEN &&
134+
if (((tb->fastreuse > 0 &&
135+
sk->sk_reuse &&
136+
sk->sk_state != TCP_LISTEN) ||
137+
(tb->fastreuseport > 0 &&
138+
sk->sk_reuseport &&
139+
uid_eq(tb->fastuid, uid))) &&
131140
(tb->num_owners < smallest_size || smallest_size == -1)) {
132141
smallest_size = tb->num_owners;
133142
smallest_rover = rover;
@@ -185,14 +194,17 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
185194
if (sk->sk_reuse == SK_FORCE_REUSE)
186195
goto success;
187196

188-
if (tb->fastreuse > 0 &&
189-
sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
197+
if (((tb->fastreuse > 0 &&
198+
sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
199+
(tb->fastreuseport > 0 &&
200+
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
190201
smallest_size == -1) {
191202
goto success;
192203
} else {
193204
ret = 1;
194205
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
195-
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
206+
if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
207+
(sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
196208
smallest_size != -1 && --attempts >= 0) {
197209
spin_unlock(&head->lock);
198210
goto again;
@@ -212,9 +224,23 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
212224
tb->fastreuse = 1;
213225
else
214226
tb->fastreuse = 0;
215-
} else if (tb->fastreuse &&
216-
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
217-
tb->fastreuse = 0;
227+
if (sk->sk_reuseport) {
228+
tb->fastreuseport = 1;
229+
tb->fastuid = uid;
230+
} else {
231+
tb->fastreuseport = 0;
232+
tb->fastuid = 0;
233+
}
234+
} else {
235+
if (tb->fastreuse &&
236+
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
237+
tb->fastreuse = 0;
238+
if (tb->fastreuseport &&
239+
(!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) {
240+
tb->fastreuseport = 0;
241+
tb->fastuid = 0;
242+
}
243+
}
218244
success:
219245
if (!inet_csk(sk)->icsk_bind_hash)
220246
inet_bind_hash(sk, tb, snum);

net/ipv4/inet_hashtables.c

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
3939
write_pnet(&tb->ib_net, hold_net(net));
4040
tb->port = snum;
4141
tb->fastreuse = 0;
42+
tb->fastreuseport = 0;
4243
tb->num_owners = 0;
4344
INIT_HLIST_HEAD(&tb->owners);
4445
hlist_add_head(&tb->node, &head->chain);
@@ -151,16 +152,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
151152
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
152153
!ipv6_only_sock(sk)) {
153154
__be32 rcv_saddr = inet->inet_rcv_saddr;
154-
score = sk->sk_family == PF_INET ? 1 : 0;
155+
score = sk->sk_family == PF_INET ? 2 : 1;
155156
if (rcv_saddr) {
156157
if (rcv_saddr != daddr)
157158
return -1;
158-
score += 2;
159+
score += 4;
159160
}
160161
if (sk->sk_bound_dev_if) {
161162
if (sk->sk_bound_dev_if != dif)
162163
return -1;
163-
score += 2;
164+
score += 4;
164165
}
165166
}
166167
return score;
@@ -176,24 +177,37 @@ static inline int compute_score(struct sock *sk, struct net *net,
176177

177178
struct sock *__inet_lookup_listener(struct net *net,
178179
struct inet_hashinfo *hashinfo,
180+
const __be32 saddr, __be16 sport,
179181
const __be32 daddr, const unsigned short hnum,
180182
const int dif)
181183
{
182184
struct sock *sk, *result;
183185
struct hlist_nulls_node *node;
184186
unsigned int hash = inet_lhashfn(net, hnum);
185187
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
186-
int score, hiscore;
188+
int score, hiscore, matches = 0, reuseport = 0;
189+
u32 phash = 0;
187190

188191
rcu_read_lock();
189192
begin:
190193
result = NULL;
191-
hiscore = -1;
194+
hiscore = 0;
192195
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
193196
score = compute_score(sk, net, hnum, daddr, dif);
194197
if (score > hiscore) {
195198
result = sk;
196199
hiscore = score;
200+
reuseport = sk->sk_reuseport;
201+
if (reuseport) {
202+
phash = inet_ehashfn(net, daddr, hnum,
203+
saddr, sport);
204+
matches = 1;
205+
}
206+
} else if (score == hiscore && reuseport) {
207+
matches++;
208+
if (((u64)phash * matches) >> 32 == 0)
209+
result = sk;
210+
phash = next_pseudo_random32(phash);
197211
}
198212
}
199213
/*
@@ -501,7 +515,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
501515
inet_bind_bucket_for_each(tb, node, &head->chain) {
502516
if (net_eq(ib_net(tb), net) &&
503517
tb->port == port) {
504-
if (tb->fastreuse >= 0)
518+
if (tb->fastreuse >= 0 ||
519+
tb->fastreuseport >= 0)
505520
goto next_port;
506521
WARN_ON(hlist_empty(&tb->owners));
507522
if (!check_established(death_row, sk,
@@ -518,6 +533,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
518533
break;
519534
}
520535
tb->fastreuse = -1;
536+
tb->fastreuseport = -1;
521537
goto ok;
522538

523539
next_port:

net/ipv4/tcp_ipv4.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -657,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
657657
* no RST generated if md5 hash doesn't match.
658658
*/
659659
sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
660-
&tcp_hashinfo, ip_hdr(skb)->daddr,
660+
&tcp_hashinfo, ip_hdr(skb)->saddr,
661+
th->source, ip_hdr(skb)->daddr,
661662
ntohs(th->source), inet_iif(skb));
662663
/* don't send rst if it can't find key */
663664
if (!sk1)
@@ -2074,6 +2075,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
20742075
case TCP_TW_SYN: {
20752076
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
20762077
&tcp_hashinfo,
2078+
iph->saddr, th->source,
20772079
iph->daddr, th->dest,
20782080
inet_iif(skb));
20792081
if (sk2) {

0 commit comments

Comments
 (0)