From 7310164b2cbae510b17377973fab26bf85c7d6c6 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Sat, 22 Apr 2023 22:06:23 +0200
Subject: [PATCH] MINOR: listener: add a new global
 tune.listener.default-shards setting

This new setting accepts "by-process", "by-group" and "by-thread" and
will dictate how listeners will be sharded by default when nothing is
specified. While the default remains "by-process", "by-group" should be
much more efficient with many threads, while not changing anything for
single-group setups.
---
 doc/configuration.txt      |   24 ++++++++++++++++++++++++
 include/haproxy/global-t.h |    1 +
 src/haproxy.c              |    1 +
 src/listener.c             |   24 +++++++++++++++++++++++-
 4 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/doc/configuration.txt b/doc/configuration.txt
index d337c88..8fbe88a 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -3023,6 +3023,30 @@ tune.idletimer <timeout>
   clicking). There should be no reason for changing this value. Please check
   tune.ssl.maxrecord below.
 
+tune.listener.default-shards { by-process | by-thread | by-group }
+  Normally, all "bind" lines will create a single shard, that is, a single
+  socket that all threads of the process will listen to. With many threads,
+  this is not very efficient, and may even induce some important overhead in
+  the kernel for updating the polling state or even distributing events to the
+  various threads. Modern operating systems support balancing of incoming
+  connections, a mechanism that permits multiple sockets to be bound to the
+  same address and port, and that evenly distributes all incoming
+  connections to these sockets so that each thread only sees the connections
+  that are waiting in the socket it is bound to. This significantly reduces
+  kernel-side overhead and increases performance in the incoming connection
+  path. This is usually enabled in HAProxy using the "shards" setting on "bind"
+  lines, which defaults to 1, meaning that each listener will be unique in the
+  process. On systems with many processors, it may be more convenient to change
+  the default setting to "by-thread" in order to always create one listening
+  socket per thread, or "by-group" in order to always create one listening
+  socket per thread group. Be careful about the file descriptor usage with
+  "by-thread" as each listener will need as many sockets as there are threads.
+  Also, some operating systems (e.g. FreeBSD) are limited to no more than 256
+  sockets on the same address. Note that "by-group" will remain equivalent to
+  "by-process" for default configurations involving a single thread group, and
+  will fall back to sharing the same socket on systems that do not support this
+  mechanism. As such, it is the recommended setting.
+
 tune.listener.multi-queue { on | fair | off }
   Enables ('on' / 'fair') or disables ('off') the listener's multi-queue accept
   which spreads the incoming traffic to all threads a "bind" line is allowed to
diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h
index 3a7b53b..e7d02fe 100644
--- a/include/haproxy/global-t.h
+++ b/include/haproxy/global-t.h
@@ -168,6 +168,7 @@ struct global {
 		size_t pool_cache_size;    /* per-thread cache size per pool (defaults to CONFIG_HAP_POOL_CACHE_SIZE) */
 		unsigned short idle_timer; /* how long before an empty buffer is considered idle (ms) */
 		int nb_stk_ctr;       /* number of stick counters, defaults to MAX_SESS_STKCTR */
+		int default_shards; /* default shards for listeners, or -1 (by-thread) or -2 (by-group) */
 #ifdef USE_QUIC
 		unsigned int quic_backend_max_idle_timeout;
 		unsigned int quic_frontend_max_idle_timeout;
diff --git a/src/haproxy.c b/src/haproxy.c
index b1574ab..739183a 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -205,6 +205,7 @@ struct global global = {
 		.idle_timer = 1000, /* 1 second */
 #endif
 		.nb_stk_ctr = MAX_SESS_STKCTR,
+		.default_shards = 1, /* "by-process" = one shard per listener */
 #ifdef USE_QUIC
 		.quic_backend_max_idle_timeout = QUIC_TP_DFLT_BACK_MAX_IDLE_TIMEOUT,
 		.quic_frontend_max_idle_timeout = QUIC_TP_DFLT_FRONT_MAX_IDLE_TIMEOUT,
diff --git a/src/listener.c b/src/listener.c
index bfb5ece..d5390ed 100644
--- a/src/listener.c
+++ b/src/listener.c
@@ -1918,7 +1918,7 @@ struct bind_conf *bind_conf_alloc(struct proxy *fe, const char *file,
 	bind_conf->settings.ux.uid = -1;
 	bind_conf->settings.ux.gid = -1;
 	bind_conf->settings.ux.mode = 0;
-	bind_conf->settings.shards = 1;
+	bind_conf->settings.shards = global.tune.default_shards;
 	bind_conf->xprt = xprt;
 	bind_conf->frontend = fe;
 	bind_conf->analysers = fe->fe_req_ana;
@@ -2298,6 +2298,27 @@ static int bind_parse_thread(char **args, int cur_arg, struct proxy *px, struct
 	return 0;
 }
 
+/* config parser for global "tune.listener.default-shards". Returns 0 on success, -1 on error (an unknown value is reported through <err>). */
+static int cfg_parse_tune_listener_shards(char **args, int section_type, struct proxy *curpx,
+                                          const struct proxy *defpx, const char *file, int line,
+                                          char **err)
+{
+	if (too_many_args(1, args, err, NULL)) /* presumably rejects extra words after the value; verify in too_many_args() */
+		return -1;
+
+	if (strcmp(args[1], "by-thread") == 0)
+		global.tune.default_shards = -1; /* -1 is the marker for "by-thread" */
+	else if (strcmp(args[1], "by-group") == 0)
+		global.tune.default_shards = -2; /* -2 is the marker for "by-group" */
+	else if (strcmp(args[1], "by-process") == 0)
+		global.tune.default_shards = 1; /* one shard per listener, same as the built-in default */
+	else { /* unrecognized value: report the keyword and the offending value, then fail */
+		memprintf(err, "'%s' expects either 'by-process', 'by-group', or 'by-thread' but got '%s'.", args[0], args[1]);
+		return -1;
+	}
+	return 0;
+}
+
 /* config parser for global "tune.listener.multi-queue", accepts "on", "fair" or "off" */
 static int cfg_parse_tune_listener_mq(char **args, int section_type, struct proxy *curpx,
                                       const struct proxy *defpx, const char *file, int line,
@@ -2366,6 +2387,7 @@ INITCALL1(STG_REGISTER, bind_register_keywords, &bind_kws);
 
 /* config keyword parsers */
 static struct cfg_kw_list cfg_kws = {ILH, {
+	{ CFG_GLOBAL, "tune.listener.default-shards",   cfg_parse_tune_listener_shards  },
 	{ CFG_GLOBAL, "tune.listener.multi-queue",      cfg_parse_tune_listener_mq      },
 	{ 0, NULL, NULL }
 }};
-- 
1.7.10.4