BUG/MEDIUM: resolvers: Track api calls with a counter to free resolutions
authorChristopher Faulet <cfaulet@haproxy.com>
Tue, 2 Nov 2021 15:25:05 +0000 (16:25 +0100)
committerChristopher Faulet <cfaulet@haproxy.com>
Wed, 3 Nov 2021 15:21:31 +0000 (16:21 +0100)
The kill list introduced in commit f766ec6b5 ("MEDIUM: resolvers: use a kill
list to preserve the list consistency") contains a bug. The deatch_row must
be initialized before calling resolv_process_responses() function. However,
this function is called for the dns code. The death_row is not visible from
the outside. So, it is possible to add a resolution in an uninitialized
death_row, leading to a crash.

But, with the current implementation, it is not possible to handle the
death_row in resolv_process_responses() function because, internally, the
kill list may be freed via a call to resolv_unlink_resolution(). At the end,
we are unable to determine all call chains to guarantee a safe use of the
kill list. It is a shameful observation, but unfortunatly true.

So, to make the fix simple, we track all calls to the public resolvers
api. A counter is incremented when we enter in the resolver code and
decremented when we leave it. This way, we are able to track the recursions
to init and release the kill list only once, at the edge.

Following functions are incrementing/decrementing the recurse counter:

  * resolv_trigger_resolution()
  * resolv_srvrq_expire_task()
  * resolv_link_resolution()
  * resolv_unlink_resolution()
  * resolv_detach_from_resolution_answer_items()
  * resolv_process_responses()
  * process_resolvers()
  * resolvers_finalize_config()
  * resolv_action_do_resolve()

This patch should fix the issue #1404. It must be backported everywhere the
above commit was backported.

(cherry picked from commit 9ed1a0601d6b5ccaf6d410fee4d5d4b49d36652a)
Signed-off-by: Willy Tarreau <w@1wt.eu>
(cherry picked from commit 7a3a5361324d0b40a723f560e977e7370f9471da)
Signed-off-by: Christopher Faulet <cfaulet@haproxy.com>

src/dns.c

index c566fb1..e653e3d 100644 (file)
--- a/src/dns.c
+++ b/src/dns.c
@@ -51,6 +51,7 @@ struct list dns_resolvers  = LIST_HEAD_INIT(dns_resolvers);
 struct list dns_srvrq_list = LIST_HEAD_INIT(dns_srvrq_list);
 
 static THREAD_LOCAL struct list death_row; /* list of deferred resolutions to kill, local validity only */
+static THREAD_LOCAL unsigned int recurse = 0; /* counter to track calls to public functions */
 static THREAD_LOCAL uint64_t dns_query_id_seed = 0; /* random seed */
 
 DECLARE_STATIC_POOL(dns_answer_item_pool, "dns_answer_item", sizeof(struct dns_answer_item));
@@ -62,6 +63,8 @@ unsigned int dns_failed_resolutions = 0;
 static struct task *dns_process_resolvers(struct task *t, void *context, unsigned short state);
 static void dns_free_resolution(struct dns_resolution *resolution);
 static void _dns_unlink_resolution(struct dns_requester *requester);
+static void enter_resolver_code();
+static void leave_resolver_code();
 
 enum {
        DNS_STAT_ID,
@@ -509,12 +512,16 @@ void dns_trigger_resolution(struct dns_requester *req)
        res       = req->resolution;
        resolvers = res->resolvers;
 
+       enter_resolver_code();
+
        /* The resolution must not be triggered yet. Use the cached response, if
         * valid */
        exp = tick_add(res->last_resolution, resolvers->hold.valid);
        if (resolvers->t && (res->status != RSLV_STATUS_VALID ||
            !tick_isset(res->last_resolution) || tick_is_expired(exp, now_ms)))
                task_wakeup(resolvers->t, TASK_WOKEN_OTHER);
+
+       leave_resolver_code();
 }
 
 
@@ -633,13 +640,16 @@ int dns_read_name(unsigned char *buffer, unsigned char *bufend,
 
 /* Reinitialize the list of aborted resolutions before calling certain
  * functions relying on it. The list must be processed by calling
- * free_aborted_resolutions() after operations.
+ * leave_resolver_code() after operations.
  */
-static void init_aborted_resolutions()
+static void enter_resolver_code()
 {
-       LIST_INIT(&death_row);
+       if (!recurse)
+               LIST_INIT(&death_row);
+       recurse++;
 }
 
+/* Add a resolution to the death_row. */
 static void abort_resolution(struct dns_resolution *res)
 {
        LIST_DEL_INIT(&res->list);
@@ -647,16 +657,20 @@ static void abort_resolution(struct dns_resolution *res)
 }
 
 /* This releases any aborted resolution found in the death row. It is mandatory
- * to call init_aborted_resolutions() first before the function (or loop) that
+ * to call enter_resolver_code() first before the function (or loop) that
  * needs to defer deletions. Note that some of them are in relation via internal
  * objects and might cause the deletion of other ones from the same list, so we
  * must absolutely not use a list_for_each_entry_safe() nor any such thing here,
  * and solely rely on each call to remove the first remaining list element.
  */
-static void free_aborted_resolutions()
+static void leave_resolver_code()
 {
        struct dns_resolution *res;
 
+       recurse--;
+       if (recurse)
+               return;
+
        while (!LIST_ISEMPTY(&death_row)) {
                res = LIST_NEXT(&death_row, struct dns_resolution *, list);
                dns_free_resolution(res);
@@ -671,7 +685,7 @@ static void free_aborted_resolutions()
  * obsolete.
  *
  * Must be called with the DNS lock held, and with the death_row already
- * initialized via init_aborted_resolutions().
+ * initialized via enter_resolver_code().
  */
 static void dns_srvrq_cleanup_srv(struct server *srv)
 {
@@ -709,9 +723,9 @@ static struct task *dns_srvrq_expire_task(struct task *t, void *context, unsigne
                goto end;
 
        HA_SPIN_LOCK(DNS_LOCK, &srv->srvrq->resolvers->lock);
-       init_aborted_resolutions();
+       enter_resolver_code();
        dns_srvrq_cleanup_srv(srv);
-       free_aborted_resolutions();
+       leave_resolver_code();
        HA_SPIN_UNLOCK(DNS_LOCK, &srv->srvrq->resolvers->lock);
 
  end:
@@ -719,8 +733,7 @@ static struct task *dns_srvrq_expire_task(struct task *t, void *context, unsigne
 }
 
 /* Checks for any obsolete record, also identify any SRV request, and try to
- * find a corresponding server. Must be called with the death_row already
- * initialized via init_aborted_resolutions().
+ * find a corresponding server.
 */
 static void dns_check_dns_response(struct dns_resolution *res)
 {
@@ -1993,8 +2006,7 @@ dns_get_requester(struct dns_requester **req, enum obj_type *owner,
 }
 
 /* Links a requester (a server or a dns_srvrq) with a resolution. It returns 0
- * on success, -1 otherwise. Must be called with the death_row already initialized
- * via init_aborted_resolutions().
+ * on success, -1 otherwise.
  */
 int dns_link_resolution(void *requester, int requester_type, int requester_locked)
 {
@@ -2007,6 +2019,7 @@ int dns_link_resolution(void *requester, int requester_type, int requester_locke
        char **hostname_dn;
        int   hostname_dn_len, query_type;
 
+       enter_resolver_code();
        switch (requester_type) {
                case OBJ_TYPE_SERVER:
                        srv             = (struct server *)requester;
@@ -2082,20 +2095,18 @@ int dns_link_resolution(void *requester, int requester_type, int requester_locke
   err:
        if (res && LIST_ISEMPTY(&res->requesters))
                dns_free_resolution(res);
+       leave_resolver_code();
        return -1;
 }
 
-/* This function removes all server/srvrq references on answer items
- * if <safe> is set to 1, in case of srvrq, sub server resquesters unlink
- * is called using safe == 1 to make it usable into callbacks. Must be called
- * with the death_row already initialized via init_aborted_resolutions().
- */
+/* This function removes all server/srvrq references on answer items. */
 void dns_detach_from_resolution_answer_items(struct dns_resolution *res,  struct dns_requester *req)
 {
        struct dns_answer_item *item, *itemback;
        struct server *srv, *srvback;
        struct dns_srvrq    *srvrq;
 
+       enter_resolver_code();
        if ((srv = objt_server(req->owner)) != NULL) {
                LIST_DEL_INIT(&srv->ip_rec_item);
        }
@@ -2109,13 +2120,12 @@ void dns_detach_from_resolution_answer_items(struct dns_resolution *res,  struct
                        }
                }
        }
+       leave_resolver_code();
 }
 
 
 /* Removes a requester from a DNS resolution. It takes takes care of all the
  * consequences. It also cleans up some parameters from the requester.
- * if <safe> is set to 1, the corresponding resolution is not released. Must be
- * called with the death_row already initialized via init_aborted_resolutions().
  */
 void _dns_unlink_resolution(struct dns_requester *requester)
 {
@@ -2166,9 +2176,9 @@ void _dns_unlink_resolution(struct dns_requester *requester)
 /* The public version of the function above that deals with the death row. */
 void dns_unlink_resolution(struct dns_requester *requester)
 {
-       init_aborted_resolutions();
+       enter_resolver_code();
        _dns_unlink_resolution(requester);
-       free_aborted_resolutions();
+       leave_resolver_code();
 }
 
 /* Called when a network IO is generated on a name server socket for an incoming
@@ -2178,7 +2188,6 @@ void dns_unlink_resolution(struct dns_requester *requester)
  *  - call requester's error callback if invalid response
  *  - check the dn_name in the packet against the one sent
  *
- * Must be called with the death_row already initialized via init_aborted_resolutions().
  */
 static void dns_resolve_recv(struct dgram_conn *dgram)
 {
@@ -2209,6 +2218,7 @@ static void dns_resolve_recv(struct dgram_conn *dgram)
        }
 
        resolvers = ns->resolvers;
+       enter_resolver_code();
        HA_SPIN_LOCK(DNS_LOCK, &resolvers->lock);
 
        /* process all pending input messages */
@@ -2383,6 +2393,7 @@ static void dns_resolve_recv(struct dgram_conn *dgram)
        }
        dns_update_resolvers_timeout(resolvers);
        HA_SPIN_UNLOCK(DNS_LOCK, &resolvers->lock);
+       leave_resolver_code();
 }
 
 /* Called when a resolvers network socket is ready to send data */
@@ -2468,6 +2479,7 @@ static struct task *dns_process_resolvers(struct task *t, void *context, unsigne
        struct dns_resolution *res, *resback;
        int exp;
 
+       enter_resolver_code();
        HA_SPIN_LOCK(DNS_LOCK, &resolvers->lock);
 
        /* Handle all expired resolutions from the active list. Elements that
@@ -2475,7 +2487,6 @@ static struct task *dns_process_resolvers(struct task *t, void *context, unsigne
         * ones will be handled normally.
         */
 
-       init_aborted_resolutions();
        res  = LIST_NEXT(&resolvers->resolutions.curr, struct dns_resolution *, list);
        while (&res->list != &resolvers->resolutions.curr) {
                resback = LIST_NEXT(&res->list, struct dns_resolution *, list);
@@ -2558,12 +2569,11 @@ static struct task *dns_process_resolvers(struct task *t, void *context, unsigne
                        LIST_ADDQ(&resolvers->resolutions.wait, &res->list);
                }
        }
-
-       /* now we can purge all queued deletions */
-       free_aborted_resolutions();
-
        dns_update_resolvers_timeout(resolvers);
        HA_SPIN_UNLOCK(DNS_LOCK, &resolvers->lock);
+
+       /* now we can purge all queued deletions */
+       leave_resolver_code();
        return t;
 }
 
@@ -2582,7 +2592,7 @@ static void dns_deinit(void)
        struct dns_requester  *req, *reqback;
        struct dns_srvrq      *srvrq, *srvrqback;
 
-       init_aborted_resolutions();
+       enter_resolver_code();
        list_for_each_entry_safe(resolvers, resolversback, &dns_resolvers, list) {
                list_for_each_entry_safe(ns, nsback, &resolvers->nameservers, list) {
                        free(ns->id);
@@ -2625,7 +2635,7 @@ static void dns_deinit(void)
                free(srvrq);
        }
 
-       free_aborted_resolutions();
+       leave_resolver_code();
 }
 
 /* Finalizes the DNS configuration by allocating required resources and checking
@@ -2638,7 +2648,7 @@ static int dns_finalize_config(void)
        struct proxy         *px;
        int err_code = 0;
 
-       init_aborted_resolutions();
+       enter_resolver_code();
 
        /* allocate pool of resolution per resolvers */
        list_for_each_entry(resolvers, &dns_resolvers, list) {
@@ -2753,10 +2763,10 @@ static int dns_finalize_config(void)
        if (err_code & (ERR_ALERT|ERR_ABORT))
                goto err;
 
-       free_aborted_resolutions();
+       leave_resolver_code();
        return err_code;
   err:
-       free_aborted_resolutions();
+       leave_resolver_code();
        dns_deinit();
        return err_code;
 
@@ -3046,7 +3056,7 @@ enum act_return dns_action_do_resolve(struct act_rule *rule, struct proxy *px,
 
        resolvers = rule->arg.dns.resolvers;
 
-       init_aborted_resolutions();
+       enter_resolver_code();
 
        /* we have a response to our DNS resolution */
  use_cache:
@@ -3130,7 +3140,7 @@ enum act_return dns_action_do_resolve(struct act_rule *rule, struct proxy *px,
        ret = ACT_RET_YIELD;
 
   end:
-       free_aborted_resolutions();
+       leave_resolver_code();
        if (locked)
                HA_SPIN_UNLOCK(DNS_LOCK, &resolvers->lock);
        return ret;