From dde1b4499a68741a59a77fe8072b76f0f50ec070 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 21 Oct 2021 14:33:38 +0200 Subject: [PATCH] OPTIM: dns: use an atomic check for the list membership The crash that was fixed by commit 7045590d8 ("BUG/MAJOR: dns: attempt to lock globaly for msg waiter list instead of use barrier") was now completely analysed and confirmed to be partially a result of the debugging code added to LIST_INLIST(), which was looking at both pointers and their reciprocals, and that, if used in a concurrent context, could perfectly return false if a neighbor was being added or removed while the current one didn't change, allowing the LIST_APPEND to fail. As the LIST API was not designed to be used in a concurrent context, we should not rely on LIST_INLIST() but on the newly introduced LIST_INLIST_ATOMIC(). This patch simply reverts the commit above to switch to the new test, saving a lock during potentially long operations. It was verified that the check doesn't fail anymore. It is unsure what the performance impact of the fix above could be in some contexts. If any performance regression is observed, it could make sense to backport this patch, along with the previous commit introducing the LIST_INLIST_ATOMIC() macro. --- src/dns.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/dns.c b/src/dns.c index 1f03dee79..818c4e772 100644 --- a/src/dns.c +++ b/src/dns.c @@ -175,9 +175,21 @@ ssize_t dns_recv_nameserver(struct dns_nameserver *ns, void *data, size_t size) ds->rx_msg.len = 0; + /* This barrier is here to ensure that all data is + * stored if the appctx detect the elem is out of the + * list. + */ + __ha_barrier_store(); + LIST_DEL_INIT(&ds->waiter); if (ds->appctx) { + /* This second barrier is here to ensure that + * the waked up appctx won't miss that the elem + * is removed from the list. + */ + __ha_barrier_store(); + /* awake appctx because it may have other * message to receive */ @@ -618,10 +630,8 @@ read: * delete from the list */ - /* lock the dns_stream_server containing lists heads */ - HA_SPIN_LOCK(DNS_LOCK, &ds->dss->lock); - - if (!LIST_INLIST(&ds->waiter)) { + __ha_barrier_load(); + if (!LIST_INLIST_ATOMIC(&ds->waiter)) { while (1) { uint16_t query_id; struct eb32_node *eb; @@ -700,9 +710,13 @@ read: * wait_sess list where the task processing * response will pop available responses */ + HA_SPIN_LOCK(DNS_LOCK, &ds->dss->lock); + BUG_ON(LIST_INLIST(&ds->waiter)); LIST_APPEND(&ds->dss->wait_sess, &ds->waiter); + HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock); + /* awake the task processing the responses */ task_wakeup(ds->dss->task_rsp, TASK_WOKEN_INIT); @@ -712,14 +726,12 @@ read: if (!LIST_INLIST(&ds->waiter)) { /* there is no more pending data to read and the con was closed by the server side */ if (!co_data(si_oc(si)) && (si_oc(si)->flags & CF_SHUTW)) { - HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock); goto close; } } } - HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock); return; close: si_shutw(si);