From b2194c95f9d02965b2ad3d0dcf333e74881347d0 Mon Sep 17 00:00:00 2001
From: James Chapman <jachapma@redhat.com>
Date: Thu, 31 Aug 2023 14:05:31 +0000
Subject: [PATCH] Issue 5909 - Multi listener hang with 20k connections (#5917)

Bug Description: A fix for the connection sub-table to freelist mapping results
in an uninitialised head of the sub-table linked list.

Fix Description: During connection table creation, initialise all elements but
skip the list head during the mapping phase.

Fixes: https://github.com/389ds/389-ds-base/issues/5909

Reviewed by: @progier389 @tbordaz (Thank you)
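In outline, the fix keeps initialising every slot of each connection sub-table but only maps the non-head slots (i != 0) into the shared freelist, so slot 0 keeps serving as the linked-list head and is never handed out. The stand-alone sketch below illustrates that mapping; the struct, the table dimensions and all names other than free_idx are simplified stand-ins, not the real conntable.c types:

#include <stdio.h>
#include <stddef.h>

#define LIST_NUM 4   /* number of sub-tables (hypothetical value) */
#define LIST_SIZE 8  /* slots per sub-table (hypothetical value) */

struct conn { int state; };                          /* stand-in for Connection */

static struct conn table[LIST_NUM][LIST_SIZE];       /* stand-in for ct->c */
static struct conn *freelist[LIST_NUM * LIST_SIZE];  /* stand-in for ct->c_freelist */

int main(void)
{
    size_t free_idx = 0;
    for (size_t list = 0; list < LIST_NUM; list++) {
        for (size_t i = 0; i < LIST_SIZE; i++) {
            table[list][i].state = 1;                /* every slot gets initialised */
            if (i != 0) {
                /* slot 0 stays the linked-list head of its sub-table and is
                 * never handed out, so only slots 1..LIST_SIZE-1 are mapped */
                freelist[free_idx++] = &table[list][i];
            }
        }
    }
    printf("%zu free slots mapped (out of %d)\n", free_idx, LIST_NUM * LIST_SIZE);
    return 0;
}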
---
 ldap/servers/slapd/conntable.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/ldap/servers/slapd/conntable.c b/ldap/servers/slapd/conntable.c
index b1c66cf42..11e997432 100644
--- a/ldap/servers/slapd/conntable.c
+++ b/ldap/servers/slapd/conntable.c
@@ -172,9 +172,9 @@ connection_table_new(int table_size)
             /* all connections start out invalid */
             ct->fd[ct_list][i].fd = SLAPD_INVALID_SOCKET;
 
-            /* The connection table has a double linked list running through it.
+            /* The connection sub-tables have a double linked list running through them.
              * This is used to find out which connections should be looked at
-             * in the poll loop. Slot 0 in the table is always the head of
+             * in the poll loop. Slot 0 in each sub-table is always the head of
              * the linked list. Each slot has a c_next and c_prev which are
              * pointers back into the array of connection slots. */
             ct->c[ct_list][i].c_next = NULL;
@@ -196,8 +196,10 @@ connection_table_new(int table_size)
             /* Ready to rock, mark as such. */
             ct->c[ct_list][i].c_state = CONN_STATE_INIT;
 
-            /* Map multiple ct lists to a single freelist. */
-            ct->c_freelist[free_idx++] = &(ct->c[ct_list][i]);
+            /* Map multiple ct lists to a single freelist, but skip slot 0 of each list. */
+            if (i != 0) {
+                ct->c_freelist[free_idx++] = &(ct->c[ct_list][i]);
+            }
         }
     }
 
--
2.41.0
From 26716f0edefe4690d9fd9143d07156e01321020c Mon Sep 17 00:00:00 2001
From: progier389 <progier@redhat.com>
Date: Thu, 28 Sep 2023 12:15:25 +0200
Subject: [PATCH] issue 5924 - ASAN server build crash when looping
 opening/closing connections (#5926)

* issue 5924 - ASAN server build crash when looping opening/closing connections

Issue: Got a crash due to:
1. Failure to get a connection slot because the connection freelist is mishandled.
2. Confusion between the listening and accepted file descriptors led to
closing the listening descriptor while handling the error.

Solution:
Clearly rename the file descriptor variables.
Close the accepted file descriptor in the error handler.
Rewrite the freelist management so that the first connection chosen is the last one released.
(The code is simpler, this fixes the end-of-list issue, and it avoids spreading the open connections over too much memory.)

Issue: #5924

Reviewed by: @Firstyear, @vashirov, @droideck (Thanks!)

(cherry picked from commit 02d333251419ff3c4d0384595e9fe7ded5bcd8fc)
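In other words, the freelist stops being a ring buffer driven by two offsets (conn_next and conn_free) and becomes a NULL-terminated stack driven by a single conn_next offset: allocation pops the pointer stored at conn_next, release pushes the connection back just below it, so the next allocation reuses the most recently freed slot. The stand-alone sketch below illustrates that discipline with made-up sizes and a plain struct instead of the real Connection type; the table lock that protects these operations in conntable.c is omitted:

#include <stdio.h>
#include <stddef.h>

#define TABLE_SIZE 4                    /* hypothetical; the real size comes from the config */

struct conn { int id; };

static struct conn conns[TABLE_SIZE];
/* one slot per connection plus a final NULL terminator, as in the patch */
static struct conn *freelist[TABLE_SIZE + 1];
static size_t conn_next;                /* stack pointer, starts at 0 */

static struct conn *conn_alloc(void)
{
    struct conn *c = freelist[conn_next];
    if (c == NULL) {
        return NULL;                    /* table full: the NULL terminator was reached */
    }
    freelist[conn_next++] = NULL;       /* pop: clear the slot, move the pointer up */
    return c;
}

static void conn_release(struct conn *c)
{
    conn_next--;                        /* push: the next alloc reuses this one (LIFO) */
    freelist[conn_next] = c;
}

int main(void)
{
    for (size_t i = 0; i < TABLE_SIZE; i++) {
        conns[i].id = (int)i;
        freelist[i] = &conns[i];
    }
    freelist[TABLE_SIZE] = NULL;

    struct conn *a = conn_alloc();
    struct conn *b = conn_alloc();
    conn_release(a);
    printf("reused last released: %s\n", conn_alloc() == a ? "yes" : "no");
    (void)b;
    return 0;
}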
---
 dirsrvtests/tests/suites/basic/basic_test.py | 287 +++++++++++++++++++
 ldap/servers/slapd/conntable.c | 100 +++----
 ldap/servers/slapd/daemon.c | 35 ++-
 ldap/servers/slapd/fe.h | 1 -
 4 files changed, 351 insertions(+), 72 deletions(-)

diff --git a/dirsrvtests/tests/suites/basic/basic_test.py b/dirsrvtests/tests/suites/basic/basic_test.py
|
||||
index fac0f7371..f4525e184 100644
|
||||
--- a/dirsrvtests/tests/suites/basic/basic_test.py
|
||||
+++ b/dirsrvtests/tests/suites/basic/basic_test.py
|
||||
@@ -28,7 +28,15 @@ from lib389.replica import Replicas, Changelog
|
||||
from lib389.backend import Backends
|
||||
from lib389.idm.domain import Domain
|
||||
from lib389.nss_ssl import NssSsl
|
||||
+from lib389._constants import *
|
||||
+from lib389 import DirSrv
|
||||
+from lib389.instance.setup import SetupDs
|
||||
+from lib389.instance.options import General2Base, Slapd2Base
|
||||
import os
|
||||
+import random
|
||||
+import ldap
|
||||
+import time
|
||||
+import subprocess
|
||||
|
||||
|
||||
pytestmark = pytest.mark.tier0
|
||||
@@ -36,6 +44,7 @@ pytestmark = pytest.mark.tier0
|
||||
default_paths = Paths()
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
+DEBUGGING = os.getenv("DEBUGGING", default=False)
|
||||
|
||||
# Globals
|
||||
USER1_DN = 'uid=user1,' + DEFAULT_SUFFIX
|
||||
@@ -51,6 +60,190 @@ ROOTDSE_DEF_ATTR_LIST = ('namingContexts',
|
||||
'vendorName',
|
||||
'vendorVersion')
|
||||
|
||||
+# This MAX_FDS value left about 22 connections available with bdb
|
||||
+# (should have more connections with lmdb)
|
||||
+MAX_FDS = 150
|
||||
+
|
||||
+
|
||||
+
|
||||
+default_paths = Paths()
|
||||
+
|
||||
+
|
||||
+
|
||||
+log = logging.getLogger(__name__)
|
||||
+DEBUGGING = os.getenv("DEBUGGING", default=False)
|
||||
+
|
||||
+
|
||||
+class CustomSetup():
|
||||
+ DEFAULT_GENERAL = { 'config_version': 2,
|
||||
+ 'full_machine_name': 'localhost.localdomain',
|
||||
+ 'strict_host_checking': False,
|
||||
+ # Not setting 'systemd' because it is not used.
|
||||
+ # (that is the global default.inf setting that matters)
|
||||
+ }
|
||||
+ DEFAULT_SLAPD = { 'root_password': PW_DM,
|
||||
+ 'defaults': INSTALL_LATEST_CONFIG,
|
||||
+ }
|
||||
+ DEFAULT_BACKENDS = [ {
|
||||
+ 'cn': 'userroot',
|
||||
+ 'nsslapd-suffix': DEFAULT_SUFFIX,
|
||||
+ 'sample_entries': 'yes',
|
||||
+ BACKEND_SAMPLE_ENTRIES: INSTALL_LATEST_CONFIG,
|
||||
+ }, ]
|
||||
+
|
||||
+ WRAPPER_FORMAT = '''#!/bin/sh
|
||||
+{wrapper_options}
|
||||
+exec {nsslapd} -D {cfgdir} -i {pidfile}
|
||||
+'''
|
||||
+
|
||||
+
|
||||
+ class CustomDirSrv(DirSrv):
|
||||
+ def __init__(self, verbose=False, external_log=log):
|
||||
+ super().__init__(verbose=verbose, external_log=external_log)
|
||||
+ self.wrapper = None # placeholder for the wrapper file name
|
||||
+
|
||||
+ def _reset_systemd(self):
|
||||
+ self.systemd_override = False
|
||||
+
|
||||
+ def status(self):
|
||||
+ self._reset_systemd()
|
||||
+ return super().status()
|
||||
+
|
||||
+ def start(self, timeout=120, *args):
|
||||
+ if self.status():
|
||||
+ return
|
||||
+ tmp_env = os.environ
|
||||
+ # Unset PYTHONPATH to avoid mixing old CLI tools and new lib389
|
||||
+ if "PYTHONPATH" in tmp_env:
|
||||
+ del tmp_env["PYTHONPATH"]
|
||||
+ try:
|
||||
+ subprocess.check_call([
|
||||
+ '/usr/bin/sh',
|
||||
+ self.wrapper
|
||||
+ ], env=tmp_env, stderr=subprocess.STDOUT)
|
||||
+ except subprocess.CalledProcessError as e:
|
||||
+ log.fatal("%s failed! Error (%s) %s" % (self.wrapper, e.returncode, e.output))
|
||||
+ raise e from None
|
||||
+ for count in range(timeout):
|
||||
+ if self.status():
|
||||
+ return
|
||||
+ time.sleep(1)
|
||||
+ raise TimeoutException('Failed to start ns-slpad')
|
||||
+
|
||||
+ def stop(self, timeout=120):
|
||||
+ self._reset_systemd()
|
||||
+ super().stop(timeout=timeout)
|
||||
+
|
||||
+
|
||||
+ def _search_be(belist, beinfo):
|
||||
+ for be in belist:
|
||||
+ if be['cn'] == beinfo['cn']:
|
||||
+ return be
|
||||
+ return None
|
||||
+
|
||||
+ def __init__(self, serverid, general=None, slapd=None, backends=None, log=log):
|
||||
+ verbose = log.level > logging.DEBUG
|
||||
+ self.log = log
|
||||
+ self.serverid = serverid
|
||||
+ self.verbose = verbose
|
||||
+ self.wrapper = f'/tmp/ds_{serverid}_wrapper.sh'
|
||||
+ if serverid.startswith('slapd-'):
|
||||
+ self.instname = server.id
|
||||
+ else:
|
||||
+ self.instname = 'slapd-'+serverid
|
||||
+ self.ldapi = None
|
||||
+ self.pid_file = None
|
||||
+ self.inst = None
|
||||
+
|
||||
+ # Lets prepare the options
|
||||
+ general_options = General2Base(log)
|
||||
+ for d in (CustomSetup.DEFAULT_GENERAL, general):
|
||||
+ if d:
|
||||
+ for key,val in d.items():
|
||||
+ general_options.set(key,val)
|
||||
+ log.debug('[general]: %s' % general_options._options)
|
||||
+ self.general = general_options
|
||||
+
|
||||
+ slapd_options = Slapd2Base(self.log)
|
||||
+ slapd_options.set('instance_name', serverid)
|
||||
+ for d in (CustomSetup.DEFAULT_SLAPD, slapd):
|
||||
+ if d:
|
||||
+ for key,val in d.items():
|
||||
+ slapd_options.set(key,val)
|
||||
+ log.debug('[slapd]: %s' % slapd_options._options)
|
||||
+ self.slapd = slapd_options
|
||||
+
|
||||
+ backend_options = []
|
||||
+ for backend_list in (CustomSetup.DEFAULT_BACKENDS, backends):
|
||||
+ if not backend_list:
|
||||
+ continue
|
||||
+ for backend in backend_list:
|
||||
+ target_be = CustomSetup._search_be(backend_options, backend)
|
||||
+ if not target_be:
|
||||
+ target_be = {}
|
||||
+ backend_options.append(target_be)
|
||||
+ for key,val in backend.items():
|
||||
+ target_be[key] = val
|
||||
+ log.debug('[backends]: %s' % backend_options)
|
||||
+ self.backends = backend_options
|
||||
+
|
||||
+ def _to_dirsrv_args(self):
|
||||
+ args = {}
|
||||
+ slapd = self.slapd.collect()
|
||||
+ general = self.general.collect()
|
||||
+ args["SER_HOST"] = general['full_machine_name']
|
||||
+ args["SER_PORT"] = slapd['port']
|
||||
+ args["SER_SECURE_PORT"] = slapd['secure_port']
|
||||
+ args["SER_SERVERID_PROP"] = self.serverid
|
||||
+ return args
|
||||
+
|
||||
+ def create_instance(self):
|
||||
+ sds = SetupDs(verbose=self.verbose, dryrun=False, log=self.log)
|
||||
+ self.general.verify()
|
||||
+ general = self.general.collect()
|
||||
+ self.slapd.verify()
|
||||
+ slapd = self.slapd.collect()
|
||||
+ sds.create_from_args(general, slapd, self.backends, None)
|
||||
+ self.ldapi = get_ldapurl_from_serverid(self.serverid)[0]
|
||||
+ args = self._to_dirsrv_args()
|
||||
+ log.debug('DirSrv.allocate args = %s' % str(args))
|
||||
+ log.debug('ldapi = %s' % str(self.ldapi))
|
||||
+ root_dn = slapd['root_dn']
|
||||
+ root_password = slapd['root_password']
|
||||
+ inst = DirSrv(verbose=self.verbose, external_log=self.log)
|
||||
+ inst.local_simple_allocate(self.serverid, ldapuri=self.ldapi, binddn=root_dn, password=root_password)
|
||||
+ self.pid_file = inst.pid_file()
|
||||
+ # inst.setup_ldapi()
|
||||
+ log.debug('DirSrv = %s' % str(inst.__dict__))
|
||||
+ inst.open()
|
||||
+ inst.stop()
|
||||
+ inst = CustomSetup.CustomDirSrv(verbose=self.verbose, external_log=self.log)
|
||||
+ inst.local_simple_allocate(self.serverid, ldapuri=self.ldapi, binddn=root_dn, password=root_password)
|
||||
+ self.inst = inst
|
||||
+ return inst
|
||||
+
|
||||
+ def create_wrapper(self, maxfds=None):
|
||||
+ self.inst.wrapper = self.wrapper
|
||||
+ slapd = self.slapd.collect()
|
||||
+ sbin_dir = slapd['sbin_dir']
|
||||
+ config_dir = slapd['config_dir']
|
||||
+ fmtvalues = {
|
||||
+ 'nsslapd': f'{sbin_dir}/ns-slapd',
|
||||
+ 'cfgdir': config_dir.format(instance_name=self.instname),
|
||||
+ 'pidfile': self.pid_file,
|
||||
+ 'wrapper_options': ''
|
||||
+ }
|
||||
+ if maxfds:
|
||||
+ fmtvalues['wrapper_options']=f'ulimit -n {maxfds}\nulimit -H -n {maxfds}'
|
||||
+ with open(self.wrapper, 'w') as f:
|
||||
+ f.write(CustomSetup.WRAPPER_FORMAT.format(**fmtvalues))
|
||||
+
|
||||
+ def cleanup(self):
|
||||
+ self.inst.stop()
|
||||
+ self.inst.delete()
|
||||
+ if os.path.exists(self.wrapper):
|
||||
+ os.remove(self.wrapper)
|
||||
+
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def _reset_attr(request, topology_st):
|
||||
@@ -2222,6 +2415,100 @@ def test_dscreate_with_different_rdn(dscreate_test_rdn_value):
|
||||
assert True
|
||||
|
||||
|
||||
+@pytest.fixture(scope="module")
|
||||
+def dscreate_custom_instance(request):
|
||||
+ topo = CustomSetup('custom')
|
||||
+
|
||||
+ def fin():
|
||||
+ topo.cleanup()
|
||||
+
|
||||
+ request.addfinalizer(fin)
|
||||
+ topo.create_instance()
|
||||
+ # Return CustomSetup object associated with
|
||||
+ # a stopped instance named "custom"
|
||||
+ return topo
|
||||
+
|
||||
+ obj.create_wrapper(maxfds=150)
|
||||
+ log.info("Starting wrapper")
|
||||
+ inst.start()
|
||||
+ log.info("Server is started.")
|
||||
+ log.info("Open connection")
|
||||
+ inst.open()
|
||||
+
|
||||
+
|
||||
+@pytest.fixture(scope="module", params=set(range(1,5)))
|
||||
+def dscreate_with_numlistener(request, dscreate_custom_instance):
|
||||
+ numlisteners = request.param
|
||||
+ dscreate_custom_instance.create_wrapper(maxfds=MAX_FDS)
|
||||
+ inst = dscreate_custom_instance.inst
|
||||
+ inst.stop()
|
||||
+ dse_ldif = DSEldif(inst)
|
||||
+ dse_ldif.replace('cn=config', 'nsslapd-numlisteners', str(numlisteners))
|
||||
+ inst.start()
|
||||
+ inst.open()
|
||||
+ return inst
|
||||
+
|
||||
+
|
||||
+@pytest.mark.skipif(ds_is_older('2.2.0.0'),
|
||||
+ reason="This test is only required with multiple listener support.")
|
||||
+def test_conn_limits(dscreate_with_numlistener):
|
||||
+ """Check the connections limits for various number of listeners.
|
||||
+
|
||||
+ :id: 7be2eb5c-4d8f-11ee-ae3d-482ae39447e5
|
||||
+ :parametrized: yes
|
||||
+ :setup: Setup an instance then set nsslapd-numlisteners and maximum file descriptors
|
||||
+ :steps:
|
||||
+ 1. Loops on:
|
||||
+ Open new connection and perform search until timeout expires
|
||||
+ 2. Close one of the previously open connections
|
||||
+ 3. Loops MAX_FDS times on:
|
||||
+ - opening a new connection
|
||||
+ - perform a search
|
||||
+ - close the connection
|
||||
+ 4. Close all open connections
|
||||
+ 5. Remove the instance
|
||||
+ :expectedresults:
|
||||
+ 1. Should get a timeout (because the server has no more any connections)
|
||||
+ 2. Should success
|
||||
+ 3. Should success (otherwise issue #5924 has probably been hit)
|
||||
+ 4. Should success
|
||||
+ 5. Should success
|
||||
+ """
|
||||
+ inst = dscreate_with_numlistener
|
||||
+
|
||||
+ conns = []
|
||||
+ timeout_occured = False
|
||||
+ for i in range(MAX_FDS):
|
||||
+ try:
|
||||
+ ldc = ldap.initialize(f'ldap://localhost:{inst.port}')
|
||||
+ ldc.set_option(ldap.OPT_TIMEOUT, 5)
|
||||
+ ldc.search_s(DEFAULT_SUFFIX, ldap.SCOPE_SUBTREE, "(uid=demo)")
|
||||
+ conns.append(ldc)
|
||||
+ except ldap.TIMEOUT:
|
||||
+ timeout_occured = True
|
||||
+ break
|
||||
+ # Should not be able to open MAX_FDS connections (some file descriptor are
|
||||
+ # reserved (for example for the listening socket )
|
||||
+ assert timeout_occured
|
||||
+
|
||||
+ conn = random.choice(conns)
|
||||
+ conn.unbind()
|
||||
+ conns.remove(conn)
|
||||
+
|
||||
+ # Should loop enough time so trigger issue #5924 if it is not fixed.
|
||||
+ for i in range(MAX_FDS):
|
||||
+ ldc = ldap.initialize(f'ldap://localhost:{inst.port}')
|
||||
+ # Set a timeout long enough so that the test fails if server is unresponsive
|
||||
+ ldc.set_option(ldap.OPT_TIMEOUT, 60)
|
||||
+ ldc.search_s(DEFAULT_SUFFIX, ldap.SCOPE_SUBTREE, "(uid=demo)")
|
||||
+ ldc.unbind()
|
||||
+
|
||||
+ # Close all open connections
|
||||
+ for c in conns:
|
||||
+ c.unbind()
|
||||
+
|
||||
+ # Step 6 is done in teardown phase by dscreate_instance finalizer
|
||||
+
|
||||
if __name__ == '__main__':
|
||||
# Run isolated
|
||||
# -s for DEBUG mode
|
||||
diff --git a/ldap/servers/slapd/conntable.c b/ldap/servers/slapd/conntable.c
|
||||
index 11e997432..1ca60e0e5 100644
|
||||
--- a/ldap/servers/slapd/conntable.c
|
||||
+++ b/ldap/servers/slapd/conntable.c
|
||||
@@ -48,69 +48,59 @@
|
||||
* under the connection table lock. This should move the allocation algorithm from O(n) worst case
|
||||
* to O(1) worst case as we always recieve and empty slot *or* ct full. It also reduces lock/atomic
|
||||
* contention on the CPU to improve things.
|
||||
+ * Lastly a change was done: instead of having a sliding window that tends to pick a never-used
+ * slot for each new connection, it now reuses the last freed one. That has several benefits:
+ * - Fixes a bug because the last freelist slots may not be allocated.
+ * - Avoids growing the memory footprint when there is not much load.
+ * - Simplifies the code (as only a single offset is needed).
|
||||
*
|
||||
- * The freelist is a ringbuffer of pointers to the connection table. On a small scale it looks like:
|
||||
+ * The freelist is a stack of pointers to the connection table.
|
||||
+ * It is NULL terminated. On a small scale it looks like:
|
||||
*
|
||||
* |--------------------------------------------|
|
||||
- * | slot 1 | slot 2 | slot 3 | slot 4 | slot 5 |
|
||||
- * | _ ptr | _ ptr | _ ptr | _ ptr | _ ptr |
|
||||
+ * | slot 0 | slot 1 | slot 2 | slot 3 | slot 4 |
|
||||
+ * | _ ptr | _ ptr | _ ptr | _ ptr | NULL |
|
||||
* |--------------------------------------------|
|
||||
- * ^ ^- conn_next
|
||||
- * |
|
||||
- * \-- conn_free
|
||||
+ * ^- conn_next
|
||||
*
|
||||
- * As we allocate, we shift conn_next through the list, yielding the ptr that was stored (and
|
||||
- * setting it to NULL as we proceed)
|
||||
+ * To allocate, we pop one of the stored connection ptr out of the stack (yield the ptr, set
|
||||
+ * its slot to NULL then increase conn_next)
|
||||
*
|
||||
* |--------------------------------------------|
|
||||
- * | slot 1 | slot 2 | slot 3 | slot 4 | slot 5 |
|
||||
- * | _NULL | _NULL | _ ptr | _ ptr | _ ptr |
|
||||
+ * | slot 0 | slot 1 | slot 2 | slot 3 | slot 4 |
|
||||
+ * | _NULL | _NULL | _ ptr | _ ptr | NULL |
|
||||
* |--------------------------------------------|
|
||||
- * ^ ^- conn_next
|
||||
- * |
|
||||
- * \-- conn_free
|
||||
+ * ^- conn_next
|
||||
*
|
||||
- * When a connection is "freed" we return it to conn_free, which is then also slid up.
|
||||
+ * When a connection is "freed" we push it back in the stack after decreasing conn_next
|
||||
*
|
||||
* |--------------------------------------------|
|
||||
- * | slot 1 | slot 2 | slot 3 | slot 4 | slot 5 |
|
||||
- * | _ ptr | _NULL | _ ptr | _ ptr | _ ptr |
|
||||
+ * | slot 0 | slot 1 | slot 2 | slot 3 | slot 4 |
|
||||
+ * | _NULL | _ ptr | _ ptr | _ ptr | NULL |
|
||||
* |--------------------------------------------|
|
||||
- * ^ ^- conn_next
|
||||
- * |
|
||||
- * \-- conn_free
|
||||
+ * ^- conn_next
|
||||
*
|
||||
- * If all connections are exhausted, conn_next will == conn_next, as conn_next must have proceeded
|
||||
- * to the end of the ring, and then wrapped back allocating all previous until we meet with conn_free.
|
||||
+ * If all connections are exhausted, freelist[conn_next] is NULL
|
||||
*
|
||||
* |--------------------------------------------|
|
||||
- * | slot 1 | slot 2 | slot 3 | slot 4 | slot 5 |
|
||||
+ * | slot 0 | slot 1 | slot 2 | slot 3 | slot 4 |
|
||||
* | _NULL | _NULL | _NULL | _NULL | _NULL |
|
||||
* |--------------------------------------------|
|
||||
- * ^ ^- conn_next
|
||||
- * |
|
||||
- * \-- conn_free
|
||||
+ * ^- conn_next
|
||||
*
|
||||
* This means allocations of conns will keep failing until a connection is returned.
|
||||
*
|
||||
* |--------------------------------------------|
|
||||
- * | slot 1 | slot 2 | slot 3 | slot 4 | slot 5 |
|
||||
- * | _NULL | _ ptr | _NULL | _NULL | _NULL |
|
||||
+ * | slot 0 | slot 1 | slot 2 | slot 3 | slot 4 |
|
||||
+ * | _NULL | _NULL | _NULL | _ ptr | NULL |
|
||||
* |--------------------------------------------|
|
||||
- * ^- conn_next ^
|
||||
- * |
|
||||
- * \-- conn_free
|
||||
+ * ^- conn_next
|
||||
*
|
||||
* And now conn_next can begin to allocate again.
|
||||
*
|
||||
*
|
||||
* -- invariants
|
||||
- * * when conn_free is slid back to meet conn_next, there can be no situation where another
|
||||
- * connection is returned, as none must allocated -if they were allocated, conn_free would have
|
||||
- * moved_along.
|
||||
- * * the ring buffer must be as large as conntable.
|
||||
- * * We do not check conn_next == conn_free (that's the starting state), but we check if the
|
||||
- * slot at conn_next is NULL, which must imply that conn_free has nothing to return.
|
||||
+ * * the stack must be as large as conntable.
|
||||
* * connection_table_move_connection_out_of_active_list is the only function able to return a
|
||||
* connection to the freelist, as it is the function that is called when the event system has
|
||||
* determined all IO's are complete, or unable to complete. This function is what prepares the
|
||||
@@ -136,11 +126,9 @@ connection_table_new(int table_size)
|
||||
ct->c = (Connection **)slapi_ch_calloc(1, ct->size * sizeof(Connection *));
|
||||
ct->fd = (struct POLL_STRUCT **)slapi_ch_calloc(1, ct->list_num * sizeof(struct POLL_STRUCT*));
|
||||
ct->table_mutex = PR_NewLock();
|
||||
- /* Allocate the freelist */
|
||||
- ct->c_freelist = (Connection **)slapi_ch_calloc(1, ct->size * sizeof(Connection *));
|
||||
- /* NEVER use slot 0, this is a list pointer */
|
||||
- ct->conn_next_offset = 1;
|
||||
- ct->conn_free_offset = 1;
|
||||
+ /* Allocate the freelist (a slot for each connection plus another slot for the final NULL pointer) */
|
||||
+ ct->c_freelist = (Connection **)slapi_ch_calloc(1, (ct->size+1) * sizeof(Connection *));
|
||||
+ ct->conn_next_offset = 0;
|
||||
|
||||
slapi_log_err(SLAPI_LOG_INFO, "connection_table_new", "Number of connection sub-tables %d, each containing %d slots.\n",
|
||||
ct->list_num, ct->list_size);
|
||||
@@ -273,22 +261,22 @@ connection_table_get_connection(Connection_Table *ct, int sd)
|
||||
{
|
||||
PR_Lock(ct->table_mutex);
|
||||
|
||||
- PR_ASSERT(ct->conn_next_offset != 0);
|
||||
Connection *c = ct->c_freelist[ct->conn_next_offset];
|
||||
if (c != NULL) {
|
||||
/* We allocated it, so now NULL the slot and move forward. */
|
||||
- ct->c_freelist[ct->conn_next_offset] = NULL;
|
||||
- /* Handle overflow. */
|
||||
- ct->conn_next_offset = (ct->conn_next_offset + 1) % ct->size;
|
||||
- if (ct->conn_next_offset == 0) {
|
||||
- /* Never use slot 0 */
|
||||
- ct->conn_next_offset += 1;
|
||||
- }
|
||||
+ PR_ASSERT(ct->conn_next_offset>=0 && ct->conn_next_offset<ct->size);
|
||||
+ ct->c_freelist[ct->conn_next_offset++] = NULL;
|
||||
PR_Unlock(ct->table_mutex);
|
||||
} else {
|
||||
/* couldn't find a Connection, table must be full */
|
||||
- slapi_log_err(SLAPI_LOG_CONNS, "connection_table_get_connection", "Max open connections reached\n");
|
||||
PR_Unlock(ct->table_mutex);
|
||||
+ static time_t last_err_msg_time = 0;
|
||||
+ time_t curtime = slapi_current_utc_time();
|
||||
+ /* Logs the message only once per seconds */
|
||||
+ if (curtime != last_err_msg_time) {
|
||||
+ slapi_log_err(SLAPI_LOG_ERR, "connection_table_get_connection", "Max open connections reached\n");
|
||||
+ last_err_msg_time = curtime;
|
||||
+ }
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -461,14 +449,10 @@ connection_table_move_connection_out_of_active_list(Connection_Table *ct, Connec
|
||||
|
||||
/* Finally, place the connection back into the freelist for use */
|
||||
PR_ASSERT(c->c_refcnt == 0);
|
||||
- PR_ASSERT(ct->conn_free_offset != 0);
|
||||
- PR_ASSERT(ct->c_freelist[ct->conn_free_offset] == NULL);
|
||||
- ct->c_freelist[ct->conn_free_offset] = c;
|
||||
- ct->conn_free_offset = (ct->conn_free_offset + 1) % ct->size;
|
||||
- if (ct->conn_free_offset == 0) {
|
||||
- /* Never use slot 0 */
|
||||
- ct->conn_free_offset += 1;
|
||||
- }
|
||||
+ PR_ASSERT(ct->conn_next_offset != 0);
|
||||
+ ct->conn_next_offset--;
|
||||
+ PR_ASSERT(ct->c_freelist[ct->conn_next_offset] == NULL);
|
||||
+ ct->c_freelist[ct->conn_next_offset] = c;
|
||||
|
||||
PR_Unlock(ct->table_mutex);
|
||||
|
||||
diff --git a/ldap/servers/slapd/daemon.c b/ldap/servers/slapd/daemon.c
|
||||
index e8b979aca..10aabed6d 100644
|
||||
--- a/ldap/servers/slapd/daemon.c
|
||||
+++ b/ldap/servers/slapd/daemon.c
|
||||
@@ -135,19 +135,25 @@ get_pid_file(void)
|
||||
}
|
||||
|
||||
static int
|
||||
-accept_and_configure(int s __attribute__((unused)), PRFileDesc *pr_acceptfd, PRNetAddr *pr_netaddr, int addrlen __attribute__((unused)), int secure, int local, PRFileDesc **pr_clonefd)
|
||||
+accept_and_configure(int s __attribute__((unused)), PRFileDesc *listenfd, PRNetAddr *pr_netaddr, int addrlen __attribute__((unused)), int secure, int local, PRFileDesc **pr_accepted_fd)
|
||||
{
|
||||
int ns = 0;
|
||||
PRIntervalTime pr_timeout = PR_MillisecondsToInterval(slapd_accept_wakeup_timer);
|
||||
|
||||
- (*pr_clonefd) = PR_Accept(pr_acceptfd, pr_netaddr, pr_timeout);
|
||||
- if (!(*pr_clonefd)) {
|
||||
+ (*pr_accepted_fd) = PR_Accept(listenfd, pr_netaddr, pr_timeout);
|
||||
+ if (!(*pr_accepted_fd)) {
|
||||
PRErrorCode prerr = PR_GetError();
|
||||
- slapi_log_err(SLAPI_LOG_ERR, "accept_and_configure", "PR_Accept() failed, " SLAPI_COMPONENT_NAME_NSPR " error %d (%s)\n",
|
||||
- prerr, slapd_pr_strerror(prerr));
|
||||
+ static time_t last_err_msg_time = 0;
|
||||
+ time_t curtime = slapi_current_utc_time();
|
||||
+ /* Logs the message only once per seconds */
|
||||
+ if (curtime != last_err_msg_time) {
|
||||
+ slapi_log_err(SLAPI_LOG_ERR, "accept_and_configure", "PR_Accept() failed, " SLAPI_COMPONENT_NAME_NSPR " error %d (%s)\n",
|
||||
+ prerr, slapd_pr_strerror(prerr));
|
||||
+ last_err_msg_time = curtime;
|
||||
+ }
|
||||
return (SLAPD_INVALID_SOCKET);
|
||||
}
|
||||
- ns = configure_pr_socket(pr_clonefd, secure, local);
|
||||
+ ns = configure_pr_socket(pr_accepted_fd, secure, local);
|
||||
|
||||
return ns;
|
||||
}
|
||||
@@ -155,7 +161,7 @@ accept_and_configure(int s __attribute__((unused)), PRFileDesc *pr_acceptfd, PRN
|
||||
/*
|
||||
* This is the shiny new re-born daemon function, without all the hair
|
||||
*/
|
||||
-static int handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *pr_acceptfd, int secure, int local, Connection **newconn);
|
||||
+static int handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *listenfd, int secure, int local, Connection **newconn);
|
||||
static void handle_pr_read_ready(Connection_Table *ct, int list_id, PRIntn num_poll);
|
||||
static int clear_signal(struct POLL_STRUCT *fds, int list_id);
|
||||
static void unfurl_banners(Connection_Table *ct, daemon_ports_t *ports, PRFileDesc **n_tcps, PRFileDesc **s_tcps, PRFileDesc **i_unix);
|
||||
@@ -831,6 +837,7 @@ accept_thread(void *vports)
|
||||
}
|
||||
/* Need a sleep delay here. */
|
||||
PR_Sleep(pr_timeout);
|
||||
+ last_accept_new_connections = accept_new_connections;
|
||||
continue;
|
||||
} else {
|
||||
/* Log that we are now listening again */
|
||||
@@ -1846,28 +1853,30 @@ handle_closed_connection(Connection *conn)
|
||||
* this function returns the connection table list the new connection is in
|
||||
*/
|
||||
static int
|
||||
-handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *pr_acceptfd, int secure, int local, Connection **newconn)
|
||||
+handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *listenfd, int secure, int local, Connection **newconn)
|
||||
{
|
||||
int ns = 0;
|
||||
Connection *conn = NULL;
|
||||
/* struct sockaddr_in from;*/
|
||||
PRNetAddr from = {{0}};
|
||||
- PRFileDesc *pr_clonefd = NULL;
|
||||
+ PRFileDesc *pr_accepted_fd = NULL;
|
||||
slapdFrontendConfig_t *fecfg = getFrontendConfig();
|
||||
ber_len_t maxbersize;
|
||||
|
||||
if (newconn) {
|
||||
*newconn = NULL;
|
||||
}
|
||||
- if ((ns = accept_and_configure(tcps, pr_acceptfd, &from,
|
||||
- sizeof(from), secure, local, &pr_clonefd)) == SLAPD_INVALID_SOCKET) {
|
||||
+ if ((ns = accept_and_configure(tcps, listenfd, &from,
|
||||
+ sizeof(from), secure, local, &pr_accepted_fd)) == SLAPD_INVALID_SOCKET) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* get a new Connection from the Connection Table */
|
||||
conn = connection_table_get_connection(ct, ns);
|
||||
if (conn == NULL) {
|
||||
- PR_Close(pr_acceptfd);
|
||||
+ if (pr_accepted_fd) {
|
||||
+ PR_Close(pr_accepted_fd);
|
||||
+ }
|
||||
return -1;
|
||||
}
|
||||
pthread_mutex_lock(&(conn->c_mutex));
|
||||
@@ -1879,7 +1888,7 @@ handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *pr_acceptfd, i
|
||||
conn->c_idletimeout = fecfg->idletimeout;
|
||||
conn->c_idletimeout_handle = idletimeout_reslimit_handle;
|
||||
conn->c_sd = ns;
|
||||
- conn->c_prfd = pr_clonefd;
|
||||
+ conn->c_prfd = pr_accepted_fd;
|
||||
conn->c_flags &= ~CONN_FLAG_CLOSING;
|
||||
|
||||
/* Set per connection static config */
|
||||
diff --git a/ldap/servers/slapd/fe.h b/ldap/servers/slapd/fe.h
|
||||
index 64c6645e7..95dfeeb89 100644
|
||||
--- a/ldap/servers/slapd/fe.h
|
||||
+++ b/ldap/servers/slapd/fe.h
|
||||
@@ -88,7 +88,6 @@ struct connection_table
|
||||
/* An array of free connections awaiting allocation. */;
|
||||
Connection **c_freelist;
|
||||
size_t conn_next_offset;
|
||||
- size_t conn_free_offset;
|
||||
struct POLL_STRUCT **fd;
|
||||
PRLock *table_mutex;
|
||||
};
|
||||
--
|
||||
2.41.0
From dc091325814d9ac74d238c7e14cac8b9112b3271 Mon Sep 17 00:00:00 2001
From: progier389 <progier@redhat.com>
Date: Fri, 17 Nov 2023 14:41:51 +0100
Subject: [PATCH] Issue 5984 - Crash when paged result search are abandoned
 (#5985)

* Issue 5984 - Crash when paged result search are abandoned

Problem:
Fix #4551 changed the lock that protects the paged result data
within a connection, but the abandon operation still attempts to free
the paged search result while holding the connection lock.
This leads to a race condition and a double free, causing heap
corruption and a SIGSEGV.

Solution:
- Get a copy of the operation data that needs to be logged.
- Unlock the connection mutex (to avoid any deadlock risk).
- Free the paged result while holding the paged result lock.

Issue: 5984

Reviewed by: @tbordaz (Thanks!)

(cherry picked from commit 06bd0862956672eb76276cab5c1dd906fe5a7eec)
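The ordering described in the Solution above is: copy the fields that will be logged while the connection mutex is held, release it, and only then take the paged-result lock to free the paged result, so the two locks are never nested in the wrong order. The stand-alone sketch below illustrates that ordering with generic pthread mutexes and made-up names (conn_mutex, paged_mutex, current_op); it is not the slapd code itself, which is in the abandon.c hunk further down:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical, minimal stand-ins: conn_mutex plays the role of c_mutex,
 * paged_mutex the role of the per-connection paged-result lock. */
static pthread_mutex_t conn_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t paged_mutex = PTHREAD_MUTEX_INITIALIZER;

struct op { int opid; int nentries; };
static struct op *current_op;    /* may vanish once conn_mutex is released */
static char *paged_result;       /* stand-in for the paged-result state */

static void abandon(void)
{
    struct op copy = {0};

    /* 1. Copy what will be logged while the connection lock is held. */
    pthread_mutex_lock(&conn_mutex);
    if (current_op != NULL) {
        copy = *current_op;
    }
    pthread_mutex_unlock(&conn_mutex);   /* 2. Drop it before touching the other lock. */

    /* 3. Free the paged result under the paged-result lock only,
     *    never while conn_mutex is held (avoids the lock-order deadlock). */
    pthread_mutex_lock(&paged_mutex);
    free(paged_result);
    paged_result = NULL;
    pthread_mutex_unlock(&paged_mutex);

    printf("ABANDON targetop=%d nentries=%d\n", copy.opid, copy.nentries);
}

int main(void)
{
    static struct op op = {42, 7};
    current_op = &op;
    paged_result = strdup("cookie");
    abandon();
    return 0;
}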
---
 .../paged_results/paged_results_test.py | 107 ++++++++++++++++--
 ldap/servers/slapd/abandon.c | 23 ++--
 ldap/servers/slapd/opshared.c | 4 +-
 ldap/servers/slapd/pagedresults.c | 8 +-
 ldap/servers/slapd/proto-slap.h | 2 +-
 src/lib389/lib389/__init__.py | 27 ++++-
 6 files changed, 150 insertions(+), 21 deletions(-)

diff --git a/dirsrvtests/tests/suites/paged_results/paged_results_test.py b/dirsrvtests/tests/suites/paged_results/paged_results_test.py
|
||||
index d490c4af2..cdafa834a 100644
|
||||
--- a/dirsrvtests/tests/suites/paged_results/paged_results_test.py
|
||||
+++ b/dirsrvtests/tests/suites/paged_results/paged_results_test.py
|
||||
@@ -7,7 +7,8 @@
|
||||
# --- END COPYRIGHT BLOCK ---
|
||||
#
|
||||
import socket
|
||||
-from random import sample
|
||||
+from random import sample, randrange
|
||||
+
|
||||
import pytest
|
||||
from ldap.controls import SimplePagedResultsControl, GetEffectiveRightsControl
|
||||
from lib389.tasks import *
|
||||
@@ -16,6 +17,10 @@ from lib389.topologies import topology_st
|
||||
from lib389._constants import DN_LDBM, DN_DM, DEFAULT_SUFFIX
|
||||
from lib389._controls import SSSRequestControl
|
||||
from lib389.idm.user import UserAccount, UserAccounts
|
||||
+from lib389.cli_base import FakeArgs
|
||||
+from lib389.config import LDBMConfig
|
||||
+from lib389.dbgen import dbgen_users
|
||||
+
|
||||
from lib389.idm.organization import Organization
|
||||
from lib389.idm.organizationalunit import OrganizationalUnit
|
||||
from lib389.backend import Backends
|
||||
@@ -42,11 +47,56 @@ NEW_BACKEND_1 = 'parent_base'
|
||||
NEW_BACKEND_2 = 'child_base'
|
||||
|
||||
OLD_HOSTNAME = socket.gethostname()
|
||||
-socket.sethostname('localhost')
|
||||
+if os.getuid() == 0:
|
||||
+ socket.sethostname('localhost')
|
||||
HOSTNAME = socket.gethostname()
|
||||
IP_ADDRESS = socket.gethostbyname(HOSTNAME)
|
||||
OLD_IP_ADDRESS = socket.gethostbyname(OLD_HOSTNAME)
|
||||
|
||||
+
|
||||
+@pytest.fixture(scope="module")
|
||||
+def create_40k_users(topology_st, request):
|
||||
+ inst = topology_st.standalone
|
||||
+
|
||||
+ # Prepare return value
|
||||
+ retval = FakeArgs()
|
||||
+ retval.inst = inst
|
||||
+ retval.bename = '40k'
|
||||
+ retval.suffix = f'o={retval.bename}'
|
||||
+ retval.ldif_file = f'{inst.get_ldif_dir()}/{retval.bename}.ldif'
|
||||
+
|
||||
+ # Create new backend
|
||||
+ bes = Backends(inst)
|
||||
+ be_1 = bes.create(properties={
|
||||
+ 'cn': retval.bename,
|
||||
+ 'nsslapd-suffix': retval.suffix,
|
||||
+ })
|
||||
+
|
||||
+ # Set paged search lookthrough limit
|
||||
+ ldbmconfig = LDBMConfig(inst)
|
||||
+ ldbmconfig.replace('nsslapd-pagedlookthroughlimit', b'100000')
|
||||
+
|
||||
+ # Create ldif and import it.
|
||||
+ dbgen_users(inst, 40000, retval.ldif_file, retval.suffix)
|
||||
+ # tasks = Tasks(inst)
|
||||
+ # args = {TASK_WAIT: True}
|
||||
+ # tasks.importLDIF(retval.suffix, None, retval.ldif_file, args)
|
||||
+ inst.stop()
|
||||
+ assert inst.ldif2db(retval.bename, None, None, None, retval.ldif_file, None)
|
||||
+ inst.start()
|
||||
+
|
||||
+ # And set an aci allowing anonymous read
|
||||
+ log.info('Adding ACI to allow our test user to search')
|
||||
+ ACI_TARGET = '(targetattr != "userPassword || aci")'
|
||||
+ ACI_ALLOW = '(version 3.0; acl "Enable anonymous access";allow (read, search, compare)'
|
||||
+ ACI_SUBJECT = '(userdn = "ldap:///anyone");)'
|
||||
+ ACI_BODY = ACI_TARGET + ACI_ALLOW + ACI_SUBJECT
|
||||
+ o_1 = Organization(inst, retval.suffix)
|
||||
+ o_1.set('aci', ACI_BODY)
|
||||
+
|
||||
+ return retval
|
||||
+
|
||||
+
|
||||
@pytest.fixture(scope="module")
|
||||
def create_user(topology_st, request):
|
||||
"""User for binding operation"""
|
||||
@@ -71,8 +121,10 @@ def create_user(topology_st, request):
|
||||
|
||||
def fin():
|
||||
log.info('Deleting user simplepaged_test')
|
||||
- user.delete()
|
||||
- socket.sethostname(OLD_HOSTNAME)
|
||||
+ if not DEBUGGING:
|
||||
+ user.delete()
|
||||
+ if os.getuid() == 0:
|
||||
+ socket.sethostname(OLD_HOSTNAME)
|
||||
|
||||
request.addfinalizer(fin)
|
||||
|
||||
@@ -175,7 +227,7 @@ def change_conf_attr(topology_st, suffix, attr_name, attr_value):
|
||||
return attr_value_bck
|
||||
|
||||
|
||||
-def paged_search(conn, suffix, controls, search_flt, searchreq_attrlist):
|
||||
+def paged_search(conn, suffix, controls, search_flt, searchreq_attrlist, abandon_rate=0):
|
||||
"""Search at the DEFAULT_SUFFIX with ldap.SCOPE_SUBTREE
|
||||
using Simple Paged Control(should the first item in the
|
||||
list controls.
|
||||
@@ -195,9 +247,16 @@ def paged_search(conn, suffix, controls, search_flt, searchreq_attrlist):
|
||||
req_pr_ctrl.size,
|
||||
str(controls)))
|
||||
msgid = conn.search_ext(suffix, ldap.SCOPE_SUBTREE, search_flt, searchreq_attrlist, serverctrls=controls)
|
||||
+ log.info('Getting page %d' % (pages,))
|
||||
while True:
|
||||
- log.info('Getting page %d' % (pages,))
|
||||
- rtype, rdata, rmsgid, rctrls = conn.result3(msgid)
|
||||
+ try:
|
||||
+ rtype, rdata, rmsgid, rctrls = conn.result3(msgid, timeout=0.001)
|
||||
+ except ldap.TIMEOUT:
|
||||
+ if pages > 0 and abandon_rate>0 and randrange(100)<abandon_rate:
|
||||
+ conn.abandon(msgid)
|
||||
+ log.info('Paged result search is abandonned.')
|
||||
+ return all_results
|
||||
+ continue
|
||||
log.debug('Data: {}'.format(rdata))
|
||||
all_results.extend(rdata)
|
||||
pages += 1
|
||||
@@ -217,6 +276,7 @@ def paged_search(conn, suffix, controls, search_flt, searchreq_attrlist):
|
||||
break # No more pages available
|
||||
else:
|
||||
break
|
||||
+ log.info('Getting page %d' % (pages,))
|
||||
|
||||
assert not pctrls[0].cookie
|
||||
return all_results
|
||||
@@ -1191,6 +1251,39 @@ def test_maxsimplepaged_per_conn_failure(topology_st, create_user, conf_attr_val
|
||||
del_users(users_list)
|
||||
change_conf_attr(topology_st, DN_CONFIG, 'nsslapd-maxsimplepaged-per-conn', max_per_con_bck)
|
||||
|
||||
+
|
||||
+def test_search_stress_abandon(create_40k_users, create_user):
|
||||
+ """Verify that search with a simple paged results control
|
||||
+ returns all entries it should without errors.
|
||||
+
|
||||
+ :id: e154b24a-83d6-11ee-90d1-482ae39447e5
|
||||
+ :customerscenario: True
|
||||
+ :feature: Simple paged results
|
||||
+ :setup: Standalone instance, test user for binding,
|
||||
+ 40K users in a second backend
|
||||
+ :steps:
|
||||
+ 1. Bind as test user
|
||||
+ 2. Loops a number of times doing:
|
||||
+ - search through added users with a simple paged control
|
||||
+ - randomly abandoning the search after a few ms.
|
||||
+ :expectedresults:
|
||||
+ 1. Bind should be successful
|
||||
+ 2. The loop should complete successfully.
|
||||
+ """
|
||||
+
|
||||
+ abandon_rate = 10
|
||||
+ page_size = 500
|
||||
+ nbloops = 1000
|
||||
+ search_flt = r'(uid=*)'
|
||||
+ searchreq_attrlist = ['dn', 'sn']
|
||||
+ log.info('Set user bind %s ' % create_user)
|
||||
+ conn = create_user.bind(TEST_USER_PWD)
|
||||
+ for idx in range(nbloops):
|
||||
+ req_ctrl = SimplePagedResultsControl(True, size=page_size, cookie='')
|
||||
+ # If the issue #5984 is not fixed the server crashs and the paged search fails with ldap.SERVER_DOWN exception
|
||||
+ paged_search(conn, create_40k_users.suffix, [req_ctrl], search_flt, searchreq_attrlist, abandon_rate=abandon_rate)
|
||||
+
|
||||
+
|
||||
if __name__ == '__main__':
|
||||
# Run isolated
|
||||
# -s for DEBUG mode
|
||||
diff --git a/ldap/servers/slapd/abandon.c b/ldap/servers/slapd/abandon.c
|
||||
index 26a2e7bf8..964d28836 100644
|
||||
--- a/ldap/servers/slapd/abandon.c
|
||||
+++ b/ldap/servers/slapd/abandon.c
|
||||
@@ -38,6 +38,12 @@ do_abandon(Slapi_PBlock *pb)
|
||||
Connection *pb_conn = NULL;
|
||||
Operation *pb_op = NULL;
|
||||
Operation *o;
|
||||
+ /* Keep a copy of some data because o may vanish once conn is unlocked */
|
||||
+ struct {
|
||||
+ struct timespec hr_time_end;
|
||||
+ int nentries;
|
||||
+ int opid;
|
||||
+ } o_copy;
|
||||
|
||||
slapi_pblock_get(pb, SLAPI_OPERATION, &pb_op);
|
||||
slapi_pblock_get(pb, SLAPI_CONNECTION, &pb_conn);
|
||||
@@ -90,8 +96,12 @@ do_abandon(Slapi_PBlock *pb)
|
||||
|
||||
pthread_mutex_lock(&(pb_conn->c_mutex));
|
||||
for (o = pb_conn->c_ops; o != NULL; o = o->o_next) {
|
||||
- if (o->o_msgid == id && o != pb_op)
|
||||
+ if (o->o_msgid == id && o != pb_op) {
|
||||
+ slapi_operation_time_elapsed(o, &o_copy.hr_time_end);
|
||||
+ o_copy.nentries = o->o_results.r.r_search.nentries;
|
||||
+ o_copy.opid = o->o_opid;
|
||||
break;
|
||||
+ }
|
||||
}
|
||||
|
||||
if (o != NULL) {
|
||||
@@ -130,7 +140,8 @@ do_abandon(Slapi_PBlock *pb)
|
||||
slapi_log_err(SLAPI_LOG_TRACE, "do_abandon", "op not found\n");
|
||||
}
|
||||
|
||||
- if (0 == pagedresults_free_one_msgid_nolock(pb_conn, id)) {
|
||||
+ pthread_mutex_unlock(&(pb_conn->c_mutex));
|
||||
+ if (0 == pagedresults_free_one_msgid(pb_conn, id, pageresult_lock_get_addr(pb_conn))) {
|
||||
slapi_log_access(LDAP_DEBUG_STATS, "conn=%" PRIu64
|
||||
" op=%d ABANDON targetop=Simple Paged Results msgid=%d\n",
|
||||
pb_conn->c_connid, pb_op->o_opid, id);
|
||||
@@ -143,15 +154,11 @@ do_abandon(Slapi_PBlock *pb)
|
||||
" targetop=SUPPRESSED-BY-PLUGIN msgid=%d\n",
|
||||
pb_conn->c_connid, pb_op->o_opid, id);
|
||||
} else {
|
||||
- struct timespec o_hr_time_end;
|
||||
- slapi_operation_time_elapsed(o, &o_hr_time_end);
|
||||
slapi_log_access(LDAP_DEBUG_STATS, "conn=%" PRIu64 " op=%d ABANDON"
|
||||
" targetop=%d msgid=%d nentries=%d etime=%" PRId64 ".%010" PRId64 "\n",
|
||||
- pb_conn->c_connid, pb_op->o_opid, o->o_opid, id,
|
||||
- o->o_results.r.r_search.nentries, (int64_t)o_hr_time_end.tv_sec, (int64_t)o_hr_time_end.tv_nsec);
|
||||
+ pb_conn->c_connid, pb_op->o_opid, o_copy.opid, id,
|
||||
+ o_copy.nentries, (int64_t)o_copy.hr_time_end.tv_sec, (int64_t)o_copy.hr_time_end.tv_nsec);
|
||||
}
|
||||
-
|
||||
- pthread_mutex_unlock(&(pb_conn->c_mutex));
|
||||
/*
|
||||
* Wake up the persistent searches, so they
|
||||
* can notice if they've been abandoned.
|
||||
diff --git a/ldap/servers/slapd/opshared.c b/ldap/servers/slapd/opshared.c
|
||||
index a842d4249..f77043afa 100644
|
||||
--- a/ldap/servers/slapd/opshared.c
|
||||
+++ b/ldap/servers/slapd/opshared.c
|
||||
@@ -921,9 +921,7 @@ op_shared_search(Slapi_PBlock *pb, int send_result)
|
||||
next_be = NULL; /* to break the loop */
|
||||
if (operation->o_status & SLAPI_OP_STATUS_ABANDONED) {
|
||||
/* It turned out this search was abandoned. */
|
||||
- pthread_mutex_lock(pagedresults_mutex);
|
||||
- pagedresults_free_one_msgid_nolock(pb_conn, operation->o_msgid);
|
||||
- pthread_mutex_unlock(pagedresults_mutex);
|
||||
+ pagedresults_free_one_msgid(pb_conn, operation->o_msgid, pagedresults_mutex);
|
||||
/* paged-results-request was abandoned; making an empty cookie. */
|
||||
pagedresults_set_response_control(pb, 0, estimate, -1, pr_idx);
|
||||
send_ldap_result(pb, 0, NULL, "Simple Paged Results Search abandoned", 0, NULL);
|
||||
diff --git a/ldap/servers/slapd/pagedresults.c b/ldap/servers/slapd/pagedresults.c
|
||||
index fc15f6bec..9959c927e 100644
|
||||
--- a/ldap/servers/slapd/pagedresults.c
|
||||
+++ b/ldap/servers/slapd/pagedresults.c
|
||||
@@ -34,6 +34,10 @@ pageresult_lock_cleanup()
|
||||
slapi_ch_free((void**)&lock_hash);
|
||||
}
|
||||
|
||||
+/* Beware of the lock order with c_mutex:
+ * c_mutex is sometimes locked while holding pageresult_lock
+ * ==> Do not lock pageresult_lock when holding c_mutex
+ */
|
||||
pthread_mutex_t *
|
||||
pageresult_lock_get_addr(Connection *conn)
|
||||
{
|
||||
@@ -350,7 +354,7 @@ pagedresults_free_one(Connection *conn, Operation *op, int index)
|
||||
* Used for abandoning - pageresult_lock_get_addr(conn) is already locked in do_abandone.
|
||||
*/
|
||||
int
|
||||
-pagedresults_free_one_msgid_nolock(Connection *conn, ber_int_t msgid)
|
||||
+pagedresults_free_one_msgid(Connection *conn, ber_int_t msgid, pthread_mutex_t *mutex)
|
||||
{
|
||||
int rc = -1;
|
||||
int i;
|
||||
@@ -361,6 +365,7 @@ pagedresults_free_one_msgid_nolock(Connection *conn, ber_int_t msgid)
|
||||
} else {
|
||||
slapi_log_err(SLAPI_LOG_TRACE,
|
||||
"pagedresults_free_one_msgid_nolock", "=> msgid=%d\n", msgid);
|
||||
+ pthread_mutex_lock(mutex);
|
||||
for (i = 0; i < conn->c_pagedresults.prl_maxlen; i++) {
|
||||
if (conn->c_pagedresults.prl_list[i].pr_msgid == msgid) {
|
||||
PagedResults *prp = conn->c_pagedresults.prl_list + i;
|
||||
@@ -375,6 +380,7 @@ pagedresults_free_one_msgid_nolock(Connection *conn, ber_int_t msgid)
|
||||
break;
|
||||
}
|
||||
}
|
||||
+ pthread_mutex_unlock(mutex);
|
||||
slapi_log_err(SLAPI_LOG_TRACE,
|
||||
"pagedresults_free_one_msgid_nolock", "<= %d\n", rc);
|
||||
}
|
||||
diff --git a/ldap/servers/slapd/proto-slap.h b/ldap/servers/slapd/proto-slap.h
|
||||
index c7389fe2e..e8adbc254 100644
|
||||
--- a/ldap/servers/slapd/proto-slap.h
|
||||
+++ b/ldap/servers/slapd/proto-slap.h
|
||||
@@ -1614,7 +1614,7 @@ int pagedresults_is_timedout_nolock(Connection *conn);
|
||||
int pagedresults_reset_timedout_nolock(Connection *conn);
|
||||
int pagedresults_in_use_nolock(Connection *conn);
|
||||
int pagedresults_free_one(Connection *conn, Operation *op, int index);
|
||||
-int pagedresults_free_one_msgid_nolock(Connection *conn, ber_int_t msgid);
|
||||
+int pagedresults_free_one_msgid(Connection *conn, ber_int_t msgid, pthread_mutex_t *mutex);
|
||||
int op_is_pagedresults(Operation *op);
|
||||
int pagedresults_cleanup_all(Connection *conn, int needlock);
|
||||
void op_set_pagedresults(Operation *op);
|
||||
diff --git a/src/lib389/lib389/__init__.py b/src/lib389/lib389/__init__.py
|
||||
index 7590ec442..6a941dbe7 100644
|
||||
--- a/src/lib389/lib389/__init__.py
|
||||
+++ b/src/lib389/lib389/__init__.py
|
||||
@@ -1048,6 +1048,24 @@ class DirSrv(SimpleLDAPObject, object):
|
||||
|
||||
self.state = DIRSRV_STATE_OFFLINE
|
||||
|
||||
+ def dump_errorlog(self):
|
||||
+ '''
|
||||
+ Its logs all errors messages within the error log that occured
|
||||
+ after the last startup.
|
||||
+ '''
|
||||
+ if os.path.isfile(self.errlog):
|
||||
+ lines = []
|
||||
+ with open(self.errlog, 'r') as file:
|
||||
+ for line in file:
|
||||
+ if "starting up" in line:
|
||||
+ lines = []
|
||||
+ for key in ( 'DEBUG', 'INFO', 'NOTICE', 'WARN' ):
|
||||
+ if key in line:
|
||||
+ lines.append(line)
|
||||
+ break
|
||||
+ for line in lines:
|
||||
+ self.log.error(line)
|
||||
+
|
||||
def start(self, timeout=120, post_open=True):
|
||||
'''
|
||||
It starts an instance and rebind it. Its final state after rebind
|
||||
@@ -1071,7 +1089,13 @@ class DirSrv(SimpleLDAPObject, object):
|
||||
if self.with_systemd():
|
||||
self.log.debug("systemd status -> True")
|
||||
# Do systemd things here ...
|
||||
- subprocess.check_output(["systemctl", "start", "dirsrv@%s" % self.serverid], stderr=subprocess.STDOUT)
|
||||
+ try:
|
||||
+ subprocess.check_output(["systemctl", "start", "dirsrv@%s" % self.serverid], stderr=subprocess.STDOUT)
|
||||
+ except subprocess.CalledProcessError as e:
|
||||
+ self.dump_errorlog()
|
||||
+ self.log.error('Failed to start dirsrv@%s: "%s"' % (self.serverid, e.output.decode()))
|
||||
+ self.log.error(e)
|
||||
+ raise ValueError('Failed to start DS')
|
||||
else:
|
||||
self.log.debug("systemd status -> False")
|
||||
# Start the process.
|
||||
@@ -1095,6 +1119,7 @@ class DirSrv(SimpleLDAPObject, object):
|
||||
self.log.debug("DEBUG: starting with %s" % cmd)
|
||||
output = subprocess.check_output(*cmd, env=env, stderr=subprocess.STDOUT)
|
||||
except subprocess.CalledProcessError as e:
|
||||
+ self.dump_errorlog()
|
||||
self.log.error('Failed to start ns-slapd: "%s"' % e.output.decode())
|
||||
self.log.error(e)
|
||||
raise ValueError('Failed to start DS')
|
||||
--
|
||||
2.41.0
From 382003bde3db1c82989dfe25c5a1ddb457473186 Mon Sep 17 00:00:00 2001
From: progier389 <progier@redhat.com>
Date: Tue, 21 Nov 2023 11:57:44 +0100
Subject: [PATCH] Issue 5984 - Crash when paged result search are abandoned -
 fix2 (#5987)

Chasing several rabbits at the same time is a bad idea!
I mixed branches and unintentionally pushed a commit meant for #5980 into #5984
just before the #5985 PR was merged. :-(
Hopefully it does not break anything, it just logs some useless crap if the instance fails to start.
Anyway, this commit reverts the change to __init__.py
and also does a minor code cleanup (removes a trailing space) in abandon.c.

Issue #5984

Reviewed by: @tbordaz (Thanks!)

(cherry picked from commit df7dd8320424f7ab616c9ad8086a6874ff8bf859)
---
 ldap/servers/slapd/abandon.c | 2 +-
 src/lib389/lib389/__init__.py | 27 +--------------------------
 2 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/ldap/servers/slapd/abandon.c b/ldap/servers/slapd/abandon.c
|
||||
index 964d28836..2dd1ee320 100644
|
||||
--- a/ldap/servers/slapd/abandon.c
|
||||
+++ b/ldap/servers/slapd/abandon.c
|
||||
@@ -43,7 +43,7 @@ do_abandon(Slapi_PBlock *pb)
|
||||
struct timespec hr_time_end;
|
||||
int nentries;
|
||||
int opid;
|
||||
- } o_copy;
|
||||
+ } o_copy;
|
||||
|
||||
slapi_pblock_get(pb, SLAPI_OPERATION, &pb_op);
|
||||
slapi_pblock_get(pb, SLAPI_CONNECTION, &pb_conn);
|
||||
diff --git a/src/lib389/lib389/__init__.py b/src/lib389/lib389/__init__.py
|
||||
index 6a941dbe7..7590ec442 100644
|
||||
--- a/src/lib389/lib389/__init__.py
|
||||
+++ b/src/lib389/lib389/__init__.py
|
||||
@@ -1048,24 +1048,6 @@ class DirSrv(SimpleLDAPObject, object):
|
||||
|
||||
self.state = DIRSRV_STATE_OFFLINE
|
||||
|
||||
- def dump_errorlog(self):
|
||||
- '''
|
||||
- Its logs all errors messages within the error log that occured
|
||||
- after the last startup.
|
||||
- '''
|
||||
- if os.path.isfile(self.errlog):
|
||||
- lines = []
|
||||
- with open(self.errlog, 'r') as file:
|
||||
- for line in file:
|
||||
- if "starting up" in line:
|
||||
- lines = []
|
||||
- for key in ( 'DEBUG', 'INFO', 'NOTICE', 'WARN' ):
|
||||
- if key in line:
|
||||
- lines.append(line)
|
||||
- break
|
||||
- for line in lines:
|
||||
- self.log.error(line)
|
||||
-
|
||||
def start(self, timeout=120, post_open=True):
|
||||
'''
|
||||
It starts an instance and rebind it. Its final state after rebind
|
||||
@@ -1089,13 +1071,7 @@ class DirSrv(SimpleLDAPObject, object):
|
||||
if self.with_systemd():
|
||||
self.log.debug("systemd status -> True")
|
||||
# Do systemd things here ...
|
||||
- try:
|
||||
- subprocess.check_output(["systemctl", "start", "dirsrv@%s" % self.serverid], stderr=subprocess.STDOUT)
|
||||
- except subprocess.CalledProcessError as e:
|
||||
- self.dump_errorlog()
|
||||
- self.log.error('Failed to start dirsrv@%s: "%s"' % (self.serverid, e.output.decode()))
|
||||
- self.log.error(e)
|
||||
- raise ValueError('Failed to start DS')
|
||||
+ subprocess.check_output(["systemctl", "start", "dirsrv@%s" % self.serverid], stderr=subprocess.STDOUT)
|
||||
else:
|
||||
self.log.debug("systemd status -> False")
|
||||
# Start the process.
|
||||
@@ -1119,7 +1095,6 @@ class DirSrv(SimpleLDAPObject, object):
|
||||
self.log.debug("DEBUG: starting with %s" % cmd)
|
||||
output = subprocess.check_output(*cmd, env=env, stderr=subprocess.STDOUT)
|
||||
except subprocess.CalledProcessError as e:
|
||||
- self.dump_errorlog()
|
||||
self.log.error('Failed to start ns-slapd: "%s"' % e.output.decode())
|
||||
self.log.error(e)
|
||||
raise ValueError('Failed to start DS')
|
||||
--
|
||||
2.41.0