From 8cf981c00ae18d3efaeb10819282cd991621e9a2 Mon Sep 17 00:00:00 2001
From: tbordaz <tbordaz@redhat.com>
Date: Wed, 22 May 2024 11:29:05 +0200
Subject: [PATCH] Issue 6172 - RFE: improve the performance of evaluation of
filter component when tested against a large valueset (like group members)
(#6173)

Bug description:
Before returning an entry (to a SRCH) the server checks that the entry matches the SRCH filter.
If a filter component (equality) is testing the value (ava) against a
large valueset (like uniquemember values), it takes a long time because
of the large number of values and the required normalization of those values.
This can be improved by taking advantage of the sorted valueset. Those sorted
valuesets were created to improve updates of large valuesets (groups) but,
at that time, were not used in the SRCH path.
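
As a rough back-of-the-envelope with the numbers used by the test added
below (1K groups, each holding 2K uniquemember values): a search returning
every group normalizes on the order of 1000 x 2000 = 2,000,000 values with
the linear scan, while a lookup in a pre-normalized sorted valuearray needs
only about log2(2000) ~= 11 comparisons per group, i.e. roughly 11,000
comparisons overall.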

Fix description:
In the case of an LDAP_FILTER_EQUALITY component, the server can
benefit from the sorted valuearray.
To limit the risk of regression, we use the sorted valuearray
only for DN syntax attributes. Indeed, the sorted valuearray was
designed for that type of attribute.
With those two limitations, there is no need for a toggle and
the call to plugin_call_syntax_filter_ava can be replaced by
a call to slapi_valueset_find.
In both cases (sorted valueset and plugin_call_syntax_filter_ava), the ava and
the values are normalized.
In the sorted valueset, the values were already normalized when they were
inserted into the sorted array, so the comparison is done on normalized values.
In plugin_call_syntax_filter_ava, all values in the valuearray (of the valueset) are normalized
before comparison.
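
To make the intent concrete, here is a minimal, self-contained sketch of the
idea (illustrative only, not server code: find_linear/find_sorted are
hypothetical names and strcasecmp merely stands in for the syntax plugin's
DN normalization). Once the values are kept pre-normalized in a sorted
array, the equality test becomes a binary search instead of a
normalize-and-compare pass over every member.

    #include <stdio.h>
    #include <stdlib.h>
    #include <strings.h>

    /* Stand-in for value normalization: strcasecmp gives a case-insensitive
     * ordering, standing in for comparisons on normalized DN values. */
    static int cmp_values(const void *a, const void *b)
    {
        const char *const *va = a;
        const char *const *vb = b;
        return strcasecmp(*va, *vb);
    }

    /* What testing an ava against an unsorted valuearray amounts to:
     * compare against every member until a match is found. */
    static int find_linear(const char **values, size_t n, const char *ava)
    {
        for (size_t i = 0; i < n; i++) {
            if (strcasecmp(values[i], ava) == 0) {
                return 1;
            }
        }
        return 0;
    }

    /* What the lookup in a sorted valuearray amounts to: a binary search
     * over values that are already normalized. */
    static int find_sorted(const char **values, size_t n, const char *ava)
    {
        return bsearch(&ava, values, n, sizeof(*values), cmp_values) != NULL;
    }

    int main(void)
    {
        const char *members[] = {
            "uid=user0003,ou=users,dc=example,dc=com",
            "uid=user0001,ou=users,dc=example,dc=com",
            "uid=user0002,ou=users,dc=example,dc=com",
        };
        size_t n = sizeof(members) / sizeof(members[0]);
        const char *ava = "UID=user0002,OU=users,DC=example,DC=com";

        printf("linear scan found: %d\n", find_linear(members, n, ava));

        /* The server keeps the valuearray sorted as the group is updated. */
        qsort(members, n, sizeof(*members), cmp_values);
        printf("sorted lookup found: %d\n", find_sorted(members, n, ava));
        return 0;
    }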

relates: #6172

Reviewed by: Pierre Rogier, Simon Pichugin (Big Thanks !!!)
---
.../tests/suites/filter/filter_test.py | 125 ++++++++++++++++++
ldap/servers/slapd/filterentry.c | 22 ++-
2 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/dirsrvtests/tests/suites/filter/filter_test.py b/dirsrvtests/tests/suites/filter/filter_test.py
index d6bfa5a3b..4baaf04a7 100644
--- a/dirsrvtests/tests/suites/filter/filter_test.py
+++ b/dirsrvtests/tests/suites/filter/filter_test.py
@@ -9,7 +9,11 @@
import logging

import pytest
+import time
+from lib389.dirsrv_log import DirsrvAccessLog
from lib389.tasks import *
+from lib389.backend import Backends, Backend
+from lib389.dbgen import dbgen_users, dbgen_groups
from lib389.topologies import topology_st
from lib389._constants import PASSWORD, DEFAULT_SUFFIX, DN_DM, SUFFIX
from lib389.utils import *
@@ -304,6 +308,127 @@ def test_extended_search(topology_st):
ents = topology_st.standalone.search_s(SUFFIX, ldap.SCOPE_SUBTREE, myfilter)
assert len(ents) == 1

+def test_match_large_valueset(topology_st):
+ """Test that when returning a big number of entries
+ and we need to match the filter against a large valueset,
+ we benefit from using the sorted valueset
+
+ :id: 7db5aa88-50e0-4c31-85dd-1d2072cb674c
+
+ :setup: Standalone instance
+
+ :steps:
+ 1. Create users and groups backends and tune them
+ 2. Generate a test ldif (2K users and 1K groups containing all users)
+ 3. Import the test ldif files using offline import (ldif2db).
+ 4. Prime the 'groups' entrycache with a "fast" search
+ 5. Search the 'groups' with a difficult matching value
+ 6. Check that the etime from step 5 is less than one second
+
+ :expectedresults:
+ 1. Creation of the users and groups backends should PASS
+ 2. Generation of the LDIF files should PASS.
+ 3. Offline import should PASS.
+ 4. Priming should PASS.
+ 5. Performance search should PASS.
+ 6. Etime of the performance search should PASS.
+ """
+
+ log.info('Running test_match_large_valueset...')
+ #
+ # Prepare the backends, generate the LDIF files and import them (offline)
+ #
+ inst = topology_st.standalone
+ inst.start()
+ backends = Backends(inst)
+ users_suffix = "ou=users,%s" % DEFAULT_SUFFIX
+ users_backend = 'users'
+ users_ldif = 'users_import.ldif'
+ groups_suffix = "ou=groups,%s" % DEFAULT_SUFFIX
+ groups_backend = 'groups'
+ groups_ldif = 'groups_import.ldif'
+ groups_entrycache = '200000000'
+ users_number = 2000
+ groups_number = 1000
+
+
+ # For priming the cache we just want to be fast;
+ # taking the first value in the valueset is good,
+ # whether the valueset is sorted or not
+ priming_user_rdn = "user0001"
+
+ # For performance testing, it is important to use
+ # user1000 rather than user0001.
+ # Because user0001 is the first value in the valueset,
+ # the performance will be similar whether we use the
+ # sorted valuearray or the non-sorted valuearray.
+ # With the middle value user1000, the performance boost of
+ # the sorted valuearray makes the difference.
+ perf_user_rdn = "user1000"
+
+ # Step 1. Prepare the backends and tune the groups entrycache
+ try:
+ be_users = backends.create(properties={'parent': DEFAULT_SUFFIX, 'nsslapd-suffix': users_suffix, 'name': users_backend})
+ be_groups = backends.create(properties={'parent': DEFAULT_SUFFIX, 'nsslapd-suffix': groups_suffix, 'name': groups_backend})
+
+ # Set the entry cache to 200Mb as the 1K groups of 2K users require at least 170Mb
+ be_groups.replace('nsslapd-cachememsize', groups_entrycache)
+ except:
+ raise
+
+ # Step 2. Generate a test ldif (2K user entries)
+ log.info("Generating users LDIF...")
+ ldif_dir = inst.get_ldif_dir()
+ users_import_ldif = "%s/%s" % (ldif_dir, users_ldif)
+ groups_import_ldif = "%s/%s" % (ldif_dir, groups_ldif)
+ dbgen_users(inst, users_number, users_import_ldif, suffix=users_suffix, generic=True, parent=users_suffix)
+
+ # Generate a test ldif (1K groups with 2K members each) that fits in the 200Mb entry cache
+ props = {
+ "name": "group",
+ "suffix": groups_suffix,
+ "parent": groups_suffix,
+ "number": groups_number,
+ "numMembers": users_number,
+ "createMembers": False,
+ "memberParent": users_suffix,
+ "membershipAttr": "uniquemember",
+ }
+ dbgen_groups(inst, groups_import_ldif, props)
+
+ # Step 3. Do both offline imports
+ inst.stop()
+ if not inst.ldif2db(users_backend, None, None, None, users_import_ldif):
+ log.fatal('test_match_large_valueset: Offline users import failed')
+ assert False
+ if not inst.ldif2db(groups_backend, None, None, None, groups_import_ldif):
+ log.fatal('test_match_large_valueset: Offline groups import failed')
+ assert False
+ inst.start()
+
+ # Step 4. First prime the cache
+ # Just request the 'DN'. We are interested in the time of matching, not the time of transfer
+ entries = topology_st.standalone.search_s(groups_suffix, ldap.SCOPE_SUBTREE, "(&(objectclass=groupOfUniqueNames)(uniquemember=uid=%s,%s))" % (priming_user_rdn, users_suffix), ['dn'])
+ assert len(entries) == groups_number
+
+ # Step 5. Now do the real performance check; it should take less than a second
+ # Just request the 'DN'. We are interested in the time of matching, not the time of transfer
+ search_start = time.time()
+ entries = topology_st.standalone.search_s(groups_suffix, ldap.SCOPE_SUBTREE, "(&(objectclass=groupOfUniqueNames)(uniquemember=uid=%s,%s))" % (perf_user_rdn, users_suffix), ['dn'])
+ duration = time.time() - search_start
+ log.info("Duration of the search was %f", duration)
+
+ # Step 6. Gather the etime from the access log
+ inst.stop()
+ access_log = DirsrvAccessLog(inst)
+ search_result = access_log.match(".*RESULT err=0 tag=101 nentries=%s.*" % groups_number)
+ log.info("Found patterns are %s", search_result[0])
+ log.info("Found patterns are %s", search_result[1])
+ etime = float(search_result[1].split('etime=')[1])
+ log.info("Duration of the search from access log was %f", etime)
+ assert len(entries) == groups_number
+ assert (etime < 1)
+
if __name__ == '__main__':
# Run isolated
# -s for DEBUG mode
diff --git a/ldap/servers/slapd/filterentry.c b/ldap/servers/slapd/filterentry.c
index fd8fdda9f..cae5c7edc 100644
--- a/ldap/servers/slapd/filterentry.c
+++ b/ldap/servers/slapd/filterentry.c
@@ -296,7 +296,27 @@ test_ava_filter(
rc = -1;
for (; a != NULL; a = a->a_next) {
if (slapi_attr_type_cmp(ava->ava_type, a->a_type, SLAPI_TYPE_CMP_SUBTYPE) == 0) {
- rc = plugin_call_syntax_filter_ava(a, ftype, ava);
+ if ((ftype == LDAP_FILTER_EQUALITY) &&
+ (slapi_attr_is_dn_syntax_type(a->a_type))) {
+ /* This path is for a performance improvement */
+
+ /* In case of an equality filter we can benefit from the
+ * sorted valuearray (from the valueset).
+ * This improvement is limited to DN syntax attributes, for
+ * which the sorted valueset was designed.
+ */
+ Slapi_Value *sval = NULL;
+ sval = slapi_value_new_berval(&ava->ava_value);
+ if (slapi_valueset_find((const Slapi_Attr *)a, &a->a_present_values, sval)) {
+ rc = 0;
+ }
+ slapi_value_free(&sval);
+ } else {
+ /* When the sorted valuearray optimization cannot be used,
+ * filter the value according to its syntax
+ */
+ rc = plugin_call_syntax_filter_ava(a, ftype, ava);
+ }
if (rc == 0) {
break;
}
--
2.46.0