parent
2d07d399e7
commit
0ea3514cf4
@ -1 +1 @@
|
||||
SOURCES/openmpi-4.1.2.tar.bz2
|
||||
SOURCES/openmpi-4.1.1.tar.bz2
|
||||
|
@ -1 +1 @@
|
||||
a8da271d00d808359dda41628e5cb241fa6f6df5 SOURCES/openmpi-4.1.2.tar.bz2
|
||||
fa4dc97da18c8c26d5aadb85262a0f2d52b1aa90 SOURCES/openmpi-4.1.1.tar.bz2
|
||||
|
@ -0,0 +1,367 @@
|
||||
From 63c80c7692e55f634cbca6f67cc5c9cdef3a04d2 Mon Sep 17 00:00:00 2001
|
||||
From: Honggang Li <honli@redhat.com>
|
||||
Date: Mon, 28 Jun 2021 21:38:13 +0800
|
||||
Subject: [PATCH] Revert "ucx: check supported transports and devices for
|
||||
setting priority"
|
||||
|
||||
This reverts commit c36d7459b6331c4da825cad5a64326e7c1a272aa.
|
||||
---
|
||||
contrib/platform/mellanox/optimized.conf | 2 -
|
||||
ompi/mca/pml/ucx/pml_ucx_component.c | 15 +-
|
||||
opal/mca/common/ucx/common_ucx.c | 202 +----------------------
|
||||
opal/mca/common/ucx/common_ucx.h | 15 --
|
||||
opal/mca/common/ucx/configure.m4 | 2 -
|
||||
5 files changed, 2 insertions(+), 234 deletions(-)
|
||||
|
||||
diff --git a/contrib/platform/mellanox/optimized.conf b/contrib/platform/mellanox/optimized.conf
|
||||
index 543fd8d1e224..b86b37c9e2fa 100644
|
||||
--- a/contrib/platform/mellanox/optimized.conf
|
||||
+++ b/contrib/platform/mellanox/optimized.conf
|
||||
@@ -61,8 +61,6 @@
|
||||
coll = ^ml
|
||||
hwloc_base_binding_policy = core
|
||||
btl = self
|
||||
-pml_ucx_tls = any
|
||||
-pml_ucx_devices = any
|
||||
# Basic behavior to smooth startup
|
||||
mca_base_component_show_load_errors = 0
|
||||
orte_abort_timeout = 10
|
||||
diff --git a/ompi/mca/pml/ucx/pml_ucx_component.c b/ompi/mca/pml/ucx/pml_ucx_component.c
|
||||
index 6aed6c41d11d..ed9cc6573e8e 100644
|
||||
--- a/ompi/mca/pml/ucx/pml_ucx_component.c
|
||||
+++ b/ompi/mca/pml/ucx/pml_ucx_component.c
|
||||
@@ -107,26 +107,13 @@ static mca_pml_base_module_t*
|
||||
mca_pml_ucx_component_init(int* priority, bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
- opal_common_ucx_support_level_t support_level;
|
||||
int ret;
|
||||
|
||||
- support_level = opal_common_ucx_support_level(ompi_pml_ucx.ucp_context);
|
||||
- if (support_level == OPAL_COMMON_UCX_SUPPORT_NONE) {
|
||||
- return NULL;
|
||||
- }
|
||||
-
|
||||
if ( (ret = mca_pml_ucx_init(enable_mpi_threads)) != 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
- /*
|
||||
- * If found supported devices - set to the configured (high) priority.
|
||||
- * Otherwise - Found only supported transports (which could be exposed by
|
||||
- * unsupported devices), so set a priority lower than ob1.
|
||||
- */
|
||||
- *priority = (support_level == OPAL_COMMON_UCX_SUPPORT_DEVICE) ?
|
||||
- ompi_pml_ucx.priority : 19;
|
||||
- PML_UCX_VERBOSE(2, "returning priority %d", *priority);
|
||||
+ *priority = ompi_pml_ucx.priority;
|
||||
return &ompi_pml_ucx.super;
|
||||
}
|
||||
|
||||
diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c
|
||||
index ac7a17d799a5..ae8e66877ab6 100644
|
||||
--- a/opal/mca/common/ucx/common_ucx.c
|
||||
+++ b/opal/mca/common/ucx/common_ucx.c
|
||||
@@ -14,11 +14,8 @@
|
||||
#include "opal/mca/base/mca_base_framework.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "opal/memoryhooks/memory.h"
|
||||
-#include "opal/util/argv.h"
|
||||
|
||||
#include <ucm/api/ucm.h>
|
||||
-#include <fnmatch.h>
|
||||
-#include <stdio.h>
|
||||
|
||||
/***********************************************************************/
|
||||
|
||||
@@ -28,8 +25,7 @@ opal_common_ucx_module_t opal_common_ucx = {
|
||||
.verbose = 0,
|
||||
.progress_iterations = 100,
|
||||
.registered = 0,
|
||||
- .opal_mem_hooks = 0,
|
||||
- .tls = NULL
|
||||
+ .opal_mem_hooks = 0
|
||||
};
|
||||
|
||||
static void opal_common_ucx_mem_release_cb(void *buf, size_t length,
|
||||
@@ -40,15 +36,10 @@ static void opal_common_ucx_mem_release_cb(void *buf, size_t length,
|
||||
|
||||
OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component)
|
||||
{
|
||||
- static const char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,cuda_ipc,rocm_ipc";
|
||||
- static const char *default_devices = "mlx*";
|
||||
static int registered = 0;
|
||||
static int hook_index;
|
||||
static int verbose_index;
|
||||
static int progress_index;
|
||||
- static int tls_index;
|
||||
- static int devices_index;
|
||||
-
|
||||
if (!registered) {
|
||||
verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose",
|
||||
"Verbose level of the UCX components",
|
||||
@@ -69,29 +60,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
|
||||
OPAL_INFO_LVL_3,
|
||||
MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&opal_common_ucx.opal_mem_hooks);
|
||||
-
|
||||
- opal_common_ucx.tls = malloc(sizeof(*opal_common_ucx.tls));
|
||||
- *opal_common_ucx.tls = strdup(default_tls);
|
||||
- tls_index = mca_base_var_register("opal", "opal_common", "ucx", "tls",
|
||||
- "List of UCX transports which should be supported on the system, to enable "
|
||||
- "selecting the UCX component. Special values: any (any available). "
|
||||
- "A '^' prefix negates the list. "
|
||||
- "For example, in order to exclude on shared memory and TCP transports, "
|
||||
- "please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
|
||||
- MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
- OPAL_INFO_LVL_3,
|
||||
- MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
- opal_common_ucx.tls);
|
||||
-
|
||||
- opal_common_ucx.devices = malloc(sizeof(*opal_common_ucx.devices));
|
||||
- *opal_common_ucx.devices = strdup(default_devices);
|
||||
- devices_index = mca_base_var_register("opal", "opal_common", "ucx", "devices",
|
||||
- "List of device driver pattern names, which, if supported by UCX, will "
|
||||
- "bump its priority above ob1. Special values: any (any available)",
|
||||
- MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
- OPAL_INFO_LVL_3,
|
||||
- MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
- opal_common_ucx.devices);
|
||||
registered = 1;
|
||||
}
|
||||
if (component) {
|
||||
@@ -107,14 +75,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
|
||||
component->mca_type_name,
|
||||
component->mca_component_name,
|
||||
"opal_mem_hooks", 0);
|
||||
- mca_base_var_register_synonym(tls_index, component->mca_project_name,
|
||||
- component->mca_type_name,
|
||||
- component->mca_component_name,
|
||||
- "tls", 0);
|
||||
- mca_base_var_register_synonym(devices_index, component->mca_project_name,
|
||||
- component->mca_type_name,
|
||||
- component->mca_component_name,
|
||||
- "devices", 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -163,166 +123,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void)
|
||||
opal_output_close(opal_common_ucx.output);
|
||||
}
|
||||
|
||||
-#if HAVE_DECL_OPEN_MEMSTREAM
|
||||
-static bool opal_common_ucx_check_device(const char *device_name, char **device_list)
|
||||
-{
|
||||
- char sysfs_driver_link[PATH_MAX];
|
||||
- char driver_path[PATH_MAX];
|
||||
- char *ib_device_name;
|
||||
- char *driver_name;
|
||||
- char **list_item;
|
||||
- ssize_t ret;
|
||||
-
|
||||
- /* mlx5_0:1 */
|
||||
- ret = sscanf(device_name, "%m[^:]%*d", &ib_device_name);
|
||||
- if (ret != 1) {
|
||||
- return false;
|
||||
- }
|
||||
-
|
||||
- sysfs_driver_link[sizeof(sysfs_driver_link) - 1] = '\0';
|
||||
- snprintf(sysfs_driver_link, sizeof(sysfs_driver_link) - 1,
|
||||
- "/sys/class/infiniband/%s/device/driver", ib_device_name);
|
||||
- free(ib_device_name);
|
||||
-
|
||||
- driver_path[sizeof(driver_path) - 1] = '\0';
|
||||
- ret = readlink(sysfs_driver_link, driver_path, sizeof(driver_path) - 1);
|
||||
- if (ret < 0) {
|
||||
- MCA_COMMON_UCX_VERBOSE(2, "readlink(%s) failed: %s", sysfs_driver_link,
|
||||
- strerror(errno));
|
||||
- return false;
|
||||
- }
|
||||
-
|
||||
- driver_name = basename(driver_path);
|
||||
- for (list_item = device_list; *list_item != NULL; ++list_item) {
|
||||
- if (!fnmatch(*list_item, driver_name, 0)) {
|
||||
- MCA_COMMON_UCX_VERBOSE(2, "driver '%s' matched by '%s'",
|
||||
- driver_path, *list_item);
|
||||
- return true;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- return false;
|
||||
-}
|
||||
-#endif
|
||||
-
|
||||
-OPAL_DECLSPEC opal_common_ucx_support_level_t
|
||||
-opal_common_ucx_support_level(ucp_context_h context)
|
||||
-{
|
||||
- opal_common_ucx_support_level_t support_level = OPAL_COMMON_UCX_SUPPORT_NONE;
|
||||
- static const char *support_level_names[] = {
|
||||
- [OPAL_COMMON_UCX_SUPPORT_NONE] = "none",
|
||||
- [OPAL_COMMON_UCX_SUPPORT_TRANSPORT] = "transports only",
|
||||
- [OPAL_COMMON_UCX_SUPPORT_DEVICE] = "transports and devices"
|
||||
- };
|
||||
-#if HAVE_DECL_OPEN_MEMSTREAM
|
||||
- char *rsc_tl_name, *rsc_device_name;
|
||||
- char **tl_list, **device_list, **list_item;
|
||||
- bool is_any_tl, is_any_device;
|
||||
- bool found_tl, negate;
|
||||
- char line[128];
|
||||
- FILE *stream;
|
||||
- char *buffer;
|
||||
- size_t size;
|
||||
- int ret;
|
||||
-#endif
|
||||
-
|
||||
- is_any_tl = !strcmp(*opal_common_ucx.tls, "any");
|
||||
- is_any_device = !strcmp(*opal_common_ucx.devices, "any");
|
||||
-
|
||||
- /* Check for special value "any" */
|
||||
- if (is_any_tl && is_any_device) {
|
||||
- MCA_COMMON_UCX_VERBOSE(1, "ucx is enabled on any transport or device",
|
||||
- *opal_common_ucx.tls);
|
||||
- support_level = OPAL_COMMON_UCX_SUPPORT_DEVICE;
|
||||
- goto out;
|
||||
- }
|
||||
-
|
||||
-#if HAVE_DECL_OPEN_MEMSTREAM
|
||||
- /* Split transports list */
|
||||
- negate = ('^' == (*opal_common_ucx.tls)[0]);
|
||||
- tl_list = opal_argv_split(*opal_common_ucx.tls + (negate ? 1 : 0), ',');
|
||||
- if (tl_list == NULL) {
|
||||
- MCA_COMMON_UCX_VERBOSE(1, "failed to split tl list '%s', ucx is disabled",
|
||||
- *opal_common_ucx.tls);
|
||||
- goto out;
|
||||
- }
|
||||
-
|
||||
- /* Split devices list */
|
||||
- device_list = opal_argv_split(*opal_common_ucx.devices, ',');
|
||||
- if (device_list == NULL) {
|
||||
- MCA_COMMON_UCX_VERBOSE(1, "failed to split devices list '%s', ucx is disabled",
|
||||
- *opal_common_ucx.devices);
|
||||
- goto out_free_tl_list;
|
||||
- }
|
||||
-
|
||||
- /* Open memory stream to dump UCX information to */
|
||||
- stream = open_memstream(&buffer, &size);
|
||||
- if (stream == NULL) {
|
||||
- MCA_COMMON_UCX_VERBOSE(1, "failed to open memory stream for ucx info (%s), "
|
||||
- "ucx is disabled", strerror(errno));
|
||||
- goto out_free_device_list;
|
||||
- }
|
||||
-
|
||||
- /* Print ucx transports information to the memory stream */
|
||||
- ucp_context_print_info(context, stream);
|
||||
-
|
||||
- /* Rewind and read transports/devices list from the stream */
|
||||
- fseek(stream, 0, SEEK_SET);
|
||||
- while ((support_level != OPAL_COMMON_UCX_SUPPORT_DEVICE) &&
|
||||
- (fgets(line, sizeof(line), stream) != NULL)) {
|
||||
- rsc_tl_name = NULL;
|
||||
- ret = sscanf(line,
|
||||
- /* "# resource 6 : md 5 dev 4 flags -- rc_verbs/mlx5_0:1" */
|
||||
- "# resource %*d : md %*d dev %*d flags -- %m[^/ \n\r]/%m[^/ \n\r]",
|
||||
- &rsc_tl_name, &rsc_device_name);
|
||||
- if (ret != 2) {
|
||||
- free(rsc_tl_name);
|
||||
- continue;
|
||||
- }
|
||||
-
|
||||
- /* Check if 'rsc_tl_name' is found provided list */
|
||||
- found_tl = is_any_tl;
|
||||
- for (list_item = tl_list; !found_tl && (*list_item != NULL); ++list_item) {
|
||||
- found_tl = !strcmp(*list_item, rsc_tl_name);
|
||||
- }
|
||||
-
|
||||
- /* Check if the transport has a match (either positive or negative) */
|
||||
- assert(!(is_any_tl && negate));
|
||||
- if (found_tl != negate) {
|
||||
- if (is_any_device ||
|
||||
- opal_common_ucx_check_device(rsc_device_name, device_list)) {
|
||||
- MCA_COMMON_UCX_VERBOSE(2, "%s/%s: matched both transport and device list",
|
||||
- rsc_tl_name, rsc_device_name);
|
||||
- support_level = OPAL_COMMON_UCX_SUPPORT_DEVICE;
|
||||
- } else {
|
||||
- MCA_COMMON_UCX_VERBOSE(2, "%s/%s: matched transport list but not device list",
|
||||
- rsc_tl_name, rsc_device_name);
|
||||
- support_level = OPAL_COMMON_UCX_SUPPORT_TRANSPORT;
|
||||
- }
|
||||
- } else {
|
||||
- MCA_COMMON_UCX_VERBOSE(2, "%s/%s: did not match transport list",
|
||||
- rsc_tl_name, rsc_device_name);
|
||||
- }
|
||||
-
|
||||
- free(rsc_device_name);
|
||||
- free(rsc_tl_name);
|
||||
- }
|
||||
-
|
||||
- MCA_COMMON_UCX_VERBOSE(2, "support level is %s", support_level_names[support_level]);
|
||||
- fclose(stream);
|
||||
- free(buffer);
|
||||
-
|
||||
-out_free_device_list:
|
||||
- opal_argv_free(device_list);
|
||||
-out_free_tl_list:
|
||||
- opal_argv_free(tl_list);
|
||||
-out:
|
||||
-#else
|
||||
- MCA_COMMON_UCX_VERBOSE(2, "open_memstream() was not found, ucx is disabled");
|
||||
-#endif
|
||||
- return support_level;
|
||||
-}
|
||||
-
|
||||
void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status)
|
||||
{
|
||||
}
|
||||
diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h
|
||||
index 92cdd738ef98..202131ac8907 100644
|
||||
--- a/opal/mca/common/ucx/common_ucx.h
|
||||
+++ b/opal/mca/common/ucx/common_ucx.h
|
||||
@@ -88,8 +88,6 @@ typedef struct opal_common_ucx_module {
|
||||
int progress_iterations;
|
||||
int registered;
|
||||
bool opal_mem_hooks;
|
||||
- char **tls;
|
||||
- char **devices;
|
||||
} opal_common_ucx_module_t;
|
||||
|
||||
typedef struct opal_common_ucx_del_proc {
|
||||
@@ -97,23 +95,10 @@ typedef struct opal_common_ucx_del_proc {
|
||||
size_t vpid;
|
||||
} opal_common_ucx_del_proc_t;
|
||||
|
||||
-typedef enum {
|
||||
- /* No supported transports found (according to configured list of supported
|
||||
- transports) */
|
||||
- OPAL_COMMON_UCX_SUPPORT_NONE,
|
||||
-
|
||||
- /* Have supported transports but not supported devices */
|
||||
- OPAL_COMMON_UCX_SUPPORT_TRANSPORT,
|
||||
-
|
||||
- /* Have both supported transports and supported devices */
|
||||
- OPAL_COMMON_UCX_SUPPORT_DEVICE,
|
||||
-} opal_common_ucx_support_level_t;
|
||||
-
|
||||
extern opal_common_ucx_module_t opal_common_ucx;
|
||||
|
||||
OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
|
||||
OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
|
||||
-OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_context_h context);
|
||||
OPAL_DECLSPEC void opal_common_ucx_mca_proc_added(void);
|
||||
OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
|
||||
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
|
||||
diff --git a/opal/mca/common/ucx/configure.m4 b/opal/mca/common/ucx/configure.m4
|
||||
index af8628a889c6..27e07c2005b2 100644
|
||||
--- a/opal/mca/common/ucx/configure.m4
|
||||
+++ b/opal/mca/common/ucx/configure.m4
|
||||
@@ -18,8 +18,6 @@ AC_DEFUN([MCA_opal_common_ucx_CONFIG],[
|
||||
[common_ucx_happy="yes"],
|
||||
[common_ucx_happy="no"])
|
||||
|
||||
- AC_CHECK_DECLS([open_memstream], [], [], [[#include <stdio.h>]])
|
||||
-
|
||||
AS_IF([test "$common_ucx_happy" = "yes"],
|
||||
[$1],
|
||||
[$2])
|
||||
--
|
||||
2.31.1
|
||||
|
@ -0,0 +1,33 @@
|
||||
From 266189935aef4fce825d0db831b4b53accc62c32 Mon Sep 17 00:00:00 2001
|
||||
From: Jeff Squyres <jsquyres@cisco.com>
|
||||
Date: Tue, 22 Jun 2021 22:28:37 -0400
|
||||
Subject: [PATCH] fbtl-posix: link to common_ompio
|
||||
|
||||
The posix fbtl calls mca_common_ompio_progress(), which resides in
|
||||
common/ompio (i.e., libmca_common_ompio.la). So add that into
|
||||
mca_fbtl_posix_la_LIBADD (like we do in a few other OMPIO-based
|
||||
components). Failure to do this *can* lead to the posix fbtl
|
||||
component failing to load (depending on whether other OMPIO-based
|
||||
components that pull in libmca_common_ompio were loaded first).
|
||||
|
||||
Thanks to Honggang Li for raising the issue.
|
||||
|
||||
Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
|
||||
---
|
||||
ompi/mca/fbtl/posix/Makefile.am | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ompi/mca/fbtl/posix/Makefile.am b/ompi/mca/fbtl/posix/Makefile.am
|
||||
index a7b0624d3ec..1ce19cb09b7 100644
|
||||
--- a/ompi/mca/fbtl/posix/Makefile.am
|
||||
+++ b/ompi/mca/fbtl/posix/Makefile.am
|
||||
@@ -34,7 +34,8 @@ mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_fbtl_posix_la_SOURCES = $(sources)
|
||||
mca_fbtl_posix_la_LDFLAGS = -module -avoid-version
|
||||
-mca_fbtl_posix_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
|
||||
+mca_fbtl_posix_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
|
||||
+ $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ompio/libmca_common_ompio.la
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_fbtl_posix_la_SOURCES = $(sources)
|
Loading…
Reference in new issue