From 75a5004ac61b7d354269378ca1bc77649d96c731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Thu, 18 Feb 2021 11:11:17 +0100 Subject: [PATCH 01/11] run tests both on ubuntu 18.04 and 20.04 for both openmpi and mpich implementations --- .github/workflows/unittests.yml | 5 ++++- ci/install_mpi.sh | 17 ++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 9d9c82b3..7061a17a 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -6,14 +6,16 @@ on: jobs: test: - runs-on: ubuntu-20.04 + runs-on: ${{ matrix.os }} defaults: run: shell: bash -l {0} strategy: fail-fast: false matrix: + os: [ubuntu-18.04, ubuntu-20.04] mpi_install: [system, conda] + mpi_impl: [openmpi, mpich] python: [3.8] steps: - uses: actions/checkout@v2 @@ -28,6 +30,7 @@ jobs: ./ci/install_mpi.sh env: MPI_INSTALL: ${{ matrix.mpi_install }} + MPI_IMPL: ${{ matrix.mpi_impl }} - name: Run unit tests run: | pip install -e .[test] diff --git a/ci/install_mpi.sh b/ci/install_mpi.sh index eb5c2de2..43a1af12 100755 --- a/ci/install_mpi.sh +++ b/ci/install_mpi.sh @@ -3,12 +3,23 @@ set -euo pipefail case "$MPI_INSTALL" in "conda") - conda install -y openmpi mpi4py + conda install -y "$MPI_IMPL" mpi4py ;; "system") sudo apt-get update - sudo apt-get install -qy libopenmpi-dev openmpi-bin + case "$MPI_IMPL" in + "openmpi") + sudo apt-get install -qy libopenmpi-dev openmpi-bin + ;; + "mpich") + sudo apt-get install -qy mpich + ;; + *) + false + ;; + esac ;; *) - false;; + false;; esac + From 7bbb9fc47fa52aa5238a5d3c8828c6a7749a28f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Wed, 31 Mar 2021 11:24:28 +0200 Subject: [PATCH 02/11] do not specify hostfile in env variable for tests --- dicodile/update_z/tests/conftest.py | 1 - dicodile/update_z/tests/hostfile_test | 1 - 2 files changed, 2 deletions(-) delete mode 100644 dicodile/update_z/tests/hostfile_test diff --git a/dicodile/update_z/tests/conftest.py b/dicodile/update_z/tests/conftest.py index fb8cb3d1..0d5f72c7 100644 --- a/dicodile/update_z/tests/conftest.py +++ b/dicodile/update_z/tests/conftest.py @@ -4,4 +4,3 @@ def pytest_configure(config): # Set DICOD in debug mode os.environ["TESTING_DICOD"] = "1" - os.environ["MPI_HOSTFILE"] = "dicodile/update_z/tests/hostfile_test" diff --git a/dicodile/update_z/tests/hostfile_test b/dicodile/update_z/tests/hostfile_test deleted file mode 100644 index 05e311e1..00000000 --- a/dicodile/update_z/tests/hostfile_test +++ /dev/null @@ -1 +0,0 @@ -localhost slots=16 From 50b1766729a41d13f3576e1ffc89cb717dd182b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Fri, 2 Apr 2021 17:42:43 +0200 Subject: [PATCH 03/11] update pytest command to be run with mpiexec --- .github/workflows/unittests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 7061a17a..639430cf 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -34,8 +34,7 @@ jobs: - name: Run unit tests run: | pip install -e .[test] - echo "localhost slots=16">hostfile - pytest --cov-report=xml --cov=dicodile + mpiexec -np 1 --host localhost:16 python -m pytest --cov-report=xml --cov=dicodile - name: Upload coverage uses: codecov/codecov-action@v1 with: From cf5509357b707d83eeaa733484e6f585ae611538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Fri, 2 Apr 2021 17:48:01 +0200 Subject: [PATCH 04/11] use host flag instead of hostfile to build the docs --- docs/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index 96599f2a..c601fbad 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -7,7 +7,6 @@ SPHINXOPTS += SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build -HOSTFILE = ../hostfile ALLOW_AS_ROOT = ${ALLOW} # Put it first so that "make" without argument is like "make help". @@ -24,7 +23,7 @@ clean: .PHONY: html html: - mpiexec -np 1 $(ALLOW_AS_ROOT) --hostfile $(HOSTFILE) $(SPHINXBUILD) -b html $(SOURCEDIR) $(BUILDDIR)/html $(SPHINXOPTS) ${0} + mpiexec -np 1 $(ALLOW_AS_ROOT) --host localhost:50 $(SPHINXBUILD) -b html $(SOURCEDIR) $(BUILDDIR)/html $(SPHINXOPTS) ${0} @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." From e68d6773e1ac9f6cb60ea2c5b7e206373ecdb543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Thu, 6 May 2021 16:22:41 +0200 Subject: [PATCH 05/11] stop workers after dicodile function run --- dicodile/_dicodile.py | 2 ++ dicodile/update_z/distributed_sparse_encoder.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/dicodile/_dicodile.py b/dicodile/_dicodile.py index 46aba3bf..8ac86b43 100644 --- a/dicodile/_dicodile.py +++ b/dicodile/_dicodile.py @@ -227,5 +227,7 @@ def dicodile(X, D_init, reg=.1, n_iter=100, eps=1e-5, window=False, runtime = np.sum(times) encoder.release_workers() + + encoder.shut_down_workers() print("[INFO:{}] Finished in {:.0f}s".format(name, runtime)) return D_hat, z_hat, pobj, times diff --git a/dicodile/update_z/distributed_sparse_encoder.py b/dicodile/update_z/distributed_sparse_encoder.py index 471a2d0b..aba45086 100644 --- a/dicodile/update_z/distributed_sparse_encoder.py +++ b/dicodile/update_z/distributed_sparse_encoder.py @@ -5,7 +5,7 @@ from ..utils import constants from ..utils.csc import compute_objective from ..workers.reusable_workers import get_reusable_workers -from ..workers.reusable_workers import send_command_to_reusable_workers +from ..workers.reusable_workers import send_command_to_reusable_workers, shutdown_reusable_workers from ..utils import debug_flags as flags from ..utils.debugs import main_check_beta @@ -139,6 +139,9 @@ def get_sufficient_statistics(self): def release_workers(self): send_command_to_reusable_workers(constants.TAG_DICODILE_STOP) + def shut_down_workers(self): + shutdown_reusable_workers() + def check_cost(self, X, D_hat, reg): cost = self.get_cost() z_hat = self.get_z_hat() From 6e0e1ed33f39295833ea27a456f2e2799519786a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Thu, 6 May 2021 16:24:05 +0200 Subject: [PATCH 06/11] stop workers at the end of test run --- dicodile/update_z/tests/test_distributed_sparse_encoder.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dicodile/update_z/tests/test_distributed_sparse_encoder.py b/dicodile/update_z/tests/test_distributed_sparse_encoder.py index 06aea6bd..ecb3d863 100644 --- a/dicodile/update_z/tests/test_distributed_sparse_encoder.py +++ b/dicodile/update_z/tests/test_distributed_sparse_encoder.py @@ -51,3 +51,5 @@ def test_distributed_sparse_encoder(): ztX = compute_ztX(z_hat, X) assert np.allclose(ztz, ztz_distrib) assert np.allclose(ztX, ztX_distrib) + + encoder.shut_down_workers() From 5d0cf39a21ae5231c38dc2d58783db07353998e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Thu, 6 May 2021 16:34:39 +0200 Subject: [PATCH 07/11] copy dicod.py to communication primitive.py --- dicodile/update_z/communication_primitive.py | 382 ++++++++++++++++++ .../update_z/distributed_sparse_encoder.py | 8 +- 2 files changed, 386 insertions(+), 4 deletions(-) create mode 100644 dicodile/update_z/communication_primitive.py diff --git a/dicodile/update_z/communication_primitive.py b/dicodile/update_z/communication_primitive.py new file mode 100644 index 00000000..f0f0d92e --- /dev/null +++ b/dicodile/update_z/communication_primitive.py @@ -0,0 +1,382 @@ +"""Convolutional Sparse Coding with DICOD + +Author : tommoral +""" + +import time +import logging +import numpy as np +from mpi4py import MPI + +from ..utils import constants +from ..utils import debug_flags as flags +from ..utils.csc import compute_objective +from ..utils.debugs import main_check_beta +from .coordinate_descent import STRATEGIES +from ..utils.segmentation import Segmentation +from .coordinate_descent import coordinate_descent +from ..utils.mpi import broadcast_array, recv_reduce_sum_array +from ..utils.shape_helpers import get_valid_support, find_grid_size + +from ..workers.reusable_workers import get_reusable_workers +from ..workers.reusable_workers import send_command_to_reusable_workers + + +log = logging.getLogger('dicod') + +# debug flags + +interactive_exec = "xterm" +interactive_args = ["-fa", "Monospace", "-fs", "12", "-e", "ipython", "-i"] + + +def dicod(X_i, D, reg, z0=None, DtD=None, n_seg='auto', strategy='greedy', + soft_lock='border', n_workers=1, w_world='auto', hostfile=None, + tol=1e-5, max_iter=100000, timeout=None, z_positive=False, + return_ztz=False, warm_start=False, freeze_support=False, + timing=False, random_state=None, verbose=0, debug=False): + """DICOD for 2D convolutional sparse coding. + + Parameters + ---------- + X_i : ndarray, shape (n_channels, *sig_support) + Image to encode on the dictionary D + D : ndarray, shape (n_atoms, n_channels, *atom_support) + Current dictionary for the sparse coding + reg : float + Regularization parameter + z0 : ndarray, shape (n_atoms, *valid_support) or None + Warm start value for z_hat. If None, z_hat is initialized to 0. + DtD : ndarray, shape (n_atoms, n_atoms, 2 valid_support - 1) or None + Warm start value for DtD. If None, it is computed in each worker. + n_seg : int or {{ 'auto' }} + Number of segments to use for each dimension. If set to 'auto' use + segments of twice the size of the dictionary. + strategy : str in {} + Coordinate selection scheme for the coordinate descent. If set to + 'greedy', the coordinate with the largest value for dz_opt is selected. + If set to 'random', the coordinate is chosen uniformly on the segment. + soft_lock : str in {{ 'none', 'corner', 'border' }} + If set to true, use the soft-lock in LGCD. + n_workers : int + Number of workers used to compute the convolutional sparse coding + solution. + w_world : int or {{'auto'}} + Number of jobs used per row in the splitting grid. This should divide + n_workers. + hostfile : str + File containing the cluster information. See MPI documentation to have + the format of this file. + tol : float + Tolerance for the minimal update size in this algorithm. + max_iter : int + Maximal number of iteration run by this algorithm. + timeout : int + Timeout for the algorithm in seconds + z_positive : boolean + If set to true, the activations are constrained to be positive. + return_ztz : boolean + If True, returns the constants ztz and ztX, used to compute D-updates. + warm_start : boolean + If set to True, start from the previous solution z_hat if it exists. + freeze_support : boolean + If set to True, only update the coefficient that are non-zero in z0. + timing : boolean + If set to True, log the cost and timing information. + random_state : None or int or RandomState + current random state to seed the random number generator. + verbose : int + Verbosity level of the algorithm. + + Return + ------ + z_hat : ndarray, shape (n_atoms, *valid_support) + Activation associated to X_i for the given dictionary D + """ + if strategy == 'lgcd': + strategy = 'greedy' + assert n_seg == 'auto', "strategy='lgcd' only work with n_seg='auto'." + elif strategy == 'gcd': + strategy = 'greedy' + assert n_seg == 'auto', "strategy='gcd' only work with n_seg='auto'." + n_seg = 1 + + # Parameters validation + n_channels, *sig_support = X_i.shape + n_atoms, n_channels, *atom_support = D.shape + assert D.ndim - 1 == X_i.ndim + valid_support = get_valid_support(sig_support, atom_support) + + assert soft_lock in ['none', 'corner', 'border'] + assert strategy in ['greedy', 'random', 'cyclic', 'cyclic-r'] + + if n_workers == 1: + return coordinate_descent( + X_i, D, reg, z0=z0, DtD=DtD, n_seg=n_seg, strategy=strategy, + tol=tol, max_iter=max_iter, timeout=timeout, z_positive=z_positive, + freeze_support=freeze_support, return_ztz=return_ztz, + timing=timing, random_state=random_state, verbose=verbose) + + params = dict( + strategy=strategy, tol=tol, max_iter=max_iter, timeout=timeout, + n_seg=n_seg, z_positive=z_positive, verbose=verbose, timing=timing, + debug=debug, random_state=random_state, reg=reg, return_ztz=return_ztz, + soft_lock=soft_lock, precomputed_DtD=DtD is not None, + freeze_support=freeze_support, warm_start=warm_start + ) + + comm = _spawn_workers(n_workers, hostfile) + t_transfert, workers_segments = _send_task(comm, X_i, D, z0, DtD, w_world, + params) + + if flags.CHECK_WARM_BETA: + main_check_beta(comm, workers_segments) + + if verbose > 0: + print('\r[INFO:DICOD-{}] End transfert - {:.4}s' + .format(workers_segments.effective_n_seg, t_transfert).ljust(80)) + + # Wait for the result computation + comm.Barrier() + run_statistics = _gather_run_statistics( + comm, workers_segments, verbose=verbose) + + z_hat, ztz, ztX, cost, _log, t_reduce = _recv_result( + comm, D.shape, valid_support, workers_segments, return_ztz=return_ztz, + timing=timing, verbose=verbose) + comm.Barrier() + + if timing: + p_obj = reconstruct_pobj(X_i, D, reg, _log, t_transfert, t_reduce, + n_workers=n_workers, + valid_support=valid_support, z0=z0) + else: + p_obj = [[run_statistics['n_updates'], + run_statistics['runtime'], + cost]] + return z_hat, ztz, ztX, p_obj, run_statistics + + +def reconstruct_pobj(X, D, reg, _log, t_init, t_reduce, n_workers, + valid_support=None, z0=None): + n_atoms = D.shape[0] + if z0 is None: + z_hat = np.zeros((n_atoms, *valid_support)) + else: + z_hat = np.copy(z0) + + # Re-order the updates + _log.sort() + max_ii = [0] * n_workers + for _, ii, rank, *_ in _log: + max_ii[rank] = max(max_ii[rank], ii) + max_ii = np.sum(max_ii) + + up_ii = 0 + p_obj = [(up_ii, t_init, compute_objective(X, z_hat, D, reg))] + next_ii_cost = 1 + last_ii = [0] * n_workers + for i, (t_update, ii, rank, k0, pt0, dz) in enumerate(_log): + z_hat[k0][tuple(pt0)] += dz + up_ii += ii - last_ii[rank] + last_ii[rank] = ii + if up_ii >= next_ii_cost: + p_obj.append((up_ii, t_update + t_init, + compute_objective(X, z_hat, D, reg))) + next_ii_cost = next_ii_cost * 1.3 + print("\rReconstructing cost {:7.2%}" + .format(np.log2(up_ii)/np.log2(max_ii)), end='', flush=True) + elif i + 1 % 1000: + print("\rReconstructing cost {:7.2%}" + .format(np.log2(up_ii)/np.log2(max_ii)), end='', flush=True) + print('\rReconstruction cost: done'.ljust(40)) + + final_cost = compute_objective(X, z_hat, D, reg) + p_obj.append((up_ii, t_update, final_cost)) + p_obj.append((up_ii, t_init + t_update + t_reduce, final_cost)) + return np.array(p_obj) + + +def _spawn_workers(n_workers, hostfile): + comm = get_reusable_workers(n_workers, hostfile=hostfile) + send_command_to_reusable_workers(constants.TAG_WORKER_RUN_DICOD) + return comm + + +def _send_task(comm, X, D, z0, DtD, w_world, params): + t_start = time.time() + n_atoms, n_channels, *atom_support = D.shape + + _send_params(comm, params) + + _send_D(comm, D, DtD) + + workers_segments = _send_signal(comm, w_world, atom_support, X, z0) + + t_init = time.time() - t_start + return t_init, workers_segments + + +def _send_params(comm, params): + comm.bcast(params, root=MPI.ROOT) + + +def _send_D(comm, D, DtD=None): + broadcast_array(comm, D) + if DtD is not None: + broadcast_array(comm, DtD) + + +def _send_signal(comm, w_world, atom_support, X, z0=None): + n_workers = comm.Get_remote_size() + n_channels, *full_support = X.shape + valid_support = get_valid_support(full_support, atom_support) + overlap = tuple(np.array(atom_support) - 1) + + X_info = dict(has_z0=z0 is not None, valid_support=valid_support) + + if w_world == 'auto': + X_info["workers_topology"] = find_grid_size( + n_workers, valid_support, atom_support + ) + else: + assert n_workers % w_world == 0 + X_info["workers_topology"] = w_world, n_workers // w_world + + # compute a segmentation for the image, + workers_segments = Segmentation(n_seg=X_info['workers_topology'], + signal_support=valid_support, + overlap=overlap) + + # Make sure that each worker has at least a segment of twice the size of + # the dictionary. If this is not the case, the algorithm is not valid as it + # is possible to have interference with workers that are not neighbors. + worker_support = workers_segments.get_seg_support(0, inner=True) + msg = ("The size of the support in each worker is smaller than twice the " + "size of the atom support. The algorithm is does not converge in " + "this condition. Reduce the number of cores.\n" + f"worker: {worker_support}, atom: {atom_support}, " + f"topology: {X_info['workers_topology']}") + assert all( + (np.array(worker_support) >= 2 * np.array(atom_support)) + | (np.array(X_info['workers_topology']) == 1)), msg + + # Broadcast the info about this signal to the + comm.bcast(X_info, root=MPI.ROOT) + + X = np.array(X, dtype='d') + + for i_seg in range(n_workers): + if z0 is not None: + worker_slice = workers_segments.get_seg_slice(i_seg) + _send_array(comm, i_seg, z0[worker_slice]) + seg_bounds = workers_segments.get_seg_bounds(i_seg) + X_worker_slice = (Ellipsis,) + tuple([ + slice(start, end + size_atom_ax - 1) + for (start, end), size_atom_ax in zip(seg_bounds, atom_support) + ]) + _send_array(comm, i_seg, X[X_worker_slice]) + + # Synchronize the multiple send with a Barrier + comm.Barrier() + return workers_segments + + +def _send_array(comm, dest, arr): + comm.Send([arr.ravel(), MPI.DOUBLE], + dest=dest, tag=constants.TAG_ROOT + dest) + + +def _gather_run_statistics(comm, workers_segments, verbose=0): + n_workers = workers_segments.effective_n_seg + + if flags.CHECK_FINAL_BETA: + main_check_beta(comm, workers_segments) + + stats = np.array(comm.gather(None, root=MPI.ROOT)) + iterations, n_coordinate_updates = np.sum(stats[:, :2], axis=0) + runtime, t_local_init, t_run = np.max(stats[:, 2:5], axis=0) + t_select = np.mean(stats[:, -2], axis=0) + t_update = np.mean([s for s in stats[:, -1] if s is not None]) + if verbose > 1: + print("\r[INFO:DICOD-{}] converged in {:.3f}s ({:.3f}s) with " + "{:.0f} iterations ({:.0f} updates).".format( + n_workers, runtime, t_run, iterations, n_coordinate_updates)) + if verbose > 5: + print(f"\r[DEBUG:DICOD-{n_workers}] t_select={t_select:.3e}s " + f"t_update={t_update:.3e}s") + run_statistics = dict( + iterations=iterations, runtime=runtime, t_init=t_local_init, + t_run=t_run, n_updates=n_coordinate_updates, t_select=t_select, + t_update=t_update + ) + return run_statistics + + +def _recv_result(comm, D_shape, valid_support, workers_segments, + return_ztz=False, timing=False, verbose=0): + n_atoms, n_channels, *atom_support = D_shape + + t_start = time.time() + + z_hat = recv_z_hat(comm, n_atoms=n_atoms, + workers_segments=workers_segments) + + if return_ztz: + ztz, ztX = recv_sufficient_statistics(comm, D_shape) + else: + ztz, ztX = None, None + + cost = recv_cost(comm) + + _log = [] + if timing: + for i_seg in range(workers_segments.effective_n_seg): + _log.extend(comm.recv(source=i_seg)) + + t_reduce = time.time() - t_start + if verbose >= 5: + print('\r[DEBUG:DICOD-{}] End finalization - {:.4}s' + .format(workers_segments.effective_n_seg, t_reduce)) + + return z_hat, ztz, ztX, cost, _log, t_reduce + + +def recv_z_hat(comm, n_atoms, workers_segments): + + valid_support = workers_segments.signal_support + + inner = not flags.GET_OVERLAP_Z_HAT + z_hat = np.empty((n_atoms, *valid_support), dtype='d') + for i_seg in range(workers_segments.effective_n_seg): + worker_support = workers_segments.get_seg_support( + i_seg, inner=inner) + z_worker = np.zeros((n_atoms,) + worker_support, 'd') + comm.Recv([z_worker.ravel(), MPI.DOUBLE], source=i_seg, + tag=constants.TAG_ROOT + i_seg) + worker_slice = workers_segments.get_seg_slice( + i_seg, inner=inner) + z_hat[worker_slice] = z_worker + + return z_hat + + +def recv_z_nnz(comm, n_atoms): + return recv_reduce_sum_array(comm, n_atoms) + + +def recv_sufficient_statistics(comm, D_shape): + n_atoms, n_channels, *atom_support = D_shape + ztz_support = tuple(2 * np.array(atom_support) - 1) + ztz = recv_reduce_sum_array(comm, (n_atoms, n_atoms, *ztz_support)) + ztX = recv_reduce_sum_array(comm, (n_atoms, n_channels, *atom_support)) + return ztz, ztX + + +def recv_cost(comm): + cost = recv_reduce_sum_array(comm, 1) + return cost[0] + + +# Update the docstring +dicod.__doc__.format(STRATEGIES) diff --git a/dicodile/update_z/distributed_sparse_encoder.py b/dicodile/update_z/distributed_sparse_encoder.py index aba45086..21e15ff9 100644 --- a/dicodile/update_z/distributed_sparse_encoder.py +++ b/dicodile/update_z/distributed_sparse_encoder.py @@ -11,10 +11,10 @@ from ..utils.debugs import main_check_beta from ..utils.shape_helpers import get_valid_support -from .dicod import recv_z_hat, recv_z_nnz -from .dicod import _gather_run_statistics -from .dicod import _send_task, _send_D, _send_signal -from .dicod import recv_cost, recv_sufficient_statistics +from .communication_primitive import recv_z_hat, recv_z_nnz +from .communication_primitive import _gather_run_statistics +from .communication_primitive import _send_task, _send_D, _send_signal +from .communication_primitive import recv_cost, recv_sufficient_statistics class DistributedSparseEncoder: From 8c51462c89b47f03e186c2dbcabf57337fd61d20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Fri, 7 May 2021 11:48:37 +0200 Subject: [PATCH 08/11] shutdown reusable workers at the end of dicod --- dicodile/update_z/dicod.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dicodile/update_z/dicod.py b/dicodile/update_z/dicod.py index f0f0d92e..07ef9724 100644 --- a/dicodile/update_z/dicod.py +++ b/dicodile/update_z/dicod.py @@ -18,7 +18,7 @@ from ..utils.mpi import broadcast_array, recv_reduce_sum_array from ..utils.shape_helpers import get_valid_support, find_grid_size -from ..workers.reusable_workers import get_reusable_workers +from ..workers.reusable_workers import get_reusable_workers, shutdown_reusable_workers from ..workers.reusable_workers import send_command_to_reusable_workers @@ -154,6 +154,8 @@ def dicod(X_i, D, reg, z0=None, DtD=None, n_seg='auto', strategy='greedy', p_obj = [[run_statistics['n_updates'], run_statistics['runtime'], cost]] + + shutdown_reusable_workers() return z_hat, ztz, ztX, p_obj, run_statistics From d430195d388838c1469059cc17a2f3af15164764 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Fri, 7 May 2021 11:51:13 +0200 Subject: [PATCH 09/11] run ci on all branches --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 639430cf..1c76fd15 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -2,7 +2,7 @@ name: unittests on: pull_request: push: - branches: main +# branches: main jobs: test: From 175277312d7b6c2441d2d24497da425ca394b15e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Fri, 7 May 2021 12:37:18 +0200 Subject: [PATCH 10/11] check network interfaces on ci server --- .github/workflows/unittests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 1c76fd15..162398f0 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -28,6 +28,7 @@ jobs: conda --version which python ./ci/install_mpi.sh + ip addr env: MPI_INSTALL: ${{ matrix.mpi_install }} MPI_IMPL: ${{ matrix.mpi_impl }} From 909cdcfd5a21d03fee3ee4c084014850bf09a2b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= Date: Fri, 7 May 2021 12:43:03 +0200 Subject: [PATCH 11/11] test with eth0 --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 162398f0..66cf0f3a 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -35,7 +35,7 @@ jobs: - name: Run unit tests run: | pip install -e .[test] - mpiexec -np 1 --host localhost:16 python -m pytest --cov-report=xml --cov=dicodile + mpiexec -np 1 --mca btl_tcp_if_include eth0 --host localhost:16 python -m pytest --cov-report=xml --cov=dicodile - name: Upload coverage uses: codecov/codecov-action@v1 with: