From 75a5004ac61b7d354269378ca1bc77649d96c731 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Thu, 18 Feb 2021 11:11:17 +0100
Subject: [PATCH 01/11] run tests both on ubuntu 18.04 and 20.04 for both
 openmpi and mpich implementations

---
 .github/workflows/unittests.yml |  5 ++++-
 ci/install_mpi.sh               | 17 ++++++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index 9d9c82b3..7061a17a 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -6,14 +6,16 @@ on:
 
 jobs:
   test:
-    runs-on: ubuntu-20.04
+    runs-on: ${{ matrix.os }}
     defaults:
       run:
         shell: bash -l {0}
     strategy:
       fail-fast: false
       matrix:
+        os: [ubuntu-18.04, ubuntu-20.04]
         mpi_install: [system, conda]
+        mpi_impl: [openmpi, mpich]
         python: [3.8]
     steps:
       - uses: actions/checkout@v2
@@ -28,6 +30,7 @@ jobs:
           ./ci/install_mpi.sh
         env:
           MPI_INSTALL: ${{ matrix.mpi_install }}
+          MPI_IMPL: ${{ matrix.mpi_impl }}
       - name: Run unit tests
         run: |
           pip install -e .[test]
diff --git a/ci/install_mpi.sh b/ci/install_mpi.sh
index eb5c2de2..43a1af12 100755
--- a/ci/install_mpi.sh
+++ b/ci/install_mpi.sh
@@ -3,12 +3,23 @@ set -euo pipefail
 
 case "$MPI_INSTALL" in
     "conda")
-        conda install -y openmpi mpi4py
+        conda install -y "$MPI_IMPL" mpi4py
 	;;
     "system")
         sudo apt-get update
-	sudo apt-get install -qy libopenmpi-dev openmpi-bin
+	case "$MPI_IMPL" in
+	    "openmpi")
+		sudo apt-get install -qy libopenmpi-dev openmpi-bin
+		;;
+	    "mpich")
+		sudo apt-get install -qy mpich
+		;;
+	    *)
+		false
+		;;
+	esac
 	;;
     *)
-        false;;
+	false;;
 esac
+

From 7bbb9fc47fa52aa5238a5d3c8828c6a7749a28f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Wed, 31 Mar 2021 11:24:28 +0200
Subject: [PATCH 02/11] do not specify hostfile in env variable for tests

---
 dicodile/update_z/tests/conftest.py   | 1 -
 dicodile/update_z/tests/hostfile_test | 1 -
 2 files changed, 2 deletions(-)
 delete mode 100644 dicodile/update_z/tests/hostfile_test

diff --git a/dicodile/update_z/tests/conftest.py b/dicodile/update_z/tests/conftest.py
index fb8cb3d1..0d5f72c7 100644
--- a/dicodile/update_z/tests/conftest.py
+++ b/dicodile/update_z/tests/conftest.py
@@ -4,4 +4,3 @@
 def pytest_configure(config):
     # Set DICOD in debug mode
     os.environ["TESTING_DICOD"] = "1"
-    os.environ["MPI_HOSTFILE"] = "dicodile/update_z/tests/hostfile_test"
diff --git a/dicodile/update_z/tests/hostfile_test b/dicodile/update_z/tests/hostfile_test
deleted file mode 100644
index 05e311e1..00000000
--- a/dicodile/update_z/tests/hostfile_test
+++ /dev/null
@@ -1 +0,0 @@
-localhost slots=16

From 50b1766729a41d13f3576e1ffc89cb717dd182b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Fri, 2 Apr 2021 17:42:43 +0200
Subject: [PATCH 03/11] update pytest command to be run with mpiexec

---
 .github/workflows/unittests.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index 7061a17a..639430cf 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -34,8 +34,7 @@ jobs:
       - name: Run unit tests
         run: |
           pip install -e .[test]
-          echo "localhost slots=16">hostfile
-          pytest --cov-report=xml --cov=dicodile
+          mpiexec -np 1 --host localhost:16 python -m pytest --cov-report=xml --cov=dicodile
       - name: Upload coverage
         uses: codecov/codecov-action@v1
         with:

From cf5509357b707d83eeaa733484e6f585ae611538 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Fri, 2 Apr 2021 17:48:01 +0200
Subject: [PATCH 04/11] use host flag instead of hostfile to build the docs

---
 docs/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/Makefile b/docs/Makefile
index 96599f2a..c601fbad 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -7,7 +7,6 @@ SPHINXOPTS    +=
 SPHINXBUILD   ?= sphinx-build
 SOURCEDIR     = .
 BUILDDIR      = _build
-HOSTFILE      = ../hostfile
 ALLOW_AS_ROOT = ${ALLOW} 
 
 # Put it first so that "make" without argument is like "make help".
@@ -24,7 +23,7 @@ clean:
 
 .PHONY: html
 html:
-	mpiexec -np 1 $(ALLOW_AS_ROOT) --hostfile $(HOSTFILE) $(SPHINXBUILD) -b html $(SOURCEDIR) $(BUILDDIR)/html $(SPHINXOPTS) ${0}
+	mpiexec -np 1 $(ALLOW_AS_ROOT) --host localhost:50 $(SPHINXBUILD) -b html $(SOURCEDIR) $(BUILDDIR)/html $(SPHINXOPTS) ${0}
 	@echo
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 

From e68d6773e1ac9f6cb60ea2c5b7e206373ecdb543 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Thu, 6 May 2021 16:22:41 +0200
Subject: [PATCH 05/11] stop workers after dicodile function run

---
 dicodile/_dicodile.py                           | 2 ++
 dicodile/update_z/distributed_sparse_encoder.py | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/dicodile/_dicodile.py b/dicodile/_dicodile.py
index 46aba3bf..8ac86b43 100644
--- a/dicodile/_dicodile.py
+++ b/dicodile/_dicodile.py
@@ -227,5 +227,7 @@ def dicodile(X, D_init, reg=.1, n_iter=100, eps=1e-5, window=False,
 
     runtime = np.sum(times)
     encoder.release_workers()
+
+    encoder.shut_down_workers()
     print("[INFO:{}] Finished in {:.0f}s".format(name, runtime))
     return D_hat, z_hat, pobj, times
diff --git a/dicodile/update_z/distributed_sparse_encoder.py b/dicodile/update_z/distributed_sparse_encoder.py
index 471a2d0b..aba45086 100644
--- a/dicodile/update_z/distributed_sparse_encoder.py
+++ b/dicodile/update_z/distributed_sparse_encoder.py
@@ -5,7 +5,7 @@
 from ..utils import constants
 from ..utils.csc import compute_objective
 from ..workers.reusable_workers import get_reusable_workers
-from ..workers.reusable_workers import send_command_to_reusable_workers
+from ..workers.reusable_workers import send_command_to_reusable_workers, shutdown_reusable_workers
 
 from ..utils import debug_flags as flags
 from ..utils.debugs import main_check_beta
@@ -139,6 +139,9 @@ def get_sufficient_statistics(self):
     def release_workers(self):
         send_command_to_reusable_workers(constants.TAG_DICODILE_STOP)
 
+    def shut_down_workers(self):
+        shutdown_reusable_workers()
+
     def check_cost(self, X, D_hat, reg):
         cost = self.get_cost()
         z_hat = self.get_z_hat()

From 6e0e1ed33f39295833ea27a456f2e2799519786a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Thu, 6 May 2021 16:24:05 +0200
Subject: [PATCH 06/11] stop workers at the end of test run

---
 dicodile/update_z/tests/test_distributed_sparse_encoder.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dicodile/update_z/tests/test_distributed_sparse_encoder.py b/dicodile/update_z/tests/test_distributed_sparse_encoder.py
index 06aea6bd..ecb3d863 100644
--- a/dicodile/update_z/tests/test_distributed_sparse_encoder.py
+++ b/dicodile/update_z/tests/test_distributed_sparse_encoder.py
@@ -51,3 +51,5 @@ def test_distributed_sparse_encoder():
     ztX = compute_ztX(z_hat, X)
     assert np.allclose(ztz, ztz_distrib)
     assert np.allclose(ztX, ztX_distrib)
+
+    encoder.shut_down_workers()

From 5d0cf39a21ae5231c38dc2d58783db07353998e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Thu, 6 May 2021 16:34:39 +0200
Subject: [PATCH 07/11] copy dicod.py to communication_primitive.py

---
 dicodile/update_z/communication_primitive.py  | 382 ++++++++++++++++++
 .../update_z/distributed_sparse_encoder.py    |   8 +-
 2 files changed, 386 insertions(+), 4 deletions(-)
 create mode 100644 dicodile/update_z/communication_primitive.py

diff --git a/dicodile/update_z/communication_primitive.py b/dicodile/update_z/communication_primitive.py
new file mode 100644
index 00000000..f0f0d92e
--- /dev/null
+++ b/dicodile/update_z/communication_primitive.py
@@ -0,0 +1,382 @@
+"""Convolutional Sparse Coding with DICOD
+
+Author : tommoral <thomas.moreau@inria.fr>
+"""
+
+import time
+import logging
+import numpy as np
+from mpi4py import MPI
+
+from ..utils import constants
+from ..utils import debug_flags as flags
+from ..utils.csc import compute_objective
+from ..utils.debugs import main_check_beta
+from .coordinate_descent import STRATEGIES
+from ..utils.segmentation import Segmentation
+from .coordinate_descent import coordinate_descent
+from ..utils.mpi import broadcast_array, recv_reduce_sum_array
+from ..utils.shape_helpers import get_valid_support, find_grid_size
+
+from ..workers.reusable_workers import get_reusable_workers
+from ..workers.reusable_workers import send_command_to_reusable_workers
+
+
+log = logging.getLogger('dicod')
+
+# debug flags
+
+interactive_exec = "xterm"
+interactive_args = ["-fa", "Monospace", "-fs", "12", "-e", "ipython", "-i"]
+
+
+def dicod(X_i, D, reg, z0=None, DtD=None, n_seg='auto', strategy='greedy',
+          soft_lock='border', n_workers=1, w_world='auto', hostfile=None,
+          tol=1e-5, max_iter=100000, timeout=None, z_positive=False,
+          return_ztz=False, warm_start=False, freeze_support=False,
+          timing=False, random_state=None, verbose=0, debug=False):
+    """DICOD for 2D convolutional sparse coding.
+
+    Parameters
+    ----------
+    X_i : ndarray, shape (n_channels, *sig_support)
+        Image to encode on the dictionary D
+    D : ndarray, shape (n_atoms, n_channels, *atom_support)
+        Current dictionary for the sparse coding
+    reg : float
+        Regularization parameter
+    z0 : ndarray, shape (n_atoms, *valid_support) or None
+        Warm start value for z_hat. If None, z_hat is initialized to 0.
+    DtD : ndarray, shape (n_atoms, n_atoms, 2 valid_support - 1) or None
+        Warm start value for DtD. If None, it is computed in each worker.
+    n_seg : int or {{ 'auto' }}
+        Number of segments to use for each dimension. If set to 'auto' use
+        segments of twice the size of the dictionary.
+    strategy : str in {}
+        Coordinate selection scheme for the coordinate descent. If set to
+        'greedy', the coordinate with the largest value for dz_opt is selected.
+        If set to 'random', the coordinate is chosen uniformly on the segment.
+    soft_lock : str in {{ 'none', 'corner', 'border' }}
+        Soft-lock variant to use in LGCD, one of the values listed above.
+    n_workers : int
+        Number of workers used to compute the convolutional sparse coding
+        solution.
+    w_world : int or {{'auto'}}
+        Number of jobs used per row in the splitting grid. This should divide
+        n_workers.
+    hostfile : str
+        File containing the cluster information. See MPI documentation to have
+        the format of this file.
+    tol : float
+        Tolerance for the minimal update size in this algorithm.
+    max_iter : int
+        Maximal number of iteration run by this algorithm.
+    timeout : int
+        Timeout for the algorithm in seconds
+    z_positive : boolean
+        If set to true, the activations are constrained to be positive.
+    return_ztz : boolean
+        If True, returns the constants ztz and ztX, used to compute D-updates.
+    warm_start : boolean
+        If set to True, start from the previous solution z_hat if it exists.
+    freeze_support : boolean
+        If set to True, only update the coefficient that are non-zero in z0.
+    timing : boolean
+        If set to True, log the cost and timing information.
+    random_state : None or int or RandomState
+        current random state to seed the random number generator.
+    verbose : int
+        Verbosity level of the algorithm.
+
+    Return
+    ------
+    z_hat : ndarray, shape (n_atoms, *valid_support)
+        Activation associated to X_i for the given dictionary D
+    """
+    if strategy == 'lgcd':
+        strategy = 'greedy'
+        assert n_seg == 'auto', "strategy='lgcd' only work with n_seg='auto'."
+    elif strategy == 'gcd':
+        strategy = 'greedy'
+        assert n_seg == 'auto', "strategy='gcd' only work with n_seg='auto'."
+        n_seg = 1
+
+    # Parameters validation
+    n_channels, *sig_support = X_i.shape
+    n_atoms, n_channels, *atom_support = D.shape
+    assert D.ndim - 1 == X_i.ndim
+    valid_support = get_valid_support(sig_support, atom_support)
+
+    assert soft_lock in ['none', 'corner', 'border']
+    assert strategy in ['greedy', 'random', 'cyclic', 'cyclic-r']
+
+    if n_workers == 1:
+        return coordinate_descent(
+            X_i, D, reg, z0=z0, DtD=DtD, n_seg=n_seg, strategy=strategy,
+            tol=tol, max_iter=max_iter, timeout=timeout, z_positive=z_positive,
+            freeze_support=freeze_support, return_ztz=return_ztz,
+            timing=timing, random_state=random_state, verbose=verbose)
+
+    params = dict(
+        strategy=strategy, tol=tol, max_iter=max_iter, timeout=timeout,
+        n_seg=n_seg, z_positive=z_positive, verbose=verbose, timing=timing,
+        debug=debug, random_state=random_state, reg=reg, return_ztz=return_ztz,
+        soft_lock=soft_lock, precomputed_DtD=DtD is not None,
+        freeze_support=freeze_support, warm_start=warm_start
+    )
+
+    comm = _spawn_workers(n_workers, hostfile)
+    t_transfert, workers_segments = _send_task(comm, X_i, D, z0, DtD, w_world,
+                                               params)
+
+    if flags.CHECK_WARM_BETA:
+        main_check_beta(comm, workers_segments)
+
+    if verbose > 0:
+        print('\r[INFO:DICOD-{}] End transfert - {:.4}s'
+              .format(workers_segments.effective_n_seg, t_transfert).ljust(80))
+
+    # Wait for the result computation
+    comm.Barrier()
+    run_statistics = _gather_run_statistics(
+        comm, workers_segments, verbose=verbose)
+
+    z_hat, ztz, ztX, cost, _log, t_reduce = _recv_result(
+        comm, D.shape, valid_support, workers_segments, return_ztz=return_ztz,
+        timing=timing, verbose=verbose)
+    comm.Barrier()
+
+    if timing:
+        p_obj = reconstruct_pobj(X_i, D, reg, _log, t_transfert, t_reduce,
+                                 n_workers=n_workers,
+                                 valid_support=valid_support, z0=z0)
+    else:
+        p_obj = [[run_statistics['n_updates'],
+                  run_statistics['runtime'],
+                  cost]]
+    return z_hat, ztz, ztX, p_obj, run_statistics
+
+
+def reconstruct_pobj(X, D, reg, _log, t_init, t_reduce, n_workers,
+                     valid_support=None, z0=None):
+    """Rebuild the cost curve, as (count, time, cost) rows, from _log."""
+    n_atoms = D.shape[0]
+    z_hat = np.zeros((n_atoms, *valid_support)) if z0 is None else np.copy(z0)
+
+    # Re-order the updates
+    _log.sort()
+    max_ii = [0] * n_workers
+    for _, ii, rank, *_ in _log:
+        max_ii[rank] = max(max_ii[rank], ii)
+    max_ii = np.sum(max_ii)
+
+    up_ii = 0
+    p_obj = [(up_ii, t_init, compute_objective(X, z_hat, D, reg))]
+    next_ii_cost = 1
+    last_ii = [0] * n_workers
+    for i, (t_update, ii, rank, k0, pt0, dz) in enumerate(_log):
+        z_hat[k0][tuple(pt0)] += dz
+        up_ii += ii - last_ii[rank]
+        last_ii[rank] = ii
+        # The cost is only sampled when the count passes a threshold (x1.3)
+        if up_ii >= next_ii_cost:
+            p_obj.append((up_ii, t_update + t_init,
+                          compute_objective(X, z_hat, D, reg)))
+            next_ii_cost = next_ii_cost * 1.3
+            print("\rReconstructing cost {:7.2%}"
+                  .format(np.log2(up_ii)/np.log2(max_ii)), end='', flush=True)
+        elif (i + 1) % 1000 == 0:
+            # Otherwise refresh the progress display every 1000 log entries
+            print("\rReconstructing cost {:7.2%}"
+                  .format(np.log2(up_ii)/np.log2(max_ii)), end='', flush=True)
+    print('\rReconstruction cost: done'.ljust(40))
+
+    final_cost = compute_objective(X, z_hat, D, reg)
+    p_obj.append((up_ii, t_update, final_cost))
+    p_obj.append((up_ii, t_init + t_update + t_reduce, final_cost))
+    return np.array(p_obj)
+
+
+def _spawn_workers(n_workers, hostfile):
+    comm = get_reusable_workers(n_workers, hostfile=hostfile)
+    send_command_to_reusable_workers(constants.TAG_WORKER_RUN_DICOD)
+    return comm
+
+
+def _send_task(comm, X, D, z0, DtD, w_world, params):
+    t_start = time.time()
+    n_atoms, n_channels, *atom_support = D.shape
+
+    _send_params(comm, params)
+
+    _send_D(comm, D, DtD)
+
+    workers_segments = _send_signal(comm, w_world, atom_support, X, z0)
+
+    t_init = time.time() - t_start
+    return t_init, workers_segments
+
+
+def _send_params(comm, params):
+    comm.bcast(params, root=MPI.ROOT)
+
+
+def _send_D(comm, D, DtD=None):
+    broadcast_array(comm, D)
+    if DtD is not None:
+        broadcast_array(comm, DtD)
+
+
+def _send_signal(comm, w_world, atom_support, X, z0=None):
+    n_workers = comm.Get_remote_size()
+    n_channels, *full_support = X.shape
+    valid_support = get_valid_support(full_support, atom_support)
+    overlap = tuple(np.array(atom_support) - 1)
+
+    X_info = dict(has_z0=z0 is not None, valid_support=valid_support)
+
+    if w_world == 'auto':
+        X_info["workers_topology"] = find_grid_size(
+            n_workers, valid_support, atom_support
+        )
+    else:
+        assert n_workers % w_world == 0
+        X_info["workers_topology"] = w_world, n_workers // w_world
+
+    # compute a segmentation for the image,
+    workers_segments = Segmentation(n_seg=X_info['workers_topology'],
+                                    signal_support=valid_support,
+                                    overlap=overlap)
+
+    # Make sure that each worker has at least a segment of twice the size of
+    # the dictionary. If this is not the case, the algorithm is not valid as it
+    # is possible to have interference with workers that are not neighbors.
+    worker_support = workers_segments.get_seg_support(0, inner=True)
+    msg = ("The size of the support in each worker is smaller than twice the "
+           "size of the atom support. The algorithm is does not converge in "
+           "this condition. Reduce the number of cores.\n"
+           f"worker: {worker_support}, atom: {atom_support}, "
+           f"topology: {X_info['workers_topology']}")
+    assert all(
+        (np.array(worker_support) >= 2 * np.array(atom_support))
+        | (np.array(X_info['workers_topology']) == 1)), msg
+
+    # Broadcast the info about this signal to the workers
+    comm.bcast(X_info, root=MPI.ROOT)
+
+    X = np.array(X, dtype='d')
+
+    for i_seg in range(n_workers):
+        if z0 is not None:
+            worker_slice = workers_segments.get_seg_slice(i_seg)
+            _send_array(comm, i_seg, z0[worker_slice])
+        seg_bounds = workers_segments.get_seg_bounds(i_seg)
+        X_worker_slice = (Ellipsis,) + tuple([
+            slice(start, end + size_atom_ax - 1)
+            for (start, end), size_atom_ax in zip(seg_bounds, atom_support)
+        ])
+        _send_array(comm, i_seg, X[X_worker_slice])
+
+    # Synchronize the multiple send with a Barrier
+    comm.Barrier()
+    return workers_segments
+
+
+def _send_array(comm, dest, arr):
+    comm.Send([arr.ravel(), MPI.DOUBLE],
+              dest=dest, tag=constants.TAG_ROOT + dest)
+
+
+def _gather_run_statistics(comm, workers_segments, verbose=0):
+    n_workers = workers_segments.effective_n_seg
+
+    if flags.CHECK_FINAL_BETA:
+        main_check_beta(comm, workers_segments)
+
+    stats = np.array(comm.gather(None, root=MPI.ROOT))
+    iterations, n_coordinate_updates = np.sum(stats[:, :2], axis=0)
+    runtime, t_local_init, t_run = np.max(stats[:, 2:5], axis=0)
+    t_select = np.mean(stats[:, -2], axis=0)
+    t_update = np.mean([s for s in stats[:, -1] if s is not None])
+    if verbose > 1:
+        print("\r[INFO:DICOD-{}] converged in {:.3f}s ({:.3f}s) with "
+              "{:.0f} iterations ({:.0f} updates).".format(
+                  n_workers, runtime, t_run, iterations, n_coordinate_updates))
+    if verbose > 5:
+        print(f"\r[DEBUG:DICOD-{n_workers}] t_select={t_select:.3e}s "
+              f"t_update={t_update:.3e}s")
+    run_statistics = dict(
+        iterations=iterations, runtime=runtime, t_init=t_local_init,
+        t_run=t_run, n_updates=n_coordinate_updates, t_select=t_select,
+        t_update=t_update
+    )
+    return run_statistics
+
+
+def _recv_result(comm, D_shape, valid_support, workers_segments,
+                 return_ztz=False, timing=False, verbose=0):
+    n_atoms, n_channels, *atom_support = D_shape
+
+    t_start = time.time()
+
+    z_hat = recv_z_hat(comm, n_atoms=n_atoms,
+                       workers_segments=workers_segments)
+
+    if return_ztz:
+        ztz, ztX = recv_sufficient_statistics(comm, D_shape)
+    else:
+        ztz, ztX = None, None
+
+    cost = recv_cost(comm)
+
+    _log = []
+    if timing:
+        for i_seg in range(workers_segments.effective_n_seg):
+            _log.extend(comm.recv(source=i_seg))
+
+    t_reduce = time.time() - t_start
+    if verbose >= 5:
+        print('\r[DEBUG:DICOD-{}] End finalization - {:.4}s'
+              .format(workers_segments.effective_n_seg, t_reduce))
+
+    return z_hat, ztz, ztX, cost, _log, t_reduce
+
+
+def recv_z_hat(comm, n_atoms, workers_segments):
+
+    valid_support = workers_segments.signal_support
+
+    inner = not flags.GET_OVERLAP_Z_HAT
+    z_hat = np.empty((n_atoms, *valid_support), dtype='d')
+    for i_seg in range(workers_segments.effective_n_seg):
+        worker_support = workers_segments.get_seg_support(
+            i_seg, inner=inner)
+        z_worker = np.zeros((n_atoms,) + worker_support, 'd')
+        comm.Recv([z_worker.ravel(), MPI.DOUBLE], source=i_seg,
+                  tag=constants.TAG_ROOT + i_seg)
+        worker_slice = workers_segments.get_seg_slice(
+            i_seg, inner=inner)
+        z_hat[worker_slice] = z_worker
+
+    return z_hat
+
+
+def recv_z_nnz(comm, n_atoms):
+    return recv_reduce_sum_array(comm, n_atoms)
+
+
+def recv_sufficient_statistics(comm, D_shape):
+    n_atoms, n_channels, *atom_support = D_shape
+    ztz_support = tuple(2 * np.array(atom_support) - 1)
+    ztz = recv_reduce_sum_array(comm, (n_atoms, n_atoms, *ztz_support))
+    ztX = recv_reduce_sum_array(comm, (n_atoms, n_channels, *atom_support))
+    return ztz, ztX
+
+
+def recv_cost(comm):
+    cost = recv_reduce_sum_array(comm, 1)
+    return cost[0]
+
+
+# str.format returns a new string: assign it back to update the docstring
+dicod.__doc__ = dicod.__doc__.format(STRATEGIES)
diff --git a/dicodile/update_z/distributed_sparse_encoder.py b/dicodile/update_z/distributed_sparse_encoder.py
index aba45086..21e15ff9 100644
--- a/dicodile/update_z/distributed_sparse_encoder.py
+++ b/dicodile/update_z/distributed_sparse_encoder.py
@@ -11,10 +11,10 @@
 from ..utils.debugs import main_check_beta
 from ..utils.shape_helpers import get_valid_support
 
-from .dicod import recv_z_hat, recv_z_nnz
-from .dicod import _gather_run_statistics
-from .dicod import _send_task, _send_D, _send_signal
-from .dicod import recv_cost, recv_sufficient_statistics
+from .communication_primitive import recv_z_hat, recv_z_nnz
+from .communication_primitive import _gather_run_statistics
+from .communication_primitive import _send_task, _send_D, _send_signal
+from .communication_primitive import recv_cost, recv_sufficient_statistics
 
 
 class DistributedSparseEncoder:

From 8c51462c89b47f03e186c2dbcabf57337fd61d20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Fri, 7 May 2021 11:48:37 +0200
Subject: [PATCH 08/11] shutdown reusable workers at the end of dicod

---
 dicodile/update_z/dicod.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dicodile/update_z/dicod.py b/dicodile/update_z/dicod.py
index f0f0d92e..07ef9724 100644
--- a/dicodile/update_z/dicod.py
+++ b/dicodile/update_z/dicod.py
@@ -18,7 +18,7 @@
 from ..utils.mpi import broadcast_array, recv_reduce_sum_array
 from ..utils.shape_helpers import get_valid_support, find_grid_size
 
-from ..workers.reusable_workers import get_reusable_workers
+from ..workers.reusable_workers import get_reusable_workers, shutdown_reusable_workers
 from ..workers.reusable_workers import send_command_to_reusable_workers
 
 
@@ -154,6 +154,8 @@ def dicod(X_i, D, reg, z0=None, DtD=None, n_seg='auto', strategy='greedy',
         p_obj = [[run_statistics['n_updates'],
                   run_statistics['runtime'],
                   cost]]
+
+    shutdown_reusable_workers()
     return z_hat, ztz, ztX, p_obj, run_statistics
 
 

From d430195d388838c1469059cc17a2f3af15164764 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Fri, 7 May 2021 11:51:13 +0200
Subject: [PATCH 09/11] run ci on all branches

---
 .github/workflows/unittests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index 639430cf..1c76fd15 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -2,7 +2,7 @@ name: unittests
 on:
   pull_request:
   push:
-    branches: main
+#    branches: main
 
 jobs:
   test:

From 175277312d7b6c2441d2d24497da425ca394b15e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Fri, 7 May 2021 12:37:18 +0200
Subject: [PATCH 10/11] check network interfaces on ci server

---
 .github/workflows/unittests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index 1c76fd15..162398f0 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -28,6 +28,7 @@ jobs:
           conda --version
           which python
           ./ci/install_mpi.sh
+          ip addr
         env:
           MPI_INSTALL: ${{ matrix.mpi_install }}
           MPI_IMPL: ${{ matrix.mpi_impl }}

From 909cdcfd5a21d03fee3ee4c084014850bf09a2b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande.gozukan@inria.fr>
Date: Fri, 7 May 2021 12:43:03 +0200
Subject: [PATCH 11/11] test with eth0

---
 .github/workflows/unittests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index 162398f0..66cf0f3a 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -35,7 +35,7 @@ jobs:
       - name: Run unit tests
         run: |
           pip install -e .[test]
-          mpiexec -np 1 --host localhost:16 python -m pytest --cov-report=xml --cov=dicodile
+          OMPI_MCA_btl_tcp_if_include=eth0 mpiexec -np 1 --host localhost:16 python -m pytest --cov-report=xml --cov=dicodile
       - name: Upload coverage
         uses: codecov/codecov-action@v1
         with: