Merge pull request #507 from darko-poljak/the-good-the-bad-and-the-ugly

The good the bad and the ugly - object prepare/run parallelization
2017-06-13 22:45:09 +02:00 · 2017-06-13 22:45:09 +02:00 · e774fc2b2a
commit e774fc2b2a
parent 8cd2136d8e e375bd464d
6 changed files with 185 additions and 36 deletions
--- a/cdist/config.py
+++ b/cdist/config.py
@ -27,6 +27,8 @@ import time
 import itertools
 import tempfile
 import socket
+import multiprocessing
+from cdist.mputil import mp_pool_run
 import atexit
 import shutil

@ -48,7 +50,7 @@ class Config(object):

        self.local = local
        self.remote = remote
-        self.log = logging.getLogger(self.local.target_host[0])
+        self._open_logger()
        self.dry_run = dry_run
        self.jobs = jobs

@ -123,7 +125,6 @@ class Config(object):
    @classmethod
    def commandline(cls, args):
        """Configure remote system"""
-        import multiprocessing

        # FIXME: Refactor relict - remove later
        log = logging.getLogger("cdist")
@ -278,6 +279,14 @@ class Config(object):
            Iterate over the objects once - helper method for
            iterate_until_finished
        """
+        if self.jobs:
+            objects_changed = self._iterate_once_parallel()
+        else:
+            objects_changed = self._iterate_once_sequential()
+        return objects_changed
+
+    def _iterate_once_sequential(self):
+        self.log.info("Iteration in sequential mode")
        objects_changed = False

        for cdist_object in self.object_list():
@ -303,6 +312,95 @@ class Config(object):

        return objects_changed

+    def _iterate_once_parallel(self):
+        """Run one iteration over all objects using parallel jobs.
+
+        The iteration has two stages. First every object in state
+        STATE_UNDEF whose requirements are finished is prepared. After
+        that every object in state STATE_PREPARED whose requirements and
+        autorequirements are finished is run. The second stage is
+        collected only after the first one completed, so it sees the
+        states resulting from the preparation.
+
+        Return True if at least one object was prepared or run, False
+        otherwise.
+        """
+        self.log.info("Iteration in parallel mode in {} jobs".format(
+            self.jobs))
+
+        # Objects with unfinished requirements cannot be handled yet;
+        # of the remaining ones only those not yet prepared are taken.
+        to_prepare = [
+            cdist_object for cdist_object in self.object_list()
+            if not cdist_object.requirements_unfinished(
+                cdist_object.requirements) and
+            cdist_object.state == core.CdistObject.STATE_UNDEF
+        ]
+        prepared = self._run_cargo(
+            self.object_prepare, to_prepare,
+            "Only one object, preparing sequentially",
+            "Starting multiprocessing Pool for {} parallel "
+            "objects preparation",
+            "Multiprocessing for parallel object preparation finished")
+
+        # A prepared object is skipped while its autorequirements are
+        # unfinished: the previous step created objects it depends on,
+        # so it has to wait for them.
+        to_run = [
+            cdist_object for cdist_object in self.object_list()
+            if not cdist_object.requirements_unfinished(
+                cdist_object.requirements) and
+            cdist_object.state == core.CdistObject.STATE_PREPARED and
+            not cdist_object.requirements_unfinished(
+                cdist_object.autorequire)
+        ]
+        ran = self._run_cargo(
+            self.object_run, to_run,
+            "Only one object, running sequentially",
+            "Starting multiprocessing Pool for {} parallel "
+            "object run",
+            "Multiprocessing for parallel object run finished")
+
+        return prepared or ran
+
+    def _run_cargo(self, func, cargo, single_msg, start_msg, done_msg):
+        """Apply func to every object in cargo; helper method for
+        _iterate_once_parallel.
+
+        A single object is handled by calling func directly, more than
+        one object is handed to mp_pool_run with self.jobs jobs.
+        single_msg, start_msg and done_msg are the debug log messages of
+        the respective stage; start_msg is formatted with the number of
+        objects.
+
+        Return True if cargo contained at least one object, False
+        otherwise.
+        """
+        if not cargo:
+            return False
+
+        n = len(cargo)
+        if n == 1:
+            self.log.debug(single_msg)
+            func(cargo[0])
+        else:
+            self.log.debug("Multiprocessing start method is {}".format(
+                multiprocessing.get_start_method()))
+            self.log.debug(start_msg.format(n))
+            args = [
+                (c, ) for c in cargo
+            ]
+            mp_pool_run(func, args, jobs=self.jobs)
+            self.log.debug(done_msg)
+        return True
+
+    def _open_logger(self):
+        """Bind self.log to the logger named after the first entry of
+        the local target_host."""
+        logger_name = self.local.target_host[0]
+        self.log = logging.getLogger(logger_name)
+
+    # logger is not picklable, so remove it when we pickle
+    def __getstate__(self):
+        """Return a shallow copy of the instance dict without the 'log'
+        entry, for use as the pickled state."""
+        return {
+            key: value for key, value in self.__dict__.items()
+            if key != 'log'
+        }
+
+    # recreate logger when we unpickle
+    def __setstate__(self, state):
+        """Restore the attributes from state, then re-create self.log
+        through _open_logger."""
+        vars(self).update(state)
+        self._open_logger()
+
    def iterate_until_finished(self):
        """
            Go through all objects and solve them
--- a/cdist/core/cdist_type.py
+++ b/cdist/core/cdist_type.py
@ -79,6 +79,9 @@ class CdistType(object):

    _instances = {}

+    def __getnewargs__(self):
+        """Return the (base_path, name) tuple that pickle hands to
+        __new__ when the instance is re-created on unpickling."""
+        new_args = (self.base_path, self.name)
+        return new_args
+
    def __new__(cls, *args, **kwargs):
        """only one instance of each named type may exist"""
        # name is second argument
--- a/cdist/core/explorer.py
+++ b/cdist/core/explorer.py
@ -24,8 +24,7 @@ import logging
 import os
 import glob
 import multiprocessing
-
-import cdist
+from cdist.mputil import mp_pool_run

 '''
 common:
@ -121,18 +120,12 @@ class Explorer(object):
            multiprocessing.get_start_method()))
        self.log.debug(("Starting multiprocessing Pool for global "
                       "explorers run"))
-        with multiprocessing.Pool(self.jobs) as pool:
-            self.log.debug("Starting async for global explorer run")
-            results = [
-                pool.apply_async(self._run_global_explorer, (e, out_path,))
-                for e in self.list_global_explorer_names()
-            ]
-
-            self.log.debug("Waiting async results for global explorer runs")
-            for r in results:
-                r.get()  # self._run_global_explorer returns None
-            self.log.debug(("Multiprocessing run for global explorers "
-                           "finished"))
+        args = [
+            (e, out_path, ) for e in self.list_global_explorer_names()
+        ]
+        mp_pool_run(self._run_global_explorer, args, jobs=self.jobs)
+        self.log.debug(("Multiprocessing run for global explorers "
+                        "finished"))

    # logger is not pickable, so remove it when we pickle
    def __getstate__(self):
--- a/cdist/core/manifest.py
+++ b/cdist/core/manifest.py
@ -98,7 +98,7 @@ class Manifest(object):
        self.target_host = target_host
        self.local = local

-        self.log = logging.getLogger(self.target_host[0])
+        self._open_logger()

        self.env = {
            'PATH': "%s:%s" % (self.local.bin_path, os.environ['PATH']),
@ -114,6 +114,21 @@ class Manifest(object):
        if self.log.getEffectiveLevel() == logging.DEBUG:
            self.env.update({'__cdist_debug': "yes"})

+    def _open_logger(self):
+        """Bind self.log to the logger named after the first entry of
+        target_host."""
+        logger_name = self.target_host[0]
+        self.log = logging.getLogger(logger_name)
+
+    # logger is not picklable, so remove it when we pickle
+    def __getstate__(self):
+        """Return a shallow copy of the instance dict without the 'log'
+        entry, for use as the pickled state."""
+        return {
+            key: value for key, value in self.__dict__.items()
+            if key != 'log'
+        }
+
+    # recreate logger when we unpickle
+    def __setstate__(self, state):
+        """Restore the attributes from state, then re-create self.log
+        through _open_logger."""
+        vars(self).update(state)
+        self._open_logger()
+
    def env_initial_manifest(self, initial_manifest):
        env = os.environ.copy()
        env.update(self.env)
--- a/cdist/exec/remote.py
+++ b/cdist/exec/remote.py
@ -31,6 +31,7 @@ import multiprocessing
 import cdist
 import cdist.exec.util as exec_util
 import cdist.util.ipaddr as ipaddr
+from cdist.mputil import mp_pool_run


 def _wrap_addr(addr):
@ -152,25 +153,16 @@ class Remote(object):
            multiprocessing.get_start_method()))
        self.log.debug(("Starting multiprocessing Pool for parallel "
                        "remote transfer"))
-        with multiprocessing.Pool(jobs) as pool:
-            self.log.debug("Starting async for parallel transfer")
-            commands = []
-            for f in glob.glob1(source, '*'):
-                command = self._copy.split()
-                path = os.path.join(source, f)
-                command.extend([path, '{0}:{1}'.format(
-                    _wrap_addr(self.target_host[0]), destination)])
-                commands.append(command)
-            results = [
-                pool.apply_async(self._run_command, (cmd,))
-                for cmd in commands
-            ]
-
-            self.log.debug("Waiting async results for parallel transfer")
-            for r in results:
-                r.get()  # self._run_command returns None
-            self.log.debug(("Multiprocessing for parallel transfer "
-                            "finished"))
+        args = []
+        for f in glob.glob1(source, '*'):
+            command = self._copy.split()
+            path = os.path.join(source, f)
+            command.extend([path, '{0}:{1}'.format(
+                _wrap_addr(self.target_host[0]), destination)])
+            args.append((command, ))
+        mp_pool_run(self._run_command, args, jobs=jobs)
+        self.log.debug(("Multiprocessing for parallel transfer "
+                        "finished"))

    def run_script(self, script, env=None, return_output=False):
        """Run the given script with the given environment on the remote side.
--- a/cdist/mputil.py
+++ b/cdist/mputil.py
@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+#
+# 2016 Darko Poljak (darko.poljak at gmail.com)
+#
+# This file is part of cdist.
+#
+# cdist is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cdist is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with cdist. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+
+
+import multiprocessing
+import itertools
+
+
+def mp_pool_run(func, args=None, kwds=None, jobs=multiprocessing.cpu_count()):
+    """ Run func using multiprocessing.Pool with jobs jobs and supplied
+        args and kwds with one entry for each parallel func instance.
+
+        args is a sequence of positional argument tuples, kwds is a
+        sequence of keyword argument dicts. If only one of them is
+        supplied, the other one is empty for every func instance. If
+        both are supplied they are paired with zip, so the shorter one
+        determines the number of func instances.
+
+        If neither args nor kwds is supplied (both are None), func is
+        called once, without arguments, in the current process. If they
+        are supplied but empty there is nothing to run: func is not
+        called and an empty list is returned.
+
+        Return list of results, in the order of the supplied entries.
+        An exception raised by func in a worker process is re-raised
+        when its result is collected.
+    """
+    if args is None and kwds is None:
+        return [func(), ]
+    if not args and not kwds:
+        # Empty work list: falling back to func() here would call func
+        # without the arguments every real entry would have supplied.
+        return []
+
+    if args and kwds:
+        fargs = zip(args, kwds)
+    elif args:
+        fargs = zip(args, itertools.repeat({}))
+    else:
+        fargs = zip(itertools.repeat(()), kwds)
+
+    with multiprocessing.Pool(jobs) as pool:
+        results = [
+            pool.apply_async(func, a, k)
+            for a, k in fargs
+        ]
+        # Collect inside the with block: leaving it terminates the pool.
+        return [r.get() for r in results]