# ucloud-scheduler/main.py

# TODO
#  1. On startup, check whether any VM already has the status REQUESTED_NEW
#     (main() performs an initial get_prefix pass before it starts watching).
#  2. Send an email to the address defined by env['admin-email']
#     when no host has enough resources left.
#  3. (v3) Introduce a status endpoint for the scheduler -
#     maybe expose a Prometheus-compatible output.

import json
import argparse
import logging

from decouple import config
from collections import Counter
from functools import reduce
from etcd3_wrapper import Etcd3Wrapper
from datetime import datetime

# Configure the root logger once at import time: records of level DEBUG and
# above are appended (filemode "a") to "log.txt", a relative path, so the file
# is created in the process's current working directory.
logging.basicConfig(
    level=logging.DEBUG,
    filename="log.txt",
    filemode="a",
    format="%(asctime)s: %(levelname)s - %(message)s",
    datefmt="%d-%b-%y %H:%M:%S",
)


class VmPool(object):
    """Snapshot of all VM entries stored under a key prefix.

    ``vms`` is a list of ``(key, value)`` tuples where ``value`` is the
    JSON-decoded VM entry (expected to be a dict).

    The static filter helpers work on such lists and never raise when an
    entry lacks the inspected field: a VM without ``"hostname"`` belongs to
    no host, and a VM without ``"status"`` matches no status.
    """

    def __init__(self, etcd_client, vm_prefix):
        """Load and decode every entry found under ``vm_prefix``.

        :param etcd_client: object exposing ``get_prefix(prefix)`` that
            yields items with ``key`` and JSON-encoded ``value`` attributes.
        :param vm_prefix: key prefix under which VM entries are stored.
        :raises json.JSONDecodeError: if a stored value is not valid JSON.
        """
        self.client = etcd_client

        _vms = self.client.get_prefix(vm_prefix)
        self.vms = [(vm.key, json.loads(vm.value)) for vm in _vms]

    @staticmethod
    def by_host(vms, host):
        """Return the VMs whose ``"hostname"`` equals ``host``."""
        # .get(): an entry with no "hostname" key simply matches no host
        # instead of aborting the whole filter with a KeyError.
        return [vm for vm in vms if vm[1].get("hostname") == host]

    @staticmethod
    def by_status(vms, status):
        """Return the VMs whose ``"status"`` equals ``status``."""
        return [vm for vm in vms if vm[1].get("status") == status]

    @staticmethod
    def except_status(vms, status):
        """Return the VMs whose ``"status"`` differs from ``status``.

        VMs without a ``"status"`` field are kept, since they are not in
        the excluded status.
        """
        return [vm for vm in vms if vm[1].get("status") != status]


def accumulated_specs(vms_specs):
    """Sum a sequence of spec mappings key by key.

    An empty (falsy) input yields ``{}``. Otherwise the specs are folded
    pairwise with ``Counter`` addition, so a single-element input is
    returned as-is and longer inputs produce a ``Counter``.
    """

    def _merge(left, right):
        # Counter addition adds the values of matching keys.
        return Counter(left) + Counter(right)

    return reduce(_merge, vms_specs) if vms_specs else {}


def remaining_resources(host_specs, vms_specs):
    """Compute ``host_specs - vms_specs`` per resource.

    Returns a new ``Counter``; values may be zero or negative when the
    VMs ask for more than the host offers. Neither argument is modified.
    """
    consumed = Counter(vms_specs)
    left_over = Counter(host_specs)
    # subtract() keeps zero and negative results, unlike the "-" operator.
    left_over.subtract(consumed)
    return left_over


def get_suitable_host(etcd_client, vm_prefix, host_prefix, vm_specs):
    """Return the key of the first ALIVE host that can fit ``vm_specs``.

    For every host whose status is ``"ALIVE"``, the specs of the VMs placed
    on it (VMs still in ``"REQUESTED_NEW"`` are not counted) are summed and
    subtracted from the host's own specs; the requested ``vm_specs`` are then
    subtracted as well. The host qualifies when no resource ends up negative.

    :param etcd_client: client exposing ``get_prefix``.
    :param vm_prefix: key prefix under which VM entries are stored.
    :param host_prefix: key prefix under which host entries are stored.
    :param vm_specs: mapping of resource name to requested amount.
    :return: the key of the chosen host, or ``None`` if none qualifies.
    """
    # Read the VM list once; it is reused for every host below.
    vm_pool = VmPool(etcd_client, vm_prefix)
    hosts = etcd_client.get_prefix(host_prefix, value_in_json=True)

    for host in hosts:
        host_name, host_value = host.key, host.value

        # A host entry without a "status" field is treated as not ALIVE
        # rather than aborting the search with a KeyError.
        if host_value.get("status") != "ALIVE":
            continue

        # VMs placed on this host, ignoring the ones not yet scheduled.
        vms = VmPool.by_host(vm_pool.vms, host_name)
        vms = VmPool.except_status(vms, "REQUESTED_NEW")

        # Combined specs of everything already placed on the host.
        allocated = accumulated_specs([vm[1]["specs"] for vm in vms])

        # host_specs - already placed VMs
        remaining = remaining_resources(host_value["specs"], allocated)
        logging.debug(
            "Host %s: specs=%s allocated=%s remaining=%s",
            host_name, host_value, allocated, remaining,
        )

        # ... - new VM; any negative value means the VM does not fit.
        remaining = remaining_resources(remaining, vm_specs)
        if all(amount >= 0 for amount in remaining.values()):
            return host_name

    return None


def dead_host_detection(hosts, heartbeat_timeout=60):
    """Return the keys of hosts that should be considered dead.

    A host is reported when:

    * its value lacks ``"status"`` or ``"last_heartbeat"``;
    * its ``"last_heartbeat"`` cannot be parsed as an ISO-8601 timestamp;
    * its last heartbeat is older than ``heartbeat_timeout`` seconds.

    Hosts whose status is already ``"DEAD"`` are skipped.

    :param hosts: iterable of objects with ``key`` and ``value`` (dict)
        attributes.
    :param heartbeat_timeout: maximum heartbeat age in seconds before a host
        counts as dead (default 60).
    :return: list of keys of the newly detected dead hosts, in input order.
    """
    # Local import: only this function needs it.
    from datetime import timezone

    now = datetime.now(timezone.utc)
    dead_hosts_keys = []
    for host in hosts:
        # Bring out your dead! - Monty Python and the Holy Grail
        value = host.value

        if "status" not in value or "last_heartbeat" not in value:
            dead_hosts_keys.append(host.key)
            continue

        # Don't count the ones that are already buried
        if value["status"] == "DEAD":
            continue

        try:
            last_heartbeat = datetime.fromisoformat(value["last_heartbeat"])
        except (TypeError, ValueError):
            # An unreadable heartbeat is no proof of life; do not let one
            # corrupt entry crash the whole detection pass.
            logging.warning(
                "Host %s has an invalid last_heartbeat: %r",
                host.key, value["last_heartbeat"],
            )
            dead_hosts_keys.append(host.key)
            continue

        # Naive timestamps are interpreted as UTC; aware ones are compared
        # as-is.
        if last_heartbeat.tzinfo is None:
            last_heartbeat = last_heartbeat.replace(tzinfo=timezone.utc)

        if (now - last_heartbeat).total_seconds() > heartbeat_timeout:
            dead_hosts_keys.append(host.key)

    return dead_hosts_keys


def dead_host_mitigation(client: Etcd3Wrapper, dead_hosts_keys, vm_prefix=None):
    """Mark hosts as dead and release the VMs that were placed on them.

    For every key in ``dead_hosts_keys`` the host entry gets status
    ``"DEAD"`` and a fresh ``"last_heartbeat"``. Every VM whose
    ``"hostname"`` equals that key is detached (``"hostname"`` set to
    ``""``) and its status set to ``"REQUESTED_START"``.

    :param client: etcd wrapper used to read and write entries.
    :param dead_hosts_keys: iterable of host keys to bury.
    :param vm_prefix: key prefix of the VM entries; defaults to the
        ``VM_PREFIX`` configuration value when omitted.
    """
    for host_key in dead_hosts_keys:
        host = client.get(host_key, value_in_json=True)
        host.value["status"] = "DEAD"
        # Stored as a naive UTC ISO timestamp.
        host.value["last_heartbeat"] = datetime.utcnow().isoformat()
        client.put(host.key, host.value, value_in_json=True)

        # Resolve the prefix lazily so configuration is only consulted when
        # there is actually a dead host to handle.
        prefix = config("VM_PREFIX") if vm_prefix is None else vm_prefix

        # Find all vms that were hosted on this dead host
        for vm in client.get_prefix(prefix, value_in_json=True):
            # VMs without a "hostname" are not placed anywhere; skip them.
            if vm.value.get("hostname") != host_key:
                continue

            # Clear the same "hostname" field that the filter above matches
            # on, so the VM is no longer bound to the dead host.
            vm.value["hostname"] = ""
            vm.value["status"] = "REQUESTED_START"
            client.put(vm.key, vm.value, value_in_json=True)


def main(vm_prefix, host_prefix):
    """Run the scheduler loop.

    Connects to etcd using the ``ETCD_HOST`` / ``ETCD_PORT`` configuration
    values, first processes the entries that already exist under
    ``vm_prefix`` and then processes the events delivered by watching that
    prefix.

    * status ``"TIMEOUT"``: run dead host detection and mitigation
      (presumably emitted by ``watch_prefix`` when its timeout elapses -
      confirm against Etcd3Wrapper).
    * status ``"REQUESTED_NEW"``: look for a suitable host; on success the
      entry is rewritten with status ``"SCHEDULED_DEPLOY"`` and the chosen
      ``"hostname"``.

    :param vm_prefix: key prefix under which VM entries are stored.
    :param host_prefix: key prefix under which host entries are stored.
    """
    client = Etcd3Wrapper(
        host=config("ETCD_HOST"), port=int(config("ETCD_PORT"))
    )

    # Existing entries first (pending requests from before startup), then
    # live events.
    event_sources = [
        client.get_prefix(vm_prefix),
        client.watch_prefix(vm_prefix, timeout=10),
    ]

    for events_iterator in event_sources:
        for e in events_iterator:
            try:
                e.value = json.loads(e.value)
            except json.JSONDecodeError:
                logging.error(f"Invalid JSON {e.value}")
                continue

            # The first argument of logging.debug is the format string, so
            # key and value must be passed as arguments to it.
            logging.debug("%s %s", e.key, e.value)

            # Valid JSON is not necessarily a VM entry; skip anything that
            # has no status instead of stopping the scheduler.
            if not isinstance(e.value, dict) or "status" not in e.value:
                logging.error("Entry %s has no status: %s", e.key, e.value)
                continue

            e_status = e.value["status"]

            if e_status == "TIMEOUT":
                logging.info("Timeout")
                hosts = client.get_prefix(host_prefix, value_in_json=True)
                dead_hosts = dead_host_detection(hosts)
                dead_host_mitigation(client, dead_hosts)

            elif e_status == "REQUESTED_NEW":
                host_name = get_suitable_host(
                    client, vm_prefix, host_prefix, e.value["specs"]
                )
                if host_name:
                    e.value["status"] = "SCHEDULED_DEPLOY"
                    e.value["hostname"] = host_name
                    client.put(e.key, json.dumps(e.value))
                else:
                    # TODO: email admin (not implemented yet)
                    logging.warning(
                        "No resource left to schedule %s; "
                        "admin notification is not implemented yet",
                        e.key,
                    )


if __name__ == "__main__":
    # Both prefixes are optional on the command line and fall back to the
    # configuration values of the same (upper-cased) name.
    cli = argparse.ArgumentParser()
    for flag, setting in (
        ("--vm_prefix", "VM_PREFIX"),
        ("--host_prefix", "HOST_PREFIX"),
    ):
        cli.add_argument(flag, required=False, default=config(setting))
    options = cli.parse_args()

    main(options.vm_prefix, options.host_prefix)
initial code 2019-06-25 16:39:29 +05:00			`# TODO`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`# 1. on startup check if there is any VM with status REQUESTED_NEW already`
			`# 2. send an email to an email address defined by env['admin-email']`
			`# if resources are finished`
			`# 3. v3) Introduce a status endpoint of the scheduler -`
			`# maybe expose a prometheus compatible output`
initial code 2019-06-25 16:39:29 +05:00
			`import json`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`import argparse`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`import logging`
initial code 2019-06-25 16:39:29 +05:00
			`from decouple import config`
			`from collections import Counter`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`from functools import reduce`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`from etcd3_wrapper import Etcd3Wrapper`
			`from datetime import datetime`

			`logging.basicConfig(`
			`level=logging.DEBUG,`
			`filename="log.txt",`
			`filemode="a",`
			`format="%(asctime)s: %(levelname)s - %(message)s",`
			`datefmt="%d-%b-%y %H:%M:%S",`
			`)`
initial code 2019-06-25 16:39:29 +05:00

			`class VmPool(object):`
			`def __init__(self, etcd_client, vm_prefix):`
			`self.client = etcd_client`
			`self.vms = []`

			`_vms = self.client.get_prefix(vm_prefix)`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`self.vms = [(vm.key, json.loads(vm.value)) for vm in _vms]`
initial code 2019-06-25 16:39:29 +05:00
			`@staticmethod`
			`def by_host(vms, host):`
			`return list(filter(lambda x: x[1]["hostname"] == host, vms))`

			`@staticmethod`
			`def by_status(vms, status):`
			`return list(filter(lambda x: x[1]["status"] == status, vms))`

			`@staticmethod`
			`def except_status(vms, status):`
			`return list(filter(lambda x: x[1]["status"] != status, vms))`


simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`def accumulated_specs(vms_specs):`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`if not vms_specs:`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`return {}`
			`return reduce((lambda x, y: Counter(x) + Counter(y)), vms_specs)`
initial code 2019-06-25 16:39:29 +05:00

simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`def remaining_resources(host_specs, vms_specs):`
initial code 2019-06-25 16:39:29 +05:00			`"""Return remaining resources host_specs - vms"""`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`vms_specs = Counter(vms_specs)`
initial code 2019-06-25 16:39:29 +05:00			`remaining = Counter(host_specs)`
			`remaining.subtract(vms_specs)`

			`return remaining`

VMs on Dead Host should get status REQUESTED_START instead of REQUESTED_NEW 2019-07-22 11:32:29 +05:00
initial code 2019-06-25 16:39:29 +05:00			`def get_suitable_host(etcd_client, vm_prefix, host_prefix, vm_specs):`
			`vm_pool = VmPool(etcd_client, vm_prefix)`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`hosts = etcd_client.get_prefix(host_prefix, value_in_json=True)`
			`hosts = filter(lambda h: h.value["status"] == "ALIVE", hosts)`
initial code 2019-06-25 16:39:29 +05:00
			`for host in hosts:`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`_host_name, host_value = (host.key, host.value)`
initial code 2019-06-25 16:39:29 +05:00
			`# Get All Virtual Machines`
			`vms = vm_pool.vms`

			`# Filter them by host_name`
			`vms = VmPool.by_host(vms, _host_name)`

			`# Filter them by status`
			`vms = VmPool.except_status(vms, "REQUESTED_NEW")`

simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`running_vms_specs = [vm[1]["specs"] for vm in vms]`
initial code 2019-06-25 16:39:29 +05:00			`# Accumulate all of their combined specs`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`running_vms_accumulated_specs = accumulated_specs(running_vms_specs)`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`print(running_vms_accumulated_specs)`
initial code 2019-06-25 16:39:29 +05:00
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`# Find out remaining resources after`
			`# host_specs - already running vm_specs`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`print(host_value)`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`remaining = remaining_resources(`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`host_value["specs"], running_vms_accumulated_specs`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`)`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`print(remaining)`
initial code 2019-06-25 16:39:29 +05:00			`# Find out remaining - new_vm_specs`
			`remaining = remaining_resources(remaining, vm_specs)`
			`# if remaining resources >= 0 return this host_name`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`if all(`
			`map(lambda x: True if remaining[x] >= 0 else False, remaining)`
			`):`
initial code 2019-06-25 16:39:29 +05:00			`return _host_name`

			`return None`


new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`def dead_host_detection(hosts):`
			`dead_hosts_keys = []`
			`for host in hosts:`
			`# Bring out your dead! - Monty Python and the Holy Grail`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`if "status" in host.value and "last_heartbeat" in host.value:`
			`# Don't count that is already buried`
			`if host.value["status"] == "DEAD":`
			`continue`

			`last_heartbeat = datetime.fromisoformat(`
			`host.value["last_heartbeat"]`
			`)`
			`delta = datetime.utcnow() - last_heartbeat`
			`if delta.total_seconds() > 60:`
			`dead_hosts_keys.append(host.key)`
			`else:`
			`dead_hosts_keys.append(host.key)`

			`return dead_hosts_keys`


			`def dead_host_mitigation(client: Etcd3Wrapper, dead_hosts_keys):`
			`for host_key in dead_hosts_keys:`
			`host = client.get(host_key, value_in_json=True)`
			`host.value["status"] = "DEAD"`
			`host.value["last_heartbeat"] = datetime.utcnow().isoformat()`
			`client.put(host.key, host.value, value_in_json=True)`

			`# Find all vms that were hosted on this dead host`
			`all_vms = client.get_prefix(config("VM_PREFIX"), value_in_json=True)`
			`vms_hosted_on_dead_host = filter(`
			`lambda _vm: _vm.value["hostname"] == host_key, all_vms`
			`)`
			`for vm in vms_hosted_on_dead_host:`
			`vm.value["host"] = ""`
VMs on Dead Host should get status REQUESTED_START instead of REQUESTED_NEW 2019-07-22 11:32:29 +05:00			`vm.value["status"] = "REQUESTED_START"`
new tests added, dead host detection/mitigation, fix bug in specs difference computing code 2019-07-18 18:46:49 +05:00			`client.put(vm.key, vm.value, value_in_json=True)`


			`def main(vm_prefix, host_prefix):`
			`client = Etcd3Wrapper(`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00			`host=config("ETCD_HOST"), port=int(config("ETCD_PORT"))`
			`)`
initial code 2019-06-25 16:39:29 +05:00
check any pending vm scheduling on start 2019-07-20 14:50:08 +05:00			`for events_iterator in [client.get_prefix(vm_prefix),`
			`client.watch_prefix(vm_prefix, timeout=10)]:`
			`for e in events_iterator:`
			`try:`
			`e.value = json.loads(e.value)`
			`except json.JSONDecodeError:`
			`logging.error(f"Invalid JSON {e.value}")`
			`continue`
initial code 2019-06-25 16:39:29 +05:00
check any pending vm scheduling on start 2019-07-20 14:50:08 +05:00			`logging.debug(e.key, e.value)`

			`e_status = e.value["status"]`

			`if e_status == "TIMEOUT":`
			`logging.info("Timeout")`
			`hosts = client.get_prefix(host_prefix, value_in_json=True)`
			`dead_hosts = dead_host_detection(hosts)`
			`dead_host_mitigation(client, dead_hosts)`

			`elif e_status == "REQUESTED_NEW":`
			`host_name = get_suitable_host(`
			`client, vm_prefix, host_prefix, e.value["specs"]`
			`)`
			`if host_name:`
			`e.value["status"] = "SCHEDULED_DEPLOY"`
			`e.value["hostname"] = host_name`
			`client.put(e.key, json.dumps(e.value))`
			`else:`
			`# email admin`
			`print("No Resource Left. Emailing admin....")`
simplifies logic + add unit tests 2019-06-30 21:30:17 +05:00

			`if __name__ == "__main__":`
			`argparser = argparse.ArgumentParser()`
			`argparser.add_argument(`
			`"--vm_prefix", required=False, default=config("VM_PREFIX")`
			`)`
			`argparser.add_argument(`
			`"--host_prefix", required=False, default=config("HOST_PREFIX")`
			`)`
			`args = argparser.parse_args()`

			`main(args.vm_prefix, args.host_prefix)`
No results found.