2019-06-25 11:39:29 +00:00
|
|
|
# TODO
|
2019-07-25 10:29:35 +00:00
|
|
|
# 1. send an email to an email address defined by env['admin-email']
|
2019-06-30 16:30:17 +00:00
|
|
|
#    if resources are exhausted
|
2019-07-25 10:29:35 +00:00
|
|
|
# 2. v3) Introduce a status endpoint of the scheduler -
|
2019-06-30 16:30:17 +00:00
|
|
|
# maybe expose a prometheus compatible output
|
2019-06-25 11:39:29 +00:00
|
|
|
|
|
|
|
import json
|
2019-06-30 16:30:17 +00:00
|
|
|
import argparse
|
2019-07-18 13:46:49 +00:00
|
|
|
import logging
|
2019-06-25 11:39:29 +00:00
|
|
|
|
|
|
|
from decouple import config
|
|
|
|
from collections import Counter
|
2019-06-30 16:30:17 +00:00
|
|
|
from functools import reduce
|
2019-07-25 08:45:34 +00:00
|
|
|
from etcd3_wrapper import Etcd3Wrapper, EtcdEntry, PseudoEtcdMeta
|
2019-07-18 13:46:49 +00:00
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
# Route every record (DEBUG and above) to log.txt, opened in append mode so
# that records from earlier runs are kept.
logging.basicConfig(
    filename="log.txt",
    filemode="a",
    level=logging.DEBUG,
    datefmt="%d-%b-%y %H:%M:%S",
    format="%(asctime)s: %(levelname)s - %(message)s",
)
|
2019-06-25 11:39:29 +00:00
|
|
|
|
|
|
|
|
|
|
|
class VmPool(object):
    """Snapshot of the VM entries found under a key prefix.

    ``vms`` is a list of ``(key, value)`` tuples, where ``value`` is the
    entry's JSON payload decoded with ``json.loads``.  The static helpers
    filter such a list and always return a new list.
    """

    def __init__(self, etcd_client, vm_prefix):
        """Fetch and decode every entry ``etcd_client`` yields for ``vm_prefix``."""
        self.client = etcd_client
        self.vms = [
            (entry.key, json.loads(entry.value))
            for entry in self.client.get_prefix(vm_prefix)
        ]

    @staticmethod
    def by_host(vms, host):
        """Return the VMs whose ``hostname`` field equals ``host``."""
        return [vm for vm in vms if vm[1]["hostname"] == host]

    @staticmethod
    def by_status(vms, status):
        """Return the VMs whose ``status`` field equals ``status``."""
        return [vm for vm in vms if vm[1]["status"] == status]

    @staticmethod
    def except_status(vms, status):
        """Return the VMs whose ``status`` field differs from ``status``."""
        return [vm for vm in vms if vm[1]["status"] != status]
|
|
|
|
|
|
|
|
|
2019-06-30 16:30:17 +00:00
|
|
|
def accumulated_specs(vms_specs):
    """Sum a collection of per-VM spec mappings key by key.

    Args:
        vms_specs: iterable of mappings from resource name to a numeric
            amount.  May be empty, ``None``, or a one-shot iterator.

    Returns:
        A new ``Counter`` holding the total amount per resource name.  The
        result is always a fresh object, so mutating it never touches the
        caller's spec mappings, and it compares equal to ``{}`` when there
        is nothing to sum.  Resources whose total is zero are kept.
    """
    totals = Counter()
    # ``Counter.update`` adds amounts instead of replacing them, so a single
    # running total is enough; no intermediate Counter is built per step.
    for specs in vms_specs or ():
        totals.update(specs)
    return totals
|
2019-06-25 11:39:29 +00:00
|
|
|
|
|
|
|
|
2019-06-30 16:30:17 +00:00
|
|
|
def remaining_resources(host_specs, vms_specs):
    """Return what is left of ``host_specs`` after taking away ``vms_specs``.

    The subtraction is done per key.  Results are not clamped: a key ends
    up negative when more is requested than the host offers, including keys
    that only appear in ``vms_specs``.  Neither argument is modified.
    """
    leftover = Counter(host_specs)
    for resource, amount in Counter(vms_specs).items():
        # Keys missing from ``leftover`` read as 0 on a Counter
        leftover[resource] -= amount
    return leftover
|
|
|
|
|
2019-07-22 06:32:29 +00:00
|
|
|
|
2019-06-25 11:39:29 +00:00
|
|
|
def get_suitable_host(etcd_client, vm_prefix, host_prefix, vm_specs):
    """Return the key of the first ALIVE host that can fit ``vm_specs``.

    Hosts are examined in the order ``etcd_client.get_prefix`` yields them.
    For each host whose status is ``"ALIVE"``, the specs of the VMs already
    assigned to it (every VM on that host except those in status
    ``"REQUESTED_NEW"``) are summed and subtracted from the host's specs;
    the host qualifies when subtracting ``vm_specs`` as well leaves no
    negative amount.

    Returns:
        The key of the first qualifying host, or ``None`` if there is none.
    """
    pool = VmPool(etcd_client, vm_prefix)

    for host in etcd_client.get_prefix(host_prefix, value_in_json=True):
        if host.value["status"] != "ALIVE":
            continue

        # VMs assigned to this host, leaving out freshly requested ones
        placed_vms = VmPool.except_status(
            VmPool.by_host(pool.vms, host.key), "REQUESTED_NEW"
        )

        # Combined specs of everything already placed here
        used = accumulated_specs([vm[1]["specs"] for vm in placed_vms])

        # host specs - already placed VMs - the VM we want to add
        free = remaining_resources(host.value["specs"], used)
        after_placement = remaining_resources(free, vm_specs)

        if all(after_placement[name] >= 0 for name in after_placement):
            return host.key

    return None
|
|
|
|
|
|
|
|
|
2019-07-18 13:46:49 +00:00
|
|
|
def dead_host_detection(hosts, heartbeat_timeout=60):
    """Return the keys of the hosts that should be considered dead.

    A host is reported when any of the following holds:

    * its value has no ``status`` or no ``last_heartbeat`` field;
    * its ``last_heartbeat`` is not a parseable ISO 8601 timestamp;
    * its last heartbeat is more than ``heartbeat_timeout`` seconds old.

    Hosts whose status is already ``"DEAD"`` are skipped so that they are
    not reported again.

    Args:
        hosts: iterable of entries exposing ``key`` and a mapping ``value``.
        heartbeat_timeout: maximum heartbeat age in seconds before a host
            is considered dead.  Defaults to 60.

    Returns:
        A list of host keys, in the order the hosts were supplied.
    """
    # Function-scope import: the module only imports ``datetime`` itself.
    from datetime import timezone

    dead_hosts_keys = []
    for host in hosts:
        # Bring out your dead! - Monty Python and the Holy Grail
        if "status" not in host.value or "last_heartbeat" not in host.value:
            dead_hosts_keys.append(host.key)
            continue

        # Don't count a host that is already buried
        if host.value["status"] == "DEAD":
            continue

        try:
            last_heartbeat = datetime.fromisoformat(
                host.value["last_heartbeat"]
            )
        except (TypeError, ValueError):
            # An unreadable heartbeat is handled like a missing one rather
            # than aborting the scan of the remaining hosts.
            logging.warning(
                "Unparseable last_heartbeat %r for host %s",
                host.value["last_heartbeat"],
                host.key,
            )
            dead_hosts_keys.append(host.key)
            continue

        # Timestamps without an offset are interpreted as UTC; comparing
        # aware values avoids a TypeError when the stored heartbeat carries
        # an explicit offset.
        if last_heartbeat.tzinfo is None:
            last_heartbeat = last_heartbeat.replace(tzinfo=timezone.utc)

        age = datetime.now(timezone.utc) - last_heartbeat
        if age.total_seconds() > heartbeat_timeout:
            dead_hosts_keys.append(host.key)

    return dead_hosts_keys
|
|
|
|
|
|
|
|
|
|
|
|
def dead_host_mitigation(client: Etcd3Wrapper, dead_hosts_keys):
    """Mark the given hosts as DEAD and release the VMs assigned to them.

    For every key in ``dead_hosts_keys`` the host entry is fetched, its
    status set to ``"DEAD"``, its ``last_heartbeat`` refreshed to the
    current UTC time, and the entry written back.  Every VM whose
    ``hostname`` equals that key then gets an empty ``hostname``; unless the
    VM is ``"STOPPED"`` its status becomes ``"REQUESTED_NEW"``.  Each
    touched VM is written back.

    NOTE: the VM prefix is read from the ``VM_PREFIX`` setting here, not
    passed in by the caller.
    """
    for host_key in dead_hosts_keys:
        dead_host = client.get(host_key, value_in_json=True)
        dead_host.value["status"] = "DEAD"
        dead_host.value["last_heartbeat"] = datetime.utcnow().isoformat()
        client.put(dead_host.key, dead_host.value, value_in_json=True)

        # Walk all VMs and handle the ones that were hosted on this dead host
        for vm in client.get_prefix(config("VM_PREFIX"), value_in_json=True):
            if vm.value["hostname"] != host_key:
                continue

            vm.value["hostname"] = ""
            if vm.value["status"] != "STOPPED":
                vm.value["status"] = "REQUESTED_NEW"
            client.put(vm.key, vm.value, value_in_json=True)
|
|
|
|
|
|
|
|
|
2019-07-25 08:45:34 +00:00
|
|
|
def assign_host(client, vm_prefix, host_prefix, e):
    """Try to place the VM entry ``e`` on a host and store the outcome.

    When ``get_suitable_host`` finds a host for ``e.value["specs"]``, the
    entry's status becomes ``"SCHEDULED_DEPLOY"`` if it was
    ``"REQUESTED_NEW"`` and ``"REQUESTED_START"`` otherwise, its
    ``hostname`` is set to the chosen host, and the entry is written back
    through ``client``.

    Returns:
        The chosen host name, or ``None`` (with nothing written) when no
        host was found.
    """
    host_name = get_suitable_host(
        client, vm_prefix, host_prefix, e.value["specs"]
    )
    if not host_name:
        return None

    was_new = e.value["status"] == "REQUESTED_NEW"
    e.value["status"] = "SCHEDULED_DEPLOY" if was_new else "REQUESTED_START"
    e.value["hostname"] = host_name
    client.put(e.key, json.dumps(e.value))
    return host_name
|
|
|
|
|
|
|
|
|
2019-07-18 13:46:49 +00:00
|
|
|
def _reschedule_waiting_vms(client, vm_prefix, host_prefix):
    """Retry host assignment for waiting VMs; return True if any is still unplaced."""
    still_waiting = False
    for vm in client.get_prefix(vm_prefix):
        fake_e = EtcdEntry(
            PseudoEtcdMeta(key=vm.key.encode("utf-8")),
            value=vm.value.encode("utf-8"),
            value_in_json=True,
        )
        # Only VMs that are waiting for a host may be (re)assigned.
        # assign_host rewrites status and hostname of whatever entry it is
        # given, so passing every VM under the prefix would also disturb
        # VMs that already have a host.
        if fake_e.value.get("status") not in ("REQUESTED_NEW", "REQUESTED_START"):
            continue
        if assign_host(client, vm_prefix, host_prefix, fake_e) is None:
            # We still have a VM left to schedule
            still_waiting = True
    return still_waiting


def main(vm_prefix, host_prefix):
    """Run the scheduler: react to VM entries under ``vm_prefix``.

    The entries currently stored under ``vm_prefix`` are processed first,
    followed by the events delivered by ``watch_prefix``.  For each entry:

    * status ``"TIMEOUT"``: the entry is deleted, dead hosts are detected
      and mitigated, and, if an earlier assignment failed, the waiting VMs
      are tried again;
    * status ``"REQUESTED_NEW"`` or ``"REQUESTED_START"``: a host is
      assigned; on failure a retry is flagged for the next ``"TIMEOUT"``.

    Entries whose value is not valid JSON, or not an object with a
    ``status`` field, are logged and skipped.

    NOTE(review): ``"TIMEOUT"`` entries are presumably produced by
    ``watch_prefix`` when no event arrives within ``timeout`` seconds;
    confirm against etcd3_wrapper.

    Args:
        vm_prefix: key prefix under which VM entries are stored.
        host_prefix: key prefix under which host entries are stored.
    """
    client = Etcd3Wrapper(
        host=config("ETCD_HOST"), port=int(config("ETCD_PORT"))
    )

    # True while at least one VM could not be placed and must be retried
    rescan_vms = False

    for events_iterator in [
        client.get_prefix(vm_prefix),
        client.watch_prefix(vm_prefix, timeout=10),
    ]:
        for e in events_iterator:
            try:
                e.value = json.loads(e.value)
            except json.JSONDecodeError:
                logging.error(f"Invalid JSON {e.value}")
                continue

            # logging applies ``msg % args``, so key and value both need a
            # placeholder in the message to show up in the log.
            logging.debug("%s %s", e.key, e.value)

            if not isinstance(e.value, dict) or "status" not in e.value:
                logging.error("Entry %s has no status field", e.key)
                continue
            e_status = e.value["status"]

            if e_status == "TIMEOUT":
                client.client.delete(e.key)
                logging.info("Timeout")
                hosts = client.get_prefix(host_prefix, value_in_json=True)
                dead_hosts = dead_host_detection(hosts)
                dead_host_mitigation(client, dead_hosts)

                if rescan_vms:
                    # Stays set only if some VM is still without a host
                    rescan_vms = _reschedule_waiting_vms(
                        client, vm_prefix, host_prefix
                    )

            elif e_status in ["REQUESTED_NEW", "REQUESTED_START"]:
                if assign_host(client, vm_prefix, host_prefix, e) is None:
                    print("No Resource Left. Emailing admin....")
                    rescan_vms = True
|
2019-06-30 16:30:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Both key prefixes default to their configured setting and can be
    # overridden on the command line.
    argparser = argparse.ArgumentParser()
    for flag, setting in (
        ("--vm_prefix", "VM_PREFIX"),
        ("--host_prefix", "HOST_PREFIX"),
    ):
        argparser.add_argument(flag, required=False, default=config(setting))
    args = argparser.parse_args()

    main(args.vm_prefix, args.host_prefix)
|