* Refactoring

* Fix issue that causes a new image store to be created at every start of ucloud-api.
* VM Migration API call now takes hostname instead of host key.
* StorageHandler Classes are introduced. They transparently handles things related to importing of image, make vm out of image, resize vm image, delete vm image etc.
* Loggers added to __init__.py of every ucloud component's subpackage.
* Non-Trivial Timeout Events are no longer logged.
* Fix issue that prevents removal of stopped VMs (i.e VMs that are successfully migrated).
* Improved unit handling added. e.g MB, Mb, mB, mb are all Mega Bytes.
* VM migration is now possible on IPv6 host.
* Destination VM (receiving side of migration of a vm) now correctly expects incoming data on free ephemeral port.
* Traceback is no longer output to screen, instead it goes to log file.
* All sanity checks are put into a single file. These checks are run by ucloud.py before running any of ucloud component.
This commit is contained in:
ahmadbilalkhalid 2019-11-25 11:52:36 +05:00
parent 6fa77bce4d
commit cc0ca68498
26 changed files with 1101 additions and 294 deletions

View file

@ -22,7 +22,7 @@ def check_otp(name, realm, token):
except binascii.Error:
return 400
response = requests.get(
response = requests.post(
"{OTP_SERVER}{OTP_VERIFY_ENDPOINT}".format(
OTP_SERVER=env_vars.get("OTP_SERVER", ""),
OTP_VERIFY_ENDPOINT=env_vars.get("OTP_VERIFY_ENDPOINT", "verify"),

View file

@ -1,16 +1,16 @@
import json
import os
import subprocess
from uuid import uuid4
import pynetbox
from uuid import uuid4
from os.path import join as join_path
from flask import Flask, request
from flask_restful import Resource, Api
from common import counters
from common.request import RequestEntry, RequestType
from common.vm import VMStatus
from config import (etcd_client, request_pool, vm_pool, host_pool, env_vars)
from config import (etcd_client, request_pool, vm_pool, host_pool, env_vars, image_storage_handler)
from . import schemas
from .helper import generate_mac, mac2ipv6
from api import logger
@ -20,13 +20,15 @@ api = Api(app)
class CreateVM(Resource):
"""API Request to Handle Creation of VM"""
@staticmethod
def post():
data = request.json
validator = schemas.CreateVMSchema(data)
if validator.is_valid():
vm_uuid = uuid4().hex
vm_key = os.path.join(env_vars.get("VM_PREFIX"), vm_uuid)
vm_key = join_path(env_vars.get("VM_PREFIX"), vm_uuid)
specs = {
"cpu": validator.specs["cpu"],
"ram": validator.specs["ram"],
@ -67,14 +69,14 @@ class VmStatus(Resource):
validator = schemas.VMStatusSchema(data)
if validator.is_valid():
vm = vm_pool.get(
os.path.join(env_vars.get("VM_PREFIX"), data["uuid"])
join_path(env_vars.get("VM_PREFIX"), data["uuid"])
)
vm_value = vm.value.copy()
vm_value["ip"] = []
for network_and_mac in vm.network:
network_name, mac = network_and_mac
network = etcd_client.get(
os.path.join(
join_path(
env_vars.get("NETWORK_PREFIX"),
data["name"],
network_name,
@ -96,7 +98,7 @@ class CreateImage(Resource):
validator = schemas.CreateImageSchema(data)
if validator.is_valid():
file_entry = etcd_client.get(
os.path.join(env_vars.get("FILE_PREFIX"), data["uuid"])
join_path(env_vars.get("FILE_PREFIX"), data["uuid"])
)
file_entry_value = json.loads(file_entry.value)
@ -109,7 +111,7 @@ class CreateImage(Resource):
"visibility": "public",
}
etcd_client.put(
os.path.join(env_vars.get("IMAGE_PREFIX"), data["uuid"]),
join_path(env_vars.get("IMAGE_PREFIX"), data["uuid"]),
json.dumps(image_entry_json),
)
@ -123,8 +125,9 @@ class ListPublicImages(Resource):
images = etcd_client.get_prefix(
env_vars.get("IMAGE_PREFIX"), value_in_json=True
)
r = {}
r["images"] = []
r = {
"images": []
}
for image in images:
image_key = "{}:{}".format(
image.value["store_name"], image.value["name"]
@ -143,46 +146,22 @@ class VMAction(Resource):
if validator.is_valid():
vm_entry = vm_pool.get(
os.path.join(env_vars.get("VM_PREFIX"), data["uuid"])
join_path(env_vars.get("VM_PREFIX"), data["uuid"])
)
action = data["action"]
if action == "start":
vm_entry.status = VMStatus.requested_start
vm_pool.put(vm_entry)
action = "schedule"
if action == "delete" and vm_entry.hostname == "":
try:
path_without_protocol = vm_entry.path[
vm_entry.path.find(":") + 1:
]
if env_vars.get("WITHOUT_CEPH"):
command_to_delete = [
"rm",
"-rf",
os.path.join("/var/vm", vm_entry.uuid),
]
else:
command_to_delete = [
"rbd",
"rm",
path_without_protocol,
]
subprocess.check_output(
command_to_delete, stderr=subprocess.PIPE
)
except subprocess.CalledProcessError as e:
if "No such file" in e.stderr.decode("utf-8"):
if image_storage_handler.is_vm_image_exists(vm_entry.uuid):
r_status = image_storage_handler.delete_vm_image(vm_entry.uuid)
if r_status:
etcd_client.client.delete(vm_entry.key)
return {"message": "VM successfully deleted"}
else:
logger.exception(e)
return {
"message": "Some error occurred while deleting VM"
}
logger.error("Some Error Occurred while deleting VM")
return {"message": "VM deletion unsuccessfull"}
else:
etcd_client.client.delete(vm_entry.key)
return {"message": "VM successfully deleted"}
@ -211,8 +190,8 @@ class VMMigration(Resource):
r = RequestEntry.from_scratch(
type=RequestType.ScheduleVM,
uuid=vm.uuid,
destination=os.path.join(
env_vars.get("HOST_PREFIX"), data["destination"]
destination=join_path(
env_vars.get("HOST_PREFIX"), validator.destination.value
),
migration=True,
request_prefix=env_vars.get("REQUEST_PREFIX")
@ -289,7 +268,7 @@ class CreateHost(Resource):
data = request.json
validator = schemas.CreateHostSchema(data)
if validator.is_valid():
host_key = os.path.join(env_vars.get("HOST_PREFIX"), uuid4().hex)
host_key = join_path(env_vars.get("HOST_PREFIX"), uuid4().hex)
host_entry = {
"specs": data["specs"],
"hostname": data["hostname"],
@ -327,7 +306,7 @@ class GetSSHKeys(Resource):
if not validator.key_name.value:
# {user_prefix}/{realm}/{name}/key/
etcd_key = os.path.join(
etcd_key = join_path(
env_vars.get('USER_PREFIX'),
data["realm"],
data["name"],
@ -344,7 +323,7 @@ class GetSSHKeys(Resource):
else:
# {user_prefix}/{realm}/{name}/key/{key_name}
etcd_key = os.path.join(
etcd_key = join_path(
env_vars.get('USER_PREFIX'),
data["realm"],
data["name"],
@ -373,7 +352,7 @@ class AddSSHKey(Resource):
if validator.is_valid():
# {user_prefix}/{realm}/{name}/key/{key_name}
etcd_key = os.path.join(
etcd_key = join_path(
env_vars.get("USER_PREFIX"),
data["realm"],
data["name"],
@ -403,7 +382,7 @@ class RemoveSSHKey(Resource):
if validator.is_valid():
# {user_prefix}/{realm}/{name}/key/{key_name}
etcd_key = os.path.join(
etcd_key = join_path(
env_vars.get("USER_PREFIX"),
data["realm"],
data["name"],
@ -462,7 +441,7 @@ class CreateNetwork(Resource):
else:
network_entry["ipv6"] = "fd00::/64"
network_key = os.path.join(
network_key = join_path(
env_vars.get("NETWORK_PREFIX"),
data["name"],
data["network_name"],
@ -480,7 +459,7 @@ class ListUserNetwork(Resource):
validator = schemas.OTPSchema(data)
if validator.is_valid():
prefix = os.path.join(
prefix = join_path(
env_vars.get("NETWORK_PREFIX"), data["name"]
)
networks = etcd_client.get_prefix(prefix, value_in_json=True)
@ -517,15 +496,17 @@ api.add_resource(CreateNetwork, "/network/create")
def main():
data = {
"is_public": True,
"type": "ceph",
"name": "images",
"description": "first ever public image-store",
"attributes": {"list": [], "key": [], "pool": "images"},
}
image_stores = list(etcd_client.get_prefix(env_vars.get('IMAGE_STORE_PREFIX'), value_in_json=True))
if len(image_stores) == 0:
data = {
"is_public": True,
"type": "ceph",
"name": "images",
"description": "first ever public image-store",
"attributes": {"list": [], "key": [], "pool": "images"},
}
etcd_client.put(os.path.join(env_vars.get('IMAGE_STORE_PREFIX'), uuid4().hex), json.dumps(data))
etcd_client.put(join_path(env_vars.get('IMAGE_STORE_PREFIX'), uuid4().hex), json.dumps(data))
app.run(host="::", debug=True)

View file

@ -381,12 +381,14 @@ class VmMigrationSchema(OTPSchema):
super().__init__(data=data, fields=fields)
def destination_validation(self):
host_key = self.destination.value
host = host_pool.get(host_key)
hostname = self.destination.value
host = next(filter(lambda h: h.hostname == hostname, host_pool.hosts), None)
if not host:
self.add_error("No Such Host ({}) exists".format(self.destination.value))
elif host.status != HostStatus.alive:
self.add_error("Destination Host is dead")
else:
self.destination.value = host.key
def validation(self):
vm = vm_pool.get(self.uuid.value)

View file

@ -1,28 +1,6 @@
from decouple import Config, RepositoryEnv, UndefinedValueError
from etcd3_wrapper import EtcdEntry
class EnvironmentVariables:
def __init__(self, env_file):
try:
env_config = Config(RepositoryEnv(env_file))
except FileNotFoundError:
print("{} does not exists".format(env_file))
exit(1)
else:
self.config = env_config
def get(self, *args, **kwargs):
"""Return value of var from env_vars"""
try:
value = self.config.get(*args, **kwargs)
except UndefinedValueError as e:
print(e)
exit(1)
else:
return value
class SpecificEtcdEntryBase:
def __init__(self, e: EtcdEntry):
self.key = e.key

View file

@ -1,5 +1,9 @@
import logging
import socket
import requests
import json
from ipaddress import ip_address
from os.path import join as join_path
@ -37,3 +41,14 @@ def get_ipv4_address():
address = s.getsockname()[0]
return address
def get_ipv6_address():
try:
r = requests.get("https://api6.ipify.org?format=json")
content = json.loads(r.content.decode("utf-8"))
ip = ip_address(content["ip"]).exploded
except Exception as e:
logging.exception(e)
else:
return ip

158
common/storage_handlers.py Normal file
View file

@ -0,0 +1,158 @@
import shutil
import subprocess as sp
import os
import stat
from abc import ABC
from host import logger
from os.path import join as join_path
class ImageStorageHandler(ABC):
def __init__(self, image_base, vm_base):
self.image_base = image_base
self.vm_base = vm_base
def import_image(self, image_src, image_dest, protect=False):
"""Put an image at the destination
:param src: An Image file
:param dest: A path where :param src: is to be put.
:param protect: If protect is true then the dest is protect (readonly etc)
The obj must exist on filesystem.
"""
raise NotImplementedError()
def make_vm_image(self, image_path, path):
"""Copy image from src to dest
:param src: A path
:param dest: A path
src and destination must be on same storage system i.e both on file system or both on CEPH etc.
"""
raise NotImplementedError()
def resize_vm_image(self, path, size):
"""Resize image located at :param path:
:param path: The file which is to be resized
:param size: Size must be in Megabytes
"""
raise NotImplementedError()
def delete_vm_image(self, path):
raise NotImplementedError()
def execute_command(self, command, report=True):
command = list(map(str, command))
try:
output = sp.check_output(command, stderr=sp.PIPE)
except Exception as e:
if report:
print(e)
logger.exception(e)
return False
return True
def vm_path_string(self, path):
raise NotImplementedError()
def qemu_path_string(self, path):
raise NotImplementedError()
def is_vm_image_exists(self, path):
raise NotImplementedError()
class FileSystemBasedImageStorageHandler(ImageStorageHandler):
def import_image(self, src, dest, protect=False):
dest = join_path(self.image_base, dest)
try:
shutil.copy(src, dest)
if protect:
os.chmod(dest, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
except Exception as e:
logger.exception(e)
return False
return True
def make_vm_image(self, src, dest):
src = join_path(self.image_base, src)
dest = join_path(self.vm_base, dest)
try:
shutil.copy(src, dest)
except Exception as e:
logger.exception(e)
return False
return True
def resize_vm_image(self, path, size):
path = join_path(self.vm_base, path)
command = ["qemu-img", "resize", "-f", "raw", path, "{}M".format(size)]
if self.execute_command(command):
return True
else:
self.delete_vm_image(path)
return False
def delete_vm_image(self, path):
path = join_path(self.vm_base, path)
try:
os.remove(path)
except Exception as e:
logger.exception(e)
return False
return True
def vm_path_string(self, path):
return join_path(self.vm_base, path)
def qemu_path_string(self, path):
return self.vm_path_string(path)
def is_vm_image_exists(self, path):
path = join_path(self.vm_base, path)
command = ["ls", path]
return self.execute_command(command, report=False)
class CEPHBasedImageStorageHandler(ImageStorageHandler):
def import_image(self, src, dest, protect=False):
dest = join_path(self.image_base, dest)
command = ["rbd", "import", src, dest]
if protect:
snap_create_command = ["rbd", "snap", "create", "{}@protected".format(dest)]
snap_protect_command = ["rbd", "snap", "protect", "{}@protected".format(dest)]
return self.execute_command(command) and self.execute_command(snap_create_command) and\
self.execute_command(snap_protect_command)
return self.execute_command(command)
def make_vm_image(self, src, dest):
src = join_path(self.image_base, src)
dest = join_path(self.vm_base, dest)
command = ["rbd", "clone", "{}@protected".format(src), dest]
return self.execute_command(command)
def resize_vm_image(self, path, size):
path = join_path(self.vm_base, path)
command = ["rbd", "resize", path, "--size", size]
return self.execute_command(command)
def delete_vm_image(self, path):
path = join_path(self.vm_base, path)
command = ["rbd", "rm", path]
return self.execute_command(command)
def vm_path_string(self, path):
return join_path(self.vm_base, path)
def qemu_path_string(self, path):
return "rbd:{}".format(self.vm_path_string(path))
def is_vm_image_exists(self, path):
path = join_path(self.vm_base, path)
command = ["rbd", "info", path]
return self.execute_command(command, report=False)

View file

@ -60,10 +60,6 @@ class VMEntry(SpecificEtcdEntryBase):
self.log = self.log[:5]
self.log.append("{} - {}".format(datetime.now().isoformat(), msg))
@property
def path(self):
return "rbd:uservms/{}".format(self.uuid)
class VmPool:
def __init__(self, etcd_client, vm_prefix):

View file

@ -1,14 +1,16 @@
from etcd3_wrapper import Etcd3Wrapper
from common.classes import EnvironmentVariables
from common.host import HostPool
from common.request import RequestPool
from common.vm import VmPool
from common.storage_handlers import FileSystemBasedImageStorageHandler, CEPHBasedImageStorageHandler
from decouple import Config, RepositoryEnv
env_vars = EnvironmentVariables('/etc/ucloud/ucloud.conf')
env_vars = Config(RepositoryEnv('/etc/ucloud/ucloud.conf'))
etcd_wrapper_args = ()
etcd_wrapper_kwargs = {"host": env_vars.get("ETCD_URL")}
etcd_wrapper_kwargs = {'host': env_vars.get('ETCD_URL')}
etcd_client = Etcd3Wrapper(*etcd_wrapper_args, **etcd_wrapper_kwargs)
@ -17,3 +19,12 @@ vm_pool = VmPool(etcd_client, env_vars.get('VM_PREFIX'))
request_pool = RequestPool(etcd_client, env_vars.get('REQUEST_PREFIX'))
running_vms = []
__storage_backend = env_vars.get("STORAGE_BACKEND")
if __storage_backend == "filesystem":
image_storage_handler = FileSystemBasedImageStorageHandler(vm_base=env_vars.get("VM_DIR"),
image_base=env_vars.get("IMAGE_DIR"))
elif __storage_backend == "ceph":
image_storage_handler = CEPHBasedImageStorageHandler(vm_base="ssd", image_base="ssd")
else:
raise Exception("Unknown Image Storage Handler")

View file

@ -0,0 +1,44 @@
graph LR
style ucloud fill:#FFD2FC
style cron fill:#FFF696
style infrastructure fill:#BDF0FF
subgraph ucloud[ucloud]
ucloud-cli[CLI]-->ucloud-api[API]
ucloud-api-->ucloud-scheduler[Scheduler]
ucloud-api-->ucloud-imagescanner[Image Scanner]
ucloud-api-->ucloud-host[Host]
ucloud-scheduler-->ucloud-host
ucloud-host-->need-networking{VM need Networking}
need-networking-->|Yes| networking-scripts
need-networking-->|No| VM[Virtual Machine]
need-networking-->|SLAAC?| radvd
networking-scripts-->VM
networking-scripts--Create Networks Devices-->networking-scripts
subgraph cron[Cron Jobs]
ucloud-imagescanner
ucloud-filescanner[File Scanner]
ucloud-filescanner--Track User files-->ucloud-filescanner
end
subgraph infrastructure[Infrastructure]
radvd
etcd
networking-scripts[Networking Scripts]
ucloud-imagescanner-->image-store
image-store{Image Store}
image-store-->|CEPH| ceph
image-store-->|FILE| file-system
ceph[CEPH]
file-system[File System]
end
subgraph virtual-machine[Virtual Machine]
VM
VM-->ucloud-init
end
subgraph metadata-group[Metadata Server]
metadata-->ucloud-init
ucloud-init<-->metadata
end
end

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 37 KiB

View file

@ -1,7 +1,7 @@
.. ucloud documentation master file, created by
sphinx-quickstart on Mon Nov 11 19:08:16 2019.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
sphinx-quickstart on Mon Nov 11 19:08:16 2019.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to ucloud's documentation!
==================================
@ -15,7 +15,9 @@ Welcome to ucloud's documentation!
usage/usage-for-admins
usage/usage-for-users
usage/how-to-create-an-os-image-for-ucloud
theory/summary
misc/todo
troubleshooting/installation-troubleshooting
Indices and tables
==================

View file

@ -135,7 +135,7 @@ You just need to update **AUTH_SEED** in the below code to match your auth's see
ETCD_URL=localhost
WITHOUT_CEPH=True
STORAGE_BACKEND=filesystem
BASE_DIR=/var/www
IMAGE_DIR=/var/image
@ -195,3 +195,35 @@ profile e.g *~/.profile*
alias uotp='cd /root/uotp/ && pipenv run python app.py'
and run :code:`source ~/.profile`
Arch
-----
.. code-block:: sh
# Update/Upgrade
pacman -Syuu
pacman -S python3 qemu chrony python-pip
pip3 install pipenv
cat > /etc/chrony.conf << EOF
server 0.arch.pool.ntp.org
server 1.arch.pool.ntp.org
server 2.arch.pool.ntp.org
EOF
systemctl start chronyd
systemctl enable chronyd
# Create non-root user and allow it sudo access
# without password
useradd -m ucloud
echo "ucloud ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
sudo -H -u ucloud bash -c 'cd /home/ucloud && git clone https://aur.archlinux.org/yay.git && cd yay && makepkg -si'
sudo -H -u ucloud bash -c 'yay -S etcd'
systemctl start etcd
systemctl enable etcd

View file

@ -1,6 +1,18 @@
TODO
====
* **Check Authentication:** Nico reported that some endpoints
even work without providing token. (ListUserVM)
* Put overrides for **IMAGE_BASE**, **VM_BASE** in **ImageStorageHandler**.
* Put "Always use only one StorageHandler"
* Create Network Manager
* That would handle tasks like up/down an interface
* Create VXLANs, Bridges, TAPs.
* Remove them when they are no longer used.
* Check for :code:`etcd3.exceptions.ConnectionFailedError` when calling some etcd operation to
avoid crashing whole application.
* Throw KeyError instead of returning None when some key is not found in etcd.

View file

@ -0,0 +1,98 @@
Summary
=======
.. image:: /images/ucloud.svg
.. code-block::
<cli>
|
|
|
+-------------------------<api>
| |
| |```````````````|```````````````|
| | | |
| <file_scanner> <scheduler> <image_scanner>
| |
| |
+-------------------------<host>
|
|
|
Virtual Machine------<init>------<metadata>
**ucloud-cli** interact with **ucloud-api** to do the following operations:
- Create/Delete/Start/Stop/Migrate/Probe (Status of) Virtual Machines
- Create/Delete Networks
- Add/Get/Delete SSH Keys
- Create OS Image out of a file (tracked by file_scanner)
- List User's files/networks/vms
- Add Host
ucloud can currently stores OS-Images on
* File System
* `CEPH <https://ceph.io/>`_
**ucloud-api** in turns creates appropriate Requests which are taken
by suitable components of ucloud. For Example, if user uses ucloud-cli
to create a VM, **ucloud-api** would create a **ScheduleVMRequest** containing
things like pointer to VM's entry which have specs, networking
configuration of VMs.
**ucloud-scheduler** accepts requests for VM's scheduling and
migration. It finds a host from a list of available host on which
the incoming VM can run and schedules it on that host.
**ucloud-host** runs on host servers i.e servers that
actually runs virtual machines, accepts requests
intended only for them. It creates/delete/start/stop/migrate
virtual machines. It also arrange network resources needed for the
incoming VM.
**ucloud-filescanner** keep tracks of user's files which would be needed
later for creating OS Images.
**ucloud-imagescanner** converts images files from qcow2 format to raw
format which would then be imported into image store.
* In case of **File System**, the converted image would be copied to
:file:`/var/image/` or the path referred by :envvar:`IMAGE_PATH` environement variable
mentioned in :file:`/etc/ucloud/ucloud.conf`.
* In case of **CEPH**, the converted image would be imported into
specific pool (it depends on the image store in which the image
belongs) of CEPH Block Storage.
**ucloud-metadata** provides metadata which is used to contextualize
VMs. When, the VM is created, it is just clone (duplicate) of OS
image from which it is created. So, to differentiate between my
VM and your VM, the VM need to be contextualized. This works
like the following
.. note::
Actually, ucloud-init makes the GET request. You can also try it
yourself using curl but ucloud-init does that for yourself.
* VM make a GET requests http://metadata which resolves to actual
address of metadata server. The metadata server looks at the IPv6
Address of the requester and extracts the MAC Address which is possible
because the IPv6 address is
`IPv6 EUI-64 <https://community.cisco.com/t5/networking-documents/understanding-ipv6-eui-64-bit-address/ta-p/3116953>`_.
Metadata use this MAC address to find the actual VM to which it belongs
and its owner, ssh-keys and much more. Then, metadata return these
details back to the calling VM in JSON format. These details are
then used be the **ucloud-init** which is explained next.
**ucloud-init** gets the metadata from **ucloud-metadata** to contextualize
the VM. Specifically, it gets owner's ssh keys (or any other keys the
owner of VM added to authorized keys for this VM) and put them to ssh
server's (installed on VM) authorized keys so that owner can access
the VM using ssh. It also install softwares that are needed for correct
behavior of VM e.g rdnssd (needed for `SLAAC <https://en.wikipedia.org/wiki/IPv6#Stateless_address_autoconfiguration_(SLAAC)>`_).

View file

@ -0,0 +1,24 @@
Installation Troubleshooting
============================
etcd doesn't start
------------------
.. code-block:: sh
[root@archlinux ~]# systemctl start etcd
Job for etcd.service failed because the control process exited with error code.
See "systemctl status etcd.service" and "journalctl -xe" for details
possible solution
~~~~~~~~~~~~~~~~~
Try :code:`cat /etc/hosts` if its output contain the following
.. code-block:: sh
127.0.0.1 localhost.localdomain localhost
::1 localhost localhost.localdomain
then unfortunately, we can't help you. But, if it doesn't contain the
above you can put the above in :file:`/etc/hosts` to fix the issue.

View file

@ -0,0 +1,3 @@
import logging
logger = logging.getLogger(__name__)

View file

@ -6,7 +6,7 @@ import time
from uuid import uuid4
from etcd3_wrapper import Etcd3Wrapper
from filescanner import logger
from config import env_vars
@ -17,9 +17,10 @@ def getxattr(file, attr):
value = sp.check_output(['getfattr', file,
'--name', attr,
'--only-values',
'--absolute-names'])
'--absolute-names'], stderr=sp.DEVNULL)
value = value.decode("utf-8")
except sp.CalledProcessError:
except sp.CalledProcessError as e:
logger.exception(e)
value = None
return value
@ -63,7 +64,7 @@ try:
sp.check_output(['which', 'getfattr'])
sp.check_output(['which', 'setfattr'])
except Exception as e:
print(e)
logger.exception(e)
print('Make sure you have getfattr and setfattr available')
exit(1)

13
host/helper.py Normal file
View file

@ -0,0 +1,13 @@
import socket
from contextlib import closing
def find_free_port():
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
try:
s.bind(('', 0))
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
except Exception:
return None
else:
return s.getsockname()[1]

View file

@ -1,6 +1,5 @@
import argparse
import multiprocessing as mp
import os
import time
from etcd3_wrapper import Etcd3Wrapper
@ -10,13 +9,17 @@ from config import (vm_pool, request_pool,
etcd_client, running_vms,
etcd_wrapper_args, etcd_wrapper_kwargs,
HostPool, env_vars)
from .helper import find_free_port
from . import virtualmachine
from host import logger
def update_heartbeat(host):
def update_heartbeat(hostname):
"""Update Last HeartBeat Time for :param hostname: in etcd"""
client = Etcd3Wrapper(*etcd_wrapper_args, **etcd_wrapper_kwargs)
host_pool = HostPool(client, env_vars.get('HOST_PREFIX'))
this_host = next(filter(lambda h: h.hostname == host, host_pool.hosts), None)
this_host = next(filter(lambda h: h.hostname == hostname, host_pool.hosts), None)
while True:
this_host.update_heartbeat()
@ -35,17 +38,22 @@ def maintenance(host):
# whether this host vm is successfully migrated. If yes
# then we shutdown "vm1" on this host.
to_be_removed = []
for running_vm in running_vms:
with vm_pool.get_put(running_vm.key) as vm_entry:
if vm_entry.hostname != host.key and not vm_entry.in_migration:
running_vm.handle.shutdown()
vm_entry.add_log("VM on source host shutdown.")
logger.info("VM migration not completed successfully.")
to_be_removed.append(running_vm)
for r in to_be_removed:
running_vms.remove(r)
# To check vm running according to etcd entries
alleged_running_vms = vm_pool.by_status("RUNNING", vm_pool.by_host(host.key))
for vm_entry in alleged_running_vms:
_vm = virtualmachine.get_vm(running_vms, vm_entry.key)
# Whether, the allegedly running vm is in our
# running_vms list or not if it is said to be
# running on this host but it is not then we
@ -64,10 +72,6 @@ def maintenance(host):
def main(hostname):
assert env_vars.get('WITHOUT_CEPH') and os.path.isdir(env_vars.get('VM_DIR')), (
"You have set env_vars.get('WITHOUT_CEPH') to True. So, the vm directory mentioned"
" in .env file must exists. But, it don't.")
heartbeat_updating_process = mp.Process(target=update_heartbeat, args=(hostname,))
host_pool = HostPool(etcd_client, env_vars.get('HOST_PREFIX'))
@ -99,7 +103,6 @@ def main(hostname):
request_event = RequestEntry(request_event)
if request_event.type == "TIMEOUT":
logger.info("Timeout Event")
maintenance(host)
continue
@ -121,7 +124,7 @@ def main(hostname):
virtualmachine.delete(vm_entry)
elif request_event.type == RequestType.InitVMMigration:
virtualmachine.init_migration(vm_entry, host.key)
virtualmachine.start(vm_entry, host.key, find_free_port())
elif request_event.type == RequestType.TransferVM:
virtualmachine.transfer(request_event)

View file

@ -304,6 +304,7 @@ class QEMUMachine(object):
LOG.debug('Command: %r', ' '.join(self._qemu_full_args))
if self._iolog:
LOG.debug('Output: %r', self._iolog)
raise Exception(self._iolog)
raise
def _launch(self):

View file

@ -4,27 +4,28 @@
# For QEMU Monitor Protocol Commands Information, See
# https://qemu.weilnetz.de/doc/qemu-doc.html#pcsys_005fmonitor
import errno
import os
import random
import subprocess as sp
import tempfile
import time
from functools import wraps
from os.path import join
from os.path import join as join_path
from string import Template
from typing import Union
import bitmath
import sshtunnel
from common.helpers import get_ipv4_address
from common.helpers import get_ipv6_address
from common.request import RequestEntry, RequestType
from common.vm import VMEntry, VMStatus
from config import etcd_client, request_pool, running_vms, vm_pool, env_vars
from config import etcd_client, request_pool, running_vms, vm_pool, env_vars, image_storage_handler
from . import qmp
from host import logger
class VM:
def __init__(self, key, handle, vnc_socket_file):
self.key = key # type: str
@ -106,24 +107,16 @@ def update_radvd_conf(etcd_client):
sp.check_output(['systemctl', 'restart', 'radvd'])
def get_start_command_args(
vm_entry, vnc_sock_filename: str, migration=False, migration_port=4444,
):
def get_start_command_args(vm_entry, vnc_sock_filename: str, migration=False, migration_port=None):
threads_per_core = 1
vm_memory = int(bitmath.parse_string(vm_entry.specs["ram"]).to_MB())
vm_memory = int(bitmath.parse_string_unsafe(vm_entry.specs["ram"]).to_MB())
vm_cpus = int(vm_entry.specs["cpu"])
vm_uuid = vm_entry.uuid
vm_networks = vm_entry.network
if env_vars.get('WITHOUT_CEPH'):
command = "-drive file={},format=raw,if=virtio,cache=none".format(
os.path.join(env_vars.get('VM_DIR'), vm_uuid)
)
else:
command = "-drive file=rbd:uservms/{},format=raw,if=virtio,cache=none".format(
vm_uuid
)
command = "-drive file={},format=raw,if=virtio,cache=none".format(
image_storage_handler.qemu_path_string(vm_uuid)
)
command += " -device virtio-rng-pci -vnc unix:{}".format(vnc_sock_filename)
command += " -m {} -smp cores={},threads={}".format(
vm_memory, vm_cpus, threads_per_core
@ -131,7 +124,7 @@ def get_start_command_args(
command += " -name {}".format(vm_uuid)
if migration:
command += " -incoming tcp:0:{}".format(migration_port)
command += " -incoming tcp:[::]:{}".format(migration_port)
tap = None
for network_and_mac in vm_networks:
@ -154,7 +147,7 @@ def get_start_command_args(
return command.split(" ")
def create_vm_object(vm_entry, migration=False, migration_port=4444):
def create_vm_object(vm_entry, migration=False, migration_port=None):
# NOTE: If migration suddenly stop working, having different
# VNC unix filename on source and destination host can
# be a possible cause of it.
@ -198,61 +191,19 @@ def need_running_vm(func):
def create(vm_entry: VMEntry):
vm_hdd = int(bitmath.parse_string(vm_entry.specs["os-ssd"]).to_MB())
if env_vars.get('WITHOUT_CEPH'):
_command_to_create = [
"cp",
os.path.join(env_vars.get('IMAGE_DIR'), vm_entry.image_uuid),
os.path.join(env_vars.get('VM_DIR'), vm_entry.uuid),
]
_command_to_extend = [
"qemu-img",
"resize",
"-f", "raw",
os.path.join(env_vars.get('VM_DIR'), vm_entry.uuid),
"{}M".format(vm_hdd),
]
if image_storage_handler.is_vm_image_exists(vm_entry.uuid):
# File Already exists. No Problem Continue
logger.debug("Image for vm %s exists", vm_entry.uuid)
else:
_command_to_create = [
"rbd",
"clone",
"images/{}@protected".format(vm_entry.image_uuid),
"uservms/{}".format(vm_entry.uuid),
]
_command_to_extend = [
"rbd",
"resize",
"uservms/{}".format(vm_entry.uuid),
"--size",
vm_hdd,
]
try:
sp.check_output(_command_to_create)
except sp.CalledProcessError as e:
if e.returncode == errno.EEXIST:
logger.debug("Image for vm %s exists", vm_entry.uuid)
# File Already exists. No Problem Continue
return
# This exception catches all other exceptions
# i.e FileNotFound (BaseImage), pool Does Not Exists etc.
logger.exception(e)
vm_entry.status = "ERROR"
else:
try:
sp.check_output(_command_to_extend)
except Exception as e:
logger.exception(e)
else:
logger.info("New VM Created")
vm_hdd = int(bitmath.parse_string_unsafe(vm_entry.specs["os-ssd"]).to_MB())
if image_storage_handler.make_vm_image(src=vm_entry.image_uuid, dest=vm_entry.uuid):
if not image_storage_handler.resize_vm_image(path=vm_entry.uuid, size=vm_hdd):
vm_entry.status = "ERROR"
else:
logger.info("New VM Created")
def start(vm_entry: VMEntry):
def start(vm_entry: VMEntry, destination_host_key=None, migration_port=None):
_vm = get_vm(running_vms, vm_entry.key)
# VM already running. No need to proceed further.
@ -260,8 +211,12 @@ def start(vm_entry: VMEntry):
logger.info("VM %s already running", vm_entry.uuid)
return
else:
create(vm_entry)
launch_vm(vm_entry)
if destination_host_key:
launch_vm(vm_entry, migration=True, migration_port=migration_port,
destination_host_key=destination_host_key)
else:
create(vm_entry)
launch_vm(vm_entry)
@need_running_vm
@ -278,18 +233,9 @@ def stop(vm_entry):
def delete(vm_entry):
logger.info("Deleting VM | %s", vm_entry)
stop(vm_entry)
path_without_protocol = vm_entry.path[vm_entry.path.find(":") + 1:]
if env_vars.get('WITHOUT_CEPH'):
vm_deletion_command = ["rm", os.path.join(env_vars.get('VM_DIR'), vm_entry.uuid)]
else:
vm_deletion_command = ["rbd", "rm", path_without_protocol]
try:
sp.check_output(vm_deletion_command)
except Exception as e:
logger.exception(e)
else:
r_status = image_storage_handler.delete_vm_image(vm_entry.uuid)
if r_status:
etcd_client.client.delete(vm_entry.key)
@ -301,15 +247,16 @@ def transfer(request_event):
_host, _port = request_event.parameters["host"], request_event.parameters["port"]
_uuid = request_event.uuid
_destination = request_event.destination_host_key
vm = get_vm(running_vms, join(env_vars.get('VM_PREFIX'), _uuid))
vm = get_vm(running_vms, join_path(env_vars.get('VM_PREFIX'), _uuid))
if vm:
tunnel = sshtunnel.SSHTunnelForwarder(
(_host, 22),
_host,
ssh_username=env_vars.get("ssh_username"),
ssh_pkey=env_vars.get("ssh_pkey"),
ssh_private_key_password=env_vars.get("ssh_private_key_password"),
remote_bind_address=("127.0.0.1", _port),
ssh_proxy_enabled=True,
ssh_proxy=(_host, 22)
)
try:
tunnel.start()
@ -317,7 +264,7 @@ def transfer(request_event):
logger.exception("Couldn't establish connection to (%s, 22)", _host)
else:
vm.handle.command(
"migrate", uri="tcp:{}:{}".format(_host, tunnel.local_bind_port)
"migrate", uri="tcp:0.0.0.0:{}".format(tunnel.local_bind_port)
)
status = vm.handle.command("query-migrate")["status"]
@ -340,38 +287,22 @@ def transfer(request_event):
tunnel.close()
def init_migration(vm_entry, destination_host_key):
# This function would run on destination host i.e host on which the vm
# would be transferred after migration.
# This host would be responsible for starting VM that would receive
# state of VM running on source host.
_vm = get_vm(running_vms, vm_entry.key)
if _vm:
# VM already running. No need to proceed further.
logger.info("%s Already running", _vm.key)
return
launch_vm(vm_entry, migration=True, migration_port=4444,
destination_host_key=destination_host_key)
def launch_vm(vm_entry, migration=False, migration_port=None, destination_host_key=None):
logger.info("Starting %s", vm_entry.key)
vm = create_vm_object(vm_entry, migration=migration, migration_port=migration_port)
try:
vm.handle.launch()
except Exception as e:
logger.exception(e)
except Exception:
logger.exception("Error Occured while starting VM")
vm.handle.shutdown()
if migration:
# We don't care whether MachineError or any other error occurred
vm.handle.shutdown()
pass
else:
# Error during typical launch of a vm
vm_entry.add_log("Error Occurred while starting VM")
vm.handle.shutdown()
vm_entry.declare_killed()
vm_pool.put(vm_entry)
else:
@ -383,7 +314,7 @@ def launch_vm(vm_entry, migration=False, migration_port=None, destination_host_k
r = RequestEntry.from_scratch(
type=RequestType.TransferVM,
hostname=vm_entry.hostname,
parameters={"host": get_ipv4_address(), "port": 4444},
parameters={"host": get_ipv6_address(), "port": migration_port},
uuid=vm_entry.uuid,
destination_host_key=destination_host_key,
request_prefix=env_vars.get("REQUEST_PREFIX")

View file

@ -1,9 +1,9 @@
import json
import os
import subprocess
import sys
from config import etcd_client, env_vars
from os.path import join as join_path
from config import etcd_client, env_vars, image_storage_handler
from imagescanner import logger
@ -20,20 +20,6 @@ def qemu_img_type(path):
def main():
# If you are using env_vars.get('WITHOUT_CEPH') FLAG in .env
# then please make sure that env_vars.get('IMAGE_DIR') directory
# exists otherwise this script would fail
if env_vars.get('WITHOUT_CEPH') and not os.path.isdir(env_vars.get('IMAGE_DIR')):
print("You have set env_vars.get('WITHOUT_CEPH') to True. So,"
"the {} must exists. But, it don't".format(env_vars.get('IMAGE_DIR')))
sys.exit(1)
try:
subprocess.check_output(['which', 'qemu-img'])
except Exception:
print("qemu-img missing")
sys.exit(1)
# We want to get images entries that requests images to be created
images = etcd_client.get_prefix(env_vars.get('IMAGE_PREFIX'), value_in_json=True)
images_to_be_created = list(filter(lambda im: im.value['status'] == 'TO_BE_CREATED', images))
@ -44,7 +30,7 @@ def main():
image_owner = image.value['owner']
image_filename = image.value['filename']
image_store_name = image.value['store_name']
image_full_path = os.path.join(env_vars.get('BASE_DIR'), image_owner, image_filename)
image_full_path = join_path(env_vars.get('BASE_DIR'), image_owner, image_filename)
image_stores = etcd_client.get_prefix(env_vars.get('IMAGE_STORE_PREFIX'), value_in_json=True)
user_image_store = next(filter(
@ -58,43 +44,25 @@ def main():
logger.exception(e)
else:
# At least our basic data is available
qemu_img_convert_command = ["qemu-img", "convert", "-f", "qcow2",
"-O", "raw", image_full_path, "image.raw"]
if env_vars.get('WITHOUT_CEPH'):
image_import_command = ["mv", "image.raw", os.path.join(env_vars.get('IMAGE_DIR'), image_uuid)]
snapshot_creation_command = ["true"]
snapshot_protect_command = ["true"]
else:
image_import_command = ["rbd", "import", "image.raw",
"{}/{}".format(image_store_pool, image_uuid)]
snapshot_creation_command = ["rbd", "snap", "create",
"{}/{}@protected".format(image_store_pool, image_uuid)]
snapshot_protect_command = ["rbd", "snap", "protect",
"{}/{}@protected".format(image_store_pool, image_uuid)]
# First check whether the image is qcow2
if qemu_img_type(image_full_path) == "qcow2":
try:
# Convert .qcow2 to .raw
subprocess.check_output(qemu_img_convert_command)
# Import image either to ceph/filesystem
subprocess.check_output(image_import_command)
# Create and Protect Snapshot
subprocess.check_output(snapshot_creation_command)
subprocess.check_output(snapshot_protect_command)
except Exception as e:
logger.exception(e)
else:
# Everything is successfully done
image.value["status"] = "CREATED"
etcd_client.put(image.key, json.dumps(image.value))
# Import and Protect
r_status = image_storage_handler.import_image(src="image.raw",
dest=image_uuid,
protect=True)
if r_status:
# Everything is successfully done
image.value["status"] = "CREATED"
etcd_client.put(image.key, json.dumps(image.value))
else:
# The user provided image is either not found or of invalid format
image.value["status"] = "INVALID_IMAGE"

33
sanity_checks.py Normal file
View file

@ -0,0 +1,33 @@
import sys
import subprocess as sp
from os.path import isdir
from config import env_vars
def check():
#########################
# ucloud-image-scanner #
#########################
if env_vars.get('STORAGE_BACKEND') == 'filesystem' and not isdir(env_vars.get('IMAGE_DIR')):
print("You have set STORAGE_BACKEND to filesystem. So,"
"the {} must exists. But, it don't".format(env_vars.get('IMAGE_DIR')))
sys.exit(1)
try:
sp.check_output(['which', 'qemu-img'])
except Exception:
print("qemu-img missing")
sys.exit(1)
###############
# ucloud-host #
###############
if env_vars.get('STORAGE_BACKEND') == 'filesystem' and not isdir(env_vars.get('VM_DIR')):
print("You have set STORAGE_BACKEND to filesystem. So, the vm directory mentioned"
" in .env file must exists. But, it don't.")
sys.exit(1)
if __name__ == "__main__":
check()

View file

@ -23,16 +23,16 @@ def remaining_resources(host_specs, vms_specs):
for component in _vms_specs:
if isinstance(_vms_specs[component], str):
_vms_specs[component] = int(bitmath.parse_string(_vms_specs[component]).to_MB())
_vms_specs[component] = int(bitmath.parse_string_unsafe(_vms_specs[component]).to_MB())
elif isinstance(_vms_specs[component], list):
_vms_specs[component] = map(lambda x: int(bitmath.parse_string(x).to_MB()), _vms_specs[component])
_vms_specs[component] = map(lambda x: int(bitmath.parse_string_unsafe(x).to_MB()), _vms_specs[component])
_vms_specs[component] = reduce(lambda x, y: x + y, _vms_specs[component], 0)
for component in _remaining:
if isinstance(_remaining[component], str):
_remaining[component] = int(bitmath.parse_string(_remaining[component]).to_MB())
_remaining[component] = int(bitmath.parse_string_unsafe(_remaining[component]).to_MB())
elif isinstance(_remaining[component], list):
_remaining[component] = map(lambda x: int(bitmath.parse_string(x).to_MB()), _remaining[component])
_remaining[component] = map(lambda x: int(bitmath.parse_string_unsafe(x).to_MB()), _remaining[component])
_remaining[component] = reduce(lambda x, y: x + y, _remaining[component], 0)
_remaining.subtract(_vms_specs)

View file

@ -23,8 +23,6 @@ def main():
]:
for request_event in request_iterator:
request_entry = RequestEntry(request_event)
logger.debug("%s, %s", request_entry.key, request_entry.value)
# Never Run time critical mechanism inside timeout
# mechanism because timeout mechanism only comes
# when no other event is happening. It means under
@ -33,10 +31,10 @@ def main():
# Detect hosts that are dead and set their status
# to "DEAD", and their VMs' status to "KILLED"
logger.debug("TIMEOUT event occured")
dead_hosts = dead_host_detection()
logger.debug("Dead hosts: %s", dead_hosts)
dead_host_mitigation(dead_hosts)
if dead_hosts:
logger.debug("Dead hosts: %s", dead_hosts)
dead_host_mitigation(dead_hosts)
# If there are VMs that weren't assigned a host
# because there wasn't a host available which
@ -52,6 +50,8 @@ def main():
request_pool.put(r)
elif request_entry.type == RequestType.ScheduleVM:
logger.debug("%s, %s", request_entry.key, request_entry.value)
vm_entry = vm_pool.get(request_entry.uuid)
if vm_entry is None:
logger.info("Trying to act on {} but it is deleted".format(request_entry.uuid))
@ -67,7 +67,7 @@ def main():
hosts=[host_pool.get(request_entry.destination)])
except NoSuitableHostFound:
logger.info("Requested destination host doesn't have enough capacity"
"to hold %s", vm_entry.uuid)
"to hold %s" % vm_entry.uuid)
else:
r = RequestEntry.from_scratch(type=RequestType.InitVMMigration,
uuid=request_entry.uuid,

View file

@ -3,6 +3,7 @@ import multiprocessing as mp
import logging
from os.path import join as join_path
from sanity_checks import check
if __name__ == "__main__":
arg_parser = argparse.ArgumentParser(prog='ucloud',
@ -21,30 +22,36 @@ if __name__ == "__main__":
format="%(name)s %(asctime)s: %(levelname)s - %(message)s",
datefmt="%d-%b-%y %H:%M:%S",
)
try:
check()
if args.component == 'api':
from api.main import main
if args.component == 'api':
from api.main import main
main()
elif args.component == 'host':
from host.main import main
main()
elif args.component == 'host':
from host.main import main
hostname = args.component_args
mp.set_start_method('spawn')
main(*hostname)
elif args.component == 'scheduler':
from scheduler.main import main
hostname = args.component_args
mp.set_start_method('spawn')
main(*hostname)
elif args.component == 'scheduler':
from scheduler.main import main
main()
elif args.component == 'filescanner':
from filescanner.main import main
main()
elif args.component == 'filescanner':
from filescanner.main import main
main()
elif args.component == 'imagescanner':
from imagescanner.main import main
main()
elif args.component == 'imagescanner':
from imagescanner.main import main
main()
elif args.component == 'metadata':
from metadata.main import main
main()
elif args.component == 'metadata':
from metadata.main import main
main()
main()
except Exception as e:
logging.exception(e)
print(e)