# -*- coding: utf-8 -*-
"""Collecting data from third party API repositories."""

import requests
import bleach

from flask import current_app
from pyquery import PyQuery as pq  # noqa: N813
from base64 import b64decode
from bleach.sanitizer import ALLOWED_ATTRIBUTES
from urllib.parse import quote_plus

from .apievents import (
    fetch_commits_github,
    fetch_commits_gitlab,
    fetch_commits_gitea,
)
from .utils import (
    sanitize_url,
    load_presets,
    load_yaml_presets,
    fix_relative_links,
    markdownit,
)

from future.standard_library import install_aliases

install_aliases()

# In seconds, how long to wait for API response
REQUEST_TIMEOUT = 10


def FetchStageConfig(url, top_element="stages", by_col="name"):
    """Download a remote YAML stages configuration."""
    if not url.startswith("http:") and not url.startswith("https:"):
        current_app.logger.info("Loading stages from file")
        return load_yaml_presets(top_element, by_col, url)
    current_app.logger.info("Loading stages from URL")
    data = requests.get(url, timeout=REQUEST_TIMEOUT)
    if data.text.find("stages:") < 0:
        current_app.logger.debug("No stage data: %s", data.text)
        return {}
    blob = data.text
    return load_presets(blob, top_element, by_col)


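# Usage sketch (illustrative only): FetchStageConfig accepts either a local
# file path or an http(s) URL pointing at a YAML document with a top-level
# "stages:" key, and returns a dict keyed by stage name. The URL below is a
# hypothetical example, not a real endpoint.
#
#     stages = FetchStageConfig("https://example.org/my-event/stages.yaml")
#     for stage_name, stage in stages.items():
#         print(stage_name, stage)

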
def FetchGiteaProject(project_url):
    """Download data from Codeberg, a large Gitea site."""
    # Docs: https://codeberg.org/api/swagger
    site_root = "https://codeberg.org"
    url_q = quote_plus(project_url, "/")
    api_repos = site_root + "/api/v1/repos/%s" % url_q
    api_content = api_repos + "/contents"
    # Collect basic data
    current_app.logger.info("Fetching Gitea: %s", url_q)
    data = requests.get(api_repos, timeout=REQUEST_TIMEOUT)
    if data.text.find("{") < 0:
        current_app.logger.debug("No data: %s", data.text)
        return {}
    json = data.json()
    if "name" not in json:
        current_app.logger.debug("Invalid data: %s", data.text)
        return {}
    # Collect the README
    data = requests.get(api_content, timeout=REQUEST_TIMEOUT)
    readme = ""
    if not data.text.find("{") < 0:
        readmeurl = None
        for repo_file in data.json():
            if "readme" in repo_file["name"].lower():
                readmeurl = repo_file["download_url"]
                readmedata = requests.get(readmeurl, timeout=REQUEST_TIMEOUT)
                readme = readmedata.text
                break
        if readmeurl is None:
            current_app.logger.info("Could not find README: %s", url_q)
    issuesurl = ""
    if json["has_issues"]:
        issuesurl = json["html_url"] + "/issues"
    return {
        "type": "Gitea",
        "name": json["name"],
        "summary": json["description"],
        "description": readme,
        "source_url": json["html_url"],
        "image_url": json["avatar_url"] or json["owner"]["avatar_url"],
        "contact_url": issuesurl,
        "commits": fetch_commits_gitea(url_q),
    }


def FetchGitlabProject(project_url):
    """Download data from GitLab."""
    WEB_BASE = "https://gitlab.com"
    API_BASE = WEB_BASE + "/api/v4/projects/%s"
    url_q = quote_plus(project_url)
    # Collect basic data
    current_app.logger.info("Fetching GitLab: %s", url_q)
    data = requests.get(API_BASE % url_q, timeout=REQUEST_TIMEOUT)
    if data.text.find("{") < 0:
        current_app.logger.debug("No data: %s", data.text)
        return {}
    json = data.json()
    if "name" not in json:
        current_app.logger.debug("Invalid data: %s", data.text)
        return {}
    # Collect the README
    readmeurl = json["readme_url"] + "?inline=false"
    readmeurl = readmeurl.replace("-/blob/", "-/raw/")
    readmedata = requests.get(readmeurl, timeout=REQUEST_TIMEOUT)
    readme = readmedata.text or ""
    return {
        "type": "GitLab",
        "name": json["name"],
        "summary": json["description"],
        "description": readme,
        "source_url": json["web_url"],
        "image_url": json["avatar_url"],
        "contact_url": json["web_url"] + "/issues",
        "commits": fetch_commits_gitlab(json["id"]),
    }


def FetchGitlabAvatar(email):
    """Download a user avatar from GitLab."""
    apiurl = "https://gitlab.com/api/v4/avatar?email=%s&size=80"
    data = requests.get(apiurl % email, timeout=REQUEST_TIMEOUT)
    if data.text.find("{") < 0:
        current_app.logger.debug("No data: %s", data.text)
        return None
    json = data.json()
    if "avatar_url" not in json:
        return None
    return json["avatar_url"]


def FetchGithubProject(project_url):
    """Download data from GitHub."""
    API_BASE = "https://api.github.com/repos/%s"
    current_app.logger.info("Fetching GitHub: %s", project_url)
    data = requests.get(API_BASE % project_url, timeout=REQUEST_TIMEOUT)
    if data.text.find("{") < 0:
        current_app.logger.debug("No data: %s", data.text)
        return {}
    json = data.json()
    if "name" not in json or "full_name" not in json:
        current_app.logger.debug("Invalid data: %s", data.text)
        return {}
    repo_full_name = json["full_name"]
    default_branch = json["default_branch"] or "main"
    readmeurl = "%s/readme" % (API_BASE % project_url)
    readmedata = requests.get(readmeurl, timeout=REQUEST_TIMEOUT)
    readme = ""
    if readmedata.text.find("{") < 0:
        current_app.logger.debug("No readme: %s", data.text)
    else:
        readme = readmedata.json()
        if "content" not in readme:
            readme = ""
        else:
            # Convert from base64
            readme = b64decode(readme["content"]).decode("utf-8")  # type: ignore
            # Fix relative links in text
            imgroot = "https://raw.githubusercontent.com"
            readme = fix_relative_links(readme, imgroot, repo_full_name, default_branch)
    return {
        "type": "GitHub",
        "name": json["name"],
        "summary": json["description"],
        "description": readme,
        "webpage_url": json["homepage"],
        "source_url": json["html_url"],
        "image_url": json["owner"]["avatar_url"],
        "contact_url": json["html_url"] + "/issues",
        "download_url": json["html_url"] + "/releases",
        "commits": fetch_commits_github(repo_full_name),
    }


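# Usage sketch (illustrative only): the GitHub fetcher expects an
# "owner/repository" slug rather than a full URL, and returns a plain dict of
# project fields (or an empty dict when the repository cannot be read). The
# slug below is a hypothetical example.
#
#     project = FetchGithubProject("example-owner/example-repo")
#     if project:
#         print(project["name"], project["source_url"])

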
def FetchGithubIssue(project_url, issue_id):
    """Download an issue from GitHub."""
    project_data = FetchGithubProject(project_url)
    current_app.logger.info("Fetching GitHub Issue: %s", issue_id)
    API_BASE = "https://api.github.com/repos/%s/issues/%d"
    data = requests.get(API_BASE % (project_url, issue_id), timeout=REQUEST_TIMEOUT)
    if data.text.find("{") < 0:
        current_app.logger.debug("No data: %s", data.text)
        return {}
    json = data.json()
    if "title" not in json or "body" not in json:
        current_app.logger.debug("Invalid data: %s", data.text)
        return {}
    project_data["hashtag"] = "#%d" % issue_id
    project_data["summary"] = project_data["name"]
    project_data["name"] = json["title"][:77]
    project_data["description"] = json["body"]
    return project_data


def FetchBitbucketProject(project_url):
    """Download data from Bitbucket."""
    WEB_BASE = "https://bitbucket.org/%s"
    API_BASE = "https://api.bitbucket.org/2.0/repositories/%s"
    current_app.logger.info("Fetching Bitbucket: %s", project_url)
    data = requests.get(API_BASE % project_url, timeout=REQUEST_TIMEOUT)
    if data.text.find("{") < 0:
        current_app.logger.debug("No data at: %s", project_url)
        return {}
    json = data.json()
    if "name" not in json:
        current_app.logger.debug("Invalid format at: %s", project_url)
        return {}
    # Try the common README extensions until one resolves
    readme = ""
    for docext in [".md", ".rst", ".txt", ""]:
        readmedata = requests.get(
            API_BASE % project_url + "/src/HEAD/README" + docext,
            timeout=REQUEST_TIMEOUT,
        )
        if readmedata.text.find('{"type":"error"') != 0:
            readme = readmedata.text
            break
    web_url = WEB_BASE % project_url
    contact_url = json["website"] or web_url
    if json["has_issues"]:
        contact_url = "%s/issues" % web_url
    image_url = ""
    if (
        "project" in json
        and "links" in json["project"]
        and "avatar" in json["project"]["links"]
    ):
        image_url = json["project"]["links"]["avatar"]["href"]
    elif "links" in json and "avatar" in json["links"]:
        image_url = json["links"]["avatar"]["href"]
    return {
        "type": "Bitbucket",
        "name": json["name"],
        "summary": json["description"],
        "description": readme,
        "webpage_url": json["website"],
        "source_url": web_url,
        "image_url": image_url,
        "contact_url": contact_url,
    }


def FetchDataProject(datapackage_url):
    """Try to load a Data Package formatted JSON file."""
    # TODO: use frictionlessdata library!
    project_url = datapackage_url.replace("datapackage.json", "")
    project_url = sanitize_url(project_url) + "datapackage.json"
    data = requests.get(project_url, timeout=REQUEST_TIMEOUT)
    # TODO: treat dribdat events as special
    current_app.logger.info("Fetching Data Package: %s", project_url)
    if data.text.find("{") < 0:
        current_app.logger.debug("No data at: %s", project_url)
        return {}
    json = data.json()
    contact_url = ""
    if "name" not in json or "title" not in json:
        current_app.logger.debug("Invalid format at: %s", project_url)
        return {}
    try:
        text_content = parse_data_package(json)
    except KeyError:
        text_content = "(Could not parse Data Package contents)"
    if "homepage" in json:
        contact_url = json["homepage"] or ""
    elif (
        "maintainers" in json
        and len(json["maintainers"]) > 0
        and "web" in json["maintainers"][0]
    ):
        contact_url = json["maintainers"][0]["web"]
    return {
        "type": "Data Package",
        "name": json["name"],
        "summary": json["title"],
        "description": text_content,
        "source_url": project_url,
        "logo_icon": "box-open",
        "contact_url": contact_url,
    }


def parse_data_package(json):
    """Extract contents of a Data Package."""
    text_content = ""
    if "description" in json:
        text_content = json["description"] + "\n\n"
    if "resources" in json:
        text_content = text_content + "\n### Resources\n\n"
        for r in json["resources"]:
            rn = r["name"]
            if "path" in r:
                rn = "[%s](%s)" % (rn, r["path"])
            text_content = text_content + "- " + rn + "\n"
    if "sources" in json:
        text_content = text_content + "\n### Sources\n\n"
        for r in json["sources"]:
            rn = r["title"]
            if "path" in r:
                rn = "[%s](%s)" % (rn, r["path"])
            text_content = text_content + "- " + rn + "\n"
    if text_content == "":
        raise KeyError("No content")
    return text_content


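# Usage sketch (illustrative only): parse_data_package turns the description,
# resources and sources of a Data Package dict into a Markdown summary, and
# raises KeyError when none of those fields are present. The dict below is a
# hypothetical, minimal example.
#
#     summary = parse_data_package({
#         "description": "Sample package",
#         "resources": [{"name": "data", "path": "data.csv"}],
#     })
#     # -> "Sample package\n\n\n### Resources\n\n- [data](data.csv)\n"

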
def FetchDribdatProject(dribdat_url):
    """Try to load a Dribdat project from a remote page."""
    project_url = dribdat_url.replace("/project/", "/api/project/")
    project_url = sanitize_url(project_url) + "?full=1"
    data = requests.get(project_url, timeout=REQUEST_TIMEOUT)
    # TODO: treat dribdat events as special
    current_app.logger.info("Fetching Dribdat site: %s", project_url)
    if data.text.find("{") < 0:
        current_app.logger.debug("No data at: %s", project_url)
        return {}
    json = data.json()
    if "project" not in json or "event" not in json:
        current_app.logger.debug("Invalid format at: %s", project_url)
        return {}
    projectdata = json["project"]
    projectdata["type"] = "Dribdat"
    projectdata["description"] = projectdata["longtext"]
    return projectdata


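# Usage sketch (illustrative only): a public project page such as
# https://example.org/project/42 is rewritten to the matching
# /api/project/42?full=1 endpoint before fetching. The URL below refers to a
# hypothetical remote dribdat instance.
#
#     remote = FetchDribdatProject("https://example.org/project/42")
#     if remote:
#         print(remote.get("name"), remote["type"])

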
# Basis: https://github.com/mozilla/bleach/blob/master/bleach/sanitizer.py#L16
ALLOWED_HTML_TAGS = [
    "acronym",
    "a",
    "blockquote",
    "li",
    "abbr",
    "strong",
    "b",
    "i",
    "ul",
    "ol",
    "code",
    "em",
    "img",
    "font",
    "center",
    "sub",
    "sup",
    "pre",
    "table",
    "tr",
    "thead",
    "tbody",
    "td",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "p",
    "u",
]
ALLOWED_HTML_ATTR = ALLOWED_ATTRIBUTES
ALLOWED_HTML_ATTR["h1"] = ["id"]
ALLOWED_HTML_ATTR["h2"] = ["id"]
ALLOWED_HTML_ATTR["h3"] = ["id"]
ALLOWED_HTML_ATTR["h4"] = ["id"]
ALLOWED_HTML_ATTR["h5"] = ["id"]
ALLOWED_HTML_ATTR["a"] = ["href", "title", "class", "name"]
ALLOWED_HTML_ATTR["img"] = ["src", "width", "height", "alt", "class"]
ALLOWED_HTML_ATTR["font"] = ["color"]


def RequestRemoteContent(project_url):
    """Fetch the raw text of a remote resource, or None on error."""
    try:
        # TODO: the admin should be able to whitelist a range of allowed
        # online resources controlling the domains from which we can
        # fetch remote content.
        project_url = sanitize_url(project_url)
        current_app.logger.info("Fetching: %s", project_url)
        data = requests.get(project_url, timeout=REQUEST_TIMEOUT)
        return data.text or None
    except requests.exceptions.RequestException:
        current_app.logger.warning("Could not connect to %s" % project_url)
        return None


def FetchWebProject(project_url):
    """Parse a remote Document, wiki or website URL."""

    datatext = RequestRemoteContent(project_url)
    if datatext is None:
        return {}

    # Google Document
    if project_url.startswith("https://docs.google.com/document"):
        return FetchWebGoogleDoc(datatext, project_url)
    # Instructables
    elif project_url.startswith("https://www.instructables.com/"):
        return FetchWebInstructables(datatext, project_url)
    # Pretalx
    elif datatext.find('<meta name="generator" content="pretalx"') > 0:
        return FetchWebPretalx(datatext, project_url)
    # DokuWiki
    elif datatext.find('<meta name="generator" content="DokuWiki"') > 0:
        return FetchWebDokuWiki(datatext, project_url)
    # Etherpad
    elif datatext.find("pad.importExport.exportetherpad") > 0:
        return FetchWebEtherpad(datatext, project_url)
    # CodiMD / HackMD
    elif datatext.find('<div id="doc" ') > 0:
        return FetchWebCodiMD(datatext, project_url)

    return {}


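# Usage sketch (illustrative only): FetchWebProject dispatches on the URL or
# on generator markers found in the fetched HTML, so callers only pass a
# single address. The URL below is a hypothetical Etherpad.
#
#     doc = FetchWebProject("https://pad.example.org/p/my-hack-notes")
#     if doc:
#         print(doc["type"], doc["name"])

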
def FetchWebGoogleDoc(text, url):
    """Help extract data from a Google doc."""
    doc = pq(text)
    doc("style").remove()
    ptitle = doc("div#title") or doc("div#header")
    if len(ptitle) < 1:
        return {}
    content = doc("div#contents")
    if len(content) < 1:
        return {}
    content = str(content.html()).strip()
    if not content or len(content) < 1:
        return {}
    html_content = bleach.clean(
        content,
        strip=True,
        tags=frozenset(ALLOWED_HTML_TAGS),
        attributes=ALLOWED_HTML_ATTR,
    )
    obj = {}
    # {
    #   'type': 'Google', ...
    #   'name': name,
    #   'summary': summary,
    #   'description': html_content,
    #   'image_url': image_url
    #   'source_url': project_url,
    # }
    obj["type"] = "Google Docs"
    obj["name"] = ptitle.text()
    obj["description"] = html_content
    obj["source_url"] = url
    obj["logo_icon"] = "paperclip"
    return obj


def FetchWebCodiMD(text, url):
    """Help extract data from CodiMD."""
    doc = pq(text)
    ptitle = doc("title")
    if len(ptitle) < 1:
        return {}
    content = str(doc("div#doc").html())
    if len(content) < 1:
        return {}
    obj = {}
    obj["type"] = "Markdown"
    obj["name"] = ptitle.text()
    obj["description"] = markdownit(content)
    obj["source_url"] = url
    obj["logo_icon"] = "outdent"
    return obj


def FetchWebDokuWiki(text, url):
    """Help extract data from DokuWiki."""
    doc = pq(text)
    ptitle = doc(".pageId")
    if len(ptitle) < 1:
        return {}
    title = str(ptitle.text()).replace("project:", "")
    if len(title) < 1:
        return {}
    content = doc("#dokuwiki__content")
    if len(content) < 1:
        content = doc("div.dw-content")
        if len(content) < 1:
            return {}
    html_content = bleach.clean(
        str(content.html()).strip(),
        strip=True,
        tags=ALLOWED_HTML_TAGS,
        attributes=ALLOWED_HTML_ATTR,
    )
    obj = {}
    obj["type"] = "DokuWiki"
    obj["name"] = title
    obj["description"] = html_content
    obj["source_url"] = url
    obj["logo_icon"] = "list-ul"
    return obj


def FetchWebEtherpad(text, url):
    """Help extract data from Etherpad Lite."""
    ptitle = url.split("/")[-1]
    if len(ptitle) < 1:
        return {}
    text_content = requests.get("%s/export/txt" % url, timeout=REQUEST_TIMEOUT).text
    obj = {}
    obj["type"] = "Etherpad"
    obj["name"] = ptitle.replace("_", " ")
    obj["description"] = text_content
    obj["source_url"] = url
    obj["logo_icon"] = "pen"
    return obj


def FetchWebInstructables(text, url):
    """Help extract data from Instructables."""
    doc = pq(text)
    ptitle = doc(".header-title")
    content = doc(".main-content")
    if len(content) < 1 or len(ptitle) < 1:
        return {}
    html_content = ParseInstructablesPage(content)
    obj = {}
    obj["type"] = "Instructables"
    obj["name"] = ptitle.text()
    obj["description"] = html_content
    obj["source_url"] = url
    obj["logo_icon"] = "wrench"
    return obj


def FetchWebGitHub(url):
    """Grab a Markdown source from a GitHub link."""
    if not url.endswith(".md") or "/blob/" not in url:
        return {}
    filename = url.split("/")[-1].replace(".md", "")
    rawurl = url.replace("/blob/", "/raw/").replace("https://github.com/", "")
    rawdata = requests.get("https://github.com/" + rawurl, timeout=REQUEST_TIMEOUT)
    text_content = rawdata.text or ""
    return {
        "type": "Markdown",
        "name": filename,
        "description": text_content,
        "source_url": url,
        "logo_icon": "outdent",
    }


def FetchWebGitHubGist(url):
    """Grab a Markdown source from a GitHub Gist link."""
    rawurl = url.replace("https://gist.github.com/", "") + "/raw"
    rawdata = requests.get(
        "https://gist.githubusercontent.com/" + rawurl, timeout=REQUEST_TIMEOUT
    )
    text_content = rawdata.text or ""
    return {
        "type": "Markdown",
        "name": "Gist",
        "description": text_content,
        "source_url": url,
        "logo_icon": "outdent",
    }


def ParseInstructablesPage(content):
    """Create an HTML summary of content."""
    html_content = ""
    for step in content.find(".step"):
        step_title = pq(step).find(".step-title")
        if step_title is not None:
            html_content += "<h3>%s</h3>" % step_title.text()
        # Grab photos
        for img in pq(step).find("noscript"):
            img_html = str(pq(img).html())
            if "{{ file" not in img_html:
                html_content += img_html
        # Iterate through body
        step_content = pq(step).find(".step-body")
        if step_content is None:
            continue
        for elem in pq(step_content).children():
            elem_tag, p = ParseInstructablesElement(elem)
            if elem_tag is None:
                continue
            html_content += "<%s>%s</%s>" % (elem_tag, p, elem_tag)
    return html_content


def ParseInstructablesElement(elem):
    """Check and return minimal contents."""
    if elem.tag == "pre":
        if elem.text is None:
            return None, None
        return "pre", elem.text
    else:
        p = pq(elem).html()
        if p is None:
            return None, None
        p = bleach.clean(
            str(p).strip(),
            strip=True,
            tags=ALLOWED_HTML_TAGS,
            attributes=ALLOWED_HTML_ATTR,
        )
        return elem.tag, p


def FetchWebPretalx(text, url):
    """Grab Pretalx data from a talk."""
    if "/talk/" not in url:
        return {}
    doc = pq(text)
    apiurl = doc('link[@rel="alternate"]').attr("href")
    rawdata = requests.get(str(apiurl) + "?format=json", timeout=REQUEST_TIMEOUT)
    if rawdata.text.find("{") < 0:
        return {}
    jsondata = rawdata.json()
    return {
        "type": "Pretalx",
        "name": jsondata["title"],
        "summary": jsondata["abstract"][:2000],
        "description": jsondata["description"],
        "source_url": url,
        "logo_icon": "window-maximize",
    }