Add Gitea support and tests

This commit is contained in:
datalets 2022-10-25 11:41:49 +02:00
commit b53e5e493e
4 changed files with 203 additions and 23 deletions

View file

@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
"""Utilities for aggregating data
"""
"""Utilities for aggregating data."""
from dribdat.user.models import Activity, User, Project
from dribdat.user import isUserActive
@ -8,17 +7,18 @@ from dribdat.database import db
from dribdat.apifetch import (
FetchGitlabProject,
FetchGithubProject,
FetchGiteaProject,
FetchBitbucketProject,
FetchDataProject,
FetchWebProject,
)
import json
import re
def GetProjectData(url):
"""Parses the Readme URL to collect remote data."""
"""Parse the Readme URL to collect remote data."""
# TODO: find a better way to decide the kind of repo
if url.find('//gitlab.com') > 0:
apiurl = url
apiurl = re.sub(r'(?i)-?/blob/[a-z]+/README.*', '', apiurl)
@ -37,6 +37,16 @@ def GetProjectData(url):
return {}
return FetchGithubProject(apiurl)
elif url.find('//codeberg.org') > 0:
apiurl = url
apiurl = re.sub(r'(?i)/src/branch/[a-z]+/README.*', '', apiurl)
apiurl = re.sub(r'https?://codeberg\.org/', '', apiurl).strip('/')
if apiurl.endswith('.git'):
apiurl = apiurl[:-4]
if apiurl == url:
return {}
return FetchGiteaProject(apiurl)
elif url.find('//bitbucket.org') > 0:
apiurl = url
apiurl = re.sub(r'(?i)/src/[a-z]+/(README)?\.?[a-z]*', '', apiurl)

View file

@ -1,12 +1,47 @@
# -*- coding: utf-8 -*-
""" Collecting events from remote repositories """
"""Collect events from remote repositories."""
import logging
import requests
from dateutil import parser
def FetchGithubCommits(full_name, since=None, until=None):
def fetch_commits_gitea(full_name, limit=10):
    """Parse data about Gitea commits.

    Args:
        full_name: repository path, e.g. "owner/repo", on codeberg.org.
        limit: maximum number of commits to request (default 10).

    Returns:
        A list of dicts with 'url', 'date', 'author' and 'message'
        keys, or an empty list if the repository cannot be synced.
    """
    apiurl = "https://codeberg.org/api/v1/repos/%s/commits?limit=%d" % (
        full_name, limit)
    data = requests.get(apiurl)
    if data.status_code != 200:
        # logging.warn() is deprecated; use warning() with lazy %-args
        logging.warning("Could not sync Gitea commits on %s", full_name)
        return []
    payload = data.json()
    if 'message' in payload:
        # The API returned an error object instead of a commit list
        logging.warning("Could not sync Gitea commits on %s: %s",
                        full_name, payload['message'])
        return []
    commitlog = []
    for entry in payload:
        if 'commit' not in entry:
            continue
        commit = entry['commit']
        datestamp = parser.parse(entry['created'])
        author = ''
        if 'committer' in commit and 'name' in commit['committer']:
            author = commit['committer']['name']
        elif 'author' in commit and 'name' in commit['author']:
            # Fixed: previously guarded on 'author' in entry but then
            # read commit['author'], which could raise KeyError.
            author = commit['author']['name']
        commitlog.append({
            'url': entry['html_url'],
            'date': datestamp,
            'author': author,
            'message': commit['message'][:256],
        })
    return commitlog
def fetch_commits_github(full_name, since=None, until=None):
"""Parse data about GitHub commits."""
apiurl = "https://api.github.com/repos/%s/commits?per_page=50" % full_name
if since is not None:
apiurl += "&since=%s" % since.replace(microsecond=0).isoformat()
@ -14,7 +49,6 @@ def FetchGithubCommits(full_name, since=None, until=None):
apiurl += "&until=%s" % until.replace(microsecond=0).isoformat()
data = requests.get(apiurl)
if data.status_code != 200:
print(data)
logging.warn("Could not sync GitHub commits on %s" % full_name)
return []
json = data.json()
@ -28,11 +62,12 @@ def FetchGithubCommits(full_name, since=None, until=None):
continue
commit = entry['commit']
datestamp = parser.parse(commit['committer']['date'])
author = ''
if 'author' in entry and \
entry['author'] is not None and \
'login' in entry['author']:
author = entry['author']['login']
else:
elif 'committer' in commit:
author = commit['committer']['name'][:100]
url = "https://github.com/%s" % full_name
if 'html_url' in entry:
@ -44,3 +79,37 @@ def FetchGithubCommits(full_name, since=None, until=None):
'message': commit['message'][:256],
})
return commitlog
def fetch_commits_gitlab(project_id: int, since=None, until=None):
    """Parse data about GitLab commits.

    Args:
        project_id: numeric GitLab project id.
        since: optional datetime; only return commits after this time.
        until: optional datetime; only return commits before this time.

    Returns:
        A list of dicts with 'url', 'date', 'author' and 'message'
        keys, or an empty list if the repository cannot be synced.
    """
    apiurl = 'https://gitlab.com/api/v4/'
    apiurl = apiurl + "projects/%d/repository/commits?" % project_id
    if since is not None:
        apiurl += "&since=%s" % since.replace(microsecond=0).isoformat()
    if until is not None:
        apiurl += "&until=%s" % until.replace(microsecond=0).isoformat()
    # Collect basic data
    data = requests.get(apiurl)
    if data.text.find('{') < 0:
        return []
    payload = data.json()
    if 'message' in payload:
        # Fixed: logging.warn() was called with a positional arg but no
        # %s placeholder, so the API error message was never logged.
        logging.warning("Could not sync GitLab commits: %s",
                        payload['message'])
        return []
    commitlog = []
    for commit in payload:
        if 'message' not in commit:
            continue
        datestamp = parser.parse(commit['created_at'])
        author = ''
        if 'author_name' in commit and \
                commit['author_name'] is not None:
            author = commit['author_name']
        commitlog.append({
            'url': commit['web_url'],
            'date': datestamp,
            'author': author,
            'message': commit['message'][:256],
        })
    return commitlog

View file

@ -1,31 +1,80 @@
# -*- coding: utf-8 -*-
"""Collecting data from third party API repositories."""
from .apievents import FetchGithubCommits
import re
import requests
import bleach
import logging
from flask import url_for
from pyquery import PyQuery as pq # noqa: N813
from base64 import b64decode
from flask_misaka import markdown
from bleach.sanitizer import ALLOWED_TAGS, ALLOWED_ATTRIBUTES
from urllib.parse import quote_plus
import re
import requests
import bleach
from .apievents import (
fetch_commits_github,
fetch_commits_gitlab,
fetch_commits_gitea,
)
from future.standard_library import install_aliases
install_aliases()
def FetchGiteaProject(project_url):
    """Download data from Codeberg, a large Gitea site.

    Args:
        project_url: repository path, e.g. "owner/repo".

    Returns:
        A dict of project fields (name, summary, description, urls and
        recent commits), or an empty dict if the repo is unreachable.
    """
    # Docs: https://codeberg.org/api/swagger
    site_root = "https://codeberg.org"
    url_q = quote_plus(project_url, '/')
    api_repos = site_root + "/api/v1/repos/%s" % url_q
    api_content = api_repos + "/contents"
    # Collect basic data
    data = requests.get(api_repos)
    if data.text.find('{') < 0:
        return {}
    json = data.json()
    if 'name' not in json:
        return {}
    # Collect the README
    data = requests.get(api_content)
    readme = ""
    if not data.text.find('{') < 0:
        readmeurl = None
        for repo_file in data.json():
            if 'readme' in repo_file['name'].lower():
                readmeurl = repo_file['download_url']
                # Fixed: the README content was fetched but discarded,
                # leaving 'description' always empty.
                readme = requests.get(readmeurl).text
                break
        if readmeurl is None:
            # Fixed: logging.info() had no %s placeholder for url_q
            logging.info("Could not find README: %s", url_q)
    issuesurl = ''
    if json['has_issues']:
        issuesurl = json['html_url'] + '/issues'
    return {
        'type': 'Gitea',
        'name': json['name'],
        'summary': json['description'],
        'description': readme,
        'source_url': json['html_url'],
        'image_url': json['avatar_url'] or json['owner']['avatar_url'],
        'contact_url': issuesurl,
        'commits': fetch_commits_gitea(url_q)
    }
def FetchGitlabProject(project_url):
"""Download data from GitLab."""
WEB_BASE = "https://gitlab.com/%s"
API_BASE = "https://gitlab.com/api/v4/projects/%s"
url_q = quote_plus(project_url)
# Collect basic data
data = requests.get(API_BASE % url_q)
if data.text.find('{') < 0:
return {}
json = data.json()
if 'name' not in json:
return {}
readmeurl = "%s/raw/master/README.md" % (WEB_BASE % project_url)
# Collect the README
readmeurl = json['readme_url'] + '?inline=false'
readmedata = requests.get(readmeurl)
readme = readmedata.text or ""
return {
@ -33,14 +82,15 @@ def FetchGitlabProject(project_url):
'name': json['name'],
'summary': json['description'],
'description': readme,
# 'homepage_url': "",
'source_url': json['web_url'],
'image_url': json['avatar_url'],
'contact_url': json['web_url'] + '/issues',
'commits': fetch_commits_gitlab(json['id'])
}
def FetchGitlabAvatar(email):
"""Download a user avatar from GitLab."""
apiurl = "https://gitlab.com/api/v4/avatar?email=%s&size=80"
data = requests.get(apiurl % email)
if data.text.find('{') < 0:
@ -52,6 +102,7 @@ def FetchGitlabAvatar(email):
def FetchGithubProject(project_url):
"""Download data from GitHub."""
API_BASE = "https://api.github.com/repos/%s"
data = requests.get(API_BASE % project_url)
if data.text.find('{') < 0:
@ -93,11 +144,12 @@ def FetchGithubProject(project_url):
'image_url': json['owner']['avatar_url'],
'contact_url': json['html_url'] + '/issues',
'download_url': json['html_url'] + '/releases',
'commits': FetchGithubCommits(repo_full_name)
'commits': fetch_commits_github(repo_full_name)
}
def FetchBitbucketProject(project_url):
"""Download data from Bitbucket."""
WEB_BASE = "https://bitbucket.org/%s"
API_BASE = "https://api.bitbucket.org/2.0/repositories/%s"
data = requests.get(API_BASE % project_url)
@ -138,11 +190,8 @@ def FetchBitbucketProject(project_url):
}
DP_VIEWER_URL = 'http://data.okfn.org/tools/view?url=%s'
def FetchDataProject(project_url):
""" Tries to load a Data Package formatted JSON file """
"""Try to load a Data Package formatted JSON file."""
# TODO: use frictionlessdata library!
data = requests.get(project_url)
if data.text.find('{') < 0:
@ -150,15 +199,15 @@ def FetchDataProject(project_url):
json = data.json()
if 'name' not in json or 'title' not in json:
return {}
text_content = project_url + '\n\n'
if 'homepage' in json:
readme_url = json['homepage']
else:
readme_url = project_url.replace('datapackage.json', 'README.md')
text_content = ""
if readme_url.startswith('http') and readme_url != project_url:
text_content = requests.get(readme_url).text
text_content = text_content + requests.get(readme_url).text
if not text_content and 'description' in json:
text_content = json['description']
text_content = text_content + json['description']
contact_url = ''
if 'maintainers' in json and \
len(json['maintainers']) > 0 and \
@ -169,7 +218,6 @@ def FetchDataProject(project_url):
'name': json['name'],
'summary': json['title'],
'description': text_content,
# 'homepage_url': DP_VIEWER_URL % project_url,
'source_url': project_url,
'image_url': url_for('static', filename='img/datapackage_icon.png',
_external=True),
@ -195,6 +243,7 @@ ALLOWED_HTML_ATTR['font'] = ['color']
def FetchWebProject(project_url):
"""Parse a remote Document, wiki or website URL."""
try:
data = requests.get(project_url)
except requests.exceptions.RequestException:
@ -219,6 +268,7 @@ def FetchWebProject(project_url):
def FetchWebGoogleDoc(text, url):
"""Help extract data from a Google doc."""
doc = pq(text)
doc("style").remove()
ptitle = doc("div#title") or doc("div#header")
@ -249,6 +299,7 @@ def FetchWebGoogleDoc(text, url):
def FetchWebCodiMD(text, url):
"""Help extract data from CodiMD."""
doc = pq(text)
ptitle = doc("title")
if len(ptitle) < 1:
@ -267,6 +318,7 @@ def FetchWebCodiMD(text, url):
def FetchWebDokuWiki(text, url):
"""Help extract data from DokuWiki."""
doc = pq(text)
ptitle = doc("span.pageId")
if len(ptitle) < 1:
@ -288,6 +340,7 @@ def FetchWebDokuWiki(text, url):
def FetchWebEtherpad(text, url):
"""Help extract data from Etherpad Lite."""
ptitle = url.split('/')[-1]
if len(ptitle) < 1:
return {}
@ -303,6 +356,7 @@ def FetchWebEtherpad(text, url):
def FetchWebInstructables(text, url):
"""Help extract data from Instructables."""
doc = pq(text)
ptitle = doc(".header-title")
if len(ptitle) < 1:

47
tests/test_aggregate.py Normal file
View file

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
"""Dribdat data aggregation tests."""
from dribdat.aggregation import GetProjectData
class TestAggregate:
    """Here be tests."""

    def _check_project(self, test_url, repo_type, min_commits=None):
        """Fetch a project and verify the shared expectations."""
        test_obj = GetProjectData(test_url)
        assert 'name' in test_obj
        assert test_obj['name'] == 'dribdat'
        assert test_obj['type'] == repo_type
        if min_commits is not None:
            assert 'commits' in test_obj
            assert len(test_obj['commits']) > min_commits

    def test_gitea(self):
        """Test parsing a Codeberg readme."""
        self._check_project(
            'https://codeberg.org/dribdat/dribdat', 'Gitea', 5)

    def test_github(self):
        """Test parsing a GitHub readme."""
        self._check_project(
            'https://github.com/dribdat/dribdat', 'GitHub', 5)

    def test_gitlab(self):
        """Test parsing a GitLab readme."""
        self._check_project(
            'https://gitlab.com/dribdat/dribdat', 'GitLab', 5)

    def test_bitbucket(self):
        """Test parsing a Bitbucket readme."""
        # TODO: support for commits
        self._check_project(
            'https://bitbucket.org/dribdat/dribdat/src/master/', 'Bitbucket')