Add Gitea support and tests

This commit is contained in:
datalets 2022-10-25 11:41:49 +02:00
commit b53e5e493e
4 changed files with 203 additions and 23 deletions

View file

@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
"""Utilities for aggregating data
"""
"""Utilities for aggregating data."""
from dribdat.user.models import Activity, User, Project
from dribdat.user import isUserActive
@ -8,17 +7,18 @@ from dribdat.database import db
from dribdat.apifetch import (
FetchGitlabProject,
FetchGithubProject,
FetchGiteaProject,
FetchBitbucketProject,
FetchDataProject,
FetchWebProject,
)
import json
import re
def GetProjectData(url):
"""Parses the Readme URL to collect remote data."""
"""Parse the Readme URL to collect remote data."""
# TODO: find a better way to decide the kind of repo
if url.find('//gitlab.com') > 0:
apiurl = url
apiurl = re.sub(r'(?i)-?/blob/[a-z]+/README.*', '', apiurl)
@ -37,6 +37,16 @@ def GetProjectData(url):
return {}
return FetchGithubProject(apiurl)
elif url.find('//codeberg.org') > 0:
apiurl = url
apiurl = re.sub(r'(?i)/src/branch/[a-z]+/README.*', '', apiurl)
apiurl = re.sub(r'https?://codeberg\.org/', '', apiurl).strip('/')
if apiurl.endswith('.git'):
apiurl = apiurl[:-4]
if apiurl == url:
return {}
return FetchGiteaProject(apiurl)
elif url.find('//bitbucket.org') > 0:
apiurl = url
apiurl = re.sub(r'(?i)/src/[a-z]+/(README)?\.?[a-z]*', '', apiurl)

View file

@ -1,12 +1,47 @@
# -*- coding: utf-8 -*-
""" Collecting events from remote repositories """
"""Collect events from remote repositories."""
import logging
import requests
from dateutil import parser
def FetchGithubCommits(full_name, since=None, until=None):
def fetch_commits_gitea(full_name, limit=10):
    """Parse data about Gitea commits.

    Args:
        full_name: repository path, e.g. "owner/repo", on codeberg.org.
        limit: maximum number of commits to request (default 10).

    Returns:
        A list of dicts with 'url', 'date', 'author' and 'message'
        keys, or an empty list if the repository cannot be synced.
    """
    apiurl = "https://codeberg.org/api/v1/repos/%s/commits?limit=%d" % (
        full_name, limit)
    data = requests.get(apiurl)
    if data.status_code != 200:
        # logging.warn() is deprecated; use warning() with lazy %-args
        logging.warning("Could not sync Gitea commits on %s", full_name)
        return []
    payload = data.json()
    if 'message' in payload:
        # The API returned an error object instead of a commit list
        logging.warning("Could not sync Gitea commits on %s: %s",
                        full_name, payload['message'])
        return []
    commitlog = []
    for entry in payload:
        if 'commit' not in entry:
            continue
        commit = entry['commit']
        datestamp = parser.parse(entry['created'])
        author = ''
        if 'committer' in commit and 'name' in commit['committer']:
            author = commit['committer']['name']
        elif 'author' in commit and 'name' in commit['author']:
            # Fixed: previously guarded on 'author' in entry but then
            # read commit['author'], which could raise KeyError.
            author = commit['author']['name']
        commitlog.append({
            'url': entry['html_url'],
            'date': datestamp,
            'author': author,
            'message': commit['message'][:256],
        })
    return commitlog
def fetch_commits_github(full_name, since=None, until=None):
"""Parse data about GitHub commits."""
apiurl = "https://api.github.com/repos/%s/commits?per_page=50" % full_name
if since is not None:
apiurl += "&since=%s" % since.replace(microsecond=0).isoformat()
@ -14,7 +49,6 @@ def FetchGithubCommits(full_name, since=None, until=None):
apiurl += "&until=%s" % until.replace(microsecond=0).isoformat()
data = requests.get(apiurl)
if data.status_code != 200:
print(data)
logging.warn("Could not sync GitHub commits on %s" % full_name)
return []
json = data.json()
@ -28,11 +62,12 @@ def FetchGithubCommits(full_name, since=None, until=None):
continue
commit = entry['commit']
datestamp = parser.parse(commit['committer']['date'])
author = ''
if 'author' in entry and \
entry['author'] is not None and \
'login' in entry['author']:
author = entry['author']['login']
else:
elif 'committer' in commit:
author = commit['committer']['name'][:100]
url = "https://github.com/%s" % full_name
if 'html_url' in entry:
@ -44,3 +79,37 @@ def FetchGithubCommits(full_name, since=None, until=None):
'message': commit['message'][:256],
})
return commitlog
def fetch_commits_gitlab(project_id: int, since=None, until=None):
    """Parse data about GitLab commits.

    Args:
        project_id: numeric GitLab project id.
        since: optional datetime; only return commits after this time.
        until: optional datetime; only return commits before this time.

    Returns:
        A list of dicts with 'url', 'date', 'author' and 'message'
        keys, or an empty list if the repository cannot be synced.
    """
    apiurl = 'https://gitlab.com/api/v4/'
    apiurl = apiurl + "projects/%d/repository/commits?" % project_id
    if since is not None:
        apiurl += "&since=%s" % since.replace(microsecond=0).isoformat()
    if until is not None:
        apiurl += "&until=%s" % until.replace(microsecond=0).isoformat()
    # Collect basic data
    data = requests.get(apiurl)
    if data.text.find('{') < 0:
        return []
    payload = data.json()
    if 'message' in payload:
        # Fixed: logging.warn() was called with a positional arg but no
        # %s placeholder, so the API error message was never logged.
        logging.warning("Could not sync GitLab commits: %s",
                        payload['message'])
        return []
    commitlog = []
    for commit in payload:
        if 'message' not in commit:
            continue
        datestamp = parser.parse(commit['created_at'])
        author = ''
        if 'author_name' in commit and \
                commit['author_name'] is not None:
            author = commit['author_name']
        commitlog.append({
            'url': commit['web_url'],
            'date': datestamp,
            'author': author,
            'message': commit['message'][:256],
        })
    return commitlog

View file

@ -1,31 +1,80 @@
# -*- coding: utf-8 -*-
"""Collecting data from third party API repositories."""
from .apievents import FetchGithubCommits
import re
import requests
import bleach
import logging
from flask import url_for
from pyquery import PyQuery as pq # noqa: N813
from base64 import b64decode
from flask_misaka import markdown
from bleach.sanitizer import ALLOWED_TAGS, ALLOWED_ATTRIBUTES
from urllib.parse import quote_plus
import re
import requests
import bleach
from .apievents import (
fetch_commits_github,
fetch_commits_gitlab,
fetch_commits_gitea,
)
from future.standard_library import install_aliases
install_aliases()
def FetchGiteaProject(project_url):
    """Download data from Codeberg, a large Gitea site.

    Args:
        project_url: repository path, e.g. "owner/repo".

    Returns:
        A dict of project fields (name, summary, description, urls and
        recent commits), or an empty dict if the repo is unreachable.
    """
    # Docs: https://codeberg.org/api/swagger
    site_root = "https://codeberg.org"
    url_q = quote_plus(project_url, '/')
    api_repos = site_root + "/api/v1/repos/%s" % url_q
    api_content = api_repos + "/contents"
    # Collect basic data
    data = requests.get(api_repos)
    if data.text.find('{') < 0:
        return {}
    json = data.json()
    if 'name' not in json:
        return {}
    # Collect the README
    data = requests.get(api_content)
    readme = ""
    if not data.text.find('{') < 0:
        readmeurl = None
        for repo_file in data.json():
            if 'readme' in repo_file['name'].lower():
                readmeurl = repo_file['download_url']
                # Fixed: the README content was fetched but discarded,
                # leaving 'description' always empty.
                readme = requests.get(readmeurl).text
                break
        if readmeurl is None:
            # Fixed: logging.info() had no %s placeholder for url_q
            logging.info("Could not find README: %s", url_q)
    issuesurl = ''
    if json['has_issues']:
        issuesurl = json['html_url'] + '/issues'
    return {
        'type': 'Gitea',
        'name': json['name'],
        'summary': json['description'],
        'description': readme,
        'source_url': json['html_url'],
        'image_url': json['avatar_url'] or json['owner']['avatar_url'],
        'contact_url': issuesurl,
        'commits': fetch_commits_gitea(url_q)
    }
def FetchGitlabProject(project_url):
"""Download data from GitLab."""
WEB_BASE = "https://gitlab.com/%s"
API_BASE = "https://gitlab.com/api/v4/projects/%s"
url_q = quote_plus(project_url)
# Collect basic data
data = requests.get(API_BASE % url_q)
if data.text.find('{') < 0:
return {}
json = data.json()
if 'name' not in json:
return {}
readmeurl = "%s/raw/master/README.md" % (WEB_BASE % project_url)
# Collect the README
readmeurl = json['readme_url'] + '?inline=false'
readmedata = requests.get(readmeurl)
readme = readmedata.text or ""
return {
@ -33,14 +82,15 @@ def FetchGitlabProject(project_url):
'name': json['name'],
'summary': json['description'],
'description': readme,
# 'homepage_url': "",
'source_url': json['web_url'],
'image_url': json['avatar_url'],
'contact_url': json['web_url'] + '/issues',
'commits': fetch_commits_gitlab(json['id'])
}
def FetchGitlabAvatar(email):
"""Download a user avatar from GitLab."""
apiurl = "https://gitlab.com/api/v4/avatar?email=%s&size=80"
data = requests.get(apiurl % email)
if data.text.find('{') < 0:
@ -52,6 +102,7 @@ def FetchGitlabAvatar(email):
def FetchGithubProject(project_url):
"""Download data from GitHub."""
API_BASE = "https://api.github.com/repos/%s"
data = requests.get(API_BASE % project_url)
if data.text.find('{') < 0:
@ -93,11 +144,12 @@ def FetchGithubProject(project_url):
'image_url': json['owner']['avatar_url'],
'contact_url': json['html_url'] + '/issues',
'download_url': json['html_url'] + '/releases',
'commits': FetchGithubCommits(repo_full_name)
'commits': fetch_commits_github(repo_full_name)
}
def FetchBitbucketProject(project_url):
"""Download data from Bitbucket."""
WEB_BASE = "https://bitbucket.org/%s"
API_BASE = "https://api.bitbucket.org/2.0/repositories/%s"
data = requests.get(API_BASE % project_url)
@ -138,11 +190,8 @@ def FetchBitbucketProject(project_url):
}
DP_VIEWER_URL = 'http://data.okfn.org/tools/view?url=%s'
def FetchDataProject(project_url):
""" Tries to load a Data Package formatted JSON file """
"""Try to load a Data Package formatted JSON file."""
# TODO: use frictionlessdata library!
data = requests.get(project_url)
if data.text.find('{') < 0:
@ -150,15 +199,15 @@ def FetchDataProject(project_url):
json = data.json()
if 'name' not in json or 'title' not in json:
return {}
text_content = project_url + '\n\n'
if 'homepage' in json:
readme_url = json['homepage']
else:
readme_url = project_url.replace('datapackage.json', 'README.md')
text_content = ""
if readme_url.startswith('http') and readme_url != project_url:
text_content = requests.get(readme_url).text
text_content = text_content + requests.get(readme_url).text
if not text_content and 'description' in json:
text_content = json['description']
text_content = text_content + json['description']
contact_url = ''
if 'maintainers' in json and \
len(json['maintainers']) > 0 and \
@ -169,7 +218,6 @@ def FetchDataProject(project_url):
'name': json['name'],
'summary': json['title'],
'description': text_content,
# 'homepage_url': DP_VIEWER_URL % project_url,
'source_url': project_url,
'image_url': url_for('static', filename='img/datapackage_icon.png',
_external=True),
@ -195,6 +243,7 @@ ALLOWED_HTML_ATTR['font'] = ['color']
def FetchWebProject(project_url):
"""Parse a remote Document, wiki or website URL."""
try:
data = requests.get(project_url)
except requests.exceptions.RequestException:
@ -219,6 +268,7 @@ def FetchWebProject(project_url):
def FetchWebGoogleDoc(text, url):
"""Help extract data from a Google doc."""
doc = pq(text)
doc("style").remove()
ptitle = doc("div#title") or doc("div#header")
@ -249,6 +299,7 @@ def FetchWebGoogleDoc(text, url):
def FetchWebCodiMD(text, url):
"""Help extract data from CodiMD."""
doc = pq(text)
ptitle = doc("title")
if len(ptitle) < 1:
@ -267,6 +318,7 @@ def FetchWebCodiMD(text, url):
def FetchWebDokuWiki(text, url):
"""Help extract data from DokuWiki."""
doc = pq(text)
ptitle = doc("span.pageId")
if len(ptitle) < 1:
@ -288,6 +340,7 @@ def FetchWebDokuWiki(text, url):
def FetchWebEtherpad(text, url):
"""Help extract data from Etherpad Lite."""
ptitle = url.split('/')[-1]
if len(ptitle) < 1:
return {}
@ -303,6 +356,7 @@ def FetchWebEtherpad(text, url):
def FetchWebInstructables(text, url):
"""Help extract data from Instructables."""
doc = pq(text)
ptitle = doc(".header-title")
if len(ptitle) < 1:

47
tests/test_aggregate.py Normal file
View file

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
"""Dribdat data aggregation tests."""
from dribdat.aggregation import GetProjectData
class TestAggregate:
    """Here be tests."""

    def _check_project(self, test_url, repo_type, min_commits=None):
        """Fetch a project and verify the shared expectations."""
        test_obj = GetProjectData(test_url)
        assert 'name' in test_obj
        assert test_obj['name'] == 'dribdat'
        assert test_obj['type'] == repo_type
        if min_commits is not None:
            assert 'commits' in test_obj
            assert len(test_obj['commits']) > min_commits

    def test_gitea(self):
        """Test parsing a Codeberg readme."""
        self._check_project(
            'https://codeberg.org/dribdat/dribdat', 'Gitea', 5)

    def test_github(self):
        """Test parsing a GitHub readme."""
        self._check_project(
            'https://github.com/dribdat/dribdat', 'GitHub', 5)

    def test_gitlab(self):
        """Test parsing a GitLab readme."""
        self._check_project(
            'https://gitlab.com/dribdat/dribdat', 'GitLab', 5)

    def test_bitbucket(self):
        """Test parsing a Bitbucket readme."""
        # TODO: support for commits
        self._check_project(
            'https://bitbucket.org/dribdat/dribdat/src/master/', 'Bitbucket')