Tweak Git image parsing

This commit is contained in:
Oleg Lavrovsky 2023-12-22 17:46:00 +01:00
parent dbd070aedf
commit c1a7af0443
No known key found for this signature in database
GPG Key ID: 31E523030632FF4B
4 changed files with 34 additions and 13 deletions

View File

@ -15,7 +15,9 @@ from .apievents import (
fetch_commits_gitlab,
fetch_commits_gitea,
)
from .utils import sanitize_url, load_presets, load_yaml_presets
from .utils import (
sanitize_url, load_presets, load_yaml_presets, fix_relative_links
)
from future.standard_library import install_aliases
install_aliases()
@ -154,17 +156,7 @@ def FetchGithubProject(project_url):
readme = b64decode(readme['content']).decode('utf-8')
# Fix relative links in text
imgroot = "https://raw.githubusercontent.com"
readme = re.sub(
r"<img src=\"(?!http)",
"<img src=\"%s/%s/%s/" % (imgroot, repo_full_name, default_branch),
readme
)
readme = re.sub(
r"\!\[(.*)\]\((?!http)",
# TODO check why we are using \g escape here?
r"![\g<1>](%s/%s/%s/" % (imgroot, repo_full_name, default_branch),
readme
)
readme = fix_relative_links(readme, imgroot, repo_full_name, default_branch)
return {
'type': 'GitHub',
'name': json['name'],

View File

@ -166,3 +166,19 @@ def load_yaml_presets(filename, by_col='name', filepath=None):
with open(fn, mode='r') as file:
config = load_presets(file, filename, by_col)
return config
def fix_relative_links(readme, imgroot, repo_full_name, default_branch):
    """Ensure that image links in a Markdown README are absolute.

    Two kinds of references are rewritten by prepending
    ``<imgroot>/<repo_full_name>/<default_branch>/`` to their target:

    * HTML attributes of the form `` src="..."``
    * Markdown images of the form ``![alt](...)``

    Targets that start with ``http``, ``//`` (protocol-relative) or
    ``data:`` (inline data URI) are left untouched.

    :param readme: the README text; an empty or ``None`` value is
        returned unchanged
    :param imgroot: base URL of the raw file host
    :param repo_full_name: repository path in ``owner/name`` form
    :param default_branch: branch name the files are served from
    :returns: the README text with relative image links made absolute
    """
    if not readme:
        return readme
    prefix = "%s/%s/%s/" % (imgroot, repo_full_name, default_branch)
    # Replacement callables are used instead of template strings so that
    # the prefix is inserted literally: a template would interpret any
    # backslash in the repository-derived values as a regex escape.
    readme = re.sub(
        r" src=\"(?!http|//|data:)",
        lambda match: ' src="' + prefix,
        readme
    )
    # The alt text is matched non-greedily: a greedy match would span from
    # the first "![" to the last "](" on a line, so only the last of
    # several images on the same line would be rewritten.
    readme = re.sub(
        r"\!\[(.*?)\]\((?!http|//|data:)",
        lambda match: "![%s](%s" % (match.group(1), prefix),
        readme
    )
    return readme

View File

@ -20,7 +20,6 @@ class TestRepository:
assert 'dribdat' in test_obj['description']
assert 'dribdat/dribdat' in test_obj['source_url']
def test_datapackage_dribdat(self):
"""Test parsing a dribdat Data Package."""
test_url = 'https://raw.githubusercontent.com/dribdat/dribdat/main/tests/mock/datapackage.json'
@ -70,6 +69,8 @@ class TestRepository:
assert test_obj['type'] == 'GitHub'
assert 'commits' in test_obj
assert len(test_obj['commits']) > 5
assert 'src="dribdat/static/img' not in test_obj['description']
assert 'src="https://raw.githubusercontent.com/dribdat/dribdat/main/dribdat/static/img' in test_obj['description']
def test_github_other(self):
"""Test parsing a GitHub Markdown file."""

View File

@ -8,6 +8,7 @@ from dribdat.aggregation import (
FetchWebProject,
ProjectActivity,
)
from .utils import fix_relative_links
from .factories import ProjectFactory
from .mock.project_data import project_data
@ -68,6 +69,7 @@ class TestSync:
assert test_obj['source_url'] == test_url
assert 'Guidelines' in test_obj['description']
def test_googledoc(self):
"""Test parsing a Google Document."""
# Handbook to Hackathons with Dribdat
@ -76,3 +78,13 @@ class TestSync:
assert 'description' in test_obj
assert 'Handbook' in test_obj['description']
def test_fix_relative_links(self):
    """Test that relative Markdown and HTML image links become absolute."""
    imgroot = "https://raw.githubusercontent.com"
    repo_full_name = "dribdat/dribdat"
    default_branch = "main"
    prefix = "%s/%s/%s/" % (imgroot, repo_full_name, default_branch)
    readme = '![hello there](world.png) <img title="hello" src="again.jpg">'
    readme = fix_relative_links(readme, imgroot, repo_full_name, default_branch)
    # The relative forms must be gone ...
    assert '(world.png)' not in readme
    assert '"again.jpg"' not in readme
    # ... and each link must carry the full prefix; checking only that
    # imgroot occurs somewhere would pass even if one link was missed.
    assert ('(%sworld.png)' % prefix) in readme
    assert ('src="%sagain.jpg"' % prefix) in readme