2017-07-04 08:44:22 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
from datetime import datetime
|
2017-09-04 21:03:01 +00:00
|
|
|
from guess_language import guess_language
|
2017-07-04 08:44:22 +00:00
|
|
|
|
|
|
|
def _extract_visual(raw):
    """Return the first usable image URL of an entry, or None if there is none."""
    # Precedence mirrors the original chain: thumbnail, then enclosure, then
    # visual. A non-empty list without the expected key does NOT fall through
    # to the next source.
    thumbnails = raw.get('thumbnail')
    if thumbnails:
        return thumbnails[0].get('url')
    enclosures = raw.get('enclosure')
    if enclosures:
        return enclosures[0].get('href')
    visual = raw.get('visual')
    if visual and 'url' in visual:
        return visual['url']
    return None


def _extract_content(raw):
    """Return the entry body taken from 'summary', 'content' or 'fullContent'."""
    for key in ('summary', 'content'):
        value = raw.get(key)
        if value is None:
            continue
        # The field may be a nested object ({'content': ...}) or a plain
        # value. Only index into it when it really is a mapping; on a string,
        # "'content' in value" would be a substring test.
        if isinstance(value, dict) and 'content' in value:
            return value['content']
        return value
    if 'fullContent' in raw:
        return raw['fullContent']
    return ''


def _extract_tags(raw):
    """Return the normalised, de-duplicated tag labels of an entry, in order."""
    labels = []
    for tag in raw.get('tags') or []:
        if 'label' not in tag:
            continue
        # Commas are the separator of the joined tag string, so they must not
        # appear inside a single label.
        label = tag['label'].replace(',', '-').strip().lower()
        # Labels of three characters or fewer are dropped.
        if len(label) > 3 and label not in labels:
            labels.append(label)
    return labels


def parse(obj, raw, stream):
    """
    Populate ``obj`` from a raw JSON entry returned by the Feedly API.

    Arguments:
        obj: object whose attributes are assigned (raw, stream, entry_id,
            published, title, author, link, visual, content, lang, tags).
            Optional attributes (author, link, visual) are only assigned when
            the entry provides them, so any existing value is kept.
        raw: decoded JSON entry (dict). 'id' and 'published' are required;
            every other field is treated as optional.
        stream: value stored unchanged on ``obj.stream``.

    Returns:
        The same ``obj`` instance, for convenience.

    Raises:
        KeyError: if ``raw`` has no 'id' or 'published' key.
    """
    obj.raw = raw
    obj.stream = stream
    obj.entry_id = raw['id']

    # Date stamp handling: 'published' is divided by 1000 to get seconds
    # (presumably it is a millisecond timestamp). The result is a naive
    # datetime in the local timezone.
    ts = raw['published'] / 1000
    obj.published = datetime.fromtimestamp(ts)

    # Authorship and title
    obj.title = (raw.get('title') or '')[:250]
    origin = raw.get('origin') or {}
    # Prefer the entry's own author and fall back to the origin's title.
    author = raw.get('author') or origin.get('title')
    if author is not None:
        obj.author = author[:250]

    # Parse links and references
    alternates = raw.get('alternate') or []
    if alternates:
        obj.link = alternates[0]['href'][:500]

    visual = _extract_visual(raw)
    if visual is not None:
        obj.visual = visual[:500]
    # A literal "none" in the visual field is treated as no image at all.
    current_visual = getattr(obj, 'visual', None)
    if current_visual and current_visual.lower().strip() == 'none':
        obj.visual = ''

    # Collect text in nested JSON content
    obj.content = _extract_content(raw)

    # Detect language. Detection is best-effort: any failure yields an empty
    # language, but interpreter-level exits (KeyboardInterrupt, SystemExit)
    # are no longer swallowed.
    try:
        obj.lang = guess_language(obj.content) or ''
    except Exception:
        obj.lang = ''

    # Collect tags
    obj.tags = ','.join(_extract_tags(raw))

    return obj
|