# -*- coding: utf-8 -*-
"""
    plnt.sync
    ~~~~~~~~~

    Does the synchronization.  Called by "manage-plnt.py sync"

    :copyright: (c) 2009 by the Werkzeug Team, see AUTHORS for more details.
    :license: BSD.
"""
import feedparser
from datetime import datetime
from werkzeug.utils import escape
from plnt.database import Blog, Entry, session
from plnt.utils import strip_tags, nl2p
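

# MIME types that mark a feed-supplied title or body as HTML/XHTML markup
# rather than plain text.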
HTML_MIMETYPES = set(['text/html', 'application/xhtml+xml'])


def sync():
    """
    Performs a synchronization.  Articles that are already synchronized
    aren't touched anymore.
    """
    for blog in Blog.query.all():
        # parse the feed.  feedparser.parse will never raise an exception,
        # but the bozo bit might be set.
        feed = feedparser.parse(blog.feed_url)
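        # feed-level metadata.  fall back to the configured blog name if
        # the feed does not advertise an author.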
        blog_author = feed.get('author') or blog.name
        blog_author_detail = feed.get('author_detail')

        for entry in feed.entries:
            # get the guid.  either the id if specified, otherwise the link.
            # if none is available we skip the entry.
            guid = entry.get('id') or entry.get('link')
            if not guid:
                continue

            # get an old entry for the guid to check if we need to update
            # or recreate the item.
            old_entry = Entry.query.filter_by(guid=guid).first()

            # get title, url and text.  skip if no title or no text is
            # given.  if the link is missing we use the blog link.
            if 'title_detail' in entry:
                title = entry.title_detail.get('value') or ''
                if entry.title_detail.get('type') in HTML_MIMETYPES:
                    title = strip_tags(title)
                else:
                    title = escape(title)
            else:
                title = entry.get('title')
            url = entry.get('link') or blog.blog_url
            text = (entry.content[0] if 'content' in entry
                    else entry.get('summary_detail'))

            if not title or not text:
                continue
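
            # at this point `text` is a feedparser content or summary_detail
            # dict with 'type' and 'value' keys.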
            # if we have an html text we use that, otherwise we HTML
            # escape the text and use that one.  We also handle XHTML
            # with our tag soup parser for the moment.
            if text.get('type') not in HTML_MIMETYPES:
                text = escape(nl2p(text.get('value') or ''))
            else:
                text = text.get('value') or ''

            # no text?  continue.
            if not text.strip():
                continue

            # get the pub date and updated date.  This is rather complex
            # because different feeds do different stuff.
            pub_date = entry.get('published_parsed') or \
                       entry.get('created_parsed') or \
                       entry.get('date_parsed')
            updated = entry.get('updated_parsed') or pub_date
            pub_date = pub_date or updated

            # if we don't have a pub_date we skip.
            if not pub_date:
                continue

            # convert the time tuples to datetime objects.  feedparser
            # normalizes all parsed dates to UTC time.struct_time values;
            # the first six fields map onto datetime's arguments.
            pub_date = datetime(*pub_date[:6])
            updated = datetime(*updated[:6])
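            # skip entries that have not changed since the last sync.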
            if old_entry and updated <= old_entry.last_update:
                continue

            # create a new entry object based on the data collected or
            # update the old one.
            entry = old_entry or Entry()
            entry.blog = blog
            entry.guid = guid
            entry.title = title
            entry.url = url
            entry.text = text
            entry.pub_date = pub_date
            entry.last_update = updated
            session.add(entry)

    session.commit()
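

# minimal usage sketch -- essentially what "manage-plnt.py sync" does,
# assuming the database has been initialized:
#
#     from plnt.sync import sync
#     sync()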