Use homegrown date parsing
This commit is contained in:
parent
6834a6de95
commit
0d984fdc29
1 changed files with 57 additions and 3 deletions
|
@ -2,8 +2,7 @@ import re
|
|||
import enum
|
||||
import datetime
|
||||
|
||||
from dateparser.search import search_dates
|
||||
from email.header import decode_header
|
||||
from email.header import decode_header
|
||||
|
||||
from nntp.tiny.db import DatabaseTable
|
||||
|
||||
|
@ -42,6 +41,61 @@ def each_line(text: str):
|
|||
yield text[start:end]
|
||||
break
|
||||
|
||||
# Lower-cased English month names, abbreviated and spelled out, mapped to
# their 1-based month number.  Each inner tuple holds the accepted spellings
# for one month, listed in calendar order; 'may' has no separate short form.
DATE_MONTHS = {
    spelling: number
    for number, spellings in enumerate((
        ('jan', 'january'),
        ('feb', 'february'),
        ('mar', 'march'),
        ('apr', 'april'),
        ('may',),
        ('jun', 'june'),
        ('jul', 'july'),
        ('aug', 'august'),
        ('sep', 'september'),
        ('oct', 'october'),
        ('nov', 'november'),
        ('dec', 'december'),
    ), start=1)
    for spelling in spellings
}
|
||||
|
||||
# Building blocks shared by the accepted Date header layouts.  Every layout
# is anchored at both ends, so a timestamp has to match one of them in full.
_DATE_WEEKDAY = r'(?:[A-Za-z]+),\s+'
_DATE_DAY = r'(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4})'
_DATE_HM = r'(?P<hh>\d{2}):(?P<MM>\d{2})'
_DATE_HMS = _DATE_HM + r':(?P<ss>\d{2})'
_DATE_OFFSET = (
    r'(?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2})'
)
# Trailing zone comment such as " (CET)" or " (Central European Time)".
# The same character class is used for every layout that allows a comment,
# so mixed-case comments are accepted with or without a seconds field.
_DATE_ZONE_COMMENT = r' \((?:[A-Za-z ]+)\)'
_DATE_UTC = r'(?:GMT|UTC)'

# Compiled patterns tried in order.  Named groups: dd, month, yyyy, hh, MM,
# optionally ss, and optionally offset_sign / offset_hour / offset_minute.
# Layouts ending in a literal GMT/UTC expose no offset groups.
DATE_RE = [
    # Weekday, date, HH:MM:SS, numeric offset
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HMS
        + ' ' + _DATE_OFFSET + '$'
    ),
    # Weekday, date, HH:MM:SS, numeric offset, zone comment
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HMS
        + ' ' + _DATE_OFFSET + _DATE_ZONE_COMMENT + '$'
    ),
    # Weekday, date, HH:MM:SS, GMT/UTC
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HMS
        + ' ' + _DATE_UTC + '$'
    ),
    # Weekday, date, HH:MM, numeric offset
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HM
        + ' ' + _DATE_OFFSET + '$'
    ),
    # Weekday, date, HH:MM, numeric offset, zone comment
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HM
        + ' ' + _DATE_OFFSET + _DATE_ZONE_COMMENT + '$'
    ),
    # Weekday, date, HH:MM, GMT/UTC
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HM
        + ' ' + _DATE_UTC + '$'
    ),
    # Date, HH:MM:SS, numeric offset (no weekday)
    re.compile(
        '^' + _DATE_DAY + ' ' + _DATE_HMS + ' ' + _DATE_OFFSET + '$'
    ),
    # Date, HH:MM:SS, GMT/UTC (no weekday)
    re.compile(
        '^' + _DATE_DAY + ' ' + _DATE_HMS + ' ' + _DATE_UTC + '$'
    ),
]
|
||||
|
||||
def parse_timestamp(timestamp: str):
    """
    Parse a Date header value into a timezone-aware datetime.

    The value is matched against each pattern in DATE_RE in turn; the first
    pattern that matches supplies the date, time and UTC offset fields.

    Args:
        timestamp: The raw header text.  None or an empty string is
            accepted and yields the Unix epoch.

    Returns:
        A timezone-aware datetime.datetime built from the matched fields
        (UTC when the pattern carries no numeric offset), the Unix epoch in
        UTC for None/empty input, or None when no pattern matches.

    Raises:
        KeyError: The text matched a pattern but its month name is not a
            key of DATE_MONTHS.
        ValueError: A matched field is out of range for datetime.datetime
            or datetime.timezone.
    """
    if not timestamp:
        # Keep this sentinel timezone-aware like every parsed result;
        # a naive value could not be compared against them and would
        # depend on the host's local timezone.
        return datetime.datetime.fromtimestamp(0, datetime.timezone.utc)

    # Collapse padding and runs of whitespace so that a padded or folded
    # header value still lines up with the single-space patterns.
    normalized = ' '.join(timestamp.split())

    for pattern in DATE_RE:
        match = pattern.match(normalized)

        if match is None:
            continue

        capture = match.groupdict()

        # Patterns ending in GMT/UTC define no offset groups, so the
        # defaults below produce a zero offset for them.
        mult = -1 if capture.get('offset_sign', '+') == '-' else 1
        tz = datetime.timezone(datetime.timedelta(
            hours=mult * int(capture.get('offset_hour', 0)),
            minutes=mult * int(capture.get('offset_minute', 0)),
        ))

        yyyy = int(capture['yyyy'])

        if 'month' in capture:
            mm = DATE_MONTHS[capture['month'].lower()]
        else:
            # Fallback for a pattern that captures a numeric month as 'mm'.
            mm = int(capture['mm'])

        dd = int(capture['dd'])
        hh = int(capture.get('hh', 0))
        MM = int(capture.get('MM', 0))
        ss = int(capture.get('ss', 0))

        return datetime.datetime(yyyy, mm, dd, hh, MM, ss, 0, tz)

    # No known layout matched.
    return None
|
||||
|
||||
class MessageState(enum.Enum):
|
||||
EMPTY = 0
|
||||
HEADER = 1
|
||||
|
@ -147,7 +201,7 @@ class Message(DatabaseTable):
|
|||
value = self.header('Date')
|
||||
|
||||
if value is not None:
|
||||
ret = search_dates(value)[0][1]
|
||||
ret = parse_timestamp(value)
|
||||
|
||||
self._cache['created_on'] = str(ret)
|
||||
except:
|
||||
|
|
Loading…
Add table
Reference in a new issue