Use homegrown date parsing

This commit is contained in:
XANTRONIX Development 2024-11-11 12:37:48 -05:00
parent 6834a6de95
commit 0d984fdc29

View file

@ -2,8 +2,7 @@ import re
import enum
import datetime
from dateparser.search import search_dates
from email.header import decode_header
from email.header import decode_header
from nntp.tiny.db import DatabaseTable
@ -42,6 +41,61 @@ def each_line(text: str):
yield text[start:end]
break
# Lookup table from lowercase month name to month number (1-12). Each month
# is reachable through its three-letter abbreviation and its full English
# name; "may" is both, so it yields a single key.
DATE_MONTHS = {
    key: number
    for number, name in enumerate(
        (
            'january', 'february', 'march', 'april', 'may', 'june',
            'july', 'august', 'september', 'october', 'november', 'december',
        ),
        start=1,
    )
    for key in (name[:3], name)
}
# Accepted date layouts, tried in order; every pattern is anchored at both
# ends. Named groups: dd, month, yyyy, hh, MM (minutes), ss, and the optional
# offset_sign / offset_hour / offset_minute. Patterns that omit a group (no
# seconds, or a literal GMT/UTC instead of a numeric offset) simply do not
# define it, so it is absent from match.groupdict().
DATE_RE = [
    # "Mon, 11 Nov 2024 12:37:48 -0500"
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2}) (?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2})$'),
    # "Mon, 11 Nov 2024 12:37:48 -0500 (Eastern Standard Time)"
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2}) (?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2}) \((?:[A-Za-z ]+)\)$'),
    # "Mon, 11 Nov 2024 12:37:48 GMT"
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2}) (?:GMT|UTC)$'),
    # "Mon, 11 Nov 2024 12:37 -0500"
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}) (?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2})$'),
    # "Mon, 11 Nov 2024 12:37 -0500 (Eastern Standard Time)"
    # The trailing comment accepts mixed case, like the with-seconds variant
    # above; it previously allowed upper-case letters only.
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}) (?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2}) \((?:[A-Za-z ]+)\)$'),
    # "Mon, 11 Nov 2024 12:37 GMT"
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}) (?:GMT|UTC)$'),
    # "11 Nov 2024 12:37:48 -0500" (no day-of-week prefix)
    re.compile(r'^(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2}) (?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2})$'),
    # "11 Nov 2024 12:37:48 GMT" (no day-of-week prefix)
    re.compile(r'^(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2}) (?:GMT|UTC)$'),
]
def parse_timestamp(timestamp: str) -> datetime.datetime:
    """Parse a textual date into a timezone-aware ``datetime``.

    The value is matched against each pattern in ``DATE_RE`` in order and the
    first match wins. Month names are resolved case-insensitively through
    ``DATE_MONTHS``. Fields a pattern does not capture (seconds, numeric UTC
    offset) default to zero, so "GMT"/"UTC" layouts yield a +00:00 offset.

    Args:
        timestamp: The date text to parse. ``None``, an empty string, or a
            whitespace-only string is treated as "no date".

    Returns:
        An aware ``datetime`` carrying the fixed offset found in the text.
        For "no date" input, the Unix epoch in UTC.

    Raises:
        ValueError: If the text matches none of the known layouts, names an
            unknown month, or holds out-of-range field values.
    """
    # Keep the fallback timezone-aware like every other return value, so
    # callers never end up mixing naive and aware datetimes.
    if timestamp is None:
        return datetime.datetime.fromtimestamp(0, datetime.timezone.utc)

    # The patterns are anchored, so stray surrounding whitespace would
    # otherwise make a well-formed date unparseable.
    text = timestamp.strip()
    if text == '':
        return datetime.datetime.fromtimestamp(0, datetime.timezone.utc)

    # Named "pattern" rather than "re" to avoid shadowing the re module.
    for pattern in DATE_RE:
        match = pattern.match(text)
        if match is None:
            continue

        capture = match.groupdict()

        # The sign applies to both the hour and the minute component.
        sign = -1 if capture.get('offset_sign') == '-' else 1
        tz = datetime.timezone(datetime.timedelta(
            hours=sign * int(capture.get('offset_hour') or 0),
            minutes=sign * int(capture.get('offset_minute') or 0),
        ))

        month_name = capture.get('month')
        if month_name is not None:
            mm = DATE_MONTHS.get(month_name.lower())
            if mm is None:
                raise ValueError(f'Unknown month name in date: {timestamp!r}')
        else:
            # Pattern with a numeric month group instead of a month name.
            mm = int(capture['mm'])

        return datetime.datetime(
            int(capture['yyyy']),
            mm,
            int(capture['dd']),
            int(capture.get('hh') or 0),
            int(capture.get('MM') or 0),
            int(capture.get('ss') or 0),
            0,
            tzinfo=tz,
        )

    # Fail loudly instead of implicitly returning None, which callers would
    # otherwise stringify and store as the literal text "None".
    raise ValueError(f'Unrecognized date format: {timestamp!r}')
class MessageState(enum.Enum):
EMPTY = 0
HEADER = 1
@ -147,7 +201,7 @@ class Message(DatabaseTable):
value = self.header('Date')
if value is not None:
ret = search_dates(value)[0][1]
ret = parse_timestamp(value)
self._cache['created_on'] = str(ret)
except: