Use homegrown date parsing
This commit is contained in:
parent
6834a6de95
commit
0d984fdc29
1 changed files with 57 additions and 3 deletions
|
@ -2,8 +2,7 @@ import re
|
||||||
import enum
|
import enum
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
from dateparser.search import search_dates
|
from email.header import decode_header
|
||||||
from email.header import decode_header
|
|
||||||
|
|
||||||
from nntp.tiny.db import DatabaseTable
|
from nntp.tiny.db import DatabaseTable
|
||||||
|
|
||||||
|
@ -42,6 +41,61 @@ def each_line(text: str):
|
||||||
yield text[start:end]
|
yield text[start:end]
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Full English month names in calendar order; the lookup table below is
# derived from these so each month is spelled out exactly once.
_MONTH_NAMES = (
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
)

# Maps a lower-case month name, or its three-letter abbreviation, to the
# month number 1-12. 'may' is its own abbreviation, so it appears once.
DATE_MONTHS = {
    key: number
    for number, name in enumerate(_MONTH_NAMES, start=1)
    for key in (name[:3], name)
}
|
||||||
|
|
||||||
|
# Building blocks shared by every accepted date layout. Composing the
# patterns from these keeps the variants consistent with one another.
_DATE_WEEKDAY = r'(?:[A-Za-z]+),\s+'
_DATE_DAY = r'(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4})'
_DATE_HMS = r'(?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2})'
_DATE_HM = r'(?P<hh>\d{2}):(?P<MM>\d{2})'
_DATE_OFFSET = (
    r'(?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2})'
)
# Trailing parenthesised zone name, e.g. "(CET)" or "(Central European Time)".
_DATE_COMMENT = r' \((?:[A-Za-z ]+)\)'
_DATE_UTC = r'(?:GMT|UTC)'

# Accepted date layouts, tried in order. Every pattern is anchored at both
# ends and exposes its fields through named groups: dd, month, yyyy, hh, MM,
# optionally ss, and optionally offset_sign / offset_hour / offset_minute.
# Layouts ending in a literal GMT/UTC carry no offset groups.
DATE_RE = [
    # Mon, 6 Jan 2020 10:20:30 +0100
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HMS
        + ' ' + _DATE_OFFSET + '$'
    ),
    # Mon, 6 Jan 2020 10:20:30 +0100 (CET)
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HMS
        + ' ' + _DATE_OFFSET + _DATE_COMMENT + '$'
    ),
    # Mon, 6 Jan 2020 10:20:30 GMT
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HMS
        + ' ' + _DATE_UTC + '$'
    ),
    # Mon, 6 Jan 2020 10:20 +0100
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HM
        + ' ' + _DATE_OFFSET + '$'
    ),
    # Mon, 6 Jan 2020 10:20 +0100 (CET)
    # The comment previously accepted upper-case letters only here, unlike
    # the seconds-bearing variant above; both now share one fragment.
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HM
        + ' ' + _DATE_OFFSET + _DATE_COMMENT + '$'
    ),
    # Mon, 6 Jan 2020 10:20 GMT
    re.compile(
        '^' + _DATE_WEEKDAY + _DATE_DAY + ' ' + _DATE_HM
        + ' ' + _DATE_UTC + '$'
    ),
    # 6 Jan 2020 10:20:30 +0100
    re.compile(
        '^' + _DATE_DAY + ' ' + _DATE_HMS + ' ' + _DATE_OFFSET + '$'
    ),
    # 6 Jan 2020 10:20:30 GMT
    re.compile(
        '^' + _DATE_DAY + ' ' + _DATE_HMS + ' ' + _DATE_UTC + '$'
    ),
]
|
||||||
|
|
||||||
|
def parse_timestamp(timestamp: str):
    """Parse a ``Date`` header value into a timezone-aware datetime.

    The value is matched against each pattern in ``DATE_RE`` in order and
    the first match wins. Layouts without a numeric offset (those ending
    in GMT/UTC) are treated as UTC; a missing seconds field defaults to 0.

    Args:
        timestamp: The raw date text, e.g.
            ``'Mon, 6 Jan 2020 10:20:30 +0100'``. Leading and trailing
            whitespace is ignored.

    Returns:
        An aware ``datetime.datetime`` carrying the offset from the text.
        For ``None`` or an empty string, the Unix epoch in UTC.
        ``None`` when the text matches none of the known layouts; callers
        must check for this before using the result.

    Raises:
        KeyError: If the month name is not a key of ``DATE_MONTHS``.
        ValueError: If a matched field is out of range for ``datetime``
            (for example day 31 in February, or an offset of 24 hours
            or more).
    """
    if timestamp is None or timestamp == '':
        # Return an aware UTC epoch so the fallback has the same shape as
        # every parsed result and does not depend on the host's local zone.
        return datetime.datetime.fromtimestamp(0, datetime.timezone.utc)

    # The patterns are anchored at both ends, so stray whitespace around
    # the value would otherwise prevent any match.
    text = timestamp.strip()

    # The loop variable must not be called ``re``: that would shadow the
    # regular-expression module inside this function.
    for pattern in DATE_RE:
        match = pattern.match(text)

        if match is None:
            continue

        capture = match.groupdict()

        # Offset groups are absent from the GMT/UTC layouts; the defaults
        # below then yield a zero offset.
        mult = -1 if capture.get('offset_sign', '+') == '-' else 1
        tz = datetime.timezone(datetime.timedelta(
            hours=mult * int(capture.get('offset_hour', 0)),
            minutes=mult * int(capture.get('offset_minute', 0)),
        ))

        yyyy = int(capture['yyyy'])

        if 'month' in capture:
            mm = DATE_MONTHS[capture['month'].lower()]
        else:
            # Kept for layouts that capture a numeric month as 'mm'.
            mm = int(capture['mm'])

        dd = int(capture['dd'])
        hh = int(capture.get('hh', 0))
        MM = int(capture.get('MM', 0))
        ss = int(capture.get('ss', 0))

        return datetime.datetime(yyyy, mm, dd, hh, MM, ss, 0, tz)

    # No known layout matched.
    return None
|
||||||
|
|
||||||
class MessageState(enum.Enum):
|
class MessageState(enum.Enum):
|
||||||
EMPTY = 0
|
EMPTY = 0
|
||||||
HEADER = 1
|
HEADER = 1
|
||||||
|
@ -147,7 +201,7 @@ class Message(DatabaseTable):
|
||||||
value = self.header('Date')
|
value = self.header('Date')
|
||||||
|
|
||||||
if value is not None:
|
if value is not None:
|
||||||
ret = search_dates(value)[0][1]
|
ret = parse_timestamp(value)
|
||||||
|
|
||||||
self._cache['created_on'] = str(ret)
|
self._cache['created_on'] = str(ret)
|
||||||
except:
|
except:
|
||||||
|
|
Loading…
Add table
Reference in a new issue