xenu_nntp/lib/nntp/tiny/message.py

314 lines
8.9 KiB
Python
Raw Normal View History

2024-11-08 15:47:21 -05:00
import re
import enum
import datetime
2024-11-11 12:37:48 -05:00
from email.header import decode_header
2024-11-08 15:47:21 -05:00
2024-11-08 23:11:09 -05:00
from nntp.tiny.db import DatabaseTable
2024-11-08 15:47:21 -05:00
def decode(text: str):
    """
    Decode an RFC 2047 "encoded-word" header value into plain text.

    Every fragment reported by `email.header.decode_header` is decoded and
    the results are concatenated, so values mixing plain text and encoded
    words (e.g. ``Re: =?utf-8?q?...?=``) are returned in full.

    :param text: Raw header value, with or without encoded words.
    :return: The decoded value as a `str`. A value whose encoded words
             cannot be parsed at all is returned unchanged.
    """
    # Imported locally so this function carries its own dependency.
    from email.errors import HeaderParseError

    try:
        fragments = decode_header(text)
    except HeaderParseError:
        # Malformed encoded word (e.g. broken base64): best effort is to
        # hand the raw value back rather than abort message parsing.
        return text

    pieces = []

    for payload, charset in fragments:
        # A value without any encoded word comes back as a plain `str`.
        if isinstance(payload, str):
            pieces.append(payload)
            continue

        try:
            # Fragments with no declared charset are decoded as Latin-1,
            # which maps every byte to a character and so cannot fail.
            pieces.append(payload.decode(charset or 'latin-1'))
        except (LookupError, UnicodeDecodeError):
            # Unknown charset name, or bytes invalid for the declared
            # charset: fall back to a decoding that always succeeds.
            pieces.append(payload.decode('latin-1'))

    return ''.join(pieces)
2024-11-09 22:25:25 -05:00
def each_line(text: str):
    """
    Yield the lines of `text` one at a time, line terminators included.

    Only ``'\\n'`` is treated as a line break, so a lone ``'\\r'`` never
    splits a line and ``'\\r\\n'`` endings are preserved intact. A final
    line without a trailing newline is yielded as-is. Empty input yields
    nothing.

    :param text: The text to split.
    :return: A generator of `str` lines.
    """
    start = 0
    end = len(text)

    while start < end:
        index = text.find('\n', start, end)

        if index == -1:
            # No further newline: the remainder is the last line.
            yield text[start:end]
            return

        yield text[start:index + 1]
        start = index + 1
2024-11-11 12:37:48 -05:00
# Month names (abbreviated and full, lower-case) mapped to month numbers.
DATE_MONTHS = {
    'jan': 1, 'january': 1,
    'feb': 2, 'february': 2,
    'mar': 3, 'march': 3,
    'apr': 4, 'april': 4,
    'may': 5,
    'jun': 6, 'june': 6,
    'jul': 7, 'july': 7,
    'aug': 8, 'august': 8,
    'sep': 9, 'september': 9,
    'oct': 10, 'october': 10,
    'nov': 11, 'november': 11,
    'dec': 12, 'december': 12
}

# Accepted `Date` header layouts, tried in order. Named groups that a
# pattern does not define (seconds, UTC offset) default to zero when the
# timestamp is assembled.
DATE_RE = [
    # "Fri, 08 Nov 2024 15:47:21 -0500"
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2}) (?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2})$'),
    # "Fri, 08 Nov 2024 15:47:21 -0500 (EST)"
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2}) (?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2}) \((?:[A-Za-z ]+)\)$'),
    # "Fri, 08 Nov 2024 15:47:21 GMT"
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2}) (?:GMT|UTC)$'),
    # "Fri, 08 Nov 2024 15:47 -0500"
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}) (?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2})$'),
    # "Fri, 08 Nov 2024 15:47 -0500 (EST)" -- the zone comment accepts
    # mixed case, like the with-seconds variant above.
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}) (?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2}) \((?:[A-Za-z ]+)\)$'),
    # "Fri, 08 Nov 2024 15:47 GMT"
    re.compile(r'^(?:[A-Za-z]+),\s+(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}) (?:GMT|UTC)$'),
    # "08 Nov 2024 15:47:21 -0500"
    re.compile(r'^(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2}) (?P<offset_sign>[+\-])(?P<offset_hour>\d{2})(?P<offset_minute>\d{2})$'),
    # "08 Nov 2024 15:47:21 GMT"
    re.compile(r'^(?P<dd>\d{1,2}) (?P<month>[A-Za-z]+) (?P<yyyy>\d{4}) (?P<hh>\d{2}):(?P<MM>\d{2}):(?P<ss>\d{2}) (?:GMT|UTC)$'),
]


def parse_timestamp(timestamp: str):
    """
    Parse a message `Date` header into a timezone-aware `datetime`.

    The value is matched against each pattern in `DATE_RE`; the first
    pattern whose fields form a valid date wins.

    :param timestamp: The header value; may be `None` or empty.
    :return: An aware `datetime.datetime` carrying the offset given in the
             header (UTC when the header says GMT/UTC). When the value is
             missing, matches no known layout, or names an impossible
             date, the Unix epoch in UTC is returned instead, so callers
             always receive an aware datetime.
    """
    epoch = datetime.datetime.fromtimestamp(0, datetime.timezone.utc)

    if timestamp is None:
        return epoch

    timestamp = timestamp.strip()

    if timestamp == '':
        return epoch

    for pattern in DATE_RE:
        match = pattern.match(timestamp)

        if match is None:
            continue

        capture = match.groupdict()

        try:
            sign = -1 if capture.get('offset_sign', '+') == '-' else 1

            tz = datetime.timezone(datetime.timedelta(
                hours=sign * int(capture.get('offset_hour', 0)),
                minutes=sign * int(capture.get('offset_minute', 0))
            ))

            if 'month' in capture:
                month = DATE_MONTHS[capture['month'].lower()]
            else:
                month = int(capture['mm'])

            return datetime.datetime(
                int(capture['yyyy']),
                month,
                int(capture['dd']),
                int(capture.get('hh', 0)),
                int(capture.get('MM', 0)),
                int(capture.get('ss', 0)),
                0,
                tz
            )
        except (KeyError, ValueError):
            # Unknown month name, or a field out of range (day 32,
            # offset of 24h or more, ...): treat as "did not match".
            continue

    return epoch
2024-11-08 15:47:21 -05:00
class MessageState(enum.Enum):
    """
    Position of the line-by-line message parser within a message.
    """
    # Nothing has been read yet.
    EMPTY = 0

    # Reading header lines.
    HEADER = 1

    # Reading body lines.
    BODY = 2
2024-11-08 23:11:09 -05:00
class Message(DatabaseTable):
    """
    A single newsgroup message.

    A message is populated in one of two ways:

    * fed line by line through `read_line()` (or all at once through
      `read()` / `from_text()`), which parses headers and body as the
      lines arrive; or
    * built from a database row by `__from_row__()`, in which case the
      commonly used fields are kept in a small cache and the raw content
      is only parsed the first time `headers`, `header()` or `body` is
      needed.

    The `name`, `key` and `columns` class attributes are presumably
    consumed by `DatabaseTable` -- verify against that class.
    """
    __slots__ = (
        '_cache',
        '_headers',
        '_body',
        '_key',
        'id',
        'newsgroup_id',
        'state',
        'line',
        'content',
    )

    name = 'newsgroup_message'
    key = 'id'
    columns = (
        'id',
        'newsgroup_id',
        'created_on',
        'message_id',
        'parent_id',
        'sender',
        'subject',
        'content'
    )

    # "Name: value" header line; the colon must be followed by a space.
    RE_HEADER = re.compile(r'^([A-Za-z0-9\-]+): (.*)$')

    def __init__(self):
        # Field values known before the content has been parsed.
        self._cache = dict()
        # Parsed headers keyed by lower-cased name; None until parsing starts.
        self._headers = None
        # Parsed body text; None until the first body line is seen.
        self._body = None
        # Name of the header most recently parsed, for continuation lines.
        self._key = None
        self.id = None
        self.newsgroup_id = None
        self.state = MessageState.EMPTY
        # The line most recently passed to read_line(); it is only added
        # to `content` / the body when the following line arrives.
        self.line = None
        self.content = ''

    @staticmethod
    def __from_row__(row):
        """
        Build a message from a database row without parsing its content.
        """
        message = Message()

        #
        # Defer parsing the message content until a specific header not already
        # assigned to a dedicated property, or the message body, is required.
        #
        message.content = row['content']
        message.id = row['id']
        message.newsgroup_id = row['newsgroup_id']
        message.created_on = row['created_on']
        message.message_id = row['message_id']
        message.parent_id = row['parent_id']
        message.sender = row['sender']
        # 'subject' is a declared column too; without this the subject
        # reads as '(no subject)' until the content happens to be parsed.
        message.subject = row['subject']

        return message

    def __values__(self) -> tuple:
        """
        Return the field values in `columns` order, excluding `id`.
        """
        return (
            self.newsgroup_id,
            self.created_on,
            self.message_id,
            self.parent_id,
            self.sender,
            self.subject,
            self.content
        )

    def _parse_content(self):
        """
        Parse `self.content` once, if nothing has been parsed or fed yet.

        Only runs while the parser is still in the EMPTY state, so content
        is never pushed a second time through a parser that is already in
        the middle of (or done with) a message.
        """
        if self.state is not MessageState.EMPTY or not self.content:
            return

        # read_line() appends each line it is given to `self.content`;
        # restore the original afterwards so re-parsing stored content
        # does not duplicate it.
        content = self.content

        try:
            self.read(content)
        finally:
            self.content = content

        # NOTE(review): read_line() holds the final line back in
        # `self.line`, so a body parsed this way does not include the last
        # line of `content`. Presumably intended for streamed input ending
        # in a terminator line -- confirm this is right for stored content.

    @property
    def headers(self):
        """
        The parsed headers as a dict keyed by lower-cased header name, or
        None when there is no content to parse.
        """
        self._parse_content()

        return self._headers

    @property
    def body(self):
        """
        The parsed body text, or None when no body line has been seen.
        """
        self._parse_content()

        return self._body

    def header(self, key: str):
        """
        Return the value of header `key` (case-insensitive), or None when
        the header is absent.
        """
        headers = self.headers

        if headers is None:
            return None

        return headers.get(key.lower())

    @property
    def created_on(self):
        """
        The message timestamp: the cached value when one is set, otherwise
        the parsed `Date` header (which is then cached).
        """
        value = self._cache.get('created_on')

        if value is not None:
            return datetime.datetime.fromisoformat(value)

        ret = parse_timestamp(self.header('Date'))

        # Never cache str(None): it would make the next read fail inside
        # fromisoformat().
        if ret is not None:
            self._cache['created_on'] = str(ret)

        return ret

    @created_on.setter
    def created_on(self, value):
        if value is None:
            # No timestamp known: fall back to the Date header on read.
            self._cache.pop('created_on', None)
            return

        text = str(value)

        if self._headers is not None:
            self._headers['date'] = text

        self._cache['created_on'] = text

    @property
    def message_id(self) -> str:
        """
        The `Message-ID` header, or the cached value before parsing.
        """
        if self._headers is None:
            return self._cache.get('message_id')

        return self.header('Message-ID')

    @message_id.setter
    def message_id(self, value):
        if self._headers is None:
            self._cache['message_id'] = value
        else:
            self._headers['message-id'] = value

    @property
    def parent_id(self) -> str:
        """
        The `References` header, or the cached value before parsing.
        """
        if self._headers is None:
            return self._cache.get('parent_id')

        return self.header('References')

    @parent_id.setter
    def parent_id(self, value):
        if self._headers is None:
            self._cache['parent_id'] = value
        else:
            self._headers['references'] = value

    @property
    def sender(self) -> str:
        """
        The `From` header ('Unknown' when absent), or the cached value
        before parsing.
        """
        if self._headers is None:
            return self._cache.get('sender')

        return self._headers.get('from', 'Unknown')

    @sender.setter
    def sender(self, value):
        if self._headers is None:
            self._cache['sender'] = value
        else:
            self._headers['from'] = value

    @property
    def subject(self) -> str:
        """
        The `Subject` header, or the cached value before parsing;
        '(no subject)' when neither is available.
        """
        if self._headers is None:
            return self._cache.get('subject', '(no subject)')

        return self._headers.get('subject', '(no subject)')

    @subject.setter
    def subject(self, value):
        if self._headers is None:
            self._cache['subject'] = value
        else:
            self._headers['subject'] = value

    def is_first_line(self):
        """
        Return True when exactly one header has been parsed and there is
        no body text yet.
        """
        headers = self.headers

        return headers is not None and len(headers) == 1 and not self._body

    def read_line(self, line: str):
        """
        Feed one line (terminator included) to the parser.

        The previously fed line is appended to `content` (and, once in the
        body, to the body text) at this point, so the most recent line is
        always held back in `self.line`.
        """
        if self.line is not None:
            self.content += self.line

        if self.state is MessageState.EMPTY:
            self.state = MessageState.HEADER
            self._headers = dict()

        if self.state is MessageState.HEADER:
            if line == '\n' or line == '\r\n':
                # Blank line: headers are finished.
                self.state = MessageState.BODY
            elif line[:1] in (' ', '\t'):
                # Continuation of the previous header. One that arrives
                # before any header has been seen has nothing to extend
                # and is ignored.
                if self._key is not None and self._key in self._headers:
                    self._headers[self._key] += ' ' + decode(line.strip())
            else:
                # Lines that are not "Name: value" (including an empty
                # string) are ignored.
                match = self.RE_HEADER.match(line)

                if match:
                    self._key = match[1].lower()
                    self._headers[self._key] = decode(match[2].rstrip())
        elif self.state is MessageState.BODY:
            if self._body is None:
                # First body line: the held-back line is the blank
                # separator, which is not part of the body.
                self._body = ''
            else:
                self._body += self.line

        self.line = line

    def read(self, text: str):
        """
        Feed every line of `text` to the parser.
        """
        for line in each_line(text):
            self.read_line(line)

    @staticmethod
    def from_text(text: str):
        """
        Build a new message by parsing `text`.
        """
        message = Message()
        message.read(text)

        return message