Significantly improve message parsing performance

This commit is contained in:
XANTRONIX Development 2025-01-06 00:40:48 -05:00
parent 6038e09670
commit 8aa8ae4dec

View file

@ -4,6 +4,9 @@ import datetime
import uuid
import email.message
import email.policy
import email.parser
import email.errors
from email.utils import parsedate_to_datetime
from email.header import decode_header, Header
@ -117,14 +120,10 @@ class MessageRange():
obj.max = int(match[1])
return obj
class MessageState(enum.Enum):
    """Phase of the line-by-line reader that Message.readline() steps through."""
    # Nothing has been read yet; this is the state assigned to a new Message.
    EMPTY = 0
    # Header lines are being collected; entered when the first line is read.
    HEADER = 1
    # The blank separator line has been seen; following lines are body text.
    BODY = 2
class Message(DatabaseTable):
__slots__ = (
'_cache',
'_parser',
'_headers',
'_headers_lc',
'_body',
@ -153,12 +152,12 @@ class Message(DatabaseTable):
super().__init__()
self._cache = dict()
self._parser = None
self._headers = None
self._headers_lc = None
self._body = None
self._key = None
self.id = None
self.state = MessageState.EMPTY
self.line = None
self.content = ''
@ -304,49 +303,16 @@ class Message(DatabaseTable):
elif value is not None:
self._header_set('Subject', Header(value).encode())
def is_first_line(self):
    """
    Report whether exactly one header has been read and no body text exists.

    Returns True only when ``self.headers`` holds a single entry and
    ``self._body`` is either unset (None) or an empty string.
    """
    if len(self.headers) != 1:
        return False

    body = self._body

    return body == '' or body is None
def readline(self, line: str):
    """
    Consume one raw line of an incoming message.

    The line is appended to ``self.content`` and handed to an incremental
    ``email.parser.FeedParser``. Headers and body are not populated here;
    they are filled in when ``finish()`` closes the parser.

    :param line: the next line of message text, with its line ending intact
    """
    if self._parser is None:
        # First line of a new message: start a fresh parser and restart the
        # raw-text accumulator rather than appending to stale content.
        self.content = line
        self._parser = email.parser.FeedParser()
    else:
        self.content += line

    # Feed every line, including the first, so the parser sees the full text.
    self._parser.feed(line)
def read_email_obj(self, obj: email.message.Message):
self._headers = dict()
self._headers_lc = dict()
@ -354,7 +320,14 @@ class Message(DatabaseTable):
self._header_set(key, obj.get(key))
self._body = obj.get_payload()
self.line = None
def finish(self):
    """
    Complete a message that was supplied line by line.

    When a feed parser is pending, it is closed, the resulting email object
    is loaded through ``read_email_obj()``, and the parser is discarded.
    Without a pending parser this is a no-op.
    """
    pending = self._parser

    if not pending:
        return

    self.read_email_obj(pending.close())
    self._parser = None
def read(self, text: str):
    """
    Parse a complete message from a single string.

    The text is parsed with ``email.message_from_string()`` and the
    resulting object is loaded through ``read_email_obj()``.

    :param text: the full message text
    """
    parsed = email.message_from_string(text)

    self.read_email_obj(parsed)
def message_id_assign(self):
sender = self.sender
@ -401,12 +374,6 @@ class Message(DatabaseTable):
@staticmethod
def from_text(text: str):
    """
    Build a new Message from the full text of an email.

    :param text: the complete message text
    :return: a Message whose headers and body were loaded by ``read()``
    """
    message = Message()

    # read() parses the text and fills in headers and body itself, so the
    # text is parsed exactly once; no separate header/body copy is needed.
    message.read(text)

    return message