Significantly improve message parsing performance
This commit is contained in:
parent
6038e09670
commit
8aa8ae4dec
1 changed file with 21 additions and 54 deletions
|
@@ -4,6 +4,9 @@ import datetime
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
import email.message
|
import email.message
|
||||||
|
import email.policy
|
||||||
|
import email.parser
|
||||||
|
import email.errors
|
||||||
|
|
||||||
from email.utils import parsedate_to_datetime
|
from email.utils import parsedate_to_datetime
|
||||||
from email.header import decode_header, Header
|
from email.header import decode_header, Header
|
||||||
|
@@ -117,14 +120,10 @@ class MessageRange():
|
||||||
obj.max = int(match[1])
|
obj.max = int(match[1])
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
class MessageState(enum.Enum):
|
|
||||||
EMPTY = 0
|
|
||||||
HEADER = 1
|
|
||||||
BODY = 2
|
|
||||||
|
|
||||||
class Message(DatabaseTable):
|
class Message(DatabaseTable):
|
||||||
__slots__ = (
|
__slots__ = (
|
||||||
'_cache',
|
'_cache',
|
||||||
|
'_parser',
|
||||||
'_headers',
|
'_headers',
|
||||||
'_headers_lc',
|
'_headers_lc',
|
||||||
'_body',
|
'_body',
|
||||||
|
@@ -153,12 +152,12 @@ class Message(DatabaseTable):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self._cache = dict()
|
self._cache = dict()
|
||||||
|
self._parser = None
|
||||||
self._headers = None
|
self._headers = None
|
||||||
self._headers_lc = None
|
self._headers_lc = None
|
||||||
self._body = None
|
self._body = None
|
||||||
self._key = None
|
self._key = None
|
||||||
self.id = None
|
self.id = None
|
||||||
self.state = MessageState.EMPTY
|
|
||||||
self.line = None
|
self.line = None
|
||||||
self.content = ''
|
self.content = ''
|
||||||
|
|
||||||
|
@@ -304,49 +303,16 @@ class Message(DatabaseTable):
|
||||||
elif value is not None:
|
elif value is not None:
|
||||||
self._header_set('Subject', Header(value).encode())
|
self._header_set('Subject', Header(value).encode())
|
||||||
|
|
||||||
def is_first_line(self):
|
|
||||||
return len(self.headers) == 1 and (self._body == '' or self._body is None)
|
|
||||||
|
|
||||||
def readline(self, line: str):
|
def readline(self, line: str):
|
||||||
if self.line is not None:
|
if self._parser is None:
|
||||||
self.content += self.line
|
self.content = line
|
||||||
|
self._parser = email.parser.FeedParser()
|
||||||
|
else:
|
||||||
|
self.content += line
|
||||||
|
|
||||||
if self.state is MessageState.EMPTY:
|
self._parser.feed(line)
|
||||||
self.state = MessageState.HEADER
|
|
||||||
self._headers = dict()
|
|
||||||
self._headers_lc = dict()
|
|
||||||
|
|
||||||
if self.state is MessageState.HEADER:
|
|
||||||
if line == '\n' or line == '\r\n':
|
|
||||||
self.state = MessageState.BODY
|
|
||||||
elif line[0] == ' ' or line[0] == '\t':
|
|
||||||
self._header_append(self._key, ' ' + line.strip())
|
|
||||||
else:
|
|
||||||
match = self.RE_HEADER.match(line)
|
|
||||||
|
|
||||||
if match:
|
|
||||||
self._key = match[1]
|
|
||||||
|
|
||||||
self._header_append(self._key, match[2].rstrip())
|
|
||||||
|
|
||||||
elif self.state is MessageState.BODY:
|
|
||||||
if self._body is None:
|
|
||||||
self._body = ''
|
|
||||||
else:
|
|
||||||
self._body += self.line
|
|
||||||
|
|
||||||
self.line = line
|
|
||||||
|
|
||||||
def finish(self):
|
|
||||||
if self.line:
|
|
||||||
self.content += self.line
|
|
||||||
self._body += self.line
|
|
||||||
|
|
||||||
self.line = None
|
|
||||||
|
|
||||||
def read(self, text: str):
|
|
||||||
obj = email.message_from_string(text)
|
|
||||||
|
|
||||||
|
def read_email_obj(self, obj: email.message.Message):
|
||||||
self._headers = dict()
|
self._headers = dict()
|
||||||
self._headers_lc = dict()
|
self._headers_lc = dict()
|
||||||
|
|
||||||
|
@@ -354,7 +320,14 @@ class Message(DatabaseTable):
|
||||||
self._header_set(key, obj.get(key))
|
self._header_set(key, obj.get(key))
|
||||||
|
|
||||||
self._body = obj.get_payload()
|
self._body = obj.get_payload()
|
||||||
self.line = None
|
|
||||||
|
def finish(self):
|
||||||
|
if self._parser:
|
||||||
|
self.read_email_obj(self._parser.close())
|
||||||
|
self._parser = None
|
||||||
|
|
||||||
|
def read(self, text: str):
|
||||||
|
self.read_email_obj(email.message_from_string(text))
|
||||||
|
|
||||||
def message_id_assign(self):
|
def message_id_assign(self):
|
||||||
sender = self.sender
|
sender = self.sender
|
||||||
|
@@ -401,12 +374,6 @@ class Message(DatabaseTable):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_text(text: str):
|
def from_text(text: str):
|
||||||
message = Message()
|
message = Message()
|
||||||
|
message.read(text)
|
||||||
obj = email.message_from_string(text)
|
|
||||||
|
|
||||||
for key in obj.keys():
|
|
||||||
message._header_set(key, obj.get(key))
|
|
||||||
|
|
||||||
message._body = obj.get_payload()
|
|
||||||
|
|
||||||
return message
|
return message
|
||||||
|
|
Loading…
Add table
Reference in a new issue