#! /usr/bin/env python3
"""Count the messages contained in an mbox file.

Usage: ingest.py DATABASE MBOX

DATABASE is opened with sqlite3 (nothing is written to it yet) and MBOX is
scanned for messages; the number of messages found is printed.
"""
# Provenance: added as ingest.py by commit fdac039c2a4d ("initial commit").

import sys
import re
import sqlite3

# A header field line: a name containing no colon, then ": ", then the value.
_HEADER_RE = re.compile(r'^([^:]+): (.*)$')

# The file is opened with newline='', so line endings arrive untranslated
# and an empty line may be terminated by LF, CRLF or a bare CR.
_EMPTY_LINES = ('\n', '\r\n', '\r')


class MBoxReaderError(Exception):
    """Error type reserved for mbox reading problems (not raised yet)."""


class MBoxReaderBuffer():
    """Sliding window over the four most recently read lines.

    ``lines`` holds the window, oldest first, padded with None until four
    lines have been added.  ``index`` is the slot of the newest line, or
    None while the window is still empty.
    """

    def __init__(self):
        self.lines = [None, None, None, None]
        self.index = None

    def add(self, line: str) -> None:
        """Store *line* as the newest entry, evicting the oldest when full."""
        if None in self.lines:
            # Still filling up: take the first free slot.
            slot = self.lines.index(None)
            self.lines[slot] = line
        else:
            # Full: shift everything down by one and append at the end.
            del self.lines[0]
            self.lines.append(line)
            slot = len(self.lines) - 1

        self.index = slot

    def is_empty_line(self, line) -> bool:
        """Return True if window slot *line* holds an empty line."""
        return self.lines[line] in _EMPTY_LINES

    def is_from_line(self, line) -> bool:
        """Return True if window slot *line* holds a "From " separator."""
        text = self.lines[line]

        return text is not None and text.startswith('From ')

    def is_header_line(self, line) -> bool:
        """Return True if window slot *line* looks like "Name: value"."""
        text = self.lines[line]

        return text is not None and _HEADER_RE.match(text) is not None

    def is_start(self):
        """Tell whether the newest line is the first header of a message.

        A message start is a "From " line followed by a header line, found
        either as the very first two lines of the input or directly after
        two empty lines.

        Returns:
            The window slot of the header line (1 at the start of the
            input, 3 otherwise), or None when no message starts here.
        """
        if self.lines[0] is None or self.lines[1] is None:
            return None

        if self.lines[2] is None:
            # Exactly two lines have been read, so this is the start of the
            # input.  Restricting the check to this state keeps it from
            # firing again once the window has slid past the first lines.
            if self.is_from_line(0) and self.is_header_line(1):
                return 1

            return None

        if self.is_empty_line(0) \
                and self.is_empty_line(1) \
                and self.is_from_line(2) \
                and self.is_header_line(3):
            return 3

        return None


class MBoxMessage():
    """A single message: unfolded header lines plus the raw body text.

    ``headers`` maps a header name to its complete line ("Name: value",
    trailing whitespace removed, continuation lines joined with a single
    space); a repeated name replaces the earlier entry.  ``body`` is None
    until the empty line ending the header section has been seen.  ``key``
    is the name of the most recently stored header.
    """

    __slots__ = 'headers', 'body', 'key',

    def __init__(self):
        self.headers = dict()
        self.body = None
        self.key = None

    def add(self, line: str) -> None:
        """Feed the next raw line of the message.

        While in the header section, an empty line switches to the body,
        a line starting with a space or tab continues the previous header,
        and a "Name: value" line starts a new header.  Any other line in
        the header section is ignored.  Once in the body, every line is
        appended verbatim.
        """
        if self.body is not None:
            self.body += line
            return

        if line in _EMPTY_LINES:
            self.body = ''
        elif line[:1] in (' ', '\t'):
            # A continuation with no header before it has nothing to extend.
            if self.key is not None:
                self.headers[self.key] += ' ' + line.strip()
        else:
            match = _HEADER_RE.match(line)

            if match:
                self.key = match[1]
                self.headers[self.key] = line.rstrip()

    def _drop_tail(self, text: str) -> None:
        """Remove *text* from the end of the body if the body ends with it."""
        if self.body is not None and text and self.body.endswith(text):
            self.body = self.body[:-len(text)]

    def is_empty(self) -> bool:
        """Return True if no header has been stored and no body started."""
        return len(self.headers) == 0 and self.body is None

    def is_first_line(self) -> bool:
        """Return True if exactly one header is stored and no body started."""
        return len(self.headers) == 1 and self.body is None


class MBoxReader():
    """Read messages one at a time from the mbox file at ``path``.

    The file is opened in text mode with newline='' so that line endings
    are passed through untranslated.  ``line`` counts the lines read so
    far.  Call close(), or use the reader as a context manager, to release
    the file handle.
    """

    __slots__ = 'path', 'fh', 'line', 'buf', 'message',

    def __init__(self, path: str):
        self.path = path
        self.fh = open(path, 'r', newline='')
        self.line = 0
        self.buf = MBoxReaderBuffer()
        self.message = None

    def close(self) -> None:
        """Close the underlying file."""
        self.fh.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get_message(self):
        """Return the next complete MBoxMessage, or None at end of input.

        Lines read before the first message start are discarded.  The empty
        lines that precede the next message's "From " line stay part of the
        returned message's body; the "From " line itself does not.
        """
        while True:
            line = self.fh.readline()

            if not line:
                # readline() signals end of file with '', never with None.
                finished, self.message = self.message, None

                return finished

            self.line += 1

            self.buf.add(line)

            start = self.buf.is_start()

            if start is not None:
                finished = self.message

                # `line` is the first header line of the new message.
                self.message = MBoxMessage()
                self.message.add(line)

                if finished is not None:
                    # The separator was fed to the previous message before
                    # it could be recognised; take it back out.
                    finished._drop_tail(self.buf.lines[start - 1])

                    return finished
            elif self.message is not None:
                self.message.add(line)

    def messages(self):
        """Yield every remaining message in file order."""
        return iter(self.get_message, None)


def main(argv=None) -> int:
    """Count the messages in the mbox named on the command line.

    Args:
        argv: Argument vector laid out like sys.argv (program name first,
            then the database path and the mbox path); defaults to
            sys.argv.

    Returns:
        0 on success, 2 when too few arguments were given.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print(f"usage: {argv[0] if argv else 'ingest.py'} DATABASE MBOX",
              file=sys.stderr)

        return 2

    # The connection is not used yet; it is opened and closed as before.
    db = sqlite3.connect(argv[1])

    try:
        with MBoxReader(argv[2]) as reader:
            count = sum(1 for _ in reader.messages())
    finally:
        db.close()

    print(f"Found {count} messages")

    return 0


if __name__ == '__main__':
    sys.exit(main())