initial commit
This commit is contained in:
commit
fdac039c2a
1 changed files with 147 additions and 0 deletions
147
ingest.py
Normal file
147
ingest.py
Normal file
|
@ -0,0 +1,147 @@
|
|||
#! /usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import re
|
||||
import sqlite3
|
||||
|
||||
class MBoxReaderError(Exception):
    """Exception subclass declared alongside the mbox reader classes.

    It adds nothing to ``Exception``; none of the code visible in this
    file raises it.
    """
|
||||
|
||||
class MBoxReaderBuffer():
    """Sliding window over the four most recently added lines.

    The window is inspected by ``is_start`` to recognise the point where
    a message begins.

    Attributes:
        lines: the four slots, oldest first; slots that have not been
            filled yet hold ``None``.
        index: ``None`` until the first ``add``; afterwards the number of
            lines added so far, capped at 3.
    """

    # Matches a "Name: value" header line.
    _HEADER_RE = re.compile(r'^([^:]+): (.*)$')

    def __init__(self):
        self.lines = [None, None, None, None]
        self.index = None

    def add(self, line: str):
        """Append *line* to the window, dropping the oldest line when full."""
        for slot, held in enumerate(self.lines):
            if held is None:
                self.lines[slot] = line
                # Same values the attribute has always taken: one past the
                # slot just written, never more than 3.
                self.index = min(slot + 1, 3)
                return

        # Window is full: slide everything down by one, in place.
        del self.lines[0]
        self.lines.append(line)
        self.index = 3

    def is_empty_line(self, line):
        """Return True when slot *line* holds exactly a bare newline."""
        return self.lines[line] == '\n'

    def is_from_line(self, line):
        """Return True when slot *line* holds a line starting with 'From '.

        An unfilled slot yields False instead of raising.
        """
        held = self.lines[line]

        return held is not None and held.startswith('From ')

    def is_header_line(self, line):
        """Return True when slot *line* holds a "Name: value" header line.

        An unfilled slot yields False instead of raising.
        """
        held = self.lines[line]

        return held is not None and self._HEADER_RE.match(held) is not None

    def is_start(self):
        """Report whether the newest line is the first header of a message.

        Returns:
            1 when exactly two lines have been added and they are a
            ``From `` line followed by a header line (a message at the very
            start of the input); 3 when the full window holds two bare
            newlines, a ``From `` line and a header line; ``None``
            otherwise.  The number is the slot of the header line.
        """
        if self.lines[0] is None or self.lines[1] is None:
            return None

        if self.lines[2] is None:
            # Only two lines seen so far, so slots 0 and 1 are the first two
            # lines of the input.  Checking this only while slot 2 is empty
            # keeps the same pair from being reported again on later adds.
            if self.is_from_line(0) and self.is_header_line(1):
                return 1

            return None

        # Slot 3 may still be unfilled here; the helpers treat that as
        # "no match" rather than raising.
        if self.is_empty_line(0) \
                and self.is_empty_line(1) \
                and self.is_from_line(2) \
                and self.is_header_line(3):
            return 3

        return None
|
||||
|
||||
class MBoxMessage():
    """A single message assembled one line at a time.

    Attributes:
        headers: maps a header name to the complete header line (name
            included, trailing whitespace stripped), with any continuation
            lines appended after a single space.
        body: ``None`` while the header section is still being read, then
            the concatenation of every body line added so far.
        key: name of the most recently stored header, i.e. the one that
            continuation lines are appended to.
    """

    __slots__ = 'headers', 'body', 'key',

    # Matches a "Name: value" header line; group 1 is the header name.
    _HEADER_RE = re.compile(r'^([^:]+): (.*)$')

    def __init__(self):
        self.headers = dict()
        self.body = None
        self.key = None

    def add(self, line: str):
        """Feed the next raw line of the message.

        While in the header section, a bare newline switches to the body,
        a line starting with a space or tab continues the previous header,
        and a "Name: value" line stores a new header.  Any other line in
        the header section is ignored.  Once in the body, every line is
        appended verbatim.
        """
        if self.body is not None:
            self.body += line
            return

        if line == '\n':
            self.body = ''
        elif line.startswith((' ', '\t')):
            # A continuation with no header before it has nothing to extend;
            # skip it like any other unrecognised header-section line.
            if self.key in self.headers:
                self.headers[self.key] += ' ' + line.strip()
        else:
            # An empty string falls through here and simply fails to match.
            match = self._HEADER_RE.match(line)

            if match:
                self.key = match[1]

                self.headers[self.key] = line.rstrip()

    def is_empty(self):
        """Return True when no header has been stored and no body started."""
        return len(self.headers) == 0 and self.body is None

    def is_first_line(self):
        """Return True when exactly one header is stored and no body started."""
        return len(self.headers) == 1 and self.body is None
|
||||
|
||||
class MBoxReader():
    """Reads an mbox file and hands out its messages one at a time.

    Attributes:
        path: the path given to the constructor.
        fh: the open text file handle.
        line: number of lines read from ``fh`` so far.
        buf: ``MBoxReaderBuffer`` window used to detect message starts.
        message: the message currently being assembled, or ``None``.

    The reader can be used as a context manager; the file handle is also
    closed automatically once the end of the file has been reached.
    """

    __slots__ = 'path', 'fh', 'line', 'buf', 'message',

    def __init__(self, path: str):
        self.path = path
        # newline='' returns line endings to the caller untranslated.
        self.fh = open(path, 'r', newline='')
        self.line = 0
        self.buf = MBoxReaderBuffer()
        self.message = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying file handle; safe to call more than once."""
        self.fh.close()

    def get_message(self):
        """Return the next complete message, or ``None`` when none is left.

        Lines are read until the buffer reports the start of the following
        message, at which point the message assembled so far is returned.
        At end of file the pending message (if any) is returned; every
        later call returns ``None``.
        """
        while True:
            # A closed handle means end of file was already reached (or
            # close() was called), so behave as if at end of file.
            line = '' if self.fh.closed else self.fh.readline()

            # readline() signals end of file with an empty string, never
            # with None.
            if line == '':
                self.fh.close()

                finished, self.message = self.message, None

                return finished

            self.line += 1

            self.buf.add(line)

            if self.buf.is_start():
                if self.message is None:
                    self.message = MBoxMessage()
                else:
                    finished = self.message

                    # The line that revealed the boundary already belongs
                    # to the new message.
                    self.message = MBoxMessage()
                    self.message.add(line)

                    return finished

            if self.message is not None:
                self.message.add(line)

    def messages(self):
        """Yield messages from ``get_message`` until it returns ``None``."""
        while True:
            message = self.get_message()

            if message is None:
                return

            yield message
|
||||
|
||||
def main(argv=None):
    """Count the messages in an mbox file and print the total.

    Args:
        argv: argument vector laid out like ``sys.argv`` (program name,
            SQLite database path, mbox path).  Defaults to ``sys.argv``.

    Returns:
        Exit status: 0 on success, 2 when an argument is missing.
    """
    args = sys.argv if argv is None else argv

    if len(args) < 3:
        prog = args[0] if args else 'ingest.py'

        print(f"usage: {prog} DATABASE MBOX", file=sys.stderr)

        return 2

    # The connection is opened but nothing is written to it yet.
    db = sqlite3.connect(args[1])

    try:
        reader = MBoxReader(args[2])

        try:
            count = sum(1 for _ in reader.messages())
        finally:
            # Closing an already-closed handle is a no-op.
            reader.fh.close()
    finally:
        db.close()

    print(f"Found {count} messages")

    return 0


if __name__ == '__main__':
    sys.exit(main())
|
Loading…
Add table
Reference in a new issue