diff --git a/ingest.py b/lib/nntp/tiny/mbox.py similarity index 77% rename from ingest.py rename to lib/nntp/tiny/mbox.py index 0fa271d..1d906c7 100644 --- a/ingest.py +++ b/lib/nntp/tiny/mbox.py @@ -1,7 +1,5 @@ -#! /usr/bin/env python3 - -import sys import re +import enum import sqlite3 class MBoxReaderError(Exception): @@ -58,18 +56,29 @@ class MBoxReaderBuffer(): and self.is_header_line(3): return 3 +class MBoxMessageState(enum.Enum): + EMPTY = 0 + HEADER = 1 + BODY = 2 + class MBoxMessage(): - __slots__ = 'headers', 'body', 'key', + __slots__ = 'state', 'headers', 'line', 'body', 'key', def __init__(self): + self.state = MBoxMessageState.EMPTY self.headers = dict() - self.body = None + self.line = None + self.body = '' self.key = None def add(self, line: str): - if self.body is None: - if line == '\n': - self.body = '' + if self.state is MBoxMessageState.EMPTY: + self.state = MBoxMessageState.HEADER + + if self.state is MBoxMessageState.HEADER: + if line == '\n' or line == '\r\n': + self.state = MBoxMessageState.BODY + self.body = '' elif line[0] == ' ' or line[0] == '\t': self.headers[self.key] += ' ' + line.strip() else: @@ -78,15 +87,17 @@ class MBoxMessage(): if match: self.key = match[1] - self.headers[self.key] = line.rstrip() - else: - self.body += line + self.headers[self.key] = match[2].rstrip() + elif self.state is MBoxMessageState.BODY: + if self.line is None: + self.body = line + else: + self.body += self.line - def is_empty(self): - return len(self.headers) == 0 and self.body is None + self.line = line def is_first_line(self): - return len(self.headers) == 1 and self.body is None + return len(self.headers) == 1 and self.body == '' class MBoxReader(): __slots__ = 'path', 'fh', 'line', 'buf', 'message', @@ -102,7 +113,7 @@ class MBoxReader(): while True: line = self.fh.readline() - if line is None: + if line is None or line == '': ret = self.message self.message = None @@ -135,13 +146,3 @@ class MBoxReader(): break yield message - -db = sqlite3.connect(sys.argv[1]) -reader = MBoxReader(sys.argv[2]) - -count = 0 - -for message in reader.messages(): - count += 1 - -print(f"Found {count} messages")