From 59deeff397be1bb3f2af1813fb90fc48325fc4e1 Mon Sep 17 00:00:00 2001 From: Vahagn Khachatryan Date: Tue, 28 Aug 2018 07:08:20 +0100 Subject: [PATCH] Separating parse_mhtml.py and book.py --- gragir/__main__.py | 78 +++---------------------------------------- gragir/book.py | 20 +++++++++++ gragir/parse_mhtml.py | 57 +++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 73 deletions(-) create mode 100644 gragir/book.py create mode 100644 gragir/parse_mhtml.py diff --git a/gragir/__main__.py b/gragir/__main__.py index 0db8f41..484b0d8 100644 --- a/gragir/__main__.py +++ b/gragir/__main__.py @@ -14,13 +14,13 @@ import sys import logging import argparse -import zipfile -import email - import urllib.parse as urlparse import ebooklib.epub as ebooklib from bs4 import BeautifulSoup +from book import Book, Item +from parse_mhtml import parseMhtmlZipFile + def parseArguments(): """ Usage: @@ -48,72 +48,6 @@ def configLogger(args): format='%(message)s', level=loggingLevel) - -def validateMht(fileName): - return True - -class Item(object): - - def __init__(self, url, content_type, payload): - self.url = url - self.content_type = content_type - self.payload = payload - self.needed_by = set() - self.needs = set() - -class Book(object): - - def __init__(self, file_name): - self.file_name = file_name - self.content = {} - self.first = None - -def parseMht(mht, book): - logger = logging.getLogger(__name__) - - mhtContent = email.message_from_bytes(mht) - - parts = mhtContent.get_payload() - # Multiple parts, usually? If single 'str' part, then convert to a list. - if not type(parts) is list: - parts = [mhtContent] - - logger.info(' Number of parts: {}'.format(len(parts))) - - # Save all parts to files. - for p in parts: # walk() for a tree, but I'm guessing MHT is never nested? - #??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts.
- ct = p.get_content_type() - # String coerced to lower case of the form maintype/subtype, else get_default_type(). - fp = p.get("content-location") or "index.html" # File path. Expecting root HTML is only part with no location. - - logger.info(' Content type: {}, Location: {}, Size: {}' - .format(ct, fp, len(p.get_payload()))) - - book.content[fp] = Item(fp, ct, p.get_payload(decode=True)) - - -def parseMhtFile(zip, mhtInfo, book): - logger = logging.getLogger(__name__) - logger.info('Parsing {}, size: {}, csize: {} ' - .format(mhtInfo.filename, - mhtInfo.file_size, - mhtInfo.compress_size)) - - with zip.open(mhtInfo) as mht: - parseMht(mht.read(), book) - - -def parseZipFile(zip, book): - logger = logging.getLogger(__name__) - for zipMember in zip.infolist(): - if validateMht(zipMember): - parseMhtFile(zip, zipMember, book) - else: - logger.error("Unexpected file in zip: {}".format(zipMember)) - raise Exception("Unexpected file in zip.") - - def parseHtml(book): logger = logging.getLogger(__name__) logger.info("Loaded {} parts.".format(len(book.content))) @@ -354,12 +288,10 @@ def main(): book = Book(args.epub) - with zipfile.ZipFile(args.zip, 'r') as zip: - parseZipFile(zip, book) - + parseMhtmlZipFile(args.zip, book) parseHtml(book) createDAG(book) - createEpubBook(book) + #createEpubBook(book) if __name__ == "__main__": diff --git a/gragir/book.py b/gragir/book.py new file mode 100644 index 0000000..384b153 --- /dev/null +++ b/gragir/book.py @@ -0,0 +1,20 @@ + + + + +class Item(object): + + def __init__(self, url, content_type, payload): + self.url = url + self.content_type = content_type + self.payload = payload + self.needed_by = set() + self.needs = set() + +class Book(object): + + def __init__(self, file_name): + self.file_name = file_name + self.content = {} + self.first = None + diff --git a/gragir/parse_mhtml.py b/gragir/parse_mhtml.py new file mode 100644 index 0000000..c9ddc57 --- /dev/null +++ b/gragir/parse_mhtml.py @@ -0,0 +1,57 @@ +import logging 
+import zipfile +import email + +from book import Item, Book + +def validateMht(fileName): + return True + +def parseMht(mht, book): + logger = logging.getLogger(__name__) + + mhtContent = email.message_from_bytes(mht) + + parts = mhtContent.get_payload() + # Multiple parts, usually? If single 'str' part, then convert to a list. + if not type(parts) is list: + parts = [mhtContent] + + logger.info(' Number of parts: {}'.format(len(parts))) + + # Save all parts to files. + for p in parts: # walk() for a tree, but I'm guessing MHT is never nested? + #??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts. + ct = p.get_content_type() + # String coerced to lower case of the form maintype/subtype, else get_default_type(). + fp = p.get("content-location") or "index.html" # File path. Expecting root HTML is only part with no location. + + logger.info(' Content type: {}, Location: {}, Size: {}' + .format(ct, fp, len(p.get_payload()))) + + book.content[fp] = Item(fp, ct, p.get_payload(decode=True)) + + +def parseMhtFile(zip, mhtInfo, book): + logger = logging.getLogger(__name__) + logger.info('Parsing {}, size: {}, csize: {} ' + .format(mhtInfo.filename, + mhtInfo.file_size, + mhtInfo.compress_size)) + + with zip.open(mhtInfo) as mht: + parseMht(mht.read(), book) + + +def parseMhtmlZip(zip, book): + logger = logging.getLogger(__name__) + for zipMember in zip.infolist(): + if validateMht(zipMember): + parseMhtFile(zip, zipMember, book) + else: + logger.error("Unexpected file in zip: {}".format(zipMember)) + raise Exception("Unexpected file in zip.") + +def parseMhtmlZipFile(zipName, book): + with zipfile.ZipFile(zipName, 'r') as zip: + parseMhtmlZip(zip, book)