Separating parse_mhtml.py and book.py

This commit is contained in:
2018-08-28 07:08:20 +01:00
parent be375c1a94
commit 59deeff397
3 changed files with 82 additions and 73 deletions

View File

@@ -14,13 +14,13 @@ import sys
import logging import logging
import argparse import argparse
import zipfile
import email
import urllib.parse as urlparse import urllib.parse as urlparse
import ebooklib.epub as ebooklib import ebooklib.epub as ebooklib
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from book import Book, Item
from parse_mhtml import parseMhtmlZipFile
def parseArguments(): def parseArguments():
""" """
Usage: Usage:
@@ -48,72 +48,6 @@ def configLogger(args):
format='%(message)s', format='%(message)s',
level=loggingLevel) level=loggingLevel)
def validateMht(fileName):
    """Placeholder validation hook that accepts every candidate.

    Args:
        fileName: The candidate to check. It is currently ignored.

    Returns:
        Always ``True``.
    """
    accepted = True
    return accepted
class Item(object):
    """A single stored part: its location, MIME type and payload.

    Attributes:
        url: The ``url`` constructor argument.
        content_type: The ``content_type`` constructor argument.
        payload: The ``payload`` constructor argument.
        needed_by: Set, empty on construction.
        needs: Set, empty on construction.
    """

    def __init__(self, url, content_type, payload):
        # Both relation sets start out empty and independent of each other.
        # NOTE(review): presumably populated later by dependency analysis;
        # that code is not visible here.
        self.needs = set()
        self.needed_by = set()
        self.url, self.content_type, self.payload = url, content_type, payload
class Book(object):
    """Holds everything collected for one book.

    Attributes:
        file_name: The ``file_name`` constructor argument.
        content: Dict, empty on construction.
        first: ``None`` on construction.
    """

    def __init__(self, file_name):
        # Start with no content and no designated first entry.
        self.first = None
        self.content = {}
        self.file_name = file_name
def parseMht(mht, book):
    """Parse one MHT (MIME HTML) document and store its parts in ``book``.

    Every leaf MIME part becomes an ``Item`` in ``book.content``, keyed by the
    part's ``Content-Location`` header. A part without that header is stored
    under ``"index.html"``. A later part with the same key replaces an
    earlier one.

    Args:
        mht: Raw bytes of the MHT document.
        book: Object exposing a ``content`` dict; it is updated in place.
    """
    logger = logging.getLogger(__name__)

    mhtContent = email.message_from_bytes(mht)

    # walk() yields the root message and all of its descendants. Dropping the
    # multipart containers leaves only the leaf parts, so nested multipart
    # sections are flattened instead of being stored with a None payload.
    # A message that is not multipart yields just itself.
    parts = [p for p in mhtContent.walk() if not p.is_multipart()]
    logger.info(' Number of parts: {}'.format(len(parts)))

    for p in parts:
        # Lower-cased "maintype/subtype"; falls back to the default type.
        ct = p.get_content_type()
        # Storage key; the root HTML is expected to be the only part that
        # carries no Content-Location header.
        fp = p.get("content-location") or "index.html"
        # The logged size is that of the still-encoded payload.
        logger.info(' Content type: {}, Location: {}, Size: {}'
                    .format(ct, fp, len(p.get_payload())))
        book.content[fp] = Item(fp, ct, p.get_payload(decode=True))
def parseMhtFile(zip, mhtInfo, book):
    """Read one member of an open zip archive and parse it as MHT.

    Args:
        zip: Open archive object providing ``open(member)``.
        mhtInfo: Member descriptor with ``filename``, ``file_size`` and
            ``compress_size`` attributes.
        book: Passed through to ``parseMht``, which fills it in.
    """
    log = logging.getLogger(__name__)
    summary = 'Parsing {}, size: {}, csize: {} '.format(
        mhtInfo.filename, mhtInfo.file_size, mhtInfo.compress_size)
    log.info(summary)

    # The member stays open until parsing has finished.
    with zip.open(mhtInfo) as member:
        raw = member.read()
        parseMht(raw, book)
def parseZipFile(zip, book):
    """Parse every file member of an open zip archive as an MHT document.

    Args:
        zip: Open archive object providing ``infolist()`` and ``open()``.
        book: Passed through to ``parseMhtFile``, which fills it in.

    Raises:
        Exception: If ``validateMht`` rejects a member.
    """
    logger = logging.getLogger(__name__)
    for zipMember in zip.infolist():
        # Directory entries (names ending in "/") hold no data. Parsing one
        # would yield an empty document and register a bogus empty
        # "index.html" item, so they are skipped.
        if zipMember.filename.endswith('/'):
            continue
        if not validateMht(zipMember):
            logger.error("Unexpected file in zip: {}".format(zipMember))
            raise Exception("Unexpected file in zip.")
        parseMhtFile(zip, zipMember, book)
def parseHtml(book): def parseHtml(book):
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.info("Loaded {} parts.".format(len(book.content))) logger.info("Loaded {} parts.".format(len(book.content)))
@@ -354,12 +288,10 @@ def main():
book = Book(args.epub) book = Book(args.epub)
with zipfile.ZipFile(args.zip, 'r') as zip: parseMhtmlZipFile(args.zip, book)
parseZipFile(zip, book)
parseHtml(book) parseHtml(book)
createDAG(book) createDAG(book)
createEpubBook(book) #createEpubBook(book)
if __name__ == "__main__": if __name__ == "__main__":

20
gragir/book.py Normal file
View File

@@ -0,0 +1,20 @@
class Item(object):
    """A single stored part: its location, MIME type and payload.

    Attributes:
        url: The ``url`` constructor argument.
        content_type: The ``content_type`` constructor argument.
        payload: The ``payload`` constructor argument.
        needed_by: Set, empty on construction.
        needs: Set, empty on construction.
    """

    def __init__(self, url, content_type, payload):
        # Both relation sets start out empty and independent of each other.
        # NOTE(review): presumably populated later by dependency analysis;
        # that code is not visible here.
        self.needs = set()
        self.needed_by = set()
        self.url, self.content_type, self.payload = url, content_type, payload
class Book(object):
    """Holds everything collected for one book.

    Attributes:
        file_name: The ``file_name`` constructor argument.
        content: Dict, empty on construction.
        first: ``None`` on construction.
    """

    def __init__(self, file_name):
        # Start with no content and no designated first entry.
        self.first = None
        self.content = {}
        self.file_name = file_name

57
gragir/parse_mhtml.py Normal file
View File

@@ -0,0 +1,57 @@
import logging
import zipfile
import email
from book import Item, Book
def validateMht(fileName):
    """Placeholder validation hook that accepts every candidate.

    Args:
        fileName: The candidate to check. It is currently ignored; the caller
            in this module passes a zip member descriptor rather than a name.

    Returns:
        Always ``True``.
    """
    accepted = True
    return accepted
def parseMht(mht, book):
    """Parse one MHT (MIME HTML) document and store its parts in ``book``.

    Every leaf MIME part becomes an ``Item`` in ``book.content``, keyed by the
    part's ``Content-Location`` header. A part without that header is stored
    under ``"index.html"``. A later part with the same key replaces an
    earlier one.

    Args:
        mht: Raw bytes of the MHT document.
        book: Object exposing a ``content`` dict; it is updated in place.
    """
    logger = logging.getLogger(__name__)

    mhtContent = email.message_from_bytes(mht)

    # walk() yields the root message and all of its descendants. Dropping the
    # multipart containers leaves only the leaf parts, so nested multipart
    # sections are flattened instead of being stored with a None payload.
    # A message that is not multipart yields just itself.
    parts = [p for p in mhtContent.walk() if not p.is_multipart()]
    logger.info(' Number of parts: {}'.format(len(parts)))

    for p in parts:
        # Lower-cased "maintype/subtype"; falls back to the default type.
        ct = p.get_content_type()
        # Storage key; the root HTML is expected to be the only part that
        # carries no Content-Location header.
        fp = p.get("content-location") or "index.html"
        # The logged size is that of the still-encoded payload.
        logger.info(' Content type: {}, Location: {}, Size: {}'
                    .format(ct, fp, len(p.get_payload())))
        book.content[fp] = Item(fp, ct, p.get_payload(decode=True))
def parseMhtFile(zip, mhtInfo, book):
    """Read one member of an open zip archive and parse it as MHT.

    Args:
        zip: Open archive object providing ``open(member)``.
        mhtInfo: Member descriptor with ``filename``, ``file_size`` and
            ``compress_size`` attributes.
        book: Passed through to ``parseMht``, which fills it in.
    """
    log = logging.getLogger(__name__)
    summary = 'Parsing {}, size: {}, csize: {} '.format(
        mhtInfo.filename, mhtInfo.file_size, mhtInfo.compress_size)
    log.info(summary)

    # The member stays open until parsing has finished.
    with zip.open(mhtInfo) as member:
        raw = member.read()
        parseMht(raw, book)
def parseMhtmlZip(zip, book):
    """Parse every file member of an open zip archive as an MHT document.

    Args:
        zip: Open archive object providing ``infolist()`` and ``open()``.
        book: Passed through to ``parseMhtFile``, which fills it in.

    Raises:
        Exception: If ``validateMht`` rejects a member.
    """
    logger = logging.getLogger(__name__)
    for zipMember in zip.infolist():
        # Directory entries (names ending in "/") hold no data. Parsing one
        # would yield an empty document and register a bogus empty
        # "index.html" item, so they are skipped.
        if zipMember.filename.endswith('/'):
            continue
        if not validateMht(zipMember):
            logger.error("Unexpected file in zip: {}".format(zipMember))
            raise Exception("Unexpected file in zip.")
        parseMhtFile(zip, zipMember, book)
def parseMhtmlZipFile(zipName, book):
    """Open the zip archive at ``zipName`` and parse its members into ``book``.

    Args:
        zipName: Archive to open for reading; handed to ``zipfile.ZipFile``.
        book: Passed through to ``parseMhtmlZip``, which fills it in.
    """
    # The archive is closed when the block exits, including on error.
    with zipfile.ZipFile(zipName, 'r') as archive:
        parseMhtmlZip(archive, book)