Parsing and loading content.

This commit is contained in:
2018-08-12 09:42:45 +01:00
parent 9520ca6432
commit 70308d1be4
2 changed files with 54 additions and 11 deletions

View File

@@ -12,7 +12,7 @@ test_all: $(addprefix test_,$(TEST_FILES))
# One test target per sample file: `make test_<path>` converts <path> into
# test_out/<basename>.epub.  `$*` is the stem matched by `test_%` (the sample
# path) and `$(*F)` is the file-name part of that stem.
$(addprefix test_,$(TEST_FILES)): test_%:
	@echo Testing $*
	@python gragir/__main__.py -v $* test_out/$(*F).epub

# The targets above never produce a file named after themselves, so they are
# declared phony to make them run on every invocation.
.PHONY: $(addprefix test_,$(TEST_FILES))
#python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub #python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub

View File

@@ -4,9 +4,9 @@
""" """
# Standard library modules do the heavy lifting. Ours is all simple stuff. # Standard library modules do the heavy lifting. Ours is all simple stuff.
# import base64 import base64
# import email, email.message import email, email.message
# import mimetypes import mimetypes
# import os # import os
# import quopri # import quopri
@@ -40,15 +40,51 @@ def parseArguments():
def validateMht(fileName):
    """Report whether an archive member should be treated as an MHT file.

    Placeholder implementation: the argument is not inspected and every
    input is accepted.

    NOTE(review): despite the parameter name, parseZipFile passes the zip
    member object rather than a plain file name.
    """
    accepted = True
    return accepted
def parseMht(mht, content):
    """Parse one MHT (MIME HTML) document and collect its parts.

    Args:
        mht: Raw bytes of the MHT file, i.e. a MIME message.
        content: Dict updated in place.  Each leaf part is stored as decoded
            bytes under its Content-Location header, or under "index.html"
            when the part has no such header (expected for the root HTML).

    Returns:
        None; results are delivered through *content*.
    """
    logger = logging.getLogger(__name__)

    mhtContent = email.message_from_bytes(mht)

    # walk() yields the message itself followed by every nested sub-part.
    # Multipart containers have no body of their own (decoding them gives
    # None), so only leaf parts are kept.  This covers the single-part case,
    # the usual flat multipart case and nested multiparts with one code path.
    parts = [p for p in mhtContent.walk() if not p.is_multipart()]
    logger.info(' Number of parts: {}'.format(len(parts)))

    for p in parts:
        # Lower-cased "maintype/subtype", or the default type if missing.
        ct = p.get_content_type()
        # File path of the part inside the original page.
        fp = p.get("content-location") or "index.html"
        # Undo the Content-Transfer-Encoding (base64, quoted-printable, ...).
        payload = p.get_payload(decode=True)
        logger.info(' Content type: {}, Location: {}, Size: {}'
                    .format(ct, fp, len(payload)))

        if fp in content:
            # Later parts still win, as before, but no longer silently.
            logger.warning(' Duplicate location {}, overwriting.'.format(fp))
        content[fp] = payload
def parseMhtFile(zip, mhtInfo, content):
    """Read one archive member and feed its bytes to parseMht.

    Args:
        zip: Open archive; main() passes a zipfile.ZipFile.
        mhtInfo: Member descriptor as returned by zip.infolist(); its
            filename, file_size and compress_size are logged.
        content: Dict forwarded unchanged to parseMht, which fills it.
    """
    log = logging.getLogger(__name__)
    summary = 'Parsing {}, size: {}, csize: {} '.format(
        mhtInfo.filename,
        mhtInfo.file_size,
        mhtInfo.compress_size,
    )
    log.info(summary)

    # The whole member is read into memory before parsing.
    with zip.open(mhtInfo) as member:
        rawBytes = member.read()
    parseMht(rawBytes, content)
def parseZipFile(zip, content):
    """Parse every archive member that validateMht accepts.

    Args:
        zip: Open archive providing infolist(); main() passes a
            zipfile.ZipFile.
        content: Dict forwarded to parseMhtFile for each accepted member.
    """
    for zipMember in zip.infolist():
        # Members rejected by the validator are ignored.
        if not validateMht(zipMember):
            continue
        parseMhtFile(zip, zipMember, content)
@@ -71,8 +107,15 @@ def main():
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.info("Parsing {}.".format(args.zip)) logger.info("Parsing {}.".format(args.zip))
content = {}
with zipfile.ZipFile(args.zip, 'r') as zip: with zipfile.ZipFile(args.zip, 'r') as zip:
parseZipFile(zip) parseZipFile(zip, content)
logger.info("Loaded {} parts.".format(len(content)))
for name in content.keys():
logger.info("{}".format(name))
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()