From 70308d1be4d987c8c1eb29a109d235e4f011f510 Mon Sep 17 00:00:00 2001
From: Vahagn Khachatryan
Date: Sun, 12 Aug 2018 09:42:45 +0100
Subject: [PATCH] Parsing and loading content.

---
 Makefile           |  2 +-
 gragir/__main__.py | 63 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/Makefile b/Makefile
index 1129c2a..d1369a1 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@ test_all: $(addprefix test_,$(TEST_FILES))
 .PHONY: $(addprefix test_,$(TEST_FILES))
 $(addprefix test_,$(TEST_FILES)): test_%:
 	@echo Testing $*
-	@python -v gragir/__main__.py $* test_out/$(*F).epub
+	@python gragir/__main__.py -v $* test_out/$(*F).epub
 
 #python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub
 
diff --git a/gragir/__main__.py b/gragir/__main__.py
index 35e0784..bfdd1f1 100644
--- a/gragir/__main__.py
+++ b/gragir/__main__.py
@@ -4,9 +4,9 @@
 """
 
 # Standard library modules do the heavy lifting. Ours is all simple stuff.
-# import base64
-# import email, email.message
-# import mimetypes
+import base64
+import email, email.message
+import mimetypes
 # import os
 # import quopri
 
@@ -40,15 +40,51 @@ def parseArguments():
 def validateMht(fileName):
     return True
 
-def parseMhtFile(mhtFileName):
+def parseMht(mht, content):
     logger = logging.getLogger(__name__)
-    logger.info(mhtFileName)
-    pass
 
-def parseZipFile( zip ):
+    mhtContent = email.message_from_bytes(mht)
+
+    parts = mhtContent.get_payload()
+    # Multiple parts, usually? If single 'str' part, then convert to a list.
+    if not type(parts) is list:
+        parts = [mhtContent]
+
+    logger.info(' Number of parts: {}'.format(len(parts)))
+
+    # Save all parts to files.
+    for p in parts: # walk() for a tree, but I'm guessing MHT is never nested?
+        #??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts.
+        ct = p.get_content_type()
+        # String coerced to lower case of the form maintype/subtype, else get_default_type().
+        fp = p.get("content-location") or "index.html" # File path. Expecting root HTML is only part with no location.
+
+        logger.info(' Content type: {}, Location: {}, Size: {}'
+            .format(ct, fp, len(p.get_payload())))
+
+        content[fp] = p.get_payload(decode=True)
+        # Create directories as necessary.
+        # if os.path.dirname(fp):
+        #     os.makedirs(os.path.dirname(fp), exist_ok=True)
+
+        # # Save part's body to a file.
+        # open(fp, "wb").write(p.get_payload(decode=True))
+
+def parseMhtFile(zip, mhtInfo, content):
+    logger = logging.getLogger(__name__)
+    logger.info('Parsing {}, size: {}, csize: {} '
+        .format(mhtInfo.filename,
+            mhtInfo.file_size,
+            mhtInfo.compress_size))
+
+    with zip.open(mhtInfo) as mht:
+        parseMht(mht.read(), content)
+
+
+def parseZipFile(zip, content):
     for zipMember in zip.infolist():
         if validateMht(zipMember):
-            parseMhtFile(zipMember.filename)
+            parseMhtFile(zip, zipMember, content)
         else:
             pass
 
@@ -71,8 +107,15 @@ def main():
     logger = logging.getLogger(__name__)
     logger.info("Parsing {}.".format(args.zip))
 
+    content = {}
+
     with zipfile.ZipFile(args.zip, 'r') as zip:
-        parseZipFile(zip)
+        parseZipFile(zip, content)
+
+    logger.info("Loaded {} parts.".format(len(content)))
+    for name in content.keys():
+        logger.info("{}".format(name))
+
 
 if __name__ == "__main__":
-    main() # Kindda useless if we're not using doctest or anything?
+    main()