From 70308d1be4d987c8c1eb29a109d235e4f011f510 Mon Sep 17 00:00:00 2001
From: Vahagn Khachatryan
Date: Sun, 12 Aug 2018 09:42:45 +0100
Subject: [PATCH] Parsing and loading content.

---
 Makefile           |  2 +-
 gragir/__main__.py | 63 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/Makefile b/Makefile
index 1129c2a..d1369a1 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@ test_all: $(addprefix test_,$(TEST_FILES))
 .PHONY: $(addprefix test_,$(TEST_FILES))
 $(addprefix test_,$(TEST_FILES)): test_%:
 	@echo Testing $*
-	@python -v gragir/__main__.py $* test_out/$(*F).epub
+	@python gragir/__main__.py -v $* test_out/$(*F).epub
 
 #python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub
 
diff --git a/gragir/__main__.py b/gragir/__main__.py
index 35e0784..bfdd1f1 100644
--- a/gragir/__main__.py
+++ b/gragir/__main__.py
@@ -4,9 +4,9 @@
 """
 
 # Standard library modules do the heavy lifting. Ours is all simple stuff.
-# import base64
-# import email, email.message
-# import mimetypes
+import base64
+import email, email.message
+import mimetypes
 # import os
 # import quopri
 
@@ -40,15 +40,51 @@ def parseArguments():
 def validateMht(fileName):
     return True
 
-def parseMhtFile(mhtFileName):
+def parseMht(mht, content):
     logger = logging.getLogger(__name__)
-    logger.info(mhtFileName)
-    pass
 
-def parseZipFile( zip ):
+    mhtContent = email.message_from_bytes(mht)
+
+    parts = mhtContent.get_payload()
+    # Multiple parts, usually? If single 'str' part, then convert to a list.
+    if not type(parts) is list:
+        parts = [mhtContent]
+
+    logger.info(' Number of parts: {}'.format(len(parts)))
+
+    # Save all parts to files.
+    for p in parts: # walk() for a tree, but I'm guessing MHT is never nested?
+        #??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts.
+        ct = p.get_content_type()
+        # String coerced to lower case of the form maintype/subtype, else get_default_type().
+        fp = p.get("content-location") or "index.html" # File path. Expecting root HTML is only part with no location.
+
+        logger.info(' Content type: {}, Location: {}, Size: {}'
+            .format(ct, fp, len(p.get_payload())))
+
+        content[fp] = p.get_payload(decode=True)
+        # Create directories as necessary.
+        # if os.path.dirname(fp):
+        #     os.makedirs(os.path.dirname(fp), exist_ok=True)
+
+        # # Save part's body to a file.
+        # open(fp, "wb").write(p.get_payload(decode=True))
+
+def parseMhtFile(zip, mhtInfo, content):
+    logger = logging.getLogger(__name__)
+    logger.info('Parsing {}, size: {}, csize: {} '
+        .format(mhtInfo.filename,
+            mhtInfo.file_size,
+            mhtInfo.compress_size))
+
+    with zip.open(mhtInfo) as mht:
+        parseMht(mht.read(), content)
+
+
+def parseZipFile(zip, content):
     for zipMember in zip.infolist():
         if validateMht(zipMember):
-            parseMhtFile(zipMember.filename)
+            parseMhtFile(zip, zipMember, content)
         else:
             pass
 
@@ -71,8 +107,15 @@ def main():
     logger = logging.getLogger(__name__)
     logger.info("Parsing {}.".format(args.zip))
 
+    content = {}
+
     with zipfile.ZipFile(args.zip, 'r') as zip:
-        parseZipFile(zip)
+        parseZipFile(zip, content)
+
+    logger.info("Loaded {} parts.".format(len(content)))
+    for name in content.keys():
+        logger.info("{}".format(name))
+
 
 if __name__ == "__main__":
-    main() # Kindda useless if we're not using doctest or anything?
+    main()