From 509d54e5d6a9bfe2d563aab7ece02691afebba55 Mon Sep 17 00:00:00 2001
From: Vahagn Khachatryan <vahagn.khachatryan@gmail.com>
Date: Sun, 12 Aug 2018 13:11:19 +0100
Subject: [PATCH] Reading html files through BeautifulSoup

---
 .gitignore         |   2 +
 Makefile           |   8 +-
 gragir/__main__.py | 178 ++++++++++++++++++++++++++++++++++++++++-----
 modules/ebooklib   |   2 +-
 4 files changed, 168 insertions(+), 22 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0ca54b0..5b77593 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,5 @@ ENV/
 
 # smaple files
 samples/*.zip
+temp/*
+test_out/*
diff --git a/Makefile b/Makefile
index d1369a1..6466bdc 100644
--- a/Makefile
+++ b/Makefile
@@ -1,18 +1,20 @@
 
 TEST_FILES = $(wildcard samples/*.zip)
 
+PYTHONPATH = $(CURDIR)/modules/ebooklib
+
 .PHONY: test
-test:
-	python gragir/__main__.py -v samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub
+test: test_samples/algorithms_third_edition_in_c.zip 
 
 .PHONY: test_all
 test_all: $(addprefix test_,$(TEST_FILES))
 
 
+.ONESHELL:
 .PHONY: $(addprefix test_,$(TEST_FILES))
 $(addprefix test_,$(TEST_FILES)): test_%:
 	@echo Testing $*
-	@python gragir/__main__.py -v $* test_out/$(*F).epub
+	@set PYTHONPATH=$(PYTHONPATH) && python gragir/__main__.py -v $* test_out/$(*F).epub
 
 #python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub
 
diff --git a/gragir/__main__.py b/gragir/__main__.py
index bfdd1f1..ca7dd3d 100644
--- a/gragir/__main__.py
+++ b/gragir/__main__.py
@@ -4,9 +4,9 @@
 """
 
 # Standard library modules do the heavy lifting. Ours is all simple stuff.
-import base64
-import email, email.message
-import mimetypes
+# import base64
+# import email.message
+# import mimetypes
 # import os
 # import quopri
 
@@ -15,6 +15,10 @@ import logging
 import argparse
 
 import zipfile
+import email
+
+import ebooklib.epub as ebooklib
+from bs4 import BeautifulSoup
 
 def parseArguments():
     """
@@ -30,16 +34,20 @@ def parseArguments():
     parser.add_argument("-q", "--quiet", action="store_true", help="log only errors.")
     args = parser.parse_args() # --help is built-in.
 
-    # Validate command line.
-    # if args.pack == args.unpack:
-    #         sys.stderr.write("Invalid: must specify one action, either --pack or --unpack.\n")
-    #         sys.exit(-1)
-
     return args
 
+
 def validateMht(fileName):
     return True
 
+class Item(object):
+
+    def __init__(self, file_name, content_type, payload):
+        self.file_name = file_name
+        self.content_type = content_type
+        self.payload = payload
+
+
 def parseMht(mht, content):
     logger = logging.getLogger(__name__)
 
@@ -62,13 +70,8 @@ def parseMht(mht, content):
             logger.info('       Content type: {}, Location: {}, Size: {}'
                         .format(ct, fp, len(p.get_payload())))
 
-            content[fp] = p.get_payload(decode=True)
-            # Create directories as necessary.
-            # if os.path.dirname(fp):
-            #         os.makedirs(os.path.dirname(fp), exist_ok=True)
+            content[fp] = Item(fp, ct, p.get_payload(decode=True))
 
-            # # Save part's body to a file.
-            # open(fp, "wb").write(p.get_payload(decode=True))
 
 def parseMhtFile(zip, mhtInfo, content):
     logger = logging.getLogger(__name__)
@@ -82,12 +85,22 @@ def parseMhtFile(zip, mhtInfo, content):
 
 
 def parseZipFile(zip, content):
+    logger = logging.getLogger(__name__)
     for zipMember in zip.infolist():
         if validateMht(zipMember):
             parseMhtFile(zip, zipMember, content)
         else:
             pass
 
+def enrichContent(content):
+    logger = logging.getLogger(__name__)
+    logger.info("Loaded {} parts.".format(len(content)))
+    for item in content.values():
+        logger.info("Enriching {} {}".format(item.content_type, item.file_name))
+        if item.content_type == 'text/html':
+            item.soup = BeautifulSoup(item.payload, "lxml")
+
+    # for name in content.keys():
 
 def configLogger(args):
     loggingLevel = logging.DEBUG if args.debug \
@@ -97,7 +110,136 @@ def configLogger(args):
         format='%(asctime)s %(levelname)s: %(name)s - %(message)s',
         level=loggingLevel)
 
-# Just do it.
+def createEpubBook(content):
+    book = ebooklib.EpubBook()
+
+    #     class EpubImage(EpubItem):
+    #     class EpubNav(EpubHtml):
+    #     class EpubCoverHtml(EpubHtml):
+    #     class EpubHtml(EpubItem):
+    #     class EpubCover(EpubItem):
+    #     class EpubNcx(EpubItem):
+    #     class EpubItem(object):
+    #     class EpubException(Exception):
+    #     class Link(object):
+    #     class Section(object):
+
+
+    #     def set_identifier(self, uid)
+    #     def set_title(self, title)
+    #     def set_language(self, lang)
+    #     def set_cover(self, file_name, content, create_page=True):
+    #         """
+    #         Set cover and create cover document if needed.
+
+    #         :Args:
+    #           - file_name: file name of the cover page
+    #           - content: Content for the cover image
+    #           - create_page: Should cover page be defined. Defined as bool value (optional). Default value is True.
+    #         """
+
+    #     def add_author(self, author, file_as=None, role=None, uid='creator'):
+    #     def add_metadata(self, namespace, name, value, others=None):
+    #     def set_unique_metadata(self, namespace, name, value, others=None):
+    #         "Add metadata if metadata with this identifier does not already exist, otherwise update existing metadata."
+    #     def add_item(self, item):
+
+    #     def get_metadata(self, namespace, name):
+    #     def get_item_with_id(self, uid):
+    #         """
+    #         Returns item for defined UID.
+
+    #         >>> book.get_item_with_id('image_001')
+
+    #         :Args:
+    #           - uid: UID for the item
+
+    #         :Returns:
+    #           Returns item object. Returns None if nothing was found.
+    #         """
+
+    #     def get_item_with_href(self, href):
+    #         """
+    #         Returns item for defined HREF.
+
+    #         >>> book.get_item_with_href('EPUB/document.xhtml')
+
+    #         :Args:
+    #           - href: HREF for the item we are searching for
+
+    #         :Returns:
+    #           Returns item object. Returns None if nothing was found.
+    #         """
+
+    #     def get_items(self):
+    #     def get_items_of_media_type(self, media_type):
+    #     def get_items_of_type(self, item_type):
+    #         """
+    #         Returns all items of specified type.
+
+    #         >>> book.get_items_of_type(epub.ITEM_IMAGE)
+
+    #         :Args:
+    #           - item_type: Type for items we are searching for
+
+    #         :Returns:
+    #           Returns found items as tuple.
+    #         """
+    #         return (item for item in self.items if item.get_type() == item_type)
+
+
+    #     def get_template(self, name):
+    #    def set_template(self, name, value):
+    #         """
+    #         Defines templates which are used to generate certain types of pages. When defining new value for the template
+    #         we have to use content of type 'str' (Python 2) or 'bytes' (Python 3).
+
+    #         At the moment we use these templates:
+    #           - ncx
+    #           - nav
+    #           - chapter
+    #           - cover
+
+    #         :Args:
+    #           - name: Name for the template
+    #           - value: Content for the template
+    #         """
+
+    
+    #     def add_prefix(self, name, uri):
+    #         """
+    #         Appends custom prefix to be added to the content.opf document
+
+    #         >>> epub_book.add_prefix('bkterms', 'http://booktype.org/')
+
+    #         :Args:
+    #           - name: namespace name
+    #           - uri: URI for the namespace
+    #         """
+
+    return book
+
+def writeEpubBook(name, book, options=None):
+    """
+    Creates epub file with the content defined in EpubBook.
+
+    >>> writeEpubBook('book.epub', book)
+
+    :Args:
+      - name: file name for the output file
+      - book: instance of EpubBook
+      - options: extra options as dictionary (optional)
+    """
+    logger = logging.getLogger(__name__)
+
+    try:
+        epub = ebooklib.EpubWriter(name, book, options)
+        epub.process()
+        epub.write()
+    except Exception as e:
+        logger.error("Exception {}.".format(e))
+
+
 def main():
     """
     """
@@ -112,9 +254,9 @@ def main():
     with zipfile.ZipFile(args.zip, 'r') as zip:
         parseZipFile(zip, content)
 
-    logger.info("Loaded {} parts.".format(len(content)))
-    for name in content.keys():
-        logger.info("{}".format(name))
+    enrichContent(content)
+    book = createEpubBook(content)
+    writeEpubBook(args.epub, book)
 
 
 if __name__ == "__main__":
diff --git a/modules/ebooklib b/modules/ebooklib
index 00a3c6e..6a004d4 160000
--- a/modules/ebooklib
+++ b/modules/ebooklib
@@ -1 +1 @@
-Subproject commit 00a3c6e064c5a71dc8da38276f04e559c0cd66df
+Subproject commit 6a004d4ae3d6da44575a482aa3f605e9484f7b0c