Saving internal image into test_out/test_save
.gitignore (vendored)
@@ -104,10 +104,6 @@ ENV/
samples/*.zip
temp/*
test_out/*
<<<<<<< HEAD
.vscode/gragir.code-workspace
gragir.7z
||||||| merged common ancestors
=======
.vscode/gragir..code-workspace
>>>>>>> e75fd39a62122b51fe018cdc4bab7100a4d1208f
gragir.log
@@ -19,6 +19,7 @@ import ebooklib.epub as ebooklib
from book import Book, Item
from parse_mhtml import parseMhtmlZipFile
from enrich_html import EnrichHtml
from prepare_epub import PrepareEpub


def parseArguments():
    """
@@ -46,6 +47,15 @@ def configLogger(args):
    logging.basicConfig(
        format='%(message)s',
        level=loggingLevel)


    fh = logging.FileHandler('gragir.log', mode='w')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(fmt=logging.Formatter(fmt='%(asctime)s %(levelname)s: %(name)s - %(message)s'))
    #, datefmt='%H:%M:%S'
    #'%(asctime)s %(levelname)s: %(name)s - %(message)s')
    logger = logging.getLogger()
    logger.addHandler(fh)
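As a standalone illustration of the handler setup added in the configLogger hunk above (the function name and the default level below are placeholders; only the gragir.log name and the format strings come from the diff):

import logging


def config_logger_sketch(logging_level=logging.INFO):
    # Console output via basicConfig: bare message text at the chosen level.
    logging.basicConfig(format='%(message)s', level=logging_level)

    # Extra file handler on the root logger; gragir.log is rewritten on each
    # run (mode='w').  Records still have to pass the root logger's own level
    # before any handler, including this one, sees them.
    fh = logging.FileHandler('gragir.log', mode='w')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s: %(name)s - %(message)s'))
    logging.getLogger().addHandler(fh)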
@@ -211,6 +221,8 @@ def main():
    parseMhtmlZipFile(args.zip, book)
    EnrichHtml.enrich(book)
    PrepareEpub.prepare(book)
    book.save_in_dir('test_out/test_save')
    #createDAG(book)
    #createEpubBook(book)
@@ -1,6 +1,5 @@
import os
import logging

class Item(object):
@@ -12,6 +11,33 @@ class Item(object):
        self.needs = set()
        self.soup = None

    def save_file(self, directory):
        logger = logging.getLogger(__name__)
        if hasattr(self, 'remove'):
            return
        #
        # Create file name.
        #
        if directory[-1] != '/':
            directory += '/'
        file_name = directory + self.url
        logger.info("Saved {}".format(file_name))
        #
        # Ensure directory exists.
        #
        dir = os.path.dirname(file_name)
        if not os.path.exists(dir):
            os.makedirs(dir)
        #
        # Save content.
        #
        if self.soup:
            with open(file_name, 'wb') as file:
                file.write(self.soup.prettify("utf-8"))
        else:
            with open(file_name, 'wb') as file:
                file.write(self.payload)


class Book(object):

    def __init__(self, file_name):
@@ -19,3 +45,8 @@ class Book(object):
        self.content = {}
        self.first = None

    def save_in_dir(self, directory):
        if not os.path.exists(directory):
            os.makedirs(directory)
        for item in self.content.values():
            item.save_file(directory)
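A note on the path handling in Item.save_file above: the directory-plus-URL join and the directory creation can also be written with os.path.join and exist_ok=True, which makes the trailing-slash check unnecessary. This is an alternative sketch, not what the commit does; the helper name is made up:

import os


def save_path_sketch(directory, relative_url):
    # Join the output directory with the item's (already localized) URL and
    # make sure the parent directory exists before the file is written.
    file_name = os.path.join(directory, relative_url)
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    return file_name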
@@ -39,8 +39,9 @@ class EnrichHtml(object):
            if item.soup is not None:
                logger.info("Create DAG {}".format(item.url))

                links = item.soup.find_all('a')
                for link in links:
                my_url = urlparse.urlsplit(item.url)

                for link in item.soup.find_all('a'):
                    href = link.get('href')
                    if not href:
                        continue
@@ -58,6 +59,24 @@ class EnrichHtml(object):
                    elif href:
                        logger.info(" referred but no item exists: {}".format(url))

                for link in item.soup.find_all('img'):
                    href = link.get('src')
                    if not href:
                        continue
                    parsed_href = urlparse.urlsplit(href)
                    url = \
                        urlparse.SplitResult(parsed_href.scheme,
                                             parsed_href.netloc,
                                             parsed_href.path,
                                             parsed_href.query,
                                             None).geturl()

                    if url in book.content:
                        book.content[url].needed_by.add(item.url)
                        item.needs.add(url)
                    elif href:
                        logger.info(" referred but no item exists: {}".format(url))

    @classmethod
    def populateContent(cls, book):
        logger = logging.getLogger(__name__)
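Both loops above normalize a link the same way: split the href/src value and re-assemble it without its fragment before looking it up in book.content, so 'page.html#anchor' and 'page.html' resolve to the same item. A small helper capturing just that step could look like the following (the helper is illustrative; the commit keeps the inline SplitResult calls):

import urllib.parse as urlparse


def normalized_url_sketch(href):
    # Rebuild the URL with fragment=None so in-page anchors do not produce
    # distinct keys.
    parts = urlparse.urlsplit(href)
    return urlparse.SplitResult(parts.scheme, parts.netloc, parts.path,
                                parts.query, None).geturl()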
gragir/prepare_epub.py (new file, 178 lines)
@@ -0,0 +1,178 @@
import os
import logging
import urllib.parse as urlparse
from bs4 import BeautifulSoup

from book import Item, Book


class PrepareEpub(object):

    @classmethod
    def prepare(cls, book):
        logger = logging.getLogger(__name__)
        logger.info("BEGIN Prepare EPUB.")
        cls.localize_url(book)
        logger.info("END Prepare EPUB.")

    @classmethod
    def localize_url(cls, book):
        #logger = logging.getLogger(__name__)
        for item in book.content.values():
            if hasattr(item, 'remove'):
                continue
            category = item.content_type.split("/")[0]
            if category != 'text':
                cls._moveTo(book, item, category)
            else:
                cls._moveTo(book, item, "")

    @classmethod
    def _moveTo(cls, book, item, category):
        logger = logging.getLogger(__name__)
        parsed_url = urlparse.urlsplit(item.url)
        file_name = os.path.basename(parsed_url.path)
        if category:
            new_url = category + "/" + file_name
        else:
            new_url = file_name
        if item.url != new_url \
           and new_url in book.content:
            new_url = cls._findUniqueName(book, category, file_name)

        logger.info("Renaming {} -> {}"
                    .format(item.url, new_url))

        for dependant_url in item.needed_by:
            dependant = book.content[dependant_url]
            if dependant.soup is None:
                continue
            base_link = urlparse.urlsplit(dependant.url)
            base_link = base_link._replace(path=os.path.dirname(base_link.path))
            for a in dependant.soup.find_all('a'):
                href = a.get('href')
                if href and cls._getAbsoluteUrl(base_link, href) == item.url:
                    a['href'] = new_url
            for img in dependant.soup.find_all('img'):
                src = img.get('src')
                if src and cls._getAbsoluteUrl(base_link, src) == item.url:
                    img['src'] = new_url
        item.url = new_url

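To make the renaming in _moveTo concrete: a hypothetical item with content type image/png and URL https://host/book/assets/cover.png?v=2 ends up under its category directory, roughly as follows (the input values are invented for illustration):

import os
import urllib.parse as urlparse

item_url = "https://host/book/assets/cover.png?v=2"   # hypothetical input
category = "image"                                     # from content_type "image/png"
file_name = os.path.basename(urlparse.urlsplit(item_url).path)
new_url = category + "/" + file_name                   # -> "image/cover.png"

If "image/cover.png" were already taken by another item, _findUniqueName below would fall back to "image/cover_1.png", "image/cover_2.png", and so on.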
    @classmethod
    def _getAbsoluteUrl(cls, base_link, link):
        parsed = urlparse.urlsplit(link)
        if not parsed.netloc:
            path = parsed.path
            if not path or path[0] != '/':
                path = base_link.path + '/' + path
            parsed = parsed._replace(scheme=base_link.scheme,
                                     netloc=base_link.netloc,
                                     path=path)
        return \
            urlparse.SplitResult(parsed.scheme,
                                 parsed.netloc,
                                 parsed.path,
                                 parsed.query,
                                 None).geturl()

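As an aside, most of what _getAbsoluteUrl does by hand is covered by urllib.parse.urljoin when it is given the referring document's full URL; a sketch of the equivalent, which the commit does not use:

import urllib.parse as urlparse


def absolute_url_sketch(base_url, link):
    # Resolve a possibly relative link against the referring document's URL,
    # then drop the fragment, mirroring the fragment=None in the code above.
    joined = urlparse.urljoin(base_url, link)
    return urlparse.urlsplit(joined)._replace(fragment='').geturl()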
    @classmethod
    def _findUniqueName(cls, book, category, filename):
        i = 0
        file_name_base, file_ext = os.path.splitext(filename)
        while True:
            i += 1
            if category:
                new_url = category + '/' + file_name_base + '_' + str(i) + file_ext
            else:
                new_url = file_name_base + '_' + str(i) + file_ext
            if new_url not in book.content:
                break
        return new_url

    @classmethod
    def createDAG(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                logger.info("Create DAG {}".format(item.url))

                links = item.soup.find_all('a')
                for link in links:
                    href = link.get('href')
                    if not href:
                        continue
                    parsed_href = urlparse.urlsplit(href)
                    url = \
                        urlparse.SplitResult(parsed_href.scheme,
                                             parsed_href.netloc,
                                             parsed_href.path,
                                             parsed_href.query,
                                             None).geturl()

                    if url in book.content:
                        book.content[url].needed_by.add(item.url)
                        item.needs.add(url)
                    elif href:
                        logger.info(" referred but no item exists: {}".format(url))

    @classmethod
    def populateContent(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                # Try to find content.
                item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
                if len(item_content) == 1:
                    item.content = item_content[0]
                else:
                    logger.error("No content found: {}".format(item.url))
                    item.remove = True

    @classmethod
    def createOrder(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                # Try to get prev chapter.
                links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
                if len(links):
                    item.prev = links[0].get('href')

                # Try to get next chapter.
                links = item.soup.find_all('a', attrs={"class": "next nav-link"})
                if len(links):
                    item.next = links[0].get('href')

        for item in book.content.values():
            if item.soup is not None \
               and not hasattr(item, 'prev') \
               and not hasattr(item, 'remove'):
                if book.first:
                    logger.error("Multiple begin points found. {} and {}"
                                 .format(book.first.url, item.url))
                    raise Exception("Multiple begin points found.")
                else:
                    book.first = item

    @classmethod
    def getTitle(cls, item):
        if hasattr(item.soup, 'title') and item.soup.title:
            return item.soup.title.string
        else:
            return item.url

    @classmethod
    def print(cls, book):
        logger = logging.getLogger(__name__)
        item = book.first
        while item is not None:
            logger.info("Item: {}".format(cls.getTitle(item)))
            if hasattr(item, 'prev'):
                logger.info(" Prev: {}".format(item.prev))
            if hasattr(item, 'next'):
                logger.info(" Next: {}".format(item.next))
            for url in item.needs:
                logger.info(" Needs: {}".format(url))
            logger.info("")

            if hasattr(item, 'next'):
                item = book.content[item.next]
            else:
                item = None