HTML enrichment is separated into a new module.

2018-09-11 07:21:25 +01:00
parent 59deeff397
commit 14c25d79ed
3 changed files with 130 additions and 82 deletions


@@ -14,12 +14,11 @@ import sys
import logging
import argparse
import urllib.parse as urlparse
import ebooklib.epub as ebooklib
from bs4 import BeautifulSoup
from book import Book, Item
from parse_mhtml import parseMhtmlZipFile
from enrich_html import EnrichHtml
def parseArguments():
"""
@@ -48,84 +47,6 @@ def configLogger(args):
        format='%(message)s',
        level=loggingLevel)


def parseHtml(book):
    logger = logging.getLogger(__name__)
    logger.info("Loaded {} parts.".format(len(book.content)))
    for item in book.content.values():
        logger.info("Enriching {} {}".format(item.content_type, item.url))
        if item.content_type == 'text/html':
            item.soup = BeautifulSoup(item.payload, "lxml")
            if hasattr(item.soup, 'title') and item.soup.title:
                item.title = item.soup.title.string
            else:
                logger.info("No title for {}".format(item.url))


def createDAG(book):
    logger = logging.getLogger(__name__)
    for item in book.content.values():
        if hasattr(item, 'soup'):
            if hasattr(item.soup, 'title') and item.soup.title:
                logger.info("Title {}".format(item.soup.title.string))
            else:
                logger.info("No title for {}".format(item.url))
            links = item.soup.find_all('a')
            for link in links:
                href = link.get('href')
                if not href:
                    continue
                parsed_href = urlparse.urlsplit(href)
                # Rebuild the URL without its fragment so in-page
                # anchors resolve to the same item.
                url = \
                    urlparse.SplitResult(parsed_href.scheme,
                                         parsed_href.netloc,
                                         parsed_href.path,
                                         parsed_href.query,
                                         None).geturl()
                if url in book.content:
                    book.content[url].needed_by.add(item.url)
                    item.needs.add(url)
                elif href:
                    logger.info("    referred but no item exists: {}".format(url))
            # Try to get prev chapter.
            links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
            if len(links):
                item.prev = links[0].get('href')
            # Try to get next chapter.
            links = item.soup.find_all('a', attrs={"class": "next nav-link"})
            if len(links):
                item.next = links[0].get('href')
            # Try to find content.
            item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
            if len(item_content) == 1:
                item.content = item_content[0]
            else:
                logger.error("No content found: {}".format(item.url))
                item.remove = True
    for item in book.content.values():
        if hasattr(item, 'soup') \
                and not hasattr(item, 'prev') \
                and not hasattr(item, 'remove'):
            if book.first:
                logger.error("Multiple begin points found. {} and {}"
                             .format(book.first.url, item.url))
                raise Exception("Multiple begin points found.")
            else:
                book.first = item
    for item in book.content.values():
        logger.info("Item: {}".format(item.url))
        if hasattr(item, 'prev'):
            logger.info("  Prev: {}".format(item.prev))
        if hasattr(item, 'next'):
            logger.info("  Next: {}".format(item.next))
        for url in item.needs:
            logger.info("  Needs: {}".format(url))
# for name in content.keys():
@@ -289,8 +210,8 @@ def main():
    book = Book(args.epub)
    parseMhtmlZipFile(args.zip, book)
    parseHtml(book)
    createDAG(book)
    EnrichHtml.enrich(book)
    # createDAG(book)
    # createEpubBook(book)
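
For context, EnrichHtml.enrich(book) now replaces the inlined parseHtml/createDAG calls shown above. A minimal driver sketch under that assumption, with hypothetical file paths standing in for the parsed argparse values:

import logging
from book import Book
from parse_mhtml import parseMhtmlZipFile
from enrich_html import EnrichHtml

logging.basicConfig(format='%(message)s', level=logging.INFO)
book = Book("book.epub")                 # hypothetical output path (args.epub)
parseMhtmlZipFile("capture.zip", book)   # hypothetical input archive (args.zip)
EnrichHtml.enrich(book)                  # parse, link, populate, order, print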


@@ -10,6 +10,7 @@ class Item(object):
        self.payload = payload
        self.needed_by = set()
        self.needs = set()
        self.soup = None


class Book(object):
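
For reference, the Item constructor after this change would look roughly like the sketch below; the attribute names come from the diff, while the constructor parameters url and content_type are assumptions based on how items are used in enrich_html.py:

class Item(object):
    def __init__(self, url, content_type, payload):  # signature is assumed
        self.url = url                    # assumed: items are keyed by URL
        self.content_type = content_type  # assumed: checked in EnrichHtml.parse
        self.payload = payload
        self.needed_by = set()            # URLs of items that link here
        self.needs = set()                # URLs this item links to
        self.soup = None                  # parsed DOM, set by EnrichHtml.parse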

gragir/enrich_html.py (new file, 126 lines)

@@ -0,0 +1,126 @@
import logging
import urllib.parse as urlparse
from bs4 import BeautifulSoup
from book import Item, Book


class EnrichHtml(object):

    @classmethod
    def enrich(cls, book):
        logger = logging.getLogger(__name__)
        logger.info("BEGIN Html Enrichment {} items.".format(len(book.content)))
        cls.parse(book)
        cls.createDAG(book)
        cls.populateContent(book)
        cls.createOrder(book)
        cls.print(book)
        logger.info("END Html Enrichment {} items.".format(len(book.content)))

    @classmethod
    def parse(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.content_type == 'text/html':
                logger.info("Parsing {} {}".format(item.content_type, item.url))
                item.soup = BeautifulSoup(item.payload, "lxml")
                if hasattr(item.soup, 'title') and item.soup.title:
                    item.title = item.soup.title.string
                else:
                    logger.info("No title for {}".format(item.url))
            else:
                logger.info("Skipping {} {}".format(item.content_type, item.url))

    @classmethod
    def createDAG(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                logger.info("Create DAG {}".format(item.url))
                links = item.soup.find_all('a')
                for link in links:
                    href = link.get('href')
                    if not href:
                        continue
                    parsed_href = urlparse.urlsplit(href)
                    # Rebuild the URL without its fragment so in-page
                    # anchors resolve to the same item.
                    url = \
                        urlparse.SplitResult(parsed_href.scheme,
                                             parsed_href.netloc,
                                             parsed_href.path,
                                             parsed_href.query,
                                             None).geturl()
                    if url in book.content:
                        book.content[url].needed_by.add(item.url)
                        item.needs.add(url)
                    elif href:
                        logger.info("    referred but no item exists: {}".format(url))

    @classmethod
    def populateContent(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                # Try to find content.
                item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
                if len(item_content) == 1:
                    item.content = item_content[0]
                else:
                    logger.error("No content found: {}".format(item.url))
                    item.remove = True

    @classmethod
    def createOrder(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                # Try to get prev chapter.
                links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
                if len(links):
                    item.prev = links[0].get('href')
                # Try to get next chapter.
                links = item.soup.find_all('a', attrs={"class": "next nav-link"})
                if len(links):
                    item.next = links[0].get('href')
        for item in book.content.values():
            if item.soup is not None \
                    and not hasattr(item, 'prev') \
                    and not hasattr(item, 'remove'):
                if book.first:
                    logger.error("Multiple begin points found. {} and {}"
                                 .format(book.first.url, item.url))
                    raise Exception("Multiple begin points found.")
                else:
                    book.first = item

    @classmethod
    def getTitle(cls, item):
        if hasattr(item.soup, 'title') and item.soup.title:
            return item.soup.title.string
        else:
            return item.url

    @classmethod
    def print(cls, book):
        logger = logging.getLogger(__name__)
        item = book.first
        # Walk the chapter chain from the first item via the "next" links.
        while item is not None:
            logger.info("Item: {}".format(cls.getTitle(item)))
            if hasattr(item, 'prev'):
                logger.info("  Prev: {}".format(item.prev))
            if hasattr(item, 'next'):
                logger.info("  Next: {}".format(item.next))
            for url in item.needs:
                logger.info("  Needs: {}".format(url))
            logger.info("")
            if hasattr(item, 'next'):
                item = book.content[item.next]
            else:
                item = None
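
A standalone demonstration of the URL normalization used in createDAG above: rebuilding the split result with a None fragment drops in-page anchors, so links to "#sec-3" style targets resolve to the page itself. The href value is hypothetical:

import urllib.parse as urlparse

parsed = urlparse.urlsplit("ch02.html?v=1#sec-3")   # hypothetical href
url = urlparse.SplitResult(parsed.scheme, parsed.netloc,
                           parsed.path, parsed.query, None).geturl()
print(url)   # -> "ch02.html?v=1"; the "#sec-3" fragment is stripped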