Last changes.

2018-08-20 06:12:46 +01:00
parent 509d54e5d6
commit 3e225ee5c2
1 changed files with 130 additions and 27 deletions
--- a/gragir/main.py
+++ b/gragir/main.py
@@ -17,6 +17,7 @@ import argparse
 import zipfile
 import email
 import urllib.parse as urlparse
 import ebooklib.epub as ebooklib
 from bs4 import BeautifulSoup
@@ -36,19 +37,38 @@ def parseArguments():
    return args
 def configLogger(args):
    """Configure root logging from the parsed command-line flags.

    ``args.debug`` selects DEBUG, otherwise ``args.verbose`` selects INFO,
    otherwise WARNING is used.  Records are printed as the bare message.
    """
    if args.debug:
        level = logging.DEBUG
    elif args.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    # A richer layout is available if ever needed:
    #     '%(asctime)s %(levelname)s: %(name)s - %(message)s'
    logging.basicConfig(level=level, format='%(message)s')
 def validateMht(fileName):
    """Report whether ``fileName`` should be parsed as an MHT archive member.

    Placeholder: the argument is not inspected and every input is accepted.
    parseZipFile() calls this with each member of the zip archive.
    """
    return True
 class Item(object):
    """One part extracted from an MHT archive.

    Holds the part's location (``url``), its MIME ``content_type`` and its
    decoded ``payload``.  ``needs`` and ``needed_by`` start empty and are
    filled by createDAG() with the URLs this item links to and the URLs of
    the items linking to it.
    """
-    def __init__(self, file_name, content_type, payload):
+    def __init__(self, url, content_type, payload):
-        self.file_name = file_name
+        self.url = url
        self.content_type = content_type
        self.payload = payload
        self.needed_by = set()
        self.needs = set()
 class Book(object):
    """Container for everything parsed from the input archive.

    ``file_name`` is the path later handed to writeEpubBook(), ``content``
    maps a part's location to its Item, and ``first`` is the starting item
    chosen by createDAG() (``None`` until then).
    """
-def parseMht(mht, content):
+    def __init__(self, file_name):
        self.file_name = file_name
        self.content = {}
        self.first = None
 def parseMht(mht, book):
    logger = logging.getLogger(__name__)
    mhtContent = email.message_from_bytes(mht)
@@ -70,10 +90,10 @@ def parseMht(mht, content):
            logger.info('       Content type: {}, Location: {}, Size: {}'
                        .format(ct, fp, len(p.get_payload())))
-            content[fp] = Item(fp, ct, p.get_payload(decode=True))
+            book.content[fp] = Item(fp, ct, p.get_payload(decode=True))
-def parseMhtFile(zip, mhtInfo, content):
+def parseMhtFile(zip, mhtInfo, book):
    logger = logging.getLogger(__name__)
    logger.info('Parsing {}, size: {}, csize: {} '
                .format(mhtInfo.filename,
@@ -81,37 +101,120 @@ def parseMhtFile(zip, mhtInfo, content):
                        mhtInfo.compress_size))
    with zip.open(mhtInfo) as mht:
-        parseMht(mht.read(), content)
+        parseMht(mht.read(), book)
-def parseZipFile(zip, content):
+def parseZipFile(zip, book):
    """Parse every member of the open archive ``zip`` into ``book``.

    Each member accepted by validateMht() is handed to parseMhtFile(); any
    other member is logged and aborts the run.

    Raises:
        Exception: if validateMht() rejects a member.
    """
    logger = logging.getLogger(__name__)
    for zipMember in zip.infolist():
        if validateMht(zipMember):
-            parseMhtFile(zip, zipMember, content)
+            parseMhtFile(zip, zipMember, book)
        else:
-            pass
+            logger.error("Unexpected file in zip: {}".format(zipMember))
            raise Exception("Unexpected file in zip.")
-def enrichContent(content):
+
 def parseHtml(book):
    """Parse the payload of every ``text/html`` item in ``book``.

    The BeautifulSoup tree (built with the "lxml" parser) is stored as
    ``item.soup``.  When the document has a truthy ``title`` element its
    string is stored as ``item.title``; otherwise the missing title is only
    logged and ``item.title`` is left unset.
    """
    logger = logging.getLogger(__name__)
-    logger.info("Loaded {} parts.".format(len(content)))
+    logger.info("Loaded {} parts.".format(len(book.content)))
-    for item in content.values():
+    for item in book.content.values():
-        logger.info("Enriching {} {}".format(item.content_type, item.file_name))
+        logger.info("Enriching {} {}".format(item.content_type, item.url))
        if item.content_type == 'text/html':
            item.soup = BeautifulSoup(item.payload, "lxml")
            if hasattr(item.soup, 'title') and item.soup.title:
                item.title = item.soup.title.string
            else:
                logger.info("No title for {}".format(item.url))
 def createDAG(book):
    """Cross-link the HTML items of ``book`` and select its first chapter.

    For every item that carries a parsed ``soup`` this records:

    * ``needs`` / ``needed_by`` edges for each ``<a href>`` whose target,
      with the fragment stripped, is a key of ``book.content``;
    * ``prev`` / ``next``: the href of the first anchor whose class is
      ``prev nav-link`` / ``next nav-link``, when such an anchor exists;
    * ``content``: the ``div`` with id ``sbo-rt-content`` when exactly one
      is found, otherwise the item is flagged with ``remove = True``.

    The item that has a soup, no ``prev`` and no ``remove`` flag becomes
    ``book.first``.  Finally the collected links are logged per item.

    Raises:
        Exception: if more than one candidate for ``book.first`` exists.
    """
    logger = logging.getLogger(__name__)

    for item in book.content.values():
        if not hasattr(item, 'soup'):
            continue

        if hasattr(item.soup, 'title') and item.soup.title:
            logger.info("Title {}".format(item.soup.title.string))
        else:
            logger.info("No title for {}".format(item.url))

        for link in item.soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            # Items are keyed by location without a fragment, so drop the
            # "#..." part before looking the target up.
            url = urlparse.urlsplit(href)._replace(fragment='').geturl()
            if url in book.content:
                book.content[url].needed_by.add(item.url)
                item.needs.add(url)
            else:
                logger.info("   refered but no item exist: {}".format(url))

        # Try to get prev chapter.
        prev_links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
        if prev_links:
            item.prev = prev_links[0].get('href')

        # Try to get next chapter.
        next_links = item.soup.find_all('a', attrs={"class": "next nav-link"})
        if next_links:
            item.next = next_links[0].get('href')

        # Try to find content.
        item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
        if len(item_content) == 1:
            item.content = item_content[0]
        else:
            logger.error("No content found: {}".format(item.url))
            item.remove = True

    # The starting point is the only kept HTML item without a predecessor.
    for item in book.content.values():
        is_start = hasattr(item, 'soup') \
            and not hasattr(item, 'prev') \
            and not hasattr(item, 'remove')
        if not is_start:
            continue
        if book.first:
            # Report the already chosen start item next to the new one (the
            # previous code referenced an undefined name here and raised
            # NameError instead of the intended exception).
            logger.error("Multiple begin points found. {} and {}"
                         .format(book.first.url, item.url))
            raise Exception("Multiple begin points found.")
        book.first = item

    for item in book.content.values():
        logger.info("Item: {}".format(item.url))
        if hasattr(item, 'prev'):
            logger.info("   Prev: {}".format(item.prev))
        if hasattr(item, 'next'):
            logger.info("   Next: {}".format(item.next))
        for url in item.needs:
            logger.info("   Needs: {}".format(url))
    # for name in content.keys():
-def configLogger(args):
+def createEpubHtml(item):
    """Return a newly constructed ``ebooklib.EpubHtml`` object.

    NOTE(review): ``item`` is not used yet, so the returned object carries
    none of the item's data -- presumably still to be filled in; confirm.
    """
-    loggingLevel = logging.DEBUG if args.debug \
+    html = ebooklib.EpubHtml()
-                        else logging.INFO if args.verbose \
+    return html
                            else logging.WARNING
    logging.basicConfig(
        format='%(asctime)s %(levelname)s: %(name)s - %(message)s',
        level=loggingLevel)
-def createEpubBook(content):
+def createEpubBook(book):
-    book = ebooklib.EpubBook()
+    logger = logging.getLogger(__name__)
    ebook = ebooklib.EpubBook()
    it = book.first
    while it:
        if it.content_type == 'text/html':
            html = createEpubHtml(it)
            ebook.add_item(html)
        elif it.content_type == 'image/html':
            html = createEpubHtml(it)
            ebook.add_item(html)
    writeEpubBook(book.file_name, ebook)
    #     class EpubImage(EpubItem):
    #     class EpubNav(EpubHtml):
@@ -249,14 +352,14 @@ def main():
    logger = logging.getLogger(__name__)
    logger.info("Parsing {}.".format(args.zip))
-    content = {}
+    book = Book(args.epub)
    with zipfile.ZipFile(args.zip, 'r') as zip:
-        parseZipFile(zip, content)
+        parseZipFile(zip, book)
-    enrichContent(content)
+    parseHtml(book)
-    book = createEpubBook(content)
+    createDAG(book)
-    writeEpubBook(args.epub, book)
+    createEpubBook(book)
 if __name__ == "__main__":