Last changes.

2018-08-20 06:12:46 +01:00
parent 509d54e5d6
commit 3e225ee5c2
1 changed files with 130 additions and 27 deletions
--- a/gragir/main.py
+++ b/gragir/main.py
@@ -17,6 +17,7 @@ import argparse
 import zipfile
 import email

+import urllib.parse as urlparse
 import ebooklib.epub as ebooklib
 from bs4 import BeautifulSoup

@@ -36,19 +37,38 @@ def parseArguments():

    return args

+def configLogger(args):
+    loggingLevel = logging.DEBUG if args.debug \
+                        else logging.INFO if args.verbose \
+                            else logging.WARNING
+    # logging.basicConfig(
+    #     format='%(asctime)s %(levelname)s: %(name)s - %(message)s',
+    #     level=loggingLevel)
+    logging.basicConfig(
+        format='%(message)s',
+        level=loggingLevel)
+

 def validateMht(fileName):
    return True

 class Item(object):

-    def __init__(self, file_name, content_type, payload):
-        self.file_name = file_name
+    def __init__(self, url, content_type, payload):
+        self.url = url
        self.content_type = content_type
        self.payload = payload
+        self.needed_by = set()
+        self.needs = set()

+class Book(object):

-def parseMht(mht, content):
+    def __init__(self, file_name):
+        self.file_name = file_name
+        self.content = {}
+        self.first = None
+
+def parseMht(mht, book):
    logger = logging.getLogger(__name__)

    mhtContent = email.message_from_bytes(mht)
@@ -70,10 +90,10 @@ def parseMht(mht, content):
            logger.info('       Content type: {}, Location: {}, Size: {}'
                        .format(ct, fp, len(p.get_payload())))

-            content[fp] = Item(fp, ct, p.get_payload(decode=True))
+            book.content[fp] = Item(fp, ct, p.get_payload(decode=True))


-def parseMhtFile(zip, mhtInfo, content):
+def parseMhtFile(zip, mhtInfo, book):
    logger = logging.getLogger(__name__)
    logger.info('Parsing {}, size: {}, csize: {} '
                .format(mhtInfo.filename,
@@ -81,37 +101,120 @@ def parseMhtFile(zip, mhtInfo, content):
                        mhtInfo.compress_size))

    with zip.open(mhtInfo) as mht:
-        parseMht(mht.read(), content)
+        parseMht(mht.read(), book)


-def parseZipFile(zip, content):
+def parseZipFile(zip, book):
    logger = logging.getLogger(__name__)
    for zipMember in zip.infolist():
        if validateMht(zipMember):
-            parseMhtFile(zip, zipMember, content)
+            parseMhtFile(zip, zipMember, book)
        else:
-            pass
+            logger.error("Unexpected file in zip: {}".format(zipMember))
+            raise Exception("Unexpected file in zip.")

-def enrichContent(content):
+
+def parseHtml(book):
    logger = logging.getLogger(__name__)
-    logger.info("Loaded {} parts.".format(len(content)))
-    for item in content.values():
-        logger.info("Enriching {} {}".format(item.content_type, item.file_name))
+    logger.info("Loaded {} parts.".format(len(book.content)))
+    for item in book.content.values():
+        logger.info("Enriching {} {}".format(item.content_type, item.url))
        if item.content_type == 'text/html':
            item.soup = BeautifulSoup(item.payload, "lxml")
+            if hasattr(item.soup, 'title') and item.soup.title:
+                item.title = item.soup.title.string
+            else:
+                logger.info("No title for {}".format(item.url))
+
+
+def createDAG(book):
+    logger = logging.getLogger(__name__)
+    for item in book.content.values():
+        if hasattr(item, 'soup'):
+            if hasattr(item.soup, 'title') and item.soup.title:
+                logger.info("Title {}".format(item.soup.title.string))
+            else:
+                logger.info("No title for {}".format(item.url))
+
+            links = item.soup.find_all('a')
+            for link in links:
+                href = link.get('href')
+                if not href:
+                    continue
+                parsed_href = urlparse.urlsplit(href)
+                url = \
+                    urlparse.SplitResult(parsed_href.scheme,
+                                         parsed_href.netloc,
+                                         parsed_href.path,
+                                         parsed_href.query,
+                                         None).geturl()
+
+                if url in book.content:
+                    book.content[url].needed_by.add(item.url)
+                    item.needs.add(url)
+                elif href:
+                    logger.info("   refered but no item exist: {}".format(url))
+
+            # Try to get prev chapter.
+            links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
+            if len(links):
+                item.prev = links[0].get('href')
+
+            # Try to get next chapter.
+            links = item.soup.find_all('a', attrs={"class": "next nav-link"})
+            if len(links):
+                item.next = links[0].get('href')
+
+            # Try to find content.
+            item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
+            if len(item_content) == 1:
+                item.content = item_content[0]
+            else:
+                logger.error("No content found: {}".format(item.url))
+                item.remove = True
+
+    for item in book.content.values():
+        if hasattr(item, 'soup') \
+            and not hasattr(item, 'prev') \
+            and not hasattr(item, 'remove'):
+            if book.first:
+                logger.error("Multiple begin points found. {} and {}"
+                             .format(it.url, item.url))
+                raise Exception("Multiple begin points found.")
+            else:
+                book.first = item
+
+    for item in book.content.values():
+        logger.info("Item: {}".format(item.url))
+        if hasattr(item, 'prev'):
+            logger.info("   Prev: {}".format(item.prev))
+        if hasattr(item, 'next'):
+            logger.info("   Next: {}".format(item.next))
+        for url in item.needs:
+            logger.info("   Needs: {}".format(url))
+

    # for name in content.keys():

-def configLogger(args):
-    loggingLevel = logging.DEBUG if args.debug \
-                        else logging.INFO if args.verbose \
-                            else logging.WARNING
-    logging.basicConfig(
-        format='%(asctime)s %(levelname)s: %(name)s - %(message)s',
-        level=loggingLevel)
+def createEpubHtml(item):
+    html = ebooklib.EpubHtml()
+    return html

-def createEpubBook(content):
-    book = ebooklib.EpubBook()
+def createEpubBook(book):
+    logger = logging.getLogger(__name__)
+
+    ebook = ebooklib.EpubBook()
+
+    it = book.first
+    while it:
+        if it.content_type == 'text/html':
+            html = createEpubHtml(it)
+            ebook.add_item(html)
+        elif it.content_type == 'image/html':
+            html = createEpubHtml(it)
+            ebook.add_item(html)
+        
+    writeEpubBook(book.file_name, ebook)

    #     class EpubImage(EpubItem):
    #     class EpubNav(EpubHtml):
@@ -249,14 +352,14 @@ def main():
    logger = logging.getLogger(__name__)
    logger.info("Parsing {}.".format(args.zip))

-    content = {}
+    book = Book(args.epub)

    with zipfile.ZipFile(args.zip, 'r') as zip:
-        parseZipFile(zip, content)
+        parseZipFile(zip, book)

-    enrichContent(content)
-    book = createEpubBook(content)
-    writeEpubBook(args.epub, book)
+    parseHtml(book)
+    createDAG(book)
+    createEpubBook(book)


 if __name__ == "__main__":