From 3e225ee5c208be517de5c37222ea012dd4a15e46 Mon Sep 17 00:00:00 2001 From: Vahagn Khachatryan Date: Mon, 20 Aug 2018 06:12:46 +0100 Subject: [PATCH] Last changes. --- gragir/__main__.py | 157 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 130 insertions(+), 27 deletions(-) diff --git a/gragir/__main__.py b/gragir/__main__.py index ca7dd3d..0db8f41 100644 --- a/gragir/__main__.py +++ b/gragir/__main__.py @@ -17,6 +17,7 @@ import argparse import zipfile import email +import urllib.parse as urlparse import ebooklib.epub as ebooklib from bs4 import BeautifulSoup @@ -36,19 +37,38 @@ def parseArguments(): return args +def configLogger(args): + loggingLevel = logging.DEBUG if args.debug \ + else logging.INFO if args.verbose \ + else logging.WARNING + # logging.basicConfig( + # format='%(asctime)s %(levelname)s: %(name)s - %(message)s', + # level=loggingLevel) + logging.basicConfig( + format='%(message)s', + level=loggingLevel) + def validateMht(fileName): return True class Item(object): - def __init__(self, file_name, content_type, payload): - self.file_name = file_name + def __init__(self, url, content_type, payload): + self.url = url self.content_type = content_type self.payload = payload + self.needed_by = set() + self.needs = set() +class Book(object): -def parseMht(mht, content): + def __init__(self, file_name): + self.file_name = file_name + self.content = {} + self.first = None + +def parseMht(mht, book): logger = logging.getLogger(__name__) mhtContent = email.message_from_bytes(mht) @@ -70,10 +90,10 @@ def parseMht(mht, content): logger.info(' Content type: {}, Location: {}, Size: {}' .format(ct, fp, len(p.get_payload()))) - content[fp] = Item(fp, ct, p.get_payload(decode=True)) + book.content[fp] = Item(fp, ct, p.get_payload(decode=True)) -def parseMhtFile(zip, mhtInfo, content): +def parseMhtFile(zip, mhtInfo, book): logger = logging.getLogger(__name__) logger.info('Parsing {}, size: {}, csize: {} ' .format(mhtInfo.filename, @@ -81,37 +101,120 @@ def parseMhtFile(zip, mhtInfo, content): mhtInfo.compress_size)) with zip.open(mhtInfo) as mht: - parseMht(mht.read(), content) + parseMht(mht.read(), book) -def parseZipFile(zip, content): +def parseZipFile(zip, book): logger = logging.getLogger(__name__) for zipMember in zip.infolist(): if validateMht(zipMember): - parseMhtFile(zip, zipMember, content) + parseMhtFile(zip, zipMember, book) else: - pass + logger.error("Unexpected file in zip: {}".format(zipMember)) + raise Exception("Unexpected file in zip.") -def enrichContent(content): + +def parseHtml(book): logger = logging.getLogger(__name__) - logger.info("Loaded {} parts.".format(len(content))) - for item in content.values(): - logger.info("Enriching {} {}".format(item.content_type, item.file_name)) + logger.info("Loaded {} parts.".format(len(book.content))) + for item in book.content.values(): + logger.info("Enriching {} {}".format(item.content_type, item.url)) if item.content_type == 'text/html': item.soup = BeautifulSoup(item.payload, "lxml") + if hasattr(item.soup, 'title') and item.soup.title: + item.title = item.soup.title.string + else: + logger.info("No title for {}".format(item.url)) + + +def createDAG(book): + logger = logging.getLogger(__name__) + for item in book.content.values(): + if hasattr(item, 'soup'): + if hasattr(item.soup, 'title') and item.soup.title: + logger.info("Title {}".format(item.soup.title.string)) + else: + logger.info("No title for {}".format(item.url)) + + links = item.soup.find_all('a') + for link in links: + href = link.get('href') + if not href: + continue + parsed_href = urlparse.urlsplit(href) + url = \ + urlparse.SplitResult(parsed_href.scheme, + parsed_href.netloc, + parsed_href.path, + parsed_href.query, + None).geturl() + + if url in book.content: + book.content[url].needed_by.add(item.url) + item.needs.add(url) + elif href: + logger.info(" refered but no item exist: {}".format(url)) + + # Try to get prev chapter. + links = item.soup.find_all('a', attrs={"class": "prev nav-link"}) + if len(links): + item.prev = links[0].get('href') + + # Try to get next chapter. + links = item.soup.find_all('a', attrs={"class": "next nav-link"}) + if len(links): + item.next = links[0].get('href') + + # Try to find content. + item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"}) + if len(item_content) == 1: + item.content = item_content[0] + else: + logger.error("No content found: {}".format(item.url)) + item.remove = True + + for item in book.content.values(): + if hasattr(item, 'soup') \ + and not hasattr(item, 'prev') \ + and not hasattr(item, 'remove'): + if book.first: + logger.error("Multiple begin points found. {} and {}" + .format(it.url, item.url)) + raise Exception("Multiple begin points found.") + else: + book.first = item + + for item in book.content.values(): + logger.info("Item: {}".format(item.url)) + if hasattr(item, 'prev'): + logger.info(" Prev: {}".format(item.prev)) + if hasattr(item, 'next'): + logger.info(" Next: {}".format(item.next)) + for url in item.needs: + logger.info(" Needs: {}".format(url)) + # for name in content.keys(): -def configLogger(args): - loggingLevel = logging.DEBUG if args.debug \ - else logging.INFO if args.verbose \ - else logging.WARNING - logging.basicConfig( - format='%(asctime)s %(levelname)s: %(name)s - %(message)s', - level=loggingLevel) +def createEpubHtml(item): + html = ebooklib.EpubHtml() + return html -def createEpubBook(content): - book = ebooklib.EpubBook() +def createEpubBook(book): + logger = logging.getLogger(__name__) + + ebook = ebooklib.EpubBook() + + it = book.first + while it: + if it.content_type == 'text/html': + html = createEpubHtml(it) + ebook.add_item(html) + elif it.content_type == 'image/html': + html = createEpubHtml(it) + ebook.add_item(html) + + writeEpubBook(book.file_name, ebook) # class EpubImage(EpubItem): # class EpubNav(EpubHtml): @@ -249,14 +352,14 @@ def main(): logger = logging.getLogger(__name__) logger.info("Parsing {}.".format(args.zip)) - content = {} + book = Book(args.epub) with zipfile.ZipFile(args.zip, 'r') as zip: - parseZipFile(zip, content) + parseZipFile(zip, book) - enrichContent(content) - book = createEpubBook(content) - writeEpubBook(args.epub, book) + parseHtml(book) + createDAG(book) + createEpubBook(book) if __name__ == "__main__":