diff --git a/.gitignore b/.gitignore
index 6ce39be..91cb2f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -104,10 +104,6 @@ ENV/
 samples/*.zip
 temp/*
 test_out/*
-<<<<<<< HEAD
 .vscode/gragir.code-workspace
 gragir.7z
-||||||| merged common ancestors
-=======
-.vscode/gragir..code-workspace
->>>>>>> e75fd39a62122b51fe018cdc4bab7100a4d1208f
+gragir.log
\ No newline at end of file
diff --git a/gragir/__main__.py b/gragir/__main__.py
index 326b94b..607360a 100644
--- a/gragir/__main__.py
+++ b/gragir/__main__.py
@@ -19,6 +19,7 @@ import ebooklib.epub as ebooklib
 from book import Book, Item
 from parse_mhtml import parseMhtmlZipFile
 from enrich_html import EnrichHtml
+from prepare_epub import PrepareEpub
 
 def parseArguments():
     """
@@ -46,6 +47,13 @@ def configLogger(args):
     logging.basicConfig(
         format='%(message)s',
         level=loggingLevel)
+
+
+    fh = logging.FileHandler('gragir.log', mode='w')
+    fh.setLevel(logging.DEBUG)
+    fh.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(name)s - %(message)s'))
+    logger = logging.getLogger()
+    logger.addHandler(fh)
@@ -211,6 +219,9 @@ def main():
     parseMhtmlZipFile(args.zip, book)
 
     EnrichHtml.enrich(book)
+    PrepareEpub.prepare(book)
+
+    book.save_in_dir('test_out/test_save')
 
     #createDAG(book)
     #createEpubBook(book)
diff --git a/gragir/book.py b/gragir/book.py
index 307667a..d048e2a 100644
--- a/gragir/book.py
+++ b/gragir/book.py
@@ -1,6 +1,5 @@
-
-
-
+import os
+import logging
 
 class Item(object):
@@ -12,6 +11,35 @@ class Item(object):
         self.needs = set()
         self.soup = None
 
+    def save_file(self, directory):
+        logger = logging.getLogger(__name__)
+        if hasattr(self, 'remove'):
+            return
+        #
+        # Create the file name.
+        #
+        if directory[-1] != '/':
+            directory += '/'
+        file_name = directory + self.url
+        logger.info("Saving {}".format(file_name))
+        #
+        # Ensure the directory exists.
+        #
+        dir_name = os.path.dirname(file_name)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name)
+        #
+        # Save content.
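+        # Parsed items are re-serialized from their BeautifulSoup tree;
+        # anything else is written back verbatim from the raw payload.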
+        #
+        if self.soup:
+            with open(file_name, 'wb') as file:
+                file.write(self.soup.prettify("utf-8"))
+        else:
+            with open(file_name, 'wb') as file:
+                file.write(self.payload)
+
 class Book(object):
     def __init__(self, file_name):
@@ -19,3 +47,8 @@ class Book(object):
         self.content = {}
         self.first = None
 
+    def save_in_dir(self, directory):
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+        for item in self.content.values():
+            item.save_file(directory)
diff --git a/gragir/enrich_html.py b/gragir/enrich_html.py
index edac364..fffc207 100644
--- a/gragir/enrich_html.py
+++ b/gragir/enrich_html.py
@@ -39,8 +39,9 @@ class EnrichHtml(object):
             if item.soup is not None:
                 logger.info("Create DAG {}".format(item.url))
 
-                links = item.soup.find_all('a')
-                for link in links:
+                my_url = urlparse.urlsplit(item.url)
+
+                for link in item.soup.find_all('a'):
                     href = link.get('href')
                     if not href:
                         continue
@@ -58,6 +59,24 @@ class EnrichHtml(object):
                 elif href:
                     logger.info("  refered but no item exist: {}".format(url))
 
+                for link in item.soup.find_all('img'):
+                    href = link.get('src')
+                    if not href:
+                        continue
+                    parsed_href = urlparse.urlsplit(href)
+                    url = \
+                        urlparse.SplitResult(parsed_href.scheme,
+                                             parsed_href.netloc,
+                                             parsed_href.path,
+                                             parsed_href.query,
+                                             None).geturl()
+
+                    if url in book.content:
+                        book.content[url].needed_by.add(item.url)
+                        item.needs.add(url)
+                    elif href:
+                        logger.info("  referred but no item exists: {}".format(url))
+
     @classmethod
     def populateContent(cls, book):
         logger = logging.getLogger(__name__)
diff --git a/gragir/prepare_epub.py b/gragir/prepare_epub.py
new file mode 100644
index 0000000..4881768
--- /dev/null
+++ b/gragir/prepare_epub.py
@@ -0,0 +1,191 @@
+import os
+import logging
+import urllib.parse as urlparse
+from bs4 import BeautifulSoup
+
+from book import Item, Book
+
+
+class PrepareEpub(object):
+
+    @classmethod
+    def prepare(cls, book):
+        logger = logging.getLogger(__name__)
+        logger.info("BEGIN Prepare EPUB.")
+        cls.localize_url(book)
+        logger.info("END Prepare EPUB.")
+
+    @classmethod
+    def localize_url(cls, book):
+        # Iterate over a copy: _moveTo re-keys book.content while renaming.
+        for item in list(book.content.values()):
+            if hasattr(item, 'remove'):
+                continue
+            category = item.content_type.split("/")[0]
+            if category != 'text':
+                cls._moveTo(book, item, category)
+            else:
+                cls._moveTo(book, item, "")
+
+    @classmethod
+    def _moveTo(cls, book, item, category):
+        logger = logging.getLogger(__name__)
+        parsed_url = urlparse.urlsplit(item.url)
+        file_name = os.path.basename(parsed_url.path)
+        if category:
+            new_url = category + "/" + file_name
+        else:
+            new_url = file_name
+        if item.url == new_url:
+            return
+        if new_url in book.content:
+            new_url = cls._findUniqueName(book, category, file_name)
+
+        logger.info("Renaming {} -> {}"
+                    .format(item.url, new_url))
+
+        for dependant_url in item.needed_by:
+            dependant = book.content.get(dependant_url)
+            if dependant is None or dependant.soup is None:
+                continue
+            # SplitResult is immutable; use _replace to drop the file part.
+            base_link = urlparse.urlsplit(dependant.url)
+            base_link = base_link._replace(
+                path=os.path.dirname(base_link.path))
+            for a in dependant.soup.find_all('a'):
+                href = a.get('href')
+                if href and cls._getAbsoluteUrl(base_link, href) == item.url:
+                    a['href'] = new_url
+            for img in dependant.soup.find_all('img'):
+                src = img.get('src')
+                if src and cls._getAbsoluteUrl(base_link, src) == item.url:
+                    img['src'] = new_url
+        # Re-key the item under its new URL.
+        book.content[new_url] = book.content.pop(item.url)
+        item.url = new_url
+
+    @classmethod
+    def _getAbsoluteUrl(cls, base_link, link):
+        parsed = urlparse.urlsplit(link)
+        if not parsed.netloc:
+            path = parsed.path
+            if not path.startswith('/'):
+                path = base_link.path + '/' + path
+            parsed = parsed._replace(scheme=base_link.scheme,
+                                     netloc=base_link.netloc,
+                                     path=path)
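+        # Rebuild the URL without its fragment so it matches the
+        # fragment-free keys stored in book.content.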
+        return \
+            urlparse.SplitResult(parsed.scheme,
+                                 parsed.netloc,
+                                 parsed.path,
+                                 parsed.query,
+                                 None).geturl()
+
+    @classmethod
+    def _findUniqueName(cls, book, category, filename):
+        i = 0
+        file_name_base, file_ext = os.path.splitext(filename)
+        while True:
+            i += 1
+            new_name = "{}_{}{}".format(file_name_base, i, file_ext)
+            if category:
+                new_url = category + '/' + new_name
+            else:
+                new_url = new_name
+            if new_url not in book.content:
+                break
+        return new_url
+
+    @classmethod
+    def createDAG(cls, book):
+        logger = logging.getLogger(__name__)
+        for item in book.content.values():
+            if item.soup is not None:
+                logger.info("Create DAG {}".format(item.url))
+
+                links = item.soup.find_all('a')
+                for link in links:
+                    href = link.get('href')
+                    if not href:
+                        continue
+                    parsed_href = urlparse.urlsplit(href)
+                    url = \
+                        urlparse.SplitResult(parsed_href.scheme,
+                                             parsed_href.netloc,
+                                             parsed_href.path,
+                                             parsed_href.query,
+                                             None).geturl()
+
+                    if url in book.content:
+                        book.content[url].needed_by.add(item.url)
+                        item.needs.add(url)
+                    elif href:
+                        logger.info("  referred but no item exists: {}".format(url))
+
+    @classmethod
+    def populateContent(cls, book):
+        logger = logging.getLogger(__name__)
+        for item in book.content.values():
+            if item.soup is not None:
+                # Try to find content.
+                item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
+                if len(item_content) == 1:
+                    item.content = item_content[0]
+                else:
+                    logger.error("No content found: {}".format(item.url))
+                    item.remove = True
+
+    @classmethod
+    def createOrder(cls, book):
+        logger = logging.getLogger(__name__)
+        for item in book.content.values():
+            if item.soup is not None:
+                # Try to get prev chapter.
+                links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
+                if len(links):
+                    item.prev = links[0].get('href')
+
+                # Try to get next chapter.
+                links = item.soup.find_all('a', attrs={"class": "next nav-link"})
+                if len(links):
+                    item.next = links[0].get('href')
+
+        for item in book.content.values():
+            if item.soup is not None \
+               and not hasattr(item, 'prev') \
+               and not hasattr(item, 'remove'):
+                if book.first:
+                    logger.error("Multiple begin points found. {} and {}"
+                                 .format(book.first.url, item.url))
+                    raise Exception("Multiple begin points found.")
+                else:
+                    book.first = item
+
+    @classmethod
+    def getTitle(cls, item):
+        if hasattr(item.soup, 'title') and item.soup.title:
+            return item.soup.title.string
+        else:
+            return item.url
+
+
+    @classmethod
+    def print(cls, book):
+        logger = logging.getLogger(__name__)
+        item = book.first
+        while item is not None:
+            logger.info("Item: {}".format(cls.getTitle(item)))
+            if hasattr(item, 'prev'):
+                logger.info("  Prev: {}".format(item.prev))
+            if hasattr(item, 'next'):
+                logger.info("  Next: {}".format(item.next))
+            for url in item.needs:
+                logger.info("  Needs: {}".format(url))
+            logger.info("")
+
+            if hasattr(item, 'next'):
+                item = book.content[item.next]
+            else:
+                item = None
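For reference, a minimal sketch of how the pipeline stages introduced by this patch chain together, mirroring main() above ('sample.zip' and the output path are placeholders):

    book = Book('sample.zip')
    parseMhtmlZipFile('sample.zip', book)   # fill book.content from the MHTML zip
    EnrichHtml.enrich(book)                 # build the needs/needed_by link DAG
    PrepareEpub.prepare(book)               # rewrite URLs into flat, EPUB-friendly paths
    book.save_in_dir('test_out/test_save')  # write every item below the output directory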