diff --git a/gragir/book.py b/gragir/book.py index d048e2a..167abf3 100644 --- a/gragir/book.py +++ b/gragir/book.py @@ -1,5 +1,6 @@ import os import logging +import urllib.parse class Item(object): @@ -8,6 +9,7 @@ class Item(object): self.content_type = content_type self.payload = payload self.needed_by = set() + self.needed_by_elem = set() self.needs = set() self.soup = None @@ -38,6 +40,11 @@ class Item(object): with open( file_name, 'wb') as file: file.write(self.payload) + def getAbsoluteUrl(self, link): + defrag,_ =urllib.parse.urldefrag(link) + return urllib.parse.urljoin(self.url, defrag) + + class Book(object): def __init__(self, file_name): @@ -45,8 +52,39 @@ class Book(object): self.content = {} self.first = None + def remove(self, item): + del self.content[item.url] + def save_in_dir(self, directory): if not os.path.exists(directory): os.makedirs(directory) for item in self.content.values(): item.save_file(directory) + + def insertDependency(self, item, element, url): + logger = logging.getLogger(__name__) + if url in self.content: + item.needs.add(self.content[url]) + self.content[url].needed_by.add(item) + self.content[url].needed_by_elem.add(element) + elif url: + logger.info(" refered but no item exist: {}".format(url)) + + def print(self): + logger = logging.getLogger(__name__) + logger.info("Book Structure:") + item = self.first + while item is not None: + logger.info("Item: {}".format(item.title)) + if hasattr(item, 'prev'): + logger.info(" Prev: {}".format(item.prev.url)) + if hasattr(item, 'next'): + logger.info(" Next: {}".format(item.next.url)) + for ref_item in item.needs: + logger.info(" Needs: {}".format(ref_item.url)) + logger.info("") + + if hasattr(item, 'next'): + item = item.next + else: + item = None diff --git a/gragir/enrich_html.py b/gragir/enrich_html.py index fffc207..05f51e7 100644 --- a/gragir/enrich_html.py +++ b/gragir/enrich_html.py @@ -1,5 +1,6 @@ import logging -import urllib.parse as urlparse +import urllib.parse +import os from bs4 
import BeautifulSoup from book import Item, Book @@ -12,10 +13,11 @@ class EnrichHtml(object): logger = logging.getLogger(__name__) logger.info("BEGIN Html Enrichment {} items.".format(len(book.content))) cls.parse(book) - cls.createDAG(book) - cls.populateContent(book) cls.createOrder(book) - cls.print(book) + cls.populateContent(book) + cls.createDAG(book) + cls.findFirst(book) + book.print() logger.info("BEGIN Html Enrichment {} items.".format(len(book.content))) @classmethod @@ -38,108 +40,73 @@ class EnrichHtml(object): for item in book.content.values(): if item.soup is not None: logger.info("Create DAG {}".format(item.url)) + cls.normalizeUrlAndSetDependecy(book, item, 'a', 'href') + cls.normalizeUrlAndSetDependecy(book, item, 'img', 'src') + cls.normalizeUrlAndSetDependecy(book, item, 'img', 'data-mfp-src') - my_url = urlparse.urlsplit(item.url) - for link in item.soup.find_all('a'): - href = link.get('href') - if not href: - continue - parsed_href = urlparse.urlsplit(href) - url = \ - urlparse.SplitResult(parsed_href.scheme, - parsed_href.netloc, - parsed_href.path, - parsed_href.query, - None).geturl() + @classmethod + def normalizeUrlAndSetDependecy(cls, book, item, tag, attr): + logger = logging.getLogger(__name__) + for element in item.soup.find_all(tag): + url = element.get(attr) + if not url: + continue + normal_url = item.getAbsoluteUrl(url) + logger.info(" depends on: {}".format(normal_url)) + book.insertDependency(item, element, normal_url) - if url in book.content: - book.content[url].needed_by.add(item.url) - item.needs.add(url) - elif href: - logger.info(" refered but no item exist: {}".format(url)) - - for link in item.soup.find_all('img'): - href = link.get('src') - if not href: - continue - parsed_href = urlparse.urlsplit(href) - url = \ - urlparse.SplitResult(parsed_href.scheme, - parsed_href.netloc, - parsed_href.path, - parsed_href.query, - None).geturl() - - if url in book.content: - book.content[url].needed_by.add(item.url) - 
item.needs.add(url) - elif href: - logger.info(" refered but no item exist: {}".format(url)) @classmethod def populateContent(cls, book): logger = logging.getLogger(__name__) + remove = [] for item in book.content.values(): - if item.soup is not None: + if item.soup: # Try to find content. item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"}) if len(item_content) == 1: - item.content = item_content[0] + content = item_content[0].extract() + body = item.soup.find('body') + body.clear() + body.append(content) else: - logger.error("No content found: {}".format(item.url)) - item.remove = True + logger.error(" No content found: {}".format(item.url)) + remove.append(item) + + for item in remove: + book.remove(item) + @classmethod def createOrder(cls, book): logger = logging.getLogger(__name__) + logger.info("Create Order:") for item in book.content.values(): - if item.soup is not None: + if item.soup: # Try to get prev chapter. links = item.soup.find_all('a', attrs={"class": "prev nav-link"}) if len(links): - item.prev = links[0].get('href') + item.prev = book.content[ + item.getAbsoluteUrl(links[0]['href'])] + logger.info(" prev = {}:".format(item.prev.url)) # Try to get next chapter. links = item.soup.find_all('a', attrs={"class": "next nav-link"}) if len(links): - item.next = links[0].get('href') + item.next = book.content[ + item.getAbsoluteUrl(links[0]['href'])] + logger.info(" next = {}:".format(item.next.url)) + @classmethod + def findFirst(cls, book): + logger = logging.getLogger(__name__) for item in book.content.values(): - if item.soup is not None \ - and not hasattr(item, 'prev') \ - and not hasattr(item, 'remove'): + if item.soup and not hasattr(item, 'prev'): if book.first: logger.error("Multiple begin points found. 
{} and {}" - .format(item.url, item.url)) + .format(book.first.url, item.url)) raise Exception("Multiple begin points found.") else: book.first = item - - @classmethod - def getTitle(cls, item): - if hasattr(item.soup, 'title') and item.soup.title: - return item.soup.title.string - else: - return item.url - - - @classmethod - def print(cls, book): - logger = logging.getLogger(__name__) - item = book.first - while item is not None: - logger.info("Item: {}".format(cls.getTitle(item))) - if hasattr(item, 'prev'): - logger.info(" Prev: {}".format(item.prev)) - if hasattr(item, 'next'): - logger.info(" Next: {}".format(item.next)) - for url in item.needs: - logger.info(" Needs: {}".format(url)) - logger.info("") - - if hasattr(item, 'next'): - item = book.content[item.next] - else: - item = None - + logger.info(" first = {}:".format(book.first.url if book.first else None)) diff --git a/gragir/prepare_epub.py b/gragir/prepare_epub.py index 4881768..2429624 100644 --- a/gragir/prepare_epub.py +++ b/gragir/prepare_epub.py @@ -1,6 +1,6 @@ import os import logging -import urllib.parse as urlparse +import urllib.parse from bs4 import BeautifulSoup from book import Item, Book @@ -13,62 +13,55 @@ class PrepareEpub(object): logger = logging.getLogger(__name__) logger.info("BEGIN Prepare EPUB.") cls.localize_url(book) + book.print() logger.info("END Prepare EPUB.") @classmethod def localize_url(cls, book): - #logger = logging.getLogger(__name__) + logger = logging.getLogger(__name__) for item in book.content.values(): - if hasattr(item, 'remove'): - continue - category = item.content_type.split("/")[0] - if category != 'text': - cls._moveTo(book,item,category) + # + # Create local name. It will have dir/filename structure. 
+ # + mime = item.content_type + if mime == 'text/css': + local_url = cls._createLocalName(book,item,'css') + + elif mime == 'application/font-woff' \ + or mime == 'application/font-woff2': + local_url = cls._createLocalName(book,item,'font') + + elif mime == 'text/html': + local_url = cls._createLocalName(book,item,'') + else: - cls._moveTo(book,item,"") + local_url = cls._createLocalName(book,item,mime.split("/")[0]) + + cls._moveTo(item,local_url) + @classmethod - def _moveTo(cls, book, item, category): - logger = logging.getLogger(__name__) - parsed_url= urlparse.urlsplit(item.url) - file_name = os.path.basename(parsed_url.path) + def _createLocalName(cls, book, item, category): + # + # Get file name. + # + parsed_url= urllib.parse.urlsplit(item.url) + file_name = parsed_url.path.split('/')[-1] + # + # Append category + # if category: - new_url = category + "/" + file_name - else: - new_url = file_name + new_url = category + "/" + file_name + else: + new_url = file_name + # + # If file name already exist then generate a unique one. 
+ # if item.url != new_url \ and new_url in book.content: new_url = cls._findUniqueName(book, category, file_name) + return new_url - logger.info("Renaming {} -> {}" - .format(item.url, new_url)) - - for dependant in item.needed_by: - if hasattr(dependant, 'soup'): - base_link = urlparse.urlsplit(dependant.url) - base_link.path = os.path.dirname(base_link.path) - for a in dependant.soup.find_all('a'): - if cls._getAbsoluteUrl(base_link, a.attr.href) == item.url: - a.attr.href = new_url - for img in dependant.soup.find_all('img'): - if cls._getAbsoluteUrl(base_link, img.attr.src) == item.url: - img.attrs.src = new_url - item.url = new_url - - @classmethod - def _getAbsoluteUrl(cls, base_link, link): - parsed = urlparse.urlsplit(link) - if parsed.netloc is None: - parsed.scheme = base_link.scheme - parsed.netloc = base_link.netloc - if parsed.path[0] != '/': - parsed.path = base_link.path + '/' + href.path - return \ - urlparse.SplitResult(parsed.scheme, - parsed.netloc, - parsed.path, - parsed.query, - None).geturl() @classmethod def _findUniqueName(cls, book, category, filename): @@ -78,101 +71,24 @@ class PrepareEpub(object): i+=1 if category: new_url = category + '/' + file_name_base + '_' + i + file_ext - else: + else: new_url = file_name_base + '_' + i + file_ext if new_url not in book.content: - break + break return new_url + @classmethod - def createDAG(cls, book): + def _moveTo(cls, item, local_url): logger = logging.getLogger(__name__) - for item in book.content.values(): - if item.soup is not None: - logger.info("Create DAG {}".format(item.url)) - - links = item.soup.find_all('a') - for link in links: - href = link.get('href') - if not href: - continue - parsed_href = urlparse.urlsplit(href) - url = \ - urlparse.SplitResult(parsed_href.scheme, - parsed_href.netloc, - parsed_href.path, - parsed_href.query, - None).geturl() - - if url in book.content: - book.content[url].needed_by.add(item.url) - item.needs.add(url) - elif href: - logger.info(" refered but 
no item exist: {}".format(url)) - - @classmethod - def populateContent(cls, book): - logger = logging.getLogger(__name__) - for item in book.content.values(): - if item.soup is not None: - # Try to find content. - item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"}) - if len(item_content) == 1: - item.content = item_content[0] - else: - logger.error("No content found: {}".format(item.url)) - item.remove = True - - @classmethod - def createOrder(cls, book): - logger = logging.getLogger(__name__) - for item in book.content.values(): - if item.soup is not None: - # Try to get prev chapter. - links = item.soup.find_all('a', attrs={"class": "prev nav-link"}) - if len(links): - item.prev = links[0].get('href') - - # Try to get next chapter. - links = item.soup.find_all('a', attrs={"class": "next nav-link"}) - if len(links): - item.next = links[0].get('href') - - for item in book.content.values(): - if item.soup is not None \ - and not hasattr(item, 'prev') \ - and not hasattr(item, 'remove'): - if book.first: - logger.error("Multiple begin points found. 
{} and {}" - .format(item.url, item.url)) - raise Exception("Multiple begin points found.") - else: - book.first = item - - @classmethod - def getTitle(cls, item): - if hasattr(item.soup, 'title') and item.soup.title: - return item.soup.title.string - else: - return item.url - - - @classmethod - def print(cls, book): - logger = logging.getLogger(__name__) - item = book.first - while item is not None: - logger.info("Item: {}".format(cls.getTitle(item))) - if hasattr(item, 'prev'): - logger.info(" Prev: {}".format(item.prev)) - if hasattr(item, 'next'): - logger.info(" Next: {}".format(item.next)) - for url in item.needs: - logger.info(" Needs: {}".format(url)) - logger.info("") - - if hasattr(item, 'next'): - item = book.content[item.next] - else: - item = None + logger.info("Renaming {} -> {}".format(item.url, local_url)) + for ref_elem in item.needed_by_elem: + if ref_elem.name == 'a': + _,fragment = urllib.parse.urldefrag(ref_elem['href']) + ref_elem['href'] = local_url + "#" + fragment if fragment else local_url + if ref_elem.name == 'img': + ref_elem['src'] = local_url + if ref_elem.name == 'img' and ref_elem.get('data-mfp-src'): + ref_elem['data-mfp-src'] = local_url + item.url = local_url