diff --git a/gragir/enrich_html.py b/gragir/enrich_html.py index 988b37f..7173409 100644 --- a/gragir/enrich_html.py +++ b/gragir/enrich_html.py @@ -2,6 +2,7 @@ import logging import urllib import os from bs4 import BeautifulSoup +import cssutils from book import Item, Book @@ -31,6 +32,8 @@ class EnrichHtml(object): item.title = item.soup.title.string else: logger.info("No title for {}".format(item.url)) + elif item.content_type == 'text/css': + item.css = cssutils.parseString(item.payload, None, item.url) else: logger.info("Skipping {} {}".format(item.content_type, item.url)) @@ -80,6 +83,19 @@ class EnrichHtml(object): else: logger.warn(" No content found: {}".format(item.url)) remove.append(item) + # Remove annotator + remove_content = [] + remove_content.extend(item.soup.find_all('div', attrs={"class": "annotator-outer"})) + remove_content.extend(item.soup.find_all('div', attrs={"class": "annotator-modal-wrapper"})) + remove_content.extend(item.soup.find_all('div', attrs={"class": "annotator-adder"})) + for elem in remove_content: + elem.decompose() + # Unwrap + remove_content = [] + remove_content.extend(item.soup.find_all('div', attrs={"class": "annotator-wrapper"})) + remove_content.extend(item.soup.find_all('div', attrs={"id": "sbo-rt-content"})) + for elem in remove_content: + elem.unwrap() for item in remove: book.remove(item) @@ -95,14 +111,14 @@ class EnrichHtml(object): links = item.soup.find_all('a', attrs={"class": "prev nav-link"}) if len(links): item.prev = book.content[ - item.getAbsoluteUrl(links[0]['href'])] + item.getAbsoluteUrl(links[0].get('href'))] logger.info(" prev = {}:".format(item.prev.url)) # Try to get next chapter. links = item.soup.find_all('a', attrs={"class": "next nav-link"}) if len(links): item.next = book.content[ - item.getAbsoluteUrl(links[0]['href'])] + item.getAbsoluteUrl(links[0].get('href'))] logger.info(" next = {}:".format(item.next.url)) @classmethod diff --git a/gragir/prepare_epub.py b/gragir/prepare_epub.py index e751554..0b16180 100644 --- a/gragir/prepare_epub.py +++ b/gragir/prepare_epub.py @@ -2,6 +2,7 @@ import os import logging import urllib from bs4 import BeautifulSoup +from tidylib import tidy_document from book import Item, Book @@ -33,6 +34,13 @@ class PrepareEpub(object): elif mime == 'text/html': local_url = cls._createLocalName(book,item,'') + local_url_split = local_url.split('.') + logger.info('{}'.format(local_url_split)) + if local_url_split[-1] == 'htm' \ + or local_url_split[-1] == 'html': + local_url_split[-1] = 'xhtml' + local_url = '.'.join(local_url_split) + logger.info('converting to xhtml {}'.format(local_url_split)) else: local_url = cls._createLocalName(book,item,mime.split("/")[0]) @@ -105,4 +113,7 @@ class PrepareEpub(object): logger = logging.getLogger(__name__) logger.info("Createing XML for {}".format(item.url)) - item.payload = item.soup.prettify("utf-8") + item.payload, err = tidy_document( item.soup.prettify("utf-8"), + options={ 'output-xhtml' : 1, 'tidy-mark' : 1}) + item.content_type = 'text/xhtml' + logger.info("Errors: {}".format(err)) \ No newline at end of file diff --git a/gragir/save_epub.py b/gragir/save_epub.py index 3045721..8adfa97 100644 --- a/gragir/save_epub.py +++ b/gragir/save_epub.py @@ -14,6 +14,8 @@ class SaveEpub(object): ebook = ebooklib.EpubBook() cls.fillEpubBook(ebook, book) + cls.fillSpine(ebook, book) + cls.fillGuide(ebook, book) cls.writeEpubBook(book.file_name, ebook) @@ -31,6 +33,8 @@ class SaveEpub(object): """ logger = logging.getLogger(__name__) + options = {'plugins': []} + try: epub = ebooklib.EpubWriter(name, ebook, options) epub.process() @@ -54,7 +58,7 @@ class SaveEpub(object): or mime == 'application/font-woff2': return 'font' - elif mime == 'text/html': + elif mime == 'text/html' or mime == 'text/xhtml': return 'html' else: @@ -93,9 +97,39 @@ class SaveEpub(object): eitem.content = item.payload ebook.add_item(eitem) - item.id = eitem.get_id() + item.idref = eitem.get_id() - # class EpubNav(EpubHtml): - # class EpubNcx(EpubItem): - # class Link(object): - # class Section(object): \ No newline at end of file + @classmethod + def fillSpine(cls, ebook, book): + logger = logging.getLogger(__name__) + + ebook.add_item(ebooklib.EpubNcx()) + ebook.add_item(ebooklib.EpubNav()) + + # define CSS style + style = 'BODY {color: white;}' + nav_css = ebooklib.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style) + ebook.add_item(nav_css) + ebook.spine = ['nav'] + + # ebook.toc = (ebooklib.Link(item.url, item.title, 'intro'), + # (epub.Section(item.title), + # (c1, )) + # ) + + item = book.first + while item: + logger.info("EPUB: Adding to spin {}".format(item.url)) + ebook.spine.append(item.idref) + + if hasattr(item, 'next'): + item = item.next + else: + item = None + + @classmethod + def fillGuide(cls, ebook, book): + logger = logging.getLogger(__name__) + + item = book.first + ebook.guide.append({ 'href' : item.url, 'title' : item.title, 'type' : 'text'})