Generating epubs.

This commit is contained in:
2018-10-21 21:22:56 +01:00
parent 11a29c3e0a
commit 0f4c478ef6
3 changed files with 70 additions and 9 deletions

View File

@@ -2,6 +2,7 @@ import logging
import urllib
import os
from bs4 import BeautifulSoup
import cssutils
from book import Item, Book
@@ -31,6 +32,8 @@ class EnrichHtml(object):
item.title = item.soup.title.string
else:
logger.info("No title for {}".format(item.url))
elif item.content_type == 'text/css':
item.css = cssutils.parseString(item.payload, None, item.url)
else:
logger.info("Skipping {} {}".format(item.content_type, item.url))
@@ -80,6 +83,19 @@ class EnrichHtml(object):
else:
logger.warn(" No content found: {}".format(item.url))
remove.append(item)
# Remove annotator
remove_content = []
remove_content.extend(item.soup.find_all('div', attrs={"class": "annotator-outer"}))
remove_content.extend(item.soup.find_all('div', attrs={"class": "annotator-modal-wrapper"}))
remove_content.extend(item.soup.find_all('div', attrs={"class": "annotator-adder"}))
for elem in remove_content:
elem.decompose()
# Unwrap
remove_content = []
remove_content.extend(item.soup.find_all('div', attrs={"class": "annotator-wrapper"}))
remove_content.extend(item.soup.find_all('div', attrs={"id": "sbo-rt-content"}))
for elem in remove_content:
elem.unwrap()
for item in remove:
book.remove(item)
@@ -95,14 +111,14 @@ class EnrichHtml(object):
links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
if len(links):
item.prev = book.content[
item.getAbsoluteUrl(links[0]['href'])]
item.getAbsoluteUrl(links[0].get('href'))]
logger.info(" prev = {}:".format(item.prev.url))
# Try to get next chapter.
links = item.soup.find_all('a', attrs={"class": "next nav-link"})
if len(links):
item.next = book.content[
item.getAbsoluteUrl(links[0]['href'])]
item.getAbsoluteUrl(links[0].get('href'))]
logger.info(" next = {}:".format(item.next.url))
@classmethod

View File

@@ -2,6 +2,7 @@ import os
import logging
import urllib
from bs4 import BeautifulSoup
from tidylib import tidy_document
from book import Item, Book
@@ -33,6 +34,13 @@ class PrepareEpub(object):
elif mime == 'text/html':
local_url = cls._createLocalName(book,item,'')
local_url_split = local_url.split('.')
logger.info('{}'.format(local_url_split))
if local_url_split[-1] == 'htm' \
or local_url_split[-1] == 'html':
local_url_split[-1] = 'xhtml'
local_url = '.'.join(local_url_split)
logger.info('converting to xhtml {}'.format(local_url_split))
else:
local_url = cls._createLocalName(book,item,mime.split("/")[0])
@@ -105,4 +113,7 @@ class PrepareEpub(object):
logger = logging.getLogger(__name__)
logger.info("Createing XML for {}".format(item.url))
item.payload = item.soup.prettify("utf-8")
item.payload, err = tidy_document( item.soup.prettify("utf-8"),
options={ 'output-xhtml' : 1, 'tidy-mark' : 1})
item.content_type = 'text/xhtml'
logger.info("Errors: {}".format(err))

View File

@@ -14,6 +14,8 @@ class SaveEpub(object):
ebook = ebooklib.EpubBook()
cls.fillEpubBook(ebook, book)
cls.fillSpine(ebook, book)
cls.fillGuide(ebook, book)
cls.writeEpubBook(book.file_name, ebook)
@@ -31,6 +33,8 @@ class SaveEpub(object):
"""
logger = logging.getLogger(__name__)
options = {'plugins': []}
try:
epub = ebooklib.EpubWriter(name, ebook, options)
epub.process()
@@ -54,7 +58,7 @@ class SaveEpub(object):
or mime == 'application/font-woff2':
return 'font'
elif mime == 'text/html':
elif mime == 'text/html' or mime == 'text/xhtml':
return 'html'
else:
@@ -93,9 +97,39 @@ class SaveEpub(object):
eitem.content = item.payload
ebook.add_item(eitem)
item.id = eitem.get_id()
item.idref = eitem.get_id()
# class EpubNav(EpubHtml):
# class EpubNcx(EpubItem):
# class Link(object):
# class Section(object):
@classmethod
def fillSpine(cls, ebook, book):
    """Register the navigation items and build the reading order (spine).

    Adds an NCX item, a Nav item and a small navigation stylesheet to
    ``ebook``, replaces ``ebook.spine`` with a list starting with ``'nav'``
    and then appends the ``idref`` of every chapter reached by following
    the ``next`` attribute, starting at ``book.first``.

    Args:
        ebook: The EPUB book being assembled. Items are added to it and its
            ``spine`` attribute is overwritten.
        book: Source book. ``book.first`` is the first chapter; a falsy
            value produces a spine that contains only ``'nav'``.
    """
    logger = logging.getLogger(__name__)
    ebook.add_item(ebooklib.EpubNcx())
    ebook.add_item(ebooklib.EpubNav())
    # define CSS style
    style = 'BODY {color: white;}'
    nav_css = ebooklib.EpubItem(
        uid="style_nav",
        file_name="style/nav.css",
        media_type="text/css",
        content=style,
    )
    ebook.add_item(nav_css)
    ebook.spine = ['nav']
    # Track visited chapters by identity: a cycle in the ``next`` chain
    # would otherwise keep this loop (and the spine list) growing forever.
    visited = set()
    item = book.first
    while item and id(item) not in visited:
        visited.add(id(item))
        logger.info("EPUB: Adding to spin {}".format(item.url))
        ebook.spine.append(item.idref)
        # The last chapter has no ``next`` attribute at all.
        item = getattr(item, 'next', None)
    if item:
        # The loop stopped on an already visited chapter, not at the end.
        logger.warning(
            "EPUB: chapter chain loops back to {}".format(item.url))
@classmethod
def fillGuide(cls, ebook, book):
logger = logging.getLogger(__name__)
item = book.first
ebook.guide.append({ 'href' : item.url, 'title' : item.title, 'type' : 'text'})