Converting DAG to point to objects directly.

2018-10-07 18:31:33 +01:00
parent 50d453e92d
commit a4cedbf3c7
3 changed files with 134 additions and 213 deletions
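
In short: before this commit, Item.needs and Item.needed_by held URL strings, so every traversal went back through the book.content dict; afterwards they hold Item objects directly, and a new needed_by_elem set records the referring BeautifulSoup tags so later passes can rewrite links in place. A minimal sketch of the two shapes, using simplified stand-in classes rather than the real book.Item:

    class OldItem:
        """Pre-commit shape: edges are URL strings, resolved via a dict."""
        def __init__(self, url):
            self.url = url
            self.needs = set()        # set of URL strings
            self.needed_by = set()    # set of URL strings

    class NewItem:
        """Post-commit shape: edges point at Item objects directly."""
        def __init__(self, url):
            self.url = url
            self.needs = set()           # set of NewItem objects
            self.needed_by = set()       # set of NewItem objects
            self.needed_by_elem = set()  # referring soup tags, for rewriting

    content = {}
    a, b = NewItem("a.html"), NewItem("b.html")
    content[a.url], content[b.url] = a, b
    a.needs.add(b)        # traversal no longer needs a dict lookup
    b.needed_by.add(a)
    assert next(iter(a.needs)) is b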

View File

@@ -1,5 +1,6 @@
 import os
 import logging
+import urllib.parse


 class Item(object):
@@ -8,6 +9,7 @@ class Item(object):
         self.content_type = content_type
         self.payload = payload
         self.needed_by = set()
+        self.needed_by_elem = set()
         self.needs = set()
         self.soup = None
@@ -38,6 +40,11 @@ class Item(object):
         with open(file_name, 'wb') as file:
             file.write(self.payload)

+    def getAbsoluteUrl(self, link):
+        # Strip the fragment, then resolve the link against this item's URL.
+        defrag, _ = urllib.parse.urldefrag(link)
+        return urllib.parse.urljoin(self.url, defrag)
+

 class Book(object):
     def __init__(self, file_name):
@@ -45,8 +52,39 @@ class Book(object):
         self.content = {}
         self.first = None

+    def remove(self, item):
+        del self.content[item.url]
+
     def save_in_dir(self, directory):
         if not os.path.exists(directory):
             os.makedirs(directory)
         for item in self.content.values():
             item.save_file(directory)
+
+    def insertDependency(self, item, element, url):
+        logger = logging.getLogger(__name__)
+        if url in self.content:
+            item.needs.add(self.content[url])
+            self.content[url].needed_by.add(item)
+            self.content[url].needed_by_elem.add(element)
+        elif url:
+            logger.info(" referred but no item exists: {}".format(url))
+
+    def print(self):
+        logger = logging.getLogger(__name__)
+        logger.info("Book Structure:")
+        item = self.first
+        while item is not None:
+            logger.info("Item: {}".format(item.title))
+            if hasattr(item, 'prev'):
+                logger.info(" Prev: {}".format(item.prev.url))
+            if hasattr(item, 'next'):
+                logger.info(" Next: {}".format(item.next.url))
+            for ref_item in item.needs:
+                logger.info(" Needs: {}".format(ref_item.url))
+            logger.info("")
+            item = getattr(item, 'next', None)
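
For reference, the new Item.getAbsoluteUrl chains two stdlib calls: it strips the fragment with urldefrag, then resolves the remainder against the item's own URL with urljoin. A quick worked example (the URLs are made up):

    import urllib.parse

    page_url = "https://example.com/book/ch02.html"  # hypothetical item.url
    link = "../assets/style.css#top"

    defrag, fragment = urllib.parse.urldefrag(link)  # ('../assets/style.css', 'top')
    absolute = urllib.parse.urljoin(page_url, defrag)
    print(absolute)  # https://example.com/assets/style.css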

View File

@@ -1,5 +1,6 @@
 import logging
-import urllib.parse as urlparse
+import urllib.parse
+import os
 from bs4 import BeautifulSoup
 from book import Item, Book
@@ -12,10 +13,11 @@ class EnrichHtml(object):
         logger = logging.getLogger(__name__)
         logger.info("BEGIN Html Enrichment {} items.".format(len(book.content)))
         cls.parse(book)
-        cls.createDAG(book)
-        cls.populateContent(book)
         cls.createOrder(book)
-        cls.print(book)
+        cls.populateContent(book)
+        cls.createDAG(book)
+        cls.findFirst(book)
+        book.print()
         logger.info("END Html Enrichment {} items.".format(len(book.content)))

     @classmethod
@@ -38,108 +40,73 @@ class EnrichHtml(object):
         for item in book.content.values():
             if item.soup is not None:
                 logger.info("Create DAG {}".format(item.url))
-                my_url = urlparse.urlsplit(item.url)
-                for link in item.soup.find_all('a'):
-                    href = link.get('href')
-                    if not href:
-                        continue
-                    parsed_href = urlparse.urlsplit(href)
-                    url = \
-                        urlparse.SplitResult(parsed_href.scheme,
-                                             parsed_href.netloc,
-                                             parsed_href.path,
-                                             parsed_href.query,
-                                             None).geturl()
-                    if url in book.content:
-                        book.content[url].needed_by.add(item.url)
-                        item.needs.add(url)
-                    elif href:
-                        logger.info(" refered but no item exist: {}".format(url))
-                for link in item.soup.find_all('img'):
-                    href = link.get('src')
-                    if not href:
-                        continue
-                    parsed_href = urlparse.urlsplit(href)
-                    url = \
-                        urlparse.SplitResult(parsed_href.scheme,
-                                             parsed_href.netloc,
-                                             parsed_href.path,
-                                             parsed_href.query,
-                                             None).geturl()
-                    if url in book.content:
-                        book.content[url].needed_by.add(item.url)
-                        item.needs.add(url)
-                    elif href:
-                        logger.info(" refered but no item exist: {}".format(url))
+                cls.normalizeUrlAndSetDependecy(book, item, 'a', 'href')
+                cls.normalizeUrlAndSetDependecy(book, item, 'img', 'src')
+                cls.normalizeUrlAndSetDependecy(book, item, 'img', 'data-mfp-src')
+
+    @classmethod
+    def normalizeUrlAndSetDependecy(cls, book, item, tag, attr):
+        logger = logging.getLogger(__name__)
+        for element in item.soup.find_all(tag):
+            url = element.get(attr)
+            if not url:
+                continue
+            normal_url = item.getAbsoluteUrl(url)
+            logger.info(" depends on: {}".format(normal_url))
+            book.insertDependency(item, element, normal_url)

     @classmethod
     def populateContent(cls, book):
         logger = logging.getLogger(__name__)
+        remove = []
         for item in book.content.values():
-            if item.soup is not None:
+            if item.soup:
                 # Try to find content.
                 item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
                 if len(item_content) == 1:
-                    item.content = item_content[0]
+                    content = item_content[0].extract()
+                    body = item.soup.find('body')
+                    body.clear()
+                    body.append(content)
                 else:
-                    logger.error("No content found: {}".format(item.url))
-                    item.remove = True
+                    logger.error(" No content found: {}".format(item.url))
+                    remove.append(item)
+        for item in remove:
+            book.remove(item)

     @classmethod
     def createOrder(cls, book):
         logger = logging.getLogger(__name__)
+        logger.info("Create Order:")
         for item in book.content.values():
-            if item.soup is not None:
+            if item.soup:
                 # Try to get prev chapter.
                 links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
                 if len(links):
-                    item.prev = links[0].get('href')
+                    item.prev = book.content[
+                        item.getAbsoluteUrl(links[0]['href'])]
+                    logger.info(" prev = {}:".format(item.prev.url))
                 # Try to get next chapter.
                 links = item.soup.find_all('a', attrs={"class": "next nav-link"})
                 if len(links):
-                    item.next = links[0].get('href')
+                    item.next = book.content[
+                        item.getAbsoluteUrl(links[0]['href'])]
+                    logger.info(" next = {}:".format(item.next.url))
+
+    @classmethod
+    def findFirst(cls, book):
+        logger = logging.getLogger(__name__)
         for item in book.content.values():
-            if item.soup is not None \
-                    and not hasattr(item, 'prev') \
-                    and not hasattr(item, 'remove'):
+            if item.soup and not hasattr(item, 'prev'):
                 if book.first:
                     logger.error("Multiple begin points found. {} and {}"
-                                 .format(item.url, item.url))
+                                 .format(book.first.url, item.url))
                     raise Exception("Multiple begin points found.")
                 else:
                     book.first = item
+        logger.info(" first = {}:".format(book.first.url))

-    @classmethod
-    def getTitle(cls, item):
-        if hasattr(item.soup, 'title') and item.soup.title:
-            return item.soup.title.string
-        else:
-            return item.url
-
-    @classmethod
-    def print(cls, book):
-        logger = logging.getLogger(__name__)
-        item = book.first
-        while item is not None:
-            logger.info("Item: {}".format(cls.getTitle(item)))
-            if hasattr(item, 'prev'):
-                logger.info(" Prev: {}".format(item.prev))
-            if hasattr(item, 'next'):
-                logger.info(" Next: {}".format(item.next))
-            for url in item.needs:
-                logger.info(" Needs: {}".format(url))
-            logger.info("")
-            if hasattr(item, 'next'):
-                item = book.content[item.next]
-            else:
-                item = None
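
The three copy-pasted tag loops in the old createDAG collapse into one helper parameterized by (tag, attr). A standalone sketch of that pattern against a throwaway page (real BeautifulSoup API; the HTML and the collect helper are illustrative, not part of the commit):

    from bs4 import BeautifulSoup

    html = '<a href="ch02.html#sec1">next</a><img src="img/fig1.png">'
    soup = BeautifulSoup(html, "html.parser")

    def collect(soup, tag, attr):
        """Yield (element, raw url) for every tag that carries attr."""
        for element in soup.find_all(tag):
            url = element.get(attr)
            if url:
                yield element, url

    for tag, attr in (("a", "href"), ("img", "src"), ("img", "data-mfp-src")):
        for element, url in collect(soup, tag, attr):
            print(tag, attr, url)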

View File

@@ -1,6 +1,6 @@
 import os
 import logging
-import urllib.parse as urlparse
+import urllib.parse
 from bs4 import BeautifulSoup
 from book import Item, Book
@@ -13,62 +13,55 @@ class PrepareEpub(object):
         logger = logging.getLogger(__name__)
         logger.info("BEGIN Prepare EPUB.")
         cls.localize_url(book)
+        book.print()
         logger.info("END Prepare EPUB.")

     @classmethod
     def localize_url(cls, book):
-        #logger = logging.getLogger(__name__)
+        logger = logging.getLogger(__name__)
         for item in book.content.values():
-            if hasattr(item, 'remove'):
-                continue
-            category = item.content_type.split("/")[0]
-            if category != 'text':
-                cls._moveTo(book, item, category)
+            #
+            # Create the local name. It will have a dir/filename structure.
+            #
+            mime = item.content_type
+            if mime == 'text/css':
+                local_url = cls._createLocalName(book, item, 'css')
+            elif mime == 'application/font-woff' \
+                    or mime == 'application/font-woff2':
+                local_url = cls._createLocalName(book, item, 'font')
+            elif mime == 'text/html':
+                local_url = cls._createLocalName(book, item, '')
             else:
-                cls._moveTo(book, item, "")
+                local_url = cls._createLocalName(book, item, mime.split("/")[0])
+            cls._moveTo(item, local_url)

     @classmethod
-    def _moveTo(cls, book, item, category):
-        logger = logging.getLogger(__name__)
-        parsed_url = urlparse.urlsplit(item.url)
-        file_name = os.path.basename(parsed_url.path)
+    def _createLocalName(cls, book, item, category):
+        #
+        # Get the file name.
+        #
+        parsed_url = urllib.parse.urlsplit(item.url)
+        file_name = parsed_url.path.split('/')[-1]
+        #
+        # Append the category.
+        #
         if category:
             new_url = category + "/" + file_name
         else:
             new_url = file_name
+        #
+        # If the file name already exists then generate a unique one.
+        #
         if item.url != new_url \
                 and new_url in book.content:
             new_url = cls._findUniqueName(book, category, file_name)
-
-        logger.info("Renaming {} -> {}"
-                    .format(item.url, new_url))
-        for dependant in item.needed_by:
-            if hasattr(dependant, 'soup'):
-                base_link = urlparse.urlsplit(dependant.url)
-                base_link.path = os.path.dirname(base_link.path)
-                for a in dependant.soup.find_all('a'):
-                    if cls._getAbsoluteUrl(base_link, a.attr.href) == item.url:
-                        a.attr.href = new_url
-                for img in dependant.soup.find_all('img'):
-                    if cls._getAbsoluteUrl(base_link, img.attr.src) == item.url:
-                        img.attrs.src = new_url
-        item.url = new_url
-
-    @classmethod
-    def _getAbsoluteUrl(cls, base_link, link):
-        parsed = urlparse.urlsplit(link)
-        if parsed.netloc is None:
-            parsed.scheme = base_link.scheme
-            parsed.netloc = base_link.netloc
-            if parsed.path[0] != '/':
-                parsed.path = base_link.path + '/' + href.path
-        return \
-            urlparse.SplitResult(parsed.scheme,
-                                 parsed.netloc,
-                                 parsed.path,
-                                 parsed.query,
-                                 None).geturl()
+        return new_url
@@ -84,95 +77,18 @@ class PrepareEpub(object):
                 break
         return new_url

     @classmethod
-    def createDAG(cls, book):
-        logger = logging.getLogger(__name__)
-        for item in book.content.values():
-            if item.soup is not None:
-                logger.info("Create DAG {}".format(item.url))
-                links = item.soup.find_all('a')
-                for link in links:
-                    href = link.get('href')
-                    if not href:
-                        continue
-                    parsed_href = urlparse.urlsplit(href)
-                    url = \
-                        urlparse.SplitResult(parsed_href.scheme,
-                                             parsed_href.netloc,
-                                             parsed_href.path,
-                                             parsed_href.query,
-                                             None).geturl()
-                    if url in book.content:
-                        book.content[url].needed_by.add(item.url)
-                        item.needs.add(url)
-                    elif href:
-                        logger.info(" refered but no item exist: {}".format(url))
-
-    @classmethod
-    def populateContent(cls, book):
-        logger = logging.getLogger(__name__)
-        for item in book.content.values():
-            if item.soup is not None:
-                # Try to find content.
-                item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
-                if len(item_content) == 1:
-                    item.content = item_content[0]
-                else:
-                    logger.error("No content found: {}".format(item.url))
-                    item.remove = True
-
-    @classmethod
-    def createOrder(cls, book):
-        logger = logging.getLogger(__name__)
-        for item in book.content.values():
-            if item.soup is not None:
-                # Try to get prev chapter.
-                links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
-                if len(links):
-                    item.prev = links[0].get('href')
-                # Try to get next chapter.
-                links = item.soup.find_all('a', attrs={"class": "next nav-link"})
-                if len(links):
-                    item.next = links[0].get('href')
-        for item in book.content.values():
-            if item.soup is not None \
-                    and not hasattr(item, 'prev') \
-                    and not hasattr(item, 'remove'):
-                if book.first:
-                    logger.error("Multiple begin points found. {} and {}"
-                                 .format(item.url, item.url))
-                    raise Exception("Multiple begin points found.")
-                else:
-                    book.first = item
-
-    @classmethod
-    def getTitle(cls, item):
-        if hasattr(item.soup, 'title') and item.soup.title:
-            return item.soup.title.string
-        else:
-            return item.url
-
-    @classmethod
-    def print(cls, book):
-        logger = logging.getLogger(__name__)
-        item = book.first
-        while item is not None:
-            logger.info("Item: {}".format(cls.getTitle(item)))
-            if hasattr(item, 'prev'):
-                logger.info(" Prev: {}".format(item.prev))
-            if hasattr(item, 'next'):
-                logger.info(" Next: {}".format(item.next))
-            for url in item.needs:
-                logger.info(" Needs: {}".format(url))
-            logger.info("")
-            if hasattr(item, 'next'):
-                item = book.content[item.next]
-            else:
-                item = None
+    def _moveTo(cls, item, local_url):
+        logger = logging.getLogger(__name__)
+        logger.info("Renaming {} -> {}".format(item.url, local_url))
+        for ref_elem in item.needed_by_elem:
+            if ref_elem.name == 'a':
+                _, fragment = urllib.parse.urldefrag(ref_elem['href'])
+                ref_elem['href'] = local_url + "#" + fragment
+            if ref_elem.name == 'img':
+                ref_elem['src'] = local_url
+            if ref_elem.name == 'img':
+                ref_elem['data-mfp-src'] = local_url
+        item.url = local_url
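
Because the DAG now records the exact referring elements, _moveTo can rewrite them directly instead of re-scanning every dependant's soup and re-deriving absolute URLs, as the removed code above did. A sketch of that rewrite on a throwaway tag (real bs4 and urllib calls; the URLs are invented, and unlike the committed code this guards against an empty fragment):

    import urllib.parse
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<a href="https://example.com/book/ch02.html#sec3">go</a>',
                         "html.parser")
    ref_elem = soup.a
    local_url = "ch02.html"  # hypothetical result of _createLocalName

    # Keep the fragment from the old link, but point at the local file.
    _, fragment = urllib.parse.urldefrag(ref_elem['href'])
    ref_elem['href'] = local_url + "#" + fragment if fragment else local_url
    print(soup)  # <a href="ch02.html#sec3">go</a>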