Converting DAG to point to objects directly.
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
import urllib
|
||||||
|
|
||||||
class Item(object):
|
class Item(object):
|
||||||
|
|
||||||
@@ -8,6 +9,7 @@ class Item(object):
|
|||||||
self.content_type = content_type
|
self.content_type = content_type
|
||||||
self.payload = payload
|
self.payload = payload
|
||||||
self.needed_by = set()
|
self.needed_by = set()
|
||||||
|
self.needed_by_elem = set()
|
||||||
self.needs = set()
|
self.needs = set()
|
||||||
self.soup = None
|
self.soup = None
|
||||||
|
|
||||||
@@ -38,6 +40,11 @@ class Item(object):
|
|||||||
with open( file_name, 'wb') as file:
|
with open( file_name, 'wb') as file:
|
||||||
file.write(self.payload)
|
file.write(self.payload)
|
||||||
|
|
||||||
|
def getAbsoluteUrl(self, link):
|
||||||
|
defrag,_ =urllib.parse.urldefrag(link)
|
||||||
|
return urllib.parse.urljoin(self.url, defrag)
|
||||||
|
|
||||||
|
|
||||||
class Book(object):
|
class Book(object):
|
||||||
|
|
||||||
def __init__(self, file_name):
|
def __init__(self, file_name):
|
||||||
@@ -45,8 +52,39 @@ class Book(object):
|
|||||||
self.content = {}
|
self.content = {}
|
||||||
self.first = None
|
self.first = None
|
||||||
|
|
||||||
|
def remove(self, item):
|
||||||
|
del self.content[item.url]
|
||||||
|
|
||||||
def save_in_dir(self, directory):
|
def save_in_dir(self, directory):
|
||||||
if not os.path.exists(directory):
|
if not os.path.exists(directory):
|
||||||
os.makedirs(directory)
|
os.makedirs(directory)
|
||||||
for item in self.content.values():
|
for item in self.content.values():
|
||||||
item.save_file(directory)
|
item.save_file(directory)
|
||||||
|
|
||||||
|
def insertDependency(self, item, element, url):
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
if url in self.content:
|
||||||
|
item.needs.add(self.content[url])
|
||||||
|
self.content[url].needed_by.add(item)
|
||||||
|
self.content[url].needed_by_elem.add(element)
|
||||||
|
elif url:
|
||||||
|
logger.info(" refered but no item exist: {}".format(url))
|
||||||
|
|
||||||
|
def print(self):
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.info("Book Structure:")
|
||||||
|
item = self.first
|
||||||
|
while item is not None:
|
||||||
|
logger.info("Item: {}".format(item.title))
|
||||||
|
if hasattr(item, 'prev'):
|
||||||
|
logger.info(" Prev: {}".format(item.prev.url))
|
||||||
|
if hasattr(item, 'next'):
|
||||||
|
logger.info(" Next: {}".format(item.next.url))
|
||||||
|
for ref_item in item.needs:
|
||||||
|
logger.info(" Needs: {}".format(ref_item.url))
|
||||||
|
logger.info("")
|
||||||
|
|
||||||
|
if hasattr(item, 'next'):
|
||||||
|
item = item.next
|
||||||
|
else:
|
||||||
|
item = None
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
import urllib.parse as urlparse
|
import urllib
|
||||||
|
import os
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from book import Item, Book
|
from book import Item, Book
|
||||||
@@ -12,10 +13,11 @@ class EnrichHtml(object):
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.info("BEGIN Html Enrichment {} items.".format(len(book.content)))
|
logger.info("BEGIN Html Enrichment {} items.".format(len(book.content)))
|
||||||
cls.parse(book)
|
cls.parse(book)
|
||||||
cls.createDAG(book)
|
|
||||||
cls.populateContent(book)
|
|
||||||
cls.createOrder(book)
|
cls.createOrder(book)
|
||||||
cls.print(book)
|
cls.populateContent(book)
|
||||||
|
cls.createDAG(book)
|
||||||
|
cls.findFirst(book)
|
||||||
|
book.print()
|
||||||
logger.info("BEGIN Html Enrichment {} items.".format(len(book.content)))
|
logger.info("BEGIN Html Enrichment {} items.".format(len(book.content)))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -38,108 +40,73 @@ class EnrichHtml(object):
|
|||||||
for item in book.content.values():
|
for item in book.content.values():
|
||||||
if item.soup is not None:
|
if item.soup is not None:
|
||||||
logger.info("Create DAG {}".format(item.url))
|
logger.info("Create DAG {}".format(item.url))
|
||||||
|
cls.normalizeUrlAndSetDependecy(book, item, 'a', 'href')
|
||||||
|
cls.normalizeUrlAndSetDependecy(book, item, 'img', 'src')
|
||||||
|
cls.normalizeUrlAndSetDependecy(book, item, 'img', 'data-mfp-src')
|
||||||
|
|
||||||
my_url = urlparse.urlsplit(item.url)
|
|
||||||
|
|
||||||
for link in item.soup.find_all('a'):
|
@classmethod
|
||||||
href = link.get('href')
|
def normalizeUrlAndSetDependecy(cls, book, item, tag, attr):
|
||||||
if not href:
|
logger = logging.getLogger(__name__)
|
||||||
|
for element in item.soup.find_all(tag):
|
||||||
|
url = element.get(attr)
|
||||||
|
if not url:
|
||||||
continue
|
continue
|
||||||
parsed_href = urlparse.urlsplit(href)
|
normal_url = item.getAbsoluteUrl(url)
|
||||||
url = \
|
logger.info(" depends on: {}".format(normal_url))
|
||||||
urlparse.SplitResult(parsed_href.scheme,
|
book.insertDependency(item, element, normal_url)
|
||||||
parsed_href.netloc,
|
|
||||||
parsed_href.path,
|
|
||||||
parsed_href.query,
|
|
||||||
None).geturl()
|
|
||||||
|
|
||||||
if url in book.content:
|
|
||||||
book.content[url].needed_by.add(item.url)
|
|
||||||
item.needs.add(url)
|
|
||||||
elif href:
|
|
||||||
logger.info(" refered but no item exist: {}".format(url))
|
|
||||||
|
|
||||||
for link in item.soup.find_all('img'):
|
|
||||||
href = link.get('src')
|
|
||||||
if not href:
|
|
||||||
continue
|
|
||||||
parsed_href = urlparse.urlsplit(href)
|
|
||||||
url = \
|
|
||||||
urlparse.SplitResult(parsed_href.scheme,
|
|
||||||
parsed_href.netloc,
|
|
||||||
parsed_href.path,
|
|
||||||
parsed_href.query,
|
|
||||||
None).geturl()
|
|
||||||
|
|
||||||
if url in book.content:
|
|
||||||
book.content[url].needed_by.add(item.url)
|
|
||||||
item.needs.add(url)
|
|
||||||
elif href:
|
|
||||||
logger.info(" refered but no item exist: {}".format(url))
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def populateContent(cls, book):
|
def populateContent(cls, book):
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
remove = []
|
||||||
for item in book.content.values():
|
for item in book.content.values():
|
||||||
if item.soup is not None:
|
if item.soup:
|
||||||
# Try to find content.
|
# Try to find content.
|
||||||
item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
|
item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
|
||||||
if len(item_content) == 1:
|
if len(item_content) == 1:
|
||||||
item.content = item_content[0]
|
content = item_content[0].extract()
|
||||||
|
body = item.soup.find('body')
|
||||||
|
body.clear()
|
||||||
|
body.append(content)
|
||||||
else:
|
else:
|
||||||
logger.error(" No content found: {}".format(item.url))
|
logger.error(" No content found: {}".format(item.url))
|
||||||
item.remove = True
|
remove.append(item)
|
||||||
|
|
||||||
|
for item in remove:
|
||||||
|
book.remove(item)
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def createOrder(cls, book):
|
def createOrder(cls, book):
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.info("Create Order:")
|
||||||
for item in book.content.values():
|
for item in book.content.values():
|
||||||
if item.soup is not None:
|
if item.soup:
|
||||||
# Try to get prev chapter.
|
# Try to get prev chapter.
|
||||||
links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
|
links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
|
||||||
if len(links):
|
if len(links):
|
||||||
item.prev = links[0].get('href')
|
item.prev = book.content[
|
||||||
|
item.getAbsoluteUrl(links[0]['href'])]
|
||||||
|
logger.info(" prev = {}:".format(item.prev.url))
|
||||||
|
|
||||||
# Try to get next chapter.
|
# Try to get next chapter.
|
||||||
links = item.soup.find_all('a', attrs={"class": "next nav-link"})
|
links = item.soup.find_all('a', attrs={"class": "next nav-link"})
|
||||||
if len(links):
|
if len(links):
|
||||||
item.next = links[0].get('href')
|
item.next = book.content[
|
||||||
|
item.getAbsoluteUrl(links[0]['href'])]
|
||||||
|
logger.info(" next = {}:".format(item.next.url))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def findFirst(cls, book):
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
for item in book.content.values():
|
for item in book.content.values():
|
||||||
if item.soup is not None \
|
if item.soup and not hasattr(item, 'prev'):
|
||||||
and not hasattr(item, 'prev') \
|
|
||||||
and not hasattr(item, 'remove'):
|
|
||||||
if book.first:
|
if book.first:
|
||||||
logger.error("Multiple begin points found. {} and {}"
|
logger.error("Multiple begin points found. {} and {}"
|
||||||
.format(item.url, item.url))
|
.format(book.first.url, item.url))
|
||||||
raise Exception("Multiple begin points found.")
|
raise Exception("Multiple begin points found.")
|
||||||
else:
|
else:
|
||||||
book.first = item
|
book.first = item
|
||||||
|
logger.info(" first = {}:".format(book.first.url))
|
||||||
@classmethod
|
|
||||||
def getTitle(cls, item):
|
|
||||||
if hasattr(item.soup, 'title') and item.soup.title:
|
|
||||||
return item.soup.title.string
|
|
||||||
else:
|
|
||||||
return item.url
|
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def print(cls, book):
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
item = book.first
|
|
||||||
while item is not None:
|
|
||||||
logger.info("Item: {}".format(cls.getTitle(item)))
|
|
||||||
if hasattr(item, 'prev'):
|
|
||||||
logger.info(" Prev: {}".format(item.prev))
|
|
||||||
if hasattr(item, 'next'):
|
|
||||||
logger.info(" Next: {}".format(item.next))
|
|
||||||
for url in item.needs:
|
|
||||||
logger.info(" Needs: {}".format(url))
|
|
||||||
logger.info("")
|
|
||||||
|
|
||||||
if hasattr(item, 'next'):
|
|
||||||
item = book.content[item.next]
|
|
||||||
else:
|
|
||||||
item = None
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import urllib.parse as urlparse
|
import urllib
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from book import Item, Book
|
from book import Item, Book
|
||||||
@@ -13,62 +13,55 @@ class PrepareEpub(object):
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.info("BEGIN Prepare EPUB.")
|
logger.info("BEGIN Prepare EPUB.")
|
||||||
cls.localize_url(book)
|
cls.localize_url(book)
|
||||||
|
book.print()
|
||||||
logger.info("END Prepare EPUB.")
|
logger.info("END Prepare EPUB.")
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def localize_url(cls, book):
|
def localize_url(cls, book):
|
||||||
#logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
for item in book.content.values():
|
for item in book.content.values():
|
||||||
if hasattr(item, 'remove'):
|
#
|
||||||
continue
|
# Create local name. It will have dir/filename structure.
|
||||||
category = item.content_type.split("/")[0]
|
#
|
||||||
if category != 'text':
|
mime = item.content_type
|
||||||
cls._moveTo(book,item,category)
|
if mime == 'text/css':
|
||||||
|
local_url = cls._createLocalName(book,item,'css')
|
||||||
|
|
||||||
|
elif mime == 'application/font-woff' \
|
||||||
|
or mime == 'application/font-woff2':
|
||||||
|
local_url = cls._createLocalName(book,item,'font')
|
||||||
|
|
||||||
|
elif mime == 'text/html':
|
||||||
|
local_url = cls._createLocalName(book,item,'')
|
||||||
|
|
||||||
else:
|
else:
|
||||||
cls._moveTo(book,item,"")
|
local_url = cls._createLocalName(book,item,mime.split("/")[0])
|
||||||
|
|
||||||
|
cls._moveTo(item,local_url)
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _moveTo(cls, book, item, category):
|
def _createLocalName(cls, book, item, category):
|
||||||
logger = logging.getLogger(__name__)
|
#
|
||||||
parsed_url= urlparse.urlsplit(item.url)
|
# Get file name.
|
||||||
file_name = os.path.basename(parsed_url.path)
|
#
|
||||||
|
parsed_url= urllib.parse.urlsplit(item.url)
|
||||||
|
file_name = parsed_url.path.split('/')[-1]
|
||||||
|
#
|
||||||
|
# Append category
|
||||||
|
#
|
||||||
if category:
|
if category:
|
||||||
new_url = category + "/" + file_name
|
new_url = category + "/" + file_name
|
||||||
else:
|
else:
|
||||||
new_url = file_name
|
new_url = file_name
|
||||||
|
#
|
||||||
|
# If file name already exist then generate a unique one.
|
||||||
|
#
|
||||||
if item.url != new_url \
|
if item.url != new_url \
|
||||||
and new_url in book.content:
|
and new_url in book.content:
|
||||||
new_url = cls._findUniqueName(book, category, file_name)
|
new_url = cls._findUniqueName(book, category, file_name)
|
||||||
|
return new_url
|
||||||
|
|
||||||
logger.info("Renaming {} -> {}"
|
|
||||||
.format(item.url, new_url))
|
|
||||||
|
|
||||||
for dependant in item.needed_by:
|
|
||||||
if hasattr(dependant, 'soup'):
|
|
||||||
base_link = urlparse.urlsplit(dependant.url)
|
|
||||||
base_link.path = os.path.dirname(base_link.path)
|
|
||||||
for a in dependant.soup.find_all('a'):
|
|
||||||
if cls._getAbsoluteUrl(base_link, a.attr.href) == item.url:
|
|
||||||
a.attr.href = new_url
|
|
||||||
for img in dependant.soup.find_all('img'):
|
|
||||||
if cls._getAbsoluteUrl(base_link, img.attr.src) == item.url:
|
|
||||||
img.attrs.src = new_url
|
|
||||||
item.url = new_url
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _getAbsoluteUrl(cls, base_link, link):
|
|
||||||
parsed = urlparse.urlsplit(link)
|
|
||||||
if parsed.netloc is None:
|
|
||||||
parsed.scheme = base_link.scheme
|
|
||||||
parsed.netloc = base_link.netloc
|
|
||||||
if parsed.path[0] != '/':
|
|
||||||
parsed.path = base_link.path + '/' + href.path
|
|
||||||
return \
|
|
||||||
urlparse.SplitResult(parsed.scheme,
|
|
||||||
parsed.netloc,
|
|
||||||
parsed.path,
|
|
||||||
parsed.query,
|
|
||||||
None).geturl()
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _findUniqueName(cls, book, category, filename):
|
def _findUniqueName(cls, book, category, filename):
|
||||||
@@ -84,95 +77,18 @@ class PrepareEpub(object):
|
|||||||
break
|
break
|
||||||
return new_url
|
return new_url
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def createDAG(cls, book):
|
def _moveTo(cls, item, local_url):
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
for item in book.content.values():
|
logger.info("Renaming {} -> {}".format(item.url, local_url))
|
||||||
if item.soup is not None:
|
|
||||||
logger.info("Create DAG {}".format(item.url))
|
|
||||||
|
|
||||||
links = item.soup.find_all('a')
|
|
||||||
for link in links:
|
|
||||||
href = link.get('href')
|
|
||||||
if not href:
|
|
||||||
continue
|
|
||||||
parsed_href = urlparse.urlsplit(href)
|
|
||||||
url = \
|
|
||||||
urlparse.SplitResult(parsed_href.scheme,
|
|
||||||
parsed_href.netloc,
|
|
||||||
parsed_href.path,
|
|
||||||
parsed_href.query,
|
|
||||||
None).geturl()
|
|
||||||
|
|
||||||
if url in book.content:
|
|
||||||
book.content[url].needed_by.add(item.url)
|
|
||||||
item.needs.add(url)
|
|
||||||
elif href:
|
|
||||||
logger.info(" refered but no item exist: {}".format(url))
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def populateContent(cls, book):
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
for item in book.content.values():
|
|
||||||
if item.soup is not None:
|
|
||||||
# Try to find content.
|
|
||||||
item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
|
|
||||||
if len(item_content) == 1:
|
|
||||||
item.content = item_content[0]
|
|
||||||
else:
|
|
||||||
logger.error("No content found: {}".format(item.url))
|
|
||||||
item.remove = True
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def createOrder(cls, book):
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
for item in book.content.values():
|
|
||||||
if item.soup is not None:
|
|
||||||
# Try to get prev chapter.
|
|
||||||
links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
|
|
||||||
if len(links):
|
|
||||||
item.prev = links[0].get('href')
|
|
||||||
|
|
||||||
# Try to get next chapter.
|
|
||||||
links = item.soup.find_all('a', attrs={"class": "next nav-link"})
|
|
||||||
if len(links):
|
|
||||||
item.next = links[0].get('href')
|
|
||||||
|
|
||||||
for item in book.content.values():
|
|
||||||
if item.soup is not None \
|
|
||||||
and not hasattr(item, 'prev') \
|
|
||||||
and not hasattr(item, 'remove'):
|
|
||||||
if book.first:
|
|
||||||
logger.error("Multiple begin points found. {} and {}"
|
|
||||||
.format(item.url, item.url))
|
|
||||||
raise Exception("Multiple begin points found.")
|
|
||||||
else:
|
|
||||||
book.first = item
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def getTitle(cls, item):
|
|
||||||
if hasattr(item.soup, 'title') and item.soup.title:
|
|
||||||
return item.soup.title.string
|
|
||||||
else:
|
|
||||||
return item.url
|
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def print(cls, book):
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
item = book.first
|
|
||||||
while item is not None:
|
|
||||||
logger.info("Item: {}".format(cls.getTitle(item)))
|
|
||||||
if hasattr(item, 'prev'):
|
|
||||||
logger.info(" Prev: {}".format(item.prev))
|
|
||||||
if hasattr(item, 'next'):
|
|
||||||
logger.info(" Next: {}".format(item.next))
|
|
||||||
for url in item.needs:
|
|
||||||
logger.info(" Needs: {}".format(url))
|
|
||||||
logger.info("")
|
|
||||||
|
|
||||||
if hasattr(item, 'next'):
|
|
||||||
item = book.content[item.next]
|
|
||||||
else:
|
|
||||||
item = None
|
|
||||||
|
|
||||||
|
for ref_elem in item.needed_by_elem:
|
||||||
|
if ref_elem.name == 'a':
|
||||||
|
_,fragment = urllib.parse.urldefrag(ref_elem['href'])
|
||||||
|
ref_elem['href'] = local_url + "#" + fragment
|
||||||
|
if ref_elem.name == 'img':
|
||||||
|
ref_elem['src'] = local_url
|
||||||
|
if ref_elem.name == 'img':
|
||||||
|
ref_elem['data-mfp-src'] = local_url
|
||||||
|
item.url = local_url
|
||||||
|
|||||||
Reference in New Issue
Block a user