Saving internal image into test_out/test_save
This commit is contained in:
6
.gitignore
vendored
6
.gitignore
vendored
@@ -104,10 +104,6 @@ ENV/
|
|||||||
samples/*.zip
|
samples/*.zip
|
||||||
temp/*
|
temp/*
|
||||||
test_out/*
|
test_out/*
|
||||||
<<<<<<< HEAD
|
|
||||||
.vscode/gragir.code-workspace
|
.vscode/gragir.code-workspace
|
||||||
gragir.7z
|
gragir.7z
|
||||||
||||||| merged common ancestors
|
gragir.log
|
||||||
=======
|
|
||||||
.vscode/gragir..code-workspace
|
|
||||||
>>>>>>> e75fd39a62122b51fe018cdc4bab7100a4d1208f
|
|
||||||
@@ -19,6 +19,7 @@ import ebooklib.epub as ebooklib
|
|||||||
from book import Book, Item
|
from book import Book, Item
|
||||||
from parse_mhtml import parseMhtmlZipFile
|
from parse_mhtml import parseMhtmlZipFile
|
||||||
from enrich_html import EnrichHtml
|
from enrich_html import EnrichHtml
|
||||||
|
from prepare_epub import PrepareEpub
|
||||||
|
|
||||||
def parseArguments():
|
def parseArguments():
|
||||||
"""
|
"""
|
||||||
@@ -48,6 +49,15 @@ def configLogger(args):
|
|||||||
level=loggingLevel)
|
level=loggingLevel)
|
||||||
|
|
||||||
|
|
||||||
|
fh = logging.FileHandler('gragir.log', mode='w')
|
||||||
|
fh.setLevel(logging.DEBUG)
|
||||||
|
fh.setFormatter(fmt=logging.Formatter(fmt='%(asctime)s %(levelname)s: %(name)s - %(message)s'))
|
||||||
|
#, datefmt='%H:%M:%S'
|
||||||
|
#'%(asctime)s %(levelname)s: %(name)s - %(message)s')
|
||||||
|
logger = logging.getLogger()
|
||||||
|
logger.addHandler(fh)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# for name in content.keys():
|
# for name in content.keys():
|
||||||
|
|
||||||
@@ -211,6 +221,8 @@ def main():
|
|||||||
|
|
||||||
parseMhtmlZipFile(args.zip, book)
|
parseMhtmlZipFile(args.zip, book)
|
||||||
EnrichHtml.enrich(book)
|
EnrichHtml.enrich(book)
|
||||||
|
PrepareEpub.prepare(book)
|
||||||
|
book.save_in_dir('test_out/test_save')
|
||||||
#createDAG(book)
|
#createDAG(book)
|
||||||
#createEpubBook(book)
|
#createEpubBook(book)
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
class Item(object):
|
class Item(object):
|
||||||
|
|
||||||
@@ -12,6 +11,33 @@ class Item(object):
|
|||||||
self.needs = set()
|
self.needs = set()
|
||||||
self.soup = None
|
self.soup = None
|
||||||
|
|
||||||
|
def save_file(self, directory):
    """Write this item to ``directory``, using ``self.url`` as the relative path.

    Items carrying a ``remove`` attribute are skipped.  When ``self.soup`` is
    set, the prettified UTF-8 markup is written; otherwise the raw
    ``self.payload`` bytes are written.

    Args:
        directory: Target directory.  A trailing ``/`` is optional; an empty
            string means the current working directory.
    """
    logger = logging.getLogger(__name__)
    if hasattr(self, 'remove'):
        return

    # Create file name.  The URL is appended by plain concatenation so a URL
    # with a leading '/' still ends up below ``directory``.
    if directory and not directory.endswith('/'):
        directory += '/'
    file_name = directory + self.url

    # Ensure the directory exists.  ``exist_ok`` avoids the race between an
    # existence check and the creation; the guard covers a bare file name
    # (empty dirname), for which ``os.makedirs('')`` would fail.
    target_dir = os.path.dirname(file_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Save content.
    if self.soup:
        data = self.soup.prettify("utf-8")
    else:
        data = self.payload
    with open(file_name, 'wb') as file:
        file.write(data)

    # Log only after the file has actually been written.
    logger.info("Saved {}".format(file_name))
|
||||||
|
|
||||||
class Book(object):
|
class Book(object):
|
||||||
|
|
||||||
def __init__(self, file_name):
|
def __init__(self, file_name):
|
||||||
@@ -19,3 +45,8 @@ class Book(object):
|
|||||||
self.content = {}
|
self.content = {}
|
||||||
self.first = None
|
self.first = None
|
||||||
|
|
||||||
|
def save_in_dir(self, directory):
    """Save every item of the book below ``directory``.

    The directory (including missing parents) is created when needed, then
    each value of ``self.content`` is asked to write itself via
    ``save_file(directory)``.

    Args:
        directory: Path of the output directory.
    """
    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(directory, exist_ok=True)
    for item in self.content.values():
        item.save_file(directory)
|
||||||
|
|||||||
@@ -39,8 +39,9 @@ class EnrichHtml(object):
|
|||||||
if item.soup is not None:
|
if item.soup is not None:
|
||||||
logger.info("Create DAG {}".format(item.url))
|
logger.info("Create DAG {}".format(item.url))
|
||||||
|
|
||||||
links = item.soup.find_all('a')
|
my_url = urlparse.urlsplit(item.url)
|
||||||
for link in links:
|
|
||||||
|
for link in item.soup.find_all('a'):
|
||||||
href = link.get('href')
|
href = link.get('href')
|
||||||
if not href:
|
if not href:
|
||||||
continue
|
continue
|
||||||
@@ -58,6 +59,24 @@ class EnrichHtml(object):
|
|||||||
elif href:
|
elif href:
|
||||||
logger.info(" refered but no item exist: {}".format(url))
|
logger.info(" refered but no item exist: {}".format(url))
|
||||||
|
|
||||||
|
for link in item.soup.find_all('img'):
|
||||||
|
href = link.get('src')
|
||||||
|
if not href:
|
||||||
|
continue
|
||||||
|
parsed_href = urlparse.urlsplit(href)
|
||||||
|
url = \
|
||||||
|
urlparse.SplitResult(parsed_href.scheme,
|
||||||
|
parsed_href.netloc,
|
||||||
|
parsed_href.path,
|
||||||
|
parsed_href.query,
|
||||||
|
None).geturl()
|
||||||
|
|
||||||
|
if url in book.content:
|
||||||
|
book.content[url].needed_by.add(item.url)
|
||||||
|
item.needs.add(url)
|
||||||
|
elif href:
|
||||||
|
logger.info(" refered but no item exist: {}".format(url))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def populateContent(cls, book):
|
def populateContent(cls, book):
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|||||||
178
gragir/prepare_epub.py
Normal file
178
gragir/prepare_epub.py
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import urllib.parse as urlparse
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from book import Item, Book
|
||||||
|
|
||||||
|
|
||||||
|
class PrepareEpub(object):
    """Prepare a book for EPUB output by giving every item a local URL.

    All methods are classmethods operating on a ``book`` whose ``content``
    maps URLs to items.  Items are expected to provide ``url``,
    ``content_type``, ``soup``, ``needs`` and ``needed_by``.
    """

    @classmethod
    def prepare(cls, book):
        """Run the preparation steps (currently URL localization) on *book*."""
        logger = logging.getLogger(__name__)
        logger.info("BEGIN Prepare EPUB.")
        cls.localize_url(book)
        logger.info("END Prepare EPUB.")

    @classmethod
    def localize_url(cls, book):
        """Rename every item that is not flagged ``remove`` to a local URL.

        ``text/*`` items are placed at the top level; every other item goes
        into a directory named after the main part of its content type
        (e.g. ``image``).
        """
        for item in book.content.values():
            if hasattr(item, 'remove'):
                continue
            category = item.content_type.split("/")[0]
            if category == 'text':
                category = ""
            cls._moveTo(book, item, category)

    @classmethod
    def _urlsInUse(cls, book):
        """Return all ``book.content`` keys plus every item's current ``url``."""
        in_use = set(book.content)
        in_use.update(item.url for item in book.content.values())
        return in_use

    @classmethod
    def _moveTo(cls, book, item, category):
        """Give *item* a local URL and update the links pointing at it.

        The new URL is ``category/<basename>`` (or just ``<basename>`` when
        *category* is empty); a numeric suffix is added when that name is
        already taken.  ``<a href>`` and ``<img src>`` references in the
        items listed in ``item.needed_by`` are rewritten to the new URL.

        ``book.content`` keeps its original keys; only ``item.url`` changes.

        Args:
            book: Book owning the item.
            item: Item to rename.
            category: Target sub-directory, or ``""`` for the top level.
        """
        logger = logging.getLogger(__name__)
        old_url = item.url
        file_name = os.path.basename(urlparse.urlsplit(old_url).path)
        new_url = category + "/" + file_name if category else file_name
        # Items renamed earlier are not keys of book.content, so their
        # current URLs are checked as well to detect name collisions.
        if old_url != new_url and new_url in cls._urlsInUse(book):
            new_url = cls._findUniqueName(book, category, file_name)

        logger.info("Renaming {} -> {}".format(old_url, new_url))

        for dependant_ref in item.needed_by:
            # createDAG stores the URL (the book.content key) of each
            # dependant; an item object is accepted as well.
            if isinstance(dependant_ref, str):
                dependant_url = dependant_ref
                dependant = book.content.get(dependant_ref)
            else:
                dependant = dependant_ref
                dependant_url = getattr(dependant, 'url', None)
            soup = getattr(dependant, 'soup', None)
            if soup is None or dependant_url is None:
                continue
            # Relative links are resolved against the dependant's directory.
            parts = urlparse.urlsplit(dependant_url)
            base_link = parts._replace(path=os.path.dirname(parts.path))
            cls._relink(soup, base_link, old_url, new_url)
        item.url = new_url

    @classmethod
    def _relink(cls, soup, base_link, old_url, new_url):
        """Point every ``<a href>`` / ``<img src>`` targeting *old_url* at *new_url*.

        NOTE(review): links rewritten by an earlier rename are resolved
        against *base_link* again; a localized name could in theory resolve
        to another item's original URL -- confirm this cannot happen for the
        processed books.
        """
        for tag_name, attribute in (('a', 'href'), ('img', 'src')):
            for tag in soup.find_all(tag_name):
                link = tag.get(attribute)
                if not link:
                    continue
                if cls._getAbsoluteUrl(base_link, link) != old_url:
                    continue
                # Keep an in-page anchor (#fragment) of the original link.
                fragment = urlparse.urlsplit(link).fragment
                tag[attribute] = new_url + '#' + fragment if fragment else new_url

    @classmethod
    def _getAbsoluteUrl(cls, base_link, link):
        """Resolve *link* against *base_link* and drop the fragment.

        Args:
            base_link: ``SplitResult`` whose path is the directory of the
                referring document.
            link: The (absolute or relative) reference to resolve.

        Returns:
            The absolute URL as a string, without fragment.
        """
        base_path = base_link.path
        # urljoin replaces the last path segment of its base, so the
        # directory needs a trailing slash to be kept.
        if base_path and not base_path.endswith('/'):
            base_path += '/'
        base_url = base_link._replace(path=base_path, query='', fragment='').geturl()
        absolute = urlparse.urljoin(base_url, link)
        return urlparse.urlsplit(absolute)._replace(fragment='').geturl()

    @classmethod
    def _findUniqueName(cls, book, category, filename):
        """Return an unused URL derived from *filename* by adding ``_<n>``.

        The counter starts at 1 and is inserted before the file extension;
        *category*, when not empty, is used as directory prefix.
        """
        in_use = cls._urlsInUse(book)
        file_name_base, file_ext = os.path.splitext(filename)
        prefix = category + '/' if category else ''
        i = 0
        while True:
            i += 1
            new_url = "{}{}_{}{}".format(prefix, file_name_base, i, file_ext)
            if new_url not in in_use:
                return new_url

    @classmethod
    def createDAG(cls, book):
        """Record which items link to which via ``needs`` / ``needed_by``.

        For every ``<a href>`` of every item with a soup, the fragment is
        stripped and, when the result is a key of ``book.content``, the
        dependency is stored on both items.
        """
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is None:
                continue
            logger.info("Create DAG {}".format(item.url))

            for link in item.soup.find_all('a'):
                href = link.get('href')
                if not href:
                    continue
                url = urlparse.urlsplit(href)._replace(fragment='').geturl()

                if url in book.content:
                    book.content[url].needed_by.add(item.url)
                    item.needs.add(url)
                else:
                    logger.info(" refered but no item exist: {}".format(url))

    @classmethod
    def populateContent(cls, book):
        """Store the ``div#sbo-rt-content`` element of each document as ``content``.

        Items whose soup does not contain exactly one such element are
        flagged with ``remove = True``.
        """
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is None:
                continue
            # Try to find content.
            item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
            if len(item_content) == 1:
                item.content = item_content[0]
            else:
                logger.error("No content found: {}".format(item.url))
                item.remove = True

    @classmethod
    def createOrder(cls, book):
        """Set ``prev`` / ``next`` on the items and determine ``book.first``.

        Raises:
            Exception: When more than one kept document has no ``prev`` link.
        """
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is None:
                continue
            # Try to get prev chapter.
            links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
            if len(links):
                item.prev = links[0].get('href')

            # Try to get next chapter.
            links = item.soup.find_all('a', attrs={"class": "next nav-link"})
            if len(links):
                item.next = links[0].get('href')

        for item in book.content.values():
            if item.soup is None \
                    or hasattr(item, 'prev') \
                    or hasattr(item, 'remove'):
                continue
            if book.first:
                # Report both candidates, not the same URL twice.
                logger.error("Multiple begin points found. {} and {}"
                             .format(book.first.url, item.url))
                raise Exception("Multiple begin points found.")
            book.first = item

    @classmethod
    def getTitle(cls, item):
        """Return the title string of the item's soup, or its URL as fallback."""
        if hasattr(item.soup, 'title') and item.soup.title:
            return item.soup.title.string
        return item.url

    @classmethod
    def print(cls, book):
        """Log every item reachable from ``book.first`` by following ``next``.

        The walk stops at an item without ``next`` or when ``next`` is not a
        key of ``book.content``.
        """
        logger = logging.getLogger(__name__)
        item = book.first
        while item is not None:
            logger.info("Item: {}".format(cls.getTitle(item)))
            if hasattr(item, 'prev'):
                logger.info(" Prev: {}".format(item.prev))
            if hasattr(item, 'next'):
                logger.info(" Next: {}".format(item.next))
            for url in item.needs:
                logger.info(" Needs: {}".format(url))
            logger.info("")

            if hasattr(item, 'next'):
                # A dangling next link ends the walk instead of raising.
                item = book.content.get(item.next)
            else:
                item = None
|
||||||
|
|
||||||
Reference in New Issue
Block a user