Localization works.

This commit is contained in:
2018-10-07 23:22:55 +01:00
parent a4cedbf3c7
commit db2ae46989
4 changed files with 32 additions and 23 deletions

View File

@@ -9,7 +9,7 @@ class Item(object):
self.content_type = content_type self.content_type = content_type
self.payload = payload self.payload = payload
self.needed_by = set() self.needed_by = set()
self.needed_by_elem = set() self.needed_by_elem = []
self.needs = set() self.needs = set()
self.soup = None self.soup = None
@@ -23,7 +23,7 @@ class Item(object):
if directory[-1] != '/': if directory[-1] != '/':
directory += '/' directory += '/'
file_name = directory + self.url file_name = directory + self.url
logger.info("Saved {}".format(file_name)) logger.info("Saveing {}".format(file_name))
# #
# Ensure directory exist. # Ensure directory exist.
# #
@@ -44,6 +44,13 @@ class Item(object):
defrag,_ =urllib.parse.urldefrag(link) defrag,_ =urllib.parse.urldefrag(link)
return urllib.parse.urljoin(self.url, defrag) return urllib.parse.urljoin(self.url, defrag)
def refersTo(self, ref_item):
self.needs.add(ref_item)
def referencedBy(self, item, element):
self.needed_by.add(item)
self.needed_by_elem.append(element)
class Book(object): class Book(object):
@@ -61,15 +68,6 @@ class Book(object):
for item in self.content.values(): for item in self.content.values():
item.save_file(directory) item.save_file(directory)
def insertDependency(self, item, element, url):
logger = logging.getLogger(__name__)
if url in self.content:
item.needs.add(self.content[url])
self.content[url].needed_by.add(item)
self.content[url].needed_by_elem.add(element)
elif url:
logger.info(" refered but no item exist: {}".format(url))
def print(self): def print(self):
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.info("Book Structure:") logger.info("Book Structure:")

View File

@@ -41,6 +41,7 @@ class EnrichHtml(object):
if item.soup is not None: if item.soup is not None:
logger.info("Create DAG {}".format(item.url)) logger.info("Create DAG {}".format(item.url))
cls.normalizeUrlAndSetDependecy(book, item, 'a', 'href') cls.normalizeUrlAndSetDependecy(book, item, 'a', 'href')
cls.normalizeUrlAndSetDependecy(book, item, 'link', 'href')
cls.normalizeUrlAndSetDependecy(book, item, 'img', 'src') cls.normalizeUrlAndSetDependecy(book, item, 'img', 'src')
cls.normalizeUrlAndSetDependecy(book, item, 'img', 'data-mfp-src') cls.normalizeUrlAndSetDependecy(book, item, 'img', 'data-mfp-src')
@@ -53,8 +54,14 @@ class EnrichHtml(object):
if not url: if not url:
continue continue
normal_url = item.getAbsoluteUrl(url) normal_url = item.getAbsoluteUrl(url)
logger.info(" depends on: {}".format(normal_url)) logger.info(" refers to: {}".format(normal_url))
book.insertDependency(item, element, normal_url) if normal_url in book.content:
ref_item = book.content[normal_url]
item.refersTo(ref_item)
ref_item.referencedBy(item, element)
else:
logger.info(" refered but no item exist: {}".format(url))
element[attr] = ''
@classmethod @classmethod
@@ -71,7 +78,7 @@ class EnrichHtml(object):
body.clear() body.clear()
body.append(content) body.append(content)
else: else:
logger.error(" No content found: {}".format(item.url)) logger.warn(" No content found: {}".format(item.url))
remove.append(item) remove.append(item)
for item in remove: for item in remove:

View File

@@ -14,16 +14,16 @@ def parseMht(mht, book):
parts = mhtContent.get_payload() parts = mhtContent.get_payload()
# Multiple parts, usually? If single 'str' part, then convert to a list. # Multiple parts, usually? If single 'str' part, then convert to a list.
if not type(parts) is list: if not type(parts) is list:
parts = [mhtContent] parts = [mhtContent]
logger.info(' Number of parts: {}'.format(len(parts))) logger.info(' Number of parts: {}'.format(len(parts)))
# Save all parts to files. # Save all parts to files.
for p in parts: # walk() for a tree, but I'm guessing MHT is never nested? for p in parts: # walk() for a tree, but I'm guessing MHT is never nested?
#??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts. #??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts.
ct = p.get_content_type() ct = p.get_content_type()
# String coerced to lower case of the form maintype/subtype, else get_default_type(). # String coerced to lower case of the form maintype/subtype, else get_default_type().
fp = p.get("content-location") or "index.html" # File path. Expecting root HTML is only part with no location. fp = p.get("content-location") or "index.html" # File path. Expecting root HTML is only part with no location.
logger.info(' Content type: {}, Location: {}, Size: {}' logger.info(' Content type: {}, Location: {}, Size: {}'
@@ -34,9 +34,9 @@ def parseMht(mht, book):
def parseMhtFile(zip, mhtInfo, book): def parseMhtFile(zip, mhtInfo, book):
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.info('Parsing {}, size: {}, csize: {} ' logger.info('Reading {}, size: {}, csize: {} '
.format(mhtInfo.filename, .format(mhtInfo.filename,
mhtInfo.file_size, mhtInfo.file_size,
mhtInfo.compress_size)) mhtInfo.compress_size))
with zip.open(mhtInfo) as mht: with zip.open(mhtInfo) as mht:

View File

@@ -86,9 +86,13 @@ class PrepareEpub(object):
for ref_elem in item.needed_by_elem: for ref_elem in item.needed_by_elem:
if ref_elem.name == 'a': if ref_elem.name == 'a':
_,fragment = urllib.parse.urldefrag(ref_elem['href']) _,fragment = urllib.parse.urldefrag(ref_elem['href'])
ref_elem['href'] = local_url + "#" + fragment if fragment:
if ref_elem.name == 'img': ref_elem['href'] = local_url + "#" + fragment
else:
ref_elem['href'] = local_url
elif ref_elem.name == 'img':
ref_elem['src'] = local_url ref_elem['src'] = local_url
if ref_elem.name == 'img':
ref_elem['data-mfp-src'] = local_url ref_elem['data-mfp-src'] = local_url
else:
logger.info("Renaming {} -> {}".format(item.url, local_url))
item.url = local_url item.url = local_url