From db2ae4698973de7546b326093cea4608cbbc6096 Mon Sep 17 00:00:00 2001 From: Vahagn Khachatryan Date: Sun, 7 Oct 2018 23:22:55 +0100 Subject: [PATCH] Localization works. --- gragir/book.py | 20 +++++++++----------- gragir/enrich_html.py | 13 ++++++++++--- gragir/parse_mhtml.py | 12 ++++++------ gragir/prepare_epub.py | 10 +++++++--- 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/gragir/book.py b/gragir/book.py index 167abf3..eb3243c 100644 --- a/gragir/book.py +++ b/gragir/book.py @@ -9,7 +9,7 @@ class Item(object): self.content_type = content_type self.payload = payload self.needed_by = set() - self.needed_by_elem = set() + self.needed_by_elem = [] self.needs = set() self.soup = None @@ -23,7 +23,7 @@ class Item(object): if directory[-1] != '/': directory += '/' file_name = directory + self.url - logger.info("Saved {}".format(file_name)) + logger.info("Saveing {}".format(file_name)) # # Ensure directory exist. # @@ -44,6 +44,13 @@ class Item(object): defrag,_ =urllib.parse.urldefrag(link) return urllib.parse.urljoin(self.url, defrag) + def refersTo(self, ref_item): + self.needs.add(ref_item) + + def referencedBy(self, item, element): + self.needed_by.add(item) + self.needed_by_elem.append(element) + class Book(object): @@ -61,15 +68,6 @@ class Book(object): for item in self.content.values(): item.save_file(directory) - def insertDependency(self, item, element, url): - logger = logging.getLogger(__name__) - if url in self.content: - item.needs.add(self.content[url]) - self.content[url].needed_by.add(item) - self.content[url].needed_by_elem.add(element) - elif url: - logger.info(" refered but no item exist: {}".format(url)) - def print(self): logger = logging.getLogger(__name__) logger.info("Book Structure:") diff --git a/gragir/enrich_html.py b/gragir/enrich_html.py index 05f51e7..988b37f 100644 --- a/gragir/enrich_html.py +++ b/gragir/enrich_html.py @@ -41,6 +41,7 @@ class EnrichHtml(object): if item.soup is not None: logger.info("Create DAG {}".format(item.url)) cls.normalizeUrlAndSetDependecy(book, item, 'a', 'href') + cls.normalizeUrlAndSetDependecy(book, item, 'link', 'href') cls.normalizeUrlAndSetDependecy(book, item, 'img', 'src') cls.normalizeUrlAndSetDependecy(book, item, 'img', 'data-mfp-src') @@ -53,8 +54,14 @@ class EnrichHtml(object): if not url: continue normal_url = item.getAbsoluteUrl(url) - logger.info(" depends on: {}".format(normal_url)) - book.insertDependency(item, element, normal_url) + logger.info(" refers to: {}".format(normal_url)) + if normal_url in book.content: + ref_item = book.content[normal_url] + item.refersTo(ref_item) + ref_item.referencedBy(item, element) + else: + logger.info(" refered but no item exist: {}".format(url)) + element[attr] = '' @classmethod @@ -71,7 +78,7 @@ class EnrichHtml(object): body.clear() body.append(content) else: - logger.error(" No content found: {}".format(item.url)) + logger.warn(" No content found: {}".format(item.url)) remove.append(item) for item in remove: diff --git a/gragir/parse_mhtml.py b/gragir/parse_mhtml.py index c9ddc57..c7a03ba 100644 --- a/gragir/parse_mhtml.py +++ b/gragir/parse_mhtml.py @@ -14,16 +14,16 @@ def parseMht(mht, book): parts = mhtContent.get_payload() # Multiple parts, usually? If single 'str' part, then convert to a list. - if not type(parts) is list: - parts = [mhtContent] + if not type(parts) is list: + parts = [mhtContent] logger.info(' Number of parts: {}'.format(len(parts))) # Save all parts to files. for p in parts: # walk() for a tree, but I'm guessing MHT is never nested? - #??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts. + #??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts. ct = p.get_content_type() - # String coerced to lower case of the form maintype/subtype, else get_default_type(). + # String coerced to lower case of the form maintype/subtype, else get_default_type(). fp = p.get("content-location") or "index.html" # File path. Expecting root HTML is only part with no location. logger.info(' Content type: {}, Location: {}, Size: {}' @@ -34,9 +34,9 @@ def parseMht(mht, book): def parseMhtFile(zip, mhtInfo, book): logger = logging.getLogger(__name__) - logger.info('Parsing {}, size: {}, csize: {} ' + logger.info('Reading {}, size: {}, csize: {} ' .format(mhtInfo.filename, - mhtInfo.file_size, + mhtInfo.file_size, mhtInfo.compress_size)) with zip.open(mhtInfo) as mht: diff --git a/gragir/prepare_epub.py b/gragir/prepare_epub.py index 2429624..d3505fe 100644 --- a/gragir/prepare_epub.py +++ b/gragir/prepare_epub.py @@ -86,9 +86,13 @@ class PrepareEpub(object): for ref_elem in item.needed_by_elem: if ref_elem.name == 'a': _,fragment = urllib.parse.urldefrag(ref_elem['href']) - ref_elem['href'] = local_url + "#" + fragment - if ref_elem.name == 'img': + if fragment: + ref_elem['href'] = local_url + "#" + fragment + else: + ref_elem['href'] = local_url + elif ref_elem.name == 'img': ref_elem['src'] = local_url - if ref_elem.name == 'img': ref_elem['data-mfp-src'] = local_url + else: + logger.info("Renaming {} -> {}".format(item.url, local_url)) item.url = local_url