From 509d54e5d6a9bfe2d563aab7ece02691afebba55 Mon Sep 17 00:00:00 2001 From: Vahagn Khachatryan Date: Sun, 12 Aug 2018 13:11:19 +0100 Subject: [PATCH] Reading html files through BeautifulSoup --- .gitignore | 2 + Makefile | 8 +- gragir/__main__.py | 178 ++++++++++++++++++++++++++++++++++++++++----- modules/ebooklib | 2 +- 4 files changed, 168 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 0ca54b0..5b77593 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,5 @@ ENV/ # smaple files samples/*.zip +temp/* +test_out/* diff --git a/Makefile b/Makefile index d1369a1..6466bdc 100644 --- a/Makefile +++ b/Makefile @@ -1,18 +1,20 @@ TEST_FILES = $(wildcard samples/*.zip) +PYTHONPATH = $(CURDIR)/modules/ebooklib + .PHONY: test -test: - python gragir/__main__.py -v samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub +test: test_samples/algorithms_third_edition_in_c.zip .PHONY: test_all test_all: $(addprefix test_,$(TEST_FILES)) +.ONESHELL: .PHONY: $(addprefix test_,$(TEST_FILES)) $(addprefix test_,$(TEST_FILES)): test_%: @echo Testing $* - @python gragir/__main__.py -v $* test_out/$(*F).epub + @set PYTHONPATH=$(PYTHONPATH) && python gragir/__main__.py -v $* test_out/$(*F).epub #python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub diff --git a/gragir/__main__.py b/gragir/__main__.py index bfdd1f1..ca7dd3d 100644 --- a/gragir/__main__.py +++ b/gragir/__main__.py @@ -4,9 +4,9 @@ """ # Standard library modules do the heavy lifting. Ours is all simple stuff. -import base64 -import email, email.message -import mimetypes +# import base64 +# import email.message +# import mimetypes # import os # import quopri @@ -15,6 +15,10 @@ import logging import argparse import zipfile +import email + +import ebooklib.epub as ebooklib +from bs4 import BeautifulSoup def parseArguments(): """ @@ -30,16 +34,20 @@ def parseArguments(): parser.add_argument("-q", "--quiet", action="store_true", help="log only errors.") args = parser.parse_args() # --help is built-in. - # Validate command line. - # if args.pack == args.unpack: - # sys.stderr.write("Invalid: must specify one action, either --pack or --unpack.\n") - # sys.exit(-1) - return args + def validateMht(fileName): return True +class Item(object): + + def __init__(self, file_name, content_type, payload): + self.file_name = file_name + self.content_type = content_type + self.payload = payload + + def parseMht(mht, content): logger = logging.getLogger(__name__) @@ -62,13 +70,8 @@ def parseMht(mht, content): logger.info(' Content type: {}, Location: {}, Size: {}' .format(ct, fp, len(p.get_payload()))) - content[fp] = p.get_payload(decode=True) - # Create directories as necessary. - # if os.path.dirname(fp): - # os.makedirs(os.path.dirname(fp), exist_ok=True) + content[fp] = Item(fp, ct, p.get_payload(decode=True)) - # # Save part's body to a file. - # open(fp, "wb").write(p.get_payload(decode=True)) def parseMhtFile(zip, mhtInfo, content): logger = logging.getLogger(__name__) @@ -82,12 +85,22 @@ def parseMhtFile(zip, mhtInfo, content): def parseZipFile(zip, content): + logger = logging.getLogger(__name__) for zipMember in zip.infolist(): if validateMht(zipMember): parseMhtFile(zip, zipMember, content) else: pass +def enrichContent(content): + logger = logging.getLogger(__name__) + logger.info("Loaded {} parts.".format(len(content))) + for item in content.values(): + logger.info("Enriching {} {}".format(item.content_type, item.file_name)) + if item.content_type == 'text/html': + item.soup = BeautifulSoup(item.payload, "lxml") + + # for name in content.keys(): def configLogger(args): loggingLevel = logging.DEBUG if args.debug \ @@ -97,7 +110,136 @@ def configLogger(args): format='%(asctime)s %(levelname)s: %(name)s - %(message)s', level=loggingLevel) -# Just do it. +def createEpubBook(content): + book = ebooklib.EpubBook() + + # class EpubImage(EpubItem): + # class EpubNav(EpubHtml): + # class EpubCoverHtml(EpubHtml): + # class EpubHtml(EpubItem): + # class EpubCover(EpubItem): + # class EpubNcx(EpubItem): + # class EpubItem(object): + # class EpubException(Exception): + # class Link(object): + # class Section(object): + + + # def set_identifier(self, uid) + # def set_title(self, title) + # def set_language(self, lang) + # def set_cover(self, file_name, content, create_page=True): + # """ + # Set cover and create cover document if needed. + + # :Args: + # - file_name: file name of the cover page + # - content: Content for the cover image + # - create_page: Should cover page be defined. Defined as bool value (optional). Default value is True. + # """ + + # def add_author(self, author, file_as=None, role=None, uid='creator'): + # def add_metadata(self, namespace, name, value, others=None): + # def set_unique_metadata(self, namespace, name, value, others=None): + # "Add metadata if metadata with this identifier does not already exist, otherwise update existing metadata." + # def add_item(self, item): + + # def get_metadata(self, namespace, name): + # def get_item_with_id(self, uid): + # """ + # Returns item for defined UID. + + # >>> book.get_item_with_id('image_001') + + # :Args: + # - uid: UID for the item + + # :Returns: + # Returns item object. Returns None if nothing was found. + # """ + + # def get_item_with_href(self, href): + # """ + # Returns item for defined HREF. + + # >>> book.get_item_with_href('EPUB/document.xhtml') + + # :Args: + # - href: HREF for the item we are searching for + + # :Returns: + # Returns item object. Returns None if nothing was found. + # """ + + # def get_items(self): + # def get_items_of_media_type(self, media_type): + # def get_items_of_type(self, item_type): + # """ + # Returns all items of specified type. + + # >>> book.get_items_of_type(epub.ITEM_IMAGE) + + # :Args: + # - item_type: Type for items we are searching for + + # :Returns: + # Returns found items as tuple. + # """ + # return (item for item in self.items if item.get_type() == item_type) + + + # def get_template(self, name): + # def set_template(self, name, value): + # """ + # Defines templates which are used to generate certain types of pages. When defining new value for the template + # we have to use content of type 'str' (Python 2) or 'bytes' (Python 3). + + # At the moment we use these templates: + # - ncx + # - nav + # - chapter + # - cover + + # :Args: + # - name: Name for the template + # - value: Content for the template + # """ + + + # def add_prefix(self, name, uri): + # """ + # Appends custom prefix to be added to the content.opf document + + # >>> epub_book.add_prefix('bkterms', 'http://booktype.org/') + + # :Args: + # - name: namespave name + # - uri: URI for the namespace + # """ + + return book + +def writeEpubBook(name, book, options=None): + """ + Creates epub file with the content defined in EpubBook. + + >>> makeEpub('book.epub', book) + + :Args: + - name: file name for the output file + - book: instance of EpubBook + - options: extra opions as dictionary (optional) + """ + logger = logging.getLogger(__name__) + + try: + epub = ebooklib.EpubWriter(name, book, options) + epub.process() + epub.write() + except Exception as e: + logger.error("Exception {}.".format(e)) + + def main(): """ """ @@ -112,9 +254,9 @@ def main(): with zipfile.ZipFile(args.zip, 'r') as zip: parseZipFile(zip, content) - logger.info("Loaded {} parts.".format(len(content))) - for name in content.keys(): - logger.info("{}".format(name)) + enrichContent(content) + book = createEpubBook(content) + writeEpubBook(args.epub, book) if __name__ == "__main__": diff --git a/modules/ebooklib b/modules/ebooklib index 00a3c6e..6a004d4 160000 --- a/modules/ebooklib +++ b/modules/ebooklib @@ -1 +1 @@ -Subproject commit 00a3c6e064c5a71dc8da38276f04e559c0cd66df +Subproject commit 6a004d4ae3d6da44575a482aa3f605e9484f7b0c