Reading html files through BeautifulSoup
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -102,3 +102,5 @@ ENV/
|
|||||||
|
|
||||||
# sample files
|
# sample files
|
||||||
samples/*.zip
|
samples/*.zip
|
||||||
|
temp/*
|
||||||
|
test_out/*
|
||||||
|
|||||||
8
Makefile
8
Makefile
@@ -1,18 +1,20 @@
|
|||||||
|
|
||||||
TEST_FILES = $(wildcard samples/*.zip)
|
TEST_FILES = $(wildcard samples/*.zip)
|
||||||
|
|
||||||
|
PYTHONPATH = $(CURDIR)/modules/ebooklib
|
||||||
|
|
||||||
.PHONY: test
|
.PHONY: test
|
||||||
test:
|
test: test_samples/algorithms_third_edition_in_c.zip
|
||||||
python gragir/__main__.py -v samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub
|
|
||||||
|
|
||||||
.PHONY: test_all
|
.PHONY: test_all
|
||||||
test_all: $(addprefix test_,$(TEST_FILES))
|
test_all: $(addprefix test_,$(TEST_FILES))
|
||||||
|
|
||||||
|
|
||||||
|
.ONESHELL:
|
||||||
.PHONY: $(addprefix test_,$(TEST_FILES))
|
.PHONY: $(addprefix test_,$(TEST_FILES))
|
||||||
$(addprefix test_,$(TEST_FILES)): test_%:
|
$(addprefix test_,$(TEST_FILES)): test_%:
|
||||||
@echo Testing $*
|
@echo Testing $*
|
||||||
@python gragir/__main__.py -v $* test_out/$(*F).epub
|
@set PYTHONPATH=$(PYTHONPATH) && python gragir/__main__.py -v $* test_out/$(*F).epub
|
||||||
|
|
||||||
#python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub
|
#python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub
|
||||||
|
|
||||||
|
|||||||
@@ -4,9 +4,9 @@
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# Standard library modules do the heavy lifting. Ours is all simple stuff.
|
# Standard library modules do the heavy lifting. Ours is all simple stuff.
|
||||||
import base64
|
# import base64
|
||||||
import email, email.message
|
# import email.message
|
||||||
import mimetypes
|
# import mimetypes
|
||||||
# import os
|
# import os
|
||||||
# import quopri
|
# import quopri
|
||||||
|
|
||||||
@@ -15,6 +15,10 @@ import logging
|
|||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
import zipfile
|
import zipfile
|
||||||
|
import email
|
||||||
|
|
||||||
|
import ebooklib.epub as ebooklib
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
def parseArguments():
|
def parseArguments():
|
||||||
"""
|
"""
|
||||||
@@ -30,16 +34,20 @@ def parseArguments():
|
|||||||
parser.add_argument("-q", "--quiet", action="store_true", help="log only errors.")
|
parser.add_argument("-q", "--quiet", action="store_true", help="log only errors.")
|
||||||
args = parser.parse_args() # --help is built-in.
|
args = parser.parse_args() # --help is built-in.
|
||||||
|
|
||||||
# Validate command line.
|
|
||||||
# if args.pack == args.unpack:
|
|
||||||
# sys.stderr.write("Invalid: must specify one action, either --pack or --unpack.\n")
|
|
||||||
# sys.exit(-1)
|
|
||||||
|
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
def validateMht(fileName):
|
def validateMht(fileName):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
class Item(object):
|
||||||
|
|
||||||
|
def __init__(self, file_name, content_type, payload):
|
||||||
|
self.file_name = file_name
|
||||||
|
self.content_type = content_type
|
||||||
|
self.payload = payload
|
||||||
|
|
||||||
|
|
||||||
def parseMht(mht, content):
|
def parseMht(mht, content):
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -62,13 +70,8 @@ def parseMht(mht, content):
|
|||||||
logger.info(' Content type: {}, Location: {}, Size: {}'
|
logger.info(' Content type: {}, Location: {}, Size: {}'
|
||||||
.format(ct, fp, len(p.get_payload())))
|
.format(ct, fp, len(p.get_payload())))
|
||||||
|
|
||||||
content[fp] = p.get_payload(decode=True)
|
content[fp] = Item(fp, ct, p.get_payload(decode=True))
|
||||||
# Create directories as necessary.
|
|
||||||
# if os.path.dirname(fp):
|
|
||||||
# os.makedirs(os.path.dirname(fp), exist_ok=True)
|
|
||||||
|
|
||||||
# # Save part's body to a file.
|
|
||||||
# open(fp, "wb").write(p.get_payload(decode=True))
|
|
||||||
|
|
||||||
def parseMhtFile(zip, mhtInfo, content):
|
def parseMhtFile(zip, mhtInfo, content):
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -82,12 +85,22 @@ def parseMhtFile(zip, mhtInfo, content):
|
|||||||
|
|
||||||
|
|
||||||
def parseZipFile(zip, content):
|
def parseZipFile(zip, content):
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
for zipMember in zip.infolist():
|
for zipMember in zip.infolist():
|
||||||
if validateMht(zipMember):
|
if validateMht(zipMember):
|
||||||
parseMhtFile(zip, zipMember, content)
|
parseMhtFile(zip, zipMember, content)
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def enrichContent(content):
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.info("Loaded {} parts.".format(len(content)))
|
||||||
|
for item in content.values():
|
||||||
|
logger.info("Enriching {} {}".format(item.content_type, item.file_name))
|
||||||
|
if item.content_type == 'text/html':
|
||||||
|
item.soup = BeautifulSoup(item.payload, "lxml")
|
||||||
|
|
||||||
|
# for name in content.keys():
|
||||||
|
|
||||||
def configLogger(args):
|
def configLogger(args):
|
||||||
loggingLevel = logging.DEBUG if args.debug \
|
loggingLevel = logging.DEBUG if args.debug \
|
||||||
@@ -97,7 +110,136 @@ def configLogger(args):
|
|||||||
format='%(asctime)s %(levelname)s: %(name)s - %(message)s',
|
format='%(asctime)s %(levelname)s: %(name)s - %(message)s',
|
||||||
level=loggingLevel)
|
level=loggingLevel)
|
||||||
|
|
||||||
# Just do it.
|
def createEpubBook(content):
|
||||||
|
book = ebooklib.EpubBook()
|
||||||
|
|
||||||
|
# class EpubImage(EpubItem):
|
||||||
|
# class EpubNav(EpubHtml):
|
||||||
|
# class EpubCoverHtml(EpubHtml):
|
||||||
|
# class EpubHtml(EpubItem):
|
||||||
|
# class EpubCover(EpubItem):
|
||||||
|
# class EpubNcx(EpubItem):
|
||||||
|
# class EpubItem(object):
|
||||||
|
# class EpubException(Exception):
|
||||||
|
# class Link(object):
|
||||||
|
# class Section(object):
|
||||||
|
|
||||||
|
|
||||||
|
# def set_identifier(self, uid)
|
||||||
|
# def set_title(self, title)
|
||||||
|
# def set_language(self, lang)
|
||||||
|
# def set_cover(self, file_name, content, create_page=True):
|
||||||
|
# """
|
||||||
|
# Set cover and create cover document if needed.
|
||||||
|
|
||||||
|
# :Args:
|
||||||
|
# - file_name: file name of the cover page
|
||||||
|
# - content: Content for the cover image
|
||||||
|
# - create_page: Should cover page be defined. Defined as bool value (optional). Default value is True.
|
||||||
|
# """
|
||||||
|
|
||||||
|
# def add_author(self, author, file_as=None, role=None, uid='creator'):
|
||||||
|
# def add_metadata(self, namespace, name, value, others=None):
|
||||||
|
# def set_unique_metadata(self, namespace, name, value, others=None):
|
||||||
|
# "Add metadata if metadata with this identifier does not already exist, otherwise update existing metadata."
|
||||||
|
# def add_item(self, item):
|
||||||
|
|
||||||
|
# def get_metadata(self, namespace, name):
|
||||||
|
# def get_item_with_id(self, uid):
|
||||||
|
# """
|
||||||
|
# Returns item for defined UID.
|
||||||
|
|
||||||
|
# >>> book.get_item_with_id('image_001')
|
||||||
|
|
||||||
|
# :Args:
|
||||||
|
# - uid: UID for the item
|
||||||
|
|
||||||
|
# :Returns:
|
||||||
|
# Returns item object. Returns None if nothing was found.
|
||||||
|
# """
|
||||||
|
|
||||||
|
# def get_item_with_href(self, href):
|
||||||
|
# """
|
||||||
|
# Returns item for defined HREF.
|
||||||
|
|
||||||
|
# >>> book.get_item_with_href('EPUB/document.xhtml')
|
||||||
|
|
||||||
|
# :Args:
|
||||||
|
# - href: HREF for the item we are searching for
|
||||||
|
|
||||||
|
# :Returns:
|
||||||
|
# Returns item object. Returns None if nothing was found.
|
||||||
|
# """
|
||||||
|
|
||||||
|
# def get_items(self):
|
||||||
|
# def get_items_of_media_type(self, media_type):
|
||||||
|
# def get_items_of_type(self, item_type):
|
||||||
|
# """
|
||||||
|
# Returns all items of specified type.
|
||||||
|
|
||||||
|
# >>> book.get_items_of_type(epub.ITEM_IMAGE)
|
||||||
|
|
||||||
|
# :Args:
|
||||||
|
# - item_type: Type for items we are searching for
|
||||||
|
|
||||||
|
# :Returns:
|
||||||
|
# Returns found items as a generator.
|
||||||
|
# """
|
||||||
|
# return (item for item in self.items if item.get_type() == item_type)
|
||||||
|
|
||||||
|
|
||||||
|
# def get_template(self, name):
|
||||||
|
# def set_template(self, name, value):
|
||||||
|
# """
|
||||||
|
# Defines templates which are used to generate certain types of pages. When defining new value for the template
|
||||||
|
# we have to use content of type 'str' (Python 2) or 'bytes' (Python 3).
|
||||||
|
|
||||||
|
# At the moment we use these templates:
|
||||||
|
# - ncx
|
||||||
|
# - nav
|
||||||
|
# - chapter
|
||||||
|
# - cover
|
||||||
|
|
||||||
|
# :Args:
|
||||||
|
# - name: Name for the template
|
||||||
|
# - value: Content for the template
|
||||||
|
# """
|
||||||
|
|
||||||
|
|
||||||
|
# def add_prefix(self, name, uri):
|
||||||
|
# """
|
||||||
|
# Appends custom prefix to be added to the content.opf document
|
||||||
|
|
||||||
|
# >>> epub_book.add_prefix('bkterms', 'http://booktype.org/')
|
||||||
|
|
||||||
|
# :Args:
|
||||||
|
# - name: namespace name
|
||||||
|
# - uri: URI for the namespace
|
||||||
|
# """
|
||||||
|
|
||||||
|
return book
|
||||||
|
|
||||||
|
def writeEpubBook(name, book, options=None):
|
||||||
|
"""
|
||||||
|
Creates epub file with the content defined in EpubBook.
|
||||||
|
|
||||||
|
>>> writeEpubBook('book.epub', book)
|
||||||
|
|
||||||
|
:Args:
|
||||||
|
- name: file name for the output file
|
||||||
|
- book: instance of EpubBook
|
||||||
|
- options: extra options as dictionary (optional)
|
||||||
|
"""
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
try:
|
||||||
|
epub = ebooklib.EpubWriter(name, book, options)
|
||||||
|
epub.process()
|
||||||
|
epub.write()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Exception {}.".format(e))
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
@@ -112,9 +254,9 @@ def main():
|
|||||||
with zipfile.ZipFile(args.zip, 'r') as zip:
|
with zipfile.ZipFile(args.zip, 'r') as zip:
|
||||||
parseZipFile(zip, content)
|
parseZipFile(zip, content)
|
||||||
|
|
||||||
logger.info("Loaded {} parts.".format(len(content)))
|
enrichContent(content)
|
||||||
for name in content.keys():
|
book = createEpubBook(content)
|
||||||
logger.info("{}".format(name))
|
writeEpubBook(args.epub, book)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Submodule modules/ebooklib updated: 00a3c6e064...6a004d4ae3
Reference in New Issue
Block a user