Parsing and loading content.
This commit is contained in:
2
Makefile
2
Makefile
@@ -12,7 +12,7 @@ test_all: $(addprefix test_,$(TEST_FILES))
|
|||||||
.PHONY: $(addprefix test_,$(TEST_FILES))
|
.PHONY: $(addprefix test_,$(TEST_FILES))
|
||||||
$(addprefix test_,$(TEST_FILES)): test_%:
|
$(addprefix test_,$(TEST_FILES)): test_%:
|
||||||
@echo Testing $*
|
@echo Testing $*
|
||||||
@python -v gragir/__main__.py $* test_out/$(*F).epub
|
@python gragir/__main__.py -v $* test_out/$(*F).epub
|
||||||
|
|
||||||
#python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub
|
#python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub
|
||||||
|
|
||||||
|
|||||||
@@ -4,9 +4,9 @@
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# Standard library modules do the heavy lifting. Ours is all simple stuff.
|
# Standard library modules do the heavy lifting. Ours is all simple stuff.
|
||||||
# import base64
|
import base64
|
||||||
# import email, email.message
|
import email, email.message
|
||||||
# import mimetypes
|
import mimetypes
|
||||||
# import os
|
# import os
|
||||||
# import quopri
|
# import quopri
|
||||||
|
|
||||||
@@ -40,15 +40,51 @@ def parseArguments():
|
|||||||
def validateMht(fileName):
|
def validateMht(fileName):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def parseMhtFile(mhtFileName):
|
def parseMht(mht, content):
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.info(mhtFileName)
|
|
||||||
pass
|
|
||||||
|
|
||||||
def parseZipFile( zip ):
|
mhtContent = email.message_from_bytes(mht)
|
||||||
|
|
||||||
|
parts = mhtContent.get_payload()
|
||||||
|
# Multiple parts, usually? If single 'str' part, then convert to a list.
|
||||||
|
if not type(parts) is list:
|
||||||
|
parts = [mhtContent]
|
||||||
|
|
||||||
|
logger.info(' Number of parts: {}'.format(len(parts)))
|
||||||
|
|
||||||
|
# Save all parts to files.
|
||||||
|
for p in parts: # walk() for a tree, but I'm guessing MHT is never nested?
|
||||||
|
#??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts.
|
||||||
|
ct = p.get_content_type()
|
||||||
|
# String coerced to lower case of the form maintype/subtype, else get_default_type().
|
||||||
|
fp = p.get("content-location") or "index.html" # File path. Expecting root HTML is only part with no location.
|
||||||
|
|
||||||
|
logger.info(' Content type: {}, Location: {}, Size: {}'
|
||||||
|
.format(ct, fp, len(p.get_payload())))
|
||||||
|
|
||||||
|
content[fp] = p.get_payload(decode=True)
|
||||||
|
# Create directories as necessary.
|
||||||
|
# if os.path.dirname(fp):
|
||||||
|
# os.makedirs(os.path.dirname(fp), exist_ok=True)
|
||||||
|
|
||||||
|
# # Save part's body to a file.
|
||||||
|
# open(fp, "wb").write(p.get_payload(decode=True))
|
||||||
|
|
||||||
|
def parseMhtFile(zip, mhtInfo, content):
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.info('Parsing {}, size: {}, csize: {} '
|
||||||
|
.format(mhtInfo.filename,
|
||||||
|
mhtInfo.file_size,
|
||||||
|
mhtInfo.compress_size))
|
||||||
|
|
||||||
|
with zip.open(mhtInfo) as mht:
|
||||||
|
parseMht(mht.read(), content)
|
||||||
|
|
||||||
|
|
||||||
|
def parseZipFile(zip, content):
|
||||||
for zipMember in zip.infolist():
|
for zipMember in zip.infolist():
|
||||||
if validateMht(zipMember):
|
if validateMht(zipMember):
|
||||||
parseMhtFile(zipMember.filename)
|
parseMhtFile(zip, zipMember, content)
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -71,8 +107,15 @@ def main():
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.info("Parsing {}.".format(args.zip))
|
logger.info("Parsing {}.".format(args.zip))
|
||||||
|
|
||||||
|
content = {}
|
||||||
|
|
||||||
with zipfile.ZipFile(args.zip, 'r') as zip:
|
with zipfile.ZipFile(args.zip, 'r') as zip:
|
||||||
parseZipFile(zip)
|
parseZipFile(zip, content)
|
||||||
|
|
||||||
|
logger.info("Loaded {} parts.".format(len(content)))
|
||||||
|
for name in content.keys():
|
||||||
|
logger.info("{}".format(name))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main() # Kindda useless if we're not using doctest or anything?
|
main()
|
||||||
|
|||||||
Reference in New Issue
Block a user