Parsing and loading content.
This commit is contained in:
2
Makefile
2
Makefile
@@ -12,7 +12,7 @@ test_all: $(addprefix test_,$(TEST_FILES))
|
||||
.PHONY: $(addprefix test_,$(TEST_FILES))
|
||||
$(addprefix test_,$(TEST_FILES)): test_%:
|
||||
@echo Testing $*
|
||||
@python -v gragir/__main__.py $* test_out/$(*F).epub
|
||||
@python gragir/__main__.py -v $* test_out/$(*F).epub
|
||||
|
||||
#python gragir/__main__.py samples/algorithms_third_edition_in_c.zip algorithms_third_edition_in_c.epub
|
||||
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
"""
|
||||
|
||||
# Standard library modules do the heavy lifting. Ours is all simple stuff.
|
||||
# import base64
|
||||
# import email, email.message
|
||||
# import mimetypes
|
||||
import base64
|
||||
import email, email.message
|
||||
import mimetypes
|
||||
# import os
|
||||
# import quopri
|
||||
|
||||
@@ -40,15 +40,51 @@ def parseArguments():
|
||||
def validateMht(fileName):
|
||||
return True
|
||||
|
||||
def parseMhtFile(mhtFileName):
|
||||
def parseMht(mht, content):
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info(mhtFileName)
|
||||
pass
|
||||
|
||||
def parseZipFile( zip ):
|
||||
mhtContent = email.message_from_bytes(mht)
|
||||
|
||||
parts = mhtContent.get_payload()
|
||||
# Multiple parts, usually? If single 'str' part, then convert to a list.
|
||||
if not type(parts) is list:
|
||||
parts = [mhtContent]
|
||||
|
||||
logger.info(' Number of parts: {}'.format(len(parts)))
|
||||
|
||||
# Save all parts to files.
|
||||
for p in parts: # walk() for a tree, but I'm guessing MHT is never nested?
|
||||
#??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts.
|
||||
ct = p.get_content_type()
|
||||
# String coerced to lower case of the form maintype/subtype, else get_default_type().
|
||||
fp = p.get("content-location") or "index.html" # File path. Expecting root HTML is only part with no location.
|
||||
|
||||
logger.info(' Content type: {}, Location: {}, Size: {}'
|
||||
.format(ct, fp, len(p.get_payload())))
|
||||
|
||||
content[fp] = p.get_payload(decode=True)
|
||||
# Create directories as necessary.
|
||||
# if os.path.dirname(fp):
|
||||
# os.makedirs(os.path.dirname(fp), exist_ok=True)
|
||||
|
||||
# # Save part's body to a file.
|
||||
# open(fp, "wb").write(p.get_payload(decode=True))
|
||||
|
||||
def parseMhtFile(zip, mhtInfo, content):
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info('Parsing {}, size: {}, csize: {} '
|
||||
.format(mhtInfo.filename,
|
||||
mhtInfo.file_size,
|
||||
mhtInfo.compress_size))
|
||||
|
||||
with zip.open(mhtInfo) as mht:
|
||||
parseMht(mht.read(), content)
|
||||
|
||||
|
||||
def parseZipFile(zip, content):
|
||||
for zipMember in zip.infolist():
|
||||
if validateMht(zipMember):
|
||||
parseMhtFile(zipMember.filename)
|
||||
parseMhtFile(zip, zipMember, content)
|
||||
else:
|
||||
pass
|
||||
|
||||
@@ -71,8 +107,15 @@ def main():
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info("Parsing {}.".format(args.zip))
|
||||
|
||||
content = {}
|
||||
|
||||
with zipfile.ZipFile(args.zip, 'r') as zip:
|
||||
parseZipFile(zip)
|
||||
parseZipFile(zip, content)
|
||||
|
||||
logger.info("Loaded {} parts.".format(len(content)))
|
||||
for name in content.keys():
|
||||
logger.info("{}".format(name))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main() # Kind of useless if we're not using doctest or anything?
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user