HTML enrichment is separated into a new module.

2018-09-11 07:21:25 +01:00
parent 59deeff397
commit 14c25d79ed
3 changed files with 130 additions and 82 deletions


@@ -14,12 +14,11 @@ import sys
import logging
import argparse
import urllib.parse as urlparse
import ebooklib.epub as ebooklib
from bs4 import BeautifulSoup
from book import Book, Item
from parse_mhtml import parseMhtmlZipFile
from enrich_html import EnrichHtml
def parseArguments():
"""
@@ -48,84 +47,6 @@ def configLogger(args):
        format='%(message)s',
        level=loggingLevel)


def parseHtml(book):
    logger = logging.getLogger(__name__)
    logger.info("Loaded {} parts.".format(len(book.content)))
    for item in book.content.values():
        logger.info("Enriching {} {}".format(item.content_type, item.url))
        if item.content_type == 'text/html':
            item.soup = BeautifulSoup(item.payload, "lxml")
            if hasattr(item.soup, 'title') and item.soup.title:
                item.title = item.soup.title.string
            else:
                logger.info("No title for {}".format(item.url))


def createDAG(book):
    logger = logging.getLogger(__name__)
    for item in book.content.values():
        if hasattr(item, 'soup'):
            if hasattr(item.soup, 'title') and item.soup.title:
                logger.info("Title {}".format(item.soup.title.string))
            else:
                logger.info("No title for {}".format(item.url))
            links = item.soup.find_all('a')
            for link in links:
                href = link.get('href')
                if not href:
                    continue
                parsed_href = urlparse.urlsplit(href)
                # Rebuild the URL without its fragment so in-page
                # anchors resolve to the same item.
                url = \
                    urlparse.SplitResult(parsed_href.scheme,
                                         parsed_href.netloc,
                                         parsed_href.path,
                                         parsed_href.query,
                                         None).geturl()
                if url in book.content:
                    book.content[url].needed_by.add(item.url)
                    item.needs.add(url)
                elif href:
                    logger.info("    referred but no item exists: {}".format(url))
            # Try to get prev chapter.
            links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
            if len(links):
                item.prev = links[0].get('href')
            # Try to get next chapter.
            links = item.soup.find_all('a', attrs={"class": "next nav-link"})
            if len(links):
                item.next = links[0].get('href')
            # Try to find content.
            item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
            if len(item_content) == 1:
                item.content = item_content[0]
            else:
                logger.error("No content found: {}".format(item.url))
                item.remove = True
    for item in book.content.values():
        if hasattr(item, 'soup') \
                and not hasattr(item, 'prev') \
                and not hasattr(item, 'remove'):
            if book.first:
                logger.error("Multiple begin points found. {} and {}"
                             .format(book.first.url, item.url))
                raise Exception("Multiple begin points found.")
            else:
                book.first = item
    for item in book.content.values():
        logger.info("Item: {}".format(item.url))
        if hasattr(item, 'prev'):
            logger.info("  Prev: {}".format(item.prev))
        if hasattr(item, 'next'):
            logger.info("  Next: {}".format(item.next))
        for url in item.needs:
            logger.info("  Needs: {}".format(url))
# for name in content.keys():
@@ -289,8 +210,8 @@ def main():
    book = Book(args.epub)
    parseMhtmlZipFile(args.zip, book)
    parseHtml(book)
    createDAG(book)
    EnrichHtml.enrich(book)
    # createDAG(book)
    # createEpubBook(book)
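
For context, EnrichHtml.enrich(book) now replaces the inlined parseHtml/createDAG calls shown above. A minimal driver sketch under that assumption, with hypothetical file paths standing in for the parsed argparse values:

import logging
from book import Book
from parse_mhtml import parseMhtmlZipFile
from enrich_html import EnrichHtml

logging.basicConfig(format='%(message)s', level=logging.INFO)
book = Book("book.epub")                 # hypothetical output path (args.epub)
parseMhtmlZipFile("capture.zip", book)   # hypothetical input archive (args.zip)
EnrichHtml.enrich(book)                  # parse, link, populate, order, print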


@@ -10,6 +10,7 @@ class Item(object):
        self.payload = payload
        self.needed_by = set()
        self.needs = set()
        self.soup = None


class Book(object):
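
For reference, the Item constructor after this change would look roughly like the sketch below; the attribute names come from the diff, while the constructor parameters url and content_type are assumptions based on how items are used in enrich_html.py:

class Item(object):
    def __init__(self, url, content_type, payload):  # signature is assumed
        self.url = url                    # assumed: items are keyed by URL
        self.content_type = content_type  # assumed: checked in EnrichHtml.parse
        self.payload = payload
        self.needed_by = set()            # URLs of items that link here
        self.needs = set()                # URLs this item links to
        self.soup = None                  # parsed DOM, set by EnrichHtml.parse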

gragir/enrich_html.py (new file, 126 lines)

@@ -0,0 +1,126 @@
import logging
import urllib.parse as urlparse
from bs4 import BeautifulSoup
from book import Item, Book


class EnrichHtml(object):

    @classmethod
    def enrich(cls, book):
        logger = logging.getLogger(__name__)
        logger.info("BEGIN Html Enrichment {} items.".format(len(book.content)))
        cls.parse(book)
        cls.createDAG(book)
        cls.populateContent(book)
        cls.createOrder(book)
        cls.print(book)
        logger.info("END Html Enrichment {} items.".format(len(book.content)))

    @classmethod
    def parse(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.content_type == 'text/html':
                logger.info("Parsing {} {}".format(item.content_type, item.url))
                item.soup = BeautifulSoup(item.payload, "lxml")
                if hasattr(item.soup, 'title') and item.soup.title:
                    item.title = item.soup.title.string
                else:
                    logger.info("No title for {}".format(item.url))
            else:
                logger.info("Skipping {} {}".format(item.content_type, item.url))

    @classmethod
    def createDAG(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                logger.info("Create DAG {}".format(item.url))
                links = item.soup.find_all('a')
                for link in links:
                    href = link.get('href')
                    if not href:
                        continue
                    parsed_href = urlparse.urlsplit(href)
                    # Rebuild the URL without its fragment so in-page
                    # anchors resolve to the same item.
                    url = \
                        urlparse.SplitResult(parsed_href.scheme,
                                             parsed_href.netloc,
                                             parsed_href.path,
                                             parsed_href.query,
                                             None).geturl()
                    if url in book.content:
                        book.content[url].needed_by.add(item.url)
                        item.needs.add(url)
                    elif href:
                        logger.info("    referred but no item exists: {}".format(url))

    @classmethod
    def populateContent(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                # Try to find content.
                item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
                if len(item_content) == 1:
                    item.content = item_content[0]
                else:
                    logger.error("No content found: {}".format(item.url))
                    item.remove = True

    @classmethod
    def createOrder(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                # Try to get prev chapter.
                links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
                if len(links):
                    item.prev = links[0].get('href')
                # Try to get next chapter.
                links = item.soup.find_all('a', attrs={"class": "next nav-link"})
                if len(links):
                    item.next = links[0].get('href')
        for item in book.content.values():
            if item.soup is not None \
                    and not hasattr(item, 'prev') \
                    and not hasattr(item, 'remove'):
                if book.first:
                    logger.error("Multiple begin points found. {} and {}"
                                 .format(book.first.url, item.url))
                    raise Exception("Multiple begin points found.")
                else:
                    book.first = item

    @classmethod
    def getTitle(cls, item):
        if hasattr(item.soup, 'title') and item.soup.title:
            return item.soup.title.string
        else:
            return item.url

    @classmethod
    def print(cls, book):
        logger = logging.getLogger(__name__)
        item = book.first
        # Walk the chapter chain from the first item via the "next" links.
        while item is not None:
            logger.info("Item: {}".format(cls.getTitle(item)))
            if hasattr(item, 'prev'):
                logger.info("  Prev: {}".format(item.prev))
            if hasattr(item, 'next'):
                logger.info("  Next: {}".format(item.next))
            for url in item.needs:
                logger.info("  Needs: {}".format(url))
            logger.info("")
            if hasattr(item, 'next'):
                item = book.content[item.next]
            else:
                item = None
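
A standalone demonstration of the URL normalization used in createDAG above: rebuilding the split result with a None fragment drops in-page anchors, so links to "#sec-3" style targets resolve to the page itself. The href value is hypothetical:

import urllib.parse as urlparse

parsed = urlparse.urlsplit("ch02.html?v=1#sec-3")   # hypothetical href
url = urlparse.SplitResult(parsed.scheme, parsed.netloc,
                           parsed.path, parsed.query, None).geturl()
print(url)   # -> "ch02.html?v=1"; the "#sec-3" fragment is stripped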