HTML enrichment is separated into a new module.

This commit is contained in:
2018-09-11 07:21:25 +01:00
parent 59deeff397
commit 14c25d79ed
3 changed files with 130 additions and 82 deletions

View File

@@ -14,12 +14,11 @@ import sys
import logging import logging
import argparse import argparse
import urllib.parse as urlparse
import ebooklib.epub as ebooklib import ebooklib.epub as ebooklib
from bs4 import BeautifulSoup
from book import Book, Item from book import Book, Item
from parse_mhtml import parseMhtmlZipFile from parse_mhtml import parseMhtmlZipFile
from enrich_html import EnrichHtml
def parseArguments(): def parseArguments():
""" """
@@ -48,84 +47,6 @@ def configLogger(args):
format='%(message)s', format='%(message)s',
level=loggingLevel) level=loggingLevel)
def parseHtml(book):
    """Parse every ``text/html`` item of ``book`` into a BeautifulSoup tree.

    For each HTML item the parsed tree is stored on ``item.soup`` and, when
    the document has a ``<title>``, its text is stored on ``item.title``.
    Items of any other content type are only logged and left untouched.

    Args:
        book: object whose ``content`` mapping holds the items to parse;
            each item exposes ``content_type``, ``url`` and ``payload``.
    """
    log = logging.getLogger(__name__)
    log.info("Loaded {} parts.".format(len(book.content)))

    for part in book.content.values():
        log.info("Enriching {} {}".format(part.content_type, part.url))
        if part.content_type != 'text/html':
            continue

        part.soup = BeautifulSoup(part.payload, "lxml")
        title_tag = part.soup.title if hasattr(part.soup, 'title') else None
        if title_tag:
            part.title = title_tag.string
        else:
            log.info("No title for {}".format(part.url))
def createDAG(book):
    """Cross-link the parsed HTML items of ``book`` and pick the first chapter.

    For every item that carries a parsed ``soup`` this:

    * records which other items of the book it links to (``item.needs``)
      and the reverse relation (``needed_by`` on the target item);
    * stores the href of the "prev"/"next" navigation links, when present,
      on ``item.prev`` / ``item.next``;
    * stores the single ``sbo-rt-content`` div on ``item.content``, or marks
      the item with ``item.remove = True`` when it cannot be found.

    The one item that has a soup, no ``prev`` link and is not marked for
    removal becomes ``book.first``.

    Args:
        book: object with a ``content`` mapping (URL -> item) and a
            ``first`` attribute that is falsy until a start item is chosen.

    Raises:
        Exception: if more than one candidate start item is found.
    """
    logger = logging.getLogger(__name__)

    for item in book.content.values():
        soup = getattr(item, 'soup', None)
        if soup is None:
            # Not an HTML item (or not parsed): nothing to analyse.
            continue

        title = getattr(soup, 'title', None)
        if title:
            logger.info("Title {}".format(title.string))
        else:
            logger.info("No title for {}".format(item.url))

        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            # Book items are keyed by URL without fragment, so drop it
            # before looking the target up.
            url = urlparse.urlsplit(href)._replace(fragment='').geturl()
            if url in book.content:
                book.content[url].needed_by.add(item.url)
                item.needs.add(url)
            else:
                logger.info(" refered but no item exist: {}".format(url))

        # Try to get prev chapter.
        links = soup.find_all('a', attrs={"class": "prev nav-link"})
        if links:
            item.prev = links[0].get('href')

        # Try to get next chapter.
        links = soup.find_all('a', attrs={"class": "next nav-link"})
        if links:
            item.next = links[0].get('href')

        # Try to find content.
        item_content = soup.find_all('div', attrs={"id": "sbo-rt-content"})
        if len(item_content) == 1:
            item.content = item_content[0]
        else:
            logger.error("No content found: {}".format(item.url))
            item.remove = True

    for item in book.content.values():
        if getattr(item, 'soup', None) is None \
                or hasattr(item, 'prev') \
                or hasattr(item, 'remove'):
            continue
        if book.first:
            # Report both competing start items (the previous code used an
            # undefined name here and died with NameError instead).
            logger.error("Multiple begin points found. {} and {}"
                         .format(book.first.url, item.url))
            raise Exception("Multiple begin points found.")
        book.first = item

    for item in book.content.values():
        logger.info("Item: {}".format(item.url))
        if hasattr(item, 'prev'):
            logger.info(" Prev: {}".format(item.prev))
        if hasattr(item, 'next'):
            logger.info(" Next: {}".format(item.next))
        for url in item.needs:
            logger.info(" Needs: {}".format(url))
# for name in content.keys(): # for name in content.keys():
@@ -289,8 +210,8 @@ def main():
book = Book(args.epub) book = Book(args.epub)
parseMhtmlZipFile(args.zip, book) parseMhtmlZipFile(args.zip, book)
parseHtml(book) EnrichHtml.enrich(book)
createDAG(book) #createDAG(book)
#createEpubBook(book) #createEpubBook(book)

View File

@@ -10,6 +10,7 @@ class Item(object):
self.payload = payload self.payload = payload
self.needed_by = set() self.needed_by = set()
self.needs = set() self.needs = set()
self.soup = None
class Book(object): class Book(object):

126
gragir/enrich_html.py Normal file
View File

@@ -0,0 +1,126 @@
import logging
import urllib.parse as urlparse
from bs4 import BeautifulSoup
from book import Item, Book
class EnrichHtml(object):
    """Enrich the HTML items of a book with parse trees, links and order.

    Every method is a classmethod working on a ``book`` whose ``content``
    mapping goes from URL to item and whose ``first`` attribute is falsy
    until the first chapter has been determined. Items are expected to
    expose ``url``, ``content_type``, ``payload``, ``soup`` (``None`` until
    parsed), ``needs`` and ``needed_by``.
    """

    @classmethod
    def enrich(cls, book):
        """Run all enrichment steps on ``book`` in order.

        Raises:
            Exception: propagated from ``createOrder`` when more than one
                candidate first chapter exists.
        """
        logger = logging.getLogger(__name__)
        logger.info("BEGIN Html Enrichment {} items.".format(len(book.content)))
        cls.parse(book)
        cls.createDAG(book)
        cls.populateContent(book)
        cls.createOrder(book)
        cls.print(book)
        logger.info("END Html Enrichment {} items.".format(len(book.content)))

    @classmethod
    def parse(cls, book):
        """Parse each ``text/html`` item into ``item.soup`` and set ``item.title``.

        Items of other content types are logged and skipped. ``item.title``
        is only set when the document has a ``<title>`` element.
        """
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.content_type != 'text/html':
                logger.info("Skipping {} {}".format(item.content_type, item.url))
                continue

            logger.info("Parsing {} {}".format(item.content_type, item.url))
            item.soup = BeautifulSoup(item.payload, "lxml")
            title = getattr(item.soup, 'title', None)
            if title:
                item.title = title.string
            else:
                logger.info("No title for {}".format(item.url))

    @classmethod
    def createDAG(cls, book):
        """Record which book items each parsed item links to.

        For every ``<a href>`` whose target (ignoring the fragment) is a key
        of ``book.content``, the target URL is added to ``item.needs`` and
        the linking item's URL to the target's ``needed_by``.
        """
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is None:
                continue

            logger.info("Create DAG {}".format(item.url))
            for link in item.soup.find_all('a'):
                href = link.get('href')
                if not href:
                    continue
                # Book items are keyed by URL without fragment.
                url = urlparse.urlsplit(href)._replace(fragment='').geturl()
                if url in book.content:
                    book.content[url].needed_by.add(item.url)
                    item.needs.add(url)
                else:
                    logger.info(" refered but no item exist: {}".format(url))

    @classmethod
    def populateContent(cls, book):
        """Store the ``sbo-rt-content`` div of each parsed item on ``item.content``.

        Items without exactly one such div are flagged with
        ``item.remove = True`` and an error is logged.
        """
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is None:
                continue

            # Try to find content.
            item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
            if len(item_content) == 1:
                item.content = item_content[0]
            else:
                logger.error("No content found: {}".format(item.url))
                item.remove = True

    @classmethod
    def createOrder(cls, book):
        """Extract prev/next navigation hrefs and choose ``book.first``.

        ``item.prev`` / ``item.next`` are only set when the corresponding
        navigation link exists. The first chapter is the parsed item that
        has no ``prev`` and is not flagged for removal.

        Raises:
            Exception: if more than one such item exists.
        """
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is None:
                continue

            # Try to get prev chapter.
            links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
            if links:
                item.prev = links[0].get('href')

            # Try to get next chapter.
            links = item.soup.find_all('a', attrs={"class": "next nav-link"})
            if links:
                item.next = links[0].get('href')

        for item in book.content.values():
            if item.soup is None \
                    or hasattr(item, 'prev') \
                    or hasattr(item, 'remove'):
                continue
            if book.first:
                # Name both competing start items, not the same one twice.
                logger.error("Multiple begin points found. {} and {}"
                             .format(book.first.url, item.url))
                raise Exception("Multiple begin points found.")
            book.first = item

    @classmethod
    def getTitle(cls, item):
        """Return the ``<title>`` text of ``item``, or its URL when it has none."""
        title = getattr(item.soup, 'title', None)
        if title:
            return title.string
        return item.url

    @classmethod
    def print(cls, book):
        """Log the chapters in reading order, starting at ``book.first``.

        The walk follows ``item.next`` and stops when an item has no next
        link, when the next link does not name an item of the book, or when
        it would revisit an item (cyclic navigation links).
        """
        logger = logging.getLogger(__name__)
        visited = set()
        item = book.first
        while item is not None:
            visited.add(id(item))
            logger.info("Item: {}".format(cls.getTitle(item)))
            if hasattr(item, 'prev'):
                logger.info(" Prev: {}".format(item.prev))
            if hasattr(item, 'next'):
                logger.info(" Next: {}".format(item.next))
            for url in item.needs:
                logger.info(" Needs: {}".format(url))
            logger.info("")

            # A dangling link yields None and ends the walk instead of
            # raising KeyError.
            following = book.content.get(getattr(item, 'next', None))
            if following is not None and id(following) in visited:
                # Navigation loops back on itself; stop rather than spin.
                following = None
            item = following