HTML enrichment is separated into a new module.
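In short, the parse/DAG/ordering passes that previously ran as free functions next to main() now sit behind a single entry point. A minimal sketch of the new call shape, matching main() in the diff below (file names here are illustrative):

    from book import Book
    from parse_mhtml import parseMhtmlZipFile
    from enrich_html import EnrichHtml

    book = Book("output.epub")               # illustrative target path
    parseMhtmlZipFile("capture.zip", book)   # fill book.content with captured items
    EnrichHtml.enrich(book)                  # parse, link, order, and log them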
@@ -14,12 +14,11 @@ import sys
import logging
import argparse

import urllib.parse as urlparse
import ebooklib.epub as ebooklib
from bs4 import BeautifulSoup

from book import Book, Item
from parse_mhtml import parseMhtmlZipFile
from enrich_html import EnrichHtml

def parseArguments():
    """
@@ -48,84 +47,6 @@ def configLogger(args):
        format='%(message)s',
        level=loggingLevel)

def parseHtml(book):
    logger = logging.getLogger(__name__)
    logger.info("Loaded {} parts.".format(len(book.content)))
    for item in book.content.values():
        logger.info("Enriching {} {}".format(item.content_type, item.url))
        if item.content_type == 'text/html':
            item.soup = BeautifulSoup(item.payload, "lxml")
            if hasattr(item.soup, 'title') and item.soup.title:
                item.title = item.soup.title.string
            else:
                logger.info("No title for {}".format(item.url))


def createDAG(book):
    logger = logging.getLogger(__name__)
    for item in book.content.values():
        if hasattr(item, 'soup'):
            if hasattr(item.soup, 'title') and item.soup.title:
                logger.info("Title {}".format(item.soup.title.string))
            else:
                logger.info("No title for {}".format(item.url))

            links = item.soup.find_all('a')
            for link in links:
                href = link.get('href')
                if not href:
                    continue
                parsed_href = urlparse.urlsplit(href)
                url = \
                    urlparse.SplitResult(parsed_href.scheme,
                                         parsed_href.netloc,
                                         parsed_href.path,
                                         parsed_href.query,
                                         None).geturl()

                if url in book.content:
                    book.content[url].needed_by.add(item.url)
                    item.needs.add(url)
                elif href:
                    logger.info(" referred but no item exists: {}".format(url))

            # Try to get prev chapter.
            links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
            if len(links):
                item.prev = links[0].get('href')

            # Try to get next chapter.
            links = item.soup.find_all('a', attrs={"class": "next nav-link"})
            if len(links):
                item.next = links[0].get('href')

            # Try to find content.
            item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
            if len(item_content) == 1:
                item.content = item_content[0]
            else:
                logger.error("No content found: {}".format(item.url))
                item.remove = True

    for item in book.content.values():
        if hasattr(item, 'soup') \
                and not hasattr(item, 'prev') \
                and not hasattr(item, 'remove'):
            if book.first:
                logger.error("Multiple begin points found. {} and {}"
                             .format(book.first.url, item.url))
                raise Exception("Multiple begin points found.")
            else:
                book.first = item

    for item in book.content.values():
        logger.info("Item: {}".format(item.url))
        if hasattr(item, 'prev'):
            logger.info(" Prev: {}".format(item.prev))
        if hasattr(item, 'next'):
            logger.info(" Next: {}".format(item.next))
        for url in item.needs:
            logger.info(" Needs: {}".format(url))


# for name in content.keys():
@@ -289,8 +210,8 @@ def main():
    book = Book(args.epub)

    parseMhtmlZipFile(args.zip, book)
    parseHtml(book)
    createDAG(book)
    EnrichHtml.enrich(book)
    #createDAG(book)
    #createEpubBook(book)

@@ -10,6 +10,7 @@ class Item(object):
        self.payload = payload
        self.needed_by = set()
        self.needs = set()
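        # Parsed DOM; filled in by EnrichHtml.parse for text/html items.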
        self.soup = None

class Book(object):

gragir/enrich_html.py (new file, 126 lines)
@@ -0,0 +1,126 @@
import logging
import urllib.parse as urlparse
from bs4 import BeautifulSoup

from book import Item, Book


class EnrichHtml(object):

    @classmethod
    def enrich(cls, book):
        logger = logging.getLogger(__name__)
        logger.info("BEGIN Html Enrichment {} items.".format(len(book.content)))
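        # Stage order matters: parse() fills item.soup, createDAG() and
        # populateContent() read it, createOrder() derives book.first,
        # and print() walks the resulting chain.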
        cls.parse(book)
        cls.createDAG(book)
        cls.populateContent(book)
        cls.createOrder(book)
        cls.print(book)
        logger.info("END Html Enrichment {} items.".format(len(book.content)))

    @classmethod
    def parse(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.content_type == 'text/html':
                logger.info("Parsing {} {}".format(item.content_type, item.url))
                item.soup = BeautifulSoup(item.payload, "lxml")
                if hasattr(item.soup, 'title') and item.soup.title:
                    item.title = item.soup.title.string
                else:
                    logger.info("No title for {}".format(item.url))
            else:
                logger.info("Skipping {} {}".format(item.content_type, item.url))

    @classmethod
    def createDAG(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                logger.info("Create DAG {}".format(item.url))

                links = item.soup.find_all('a')
                for link in links:
                    href = link.get('href')
                    if not href:
                        continue
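                    # Rebuild the href without its fragment so links to
                    # in-page anchors resolve to the page's own item URL.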
                    parsed_href = urlparse.urlsplit(href)
                    url = \
                        urlparse.SplitResult(parsed_href.scheme,
                                             parsed_href.netloc,
                                             parsed_href.path,
                                             parsed_href.query,
                                             None).geturl()

                    if url in book.content:
                        book.content[url].needed_by.add(item.url)
                        item.needs.add(url)
                    elif href:
                        logger.info(" referred but no item exists: {}".format(url))

    @classmethod
    def populateContent(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                # Try to find content.
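                # ("sbo-rt-content" is assumed to be the id of the main
                # content container in the captured pages.)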
                item_content = item.soup.find_all('div', attrs={"id": "sbo-rt-content"})
                if len(item_content) == 1:
                    item.content = item_content[0]
                else:
                    logger.error("No content found: {}".format(item.url))
                    item.remove = True

    @classmethod
    def createOrder(cls, book):
        logger = logging.getLogger(__name__)
        for item in book.content.values():
            if item.soup is not None:
                # Try to get prev chapter.
                links = item.soup.find_all('a', attrs={"class": "prev nav-link"})
                if len(links):
                    item.prev = links[0].get('href')

                # Try to get next chapter.
                links = item.soup.find_all('a', attrs={"class": "next nav-link"})
                if len(links):
                    item.next = links[0].get('href')

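        # The first chapter is the only parsed item that has no prev link
        # and was not flagged for removal.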
        for item in book.content.values():
            if item.soup is not None \
                    and not hasattr(item, 'prev') \
                    and not hasattr(item, 'remove'):
                if book.first:
                    logger.error("Multiple begin points found. {} and {}"
                                 .format(book.first.url, item.url))
                    raise Exception("Multiple begin points found.")
                else:
                    book.first = item

    @classmethod
    def getTitle(cls, item):
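        # Prefer the parsed <title>; fall back to the item's URL.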
        if hasattr(item.soup, 'title') and item.soup.title:
            return item.soup.title.string
        else:
            return item.url


    @classmethod
    def print(cls, book):
        logger = logging.getLogger(__name__)
        item = book.first
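        # Walk the chapter chain from the first item by following next links.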
        while item is not None:
            logger.info("Item: {}".format(cls.getTitle(item)))
            if hasattr(item, 'prev'):
                logger.info(" Prev: {}".format(item.prev))
            if hasattr(item, 'next'):
                logger.info(" Next: {}".format(item.next))
            for url in item.needs:
                logger.info(" Needs: {}".format(url))
            logger.info("")

            if hasattr(item, 'next'):
                item = book.content[item.next]
            else:
                item = None