# bookie-python/html_parse.py  (snapshot dated 2014-10-24 22:34:28 +00:00)
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin
from bookie import app
def html_parse(raw_html, url, paragraphs=True):
    """Extract readable article text from a raw HTML document.

    Strips scripts/navigation/boilerplate, locates the most likely article
    container from a prioritized list of known selectors, and returns either
    cleaned HTML (``paragraphs=True``) or plain text (``paragraphs=False``).

    :param raw_html: the full HTML document as a string/bytes.
    :param url: the page URL, used to absolutize relative image ``src`` values.
    :param paragraphs: when True, return HTML with images/iframes restyled;
                       when False, return whitespace-normalized plain text.
    :returns: a string of cleaned HTML or plain text.
    """
    strip_tags = False
    soup = BS(raw_html)
    # Drop elements that never contain article text.
    for t in soup(["script", "style", "nav", "header", "aside", "select",
                   "form", "link", "meta", "svg"]):
        t.decompose()
    # Drop site-specific boilerplate (comments, share buttons, etc.).
    for tag, attr in kill_list():
        for t in soup.findAll(tag, attr):
            t.decompose()
    # Candidate article containers, tried in priority order. This replaces
    # the former copy-pasted if/elif chain (which also inconsistently mixed
    # print() with app.logger.debug()).
    candidates = [
        ("div", {"class": "story-text"}),
        ("div", {"id": "article"}),
        ("div", {"id": "articleBody"}),
        ("div", {"class": "articleBody"}),
        ("div", {"class": "post"}),
        ("div", {"class": "post-content"}),
        ("div", {"class": "article-content"}),
        ("div", {"class": "story-content"}),
        ("div", {"class": "content"}),
        ("article", None),
        ("div", {"id": "page"}),
    ]
    text = None
    for tag, attrs in candidates:
        found = soup.find(tag, attrs=attrs) if attrs else soup.find(tag)
        if found is not None:
            app.logger.debug('Text import from <%s %s>', tag, attrs or '')
            text = found
            break
    if text is None:
        # No known container matched: fall back to <body> and emit
        # synthesized <p> markup from its bare text.
        text = soup("body")[0]
        strip_tags = True
    if paragraphs:
        for t in text('img'):
            t['style'] = "max-width:600px;max-height:600px;"
            try:
                # Absolutize relative image URLs against the page URL.
                t['src'] = urljoin(url, t['src'])
            except KeyError:
                # <img> with no src attribute (bs4 raises KeyError); was a
                # bare except that hid every other failure.
                pass
        for t in text("div"):
            # bs4 attribute deletion is silent when the attr is absent.
            del t['class']
            del t['style']
        for t in text("iframe"):
            del t['height']
            del t['width']
            t['style'] = "max-width:600px;max-height:600px;margin:0em auto;display:block;"
        if strip_tags:
            lines = (line.strip() for line in text.get_text().splitlines())
            # NOTE(review): split on double-space per the standard html-to-text
            # recipe; the scraped original showed a single space, most likely
            # collapsed whitespace — confirm against upstream history.
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            # Fix: separator was '</p></p>', which emitted unbalanced HTML;
            # paragraphs must be joined with a closing-then-opening tag pair.
            output = '<p>' + '</p><p>'.join(chunk for chunk in chunks if chunk) + '</p>'
        else:
            lines = (line.strip() for line in text.prettify().splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            output = '\n'.join(chunk for chunk in chunks if chunk)
    else:
        # Plain-text mode: collapse whitespace, one chunk per line.
        lines = (line.strip() for line in text.get_text().splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        output = '\n'.join(chunk for chunk in chunks if chunk)
    return output
def kill_list():
    """Return ``[tag, attrs]`` pairs identifying boilerplate elements
    (comment threads, share buttons, bylines, rails) that should be
    removed from a page before text extraction.
    """
    return [
        ["div", {"id": "comments"}],
        ["div", {"class": "video"}],
        ["div", {"class": "m-linkset"}],
        ["div", {"class": "m-feature__intro"}],
        ["div", {"class": "m-share-buttons"}],
        ["p", {"class": "m-entry__byline"}],
        ["div", {"class": "social"}],
        ["div", {"id": "follow-bar"}],
        ["section", {"class": "m-rail-component"}],
    ]