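"""Article text extraction for bookie.

Strips scripts, styles, navigation, and site-specific clutter from
fetched HTML and returns the main article content as cleaned HTML or
plain text.
"""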
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin

from bookie import app
def html_parse(raw_html, url, paragraphs=True):
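    """Extract the main article content from raw_html.

    url resolves relative image sources against the page's address.
    With paragraphs=True the result is cleaned HTML; with
    paragraphs=False it is plain text.
    """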
    strip_tags = False
    # An explicit parser keeps results consistent; bs4 warns when the
    # parser is left unspecified.
    soup = BS(raw_html, "html.parser")
for t in soup(["script","style","nav","header","aside","select","form", \
|
|
"link","meta","svg"]):
|
|
t.decompose()
|
|
for [tag, attr] in kill_list():
|
|
for t in soup.findAll(tag, attr):
|
|
t.decompose()
|
|
if soup.find("div", attrs={"class":"story-text"}):
|
|
app.logger.debug('Text import from <div class="story-text">')
|
|
text = soup.find("div", attrs={"class":"story-text"})
|
|
elif soup.find("div", attrs={"id":"article"}):
|
|
print('Text import from <div id="article">')
|
|
text = soup.find("div", attrs={"id":"article"})
|
|
elif soup.find("div", attrs={"id":"articleBody"}):
|
|
print('Text import from <div id="articleBody">')
|
|
text = soup.find("div", attrs={"id":"articleBody"})
|
|
elif soup.find("div", attrs={"class":"articleBody"}):
|
|
print('Text import from <div class="articleBody">')
|
|
text = soup.find("div", attrs={"class":"articleBody"})
|
|
elif soup.find("div", attrs={"class":"post"}):
|
|
print('Text import from <div class="post">')
|
|
text = soup.find("div", attrs={"class":"post"})
|
|
elif soup.find("div", attrs={"class":"post-content"}):
|
|
print('Text import from <div class="post-content">')
|
|
text = soup.find("div", attrs={"class":"post-content"})
|
|
elif soup.find("div", attrs={"class":"article-content"}):
|
|
print('Text import from <div class="article-content">')
|
|
text = soup.find("div", attrs={"class":"article-content"})
|
|
elif soup.find("div", attrs={"class":"story-content"}):
|
|
print('Text import from <div class="story-content">')
|
|
text = soup.find("div", attrs={"class":"story-content"})
|
|
elif soup.find("div", attrs={"class":"content"}):
|
|
print('Text import from <div class="content">')
|
|
text = soup.find("div", attrs={"class":"content"})
|
|
elif soup.find("article"):
|
|
print('Text import from from <article>')
|
|
text = soup.find("article")
|
|
elif soup.find("div", attrs={"id":"page"}):
|
|
print('Text import from <div id="page">')
|
|
text = soup.find("div", attrs={"id":"page"})
|
|
else:
|
|
text = soup("body")[0]
|
|
strip_tags = True
|
|
|
|
    if paragraphs:
        # Constrain images and resolve their URLs against the page.
        for t in text('img'):
            t['style'] = "max-width:600px;max-height:600px;"
            try:
                t['src'] = urljoin(url, t['src'])
            except KeyError:
                pass  # <img> without a src attribute
        # Drop per-site classes and inline styles from divs.
        for t in text("div"):
            del t['class']
            del t['style']
        # Normalize embeds to a fixed, centered size.
        for t in text("iframe"):
            del t['height']
            del t['width']
            t['style'] = "max-width:600px;max-height:600px;margin:0em auto;display:block;"

        if strip_tags:
            # <body> fallback: rebuild paragraphs from the bare text.
            lines = (line.strip() for line in text.get_text().splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            output = '<p>' + '</p><p>'.join(chunk for chunk in chunks if chunk) + '</p>'
        else:
            # Known container: keep its cleaned HTML.
            lines = (line.strip() for line in text.prettify().splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            output = '\n'.join(chunk for chunk in chunks if chunk)
    else:
        # Plain-text output.
        lines = (line.strip() for line in text.get_text().splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        output = '\n'.join(chunk for chunk in chunks if chunk)

    return output
def kill_list():
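    """Return (tag, attrs) pairs for elements to strip before parsing."""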
    return [
        ("div", {"id": "comments"}),
        ("div", {"class": "video"}),
        ("div", {"class": "m-linkset"}),
        ("div", {"class": "m-feature__intro"}),
        ("div", {"class": "m-share-buttons"}),
        ("p", {"class": "m-entry__byline"}),
        ("div", {"class": "social"}),
        ("div", {"id": "follow-bar"}),
        ("section", {"class": "m-rail-component"}),
    ]
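# Minimal usage sketch (illustrative only): the requests dependency and
# the example URL are assumptions for demonstration, not part of bookie.
if __name__ == "__main__":
    import requests

    page = requests.get("https://example.com/some-article")
    print(html_parse(page.text, page.url))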