from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin

from bookie import app


def html_parse(raw_html, url, paragraphs=True):
    """Extract the main article content from raw_html; `url` is used to absolutise image links."""
    strip_tags = False
    # Naming the parser explicitly avoids BeautifulSoup's "no parser specified" warning.
    soup = BS(raw_html, "html.parser")

    # Remove elements that never carry article text.
    for t in soup(["script", "style", "nav", "header", "aside", "select",
                   "form", "link", "meta", "svg"]):
        t.decompose()

    # Remove site-specific clutter (comment sections, share buttons, ...).
    for tag, attrs in kill_list():
        for t in soup.find_all(tag, attrs):
            t.decompose()

    # Try known article containers, most specific first.
    if soup.find("div", attrs={"class": "story-text"}):
        app.logger.debug('Text import from <div class="story-text">')
        text = soup.find("div", attrs={"class": "story-text"})
    elif soup.find("div", attrs={"id": "article"}):
        print('Text import from <div id="article">')
        text = soup.find("div", attrs={"id": "article"})
    elif soup.find("div", attrs={"id": "articleBody"}):
        print('Text import from <div id="articleBody">')
        text = soup.find("div", attrs={"id": "articleBody"})
    elif soup.find("div", attrs={"class": "articleBody"}):
        print('Text import from <div class="articleBody">')
        text = soup.find("div", attrs={"class": "articleBody"})
    elif soup.find("div", attrs={"class": "post"}):
        print('Text import from <div class="post">')
        text = soup.find("div", attrs={"class": "post"})
    elif soup.find("div", attrs={"class": "post-content"}):
        print('Text import from <div class="post-content">')
        text = soup.find("div", attrs={"class": "post-content"})
    elif soup.find("div", attrs={"class": "article-content"}):
        print('Text import from <div class="article-content">')
        text = soup.find("div", attrs={"class": "article-content"})
    elif soup.find("div", attrs={"class": "story-content"}):
        print('Text import from <div class="story-content">')
        text = soup.find("div", attrs={"class": "story-content"})
    elif soup.find("div", attrs={"class": "content"}):
        print('Text import from <div class="content">')
        text = soup.find("div", attrs={"class": "content"})
    elif soup.find("article"):
        print('Text import from <article>')
        text = soup.find("article")
    elif soup.find("div", attrs={"id": "page"}):
        print('Text import from <div id="page">')
        text = soup.find("div", attrs={"id": "page"})
    else:
        # No recognised container: fall back to the whole <body> and strip markup later.
        text = soup("body")[0]
        strip_tags = True

    if paragraphs:
        # Constrain image size and resolve relative image URLs against the page URL.
        for t in text("img"):
            t["style"] = "max-width:600px;max-height:600px;"
            try:
                t["src"] = urljoin(url, t["src"])
            except Exception:
                pass  # e.g. <img> with no src attribute
        # Drop the site's own div styling.
        for t in text("div"):
            del t["class"]
            del t["style"]
        # Resize and centre embedded iframes.
        for t in text("iframe"):
            del t["height"]
            del t["width"]
            t["style"] = "max-width:600px;max-height:600px;margin:0em auto;display:block;"
        if strip_tags:
            # Fallback body text: rebuild readable paragraphs from the plain text.
            lines = (line.strip() for line in text.get_text().splitlines())
            # Split on double spaces to separate phrases that share a line.
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            output = '<p>' + '</p><p>'.join(chunk for chunk in chunks if chunk) + '</p>'
        else:
            # Keep the container's own markup, just normalise the whitespace.
            lines = (line.strip() for line in text.prettify().splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            output = '\n'.join(chunk for chunk in chunks if chunk)
    else:
        # Plain-text mode: strip all markup and collapse whitespace.
        lines = (line.strip() for line in text.get_text().splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        output = '\n'.join(chunk for chunk in chunks if chunk)

    return output


def kill_list():
    # (tag, attrs) pairs for elements to strip before extracting article text.
    return [
        ("div", {"id": "comments"}),
        ("div", {"class": "video"}),
        ("div", {"class": "m-linkset"}),
        ("div", {"class": "m-feature__intro"}),
        ("div", {"class": "m-share-buttons"}),
        ("p", {"class": "m-entry__byline"}),
        ("div", {"class": "social"}),
        ("div", {"id": "follow-bar"}),
        ("section", {"class": "m-rail-component"}),
    ]
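

# A rough usage sketch (an illustration, not part of the module): fetch a page
# with the `requests` library -- an assumed dependency, any HTTP client would
# do -- and feed the raw HTML plus its URL to html_parse(). The URL below is a
# placeholder.
if __name__ == "__main__":
    import requests

    demo_url = "https://example.com/some-article"  # hypothetical article URL
    resp = requests.get(demo_url, timeout=10)
    # paragraphs=True returns lightly cleaned HTML; paragraphs=False returns plain text.
    print(html_parse(resp.text, demo_url, paragraphs=True))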