# bookie-python/bookie.py
#
# 691 lines
# 23 KiB
# Python

import base64
import csv
import datetime
import io
import json
import os
import urllib
import urllib.parse
import urllib.request
from collections import OrderedDict
from subprocess import call
from xml.sax.saxutils import escape

from bs4 import BeautifulSoup as BS
from flask import Flask, render_template, redirect, url_for, request, Response
from flask.ext.login import current_user
from flask.ext.mongoengine import MongoEngine
from flask.ext.security import Security, UserMixin, RoleMixin, login_required, MongoEngineUserDatastore
from flask.ext.security.utils import encrypt_password
app = Flask(__name__)

#####
# Config Values
#####
app.config["MONGODB_DB"] = "bookie"
# The two secrets below may be overridden from the environment so that a real
# deployment does not have to run with values committed to source control.
# The fallbacks are the historical values, so an existing install that sets
# neither variable behaves exactly as before.
app.config['SECRET_KEY'] = os.environ.get('BOOKIE_SECRET_KEY', 'bobloblawlawblog')
# Relative folder that uploaded files are written to (see the /new view).
app.config['UPLOAD_FOLDER'] = "static/uploads"
app.config['SITE_URL'] = "http://localhost:5000"
app.config['SECURITY_PASSWORD_HASH'] = "bcrypt"
# NOTE(review): presumably changing the salt invalidates already stored
# password hashes -- confirm before overriding it on an existing database.
app.config['SECURITY_PASSWORD_SALT'] = os.environ.get(
    'BOOKIE_PASSWORD_SALT', 'asdfiqwnvonaosinva')

#####
# MongoDB Setup
#####
db = MongoEngine(app)
#####
# Classes
#####
class Role(db.Document, RoleMixin):
    """Role document; passed to ``MongoEngineUserDatastore`` as the role model."""
    # Role name: unique, at most 80 characters.
    name = db.StringField(max_length=80, unique=True)
    # Free-text description, at most 255 characters.
    description = db.StringField(max_length=255)
class User(db.Document, UserMixin):
    """User document; passed to ``MongoEngineUserDatastore`` as the user model."""
    email = db.StringField(max_length=255)
    password = db.StringField(max_length=255)
    # Accounts default to active.
    active = db.BooleanField(default=True)
    confirmed_at = db.DateTimeField()
    # References to the Role documents assigned to this user.
    roles = db.ListField(db.ReferenceField(Role), default=[])

    def encrypt_password(self, pw):
        """Return ``pw`` passed through the imported ``encrypt_password`` utility."""
        # Inside a method body the bare name resolves to the module-level
        # import from flask.ext.security.utils, not to this method.
        return encrypt_password(pw)
class Tag(db.Document):
    """A label that bookmarks reference through ``Bookmark.tags``."""
    # Tag text: required, unique, at most 25 characters.
    name = db.StringField(required=True, max_length=25, unique=True)
    # Optional note about the tag, at most 100 characters.
    note = db.StringField(required=False, max_length=100)

    def __repr__(self):
        # Fixed string; it does not include the tag name.
        return "Tag()"

    def __str__(self):
        # The tag renders as its name.
        return str(self.name)
class ArchivedText(db.Document):
    """Stored copy of a page: the fetched HTML and the text extracted from it."""
    # Address the HTML was fetched from.
    url = db.StringField(max_length=1000, required=True)
    # Set when the document is created; used to pick the newest or a specific version.
    created_at = db.DateTimeField(default=datetime.datetime.now, required=True)
    # Output of html_parse() for raw_html.
    text = db.StringField(required=True, default="")
    # The page source as returned by get_html().
    raw_html = db.StringField(required=True, default="")

    def get_html(self):
        """Fetch ``self.url`` and return the response body as ``str``.

        The body is decoded with the charset announced in the response
        headers, defaulting to UTF-8. If that fails (wrong or unknown
        charset) it is decoded as UTF-8 with undecodable bytes replaced, so
        the caller always receives real text rather than a ``b'...'`` repr.

        Raises:
            urllib.error.URLError: if the page cannot be retrieved.
        """
        app.logger.debug("Brewing an opener")
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor())
        app.logger.debug("Getting HTML")
        response = opener.open(self.url)
        try:
            raw_html = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
        finally:
            # Release the connection even when reading fails.
            response.close()
        if raw_html:
            app.logger.debug("HTML retrieved")
        try:
            return raw_html.decode(charset)
        except (UnicodeDecodeError, LookupError):
            # LookupError covers a charset name Python does not know.
            return raw_html.decode("utf-8", errors="replace")
class ArchivedImage(db.Document):
    """Record of an image rendered from a bookmarked URL."""
    # Address the image was rendered from.
    url = db.StringField(max_length=1000, required=True)
    # Set when the document is created; update_archived_image() embeds it in the file name.
    created_at = db.DateTimeField(default=datetime.datetime.now, required=True)
    # Path of the image file, at most 150 characters.
    path = db.StringField(required=True,max_length=150)
class Bookmark(db.Document):
    """A saved link (or uploaded file) addressed by a short code."""
    #Meta
    created_at = db.DateTimeField(default=datetime.datetime.now, required=True)
    url = db.StringField(max_length=1000, required=True)
    # Short code used in the site's URLs; unique per bookmark.
    short = db.StringField(max_length=25, required=True, unique=True)
    title = db.StringField(max_length=255, required=True)
    note = db.StringField(required=False)
    tags = db.ListField(db.ReferenceField(Tag))
    # When set, the short link redirects to the embed page instead of the url.
    image_embed = db.BooleanField(required=True, default=False)
    # archived_* / archived_*_needed / archived_*_ref: whether an archive
    # exists, whether one has been requested, and the current archive document.
    archived_text = db.BooleanField()
    archived_text_needed = db.BooleanField()
    archived_text_ref = db.ReferenceField(ArchivedText)
    archived_image = db.BooleanField()
    archived_image_needed = db.BooleanField()
    archived_image_ref = db.ReferenceField(ArchivedImage)
    unread = db.BooleanField()
    private = db.BooleanField(required=True, default=False)
    # Soft-delete flag; the listing views filter on it.
    deleted = db.BooleanField(required=True, default=False)
    source = db.StringField(max_length=255)
    #Metrics
    hits = db.IntField(required=True,default=0)
    factor = db.FloatField(required=True)

    def get_factor(self):
        """Return ``(len(short) + 14) / len(url)``, or 0.0 for an empty url.

        NOTE(review): the constant 14 presumably stands for the length of
        the site prefix in front of the short code -- confirm.
        """
        if not self.url:
            # An empty url used to raise ZeroDivisionError.
            return 0.0
        return (len(self.short) + 14) / len(self.url)

    def get_short(self):
        """Return a random 5-character URL-safe code that no bookmark uses yet."""
        while True:
            s = base64.urlsafe_b64encode(os.urandom(5))[0:5].decode('Latin-1')
            if Bookmark.objects(short=s).first() is None:
                return s

    meta = {
        'allow_inheritance': True,
        'indexes': ['-created_at', 'short'],
        'ordering': ['-created_at']
    }

    def __repr__(self):
        return "Bookmark()"

    def __str__(self):
        return str("Bookmark " + self.short)
#####
# Security
#####
# Datastore built from the mongoengine handle and the User / Role documents.
user_datastore = MongoEngineUserDatastore(db, User, Role)
# Attach Flask-Security to the app using that datastore.
security = Security(app, user_datastore)
######
# Helper Functions
######
# Function to update the archived text of a bookmark
# Inputs: Bookmark()
# Output: True / False
def update_archived_text(b, force=False, update_note=True):
    """Create or refresh the archived text of bookmark ``b``.

    Args:
        b: the Bookmark to archive.
        force: when true, always create a new ArchivedText and fetch the
            page again; otherwise reuse the newest ArchivedText stored for
            ``b.url`` and fetch only if it has no HTML yet.
        update_note: when true and the bookmark has no note (empty string or
            ``None``), fill the note with the first 250 characters of the
            extracted plain text.

    Returns:
        True. Fetch and database errors propagate as exceptions.
    """
    if force:
        t = ArchivedText.objects.create(url=b.url)
        t.raw_html = t.get_html()
    else:
        t = ArchivedText.objects(url=b.url).order_by("-created_at").first()
        if not t:
            t = ArchivedText.objects.create(url=b.url)
        if not hasattr(t, 'raw_html') or t.raw_html == "":
            t.raw_html = t.get_html()
    t.text = html_parse(t.raw_html, b.url, True)
    t.save()
    b.archived_text_ref = t
    # ``note`` is optional on the model, so it may be None as well as "".
    if update_note and not b.note:
        b.note = html_parse(t.raw_html, b.url, False)[:250]
    b.archived_text_needed = False
    b.archived_text = True
    b.save()
    return True
# Function to update the archived image of a bookmark
# Inputs: Bookmark()
# Output: True / False
def update_archived_image(b):
    """Render ``b.url`` to a JPEG with ``wkhtmltoimage`` and attach it to ``b``.

    The file is written to ``static/archive/images/<short>_<timestamp>.jpg``
    relative to the working directory.

    Returns:
        True when the image file exists afterwards and the bookmark was
        updated; False when no file was produced, in which case nothing is
        saved.
    """
    a = ArchivedImage()
    a.url = b.url
    ref = 'static/archive/images/'+b.short+'_'+a.created_at.strftime("%Y-%m-%d_%H%M%S")+'.jpg'
    app.logger.debug(ref)
    status = call(['/usr/bin/env','wkhtmltoimage',b.url,ref])
    # Check for the output file rather than the exit status, so that a
    # non-zero status which still produced an image is not treated as failure.
    if not os.path.isfile(ref):
        app.logger.warning("wkhtmltoimage (exit status %s) produced no file at %s", status, ref)
        return False
    a.path = '/'+ ref
    a.save()
    b.archived_image_ref = a
    b.archived_image_needed = False
    b.archived_image = True
    b.save()
    return True
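# A custom function to extract the key text from the raw html
# Inputs: raw HTML string, page URL (used to resolve relative image links), paragraphs flag
# Outputs: HTML string when paragraphs is True, newline-separated plain text otherwise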
def _find_content_root(soup):
    """Return ``(node, strip_tags)`` for the element to extract text from.

    The candidates are tried in order and the first match wins. When none
    matches, the ``<body>`` (or the whole document if there is no body) is
    returned with ``strip_tags`` set to True.
    """
    candidates = (
        ("div", {"class": "story-text"}, 'Text import from <div class="story-text">'),
        ("div", {"id": "article"}, 'Text import from <div id="article">'),
        ("div", {"id": "articleBody"}, 'Text import from <div id="articleBody">'),
        ("div", {"class": "articleBody"}, 'Text import from <div class="articleBody">'),
        ("div", {"class": "post"}, 'Text import from <div class="post">'),
        ("div", {"class": "post-content"}, 'Text import from <div class="post-content">'),
        ("div", {"class": "article-content"}, 'Text import from <div class="article-content">'),
        ("div", {"class": "story-content"}, 'Text import from <div class="story-content">'),
        ("div", {"class": "content"}, 'Text import from <div class="content">'),
        ("article", {}, 'Text import from from <article>'),
        ("div", {"id": "page"}, 'Text import from <div id="page">'),
    )
    for tag, attrs, message in candidates:
        node = soup.find(tag, attrs=attrs)
        if node:
            app.logger.debug(message)
            return node, False
    app.logger.debug('Text import from <body>')
    body = soup.find("body")
    # A fragment may have no <body>; fall back to the whole document
    # instead of raising IndexError.
    return (body if body is not None else soup), True


def _collapse_lines(raw_text):
    """Split text into stripped, non-empty phrases."""
    lines = (line.strip() for line in raw_text.splitlines())
    # Phrases are separated by runs of two spaces. Splitting on a single
    # space would turn every word into its own line/paragraph.
    phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
    return [phrase for phrase in phrases if phrase]


def html_parse(raw_html,url,paragraphs=True):
    """Extract the main content of a page.

    Args:
        raw_html: page source.
        url: address of the page; relative ``<img src>`` values are resolved
            against it.
        paragraphs: when true return HTML, otherwise newline-separated
            plain text.

    Returns:
        A string. With ``paragraphs`` true it is the prettified markup of
        the content element, or -- when only the body fallback matched --
        the text wrapped in ``<p>`` elements with markup characters escaped.
    """
    soup = BS(raw_html)
    # Drop elements that never carry article text.
    for t in soup(["script","style","nav","header","aside","select","form", \
                   "link","meta","svg"]):
        t.decompose()
    for [tag, attr] in kill_list():
        for t in soup.findAll(tag, attr):
            t.decompose()

    text, strip_tags = _find_content_root(soup)

    if not paragraphs:
        return '\n'.join(_collapse_lines(text.get_text()))

    for t in text('img'):
        t['style'] = "max-width:600px;max-height:600px;"
        src = t.get('src')
        if src:
            try:
                t['src'] = urllib.parse.urljoin(url, src)
            except ValueError:
                # Best effort: keep the original src if it cannot be joined.
                pass
    for t in text("div"):
        del t['class']
        del t['style']
    for t in text("iframe"):
        del t['height']
        del t['width']
        t['style'] = "max-width:600px;max-height:600px;margin:0em auto;display:block;"

    if strip_tags:
        # get_text() yields decoded text, so escape it before wrapping it in
        # markup again.
        chunks = [escape(chunk) for chunk in _collapse_lines(text.get_text())]
        return '<p>' + '</p><p>'.join(chunks) + '</p>'
    return '\n'.join(_collapse_lines(text.prettify()))
# A function defining key banned HTML tags
# Input: none
# Output: List of beautiful soup .decompose() compatible banned tags
def kill_list():
    """Return the ``[tag, attrs]`` pairs of elements to strip from a page.

    Each entry is a two-item list: the tag name and a one-key dict with the
    attribute to match. A new list is built on every call.
    """
    rules = (
        ("div", "id", "comments"),
        ("div", "class", "video"),
        ("div", "class", "m-linkset"),
        ("div", "class", "m-feature__intro"),
        ("div", "class", "m-share-buttons"),
        ("p", "class", "m-entry__byline"),
        ("div", "class", "social"),
        ("div", "id", "follow-bar"),
        ("section", "class", "m-rail-component"),
    )
    return [[tag, {attribute: value}] for tag, attribute, value in rules]
# Encoding Function to enable JSON export
# Lifted from: http://goo.gl/SkWzpn
# Inputs: mongoengine object or query
# Outputs: Prepared instance for JSON dump
def encode_model(self, obj):
    """Convert a mongoengine object into plain data for a JSON dump.

    Documents become a dict of their ``_data`` with ObjectId values turned
    into strings, query sets and ``groupby`` objects become lists, and
    modules become None. Any other type raises TypeError.

    NOTE(review): takes ``self`` although it is defined at module level;
    presumably it was lifted from a JSON encoder class -- confirm how it is
    meant to be called.
    NOTE(review): ``mongoengine``, ``ObjectId``, ``types`` and ``groupby``
    are not imported in the visible part of this file, and the module-level
    name ``list`` is rebound by the ``/all/`` view function defined further
    down, so ``list(obj)`` would not reach the builtin. Verify before use.
    """
    if isinstance(obj, (mongoengine.Document, mongoengine.EmbeddedDocument)):
        # Copy the field data so the document itself is left untouched.
        out = dict(obj._data)
        for k,v in out.items():
            if isinstance(v, ObjectId):
                out[k] = str(v)
    elif isinstance(obj, mongoengine.queryset.QuerySet):
        out = list(obj)
    elif isinstance(obj, types.ModuleType):
        out = None
    elif isinstance(obj, groupby):
        # Materialise every (key, group) pair.
        out = [ (g,list(l)) for g,l in obj ]
    else:
        raise TypeError("Could not JSON-encode type '%s': %s" % (type(obj), str(obj)))
    return out
#####
# Routes
#####
# List all bookmarks
def _bookmark_tag_string(bookmark):
    """Return the bookmark's tag names, each followed by one space."""
    return ''.join(t.name + ' ' for t in bookmark.tags)


def _bookmarks_as_csv(blist):
    """Serialise bookmarks as CSV rows, quoting fields where needed."""
    buf = io.StringIO()
    writer = csv.writer(buf, lineterminator='\n')
    for b in blist:
        writer.writerow([
            b.url,
            b.title,
            b.note or '',  # note is optional and may be None
            b.created_at.isoformat(),
            _bookmark_tag_string(b),  # built per bookmark, not accumulated
            str(b.unread),
            'bookie',
        ])
    return buf.getvalue()


def _bookmarks_as_xml(blist):
    """Serialise bookmarks as the ``<xml>`` export document."""
    now = datetime.datetime.now()
    parts = [
        "<xml>\n",
        "\t<title>Bookmark Export " + now.strftime("%Y-%m-%d") + "</title>\n",
        "\t<created_at>" + now.isoformat() + "</created_at>\n",
    ]
    for index, b in enumerate(blist):
        # "]]>" would end the CDATA section early; split it across two sections.
        note = (b.note or "").replace("]]>", "]]]]><![CDATA[>")
        tags = "".join(escape(t.name) + " " for t in b.tags)
        parts.extend([
            "\t<bookmark>\n",
            "\t\t<index>" + str(index) + "</index>\n",
            "\t\t<title>" + escape(b.title) + "</title>\n",
            "\t\t<short>" + escape(b.short) + "</short>\n",
            "\t\t<created_at>" + b.created_at.isoformat() + "</created_at>\n",
            "\t\t<unread>" + ("True" if b.unread else "False") + "</unread>\n",
            "\t\t<url>" + escape(b.url) + "</url>\n",
            "\t\t<tags>" + tags + "</tags>\n",
            "\t\t<note><![CDATA[\n",
            "\t\t\t" + note + "\n",
            "\t\t]]></note>\n",
            "\t</bookmark>\n",
        ])
    parts.append("</xml>\n")
    return "".join(parts)


def _bookmarks_as_json(blist):
    """Serialise bookmarks as one JSON object per bookmark, newline separated.

    The layout (tab indent, key order, string values) is unchanged from the
    earlier hand-built output; values are now escaped by ``json.dumps``.
    """
    chunks = []
    for b in blist:
        record = OrderedDict([
            ("title", b.title),
            ("short", b.short),
            ("created_at", b.created_at.strftime("%Y-%m-%d %H:%M:%S")),
            ("unread", "True" if b.unread else "False"),
            ("url", b.url),
            ("tags", _bookmark_tag_string(b)),
            ("note", b.note or ""),
        ])
        chunks.append(json.dumps(record, indent="\t", separators=(',', ': '),
                                 ensure_ascii=False) + "\n")
    return "".join(chunks)


# List all bookmarks
@app.route('/all/<int:count>/<format>')
@app.route('/all/<int:count>/')
@app.route('/all/')
@app.route('/a/')
@login_required
def list(count=100, format="HTML"):
    """List up to ``count`` non-deleted bookmarks, newest first.

    ``format`` selects the representation, case-insensitively: ``csv``,
    ``xml`` or ``json`` return an export; anything else renders
    ``list.html``.
    """
    loc = '/all/' + str(count) + '/'
    fmt = format.lower()
    newest_first = Bookmark.objects(deleted=False).order_by("-created_at")
    export_fields = ("url", "title", "short", "note", "created_at", "tags", "unread")
    if fmt == "csv":
        return _bookmarks_as_csv(newest_first.limit(count))
    if fmt == "xml":
        blist = newest_first.only(*export_fields).limit(count)
        return Response(_bookmarks_as_xml(blist), mimetype='application/xml')
    if fmt == "json":
        blist = newest_first.only(*export_fields).limit(count)
        return _bookmarks_as_json(blist)
    blist = newest_first.limit(count)
    return render_template("list.html", blist=blist, loc=loc)
# View deleted bookmarks
@app.route('/deleted/<int:count>/<format>')
@app.route('/deleted/<int:count>/')
@app.route('/deleted/')
@app.route('/d/')
@login_required
def deleted(count=100, format="HTML"):
    """List up to ``count`` soft-deleted bookmarks, newest first.

    ``format`` equal to ``json`` in any letter case returns the query set's
    JSON; anything else renders ``list.html``.
    """
    loc = '/deleted/' + str(count) + '/'
    blist = Bookmark.objects(deleted=True).order_by("-created_at").limit(count)
    # Case-insensitive so that both "JSON" and the lowercase spelling used by
    # the /all/ view are accepted.
    if format.lower() == "json":
        return blist.to_json()
    return render_template("list.html", blist=blist, loc=loc)
# List unread bookmarks
@app.route('/unread/<int:count>/<format>')
@app.route('/unread/<int:count>/')
@app.route('/unread/')
@app.route('/u/')
@login_required
def unread(count=100, format="HTML"):
    """List up to ``count`` unread, non-deleted bookmarks, newest first.

    ``format`` equal to ``json`` in any letter case returns the query set's
    JSON; anything else renders ``list.html``.
    """
    loc = '/unread/' + str(count) + '/'
    blist = Bookmark.objects(unread=True, deleted=False).order_by("-created_at").limit(count)
    # Case-insensitive so that both "JSON" and the lowercase spelling used by
    # the /all/ view are accepted.
    if format.lower() == "json":
        return blist.to_json()
    return render_template('list.html', blist=blist, loc=loc)
# New bookmark
@app.route('/new', methods=["GET", "POST"])
@login_required
def new():
    """Create a bookmark (POST) or show the creation form (GET).

    POST reads ``title``, ``note``, ``tags`` and ``url`` from the form plus
    the optional checkboxes ``image_embed``, ``unread`` and ``archive``.
    With an empty ``url`` the uploaded ``file_upload`` is stored in the
    upload folder and becomes the bookmark target; with neither, the form
    is shown again. GET pre-fills the form from the ``title``, ``url`` and
    ``note`` query parameters when any of them is present.
    """
    if request.method == "POST":
        form = request.form
        b = Bookmark()
        b.title = form["title"]
        b.short = str(b.get_short())
        b.note = form["note"]
        # A checkbox that is not ticked is simply absent from the form.
        b.image_embed = bool(form.get("image_embed"))
        b.unread = bool(form.get("unread"))
        archive = bool(form.get("archive"))
        # The model fields are spelled archived_*_needed.
        b.archived_image_needed = archive
        b.archived_text_needed = archive

        url = form["url"]
        if url == "":
            upload = request.files.get('file_upload')
            if upload is None or not upload.filename:
                # Neither a url nor a file: nothing to bookmark.
                return render_template("form.html", action="/new")
            # Keep only an alphanumeric extension so a crafted file name
            # cannot place the upload outside the upload folder.
            ext = os.path.splitext(upload.filename)[1]
            ext = ''.join(ch for ch in ext if ch.isalnum())
            filename = b.short + ("." + ext if ext else "")
            upload.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            url = '/' + app.config['UPLOAD_FOLDER'] + '/' + filename
        b.url = url

        # Tags are stored lowercase because the /tag/<name> lookup lowercases
        # its argument. split() without an argument ignores repeated spaces,
        # so no empty tag names are created.
        tag_list = []
        seen = set()
        for name in form['tags'].lower().split():
            if name in seen:
                continue
            seen.add(name)
            tag = Tag.objects(name=name).first()
            if tag is None:
                tag = Tag(name=name).save()
            tag_list.append(tag)
        b.tags = tag_list

        b.factor = b.get_factor()
        b.save()
        return render_template("detail.html", b=b)

    b = False
    if any(k in request.args for k in ('title', 'url', 'note')):
        # Unsaved bookmark used only to pre-fill the form.
        b = Bookmark()
        b.title = request.args.get('title', "")
        b.url = request.args.get('url', "")
        b.note = request.args.get('note', "")
    return render_template("form.html", action="/new", b=b)
@app.route('/tag/<rawtag>')
@login_required
def tagsearch(rawtag):
    """List the bookmarks carrying the tag named by ``rawtag`` (lowercased).

    An unknown tag name goes through ``get_or_404``; a tag without any
    bookmarks redirects to the home page.

    NOTE(review): unlike the other listing views, this query does not
    filter on ``deleted`` -- confirm whether that is intended.
    """
    tag = Tag.objects.get_or_404(name=rawtag.lower())
    matches = Bookmark.objects(tags__in=[tag])
    if matches.count() <= 0:
        return redirect("/", code=302)
    return render_template('list.html',blist=matches)
@app.route('/<id>/update/<action>')
@app.route('/<id>/u/<action>')
@login_required
def update(id,action):
    """Apply ``action`` to the bookmark with short code ``id``, then redirect.

    Actions: ``text`` / ``text_force`` / ``image`` refresh the archives;
    ``archive`` / ``unread``, ``private`` / ``public`` and ``restore`` /
    ``delete`` flip the matching flag. Any other action changes nothing.

    The target comes from the ``redirect`` query parameter (default ``/``)
    with an optional ``anchor`` appended as fragment. Only paths on this
    site are followed; anything else falls back to ``/``. An unknown short
    code redirects to ``/``.
    """
    loc = request.args.get('redirect', '/')
    # Refuse absolute and scheme-relative targets so a crafted link cannot
    # bounce a logged-in user to another site.
    if not loc.startswith('/') or loc.startswith('//') or '\\' in loc:
        loc = '/'
    if 'anchor' in request.args:
        app.logger.debug(request.args['anchor'])
        loc = loc + "#" + request.args['anchor']

    b = Bookmark.objects(short=id).first()
    if not b:
        return redirect("/", code=302)

    # action -> (field, new value) for the plain flag switches
    flag_actions = {
        "archive": ("unread", False),
        "unread": ("unread", True),
        "private": ("private", True),
        "public": ("private", False),
        "restore": ("deleted", False),
        "delete": ("deleted", True),
    }
    if action == "text":
        update_archived_text(b)
    elif action == "text_force":
        update_archived_text(b, force=True)
    elif action == "image":
        update_archived_image(b)
    elif action in flag_actions:
        field, value = flag_actions[action]
        setattr(b, field, value)
        b.save()
    return redirect(loc, code=302)
@app.route('/<id>/details')
@app.route('/<id>/d')
@login_required
def details(id):
    """Render the detail page of the bookmark with short code ``id``.

    An unknown short code redirects to the home page, as the other
    per-bookmark views do.
    """
    b = Bookmark.objects(short=id).first()
    if not b:
        return redirect("/", code=302)
    return render_template("detail.html", b=b)
@app.route('/<id>/edit', methods=["GET", "POST"])
@app.route('/<id>/e', methods=["GET", "POST"])
@login_required
def edit(id):
    """Show the edit form for bookmark ``id`` and, on POST, apply the form.

    ``title``, ``note``, ``tags`` and ``url`` are changed only when present
    in the form (an empty ``url`` is ignored). The checkboxes
    ``image_embed``, ``unread``, ``archive_text_needed`` and
    ``archive_image_needed`` count as set only when their value is
    ``"checked"``. An unknown short code redirects to the home page.
    """
    b = Bookmark.objects(short=id).first()
    if not b:
        # Checked before touching any attribute of ``b``.
        return redirect("/", code=302)

    if request.method == "POST":
        form = request.form
        if "title" in form:
            b.title = form["title"]
        if "note" in form:
            b.note = form["note"]
        b.image_embed = form.get('image_embed') == "checked"
        b.unread = form.get('unread') == "checked"
        # The form fields are named archive_*, the model fields archived_*.
        b.archived_text_needed = form.get('archive_text_needed') == "checked"
        b.archived_image_needed = form.get('archive_image_needed') == "checked"
        if "tags" in form:
            # Lowercase to match the /tag/<name> lookup; split() without an
            # argument skips empty names.
            tag_list = []
            seen = set()
            for name in form['tags'].lower().split():
                if name in seen:
                    continue
                seen.add(name)
                tag = Tag.objects(name=name).first()
                if tag is None:
                    tag = Tag(name=name).save()
                tag_list.append(tag)
            b.tags = tag_list
        if form.get("url"):
            b.url = form["url"]
        b.factor = b.get_factor()
        b.save()
    return render_template("form.html", action = "/"+b.short+"/edit", b=b)
# Pull up an archived and parsed text view of the Bookmark
# The first line of defense in preventing link rot...
@app.route('/<id>/text/<version>')
@app.route('/<id>/text/')
@app.route('/<id>/t/')
@login_required
def text(id, version=False):
    """Render the archived text of bookmark ``id`` and count a hit.

    With ``version`` the ArchivedText whose ``created_at`` equals it is
    shown, otherwise the bookmark's current archive. An unknown short code,
    an unknown version or a bookmark without an archive redirects to the
    home page.
    """
    b = Bookmark.objects(short=id).first()
    if not b:
        return redirect("/", code=302)
    # All stored versions for this url; passed to the template.
    tlist = ArchivedText.objects(url=b.url)
    if version:
        t = ArchivedText.objects(url=b.url,created_at=version).first()
    else:
        t = b.archived_text_ref
    if not t:
        return redirect("/", code=302)
    text = t.text
    b.hits += 1
    b.save()
    return render_template("text.html", b=b, text=text, tlist=tlist)
# Display the raw html scraped from the website
# The second line of defense against link rot...
@app.route('/<id>/raw/<version>')
@app.route('/<id>/raw/')
@app.route('/<id>/r/')
@login_required
def raw(id, version=False):
    """Return the stored raw HTML of bookmark ``id``.

    With ``version`` the ArchivedText whose ``created_at`` equals it is
    used, otherwise the bookmark's current archive. An unknown short code,
    an unknown version or a bookmark without an archive redirects to the
    home page.
    """
    b = Bookmark.objects(short=id).first()
    if not b:
        return redirect("/", code=302)
    if version:
        t = ArchivedText.objects(url=b.url,created_at=version).first()
    else:
        t = b.archived_text_ref
    if not t:
        return redirect("/", code=302)
    return t.raw_html
# An archived image scraped from the website
# The third line of defense against link rot...
@app.route('/<id>/image/<version>')
@app.route('/<id>/image/')
@app.route('/<id>/i/')
@login_required
def image(id, version=False):
    """Redirect to the archived image of bookmark ``id`` and count a hit.

    With ``version`` the ArchivedImage whose ``created_at`` equals it is
    used, otherwise the bookmark's current archive. An unknown short code,
    an unknown version or a bookmark without an archive redirects to the
    home page.
    """
    b = Bookmark.objects(short=id).first()
    if not b:
        return redirect("/", code=302)
    if version:
        t = ArchivedImage.objects(url=b.url,created_at=version).first()
    else:
        t = b.archived_image_ref
    if not t:
        return redirect("/", code=302)
    path = t.path
    b.hits += 1
    b.save()
    return redirect(path, code=302)
# Embed url as an image in a formatted page. Does not require login.
@app.route('/<id>/embed')
def embed(id):
    """Render ``image.html`` for bookmark ``id`` and count a hit.

    No login is needed unless the bookmark is private. An unknown short
    code, or a private bookmark requested without authentication,
    redirects to the home page.
    """
    b = Bookmark.objects(short=id).first()
    viewable = b and (b.private != True or current_user.is_authenticated())
    if not viewable:
        return redirect("/", code=302)
    b.hits += 1
    b.save()
    return render_template("image.html", b=b)
# Short code redirects directly to bookmark target, does not require auth to use
# bookie as a URL shortener app
@app.route('/<id>')
def short(id):
    """Follow the short link ``id`` and count a hit.

    Redirects to the bookmark's embed page when ``image_embed`` is set,
    otherwise to the bookmark's url. No login is needed unless the bookmark
    is private. An unknown short code, or a private bookmark requested
    without authentication, redirects to the home page.
    """
    b = Bookmark.objects(short=id).first()
    viewable = b and (b.private != True or current_user.is_authenticated())
    if not viewable:
        return redirect("/", code=302)
    b.hits += 1
    b.save()
    target = "/"+b.short+"/embed" if b.image_embed else b.url
    return redirect(target, code=302)
# Anonymous home page
@app.route('/')
def index():
    """Render the home page; this view has no login requirement."""
    page = render_template("index.html")
    return page