import base64
import datetime
import os
import urllib
import urllib.parse
import urllib.request
from subprocess import call
from xml.sax.saxutils import escape

from bs4 import BeautifulSoup as BS
from flask import Flask, render_template, redirect, url_for, request, Response
from flask.ext.login import current_user
from flask.ext.mongoengine import MongoEngine
from flask.ext.security import (
    Security,
    UserMixin,
    RoleMixin,
    login_required,
    MongoEngineUserDatastore,
)
from flask.ext.security.utils import encrypt_password

app = Flask(__name__)

#####
# Config Values
#####
app.config["MONGODB_DB"] = "bookie"
# Secrets can be overridden from the environment; the literals are kept as
# defaults so existing deployments (and existing password hashes) keep working.
app.config['SECRET_KEY'] = os.environ.get(
    'BOOKIE_SECRET_KEY', 'bobloblawlawblog')
app.config['UPLOAD_FOLDER'] = "static/uploads"
app.config['SITE_URL'] = "http://localhost:5000"
app.config['SECURITY_PASSWORD_HASH'] = "bcrypt"
app.config['SECURITY_PASSWORD_SALT'] = os.environ.get(
    'BOOKIE_PASSWORD_SALT', 'asdfiqwnvonaosinva')

#####
# MongoDB Setup
#####
db = MongoEngine(app)


#####
# Classes
#####
class Role(db.Document, RoleMixin):
    """A named role that can be attached to users."""
    name = db.StringField(max_length=80, unique=True)
    description = db.StringField(max_length=255)


class User(db.Document, UserMixin):
    """An account that can log in to the application."""
    email = db.StringField(max_length=255)
    password = db.StringField(max_length=255)
    active = db.BooleanField(default=True)
    confirmed_at = db.DateTimeField()
    # A callable default gives every user its own list instead of sharing one
    # mutable list object between documents.
    roles = db.ListField(db.ReferenceField(Role), default=lambda: [])

    def encrypt_password(self, pw):
        """Return ``pw`` hashed by the module-level ``encrypt_password``."""
        return encrypt_password(pw)


class Tag(db.Document):
    """A short label that bookmarks reference."""
    name = db.StringField(required=True, max_length=25, unique=True)
    note = db.StringField(required=False, max_length=100)

    def __repr__(self):
        return "Tag()"

    def __str__(self):
        return str(self.name)


class ArchivedText(db.Document):
    """A snapshot of the HTML fetched from a URL plus its parsed text."""
    url = db.StringField(max_length=1000, required=True)
    created_at = db.DateTimeField(default=datetime.datetime.now, required=True)
    text = db.StringField(required=True, default="")
    raw_html = db.StringField(required=True, default="")

    def get_html(self):
        """Fetch ``self.url`` and return the response body as a string.

        The body is decoded with the charset announced by the response,
        falling back to UTF-8. Undecodable bytes are replaced rather than
        returning the ``repr`` of a bytes object. Network errors raised by
        the opener propagate to the caller.
        """
        app.logger.debug("Brewing an opener")
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor())
        app.logger.debug("Getting HTML")
        # Use the response as a context manager so the connection is closed.
        with opener.open(self.url) as response:
            raw_html = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
        if raw_html:
            app.logger.debug("HTML retrieved")
        try:
            return raw_html.decode(charset)
        except (LookupError, UnicodeDecodeError):
            # Unknown or wrong charset label: keep what can be decoded.
            return raw_html.decode("utf-8", errors="replace")


class ArchivedImage(db.Document):
    """A rendered image of a URL, stored on disk at ``path``."""
    url = db.StringField(max_length=1000, required=True)
    created_at = db.DateTimeField(default=datetime.datetime.now, required=True)
    path = db.StringField(required=True, max_length=150)


class Bookmark(db.Document):
    """A saved URL with its short code, tags, archive references and flags."""
    # Meta
    created_at = db.DateTimeField(default=datetime.datetime.now, required=True)
    url = db.StringField(max_length=1000, required=True)
    short = db.StringField(max_length=25, required=True, unique=True)
    title = db.StringField(max_length=255, required=True)
    note = db.StringField(required=False)
    tags = db.ListField(db.ReferenceField(Tag))
    image_embed = db.BooleanField(required=True, default=False)
    archived_text = db.BooleanField()
    archived_text_needed = db.BooleanField()
    archived_text_ref = db.ReferenceField(ArchivedText)
    archived_image = db.BooleanField()
    archived_image_needed = db.BooleanField()
    archived_image_ref = db.ReferenceField(ArchivedImage)
    unread = db.BooleanField()
    private = db.BooleanField(required=True, default=False)
    deleted = db.BooleanField(required=True, default=False)
    source = db.StringField(max_length=255)
    # Metrics
    hits = db.IntField(required=True, default=0)
    factor = db.FloatField(required=True)

    def get_factor(self):
        """Return ``(len(short) + 14) / len(url)``, or 0.0 for an empty url."""
        if not self.url:
            # Avoid ZeroDivisionError when no url has been set yet.
            return 0.0
        return (len(self.short) + 14) / len(self.url)

    def get_short(self):
        """Return a random 5-character code not used by any other bookmark."""
        while True:
            s = base64.urlsafe_b64encode(os.urandom(5))[0:5].decode('Latin-1')
            if Bookmark.objects(short=s).first() is None:
                return s

    meta = {
        'allow_inheritance': True,
        'indexes': ['-created_at', 'short'],
        'ordering': ['-created_at'],
    }

    def __repr__(self):
        return "Bookmark()"

    def __str__(self):
        return str("Bookmark " + self.short)


#####
# Security
#####
user_datastore = MongoEngineUserDatastore(db, User, Role)
security = Security(app, user_datastore)

######
# Helper Functions
######

# Function to update the archived text of a bookmark
# Inputs:
#     Bookmark()
# Output: True / False
def update_archived_text(b, force=False, update_note=True):
    """Refresh the archived text attached to bookmark ``b``.

    With ``force`` a new ``ArchivedText`` is always created and fetched.
    Otherwise the newest snapshot for ``b.url`` is reused (created if none
    exists) and its HTML is fetched only when it is still empty. When
    ``update_note`` is set and the bookmark has no note, the first 250
    characters of the plain text become the note. Returns True.
    """
    if force:
        t = ArchivedText.objects.create(url=b.url)
        t.raw_html = t.get_html()
    else:
        t = ArchivedText.objects(url=b.url).order_by("-created_at").first()
        if not t:
            t = ArchivedText.objects.create(url=b.url)
        if not getattr(t, 'raw_html', ""):
            t.raw_html = t.get_html()
    t.text = html_parse(t.raw_html, b.url, True)
    t.save()
    b.archived_text_ref = t
    # ``note`` is optional on the model, so treat None like "".
    if update_note and not b.note:
        b.note = html_parse(t.raw_html, b.url, False)[:250]
    b.archived_text_needed = False
    b.archived_text = True
    b.save()
    return True


# Function to update the archived image of a bookmark
# Inputs: Bookmark()
# Output: True / False
def update_archived_image(b):
    """Render ``b.url`` to a JPEG with wkhtmltoimage and attach it to ``b``.

    Returns True when the renderer exits with status 0. On any other status
    nothing is saved and False is returned.
    """
    a = ArchivedImage()
    a.url = b.url
    ref = ('static/archive/images/' + b.short + '_'
           + a.created_at.strftime("%Y-%m-%d_%H%M%S") + '.jpg')
    app.logger.debug(ref)
    status = call(['/usr/bin/env', 'wkhtmltoimage', b.url, ref])
    if status != 0:
        # Do not record an archive that was never written.
        app.logger.error("wkhtmltoimage exited with status %s for %s",
                         status, b.url)
        return False
    a.path = '/' + ref
    a.save()
    b.archived_image_ref = a
    b.archived_image_needed = False
    b.archived_image = True
    b.save()
    return True


# Containers tried, in order, when looking for the main content of a page.
_CONTENT_CANDIDATES = (
    ("div", {"class": "story-text"}),
    ("div", {"id": "article"}),
    ("div", {"id": "articleBody"}),
    ("div", {"class": "articleBody"}),
    ("div", {"class": "post"}),
    ("div", {"class": "post-content"}),
    ("div", {"class": "article-content"}),
    ("div", {"class": "story-content"}),
    ("div", {"class": "content"}),
    ("article", {}),
    ("div", {"id": "page"}),
)


def _describe_container(tag, attrs):
    """Return a ``<tag key="value">`` label for log messages."""
    rendered = "".join(' %s="%s"' % (k, v) for k, v in attrs.items())
    return "<" + tag + rendered + ">"


def _clean_chunks(raw_text):
    """Split ``raw_text`` into stripped, non-empty phrases.

    Lines are stripped and then split on double spaces.
    """
    lines = (line.strip() for line in raw_text.splitlines())
    # NOTE(review): the separator is restored to two spaces; a single space
    # would turn every word into its own chunk. Confirm against the original.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return [chunk for chunk in chunks if chunk]


# A custom function to extract the key text from the raw html
# Inputs: raw html string, the page url, paragraphs flag
# Outputs: string (HTML when paragraphs is true, plain text otherwise)
def html_parse(raw_html, url, paragraphs=True):
    """Extract the main content of ``raw_html``.

    Boilerplate elements and everything on ``kill_list()`` are removed, then
    the first matching entry of ``_CONTENT_CANDIDATES`` is used, falling back
    to ``<body>``. With ``paragraphs`` the result is HTML: the prettified
    container, or ``<p>``-wrapped escaped text for the body fallback.
    Without it the result is newline-joined plain text.
    """
    from urllib.parse import urljoin

    # NOTE(review): no parser is named, so bs4 picks whichever is installed.
    soup = BS(raw_html)
    for t in soup(["script", "style", "nav", "header", "aside", "select",
                   "form", "link", "meta", "svg"]):
        t.decompose()
    for tag, attr in kill_list():
        for t in soup.findAll(tag, attr):
            t.decompose()

    text = None
    for tag, attr in _CONTENT_CANDIDATES:
        text = soup.find(tag, attrs=attr)
        if text:
            app.logger.debug('Text import from '
                             + _describe_container(tag, attr))
            break
    strip_tags = not text
    if strip_tags:
        app.logger.debug('Text import from <body>')
        # Fragments may have no <body>; fall back to the whole document.
        text = soup.body or soup

    if not paragraphs:
        return '\n'.join(_clean_chunks(text.get_text()))

    for t in text('img'):
        t['style'] = "max-width:600px;max-height:600px;"
        if t.get('src'):
            # Make relative image paths resolve against the source page.
            t['src'] = urljoin(url, t['src'])
    for t in text("div"):
        del t['class']
        del t['style']
    for t in text("iframe"):
        del t['height']
        del t['width']
        t['style'] = ("max-width:600px;max-height:600px;"
                      "margin:0em auto;display:block;")

    if strip_tags:
        # Text taken from the bare body is re-wrapped as escaped paragraphs.
        chunks = [escape(chunk) for chunk in _clean_chunks(text.get_text())]
        return '<p>' + '</p><p>'.join(chunks) + '</p>'
    return '\n'.join(_clean_chunks(text.prettify()))


# A function defining key banned HTML tags
# Input: none
# Output: List of beautiful soup .decompose() compatible banned tags
def kill_list():
    """Return ``[tag, attrs]`` pairs for elements to drop from a page."""
    return [
        ["div", {"id": "comments"}],
        ["div", {"class": "video"}],
        ["div", {"class": "m-linkset"}],
        ["div", {"class": "m-feature__intro"}],
        ["div", {"class": "m-share-buttons"}],
        ["p", {"class": "m-entry__byline"}],
        ["div", {"class": "social"}],
        ["div", {"id": "follow-bar"}],
        ["section", {"class": "m-rail-component"}],
    ]


# Encoding Function to enable JSON export
# Lifted from: http://goo.gl/SkWzpn
# Inputs: mongoengine object or query
# Outputs: Prepared instance for JSON dump
def encode_model(self, obj):
    """Convert ``obj`` into something ``json`` can serialise.

    ``self`` is unused; it is kept so existing callers keep working.
    Raises TypeError for unsupported types.
    """
    # Imported locally: nothing at file level brings these names into scope.
    # mongoengine and bson are installed with flask-mongoengine.
    import types
    from itertools import groupby

    import mongoengine
    from bson import ObjectId

    # Comprehensions are used instead of list(): this module rebinds the
    # global name ``list`` to a view function.
    if isinstance(obj, (mongoengine.Document, mongoengine.EmbeddedDocument)):
        out = dict(obj._data)
        for k, v in out.items():
            if isinstance(v, ObjectId):
                out[k] = str(v)
    elif isinstance(obj, mongoengine.queryset.QuerySet):
        out = [item for item in obj]
    elif isinstance(obj, types.ModuleType):
        out = None
    elif isinstance(obj, groupby):
        out = [(g, [item for item in l]) for g, l in obj]
    else:
        raise TypeError("Could not JSON-encode type '%s': %s"
                        % (type(obj), str(obj)))
    return out


def _bookmarks_csv(blist):
    """Return ``blist`` as CSV text, one bookmark per row."""
    import csv
    import io

    buf = io.StringIO()
    writer = csv.writer(buf, lineterminator="\n")
    for b in blist:
        # Tags are built per row so they do not accumulate across bookmarks.
        tags = " ".join(t.name for t in b.tags)
        writer.writerow([b.url, b.title, b.note or "",
                         b.created_at.isoformat(), tags, str(b.unread),
                         "bookie"])
    return buf.getvalue()


def _bookmarks_xml(blist):
    """Return ``blist`` as a well-formed XML document string."""
    now = datetime.datetime.now()
    parts = [
        '<?xml version="1.0" encoding="UTF-8"?>\n',
        "<bookmarks>\n",
        "\t<title>Bookmark Export " + now.strftime("%Y-%m-%d") + "</title>\n",
        "\t<created_at>" + now.isoformat() + "</created_at>\n",
    ]
    for c, b in enumerate(blist):
        tags = " ".join(t.name for t in b.tags)
        parts.extend([
            "\t<bookmark>\n",
            "\t\t<id>" + str(c) + "</id>\n",
            "\t\t<title>" + escape(b.title) + "</title>\n",
            "\t\t<short>" + escape(b.short) + "</short>\n",
            "\t\t<created_at>" + b.created_at.isoformat() + "</created_at>\n",
            "\t\t<unread>" + ("True" if b.unread else "False")
            + "</unread>\n",
            "\t\t<url>" + escape(b.url) + "</url>\n",
            "\t\t<tags>" + escape(tags) + "</tags>\n",
            "\t\t<note>" + escape(b.note or "") + "</note>\n",
            "\t</bookmark>\n",
        ])
    parts.append("</bookmarks>\n")
    return "".join(parts)


def _bookmarks_json(blist):
    """Return ``blist`` as a JSON array of objects."""
    import json

    records = [
        {
            "title": b.title,
            "short": b.short,
            "created_at": b.created_at.strftime("%Y-%m-%d %H:%M:%S"),
            # Kept as the strings "True"/"False" as in the previous output.
            "unread": "True" if b.unread else "False",
            "url": b.url,
            "tags": " ".join(t.name for t in b.tags),
            "note": b.note or "",
        }
        for b in blist
    ]
    return json.dumps(records, indent=1)


#####
# Routes
#####

# List all bookmarks
# NOTE(review): this view rebinds the module-level name ``list``; it is kept
# because the Flask endpoint name is derived from it.
@app.route('/all/<int:count>/<format>')
@app.route('/all/<int:count>/')
@app.route('/all/')
@app.route('/a/')
@login_required
def list(count=100, format="HTML"):
    """List up to ``count`` non-deleted bookmarks, newest first.

    ``format`` selects csv, xml or json (case-insensitive); anything else
    renders the HTML list template.
    """
    loc = '/all/' + str(count) + '/'
    fmt = (format or "").lower()
    blist = Bookmark.objects(deleted=False).order_by("-created_at")
    if fmt == "csv":
        return Response(_bookmarks_csv(blist.limit(count)),
                        mimetype='text/csv')
    exported = blist.only("url", "title", "short", "note", "created_at",
                          "tags", "unread").limit(count)
    if fmt == "xml":
        return Response(_bookmarks_xml(exported), mimetype='application/xml')
    if fmt == "json":
        return Response(_bookmarks_json(exported),
                        mimetype='application/json')
    return render_template("list.html", blist=blist.limit(count), loc=loc)


# View deleted bookmarks
@app.route('/deleted/<int:count>/<format>')
@app.route('/deleted/<int:count>/')
@app.route('/deleted/')
@app.route('/d/')
@login_required
def deleted(count=100, format="HTML"):
    """List up to ``count`` deleted bookmarks as HTML, or JSON on request."""
    loc = '/deleted/' + str(count) + '/'
    blist = Bookmark.objects(deleted=True).order_by("-created_at").limit(count)
    if (format or "").lower() == "json":
        return Response(blist.to_json(), mimetype='application/json')
    return render_template("list.html", blist=blist, loc=loc)


# List unread bookmarks
@app.route('/unread/<int:count>/<format>')
@app.route('/unread/<int:count>/')
@app.route('/unread/')
@app.route('/u/')
@login_required
def unread(count=100, format="HTML"):
    """List up to ``count`` unread bookmarks as HTML, or JSON on request."""
    loc = '/unread/' + str(count) + '/'
    blist = Bookmark.objects(unread=True, deleted=False) \
        .order_by("-created_at").limit(count)
    if (format or "").lower() == "json":
        return Response(blist.to_json(), mimetype='application/json')
    return render_template('list.html', blist=blist, loc=loc)


def _tags_from_string(raw):
    """Return Tag documents for the whitespace-separated names in ``raw``.

    Names are lower-cased because ``tagsearch`` looks tags up lower-cased.
    Empty names and duplicates are skipped.
    """
    tags = []
    seen = set()
    for name in (raw or "").lower().split():
        if name in seen:
            continue
        seen.add(name)
        # get_or_create returns a (document, created) pair.
        tags.append(Tag.objects.get_or_create(name=name)[0])
    return tags


def _upload_extension(filename):
    """Return the alphanumeric extension of ``filename`` ('' if none)."""
    ext = os.path.splitext(os.path.basename(filename or ""))[1]
    # Keep letters and digits only so the value cannot carry path separators.
    return "".join(ch for ch in ext if ch.isalnum())


# New bookmark
@app.route('/new', methods=["GET", "POST"])
@login_required
def new():
    """Create a bookmark (POST) or show the form, optionally prefilled (GET).

    On POST an empty ``url`` means the bookmark points at an uploaded file,
    which is stored under ``UPLOAD_FOLDER`` using the short code as its name.
    """
    if request.method == "POST":
        form = request.form
        b = Bookmark()
        b.title = form["title"]
        b.short = str(b.get_short())
        b.note = form["note"]
        b.image_embed = bool(form.get("image_embed"))
        b.unread = bool(form.get("unread"))
        # The model fields are archived_*_needed; the old archive_* names
        # set plain attributes that were never stored.
        wants_archive = bool(form.get("archive"))
        b.archived_image_needed = wants_archive
        b.archived_text_needed = wants_archive
        b.tags = _tags_from_string(form['tags'])

        if form["url"] == "":
            upload = request.files.get('file_upload')
            if upload is None or not upload.filename:
                # Neither a url nor a file: show the form again.
                return render_template("form.html", action="/new")
            ext = _upload_extension(upload.filename)
            filename = b.short + "." + ext if ext else b.short
            upload.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            b.url = '/' + app.config['UPLOAD_FOLDER'] + '/' + filename
        else:
            b.url = form["url"]
        b.factor = b.get_factor()
        b.save()
        return render_template("detail.html", b=b)

    b = False
    if any(k in request.args for k in ('title', 'url', 'note')):
        # Prefill the form from the query string.
        b = Bookmark()
        b.title = request.args.get('title', "")
        b.url = request.args.get('url', "")
        b.note = request.args.get('note', "")
    return render_template("form.html", action="/new", b=b)


@app.route('/tag/<rawtag>')
@login_required
def tagsearch(rawtag):
    """List bookmarks carrying the tag ``rawtag``; 404 if the tag is unknown."""
    t = Tag.objects.get_or_404(name=rawtag.lower())
    blist = Bookmark.objects(tags__in=[t])
    if blist.count() > 0:
        return render_template('list.html', blist=blist)
    return redirect("/", code=302)


# Actions that only set one boolean field: action -> (field, value).
_FLAG_ACTIONS = {
    "archive": ("unread", False),
    "unread": ("unread", True),
    "private": ("private", True),
    "public": ("private", False),
    "restore": ("deleted", False),
    "delete": ("deleted", True),
}


def _local_redirect_target(target, default='/'):
    """Return ``target`` if it is a path on this site, else ``default``."""
    if (target and target.startswith('/')
            and not target.startswith('//') and '\\' not in target):
        return target
    return default


@app.route('/<id>/update/<action>')
@app.route('/<id>/u/<action>')
@login_required
def update(id, action):
    """Apply ``action`` to the bookmark with short code ``id``, then redirect.

    The destination comes from the ``redirect`` query argument (local paths
    only) plus an optional ``anchor`` fragment. Unknown actions and unknown
    bookmarks just redirect.
    """
    # Only local paths are accepted, so the argument cannot send the user
    # to another site.
    loc = _local_redirect_target(request.args.get('redirect'))
    if 'anchor' in request.args:
        app.logger.debug(request.args['anchor'])
        loc = loc + "#" + request.args['anchor']

    b = Bookmark.objects(short=id).first()
    if b is None:
        return redirect(loc, code=302)

    if action == "text":
        update_archived_text(b)
    elif action == "text_force":
        update_archived_text(b, force=True)
    elif action == "image":
        update_archived_image(b)
    elif action in _FLAG_ACTIONS:
        field, value = _FLAG_ACTIONS[action]
        setattr(b, field, value)
        b.save()
    return redirect(loc, code=302)


@app.route('/<id>/details')
@app.route('/<id>/d')
@login_required
def details(id):
    """Show the detail page for the bookmark with short code ``id``."""
    b = Bookmark.objects(short=id).first()
    if b is None:
        return redirect("/", code=302)
    return render_template("detail.html", b=b)


@app.route('/<id>/edit', methods=["GET", "POST"])
@app.route('/<id>/e', methods=["GET", "POST"])
@login_required
def edit(id):
    """Show the edit form for a bookmark and apply submitted changes.

    Redirects to "/" when no bookmark has the short code ``id``.
    """
    b = Bookmark.objects(short=id).first()
    if b is None:
        return redirect("/", code=302)

    if request.method == "POST":
        form = request.form
        if "title" in form:
            b.title = form["title"]
        if "note" in form:
            b.note = form["note"]
        b.image_embed = form.get("image_embed") == "checked"
        b.unread = form.get("unread") == "checked"
        # Each flag reads its own checkbox and writes the model's
        # archived_*_needed field.
        b.archived_text_needed = form.get("archive_text_needed") == "checked"
        b.archived_image_needed = form.get("archive_image_needed") == "checked"
        if "tags" in form:
            b.tags = _tags_from_string(form["tags"])
        if form.get("url"):
            # An empty url is ignored; the field is required on the model.
            b.url = form["url"]
            b.factor = b.get_factor()
        b.save()
    return render_template("form.html", action="/" + b.short + "/edit", b=b)


# Pull up an archived and parsed text view of the Bookmark
# The first line of defense in preventing link rot...
@app.route('/<id>/text/<version>')
@app.route('/<id>/text/')
@app.route('/<id>/t/<version>')
@app.route('/<id>/t/')
@login_required
def text(id, version=False):
    """Render the archived text of a bookmark.

    ``version`` selects the snapshot whose ``created_at`` matches it;
    without it the bookmark's current archive reference is used. Redirects
    to "/" when the bookmark or the snapshot does not exist.
    """
    b = Bookmark.objects(short=id).first()
    if b is None:
        return redirect("/", code=302)
    tlist = ArchivedText.objects(url=b.url)
    if version:
        t = ArchivedText.objects(url=b.url, created_at=version).first()
    else:
        t = b.archived_text_ref
    if t is None:
        return redirect("/", code=302)
    b.hits += 1
    b.save()
    return render_template("text.html", b=b, text=t.text, tlist=tlist)


# Display the raw html scraped from the website
# The second line of defense against link rot...
@app.route('/<id>/raw/<version>')
@app.route('/<id>/raw/')
@app.route('/<id>/r/<version>')
@app.route('/<id>/r/')
@login_required
def raw(id, version=False):
    """Return the raw archived HTML of a bookmark.

    Redirects to "/" when the bookmark or the snapshot does not exist.
    """
    b = Bookmark.objects(short=id).first()
    if b is None:
        return redirect("/", code=302)
    if version:
        t = ArchivedText.objects(url=b.url, created_at=version).first()
    else:
        t = b.archived_text_ref
    if t is None:
        return redirect("/", code=302)
    # The markup comes from a third-party page; the sandbox policy stops
    # its scripts from running with this site's origin.
    return Response(t.raw_html,
                    headers={"Content-Security-Policy": "sandbox"})


# An archived image scraped from the website
# The third line of defense against link rot...
@app.route('/<id>/image/<version>')
@app.route('/<id>/image/')
@app.route('/<id>/i/<version>')
@app.route('/<id>/i/')
@login_required
def image(id, version=False):
    """Redirect to the archived image of a bookmark.

    Redirects to "/" when the bookmark or the image does not exist.
    """
    b = Bookmark.objects(short=id).first()
    if b is None:
        return redirect("/", code=302)
    if version:
        t = ArchivedImage.objects(url=b.url, created_at=version).first()
    else:
        t = b.archived_image_ref
    if t is None:
        return redirect("/", code=302)
    b.hits += 1
    b.save()
    return redirect(t.path, code=302)


# Embed url as an image in a formatted page. Does not require login.
def _viewer_is_authenticated():
    """Return True when the current user is logged in.

    ``is_authenticated`` is a method in older Flask-Login releases and a
    property in newer ones; both forms are handled.
    """
    flag = current_user.is_authenticated
    return bool(flag() if callable(flag) else flag)


def _visible_bookmark(short_code):
    """Return the bookmark for ``short_code`` if the viewer may see it.

    Private bookmarks are returned only to authenticated users. Returns
    None when there is no such bookmark or it is hidden from the viewer.
    """
    b = Bookmark.objects(short=short_code).first()
    if b and (b.private != True or _viewer_is_authenticated()):
        return b
    return None


@app.route('/<id>/embed')
def embed(id):
    """Show the bookmark's url as an embedded image page and count the hit."""
    b = _visible_bookmark(id)
    if b is None:
        return redirect("/", code=302)
    b.hits += 1
    b.save()
    return render_template("image.html", b=b)


# Short code redirects directly to bookmark target, does not require auth to use
# bookie as a URL shortener app
@app.route('/<id>')
def short(id):
    """Count the hit and redirect to the bookmark's target.

    Bookmarks flagged ``image_embed`` go to their embed page instead.
    Unknown or hidden bookmarks redirect to "/".
    """
    b = _visible_bookmark(id)
    if b is None:
        return redirect("/", code=302)
    b.hits += 1
    b.save()
    if b.image_embed:
        return redirect("/" + b.short + "/embed", code=302)
    return redirect(b.url, code=302)


# Anonymous home page
@app.route('/')
def index():
    """Render the public landing page."""
    return render_template("index.html")