cleanup commit
This commit is contained in:
parent
805b37ed5a
commit
8daec1baf7
8 changed files with 225 additions and 58 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1 +1,2 @@
|
|||
bookie
|
||||
*.db
|
||||
|
|
19
bookmark.go
19
bookmark.go
|
@ -20,3 +20,22 @@ func bookmarkExists(url string, db *sql.DB) bool {
|
|||
|
||||
return false
|
||||
}
|
||||
|
||||
func addBookmark(db *sql.DB, b Bookmark) (Bookmark, error) {
|
||||
fmt.Println("Got request to add:", b.Title)
|
||||
|
||||
var ret Bookmark
|
||||
|
||||
_, err := db.Exec(`
|
||||
INSERT INTO bookmarks(
|
||||
url, title, author, date, pubDate, comments, mercuryContent, pdfpath,
|
||||
tags
|
||||
) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?);`, b.URL, b.Domain, b.Author,
|
||||
b.Title, b.SaveDate, b.PubDate, b.Comments, b.MercuryContent, b.PDFpath,
|
||||
b.Tags)
|
||||
if err != nil {
|
||||
fmt.Println("Could not insert bookmark.")
|
||||
return ret, err
|
||||
}
|
||||
return ret, nil
|
||||
}
|
||||
|
|
BIN
bookmarks.db-journal
Normal file
BIN
bookmarks.db-journal
Normal file
Binary file not shown.
42
cmd.go
42
cmd.go
|
@ -1,42 +0,0 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
func pullPocket(db *sql.DB) {
|
||||
fmt.Println("Getting archive data from Pocket...")
|
||||
|
||||
// Pull data from RSS feed.
|
||||
archiveURL := "https://getpocket.com/users/amdavidson/feed/read"
|
||||
|
||||
resp, err := http.Get(archiveURL)
|
||||
if err != nil {
|
||||
fmt.Println("Could not get archived urls")
|
||||
panic(err)
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
|
||||
// Parse the feed
|
||||
f := Feed{}
|
||||
err = xml.Unmarshal(body, &f)
|
||||
if err != nil {
|
||||
fmt.Println("Could not parse feed")
|
||||
panic(err)
|
||||
}
|
||||
|
||||
for _, bookmark := range f.BookmarkList {
|
||||
if bookmarkExists(bookmark.GUID, db) == false {
|
||||
fmt.Printf("New bookmark url %s\n", bookmark.GUID)
|
||||
ingestURL(bookmark.GUID, db)
|
||||
} else {
|
||||
fmt.Printf("Already know about %s\n", bookmark.GUID)
|
||||
}
|
||||
}
|
||||
}
|
82
ingest.go
82
ingest.go
|
@ -5,23 +5,35 @@ import (
|
|||
"fmt"
|
||||
)
|
||||
|
||||
func ingestJobExists(url string, db *sql.DB) bool {
|
||||
var count int
|
||||
|
||||
err := db.QueryRow("SELECT count() FROM ingest where URL=?", url).Scan(&count)
|
||||
func runIngest(db *sql.DB) {
|
||||
URLs, err := db.Query("SELECT URL FROM ingest")
|
||||
if err != nil {
|
||||
fmt.Println("Could not check ingest table for URL")
|
||||
fmt.Println("Could not get ingest URL list")
|
||||
panic(err)
|
||||
}
|
||||
defer URLs.Close()
|
||||
|
||||
if count > 0 {
|
||||
return true
|
||||
var URL string
|
||||
for URLs.Next() {
|
||||
err := URLs.Scan(&URL)
|
||||
if err != nil {
|
||||
fmt.Println("Could not parse ingest URL record")
|
||||
panic(err)
|
||||
}
|
||||
err = ingestURL(URL, db)
|
||||
if err != nil {
|
||||
fmt.Println("Could not ingest url.")
|
||||
fmt.Println(err)
|
||||
}
|
||||
}
|
||||
err = URLs.Err()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func ingestURL(url string, db *sql.DB) sql.Result {
|
||||
// createIngestJob puts the url into a table to queue for ingesting into the bookmark table.
|
||||
func createIngestJob(url string, db *sql.DB) sql.Result {
|
||||
if ingestJobExists(url, db) {
|
||||
fmt.Println("URL exists in ingest queue")
|
||||
row, err := db.Exec("SELECT * FROM ingest WHERE URL=?", url)
|
||||
|
@ -40,3 +52,51 @@ func ingestURL(url string, db *sql.DB) sql.Result {
|
|||
|
||||
return row
|
||||
}
|
||||
|
||||
// ingestJobExists reports whether url is already queued in the ingest
// table. It panics if the lookup itself fails.
func ingestJobExists(url string, db *sql.DB) bool {
	var n int
	if err := db.QueryRow("SELECT count() FROM ingest where URL=?", url).Scan(&n); err != nil {
		fmt.Println("Could not check ingest table for URL")
		panic(err)
	}
	return n > 0
}
|
||||
|
||||
// Ingests a URL into the bookmarks table
|
||||
func ingestURL(url string, db *sql.DB) error {
|
||||
fmt.Println("Ingesting:", url)
|
||||
|
||||
var in Bookmark
|
||||
|
||||
in.URL = url
|
||||
|
||||
m, err := getMercury(url)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
}
|
||||
|
||||
if m.Title != "" {
|
||||
in.Title = m.Title
|
||||
}
|
||||
|
||||
if m.Content != "" {
|
||||
in.MercuryContent = m.Content
|
||||
}
|
||||
|
||||
if m.DatePublished != "" {
|
||||
in.PubDate = m.DatePublished
|
||||
}
|
||||
|
||||
b, err := addBookmark(db, in)
|
||||
|
||||
fmt.Println("Ingested:", b.Title)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
33
main.go
33
main.go
|
@ -11,11 +11,16 @@ import (
|
|||
type Bookmark struct {
|
||||
// Required
|
||||
Title string `xml:"title"`
|
||||
Link string `xml:"link"`
|
||||
GUID string `xml:"guid"`
|
||||
URL string `xml:"guid"`
|
||||
// Optional
|
||||
PubDate string `xml:"pubDate"`
|
||||
Domain string
|
||||
Author string
|
||||
SaveDate string
|
||||
PubDate string
|
||||
Comments string `xml:"comments"`
|
||||
MercuryContent string
|
||||
PDFpath string
|
||||
Tags string
|
||||
}
|
||||
|
||||
// Feed defines the structure of the RSS feed exported from Pocket
|
||||
|
@ -31,6 +36,24 @@ type Feed struct {
|
|||
BookmarkList []Bookmark `xml:"channel>item"`
|
||||
}
|
||||
|
||||
// Mercury is a data structure to manage hte output of the mercury parsing
|
||||
// Mercury is a data structure to manage the output of the Mercury
// parsing service. JSON tags map the service's snake_case keys onto the
// Go field names; without them the multi-word fields (date_published,
// lead_image_url, word_count, ...) never decode, because Go's json
// matching is case-insensitive on the field name but does not strip
// underscores.
type Mercury struct {
	Title         string `json:"title"`
	Author        string `json:"author"`
	DatePublished string `json:"date_published"`
	Dek           string `json:"dek"`
	LeadImageURL  string `json:"lead_image_url"`
	Content       string `json:"content"`
	NextPageURL   string `json:"next_page_url"`
	URL           string `json:"url"`
	Domain        string `json:"domain"`
	Excerpt       string `json:"excerpt"`
	WordCount     int    `json:"word_count"`
	Direction     string `json:"direction"`
	TotalPages    int    `json:"total_pages"`
	RenderedPages int    `json:"rendered_pages"`
}
|
||||
|
||||
func main() {
|
||||
fmt.Println("Launching Pocket Archive...")
|
||||
|
||||
|
@ -41,7 +64,9 @@ func main() {
|
|||
}
|
||||
defer db.Close()
|
||||
|
||||
pullPocket(db)
|
||||
//pullPocket(db)
|
||||
|
||||
runIngest(db)
|
||||
|
||||
fmt.Println("Pocket Archive exiting.")
|
||||
|
||||
|
|
13
schema.sql
13
schema.sql
|
@ -1,3 +1,14 @@
|
|||
CREATE TABLE bookmarks (url text not null primary key);
|
||||
-- Fully ingested bookmarks, one row per saved URL.
-- Column order matches the INSERT in bookmark.go; all metadata columns
-- are nullable text. "date" presumably holds the save date (bound from
-- Bookmark.SaveDate) -- confirm against addBookmark.
CREATE TABLE bookmarks (
    url text not null primary key,
    domain text,
    title text,
    author text,
    date text,
    pubDate text,
    comments text,
    mercuryContent text,
    pdfpath text,
    tags text
);
-- Queue of URLs waiting to be ingested into the bookmarks table
-- (populated by createIngestJob, drained by runIngest).
CREATE TABLE ingest (url text not null primary key);
|
||||
|
||||
|
|
93
util.go
Normal file
93
util.go
Normal file
|
@ -0,0 +1,93 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"github.com/mauidude/go-readability"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
func pullPocket(db *sql.DB) {
|
||||
fmt.Println("Getting archive data from Pocket...")
|
||||
|
||||
// Pull data from RSS feed.
|
||||
archiveURL := "https://getpocket.com/users/amdavidson/feed/read"
|
||||
|
||||
resp, err := http.Get(archiveURL)
|
||||
if err != nil {
|
||||
fmt.Println("Could not get archived urls")
|
||||
panic(err)
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
|
||||
// Parse the feed
|
||||
f := Feed{}
|
||||
err = xml.Unmarshal(body, &f)
|
||||
if err != nil {
|
||||
fmt.Println("Could not parse feed")
|
||||
panic(err)
|
||||
}
|
||||
|
||||
for _, bookmark := range f.BookmarkList {
|
||||
if bookmarkExists(bookmark.URL, db) == false {
|
||||
fmt.Printf("New bookmark url %s\n", bookmark.URL)
|
||||
createIngestJob(bookmark.URL, db)
|
||||
} else {
|
||||
fmt.Printf("Already know about %s\n", bookmark.URL)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func getMercury(url string) (Mercury, error) {
|
||||
|
||||
var m Mercury
|
||||
|
||||
client := http.Client{}
|
||||
|
||||
reqURL := fmt.Sprintf("https://mercury.postlight.com/parser?url=%s", url)
|
||||
|
||||
req, err := http.NewRequest("GET", reqURL, nil)
|
||||
if err != nil {
|
||||
fmt.Println("Could not make new request")
|
||||
return m, err
|
||||
}
|
||||
|
||||
req.Header.Add("x-api-key", "BM0BDRUyRbZCruv0VPJr3N9DBGGhW1GixHxS1sEB")
|
||||
|
||||
mercuryRet, err := client.Do(req)
|
||||
if err != nil {
|
||||
fmt.Println("Could not request data from Mercury")
|
||||
return m, err
|
||||
}
|
||||
|
||||
mercuryJSON, _ := ioutil.ReadAll(mercuryRet.Body)
|
||||
|
||||
err = json.Unmarshal(mercuryJSON, &m)
|
||||
if err != nil {
|
||||
fmt.Println("Could not parse Mercury output")
|
||||
return m, err
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func getReadability(url string) string {
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
return "Could not get readability data. Could not reach url."
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
html, err := ioutil.ReadAll(resp.Body)
|
||||
|
||||
doc, err := readability.NewDocument(string(html))
|
||||
if err != nil {
|
||||
return "Could not get readability data. Could not parse retrieved data."
|
||||
}
|
||||
|
||||
return doc.Content()
|
||||
}
|
Loading…
Reference in a new issue