diff --git a/.gitignore b/.gitignore index 98e6ef6..e5c15f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ +bookie *.db diff --git a/bookmark.go b/bookmark.go index ed0a47f..ff0dc9c 100644 --- a/bookmark.go +++ b/bookmark.go @@ -20,3 +20,22 @@ func bookmarkExists(url string, db *sql.DB) bool { return false } + +func addBookmark(db *sql.DB, b Bookmark) (Bookmark, error) { + fmt.Println("Got request to add:", b.Title) + + var ret Bookmark + + _, err := db.Exec(` + INSERT INTO bookmarks( + url, title, author, date, pubDate, comments, mercuryContent, pdfpath, + tags + ) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?);`, b.URL, b.Domain, b.Author, + b.Title, b.SaveDate, b.PubDate, b.Comments, b.MercuryContent, b.PDFpath, + b.Tags) + if err != nil { + fmt.Println("Could not insert bookmark.") + return ret, err + } + return ret, nil +} diff --git a/bookmarks.db-journal b/bookmarks.db-journal new file mode 100644 index 0000000..66f80bd Binary files /dev/null and b/bookmarks.db-journal differ diff --git a/cmd.go b/cmd.go deleted file mode 100644 index cfae017..0000000 --- a/cmd.go +++ /dev/null @@ -1,42 +0,0 @@ -package main - -import ( - "database/sql" - "encoding/xml" - "fmt" - "io/ioutil" - "net/http" -) - -func pullPocket(db *sql.DB) { - fmt.Println("Getting archive data from Pocket...") - - // Pull data from RSS feed. 
- archiveURL := "https://getpocket.com/users/amdavidson/feed/read" - - resp, err := http.Get(archiveURL) - if err != nil { - fmt.Println("Could not get archived urls") - panic(err) - } - - defer resp.Body.Close() - body, err := ioutil.ReadAll(resp.Body) - - // Parse the feed - f := Feed{} - err = xml.Unmarshal(body, &f) - if err != nil { - fmt.Println("Could not parse feed") - panic(err) - } - - for _, bookmark := range f.BookmarkList { - if bookmarkExists(bookmark.GUID, db) == false { - fmt.Printf("New bookmark url %s\n", bookmark.GUID) - ingestURL(bookmark.GUID, db) - } else { - fmt.Printf("Already know about %s\n", bookmark.GUID) - } - } -} diff --git a/ingest.go b/ingest.go index 7329d54..b17515a 100644 --- a/ingest.go +++ b/ingest.go @@ -5,23 +5,35 @@ import ( "fmt" ) -func ingestJobExists(url string, db *sql.DB) bool { - var count int - - err := db.QueryRow("SELECT count() FROM ingest where URL=?", url).Scan(&count) +func runIngest(db *sql.DB) { + URLs, err := db.Query("SELECT URL FROM ingest") if err != nil { - fmt.Println("Could not check ingest table for URL") + fmt.Println("Could not get ingest URL list") panic(err) } + defer URLs.Close() - if count > 0 { - return true + var URL string + for URLs.Next() { + err := URLs.Scan(&URL) + if err != nil { + fmt.Println("Could not parse ingest URL record") + panic(err) + } + err = ingestURL(URL, db) + if err != nil { + fmt.Println("Could not ingest url.") + fmt.Println(err) + } + } + err = URLs.Err() + if err != nil { + panic(err) } - - return false } -func ingestURL(url string, db *sql.DB) sql.Result { +// createIngestJob puts the url into a table to queue for ingesting into the bookmark table. 
+func createIngestJob(url string, db *sql.DB) sql.Result { if ingestJobExists(url, db) { fmt.Println("URL exists in ingest queue") row, err := db.Exec("SELECT * FROM ingest WHERE URL=?", url) @@ -40,3 +52,51 @@ func ingestURL(url string, db *sql.DB) sql.Result { return row } + +func ingestJobExists(url string, db *sql.DB) bool { + var count int + + err := db.QueryRow("SELECT count() FROM ingest where URL=?", url).Scan(&count) + if err != nil { + fmt.Println("Could not check ingest table for URL") + panic(err) + } + + if count > 0 { + return true + } + + return false +} + +// Ingests a URL into the bookmarks table +func ingestURL(url string, db *sql.DB) error { + fmt.Println("Ingesting:", url) + + var in Bookmark + + in.URL = url + + m, err := getMercury(url) + if err != nil { + fmt.Println(err) + } + + if m.Title != "" { + in.Title = m.Title + } + + if m.Content != "" { + in.MercuryContent = m.Content + } + + if m.DatePublished != "" { + in.PubDate = m.DatePublished + } + + b, err := addBookmark(db, in) + + fmt.Println("Ingested:", b.Title) + + return nil +} diff --git a/main.go b/main.go index cf27a04..abf7aef 100644 --- a/main.go +++ b/main.go @@ -11,11 +11,16 @@ import ( type Bookmark struct { // Required Title string `xml:"title"` - Link string `xml:"link"` - GUID string `xml:"guid"` + URL string `xml:"guid"` // Optional - PubDate string `xml:"pubDate"` - Comments string `xml:"comments"` + Domain string + Author string + SaveDate string + PubDate string + Comments string `xml:"comments"` + MercuryContent string + PDFpath string + Tags string } // Feed defines the structure of the RSS feed exported from Pocket @@ -31,6 +36,24 @@ type Feed struct { BookmarkList []Bookmark `xml:"channel>item"` } +// Mercury is a data structure to manage the output of the mercury parsing +type Mercury struct { + Title string + Author string + DatePublished string + Dek string + LeadImageURL string + Content string + NextPageURL string + URL string + Domain string + Excerpt string + 
WordCount int + Direction string + TotalPages int + RenderedPages int +} + func main() { fmt.Println("Launching Pocket Archive...") @@ -41,7 +64,9 @@ func main() { } defer db.Close() - pullPocket(db) + //pullPocket(db) + + runIngest(db) fmt.Println("Pocket Archive exiting.") diff --git a/schema.sql b/schema.sql index 68a0c40..1c8ef97 100644 --- a/schema.sql +++ b/schema.sql @@ -1,3 +1,14 @@ -CREATE TABLE bookmarks (url text not null primary key); +CREATE TABLE bookmarks ( + url text not null primary key, + domain text, + title text, + author text, + date text, + pubDate text, + comments text, + mercuryContent text, + pdfpath text, + tags text +); CREATE TABLE ingest (url text not null primary key); diff --git a/util.go b/util.go new file mode 100644 index 0000000..b96575d --- /dev/null +++ b/util.go @@ -0,0 +1,93 @@ +package main + +import ( + "database/sql" + "encoding/json" + "encoding/xml" + "fmt" + "github.com/mauidude/go-readability" + "io/ioutil" + "net/http" +) + +func pullPocket(db *sql.DB) { + fmt.Println("Getting archive data from Pocket...") + + // Pull data from RSS feed. 
+ archiveURL := "https://getpocket.com/users/amdavidson/feed/read" + + resp, err := http.Get(archiveURL) + if err != nil { + fmt.Println("Could not get archived urls") + panic(err) + } + + defer resp.Body.Close() + body, err := ioutil.ReadAll(resp.Body) + + // Parse the feed + f := Feed{} + err = xml.Unmarshal(body, &f) + if err != nil { + fmt.Println("Could not parse feed") + panic(err) + } + + for _, bookmark := range f.BookmarkList { + if bookmarkExists(bookmark.URL, db) == false { + fmt.Printf("New bookmark url %s\n", bookmark.URL) + createIngestJob(bookmark.URL, db) + } else { + fmt.Printf("Already know about %s\n", bookmark.URL) + } + } +} + +func getMercury(url string) (Mercury, error) { + + var m Mercury + + client := http.Client{} + + reqURL := fmt.Sprintf("https://mercury.postlight.com/parser?url=%s", url) + + req, err := http.NewRequest("GET", reqURL, nil) + if err != nil { + fmt.Println("Could not make new request") + return m, err + } + + req.Header.Add("x-api-key", "BM0BDRUyRbZCruv0VPJr3N9DBGGhW1GixHxS1sEB") + + mercuryRet, err := client.Do(req) + if err != nil { + fmt.Println("Could not request data from Mercury") + return m, err + } + + mercuryJSON, _ := ioutil.ReadAll(mercuryRet.Body) + + err = json.Unmarshal(mercuryJSON, &m) + if err != nil { + fmt.Println("Could not parse Mercury output") + return m, err + } + + return m, nil +} + +func getReadability(url string) string { + resp, err := http.Get(url) + if err != nil { + return "Could not get readability data. Could not reach url." + } + defer resp.Body.Close() + html, err := ioutil.ReadAll(resp.Body) + + doc, err := readability.NewDocument(string(html)) + if err != nil { + return "Could not get readability data. Could not parse retrieved data." + } + + return doc.Content() +}