cleanup commit

This commit is contained in:
Andrew Davidson 2023-01-02 18:13:08 -08:00
parent 805b37ed5a
commit 8daec1baf7
8 changed files with 225 additions and 58 deletions

1
.gitignore vendored
View file

@ -1 +1,2 @@
bookie
*.db

View file

@ -20,3 +20,22 @@ func bookmarkExists(url string, db *sql.DB) bool {
return false
}
// addBookmark inserts b into the bookmarks table and returns the bookmark
// that was stored.
//
// Every column of the bookmarks table is written (url, domain, title, author,
// date, pubDate, comments, mercuryContent, pdfpath, tags); the argument list
// is kept in exactly that order so each field lands in its own column.
//
// On failure a message is printed and the zero Bookmark is returned together
// with the error reported by db.Exec.
func addBookmark(db *sql.DB, b Bookmark) (Bookmark, error) {
	fmt.Println("Got request to add:", b.Title)

	_, err := db.Exec(`
		INSERT INTO bookmarks(
			url, domain, title, author, date, pubDate, comments,
			mercuryContent, pdfpath, tags
		) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);`,
		b.URL, b.Domain, b.Title, b.Author, b.SaveDate, b.PubDate,
		b.Comments, b.MercuryContent, b.PDFpath, b.Tags)
	if err != nil {
		fmt.Println("Could not insert bookmark.")
		return Bookmark{}, err
	}

	// Hand the stored record back so callers can report on what was saved.
	return b, nil
}

BIN
bookmarks.db-journal Normal file

Binary file not shown.

42
cmd.go
View file

@ -1,42 +0,0 @@
package main
import (
"database/sql"
"encoding/xml"
"fmt"
"io/ioutil"
"net/http"
)
// pullPocket downloads the Pocket archive RSS feed and ingests every item
// that bookmarkExists does not already report as known.
//
// Failing to fetch, read or parse the feed prints a short message and panics.
func pullPocket(db *sql.DB) {
	fmt.Println("Getting archive data from Pocket...")

	// Pull data from RSS feed.
	archiveURL := "https://getpocket.com/users/amdavidson/feed/read"
	resp, err := http.Get(archiveURL)
	if err != nil {
		fmt.Println("Could not get archived urls")
		panic(err)
	}
	defer resp.Body.Close()

	// A truncated body would otherwise surface later as a confusing XML
	// parse failure, so stop here if the read itself failed.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("Could not read feed")
		panic(err)
	}

	// Parse the feed
	f := Feed{}
	if err := xml.Unmarshal(body, &f); err != nil {
		fmt.Println("Could not parse feed")
		panic(err)
	}

	for _, bookmark := range f.BookmarkList {
		if bookmarkExists(bookmark.GUID, db) {
			fmt.Printf("Already know about %s\n", bookmark.GUID)
			continue
		}
		fmt.Printf("New bookmark url %s\n", bookmark.GUID)
		ingestURL(bookmark.GUID, db)
	}
}

View file

@ -5,23 +5,35 @@ import (
"fmt"
)
// NOTE(review): this span appears to be a diff hunk whose +/- markers are not
// visible: lines from the previous ingestJobExists body are interleaved with
// the lines of runIngest, so it does not read as a single compilable function.
func ingestJobExists(url string, db *sql.DB) bool {
var count int
err := db.QueryRow("SELECT count() FROM ingest where URL=?", url).Scan(&count)
// runIngest selects every URL from the ingest table and passes each one to
// ingestURL. A failing query, row scan or row iteration panics; an error
// returned by ingestURL is only printed and the loop moves on to the next URL.
func runIngest(db *sql.DB) {
URLs, err := db.Query("SELECT URL FROM ingest")
if err != nil {
fmt.Println("Could not check ingest table for URL")
fmt.Println("Could not get ingest URL list")
panic(err)
}
defer URLs.Close()
if count > 0 {
return true
var URL string
for URLs.Next() {
err := URLs.Scan(&URL)
if err != nil {
fmt.Println("Could not parse ingest URL record")
panic(err)
}
// Ingest failures are reported but do not abort the remaining rows.
err = ingestURL(URL, db)
if err != nil {
fmt.Println("Could not ingest url.")
fmt.Println(err)
}
}
// Surface any error that ended the row iteration early.
err = URLs.Err()
if err != nil {
panic(err)
}
return false
}
func ingestURL(url string, db *sql.DB) sql.Result {
// createIngestJob puts the url into a table to queue for ingesting into the bookmark table.
func createIngestJob(url string, db *sql.DB) sql.Result {
if ingestJobExists(url, db) {
fmt.Println("URL exists in ingest queue")
row, err := db.Exec("SELECT * FROM ingest WHERE URL=?", url)
@ -40,3 +52,51 @@ func ingestURL(url string, db *sql.DB) sql.Result {
return row
}
// ingestJobExists reports whether url already has a row in the ingest table.
// It prints a message and panics when the count query cannot be run or
// scanned.
func ingestJobExists(url string, db *sql.DB) bool {
	var matches int

	row := db.QueryRow("SELECT count() FROM ingest where URL=?", url)
	if scanErr := row.Scan(&matches); scanErr != nil {
		fmt.Println("Could not check ingest table for URL")
		panic(scanErr)
	}

	return matches > 0
}
// ingestURL fetches Mercury metadata for url and stores the resulting
// bookmark through addBookmark.
//
// A Mercury failure is treated as best effort: the error is printed and the
// bookmark is still stored with whatever fields were returned (possibly only
// the URL). A failure to store the bookmark is returned to the caller; nil is
// returned once the bookmark has been added.
func ingestURL(url string, db *sql.DB) error {
	fmt.Println("Ingesting:", url)

	in := Bookmark{URL: url}

	m, err := getMercury(url)
	if err != nil {
		// Keep going: a bookmark with only its URL is still worth recording.
		fmt.Println(err)
	}
	// Fields Mercury did not supply are empty strings, which matches the
	// zero values already held by in.
	in.Title = m.Title
	in.MercuryContent = m.Content
	in.PubDate = m.DatePublished

	b, err := addBookmark(db, in)
	if err != nil {
		return fmt.Errorf("adding bookmark %q: %w", url, err)
	}

	fmt.Println("Ingested:", b.Title)
	return nil
}

35
main.go
View file

@ -11,11 +11,16 @@ import (
// Bookmark describes one saved link together with the metadata kept for it.
// NOTE(review): this span appears to be a diff hunk without visible +/-
// markers; Link/GUID and the first PubDate/Comments pair look like the
// previous fields, which is why some names are declared twice here.
type Bookmark struct {
// Required
Title string `xml:"title"`
Link string `xml:"link"`
GUID string `xml:"guid"`
// URL is decoded from the feed item's guid element.
URL string `xml:"guid"`
// Optional
PubDate string `xml:"pubDate"`
Comments string `xml:"comments"`
// The untagged fields below carry no xml mapping; presumably they are filled
// in during ingestion rather than from the feed — confirm against callers.
Domain string
Author string
SaveDate string
PubDate string
Comments string `xml:"comments"`
MercuryContent string
PDFpath string
Tags string
}
// Feed defines the structure of the RSS feed exported from Pocket
@ -31,6 +36,24 @@ type Feed struct {
BookmarkList []Bookmark `xml:"channel>item"`
}
// Mercury is a data structure to manage the output of the Mercury parsing
// service.
//
// The service names its JSON keys in snake_case (for example date_published
// and word_count). encoding/json only matches keys to field names
// case-insensitively and does not strip underscores, so every field carries
// an explicit json tag to make sure it is actually populated on decode.
type Mercury struct {
	Title         string `json:"title"`
	Author        string `json:"author"`
	DatePublished string `json:"date_published"`
	Dek           string `json:"dek"`
	LeadImageURL  string `json:"lead_image_url"`
	Content       string `json:"content"`
	NextPageURL   string `json:"next_page_url"`
	URL           string `json:"url"`
	Domain        string `json:"domain"`
	Excerpt       string `json:"excerpt"`
	WordCount     int    `json:"word_count"`
	Direction     string `json:"direction"`
	TotalPages    int    `json:"total_pages"`
	RenderedPages int    `json:"rendered_pages"`
}
func main() {
fmt.Println("Launching Pocket Archive...")
@ -41,7 +64,9 @@ func main() {
}
defer db.Close()
pullPocket(db)
//pullPocket(db)
runIngest(db)
fmt.Println("Pocket Archive exiting.")

View file

@ -1,3 +1,14 @@
CREATE TABLE bookmarks (url text not null primary key);
-- bookmarks holds one row per stored bookmark, keyed by its URL; all other
-- columns are optional text.
CREATE TABLE bookmarks (
url text not null primary key,
domain text,
title text,
author text,
date text,
pubDate text,
comments text,
mercuryContent text,
pdfpath text,
tags text
);
-- ingest holds the URLs queued for ingestion, one row per URL.
CREATE TABLE ingest (url text not null primary key);

93
util.go Normal file
View file

@ -0,0 +1,93 @@
package main
import (
"database/sql"
"encoding/json"
"encoding/xml"
"fmt"
"github.com/mauidude/go-readability"
"io/ioutil"
"net/http"
)
// pullPocket downloads the Pocket archive RSS feed and creates an ingest job
// for every item that bookmarkExists does not already report as known.
//
// Failing to fetch, read or parse the feed prints a short message and panics.
func pullPocket(db *sql.DB) {
	fmt.Println("Getting archive data from Pocket...")

	// Pull data from RSS feed.
	archiveURL := "https://getpocket.com/users/amdavidson/feed/read"
	resp, err := http.Get(archiveURL)
	if err != nil {
		fmt.Println("Could not get archived urls")
		panic(err)
	}
	defer resp.Body.Close()

	// A truncated body would otherwise surface later as a confusing XML
	// parse failure, so stop here if the read itself failed.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("Could not read feed")
		panic(err)
	}

	// Parse the feed
	f := Feed{}
	if err := xml.Unmarshal(body, &f); err != nil {
		fmt.Println("Could not parse feed")
		panic(err)
	}

	for _, bookmark := range f.BookmarkList {
		if bookmarkExists(bookmark.URL, db) {
			fmt.Printf("Already know about %s\n", bookmark.URL)
			continue
		}
		fmt.Printf("New bookmark url %s\n", bookmark.URL)
		createIngestJob(bookmark.URL, db)
	}
}
// getMercury asks the Mercury parser service to extract article data for url
// and decodes the JSON reply into a Mercury value.
//
// A message is printed and the error returned when the request cannot be
// built or sent, when the service answers with a non-200 status, or when the
// reply cannot be read or parsed. The returned Mercury should not be relied
// upon when the error is non-nil.
func getMercury(url string) (Mercury, error) {
	var m Mercury

	req, err := http.NewRequest("GET", "https://mercury.postlight.com/parser", nil)
	if err != nil {
		fmt.Println("Could not make new request")
		return m, err
	}

	// Attach the target as an encoded query parameter; pasting it in raw
	// would let any '&', '#' or '?' inside url corrupt the request.
	query := req.URL.Query()
	query.Set("url", url)
	req.URL.RawQuery = query.Encode()

	// NOTE(review): the API key is hard-coded in source; consider loading it
	// from configuration or the environment instead.
	req.Header.Add("x-api-key", "BM0BDRUyRbZCruv0VPJr3N9DBGGhW1GixHxS1sEB")

	client := http.Client{}
	mercuryRet, err := client.Do(req)
	if err != nil {
		fmt.Println("Could not request data from Mercury")
		return m, err
	}
	// Always release the connection, whatever happens below.
	defer mercuryRet.Body.Close()

	if mercuryRet.StatusCode != http.StatusOK {
		fmt.Println("Could not request data from Mercury")
		return m, fmt.Errorf("mercury parser returned status %s", mercuryRet.Status)
	}

	mercuryJSON, err := ioutil.ReadAll(mercuryRet.Body)
	if err != nil {
		fmt.Println("Could not read Mercury output")
		return m, err
	}

	if err := json.Unmarshal(mercuryJSON, &m); err != nil {
		fmt.Println("Could not parse Mercury output")
		return m, err
	}
	return m, nil
}
// getReadability downloads url and returns the content produced by the
// readability package for that page.
//
// Failures are reported in-band: when the page cannot be fetched, read or
// parsed, the returned string is a human-readable message describing which
// step failed.
func getReadability(url string) string {
	resp, err := http.Get(url)
	if err != nil {
		return "Could not get readability data. Could not reach url."
	}
	defer resp.Body.Close()

	// Check the read before parsing so a broken download is not handed to
	// the parser as if it were a complete page.
	html, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "Could not get readability data. Could not read response body."
	}

	doc, err := readability.NewDocument(string(html))
	if err != nil {
		return "Could not get readability data. Could not parse retrieved data."
	}
	return doc.Content()
}