diff --git a/bookmark/bookmark.go b/bookmark/bookmark.go
index e69de29..d1b7022 100644
--- a/bookmark/bookmark.go
+++ b/bookmark/bookmark.go
@@ -0,0 +1,22 @@
+package bookmark
+
+import (
+	"database/sql"
+	"fmt"
+)
+
+// BookmarkExists reports whether the bookmarks table already holds a row whose
+// URL column equals url. It is exported because package cmd calls it as
+// bookmark.BookmarkExists. A failed query is fatal: a message is printed and
+// the function panics with the underlying error.
+func BookmarkExists(url string, db *sql.DB) bool {
+	var count int
+
+	err := db.QueryRow("SELECT count() FROM bookmarks where URL=?", url).Scan(&count)
+	if err != nil {
+		fmt.Println("Could not check database for url")
+		panic(err)
+	}
+
+	return count > 0
+}
diff --git a/cmd/cmd.go b/cmd/cmd.go
new file mode 100644
index 0000000..b657eba
--- /dev/null
+++ b/cmd/cmd.go
@@ -0,0 +1,105 @@
+package cmd
+
+import (
+	"database/sql"
+	"encoding/xml"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+
+	"gitlab.amd.im/pocketarchive/bookmark"
+	"gitlab.amd.im/pocketarchive/ingest"
+)
+
+// Make up some data structures into which we can put our feed.
+
+// Bookmark defines the fundamental structure of the items to be archived.
+type Bookmark struct {
+	// Required
+	Title string `xml:"title"`
+	Link  string `xml:"link"`
+	GUID  string `xml:"guid"`
+	// Optional
+	PubDate  string `xml:"pubDate"`
+	Comments string `xml:"comments"`
+}
+
+// Feed defines the structure of the RSS feed exported from Pocket.
+type Feed struct {
+	XMLName xml.Name `xml:"rss"`
+	Version string   `xml:"version,attr"`
+	// Required
+	Title       string `xml:"channel>title"`
+	Link        string `xml:"channel>link"`
+	Description string `xml:"channel>description"`
+	// Optional
+	PubDate      string     `xml:"channel>pubDate"`
+	BookmarkList []Bookmark `xml:"channel>item"`
+}
+
+// PullPocket downloads the Pocket "read" RSS feed, decodes it, and queues an
+// ingest job for every item whose GUID is not yet in the bookmarks table.
+// Any failure to fetch, read, or parse the feed is fatal: a message is printed
+// and the function panics with the underlying error.
+func PullPocket(db *sql.DB) {
+	fmt.Println("Getting archive data from Pocket...")
+
+	// Pull data from RSS feed.
+	archiveURL := "https://getpocket.com/users/amdavidson/feed/read"
+
+	resp, err := http.Get(archiveURL)
+	if err != nil {
+		fmt.Println("Could not get archived urls")
+		panic(err)
+	}
+	defer resp.Body.Close()
+
+	body, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		fmt.Println("Could not read archived urls")
+		panic(err)
+	}
+
+	// Parse the feed
+	f := Feed{}
+	err = xml.Unmarshal(body, &f)
+	if err != nil {
+		fmt.Println("Could not parse feed")
+		panic(err)
+	}
+
+	// The loop variable must not be called "bookmark": that would shadow the
+	// imported bookmark package and hide bookmark.BookmarkExists.
+	for _, item := range f.BookmarkList {
+		if bookmark.BookmarkExists(item.GUID, db) {
+			fmt.Printf("Already know about %s\n", item.GUID)
+			continue
+		}
+		fmt.Printf("New bookmark url %s\n", item.GUID)
+		ingest.CreateIngestJob(item.GUID, db)
+	}
+}
+
+// RunIngest reads every URL queued in the ingest table and passes each one to
+// ingest.IngestURL. A query, scan, or iteration error is fatal: the function
+// panics with the underlying error.
+func RunIngest(db *sql.DB) {
+	rows, err := db.Query("SELECT URL FROM ingest")
+	if err != nil {
+		fmt.Println("Could not get ingest URL list")
+		panic(err)
+	}
+	defer rows.Close()
+
+	var url string
+	for rows.Next() {
+		if err := rows.Scan(&url); err != nil {
+			fmt.Println("Could not parse ingest URL record")
+			panic(err)
+		}
+		ingest.IngestURL(url, db)
+	}
+	if err := rows.Err(); err != nil {
+		panic(err)
+	}
+}
diff --git a/ingest/ingest.go b/ingest/ingest.go
index e69de29..ab30317 100644
--- a/ingest/ingest.go
+++ b/ingest/ingest.go
@@ -0,0 +1,78 @@
+package ingest
+
+import (
+	"database/sql"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+
+	"github.com/mauidude/go-readability"
+)
+
+// ingestJobExists reports whether the ingest table already holds a row whose
+// URL column equals url. A failed query is fatal: a message is printed and the
+// function panics with the underlying error.
+func ingestJobExists(url string, db *sql.DB) bool {
+	var count int
+
+	err := db.QueryRow("SELECT count() FROM ingest where URL=?", url).Scan(&count)
+	if err != nil {
+		fmt.Println("Could not check ingest table for URL")
+		panic(err)
+	}
+
+	return count > 0
+}
+
+// CreateIngestJob puts the url into a table to queue for ingesting into the
+// bookmark table. It is exported because package cmd calls it as
+// ingest.CreateIngestJob. When the url is already queued no row is inserted
+// and the result of executing a SELECT for that url is returned instead.
+// Database errors are fatal: a message is printed and the function panics.
+func CreateIngestJob(url string, db *sql.DB) sql.Result {
+	if ingestJobExists(url, db) {
+		fmt.Println("URL exists in ingest queue")
+		row, err := db.Exec("SELECT * FROM ingest WHERE URL=?", url)
+		if err != nil {
+			fmt.Println("Could not get job from ingest queue")
+			panic(err)
+		}
+		return row
+	}
+
+	row, err := db.Exec("INSERT INTO ingest(url) VALUES (?)", url)
+	if err != nil {
+		fmt.Println("Could not execute insert query")
+		panic(err)
+	}
+
+	return row
+}
+
+// IngestURL fetches url, runs the response body through go-readability, and
+// prints the extracted content. It is exported because package cmd calls it
+// as ingest.IngestURL. The db parameter is accepted but not used yet.
+func IngestURL(url string, db *sql.DB) {
+	fmt.Println("Ingesting:", url)
+	resp, err := http.Get(url)
+	if err != nil {
+		fmt.Println("Could not access bookmarked url:", url)
+		panic(err)
+	}
+	defer resp.Body.Close()
+
+	html, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		fmt.Println("Could not read bookmarked url:", url)
+		panic(err)
+	}
+
+	doc, err := readability.NewDocument(string(html))
+	if err != nil {
+		fmt.Println("Could not parse site data for:", url)
+		// doc may be nil after a failed parse (TODO confirm against the
+		// go-readability docs); skip this URL rather than dereference it.
+		return
+	}
+	fmt.Println("Content:", doc.Content())
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..f9e6bf0
--- /dev/null
+++ b/main.go
@@ -0,0 +1,29 @@
+package main
+
+import (
+	"fmt"
+
+	"gitlab.amd.im/pocketarchive/cmd"
+	pasql "gitlab.amd.im/pocketarchive/sql"
+)
+
+// main opens the bookmark database, pulls new items from Pocket into the
+// ingest queue, and then processes that queue.
+func main() {
+	fmt.Println("Launching Pocket Archive...")
+
+	db, err := pasql.GetDB("./bookmark.db")
+	if err != nil {
+		fmt.Println("Could not open database")
+		// db is unusable here, so stop before the deferred Close and the
+		// commands below can touch it.
+		panic(err)
+	}
+	defer db.Close()
+
+	cmd.PullPocket(db)
+
+	cmd.RunIngest(db)
+
+	fmt.Println("Pocket Archive exiting.")
+}
diff --git a/sql/sql.go b/sql/sql.go
index e69de29..e33f39d 100644
--- a/sql/sql.go
+++ b/sql/sql.go
@@ -0,0 +1,64 @@
+package sql
+
+import (
+	"database/sql"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"strings"
+
+	_ "github.com/mattn/go-sqlite3"
+)
+
+// GetDB opens the SQLite database at path and returns a usable DB instance.
+// When no file exists at path yet, the statements in ./schema.sql are executed
+// against the new database. Every failure is returned as an error, and the
+// returned *sql.DB is nil in that case.
+func GetDB(path string) (*sql.DB, error) {
+	fillDB := false
+	if _, err := os.Stat(path); err != nil {
+		// Only a missing file means "create it"; any other stat failure
+		// (for example a permission error) is reported to the caller.
+		if !os.IsNotExist(err) {
+			return nil, fmt.Errorf("could not stat database %q: %v", path, err)
+		}
+		fmt.Println("Database does not exist, creating and applying schema")
+		fillDB = true
+	}
+
+	db, err := sql.Open("sqlite3", path)
+	if err != nil {
+		return nil, fmt.Errorf("could not open database %q: %v", path, err)
+	}
+
+	if fillDB {
+		if err := applySchema(db, "./schema.sql"); err != nil {
+			db.Close()
+			return nil, err
+		}
+	}
+
+	return db, nil
+}
+
+// applySchema reads the file at schemaPath, splits it on ";" and executes
+// each non-blank statement against db.
+func applySchema(db *sql.DB, schemaPath string) error {
+	file, err := ioutil.ReadFile(schemaPath)
+	if err != nil {
+		return fmt.Errorf("database empty, but could not read schema file: %v", err)
+	}
+
+	for _, request := range strings.Split(string(file), ";") {
+		// Splitting on ";" leaves a blank fragment after the final
+		// statement; there is nothing to execute for it.
+		if strings.TrimSpace(request) == "" {
+			continue
+		}
+		if _, err := db.Exec(request); err != nil {
+			return fmt.Errorf("could not execute %q: %v", request, err)
+		}
+	}
+
+	return nil
+}