Diffstat (limited to 'main.go')
-rw-r--r--	main.go	250
1 file changed, 250 insertions, 0 deletions
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..1910205
--- /dev/null
+++ b/main.go
@@ -0,0 +1,250 @@
+package main
+
+import (
+ "database/sql"
+ "encoding/xml"
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "strconv"
+ "strings"
+ "time"
+
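+	// Imported for its side effect only: registering the "sqlite3" driver
+	// with database/sql.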
+ _ "github.com/mattn/go-sqlite3"
+)
+
+// RSS feed structures. Only the elements we read are declared; encoding/xml
+// ignores everything else in the feed.
+type RSS struct {
+ XMLName xml.Name `xml:"rss"`
+ Channel Channel `xml:"channel"`
+}
+
+type Channel struct {
+ Title string `xml:"title"`
+ Items []Item `xml:"item"`
+}
+
+type Item struct {
+ Title string `xml:"title"`
+ Link string `xml:"link"`
+ Category string `xml:"category"`
+ Enclosure Enclosure `xml:"enclosure"`
+}
+
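+// Enclosure describes the attached media file; its URL is what gets stored
+// in the track's url column.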
+type Enclosure struct {
+ URL string `xml:"url,attr"`
+ Type string `xml:"type,attr"`
+}
+
+// Internal structures
+type YearWeek struct {
+ Year int
+ Week int
+}
+
+type TitleAuthor struct {
+ Title string
+ Author string
+}
+
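+// parseYearWeek splits a feed category of the form "YYYY/WW" into its year
+// and week-number components.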
+func parseYearWeek(date string) (YearWeek, error) {
+ parts := strings.Split(date, "/")
+ if len(parts) != 2 {
+ return YearWeek{}, fmt.Errorf("expected 'YYYY/WW', got '%s'", date)
+ }
+
+ year, err := strconv.Atoi(parts[0])
+ if err != nil {
+ return YearWeek{}, fmt.Errorf("invalid year for date '%s': %w", date, err)
+ }
+
+ week, err := strconv.Atoi(parts[1])
+ if err != nil {
+ return YearWeek{}, fmt.Errorf("invalid week for date '%s': %w", date, err)
+ }
+
+ return YearWeek{Year: year, Week: week}, nil
+}
+
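+// parseTitle splits a feed title of the form "Week - Author - Title". Any
+// extra " - " separators are rejoined into the track title, so a
+// (hypothetical) "Week 1 - Artist - My Song - Extended" keeps
+// "My Song - Extended" as its title.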
+func parseTitle(title string) (TitleAuthor, error) {
+ parts := strings.Split(title, " - ")
+ if len(parts) < 3 {
+ return TitleAuthor{}, fmt.Errorf("expected 'Week - Author - Title', got '%s'", title)
+ }
+
+ return TitleAuthor{Title: strings.Join(parts[2:], " - "), Author: parts[1]}, nil
+}
+
+// For some tracks, like https://weeklybeats.com/keff/music/evil-los-man, the WB RSS feed returns invalid XML bytes. Scrub these.
+func filterInvalidXMLBytes(data []byte) []byte {
+ writePos := 0
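+	// Compact valid bytes in place: writePos trails the read index, so no
+	// extra allocation is needed.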
+ for _, b := range data {
+ if isValidXMLByte(b) {
+ data[writePos] = b
+ writePos++
+ }
+ }
+ return data[:writePos]
+}
+
+func isValidXMLByte(b byte) bool {
+	// XML 1.0 allows tab, LF, CR, and 0x20 upward in the single-byte range;
+	// bytes >= 0x80 belong to multi-byte UTF-8 sequences and pass through.
+	return b == 0x09 || b == 0x0A || b == 0x0D || b >= 0x20
+}
+
+func main() {
+ // Create SQLite database
+ db, err := sql.Open("sqlite3", "weeklybeats.db")
+ if err != nil {
+ log.Fatal("Failed to open database:", err)
+ }
+ defer db.Close()
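+	// sql.Open only validates its arguments; the connection (and the
+	// weeklybeats.db file itself) is created lazily on first use.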
+
+	// Create the table; UNIQUE(author, week, year) means one row per author
+	// per week, so repeated runs cannot store the same track twice.
+ createTable := `
+ CREATE TABLE IF NOT EXISTS tracks (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ title TEXT NOT NULL,
+ link TEXT NOT NULL,
+ author TEXT,
+ week INTEGER,
+ year INTEGER,
+ url TEXT,
+ UNIQUE(author, week, year)
+ );`
+
+ _, err = db.Exec(createTable)
+ if err != nil {
+ log.Fatal("Failed to create table:", err)
+ }
+
+ fmt.Println("Starting to scrape Weekly Beats RSS feed...")
+
+	// Prepare the insert once for the whole run; OR IGNORE turns a duplicate
+	// (same author, week, year) into a no-op instead of an error.
+ insertStmt, err := db.Prepare(`
+ INSERT OR IGNORE INTO tracks (title, link, author, week, year, url)
+ VALUES (?, ?, ?, ?, ?, ?)
+ `)
+ if err != nil {
+ log.Fatal("Failed to prepare insert statement:", err)
+ }
+ defer insertStmt.Close()
+
+ page := 1
+ totalItems := 0
+
+ for {
+ fmt.Printf("Fetching page %d...\n", page)
+
+		// Build the page URL; each request asks for up to 1000 items, and
+		// pages are numbered from 1.
+ url := fmt.Sprintf("https://weeklybeats.com/music/rss?limit=1000&page=%d", page)
+
+ // Fetch RSS feed
+ resp, err := http.Get(url)
+ if err != nil {
+ log.Printf("Failed to fetch page %d: %v", page, err)
+ break
+ }
+
+		if resp.StatusCode != http.StatusOK {
+ fmt.Printf("Received status %d for page %d, stopping\n", resp.StatusCode, page)
+ resp.Body.Close()
+ break
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ resp.Body.Close()
+
+ if err != nil {
+ log.Printf("Failed to read response body for page %d: %v", page, err)
+ break
+ }
+
+ cleanedBody := filterInvalidXMLBytes(body)
+
+ // Parse XML
+ var rss RSS
+ err = xml.Unmarshal(cleanedBody, &rss)
+ if err != nil {
+ log.Printf("Failed to parse XML for page %d: %v", page, err)
+ break
+ }
+
+		// An empty page means we have paged past the end of the feed.
+ if len(rss.Channel.Items) == 0 {
+ fmt.Printf("No items found on page %d, stopping\n", page)
+ break
+ }
+
+ fmt.Printf("Found %d items on page %d\n", len(rss.Channel.Items), page)
+
+ // Insert items into database
+ pageItems := 0
+ for _, item := range rss.Channel.Items {
+ titleAuthor, err := parseTitle(item.Title)
+ if err != nil {
+ log.Printf("Failed to parse title for item '%s': %v", item.Title, err)
+ continue
+ }
+
+ yearWeek, err := parseYearWeek(item.Category)
+ if err != nil {
+ log.Printf("Failed to parse date for item '%s': %v", item.Title, err)
+ continue
+ }
+
+			res, err := insertStmt.Exec(titleAuthor.Title, item.Link, titleAuthor.Author, yearWeek.Week, yearWeek.Year, item.Enclosure.URL)
+			if err != nil {
+				log.Printf("Failed to insert item '%s': %v", item.Title, err)
+				continue
+			}
+			// INSERT OR IGNORE silently skips duplicates, so count only rows
+			// actually written.
+			if n, err := res.RowsAffected(); err == nil && n > 0 {
+				pageItems++
+			}
+ }
+
+ fmt.Printf("Inserted %d new items from page %d\n", pageItems, page)
+ totalItems += pageItems
+
+ // Be respectful to the server
+ time.Sleep(500 * time.Millisecond)
+
+ page++
+
+ // Safety check to prevent infinite loops
+ if page > 1000 {
+ fmt.Println("Reached maximum page limit (1000), stopping")
+ break
+ }
+ }
+
+ // Print summary
+ fmt.Printf("\nScraping complete! Total items processed: %d\n", totalItems)
+
+ // Query and display some statistics
+ var count int
+ err = db.QueryRow("SELECT COUNT(*) FROM tracks").Scan(&count)
+ if err != nil {
+ log.Printf("Failed to count records: %v", err)
+ } else {
+ fmt.Printf("Total records in database: %d\n", count)
+ }
+
+ // Show sample of data
+ fmt.Println("\nSample of stored data:")
+ rows, err := db.Query("SELECT title, author, link FROM tracks LIMIT 5")
+ if err != nil {
+ log.Printf("Failed to query sample data: %v", err)
+ return
+ }
+ defer rows.Close()
+
+	for rows.Next() {
+		var title, author, link string
+		if err := rows.Scan(&title, &author, &link); err != nil {
+			log.Printf("Failed to scan row: %v", err)
+			continue
+		}
+		fmt.Printf("- %s by %s (%s)\n", title, author, link)
+	}
+	// Surface any error that ended the iteration early.
+	if err := rows.Err(); err != nil {
+		log.Printf("Row iteration error: %v", err)
+	}
+}