package main

import (
	"database/sql"
	"encoding/xml"
	"fmt"
	"io"
	"log"
	"net/http"
	"strconv"
	"strings"
	"time"

	_ "github.com/ncruces/go-sqlite3/driver"
	_ "github.com/ncruces/go-sqlite3/embed"
)

// RSS structures
type RSS struct {
	XMLName xml.Name `xml:"rss"`
	Channel Channel  `xml:"channel"`
}

type Channel struct {
	Title string `xml:"title"`
	Items []Item `xml:"item"`
}

type Item struct {
	Title     string    `xml:"title"`
	Link      string    `xml:"link"`
	Category  string    `xml:"category"`
	Enclosure Enclosure `xml:"enclosure"`
}

type Enclosure struct {
	URL  string `xml:"url,attr"`
	Type string `xml:"type,attr"`
}

// Internal structures
type YearWeek struct {
	Year int
	Week int
}

type TitleAuthor struct {
	Title  string
	Author string
}

// parseYearWeek parses a feed category of the form "YYYY/WW".
func parseYearWeek(date string) (YearWeek, error) {
	parts := strings.Split(date, "/")
	if len(parts) != 2 {
		return YearWeek{}, fmt.Errorf("expected 'YYYY/WW', got '%s'", date)
	}
	year, err := strconv.Atoi(parts[0])
	if err != nil {
		return YearWeek{}, fmt.Errorf("invalid year for date '%s': %w", date, err)
	}
	week, err := strconv.Atoi(parts[1])
	if err != nil {
		return YearWeek{}, fmt.Errorf("invalid week for date '%s': %w", date, err)
	}
	return YearWeek{Year: year, Week: week}, nil
}

// parseTitle splits a feed title of the form "Week - Author - Title".
// Track titles may themselves contain " - ", so everything after the
// second separator is kept as the title.
func parseTitle(title string) (TitleAuthor, error) {
	parts := strings.Split(title, " - ")
	if len(parts) < 3 {
		return TitleAuthor{}, fmt.Errorf("expected 'Week - Author - Title', got '%s'", title)
	}
	return TitleAuthor{Title: strings.Join(parts[2:], " - "), Author: parts[1]}, nil
}

// For some tracks, like https://weeklybeats.com/keff/music/evil-los-man,
// the WB RSS feed returns invalid XML bytes. Scrub these in place.
func filterInvalidXMLBytes(data []byte) []byte {
	writePos := 0
	for _, b := range data {
		if isValidXMLByte(b) {
			data[writePos] = b
			writePos++
		}
	}
	return data[:writePos]
}

func isValidXMLByte(b byte) bool {
	// Valid XML 1.0 characters for the single-byte range
	return b == 0x09 || b == 0x0A || b == 0x0D || b >= 0x20
}

func main() {
	fmt.Println("Program started")

	// Open (or create) the SQLite database.
	db, err := sql.Open("sqlite3", "weeklybeats.db")
	if err != nil {
		log.Fatal("Failed to open database:", err)
	}
	defer db.Close()

	// Create the tracks table. The UNIQUE constraint lets re-runs skip
	// tracks that were already scraped (via INSERT OR IGNORE below).
	createTable := `
	CREATE TABLE IF NOT EXISTS tracks (
		id INTEGER PRIMARY KEY AUTOINCREMENT,
		title TEXT NOT NULL,
		link TEXT NOT NULL,
		author TEXT,
		week INTEGER,
		year INTEGER,
		url TEXT,
		UNIQUE(author, week, year)
	);`
	_, err = db.Exec(createTable)
	if err != nil {
		log.Fatal("Failed to create table:", err)
	}

	fmt.Println("Starting to scrape Weekly Beats RSS feed...")

	// Prepare insert statement
	insertStmt, err := db.Prepare(`
	INSERT OR IGNORE INTO tracks (title, link, author, week, year, url)
	VALUES (?, ?, ?, ?, ?, ?)
	`)
	if err != nil {
		log.Fatal("Failed to prepare insert statement:", err)
	}
	defer insertStmt.Close()

	// Weekly Beats runs every other year: 2012, 2014, ..., 2024.
	year := 2012
	totalItems := 0
	for year <= 2024 {
		page := 1
		for {
			fmt.Printf("Fetching page %d for year %d...\n", page, year)

			// Construct URL with page parameter
			url := fmt.Sprintf("https://weeklybeats.com/music/rss?limit=1000&year=%d&page=%d", year, page)

			// Fetch RSS feed
			resp, err := http.Get(url)
			if err != nil {
				log.Printf("Failed to fetch page %d: %v", page, err)
				break
			}
			if resp.StatusCode != http.StatusOK {
				fmt.Printf("Received status %d for page %d, stopping\n", resp.StatusCode, page)
				resp.Body.Close()
				break
			}

			body, err := io.ReadAll(resp.Body)
			resp.Body.Close()
			if err != nil {
				log.Printf("Failed to read response body for page %d: %v", page, err)
				break
			}

			cleanedBody := filterInvalidXMLBytes(body)

			// Parse XML
			var rss RSS
			err = xml.Unmarshal(cleanedBody, &rss)
			if err != nil {
				log.Printf("Failed to parse XML for page %d: %v", page, err)
				break
			}

			// An empty page means we've paged past the last track for this year.
			if len(rss.Channel.Items) == 0 {
				fmt.Printf("No items found on page %d, stopping\n", page)
				break
			}
			fmt.Printf("Found %d items on page %d\n", len(rss.Channel.Items), page)

			// Insert items into the database.
			pageItems := 0
			for _, item := range rss.Channel.Items {
				titleAuthor, err := parseTitle(item.Title)
				if err != nil {
					log.Printf("Failed to parse title for item '%s': %v", item.Title, err)
					continue
				}
				yearWeek, err := parseYearWeek(item.Category)
				if err != nil {
					log.Printf("Failed to parse date for item '%s': %v", item.Title, err)
					continue
				}
				_, err = insertStmt.Exec(titleAuthor.Title, item.Link, titleAuthor.Author, yearWeek.Week, yearWeek.Year, item.Enclosure.URL)
				if err != nil {
					log.Printf("Failed to insert item '%s': %v", item.Title, err)
					continue
				}
				pageItems++
			}

			fmt.Printf("Inserted %d new items from page %d\n", pageItems, page)
			totalItems += pageItems

			// Be respectful to the server
			time.Sleep(500 * time.Millisecond)

			page++
			// Safety check to prevent infinite loops
			if page > 1000 {
				fmt.Println("Reached maximum page limit (1000), stopping")
				break
			}
		}
		year += 2
	}

	// Print summary
	fmt.Printf("\nScraping complete! Total items processed: %d\n", totalItems)

	// Query and display some statistics
	var count int
	err = db.QueryRow("SELECT COUNT(*) FROM tracks").Scan(&count)
	if err != nil {
		log.Printf("Failed to count records: %v", err)
	} else {
		fmt.Printf("Total records in database: %d\n", count)
	}

	// Show a sample of the stored data.
	fmt.Println("\nSample of stored data:")
	rows, err := db.Query("SELECT title, author, link FROM tracks LIMIT 5")
	if err != nil {
		log.Printf("Failed to query sample data: %v", err)
		return
	}
	defer rows.Close()
	for rows.Next() {
		var title, author, link string
		if err := rows.Scan(&title, &author, &link); err != nil {
			log.Printf("Failed to scan row: %v", err)
			continue
		}
		fmt.Printf("- %s by %s (%s)\n", title, author, link)
	}
	if err := rows.Err(); err != nil {
		log.Printf("Row iteration error: %v", err)
	}
}