package main import ( "database/sql" "encoding/xml" "fmt" "io" "log" "net/http" "net/url" "strconv" "strings" "time" _ "github.com/ncruces/go-sqlite3/driver" _ "github.com/ncruces/go-sqlite3/embed" "github.com/PuerkitoBio/goquery" ) const ( artistFetchDelay = 250 * time.Millisecond pageFetchDelay = 500 * time.Millisecond maxPages = 1000 itemsPerPage = 1000 ) // RSS structures type RSS struct { XMLName xml.Name `xml:"rss"` Channel Channel `xml:"channel"` } type Channel struct { Title string `xml:"title"` Items []Item `xml:"item"` } type Item struct { Title string `xml:"title"` Author string `xml:"https://www.itunes.com/dtds/podcast-1.0.dtd author"` Link string `xml:"link"` Category string `xml:"category"` Enclosure Enclosure `xml:"enclosure"` } type Enclosure struct { URL string `xml:"url,attr"` Type string `xml:"type,attr"` } // Internal structures type YearWeek struct { Year int Week int } type TitleAuthor struct { Title string Author string } func extractUsername(trackURL string) (string, error) { u, err := url.Parse(trackURL) if err != nil { return "", err } parts := strings.Split(strings.Trim(u.Path, "/"), "/") if len(parts) < 1 { return "", fmt.Errorf("invalid URL format: %s", trackURL) } return parts[0], nil } func fetchArtistName(trackURL string) (string, error) { resp, err := http.Get(trackURL) if err != nil { return "", err } defer resp.Body.Close() if resp.StatusCode != 200 { return "", fmt.Errorf("HTTP %d", resp.StatusCode) } doc, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { return "", err } artistName := strings.TrimSpace(strings.TrimPrefix(doc.Find("#item_user a").Text(), "By ")) if artistName == "" { return "", fmt.Errorf("artist name not found") } return artistName, nil } func parseYearWeek(date string) (YearWeek, error) { parts := strings.Split(date, "/") if len(parts) != 2 { return YearWeek{}, fmt.Errorf("expected 'YYYY/WW', got '%s'", date) } year, err := strconv.Atoi(parts[0]) if err != nil { return YearWeek{}, fmt.Errorf("invalid year for date '%s': %w", date, err) } week, err := strconv.Atoi(parts[1]) if err != nil { return YearWeek{}, fmt.Errorf("invalid week for date '%s': %w", date, err) } return YearWeek{Year: year, Week: week}, nil } func parseTitle(title string) (string, error) { parts := strings.Split(title, " - ") if len(parts) < 3 { return "", fmt.Errorf("expected 'Week - Author - Title', got '%s'", title) } return strings.Join(parts[2:], " - "), nil } // For some tracks, like https://weeklybeats.com/keff/music/evil-los-man, the WB RSS feed returns invalid XML bytes. Scrub these. func filterInvalidXMLBytes(data []byte) []byte { writePos := 0 for _, b := range data { if isValidXMLByte(b) { data[writePos] = b writePos++ } } return data[:writePos] } func isValidXMLByte(b byte) bool { // Valid XML 1.0 characters for single-byte range return b == 0x09 || b == 0x0A || b == 0x0D || b >= 0x20 } func fetchRSSPage(year, page int) (*RSS, error) { url := fmt.Sprintf("https://weeklybeats.com/music/rss?limit=%d&year=%d&page=%d", itemsPerPage, year, page) resp, err := http.Get(url) if err != nil { return nil, fmt.Errorf("failed to fetch: %w", err) } defer resp.Body.Close() if resp.StatusCode != 200 { return nil, fmt.Errorf("received status %d", resp.StatusCode) } body, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("failed to read response: %w", err) } cleanedBody := filterInvalidXMLBytes(body) var rss RSS err = xml.Unmarshal(cleanedBody, &rss) if err != nil { return nil, fmt.Errorf("failed to parse XML: %w", err) } return &rss, nil } func getArtistNameFromTitle(title string) (string, error) { titleAuthor, err := parseTitle(title) if err != nil { return "", err } return titleAuthor.Author, nil } func resolveArtistName(item Item, cache map[string]string) (string, error) { username, err := extractUsername(item.Link) if err != nil { log.Printf("Failed to extract username from '%s': %v", item.Link, err) return getArtistNameFromTitle(item.Title) } // Try cache first if artistName := cache[username]; artistName != "" { return artistName, nil } // Not in cache, fetch from track page artistName, err := fetchArtistName(item.Link) if err != nil { log.Printf("Failed to fetch artist for '%s': %v, using RSS fallback", username, err) return getArtistNameFromTitle(item.Title) } cache[username] = artistName fmt.Printf("Cached artist name '%s' for username '%s'\n", artistName, username) time.Sleep(artistFetchDelay) return artistName, nil } func insertItems(insertStmt *sql.Stmt, items []Item, cache map[string]string) int { inserted := 0 for _, item := range items { artistName, err := resolveArtistName(item, cache) if err != nil { log.Printf("Failed to resolve artist for item '%s': %v", item.Title, err) continue } titleAuthor, err := parseTitle(item.Title) if err != nil { log.Printf("Failed to parse title for item '%s': %v", item.Title, err) continue } yearWeek, err := parseYearWeek(item.Category) if err != nil { log.Printf("Failed to parse date for item '%s': %v", item.Title, err) continue } _, err = insertStmt.Exec(titleAuthor.Title, item.Link, artistName, yearWeek.Week, yearWeek.Year, item.Enclosure.URL) if err != nil { log.Printf("Failed to insert item '%s': %v", item.Title, err) continue } inserted++ } return inserted } func scrapeYear(year int, insertStmt *sql.Stmt, cache map[string]string) int { totalItems := 0 page := 1 for { fmt.Printf("Fetching page %d for year %d...\n", page, year) rss, err := fetchRSSPage(year, page) if err != nil { log.Printf("Failed to fetch page %d: %v", page, err) break } if len(rss.Channel.Items) == 0 { fmt.Printf("No items found on page %d, stopping\n", page) break } fmt.Printf("Found %d items on page %d\n", len(rss.Channel.Items), page) pageItems := insertItems(insertStmt, rss.Channel.Items, cache) fmt.Printf("Inserted %d new items from page %d\n", pageItems, page) totalItems += pageItems time.Sleep(pageFetchDelay) page++ if page > maxPages { fmt.Printf("Reached maximum page limit (%d), stopping\n", maxPages) break } } return totalItems } func printStatistics(db *sql.DB) { var count int err := db.QueryRow("SELECT COUNT(*) FROM tracks").Scan(&count) if err != nil { log.Printf("Failed to count records: %v", err) } else { fmt.Printf("Total records in database: %d\n", count) } fmt.Println("\nSample of stored data:") rows, err := db.Query("SELECT title, author, link FROM tracks LIMIT 5") if err != nil { log.Printf("Failed to query sample data: %v", err) return } defer rows.Close() for rows.Next() { var title, author, link string err := rows.Scan(&title, &author, &link) if err != nil { log.Printf("Failed to scan row: %v", err) continue } fmt.Printf("- %s by %s (%s)\n", title, author, link) } } func main() { fmt.Println("Program started") db, err := sql.Open("sqlite3", "weeklybeats.db") if err != nil { log.Fatal("Failed to open database:", err) } defer db.Close() createTable := ` CREATE TABLE IF NOT EXISTS tracks ( id INTEGER PRIMARY KEY AUTOINCREMENT, title TEXT NOT NULL, link TEXT NOT NULL, author TEXT, week INTEGER, year INTEGER, url TEXT, UNIQUE(author, week, year) );` _, err = db.Exec(createTable) if err != nil { log.Fatal("Failed to create table:", err) } fmt.Println("Starting to scrape Weekly Beats RSS feed...") insertStmt, err := db.Prepare(` INSERT OR IGNORE INTO tracks (title, link, author, week, year, url) VALUES (?, ?, ?, ?, ?, ?) `) if err != nil { log.Fatal("Failed to prepare insert statement:", err) } defer insertStmt.Close() cache := make(map[string]string) totalItems := 0 for year := 2012; year <= 2024; year += 2 { totalItems += scrapeYear(year, insertStmt, cache) } fmt.Printf("\nScraping complete! Total items processed: %d\n", totalItems) printStatistics(db) }