From 542116a0dd3fc9e7556e9800dec7663cd4d401f6 Mon Sep 17 00:00:00 2001
From: Serguey Parkhomovsky
Date: Wed, 3 Sep 2025 13:20:45 -0700
Subject: Initial commit

---
 main.go | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)
 create mode 100644 main.go

diff --git a/main.go b/main.go
new file mode 100644
index 0000000..1910205
--- /dev/null
+++ b/main.go
@@ -0,0 +1,250 @@
+package main
+
+import (
+	"database/sql"
+	"encoding/xml"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"strconv"
+	"strings"
+	"time"
+
+	_ "github.com/mattn/go-sqlite3"
+)
+
+// RSS structures
+type RSS struct {
+	XMLName xml.Name `xml:"rss"`
+	Channel Channel  `xml:"channel"`
+}
+
+type Channel struct {
+	Title string `xml:"title"`
+	Items []Item `xml:"item"`
+}
+
+type Item struct {
+	Title     string    `xml:"title"`
+	Link      string    `xml:"link"`
+	Category  string    `xml:"category"`
+	Enclosure Enclosure `xml:"enclosure"`
+}
+
+type Enclosure struct {
+	URL  string `xml:"url,attr"`
+	Type string `xml:"type,attr"`
+}
+
+// Internal structures
+type YearWeek struct {
+	Year int
+	Week int
+}
+
+type TitleAuthor struct {
+	Title  string
+	Author string
+}
+
+func parseYearWeek(date string) (YearWeek, error) {
+	parts := strings.Split(date, "/")
+	if len(parts) != 2 {
+		return YearWeek{}, fmt.Errorf("expected 'YYYY/WW', got '%s'", date)
+	}
+
+	year, err := strconv.Atoi(parts[0])
+	if err != nil {
+		return YearWeek{}, fmt.Errorf("invalid year for date '%s': %w", date, err)
+	}
+
+	week, err := strconv.Atoi(parts[1])
+	if err != nil {
+		return YearWeek{}, fmt.Errorf("invalid week for date '%s': %w", date, err)
+	}
+
+	return YearWeek{Year: year, Week: week}, nil
+}
+
+func parseTitle(title string) (TitleAuthor, error) {
+	parts := strings.Split(title, " - ")
+	if len(parts) < 3 {
+		return TitleAuthor{}, fmt.Errorf("expected 'Week - Author - Title', got '%s'", title)
+	}
+
+	return TitleAuthor{Title: strings.Join(parts[2:], " - "), Author: parts[1]}, nil
+}
+
+// For some tracks, like https://weeklybeats.com/keff/music/evil-los-man, the WB RSS feed returns invalid XML bytes. Scrub these.
+func filterInvalidXMLBytes(data []byte) []byte {
+	writePos := 0
+	for _, b := range data {
+		if isValidXMLByte(b) {
+			data[writePos] = b
+			writePos++
+		}
+	}
+	return data[:writePos]
+}
+
+func isValidXMLByte(b byte) bool {
+	// Valid XML 1.0 characters for single-byte range
+	return b == 0x09 || b == 0x0A || b == 0x0D || b >= 0x20
+}
+
+func main() {
+	// Create SQLite database
+	db, err := sql.Open("sqlite3", "weeklybeats.db")
+	if err != nil {
+		log.Fatal("Failed to open database:", err)
+	}
+	defer db.Close()
+
+	// Create table
+	createTable := `
+	CREATE TABLE IF NOT EXISTS tracks (
+		id INTEGER PRIMARY KEY AUTOINCREMENT,
+		title TEXT NOT NULL,
+		link TEXT NOT NULL,
+		author TEXT,
+		week INTEGER,
+		year INTEGER,
+		url TEXT,
+		UNIQUE(author, week, year)
+	);`
+
+	_, err = db.Exec(createTable)
+	if err != nil {
+		log.Fatal("Failed to create table:", err)
+	}
+
+	fmt.Println("Starting to scrape Weekly Beats RSS feed...")
+
+	// Prepare insert statement
+	insertStmt, err := db.Prepare(`
+		INSERT OR IGNORE INTO tracks (title, link, author, week, year, url)
+		VALUES (?, ?, ?, ?, ?, ?)
+	`)
+	if err != nil {
+		log.Fatal("Failed to prepare insert statement:", err)
+	}
+	defer insertStmt.Close()
+
+	page := 1
+	totalItems := 0
+
+	for {
+		fmt.Printf("Fetching page %d...\n", page)
+
+		// Construct URL with page parameter
+		url := fmt.Sprintf("https://weeklybeats.com/music/rss?limit=1000&page=%d", page)
+
+		// Fetch RSS feed
+		resp, err := http.Get(url)
+		if err != nil {
+			log.Printf("Failed to fetch page %d: %v", page, err)
+			break
+		}
+
+		if resp.StatusCode != 200 {
+			fmt.Printf("Received status %d for page %d, stopping\n", resp.StatusCode, page)
+			resp.Body.Close()
+			break
+		}
+
+		body, err := io.ReadAll(resp.Body)
+		resp.Body.Close()
+
+		if err != nil {
+			log.Printf("Failed to read response body for page %d: %v", page, err)
+			break
+		}
+
+		cleanedBody := filterInvalidXMLBytes(body)
+
+		// Parse XML
+		var rss RSS
+		err = xml.Unmarshal(cleanedBody, &rss)
+		if err != nil {
+			log.Printf("Failed to parse XML for page %d: %v", page, err)
+			break
+		}
+
+		// Check if we got any items
+		if len(rss.Channel.Items) == 0 {
+			fmt.Printf("No items found on page %d, stopping\n", page)
+			break
+		}
+
+		fmt.Printf("Found %d items on page %d\n", len(rss.Channel.Items), page)
+
+		// Insert items into database
+		pageItems := 0
+		for _, item := range rss.Channel.Items {
+			titleAuthor, err := parseTitle(item.Title)
+			if err != nil {
+				log.Printf("Failed to parse title for item '%s': %v", item.Title, err)
+				continue
+			}
+
+			yearWeek, err := parseYearWeek(item.Category)
+			if err != nil {
+				log.Printf("Failed to parse date for item '%s': %v", item.Title, err)
+				continue
+			}
+
+			_, err = insertStmt.Exec(titleAuthor.Title, item.Link, titleAuthor.Author, yearWeek.Week, yearWeek.Year, item.Enclosure.URL)
+			if err != nil {
+				log.Printf("Failed to insert item '%s': %v", item.Title, err)
+				continue
+			}
+			pageItems++
+		}
+
+		fmt.Printf("Inserted %d new items from page %d\n", pageItems, page)
+		totalItems += pageItems
+
+		// Be respectful to the server
+		time.Sleep(500 * time.Millisecond)
+
+		page++
+
+		// Safety check to prevent infinite loops
+		if page > 1000 {
+			fmt.Println("Reached maximum page limit (1000), stopping")
+			break
+		}
+	}
+
+	// Print summary
+	fmt.Printf("\nScraping complete! Total items processed: %d\n", totalItems)
+
+	// Query and display some statistics
+	var count int
+	err = db.QueryRow("SELECT COUNT(*) FROM tracks").Scan(&count)
+	if err != nil {
+		log.Printf("Failed to count records: %v", err)
+	} else {
+		fmt.Printf("Total records in database: %d\n", count)
+	}
+
+	// Show sample of data
+	fmt.Println("\nSample of stored data:")
+	rows, err := db.Query("SELECT title, author, link FROM tracks LIMIT 5")
+	if err != nil {
+		log.Printf("Failed to query sample data: %v", err)
+		return
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		var title, author, link string
+		err := rows.Scan(&title, &author, &link)
+		if err != nil {
+			log.Printf("Failed to scan row: %v", err)
+			continue
+		}
+		fmt.Printf("- %s by %s (%s)\n", title, author, link)
+	}
+}
--
cgit v1.2.3