diff options
| author | Serguey Parkhomovsky <xindigo@gmail.com> | 2025-09-03 13:20:45 -0700 |
|---|---|---|
| committer | Serguey Parkhomovsky <xindigo@gmail.com> | 2025-09-03 13:20:57 -0700 |
| commit | 542116a0dd3fc9e7556e9800dec7663cd4d401f6 (patch) | |
| tree | 31de83aaeb5b1a0f336f26da093552fd1b501185 | |
Initial commit
| -rw-r--r-- | .gitignore | 35 | ||||
| -rw-r--r-- | LICENSE | 7 | ||||
| -rw-r--r-- | go.mod | 5 | ||||
| -rw-r--r-- | go.sum | 2 | ||||
| -rw-r--r-- | main.go | 250 |
5 files changed, 299 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..85b38b7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,35 @@ +# If you prefer the allow list template instead of the deny list, see community template: +# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore +# +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, built with `go test -c` +*.test + +# Code coverage profiles and other test artifacts +*.out +coverage.* +*.coverprofile +profile.cov + +# Dependency directories (remove the comment below to include it) +# vendor/ + +# Go workspace file +go.work +go.work.sum + +# env file +.env + +# Editor/IDE +# .idea/ +# .vscode/ + +# weeklybeats db output file +weeklybeats.db @@ -0,0 +1,7 @@ +Copyright 2025 Serguey Parkhomovsky + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. @@ -0,0 +1,5 @@ +module weeklybeats-scraper + +go 1.25.1 + +require github.com/mattn/go-sqlite3 v1.14.32 @@ -0,0 +1,2 @@ +github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs= +github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= @@ -0,0 +1,250 @@ +package main + +import ( + "database/sql" + "encoding/xml" + "fmt" + "io" + "log" + "net/http" + "strconv" + "strings" + "time" + + _ "github.com/mattn/go-sqlite3" +) + +// RSS structures +type RSS struct { + XMLName xml.Name `xml:"rss"` + Channel Channel `xml:"channel"` +} + +type Channel struct { + Title string `xml:"title"` + Items []Item `xml:"item"` +} + +type Item struct { + Title string `xml:"title"` + Link string `xml:"link"` + Category string `xml:"category"` + Enclosure Enclosure `xml:"enclosure"` +} + +type Enclosure struct { + URL string `xml:"url,attr"` + Type string `xml:"type,attr"` +} + +// Internal structures +type YearWeek struct { + Year int + Week int +} + +type TitleAuthor struct { + Title string + Author string +} + +func parseYearWeek(date string) (YearWeek, error) { + parts := strings.Split(date, "/") + if len(parts) != 2 { + return YearWeek{}, fmt.Errorf("expected 'YYYY/WW', got '%s'", date) + } + + year, err := strconv.Atoi(parts[0]) + if err != nil { + return YearWeek{}, fmt.Errorf("invalid year for date '%s': %w", date, err) + } + + week, err := strconv.Atoi(parts[1]) + if err != nil { + return YearWeek{}, fmt.Errorf("invalid week for date '%s': %w", date, err) + } + + return YearWeek{Year: year, Week: week}, nil +} + +func parseTitle(title string) (TitleAuthor, error) { + parts := strings.Split(title, " - ") + if len(parts) < 3 { + return TitleAuthor{}, fmt.Errorf("expected 'Week - Author - Title', got '%s'", title) + } + + return TitleAuthor{Title: strings.Join(parts[2:], " - "), Author: parts[1]}, nil +} + +// For some tracks, like https://weeklybeats.com/keff/music/evil-los-man, the WB RSS feed returns invalid XML bytes. Scrub these. +func filterInvalidXMLBytes(data []byte) []byte { + writePos := 0 + for _, b := range data { + if isValidXMLByte(b) { + data[writePos] = b + writePos++ + } + } + return data[:writePos] +} + +func isValidXMLByte(b byte) bool { + // Valid XML 1.0 characters for single-byte range + return b == 0x09 || b == 0x0A || b == 0x0D || b >= 0x20 +} + +func main() { + // Create SQLite database + db, err := sql.Open("sqlite3", "weeklybeats.db") + if err != nil { + log.Fatal("Failed to open database:", err) + } + defer db.Close() + + // Create table + createTable := ` + CREATE TABLE IF NOT EXISTS tracks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT NOT NULL, + link TEXT NOT NULL, + author TEXT, + week INTEGER, + year INTEGER, + url TEXT, + UNIQUE(author, week, year) + );` + + _, err = db.Exec(createTable) + if err != nil { + log.Fatal("Failed to create table:", err) + } + + fmt.Println("Starting to scrape Weekly Beats RSS feed...") + + // Prepare insert statement + insertStmt, err := db.Prepare(` + INSERT OR IGNORE INTO tracks (title, link, author, week, year, url) + VALUES (?, ?, ?, ?, ?, ?) + `) + if err != nil { + log.Fatal("Failed to prepare insert statement:", err) + } + defer insertStmt.Close() + + page := 1 + totalItems := 0 + + for { + fmt.Printf("Fetching page %d...\n", page) + + // Construct URL with page parameter + url := fmt.Sprintf("https://weeklybeats.com/music/rss?limit=1000&page=%d", page) + + // Fetch RSS feed + resp, err := http.Get(url) + if err != nil { + log.Printf("Failed to fetch page %d: %v", page, err) + break + } + + if resp.StatusCode != 200 { + fmt.Printf("Received status %d for page %d, stopping\n", resp.StatusCode, page) + resp.Body.Close() + break + } + + body, err := io.ReadAll(resp.Body) + resp.Body.Close() + + if err != nil { + log.Printf("Failed to read response body for page %d: %v", page, err) + break + } + + cleanedBody := filterInvalidXMLBytes(body) + + // Parse XML + var rss RSS + err = xml.Unmarshal(cleanedBody, &rss) + if err != nil { + log.Printf("Failed to parse XML for page %d: %v", page, err) + break + } + + // Check if we got any items + if len(rss.Channel.Items) == 0 { + fmt.Printf("No items found on page %d, stopping\n", page) + break + } + + fmt.Printf("Found %d items on page %d\n", len(rss.Channel.Items), page) + + // Insert items into database + pageItems := 0 + for _, item := range rss.Channel.Items { + titleAuthor, err := parseTitle(item.Title) + if err != nil { + log.Printf("Failed to parse title for item '%s': %v", item.Title, err) + continue + } + + yearWeek, err := parseYearWeek(item.Category) + if err != nil { + log.Printf("Failed to parse date for item '%s': %v", item.Title, err) + continue + } + + _, err = insertStmt.Exec(titleAuthor.Title, item.Link, titleAuthor.Author, yearWeek.Week, yearWeek.Year, item.Enclosure.URL) + if err != nil { + log.Printf("Failed to insert item '%s': %v", item.Title, err) + continue + } + pageItems++ + } + + fmt.Printf("Inserted %d new items from page %d\n", pageItems, page) + totalItems += pageItems + + // Be respectful to the server + time.Sleep(500 * time.Millisecond) + + page++ + + // Safety check to prevent infinite loops + if page > 1000 { + fmt.Println("Reached maximum page limit (1000), stopping") + break + } + } + + // Print summary + fmt.Printf("\nScraping complete! Total items processed: %d\n", totalItems) + + // Query and display some statistics + var count int + err = db.QueryRow("SELECT COUNT(*) FROM tracks").Scan(&count) + if err != nil { + log.Printf("Failed to count records: %v", err) + } else { + fmt.Printf("Total records in database: %d\n", count) + } + + // Show sample of data + fmt.Println("\nSample of stored data:") + rows, err := db.Query("SELECT title, author, link FROM tracks LIMIT 5") + if err != nil { + log.Printf("Failed to query sample data: %v", err) + return + } + defer rows.Close() + + for rows.Next() { + var title, author, link string + err := rows.Scan(&title, &author, &link) + if err != nil { + log.Printf("Failed to scan row: %v", err) + continue + } + fmt.Printf("- %s by %s (%s)\n", title, author, link) + } +} |
