From 38977b259a46b60f0ba9a223a0c9ee4e908facd9 Mon Sep 17 00:00:00 2001
From: Serguey Parkhomovsky
Date: Thu, 25 Dec 2025 22:14:43 -0800
Subject: add artist scraping capability

---
 main.go | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 98 insertions(+), 10 deletions(-)

diff --git a/main.go b/main.go
index 2474d39..2389688 100644
--- a/main.go
+++ b/main.go
@@ -7,13 +7,22 @@ import (
 	"io"
 	"log"
 	"net/http"
+	"net/url"
 	"strconv"
 	"strings"
 	"time"
 
+	"github.com/PuerkitoBio/goquery"
 	_ "github.com/mattn/go-sqlite3"
 )
 
+const (
+	artistFetchDelay = 250 * time.Millisecond
+	pageFetchDelay   = 500 * time.Millisecond
+	maxPages         = 1000
+	itemsPerPage     = 1000
+)
+
 // RSS structures
 type RSS struct {
 	XMLName xml.Name `xml:"rss"`
@@ -48,6 +57,44 @@ type TitleAuthor struct {
 	Author string
 }
 
+func extractUsername(trackURL string) (string, error) {
+	u, err := url.Parse(trackURL)
+	if err != nil {
+		return "", err
+	}
+
+	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
+	if len(parts) < 1 || parts[0] == "" {
+		return "", fmt.Errorf("invalid URL format: %s", trackURL)
+	}
+
+	return parts[0], nil
+}
+
+func fetchArtistName(trackURL string) (string, error) {
+	resp, err := http.Get(trackURL)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != 200 {
+		return "", fmt.Errorf("HTTP %d", resp.StatusCode)
+	}
+
+	doc, err := goquery.NewDocumentFromReader(resp.Body)
+	if err != nil {
+		return "", err
+	}
+
+	artistName := strings.TrimSpace(strings.TrimPrefix(doc.Find("#item_user a").Text(), "By "))
+	if artistName == "" {
+		return "", fmt.Errorf("artist name not found")
+	}
+
+	return artistName, nil
+}
+
 func parseYearWeek(date string) (YearWeek, error) {
 	parts := strings.Split(date, "/")
 	if len(parts) != 2 {
@@ -94,7 +141,7 @@ func isValidXMLByte(b byte) bool {
 }
 
 func fetchRSSPage(year, page int) (*RSS, error) {
-	url := fmt.Sprintf("https://weeklybeats.com/music/rss?limit=1000&year=%d&page=%d", year, page)
+	url := fmt.Sprintf("https://weeklybeats.com/music/rss?limit=%d&year=%d&page=%d", itemsPerPage, year, page)
 
 	resp, err := http.Get(url)
 	if err != nil {
@@ -122,9 +169,49 @@ func fetchRSSPage(year, page int) (*RSS, error) {
 	return &rss, nil
 }
 
-func insertItems(insertStmt *sql.Stmt, items []Item) int {
+func getArtistNameFromTitle(title string) (string, error) {
+	titleAuthor, err := parseTitle(title)
+	if err != nil {
+		return "", err
+	}
+	return titleAuthor.Author, nil
+}
+
+func resolveArtistName(item Item, cache map[string]string) (string, error) {
+	username, err := extractUsername(item.Link)
+	if err != nil {
+		log.Printf("Failed to extract username from '%s': %v", item.Link, err)
+		return getArtistNameFromTitle(item.Title)
+	}
+
+	// Try cache first
+	if artistName := cache[username]; artistName != "" {
+		return artistName, nil
+	}
+
+	// Not in cache, fetch from track page
+	artistName, err := fetchArtistName(item.Link)
+	if err != nil {
+		log.Printf("Failed to fetch artist for '%s': %v, using RSS fallback", username, err)
+		return getArtistNameFromTitle(item.Title)
+	}
+
+	cache[username] = artistName
+	fmt.Printf("Cached artist name '%s' for username '%s'\n", artistName, username)
+	time.Sleep(artistFetchDelay)
+
+	return artistName, nil
+}
+
+func insertItems(insertStmt *sql.Stmt, items []Item, cache map[string]string) int {
 	inserted := 0
 	for _, item := range items {
+		artistName, err := resolveArtistName(item, cache)
+		if err != nil {
+			log.Printf("Failed to resolve artist for item '%s': %v", item.Title, err)
+			continue
+		}
+
 		titleAuthor, err := parseTitle(item.Title)
 		if err != nil {
 			log.Printf("Failed to parse title for item '%s': %v", item.Title, err)
@@ -137,7 +224,7 @@ func insertItems(insertStmt *sql.Stmt, items []Item) int {
 			continue
 		}
 
-		_, err = insertStmt.Exec(titleAuthor.Title, item.Link, titleAuthor.Author, yearWeek.Week, yearWeek.Year, item.Enclosure.URL)
+		_, err = insertStmt.Exec(titleAuthor.Title, item.Link, artistName, yearWeek.Week, yearWeek.Year, item.Enclosure.URL)
 		if err != nil {
 			log.Printf("Failed to insert item '%s': %v", item.Title, err)
 			continue
@@ -147,7 +234,7 @@ func insertItems(insertStmt *sql.Stmt, items []Item) int {
 	return inserted
 }
 
-func scrapeYear(year int, insertStmt *sql.Stmt) int {
+func scrapeYear(year int, insertStmt *sql.Stmt, cache map[string]string) int {
 	totalItems := 0
 	page := 1
 
@@ -167,16 +254,15 @@
 		fmt.Printf("Found %d items on page %d\n", len(rss.Channel.Items), page)
 
-		pageItems := insertItems(insertStmt, rss.Channel.Items)
+		pageItems := insertItems(insertStmt, rss.Channel.Items, cache)
 		fmt.Printf("Inserted %d new items from page %d\n", pageItems, page)
 		totalItems += pageItems
 
-		time.Sleep(500 * time.Millisecond)
-
+		time.Sleep(pageFetchDelay)
 		page++
 
-		if page > 1000 {
-			fmt.Println("Reached maximum page limit (1000), stopping")
+		if page > maxPages {
+			fmt.Printf("Reached maximum page limit (%d), stopping\n", maxPages)
 			break
 		}
 	}
 
@@ -249,9 +335,11 @@ func main() {
 	}
 	defer insertStmt.Close()
 
+	cache := make(map[string]string)
+
 	totalItems := 0
 	for year := 2012; year <= 2024; year += 2 {
-		totalItems += scrapeYear(year, insertStmt)
+		totalItems += scrapeYear(year, insertStmt, cache)
 	}
 
 	fmt.Printf("\nScraping complete! Total items processed: %d\n", totalItems)