diff --git a/README.md b/README.md
index 699a629..7bf9477 100644
--- a/README.md
+++ b/README.md
@@ -2,4 +2,5 @@
 Crawls pet cards from poiskzoo.ru

 [![Build Status](https://drone.k8s.grechka.family/api/badges/LostPetInitiative/poiskzoo-ru-crawler/status.svg)](https://drone.k8s.grechka.family/LostPetInitiative/poiskzoo-ru-crawler)
+[![Go Report Card](https://goreportcard.com/badge/github.com/LostPetInitiative/poiskzoo-ru-crawler)](https://goreportcard.com/report/github.com/LostPetInitiative/poiskzoo-ru-crawler)
 [![codecov](https://codecov.io/github/LostPetInitiative/poiskzoo-ru-crawler/branch/main/graph/badge.svg?token=TRV81WCITE)](https://codecov.io/github/LostPetInitiative/poiskzoo-ru-crawler)
diff --git a/main.go b/main.go
index bbe648c..064360a 100644
--- a/main.go
+++ b/main.go
@@ -129,11 +129,11 @@ func main() {
 	}

 	// fetching catalog
-	var newDetectedCardIDs []types.CardID = nil
+	var newDetectedCards []crawler.Card = nil
 	if len(knownCardsIdSet) == 0 {
 		// fetching only the first page
 		log.Println("The card storage is empty. Fetching the first catalog page...")
-		newDetectedCardIDs, err = crawler.GetCardCatalogPage(0)
+		newDetectedCards, err = crawler.GetCardCatalogPage(1)
 		if err != nil {
 			log.Panicf("Failed to get catalog page: %v\n", err)
 		}
@@ -144,22 +144,26 @@ func main() {
 	pagesLoop:
		for {
			log.Printf("Fetching catalog page %d...\n", pageNum)
-			pageNewDetectedCardIDs, err := crawler.GetCardCatalogPage(0)
+			pageNewDetectedCards, err := crawler.GetCardCatalogPage(pageNum)
			if err != nil {
				log.Panicf("Failed to get catalog page: %v\n", err)
			}
-			log.Printf("Got %d cards for page %d of the catalog\n", len(pageNewDetectedCardIDs), pageNum)
+			log.Printf("Got %d cards for page %d of the catalog\n", len(pageNewDetectedCards), pageNum)

-			if newDetectedCardIDs == nil {
-				newDetectedCardIDs = pageNewDetectedCardIDs
+			if newDetectedCards == nil {
+				newDetectedCards = pageNewDetectedCards
			} else {
-				newDetectedCardIDs = append(newDetectedCardIDs, pageNewDetectedCardIDs...)
+				newDetectedCards = append(newDetectedCards, pageNewDetectedCards...)
			}

-			// analyzing pageNewDetectedCardIDs for intersection with known IDS
-			for _, newCardID := range pageNewDetectedCardIDs {
-				if _, exists := knownCardsIdSet[newCardID]; exists {
-					log.Printf("Found already known card %d at page %d\n", newCardID, pageNum)
+			// analyzing pageNewDetectedCards for intersection with known IDs
+			for _, newCard := range pageNewDetectedCards {
+				if newCard.HasPaidPromotion {
+					// skip promoted cards when looking for already downloaded ones
+					continue
+				}
+				if _, exists := knownCardsIdSet[newCard.Id]; exists {
+					log.Printf("Found already known card %d at page %d\n", newCard.Id, pageNum)
					break pagesLoop
				}
			}
@@ -169,11 +173,11 @@ func main() {
	}

	// finding which cards are actually new (not previously downloaded)
-	var newCardsIDs []types.CardID = make([]types.CardID, 0, len(newDetectedCardIDs))
-	for _, newCardIdCandidate := range newDetectedCardIDs {
-		if _, alreadyDownloaded := knownCardsIdSet[newCardIdCandidate]; !alreadyDownloaded {
-			newCardsIDs = append(newCardsIDs, newCardIdCandidate)
-			heap.Push(knownIDsHeap, newCardIdCandidate)
+	var newCardsIDs []types.CardID = make([]types.CardID, 0, len(newDetectedCards))
+	for _, newCardIdCandidate := range newDetectedCards {
+		if _, alreadyDownloaded := knownCardsIdSet[newCardIdCandidate.Id]; !alreadyDownloaded {
+			newCardsIDs = append(newCardsIDs, newCardIdCandidate.Id)
+			heap.Push(knownIDsHeap, newCardIdCandidate.Id)
		}
	}
	log.Printf("%d new cards to download\n", len(newCardsIDs))
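The hunk above changes the early-stop check: the crawler now stops paging only when it sees a non-promoted card that is already known. Promoted ("VIP") cards appear to be pinned to early catalog pages regardless of age, so a known promoted card no longer proves that the remaining pages were already crawled. A minimal sketch of that stopping rule as a standalone helper; the name `findKnownCard` and the `map[types.CardID]struct{}` set representation are illustrative (the actual type of `knownCardsIdSet` is not shown in this diff):

```go
package main

import (
	"fmt"

	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/crawler"
	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/types"
)

// findKnownCard reports whether a catalog page contains a non-promoted card
// that was already downloaded; that is the signal to stop fetching pages.
func findKnownCard(page []crawler.Card, known map[types.CardID]struct{}) bool {
	for _, c := range page {
		if c.HasPaidPromotion {
			// Promoted cards can be old cards surfaced on early pages,
			// so they must not terminate the pagination early.
			continue
		}
		if _, ok := known[c.Id]; ok {
			return true
		}
	}
	return false
}

func main() {
	page := []crawler.Card{
		{Id: types.CardID(164656), Url: "/moskva/propala-koshka/164656", HasPaidPromotion: true},
		{Id: types.CardID(164833), Url: "/stavropol/najdena-sobaka/164833", HasPaidPromotion: false},
	}
	known := map[types.CardID]struct{}{types.CardID(164833): {}}
	fmt.Println(findKnownCard(page, known)) // true: 164833 is known and not promoted
}
```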
diff --git a/main_test.go b/main_test.go
deleted file mode 100644
index 5fff9dd..0000000
--- a/main_test.go
+++ /dev/null
@@ -1,50 +0,0 @@
-package main
-
-import (
-	"io/ioutil"
-	"testing"
-
-	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/crawler"
-	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/geocoding"
-	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/types"
-	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/utils"
-)
-
-func TestFullCardDownload(t *testing.T) {
-	card, err := crawler.GetPetCard(types.CardID(164971))
-	if err != nil {
-		t.Error(err)
-		t.FailNow()
-	}
-
-	image, err := utils.HttpGet(card.ImagesURL, types.AnyMimeType)
-	if err != nil {
-		t.Error(err)
-		t.FailNow()
-	}
-
-	jsonCard := crawler.NewCardJSON(card,
-		&geocoding.GeoCoords{Lat: 10.0, Lon: 20.0},
-		"hardcoded",
-		image.Body,
-		image.ContentType)
-	serialized := jsonCard.JsonSerialize()
-
-	expectedBytes, err := ioutil.ReadFile("./testdata/164971.json")
-	expected := string(expectedBytes)
-	if err != nil {
-		t.Error(err)
-		t.FailNow()
-	}
-
-	if expected != serialized {
-		for i := 0; i < len(expected) && i < len(serialized); i++ {
-			if expected[i] != serialized[i] {
-				t.Errorf("Expected != actual. Diff is at byte idx: %d\n", i)
-				t.Errorf("Actual: %v\n", serialized)
-				t.FailNow()
-			}
-		}
-	}
-
-}
diff --git a/pkg/crawler/index.go b/pkg/crawler/index.go
index 0aac651..c082af8 100644
--- a/pkg/crawler/index.go
+++ b/pkg/crawler/index.go
@@ -4,8 +4,6 @@ import (
 	"fmt"
 	"log"
 	"net/url"
-	"strconv"
-	"strings"
 	"time"

 	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/types"
@@ -14,7 +12,7 @@ import (

 const poiskZooBaseURL string = "https://poiskzoo.ru"

-func GetCardCatalogPage(pageNum int) ([]types.CardID, error) {
+func GetCardCatalogPage(pageNum int) ([]Card, error) {
 	effectiveUrlStr := fmt.Sprintf("%s/poteryashka/page-%d", poiskZooBaseURL, pageNum)
 	effectiveUrl, err := url.Parse(effectiveUrlStr)
 	if err != nil {
@@ -28,24 +26,9 @@ func GetCardCatalogPage(pageNum int) ([]types.CardID, error) {
 	body := resp.Body

 	parsedNode := ParseHtmlContent(string(body))
-	var urls []string = ExtractCardUrlsFromDocument(parsedNode)
-
-	var result []types.CardID = make([]types.CardID, len(urls))
-	for i, url := range urls {
-		// urls are like "/bijsk/propala-koshka/162257"
-		lastIdx := strings.LastIndex(url, "/")
-		if lastIdx == -1 {
-			panic(fmt.Sprintf("card URL in not in supported format: %q", url))
-		}
-		cardIdStr := url[lastIdx+1:]
-		cardID, err := strconv.ParseInt(cardIdStr, 10, 32)
-		if err != nil {
-			return nil, err
-		}
-		result[i] = types.CardID(cardID)
-	}
-	return result, nil
+	var cards []Card = ExtractCardsFromCatalogDocument(parsedNode)
+	return cards, nil
 }

 type PetCard struct {
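`GetCardCatalogPage` now returns full `Card` values instead of bare IDs, moving the URL-to-ID parsing out of index.go and next to the HTML parsing. A hypothetical caller against the new signature; the 1-based page numbering is inferred from main.go's switch from page 0 to page 1, and the error handling is illustrative:

```go
package main

import (
	"fmt"
	"log"

	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/crawler"
)

func main() {
	// Fetch the first catalog page (main.go now requests page 1 on a cold start).
	cards, err := crawler.GetCardCatalogPage(1)
	if err != nil {
		log.Fatalf("failed to fetch catalog page: %v", err)
	}
	for _, c := range cards {
		fmt.Printf("id=%d url=%s promoted=%v\n", c.Id, c.Url, c.HasPaidPromotion)
	}
}
```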
diff --git a/pkg/crawler/index_test.go b/pkg/crawler/index_test.go
index ec8017e..f1e0e04 100644
--- a/pkg/crawler/index_test.go
+++ b/pkg/crawler/index_test.go
@@ -1,7 +1,12 @@
 package crawler

 import (
+	"os"
 	"testing"
+
+	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/geocoding"
+	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/types"
+	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/utils"
 )

 func TestGetCardCatalogPage(t *testing.T) {
@@ -16,3 +21,42 @@ func TestGetCardCatalogPage(t *testing.T) {
 		t.FailNow()
 	}
 }
+
+func TestFullCardDownload(t *testing.T) {
+	card, err := GetPetCard(types.CardID(164971))
+	if err != nil {
+		t.Error(err)
+		t.FailNow()
+	}
+
+	image, err := utils.HttpGet(card.ImagesURL, types.AnyMimeType)
+	if err != nil {
+		t.Error(err)
+		t.FailNow()
+	}
+
+	jsonCard := NewCardJSON(card,
+		&geocoding.GeoCoords{Lat: 10.0, Lon: 20.0},
+		"hardcoded",
+		image.Body,
+		image.ContentType)
+	serialized := jsonCard.JsonSerialize()
+
+	expectedBytes, err := os.ReadFile("./testdata/164971.json")
+	if err != nil {
+		t.Error(err)
+		t.FailNow()
+	}
+	expected := string(expectedBytes)
+
+	if expected != serialized {
+		for i := 0; i < len(expected) && i < len(serialized); i++ {
+			if expected[i] != serialized[i] {
+				t.Errorf("Expected != actual. Diff is at byte idx: %d\n", i)
+				t.Errorf("Actual: %v\n", serialized)
+				t.FailNow()
+			}
+		}
+	}
+
+}
diff --git a/pkg/crawler/parser.go b/pkg/crawler/parser.go
index 194dd35..a44f0a4 100644
--- a/pkg/crawler/parser.go
+++ b/pkg/crawler/parser.go
@@ -21,19 +21,64 @@ func ParseHtmlContent(htmlContent string) *html.Node {
 	return doc
 }

+type Card struct {
+	Id               types.CardID
+	Url              string
+	HasPaidPromotion bool
+}
+
-// Returns relative URL from cards found on the catalog page
-func ExtractCardUrlsFromDocument(doc *html.Node) []string {
-	nodes, err := htmlquery.QueryAll(doc, "//div[contains(@class, 'pzplitkadiv')]//div[contains(@class, 'pzplitkalink')]/a")
+// Returns the cards found on the catalog page, including their relative URLs and promotion status
+func ExtractCardsFromCatalogDocument(doc *html.Node) []Card {
+	//nodes, err := htmlquery.QueryAll(doc, "//div[contains(@class, 'pzplitkadiv')]//div[contains(@class, 'pzplitkalink')]/a")
+	nodes, err := htmlquery.QueryAll(doc, "//div[contains(@class, 'pzplitkadiv')]")
 	if err != nil {
 		panic(`not a valid XPath expression.`)
 	}
-	res := make([]string, len(nodes))
+	res := make([]Card, len(nodes))
 	for i, n := range nodes {
+		var isPaidPromotion bool
+		var found bool = false
 		for _, a := range n.Attr {
+			if a.Key == "class" {
+				switch {
+				case strings.Contains(a.Val, "blockdivbaza_vip1"):
+					isPaidPromotion = true
+				case strings.Contains(a.Val, "blockdivbaza_vip0"):
+					isPaidPromotion = false
+				default:
+					panic("Can't find paid promotion indication class")
+				}
+				found = true
+				break
+			}
+		}
+		if !found {
+			panic("Can't find class attr for promotion indication")
+		}
+		linkNode, err := htmlquery.Query(n, "div[contains(@class, 'pzplitkalink')]/a")
+		if err != nil {
+			panic("Can't find link for the card")
+		}
+		for _, a := range linkNode.Attr {
 			if a.Key == "href" {
-				res[i] = a.Val
+				// urls are like "/bijsk/propala-koshka/162257"
+				url := a.Val
+				lastIdx := strings.LastIndex(url, "/")
+				if lastIdx == -1 {
+					panic(fmt.Sprintf("card URL is not in supported format: %q", url))
+				}
+				cardIdStr := url[lastIdx+1:]
+				cardID, err := strconv.ParseInt(cardIdStr, 10, 32)
+				if err != nil {
+					panic(err)
+				}
+				res[i] = Card{
+					Id:               types.CardID(cardID),
+					Url:              url,
+					HasPaidPromotion: isPaidPromotion,
+				}
 				break
 			}
 		}
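In the new parser, each catalog tile is located by its `pzplitkadiv` class; the tile's own class list carries the paid-promotion marker (`blockdivbaza_vip1` vs `blockdivbaza_vip0`), and a second, tile-relative XPath query finds the card link whose href ends in the numeric card ID. A self-contained sketch against hand-written markup; the fragment is a guess at the catalog structure implied by these queries, not markup captured from poiskzoo.ru:

```go
package main

import (
	"fmt"

	"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/crawler"
)

func main() {
	// Hypothetical markup: one promoted tile and one regular tile, shaped so
	// that the pzplitkadiv / blockdivbaza_vipX / pzplitkalink queries match.
	const fragment = `<html><body>
	<div class="pzplitkadiv blockdivbaza_vip1">
		<div class="pzplitkalink"><a href="/moskva/propala-koshka/164656">card</a></div>
	</div>
	<div class="pzplitkadiv blockdivbaza_vip0">
		<div class="pzplitkalink"><a href="/stavropol/najdena-sobaka/164833">card</a></div>
	</div>
	</body></html>`

	for _, c := range crawler.ExtractCardsFromCatalogDocument(crawler.ParseHtmlContent(fragment)) {
		fmt.Printf("id=%d url=%s promoted=%v\n", c.Id, c.Url, c.HasPaidPromotion)
	}
	// expected output:
	// id=164656 url=/moskva/propala-koshka/164656 promoted=true
	// id=164833 url=/stavropol/najdena-sobaka/164833 promoted=false
}
```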
diff --git a/pkg/crawler/parser_test.go b/pkg/crawler/parser_test.go
index 1d48a44..afb3b02 100644
--- a/pkg/crawler/parser_test.go
+++ b/pkg/crawler/parser_test.go
@@ -17,23 +17,37 @@ func TestExtractCardUrlsFromCatalogPage(t *testing.T) {
 	}
 	catalogHtml := string(fileContent)

-	extractedUrls := ExtractCardUrlsFromDocument(ParseHtmlContent(catalogHtml))
+	extractedUrls := ExtractCardsFromCatalogDocument(ParseHtmlContent(catalogHtml))

 	const expectedCount int = 52
 	if len(extractedUrls) != expectedCount {
 		t.Logf("Expected to extract %d card IDs but extracted %d", expectedCount, len(extractedUrls))
 		t.Fail()
 	}

-	urlMap := make(map[string]int)
-	for i, url := range extractedUrls {
-		urlMap[url] = i
+	cardMap := make(map[int]Card)
+	for _, card := range extractedUrls {
+		cardMap[int(card.Id)] = card
 	}

-	_, exists := urlMap["/stavropol/najdena-sobaka/164833"]
+	card, exists := cardMap[164833]
 	if !exists {
 		t.Logf("Did not find expected URL in the result set")
 		t.Fail()
 	}
+	if card.HasPaidPromotion {
+		t.Log("Card 164833 is unexpectedly marked as promoted")
+		t.Fail()
+	}
+
+	card, exists = cardMap[164656]
+	if !exists {
+		t.Log("Did not find expected card 164656 in the result set")
+		t.Fail()
+	}
+	if !card.HasPaidPromotion {
+		t.Log("Card 164656 is unexpectedly not marked as promoted")
+		t.Fail()
+	}
 }

 func TestExtractSpeciesFromPetCardPage(t *testing.T) {
diff --git a/testdata/164971.json b/pkg/crawler/testdata/164971.json
similarity index 100%
rename from testdata/164971.json
rename to pkg/crawler/testdata/164971.json
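The fixture moves along with the test because `go test` runs each package's test binary with the working directory set to that package's directory, so the relative path `./testdata/164971.json` inside pkg/crawler resolves to pkg/crawler/testdata/164971.json. A throwaway check illustrating that path resolution (not part of this change):

```go
package crawler

import (
	"os"
	"testing"
)

// Relative paths in tests resolve against the package directory, not the
// repository root; hence the fixture now lives under pkg/crawler/testdata.
func TestFixtureIsReachable(t *testing.T) {
	if _, err := os.Stat("./testdata/164971.json"); err != nil {
		t.Fatalf("fixture not reachable from package dir: %v", err)
	}
}
```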