7 handle paid card promotions #9

Merged · 2 commits · Oct 23, 2022
1 change: 1 addition & 0 deletions README.md
@@ -2,4 +2,5 @@
 Crawls pet cards from poiskzoo.ru

 [![Build Status](https://drone.k8s.grechka.family/api/badges/LostPetInitiative/poiskzoo-ru-crawler/status.svg)](https://drone.k8s.grechka.family/LostPetInitiative/poiskzoo-ru-crawler)
 [![Go Report Card](https://goreportcard.com/badge/github.com/LostPetInitiative/poiskzoo-ru-crawler)](https://goreportcard.com/report/github.com/LostPetInitiative/poiskzoo-ru-crawler)
+[![codecov](https://codecov.io/github/LostPetInitiative/poiskzoo-ru-crawler/branch/main/graph/badge.svg?token=TRV81WCITE)](https://codecov.io/github/LostPetInitiative/poiskzoo-ru-crawler)
34 changes: 19 additions & 15 deletions main.go
@@ -129,11 +129,11 @@ func main() {
     }

     // fetching catalog
-    var newDetectedCardIDs []types.CardID = nil
+    var newDetectedCards []crawler.Card = nil
     if len(knownCardsIdSet) == 0 {
         // fetching only the first page
         log.Println("The card storage is empty. Fetching the first catalog page...")
-        newDetectedCardIDs, err = crawler.GetCardCatalogPage(0)
+        newDetectedCards, err = crawler.GetCardCatalogPage(1)
         if err != nil {
             log.Panicf("Failed to get catalog page: %v\n", err)
         }
@@ -144,22 +144,26 @@ func main() {
 pagesLoop:
     for {
         log.Printf("Fetching catalog page %d...\n", pageNum)
-        pageNewDetectedCardIDs, err := crawler.GetCardCatalogPage(0)
+        pageNewDetectedCards, err := crawler.GetCardCatalogPage(pageNum)
         if err != nil {
             log.Panicf("Failed to get catalog page: %v\n", err)
         }
-        log.Printf("Got %d cards for page %d of the catalog\n", len(pageNewDetectedCardIDs), pageNum)
+        log.Printf("Got %d cards for page %d of the catalog\n", len(pageNewDetectedCards), pageNum)

-        if newDetectedCardIDs == nil {
-            newDetectedCardIDs = pageNewDetectedCardIDs
+        if newDetectedCards == nil {
+            newDetectedCards = pageNewDetectedCards
         } else {
-            newDetectedCardIDs = append(newDetectedCardIDs, pageNewDetectedCardIDs...)
+            newDetectedCards = append(newDetectedCards, pageNewDetectedCards...)
         }

         // analyzing pageNewDetectedCards for intersection with known IDs
-        for _, newCardID := range pageNewDetectedCardIDs {
-            if _, exists := knownCardsIdSet[newCardID]; exists {
-                log.Printf("Found already known card %d at page %d\n", newCardID, pageNum)
+        for _, newCard := range pageNewDetectedCards {
+            if newCard.HasPaidPromotion {
+                // ignore promoted cards when looking for already downloaded ones
+                continue
+            }
+            if _, exists := knownCardsIdSet[newCard.Id]; exists {
+                log.Printf("Found already known card %d at page %d\n", newCard.Id, pageNum)
                 break pagesLoop
             }
         }
@@ -169,11 +173,11 @@ func main() {
     }

     // finding which cards exactly are new (not previously downloaded)
-    var newCardsIDs []types.CardID = make([]types.CardID, 0, len(newDetectedCardIDs))
-    for _, newCardIdCandidate := range newDetectedCardIDs {
-        if _, alreadyDownloaded := knownCardsIdSet[newCardIdCandidate]; !alreadyDownloaded {
-            newCardsIDs = append(newCardsIDs, newCardIdCandidate)
-            heap.Push(knownIDsHeap, newCardIdCandidate)
+    var newCardsIDs []types.CardID = make([]types.CardID, 0, len(newDetectedCards))
+    for _, newCardIdCandidate := range newDetectedCards {
+        if _, alreadyDownloaded := knownCardsIdSet[newCardIdCandidate.Id]; !alreadyDownloaded {
+            newCardsIDs = append(newCardsIDs, newCardIdCandidate.Id)
+            heap.Push(knownIDsHeap, newCardIdCandidate.Id)
         }
     }
     log.Printf("%d new cards to download\n", len(newCardsIDs))
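The behavioral changes in main.go above: the pagination loop now passes pageNum instead of a hardcoded 0, the first-page fetch starts at page 1, and paid promotions no longer terminate the catalog scan. Promoted cards are pinned at the top of catalog pages regardless of age, so seeing an already-known promoted card does not prove the crawler has caught up with previously downloaded cards. A self-contained toy model of that early-exit rule (names are illustrative, not the crawler's actual API):

    package main

    import "fmt"

    type card struct {
        id       int
        promoted bool
    }

    // firstKnownIdx returns the index of the first already-known,
    // non-promoted card on a page, or -1 if there is none.
    func firstKnownIdx(page []card, known map[int]bool) int {
        for i, c := range page {
            if c.promoted {
                continue // a pinned promotion may be arbitrarily old
            }
            if known[c.id] {
                return i // reached already-downloaded cards: stop paging
            }
        }
        return -1
    }

    func main() {
        page := []card{{900, true}, {164971, false}, {164833, false}}
        known := map[int]bool{900: true, 164833: true}
        fmt.Println(firstKnownIdx(page, known)) // prints 2, not 0: promoted id 900 is ignored
    }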
50 changes: 0 additions & 50 deletions main_test.go

This file was deleted.

23 changes: 3 additions & 20 deletions pkg/crawler/index.go
@@ -4,8 +4,6 @@ import (
"fmt"
"log"
"net/url"
"strconv"
"strings"
"time"

"github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/types"
@@ -14,7 +12,7 @@ import (

 const poiskZooBaseURL string = "https://poiskzoo.ru"

-func GetCardCatalogPage(pageNum int) ([]types.CardID, error) {
+func GetCardCatalogPage(pageNum int) ([]Card, error) {
     effectiveUrlStr := fmt.Sprintf("%s/poteryashka/page-%d", poiskZooBaseURL, pageNum)
     effectiveUrl, err := url.Parse(effectiveUrlStr)
     if err != nil {
@@ -28,24 +26,9 @@ func GetCardCatalogPage(pageNum int) ([]types.CardID, error) {
     body := resp.Body

     parsedNode := ParseHtmlContent(string(body))
-    var urls []string = ExtractCardUrlsFromDocument(parsedNode)
-
-    var result []types.CardID = make([]types.CardID, len(urls))
-    for i, url := range urls {
-        // urls are like "/bijsk/propala-koshka/162257"
-        lastIdx := strings.LastIndex(url, "/")
-        if lastIdx == -1 {
-            panic(fmt.Sprintf("card URL in not in supported format: %q", url))
-        }
-        cardIdStr := url[lastIdx+1:]
-        cardID, err := strconv.ParseInt(cardIdStr, 10, 32)
-        if err != nil {
-            return nil, err
-        }
-        result[i] = types.CardID(cardID)
-    }
-    return result, nil
+    var cards []Card = ExtractCardsFromCatalogDocument(parsedNode)
+
+    return cards, nil
 }

 type PetCard struct {
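With the URL parsing moved into the parser package, GetCardCatalogPage now returns the richer []Card instead of bare IDs. A minimal sketch of a caller, assuming the module path shown in the README badges above; this is illustrative, not code from this PR:

    package main

    import (
        "fmt"

        "github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/crawler"
    )

    func main() {
        // Catalog page numbering starts at 1 (see the main.go change above).
        cards, err := crawler.GetCardCatalogPage(1)
        if err != nil {
            panic(err)
        }
        for _, c := range cards {
            // Each Card carries its ID, relative URL, and promotion flag.
            fmt.Println(c.Id, c.Url, c.HasPaidPromotion)
        }
    }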
44 changes: 44 additions & 0 deletions pkg/crawler/index_test.go
@@ -1,7 +1,12 @@
 package crawler

 import (
+    "os"
     "testing"
+
+    "github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/geocoding"
+    "github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/types"
+    "github.com/LostPetInitiative/poiskzoo-ru-crawler/pkg/utils"
 )

 func TestGetCardCatalogPage(t *testing.T) {
@@ -16,3 +21,42 @@ func TestGetCardCatalogPage(t *testing.T) {
         t.FailNow()
     }
 }
+
+func TestFullCardDownload(t *testing.T) {
+    card, err := GetPetCard(types.CardID(164971))
+    if err != nil {
+        t.Error(err)
+        t.FailNow()
+    }
+
+    image, err := utils.HttpGet(card.ImagesURL, types.AnyMimeType)
+    if err != nil {
+        t.Error(err)
+        t.FailNow()
+    }
+
+    jsonCard := NewCardJSON(card,
+        &geocoding.GeoCoords{Lat: 10.0, Lon: 20.0},
+        "hardcoded",
+        image.Body,
+        image.ContentType)
+    serialized := jsonCard.JsonSerialize()
+
+    expectedBytes, err := os.ReadFile("./testdata/164971.json")
+    if err != nil {
+        t.Error(err)
+        t.FailNow()
+    }
+    expected := string(expectedBytes)
+
+    if expected != serialized {
+        for i := 0; i < len(expected) && i < len(serialized); i++ {
+            if expected[i] != serialized[i] {
+                t.Errorf("Expected != actual. First difference at byte idx: %d\n", i)
+                break
+            }
+        }
+        // fail even when one string is a strict prefix of the other
+        t.Errorf("Actual: %v\n", serialized)
+        t.FailNow()
+    }
+}
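TestFullCardDownload compares the serialized card byte-for-byte against a golden file in testdata. If the JSON layout ever changes intentionally, that golden file has to be rewritten; a hypothetical one-off helper (not part of this PR) could reuse exactly the calls from the test above:

    // regenerateGolden rewrites the golden file used by TestFullCardDownload.
    // Hypothetical helper; all calls mirror the test above.
    func regenerateGolden() error {
        card, err := GetPetCard(types.CardID(164971))
        if err != nil {
            return err
        }
        image, err := utils.HttpGet(card.ImagesURL, types.AnyMimeType)
        if err != nil {
            return err
        }
        jsonCard := NewCardJSON(card,
            &geocoding.GeoCoords{Lat: 10.0, Lon: 20.0},
            "hardcoded",
            image.Body,
            image.ContentType)
        return os.WriteFile("./testdata/164971.json", []byte(jsonCard.JsonSerialize()), 0o644)
    }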
53 changes: 49 additions & 4 deletions pkg/crawler/parser.go
@@ -21,19 +21,64 @@ func ParseHtmlContent(htmlContent string) *html.Node {
     return doc
 }

+type Card struct {
+    Id               types.CardID
+    Url              string
+    HasPaidPromotion bool
+}
+
 // Returns the cards found on the catalog page
-func ExtractCardUrlsFromDocument(doc *html.Node) []string {
-    nodes, err := htmlquery.QueryAll(doc, "//div[contains(@class, 'pzplitkadiv')]//div[contains(@class, 'pzplitkalink')]/a")
+func ExtractCardsFromCatalogDocument(doc *html.Node) []Card {
+    nodes, err := htmlquery.QueryAll(doc, "//div[contains(@class, 'pzplitkadiv')]")
     if err != nil {
         panic(`not a valid XPath expression.`)
     }

-    res := make([]string, len(nodes))
+    res := make([]Card, len(nodes))

     for i, n := range nodes {
+        var isPaidPromotion bool
+        var found bool = false
+        for _, a := range n.Attr {
+            if a.Key == "class" {
+                switch {
+                case strings.Contains(a.Val, "blockdivbaza_vip1"):
+                    isPaidPromotion = true
+                case strings.Contains(a.Val, "blockdivbaza_vip0"):
+                    isPaidPromotion = false
+                default:
+                    panic("Can't find paid promotion indication class")
+                }
+                found = true
+                break
+            }
+        }
+        if !found {
+            panic("Can't find class attr for promotion indication")
+        }
+        linkNode, err := htmlquery.Query(n, "div[contains(@class, 'pzplitkalink')]/a")
+        if err != nil {
+            panic("Can't find link for the card")
+        }
-        for _, a := range n.Attr {
+        for _, a := range linkNode.Attr {
             if a.Key == "href" {
-                res[i] = a.Val
+                // urls are like "/bijsk/propala-koshka/162257"
+                url := a.Val
+                lastIdx := strings.LastIndex(url, "/")
+                if lastIdx == -1 {
+                    panic(fmt.Sprintf("card URL is not in supported format: %q", url))
+                }
+                cardIdStr := url[lastIdx+1:]
+                cardID, err := strconv.ParseInt(cardIdStr, 10, 32)
+                if err != nil {
+                    panic(err)
+                }
+                res[i] = Card{
+                    Id:               types.CardID(cardID),
+                    Url:              url,
+                    HasPaidPromotion: isPaidPromotion,
+                }
                 break
             }
         }
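The promotion flag is read straight from the catalog tile's class attribute: blockdivbaza_vip1 marks a paid promotion, blockdivbaza_vip0 a regular card. A self-contained sketch of that detection against a hand-written two-card fragment (the HTML here is invented for illustration; the class names and XPath expressions are the ones matched by the parser above):

    package main

    import (
        "fmt"
        "strings"

        "github.com/antchfx/htmlquery"
    )

    // Minimal, invented catalog fragment with one promoted and one regular card.
    const fragment = `
    <div class="pzplitkadiv blockdivbaza_vip1">
      <div class="pzplitkalink"><a href="/moskva/propala-sobaka/164656">card</a></div>
    </div>
    <div class="pzplitkadiv blockdivbaza_vip0">
      <div class="pzplitkalink"><a href="/stavropol/najdena-sobaka/164833">card</a></div>
    </div>`

    func main() {
        doc, err := htmlquery.Parse(strings.NewReader(fragment))
        if err != nil {
            panic(err)
        }
        nodes, err := htmlquery.QueryAll(doc, "//div[contains(@class, 'pzplitkadiv')]")
        if err != nil {
            panic(err)
        }
        for _, n := range nodes {
            class := htmlquery.SelectAttr(n, "class")
            link := htmlquery.FindOne(n, "div[contains(@class, 'pzplitkalink')]/a")
            // Prints the card's relative URL and whether its tile carries the vip1 class.
            fmt.Printf("%s promoted=%v\n",
                htmlquery.SelectAttr(link, "href"),
                strings.Contains(class, "blockdivbaza_vip1"))
        }
    }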
24 changes: 19 additions & 5 deletions pkg/crawler/parser_test.go
@@ -17,23 +17,37 @@ func TestExtractCardUrlsFromCatalogPage(t *testing.T) {
     }
     catalogHtml := string(fileContent)

-    extractedUrls := ExtractCardUrlsFromDocument(ParseHtmlContent(catalogHtml))
+    extractedUrls := ExtractCardsFromCatalogDocument(ParseHtmlContent(catalogHtml))
     const expectedCount int = 52
     if len(extractedUrls) != expectedCount {
         t.Logf("Expected to extract %d card IDs but extracted %d", expectedCount, len(extractedUrls))
         t.Fail()
     }

-    urlMap := make(map[string]int)
-    for i, url := range extractedUrls {
-        urlMap[url] = i
+    cardMap := make(map[int]Card)
+    for _, card := range extractedUrls {
+        cardMap[int(card.Id)] = card
     }

-    _, exists := urlMap["/stavropol/najdena-sobaka/164833"]
+    card, exists := cardMap[164833]
     if !exists {
         t.Logf("Did not find expected URL in the result set")
         t.Fail()
     }
+    if card.HasPaidPromotion {
+        t.Log("Card 164833 is promoted")
+        t.Fail()
+    }
+
+    card, exists = cardMap[164656]
+    if !exists {
+        t.Log("Did not find expected URL in the result set")
+        t.Fail()
+    }
+    if !card.HasPaidPromotion {
+        t.Log("Card 164656 is not promoted")
+        t.Fail()
+    }
 }

 func TestExtractSpeciesFromPetCardPage(t *testing.T) {
File renamed without changes.