Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addition of python scraper #49

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added src/scraper/.DS_Store
Binary file not shown.
28 changes: 25 additions & 3 deletions src/scraper/creatures.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ package main

import (
"fmt"
"github.com/PuerkitoBio/goquery"
"log"
"strings"

"github.com/PuerkitoBio/goquery"
)

func (d *Data) getCreatureDetails(i int, id string) {
Expand Down Expand Up @@ -33,13 +34,34 @@ func (d *Data) getCreatureDetails(i int, id string) {
}
}

// Get the the creature's lore
// Get the creature's lore and image URL
if selection.AttrOr("id", "") == "ctl00_MainContent_DetailedOutput" {
lore := selection.Text()
lore = lore[:strings.Index(lore, "Creature ")]
d.Creatures[i].Lore = lore

/*
Get the creature's thumbnail image if it is in the expected node,
immediately after the creature's name heading (the element with class "title").
Not all creatures have one.
TODO check if it is in the same position for all creatures.

Image URL is relative to the domain: https://2e.aonprd.com/
E.g.: https://2e.aonprd.com/Images/Monsters/Serpentfolk_AapophSerpentfolk.png
*/

selection.Children().Each(func(j int, s *goquery.Selection) {
if s.HasClass("title") {

if s.Next().Children().AttrOr("class", "") == "thumbnail" {
imageURL, _ := s.Next().Children().Attr("src")
d.Creatures[i].ImgURL = imageURL
}
}
})

}
// TODO Get the creature's image

})

}
5 changes: 3 additions & 2 deletions src/scraper/fetch.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ package main

import (
"fmt"
"github.com/PuerkitoBio/goquery"
"io/ioutil"
"log"
"net/http"
"strconv"
"strings"

"github.com/PuerkitoBio/goquery"
)

func getAONCreatures(url string) string {
Expand Down Expand Up @@ -65,4 +66,4 @@ func (d *Data) parseAONTable(data string) {

// Drop first entry which is the table header row
d.Creatures = d.Creatures[1:]
}
}
9 changes: 8 additions & 1 deletion src/scraper/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,15 @@ module pf2-encounter
go 1.15

require (
github.com/PuerkitoBio/goquery v1.6.0
github.com/PuerkitoBio/goquery v1.6.1
github.com/andybalholm/cascadia v1.2.0 // indirect
github.com/campoy/unique v0.0.0-20180121183637-88950e537e7e
github.com/davecgh/go-spew v1.1.1
github.com/helloeave/json v1.15.3
github.com/yuin/goldmark v1.3.7 // indirect
golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a // indirect
golang.org/x/net v0.0.0-20210525063256-abc453219eb5 // indirect
golang.org/x/sys v0.0.0-20210525143221-35b2ab0089ea // indirect
golang.org/x/term v0.0.0-20210503060354-a79de5458b56 // indirect
golang.org/x/tools v0.1.2 // indirect
)
38 changes: 38 additions & 0 deletions src/scraper/go.sum
Original file line number Diff line number Diff line change
@@ -1,16 +1,54 @@
github.com/PuerkitoBio/goquery v1.6.0 h1:j7taAbelrdcsOlGeMenZxc2AWXD5fieT1/znArdnx94=
github.com/PuerkitoBio/goquery v1.6.0/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk=
github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
github.com/campoy/unique v0.0.0-20180121183637-88950e537e7e h1:V9a67dfYqPLAvzk5hMQOXYJlZ4SLIXgyKIE+ZiHzgGQ=
github.com/campoy/unique v0.0.0-20180121183637-88950e537e7e/go.mod h1:9IOqJGCPMSc6E5ydlp5NIonxObaeu/Iub/X03EKPVYo=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/helloeave/json v1.15.3 h1:roUxUEGhsSvhuhi80c4qmLiW633d5uf0mkzUGzBMfX8=
github.com/helloeave/json v1.15.3/go.mod h1:uTHhuUsgnrpm9cc7Gi3tfIUwgf1dq/7+uLfpUFLBFEQ=
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
github.com/yuin/goldmark v1.3.7 h1:NSaHgaeJFCtWXCBkBKXw0rhgMuJ0VoE9FB5mWldcrQ4=
github.com/yuin/goldmark v1.3.7/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a h1:kr2P4QFmQr29mSLA43kwrOcgcReGTfbE9N577tCTuBc=
golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210525063256-abc453219eb5 h1:wjuX4b5yYQnEQHzd+CBcrcC6OVR2J1CN6mUy0oSxIPo=
golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210525143221-35b2ab0089ea h1:+WiDlPBBaO+h9vPNZi8uJ3k4BkKQB7Iow3aqwHVA5hI=
golang.org/x/sys v0.0.0-20210525143221-35b2ab0089ea/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210503060354-a79de5458b56 h1:b8jxX3zqjpqb2LklXPzKSGJhzyxCOZSz8ncv8Nv+y7w=
golang.org/x/term v0.0.0-20210503060354-a79de5458b56/go.mod h1:tfny5GFUkzUvx4ps4ajbZsCe5lw1metzhBm9T3x7oIY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.2 h1:kRBLX7v7Af8W7Gdbbc908OJcdgtK8bOz9Uaj8/F1ACA=
golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
8 changes: 5 additions & 3 deletions src/scraper/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ package main

import (
"fmt"
"github.com/davecgh/go-spew/spew"
"github.com/helloeave/json"
"io"
"io/ioutil"
"log"
"os"

"github.com/davecgh/go-spew/spew"
"github.com/helloeave/json"
)

// baseURL for creatures pages
Expand All @@ -29,6 +30,7 @@ type Creature struct {
Rarity string `json:"rarity"`
Id string `json:"id"`
Lore string `json:"lore"`
ImgURL string `json:image_url`
}

type metadata struct {
Expand Down Expand Up @@ -75,7 +77,7 @@ func main() {

// Pretty print the result, DEBUG only
if os.Getenv("DEBUG") == "1" {
fmt.Printf("Found %i creatures\n", len(d.Creatures))
fmt.Printf("Found %d creatures\n", len(d.Creatures))
// Just spew a bunch of them, we don't need to check them all!
spew.Dump(d.Creatures[:2])
}
Expand Down
1 change: 0 additions & 1 deletion src/scraper/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,4 @@ func (d *Data) FillMetadata() {
return d.Metadata.Sizes[i] < d.Metadata.Sizes[j]
})


}
3 changes: 3 additions & 0 deletions src/scrapers/py_scraper/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
creatures.json
monsters-raw.json
npcs-raw.json
210 changes: 210 additions & 0 deletions src/scrapers/py_scraper/scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
"""
Monsters JSON Scraper
"""
import os
import json
import dataclasses
from dataclasses import dataclass, field
from typing import List, Set

import requests

#** Variables **#

#: user-agent to use when making requests
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36'

#: cache for raw-npcs results
NPCS_CACHE = 'npcs-raw.json'

#: cache for raw-monsters results
MONSTERS_CACHE = 'monsters-raw.json'

#: aonprd hostname
AEONPRD = '2e.aonprd.com'

#: aonprd elasticsearch search endpoint (full URL, not just a hostname)
AEONPRD_ELASTIC = 'https://elasticsearch.aonprd.com/aon/_search'

#: pre-calculated aonprd elastic lookup query-params
ELASTIC_PARAM = {'track_total_hits': 'true'}

#: pre-calculated headers to include during elasticsearch lookup
#: every value must be a str: requests rejects non-str/bytes header values
ELASTIC_HEADERS = {
    'user-agent': USER_AGENT,
    'accept': '*/*',
    'origin': f'https://{AEONPRD}',
    'referer': f'https://{AEONPRD}',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'sec-gpc': '1',
}

#** Functions **#

def creature_data(npcs: bool = False) -> dict:
    """retrieve raw monster-data from aonprd elasticsearch

    the decoded response is served from a local json file-cache when one
    exists; otherwise it is fetched over http and the cache is written

    :param npcs: query npc entries (cached in NPCS_CACHE) when true,
                 monster entries (cached in MONSTERS_CACHE) otherwise
    :return:     decoded elasticsearch response body
    :raises requests.HTTPError: when the server answers with an error status
    :raises ValueError:         when the response body is not valid json
    """
    # retrieve from file-cache
    cache = NPCS_CACHE if npcs else MONSTERS_CACHE
    if os.path.exists(cache):
        # the cache holds the raw response bytes; decode explicitly as utf-8
        # instead of relying on the platform default encoding
        with open(cache, 'r', encoding='utf-8') as f:
            return json.load(f)
    # construct query json
    json_post = {
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        "filter": [
                            {"query_string": {
                                "query": f"category:creature npc:{str(npcs).lower()} ",
                                "default_operator": "AND",
                                "fields": ["name", "text^0.1", "trait_raw", "type"]
                            }}
                        ],
                        "must_not": [{"term": {"exclude_from_search": True}}]
                    }
                },
                "boost_mode": "multiply",
                "functions": [
                    {"filter": {"terms": {"type": ["Ancestry", "Class"]}}, "weight": 1.1},
                    {"filter": {"terms": {"type": ["Trait"]}}, "weight": 1.05}
                ]
            }
        },
        "size": 10000,
        "sort": ["_score", "_doc"],
        "_source": {"excludes": ["text"]},
        "aggs": {
            "group1": {
                "composite": {
                    "sources": [{"field1": {"terms": {"field": "type", "missing_bucket": True}}}],
                    "size": 10000
                }
            }
        }
    }
    # make request; header values are coerced to str because requests
    # refuses non-str/bytes header values
    res = requests.post(
        AEONPRD_ELASTIC,
        params=ELASTIC_PARAM,
        headers={key: str(value) for key, value in ELASTIC_HEADERS.items()},
        json=json_post,
        timeout=60,
    )
    # validate before caching so a failed or garbled response is never
    # persisted and silently reused on every later run
    res.raise_for_status()
    data = res.json()
    with open(cache, 'wb') as f:
        f.write(res.content)
    return data

def parse_creatures(data: dict) -> List['Creature']:
    """parse through raw monster data to retrieve important details

    :param data: decoded elasticsearch response; entries are read from
                 ``data['hits']['hits'][n]['_source']``
    :return:     one Creature per hit, in response order
    :raises RuntimeError: when the numeric id cannot be parsed from a hit's url
    :raises KeyError:     when a hit lacks a required attribute
                          (name, url, level, size, rarity, or type for non-npcs)
    """
    # parse creatures from raw data
    creatures = []
    for hit in data['hits']['hits']:
        attrs = hit['_source']
        name = attrs['name']

        # the id is whatever follows the last '=' in the url; a url with no
        # '=' at all is reported the same way as a non-numeric id instead of
        # surfacing as a bare IndexError
        _, sep, creature_id = attrs['url'].rpartition('=')
        if not sep or not creature_id.isdigit():
            raise RuntimeError(f'failed to parse {name!r}')

        npc = attrs.get('npc', False)
        # some creatures have no alignment attribute at all
        align = attrs.get('alignment', 'NA')
        # presumably traits can be absent like the other optional attributes;
        # treat a missing list as "no traits" rather than aborting the run
        traits = set(attrs.get('trait', []))
        # the alignment is stored separately, so drop it from the traits
        traits.discard(align)
        images = attrs.get('image', [])

        creatures.append(Creature(
            id=creature_id,
            name=name,
            level=attrs['level'],
            alignment=align,
            creature_type='NPC' if npc else attrs['type'],
            size=attrs['size'][0],
            rarity=attrs['rarity'],
            lore=attrs.get('summary', ''),
            family=attrs.get('creature_family', ''),
            image_url=images[0] if images else '',
            npc=npc,
            traits=traits,
        ))
    return creatures

def generate_data(creatures: List['Creature']) -> 'Data':
    """generate metadata and pass into data object

    :param creatures: parsed creatures to summarise
    :return:          Data wrapping the same creature list plus the level
                      range, total count and the distinct values seen for
                      each categorical attribute
    """
    # generate meta-data from collected monsters
    meta = Metadata(total=len(creatures))
    if creatures:
        # take the range from the creatures themselves: folding against the
        # Metadata defaults of 0 would pin min_level at <= 0 and
        # max_level at >= 0 whatever the real levels are
        levels = [creature.level for creature in creatures]
        meta.min_level = min(levels)
        meta.max_level = max(levels)
    for creature in creatures:
        meta.alignments.add(creature.alignment)
        meta.creature_types.add(creature.creature_type)
        meta.rarities.add(creature.rarity)
        meta.sizes.add(creature.size)
        meta.traits |= creature.traits
        # an empty family means "no family"; keep it out of the set
        if creature.family:
            meta.families.add(creature.family)
    return Data(creatures, meta)

#** Classes **#

@dataclass
class Metadata:
    """summary of a creature collection: level range, count and distinct values"""
    # numeric summary
    min_level: int = 0
    max_level: int = 0
    total: int = 0
    # distinct values collected across all creatures; each instance gets
    # its own fresh set through default_factory
    families: Set[str] = field(default_factory=set)
    alignments: Set[str] = field(default_factory=set)
    creature_types: Set[str] = field(default_factory=set)
    traits: Set[str] = field(default_factory=set)
    rarities: Set[str] = field(default_factory=set)
    sizes: Set[str] = field(default_factory=set)

@dataclass
class Creature:
    """a single creature record"""
    # required attributes
    id: str
    name: str
    creature_type: str
    level: int
    alignment: str
    size: str
    rarity: str
    lore: str
    family: str
    image_url: str
    # optional attributes
    npc: bool = False
    traits: Set[str] = field(default_factory=set)

@dataclass(repr=False)
class Data:
    """creature list bundled with the metadata describing it"""
    creatures: List[Creature] = field(default_factory=list)
    metadata: Metadata = field(default_factory=Metadata)

    def __repr__(self) -> str:
        """compact repr: show the creature count rather than every creature"""
        found = len(self.creatures)
        return f'Creatures(found={found}, meta={self.metadata!r})'

class JsonEncoder(json.JSONEncoder):
    """json encoder that also understands sets and dataclasses"""

    def default(self, obj):
        """convert objects the stock encoder rejects into json-friendly ones"""
        # sets become sorted lists
        if isinstance(obj, set):
            return sorted(obj)
        # dataclasses become plain dicts; sets nested inside them reach this
        # method again when the encoder walks the resulting dict
        if dataclasses.is_dataclass(obj):
            return dataclasses.asdict(obj)
        # anything else: defer to the base class
        return super().default(obj)


#** Init **#

if __name__ == '__main__':
    # fetch (or load from cache) and parse monsters first, then npcs
    found = []
    for is_npc in (False, True):
        data = creature_data(npcs=is_npc)
        found += parse_creatures(data)

    # summarise everything and write the combined result as json
    collected = generate_data(found)
    with open('creatures.json', 'w') as f:
        json.dump(collected, f, cls=JsonEncoder)
Loading