Skip to content

Commit

Permalink
Merge pull request #219 from Foohy/scraper_hotfix
Browse files Browse the repository at this point in the history
Fix addon scraper not handling steam erroring with very large (>50,000) results
  • Loading branch information
Foohy authored May 13, 2024
2 parents 9783ad1 + 73411cf commit 5618086
Showing 1 changed file with 32 additions and 16 deletions.
48 changes: 32 additions & 16 deletions other/scraper/scrape.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
#!/usr/bin/env python

import sys
import json
import time
import urllib.request
import urllib.parse
import re

HOST = "http://api.steampowered.com"
Expand All @@ -11,16 +14,24 @@
DELAY = 0.1 # How long to delay between requests
FILENAME = "addons.txt"

# Substrings that flag an addon title as unwanted (helper maps, content
# packs, server packs, navmeshes, nodegraphs, icon packs).
# NOTE: matched as substrings, not whole words — "nav" also catches "navmesh".
ignore_words = [
    "content",
    "server",
    "nav",
    "node",
    "icon"
]

# Template for the ignore-word pattern: the word must NOT be directly
# preceded or followed by an underscore, so map names like "gm_nav_fun"
# can legitimately contain an ignore word (surrounding underscores exempt it).
ignore_reg = "(?<!_){0}(?!_)"

def containsIgnoreWord(str, word):
    """Return True if *word* occurs in *str* (case-insensitive substring
    match) without an underscore immediately before or after it.

    Case-insensitive so titles like "NAVMESH for gm_construct" are caught
    the same as lowercase ones.
    """
    return re.search(ignore_reg.format(word), str, flags=re.IGNORECASE) is not None

def containsIgnoreWords(str):
    """Return True if any configured ignore word is found in *str*.

    Delegates the per-word matching (underscore-exemption rule) to
    containsIgnoreWord.
    """
    return any(containsIgnoreWord(str, word) for word in ignore_words)

if __name__ == "__main__":
Expand All @@ -38,14 +49,20 @@ def containsIgnoreWords(str):

f = open(FILENAME, "w")

while True:
req = "{0}/{1}?key={2}&appid={3}&requiredtags[0]=map&numperpage={4}&page={5}&return_metadata=1&query_type=1".format(HOST, ENDPOINT, key, APPID, NUMPERPAGE, page)
response = urllib.request.urlopen(req).read()
resobj = json.loads(response.decode("utf-8", "ignore"))
total = resobj["response"]["total"]

for addon in resobj["response"]["publishedfiledetails"]:
if "title" in addon and containsIgnoreWords(addon["title"]):
cursor = "*"
last_cursor = None
while cursor != None and cursor != last_cursor:
req = "{0}/{1}?key={2}&appid={3}&requiredtags[0]=map&numperpage={4}&cursor={5}&return_metadata=1&query_type=1".format(HOST, ENDPOINT, key, APPID, NUMPERPAGE, urllib.parse.quote_plus(cursor))
response_data = urllib.request.urlopen(req).read()
response = json.loads(response_data.decode("utf-8", "ignore"))["response"]
total = response["total"]
last_cursor = cursor
cursor = response["next_cursor"]

for addon in response["publishedfiledetails"]:
hasignorewords = "title" in addon and containsIgnoreWords(addon["title"])
sexyfuntimes = "maybe_inappropriate_sex" in addon and addon["maybe_inappropriate_sex"] == True
if hasignorewords or sexyfuntimes:
ign_str = u"Ignoring: " + addon["title"]
print(ign_str.encode('utf-8'))
continue
Expand All @@ -56,18 +73,18 @@ def containsIgnoreWords(str):
workshopids.append(wsid)

# Informative output
finished = page * NUMPERPAGE + len(resobj["response"]["publishedfiledetails"])
finished = page * NUMPERPAGE + len(response["publishedfiledetails"])
print("Finished {0} addons. ({1:.2f}% of {2})".format(finished, finished * 100.0 / total, total))

# Move on to the next page
page += 1

if page * NUMPERPAGE > resobj["response"]["total"]:
if page * NUMPERPAGE > response["total"]:
break
else:
else:
# so valve doesn't get angry at us
time.sleep(DELAY)

# Results come back sorted, but reverse it so
# newer entries are added at the end instead of shifting everything at the beginning
workshopids.reverse()
Expand All @@ -78,4 +95,3 @@ def containsIgnoreWords(str):

print("Finished!!")
f.close()

0 comments on commit 5618086

Please sign in to comment.