Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Astra scraper #31

Merged
merged 11 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#Scrapers
LOGIN_NETID=
LOGIN_PASSWORD=
LOGIN_ASTRA_USERNAME=
LOGIN_ASTRA_PASSWORD=
HEADLESS_MODE=false

#Uploader
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ require (
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335
github.com/chromedp/chromedp v0.10.0
github.com/joho/godotenv v1.5.1
github.com/valyala/fastjson v1.6.4
go.mongodb.org/mongo-driver v1.15.0
)

Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/valyala/fastjson v1.6.4 h1:uAUNq9Z6ymTgGhcm0UynUAB6tlbakBrz6CQFax3BXVQ=
github.com/valyala/fastjson v1.6.4/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLrsQns1aXY=
github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c=
github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI=
github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY=
Expand Down
4 changes: 4 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ func main() {
scrapeOrganizations := flag.Bool("organizations", false, "Alongside -scrape, signifies that SOC organizations should be scraped.")
// Flag for event scraping
scrapeEvents := flag.Bool("events", false, "Alongside -scrape, signifies that events should be scraped.")
// Flag for astra scraping
scrapeAstra := flag.Bool("astra", false, "Alongside -scrape, signifies that Astra should be scraped.")

// Flags for parsing
parse := flag.Bool("parse", false, "Puts the tool into parsing mode.")
Expand Down Expand Up @@ -92,6 +94,8 @@ func main() {
scrapers.ScrapeOrganizations(*outDir)
case *scrapeEvents:
scrapers.ScrapeEvents(*outDir)
case *scrapeAstra:
scrapers.ScrapeAstra(*outDir)
default:
log.Panic("You must specify which type of scraping you would like to perform with one of the scraping flags!")
}
Expand Down
123 changes: 123 additions & 0 deletions scrapers/astra.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/*
This file contains the code for the Astra scraper.
*/

package scrapers

import (
"fmt"
"io"
"log"
"net/http"
"os"
"time"

"github.com/UTDNebula/api-tools/utils"
"github.com/joho/godotenv"
"github.com/valyala/fastjson"
)

// MAX_EVENTS_PER_DAY is the "limit" sent with each daily Astra calendar
// request. ScrapeAstra panics when a day's reported totalRecords reaches this
// value, because the response may then have been truncated.
//
// It is a constant because nothing reassigns it; the name is kept as-is so
// existing references keep compiling.
const MAX_EVENTS_PER_DAY = 5000

// ScrapeAstra signs into Astra through chromedp and then requests the calendar
// events one day at a time, starting from yesterday, until 90 consecutive days
// with fewer than 10 events each have been seen. Every day's raw JSON response
// is stored under its "YYYY-MM-DD" key in a single JSON object, which is
// written to <outDir>/reservations.json.
//
// Any failure (missing .env file, request error, non-200 status, a response
// without a totalRecords field, a day that hits MAX_EVENTS_PER_DAY, or a file
// error) panics.
func ScrapeAstra(outDir string) {
	// Load env vars
	if err := godotenv.Load(); err != nil {
		log.Panic("Error loading .env file")
	}

	// Start chromedp
	chromedpCtx, cancel := utils.InitChromeDp()

	// Make output folder
	if err := os.MkdirAll(outDir, 0777); err != nil {
		panic(err)
	}

	// Init http client
	tr := &http.Transport{
		MaxIdleConns:       10,
		IdleConnTimeout:    30 * time.Second,
		DisableCompression: true,
	}
	cli := &http.Client{Transport: tr}

	// Get cookies for auth
	astraHeaders := utils.RefreshAstraToken(chromedpCtx)
	time.Sleep(500 * time.Millisecond)
	cancel() // Don't need chromedp anymore

	// Start on the previous date to make sure we have today's data, regardless
	// of what timezone the scraper is in. AddDate moves by calendar day, so a
	// daylight-saving change can never repeat or skip a date the way adding a
	// fixed 24 hours can.
	date := time.Now().AddDate(0, 0, -1)

	// Stop condition: consecutive days with fewer than 10 events
	lt10EventsCount := 0

	// JSON object holding the results by day. Appending to a byte slice avoids
	// re-copying the whole document on every iteration.
	days := []byte{'{'}

	// Run until 90 days in a row with (almost) no events
	for lt10EventsCount < 90 {
		formattedDate := date.Format("2006-01-02")
		log.Printf("Scraping %s...", formattedDate)

		body := fetchAstraDay(cli, astraHeaders, formattedDate)

		// GetInt returns 0 both for "no events" and for a body it cannot parse,
		// so make sure the field is really there before trusting the count.
		if !fastjson.Exists(body, "totalRecords") {
			log.Panicf("ERROR: Response for %s has no totalRecords field!", formattedDate)
		}
		numEvents := fastjson.GetInt(body, "totalRecords")
		if numEvents >= MAX_EVENTS_PER_DAY {
			log.Panic("ERROR: Max events per day exceeded!")
		}
		if numEvents < 10 {
			lt10EventsCount += 1
			if lt10EventsCount > 30 {
				log.Printf("There have been %d days in a row with fewer than 10 events.", lt10EventsCount)
			}
		} else {
			lt10EventsCount = 0
		}

		// Add to record; every entry after the first needs a separating comma
		if len(days) > 1 {
			days = append(days, ',')
		}
		days = append(days, '"')
		days = append(days, formattedDate...)
		days = append(days, `":`...)
		days = append(days, body...)

		date = date.AddDate(0, 0, 1)
	}
	days = append(days, '}')

	// Write event data to output file. WriteFile also reports an error from
	// closing the file, which a bare Close() call would drop.
	if err := os.WriteFile(fmt.Sprintf("%s/reservations.json", outDir), days, 0666); err != nil {
		panic(err)
	}
}

// fetchAstraDay requests the Astra calendar events for formattedDate
// ("YYYY-MM-DD") using cli and the given request headers, and returns the raw
// response body. It panics on a request error, a non-200 status, or a failure
// to read the body; the body is closed on every path.
func fetchAstraDay(cli *http.Client, headers map[string][]string, formattedDate string) []byte {
	reqURL := fmt.Sprintf("https://www.aaiscloud.com/UTXDallas/~api/calendar/CalendarWeekGrid?_dc=%d&action=GET&start=0&limit=%d&isForWeekView=false&fields=ActivityId,ActivityPk,ActivityName,ParentActivityId,ParentActivityName,MeetingType,Description,StartDate,EndDate,DayOfWeek,StartMinute,EndMinute,ActivityTypeCode,ResourceId,CampusName,BuildingCode,RoomNumber,RoomName,LocationName,InstitutionId,SectionId,SectionPk,IsExam,IsCrosslist,IsAllDay,IsPrivate,EventId,EventPk,CurrentState,NotAllowedUsageMask,UsageColor,UsageColorIsPrimary,EventTypeColor,MaxAttendance,ActualAttendance,Capacity&filter=(StartDate%%3C%%3D%%22%sT23%%3A00%%3A00%%22)%%26%%26(EndDate%%3E%%3D%%22%sT00%%3A00%%3A00%%22)&page=1&sortOrder=%%2BStartDate,%%2BStartMinute", time.Now().UnixMilli(), MAX_EVENTS_PER_DAY, formattedDate, formattedDate)
	req, err := http.NewRequest("GET", reqURL, nil)
	if err != nil {
		panic(err)
	}
	req.Header = headers

	res, err := cli.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	if res.StatusCode != 200 {
		log.Panicf("ERROR: Status was: %s\nIf the status is 404, you've likely been IP ratelimited!", res.Status)
	}
	body, err := io.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	return body
}
73 changes: 73 additions & 0 deletions utils/methods.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,79 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string {
}
}

// RefreshAstraToken signs into Astra in the browser behind chromedpCtx, using
// the LOGIN_ASTRA_USERNAME and LOGIN_ASTRA_PASSWORD environment variables, and
// returns the headers to send with later Astra requests. The "Cookie" header
// carries every cookie the browser holds after signing in.
//
// It panics if either credential is missing or empty, if any browser action
// fails, or if the UTXDallas.ASPXFORMSAUTH cookie is absent after signing in.
func RefreshAstraToken(chromedpCtx context.Context) map[string][]string {
	// Get username and password. An empty value (as shipped in .env.template)
	// is treated the same as a missing one: it can never sign in.
	username, present := os.LookupEnv("LOGIN_ASTRA_USERNAME")
	if !present || username == "" {
		log.Panic("LOGIN_ASTRA_USERNAME is missing from .env!")
	}
	password, present := os.LookupEnv("LOGIN_ASTRA_PASSWORD")
	if !present || password == "" {
		log.Panic("LOGIN_ASTRA_PASSWORD is missing from .env!")
	}

	// Sign in
	VPrintf("Signing in...")
	_, err := chromedp.RunResponse(chromedpCtx,
		// Start from a clean session so stale cookies are not sent back
		chromedp.ActionFunc(func(ctx context.Context) error {
			return network.ClearBrowserCookies().Do(ctx)
		}),
		chromedp.Navigate(`https://www.aaiscloud.com/UTXDallas/logon.aspx?ReturnUrl=%2futxdallas%2fcalendars%2fdailygridcalendar.aspx`),
		chromedp.WaitVisible(`input#userNameField-inputEl`),
		chromedp.SendKeys(`input#userNameField-inputEl`, username),
		chromedp.SendKeys(`input#textfield-1029-inputEl`, password),
		chromedp.WaitVisible(`a#logonButton`),
		chromedp.Click(`a#logonButton`),
		chromedp.WaitVisible(`body`, chromedp.ByQuery),
	)
	if err != nil {
		panic(err)
	}

	// Save all cookies to string
	cookieStr := ""
	_, err = chromedp.RunResponse(chromedpCtx,
		chromedp.WaitVisible(`body`, chromedp.ByQuery),
		chromedp.ActionFunc(func(ctx context.Context) error {
			cookies, err := network.GetCookies().Do(ctx)
			if err != nil {
				// Report the real failure instead of letting it be masked by
				// the "failed to get a new token" error below.
				return err
			}
			gotToken := false
			for _, cookie := range cookies {
				cookieStr += fmt.Sprintf("%s=%s; ", cookie.Name, cookie.Value)
				if cookie.Name == "UTXDallas.ASPXFORMSAUTH" {
					VPrintf("Got new token: UTXDallas.ASPXFORMSAUTH = %s", cookie.Value)
					gotToken = true
				}
			}
			if !gotToken {
				return errors.New("failed to get a new token")
			}
			return nil
		}),
	)
	if err != nil {
		panic(err)
	}

	// Return headers, copied from a request the actual site made
	return map[string][]string{
		"Host":                      {"www.aaiscloud.com"},
		"User-Agent":                {"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0"},
		"Accept":                    {"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"},
		"Accept-Language":           {"en-US,en;q=0.5"},
		"Accept-Encoding":           {"gzip, deflate, br, zstd"},
		"Connection":                {"keep-alive"},
		"Cookie":                    {cookieStr},
		"Upgrade-Insecure-Requests": {"1"},
		"Sec-Fetch-Dest":            {"document"},
		"Sec-Fetch-Mode":            {"navigate"},
		"Sec-Fetch-Site":            {"none"},
		"Sec-Fetch-User":            {"?1"},
		"Priority":                  {"u=0, i"},
	}
}

// Encodes and writes the given data as tab-indented JSON to the given filepath.
func WriteJSON(filepath string, data interface{}) error {
fptr, err := os.Create(filepath)
Expand Down
Loading