Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scrape 'Research Tags' for professor profiles #11

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 68 additions & 5 deletions scrapers/profiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,16 @@ import (

"github.com/UTDNebula/nebula-api/schema"
"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/cdproto/dom"
"github.com/chromedp/cdproto/runtime"
"github.com/chromedp/chromedp"
"go.mongodb.org/mongo-driver/bson/primitive"
)

const BASE_URL string = "https://profiles.utdallas.edu/browse?page="

var primaryLocationRegex *regexp.Regexp = regexp.MustCompile("^(\\w+)\\s+(\\d+\\.\\d{3}[A-z]?)$")
var fallbackLocationRegex *regexp.Regexp = regexp.MustCompile("^([A-z]+)(\\d+)\\.?(\\d{3}[A-z]?)$")
// Location patterns used to parse an office location string into building and room.
//
// These are raw (backtick) string literals, so every regex escape is written
// with a single backslash. Doubling the backslashes inside a raw string would
// make the pattern look for literal '\' characters and never match real input.
//
// The letter classes are spelled [A-Za-z] rather than [A-z], because the range
// A-z also covers the ASCII punctuation that sits between 'Z' and 'a'.
var (
	// primaryLocationRegex matches "<building> <floor>.<room>" with an optional
	// trailing letter, e.g. "ECSS 2.415" or "ECSS 2.415A".
	// Group 1 is the building, group 2 is the room.
	primaryLocationRegex = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-Za-z]?)$`)

	// fallbackLocationRegex matches the compact form without a separating space
	// and with an optional dot, e.g. "ECSS2.415" or "JO4614A".
	// Group 1 is the building, group 2 the leading digits, group 3 the last
	// three digits plus an optional letter.
	fallbackLocationRegex = regexp.MustCompile(`^([A-Za-z]+)(\d+)\.?(\d{3}[A-Za-z]?)$`)
)

func parseLocation(text string) schema.Location {
var building string
Expand Down Expand Up @@ -120,7 +121,7 @@ func scrapeProfessorLinks() []string {
for _, node := range nodes {
href, hasHref := node.Attribute("href")
if !hasHref {
return errors.New("Professor card was missing an href!")
return errors.New("professor card was missing an href")
}
professorLinks = append(professorLinks, href)
}
Expand Down Expand Up @@ -185,7 +186,7 @@ func ScrapeProfiles(outDir string) {
var hasSrc bool
imageUri, hasSrc = attributes["src"]
if !hasSrc {
return errors.New("No src found for imageUri!")
return errors.New("no src found for imageUri")
}
}
return err
Expand All @@ -200,7 +201,7 @@ func ScrapeProfiles(outDir string) {
var hasStyle bool
imageUri, hasStyle = attributes["style"]
if !hasStyle {
return errors.New("No style found for imageUri!")
return errors.New("no style found for imageUri")
}
imageUri = imageUri[23 : len(imageUri)-3]
}
Expand Down Expand Up @@ -257,6 +258,7 @@ func ScrapeProfiles(outDir string) {
var tempText string
err := chromedp.Text("div.contact_info > div", &tempText).Do(ctx)
texts = strings.Split(tempText, "\n")
fmt.Println(tempText)
return err
},
),
Expand All @@ -269,6 +271,65 @@ func ScrapeProfiles(outDir string) {
phoneNumber, office := parseList(texts)
log.Printf("Parsed list! #: %s, Office: %v\n\n", phoneNumber, office)

//Get the Tags
var tags map[string]string = map[string]string{}
var educations [][]string = [][]string{}
log.Printf("Scraping tags and Educations...\n")
err = chromedp.Run(chromedpCtx,
chromedp.QueryAfter(".tags-badge",
func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
for _, node := range nodes {
tempText := getNodeText(node)
href, hasHref := node.Attribute("href")
if !hasHref {
return errors.New("professor card was missing an href")
}
tags[tempText] = href
}
return nil
}, chromedp.AtLeast(0),
),
)

if err != nil {
panic(err)
}

err = chromedp.Run(chromedpCtx,
chromedp.QueryAfter("#preparation>div",
func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
for _, node := range nodes {
//This successfully gets to the correct divs,
//however major workarounds are required because there is text not within any node
element, err := dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)

if err != nil {
return err
}

regexSplitter := regexp.MustCompile(`\s?<[\w+" "|=]*>\s?|\s?<\/[\w]*>\s?|[\s]{2,}|\t|\s?-\s?`)
out := []string{}

for _, val := range regexSplitter.Split(element, -1) {
if val != "" {
out = append(out, val)
}
}

educations = append(educations, out)
}
return nil
}, chromedp.AtLeast(0),
),
)

if err != nil {
panic(err)
}

log.Printf("Scraped tags! #: %s\n", tags)
log.Printf("Scraped educations! #: %s\n", educations)

professors = append(professors, schema.Professor{
Id: schema.IdWrapper{Id: primitive.NewObjectID()},
First_name: firstName,
Expand All @@ -281,6 +342,8 @@ func ScrapeProfiles(outDir string) {
Image_uri: imageUri,
Office_hours: []schema.Meeting{},
Sections: []schema.IdWrapper{},
Tags: tags,
Education: educations,
})

log.Printf("Scraped profile for %s %s!\n\n", firstName, lastName)
Expand Down