diff --git a/backend/.env b/backend/.env index 586ff59..1ad1ee0 100644 --- a/backend/.env +++ b/backend/.env @@ -11,4 +11,5 @@ CHAIN_DILL_PATH="backend/runtime/chain.dill" SQL_DB_FILE="backend/resources/sql.db" PALM_THRESHOLD=0.5 PALM_API_MAX_CHARS=999 -PAGE_LOAD_TIMEOUT=5000 \ No newline at end of file +PAGE_LOAD_TIMEOUT=10000 +EXPAND_WAIT_TIME=2 \ No newline at end of file diff --git a/backend/resources/data_config.json b/backend/resources/data_config.json index 260f567..e3b82a4 100644 --- a/backend/resources/data_config.json +++ b/backend/resources/data_config.json @@ -38,5 +38,17 @@ "details" : "The course {title} has following details as updated latest in the website url: {details}" }, "text_config_type": "single" + }, + "courseList": { + "locator": [ + "td[class='coursepadding']->coursename" + ], + "ignore_words" : ["[Print Course (opens a new window)]", "Print (opens a new window)"], + "textify" : { + "count" : "There are {count} courses available in the Purdue Fort Wayne Computer Science Department.\n", + "coursename_iter" : "{coursename_iter}\n" + }, + "text_config_type": "multi", + "shrink" : "true" } } \ No newline at end of file diff --git a/backend/resources/urls.txt b/backend/resources/urls.txt index e7df85a..96c0dae 100644 --- a/backend/resources/urls.txt +++ b/backend/resources/urls.txt @@ -46,13 +46,4 @@ https://catalog.pfw.edu/preview_program.php?catoid=62&poid=16211 || degreeCourse https://catalog.pfw.edu/preview_program.php?catoid=62&poid=16095 || degreeCourse https://catalog.pfw.edu/preview_program.php?catoid=62&poid=16263 || degreeCourse https://catalog.pfw.edu/preview_program.php?catoid=62&poid=16093 || degreeCourse -https://catalog.pfw.edu/preview_program.php?catoid=63&poid=16993 || degreeCourse -https://www.pfw.edu/etcs/computer-science/beyond-classroom || studentOrgList -https://www.pfw.edu/etcs/computer-science/senior-design-capstone-projects || capstoneProjectTeams -https://www.pfw.edu/etcs/computer-science/summer-computing-camp-2023 || summerComputingCamp2023 -https://www.pfw.edu/etcs/computer-science/about-us || -https://www.pfw.edu/etcs/student-success-center || -https://www.pfw.edu/etcs/student-success-center#help-corner || helpDesk -https://www.pfw.edu/etcs/student-success-center#lead || helpDesk -https://www.pfw.edu/etcs/student-success-center#etcs-peer-support || -https://www.pfw.edu/etcs/student-success-center##etcs-advising || +https://catalog.pfw.edu/preview_program.php?catoid=63&poid=16993 || degreeCourse \ No newline at end of file diff --git a/backend/services/data/scrapping.py b/backend/services/data/scrapping.py index f941511..b0b3763 100644 --- a/backend/services/data/scrapping.py +++ b/backend/services/data/scrapping.py @@ -53,11 +53,6 @@ async def scrape(playwright: Playwright, url: str, locator_config: dict = None): url_content = url_set[1].strip() if len(url_set) > 1 else None try: await page.goto(url_link, wait_until="domcontentloaded") - await page.evaluate(''' - document.querySelectorAll('[x-data]').forEach((element) => { - element.removeAttribute('x-data'); - }); - ''') page_title = await page.title() page_title = page_title.split('|') flag = not any(element in page_title[0] for element in strict_no_elements) @@ -66,6 +61,10 @@ async def scrape(playwright: Playwright, url: str, locator_config: dict = None): config = locator_config.get(url_content) context_appender: str = "" locators = config.get('locator') + if bool(config.get('shrink')): + for element in await page.query_selector_all("*[aria-expanded]"): + await element.click() + await asyncio.sleep(int(os.getenv("EXPAND_WAIT_TIME"))) title_replacements = config.get('title_replacer') data['title'] = safe_replace(safe_replace(page_title[0], replacements), title_replacements).strip() data['type'] = url_content