Skip to content

Commit

Permalink
v0.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
Mark Howison committed Sep 6, 2023
1 parent 402727b commit 9417976
Show file tree
Hide file tree
Showing 11 changed files with 851 additions and 10 deletions.
407 changes: 407 additions & 0 deletions LICENSE.txt

Large diffs are not rendered by default.

33 changes: 23 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,31 @@
## My Project
# job-posting-structure

TODO: Fill this README out!
Parses structured information from HTML-formatted job postings.

Be sure to:
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

* Change the title in this README
* Edit your repository description on GitHub
* Write in your license below and create a LICENSE file
## JobStruct class

## Security
The primary class is called JobStruct and can be initialized from
a filename, an HTML string, or an existing BeautifulSoup object that
contains parsed HTML:

See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
j = JobStruct.from_file("myJobPosting.html")

## License
with open("myJobPosting.html") as f:
posting_html_str = f.read()
j = JobStruct.from_string(posting_html_str)

This library is licensed under the LICENSE NAME HERE License.
posting_soup_obj = BeautifulSoup(posting_html_str, "html.parser")
j = JobStruct.from_soup(posting_soup_obj)

Once initialized, the JobStruct object has attributes for each segment
that was parsed from the job posting:

* description
* benefits
* qualifications
* responsibilities
* requirements
* eeo (Equal Employment Opportunity)
* other
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
beautifulsoup4 >= 4.0.0
24 changes: 24 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[metadata]
name = jobstruct
author = Suraj Maharjan
author_email = [email protected]
url = https://github.com/amazon-science/job-posting-structure
version = attr: jobstruct.__version__
license = Creative Commons Non-Commercial 4.0
description = Parses structured information from HTML-formatted job postings.
long_description = file: README.md
long_description_content_type = text/markdown
classifiers =
Development Status :: 3 - Alpha
License :: Free for non-commercial use
Intended Audience :: Science/Research
Operating System :: OS Independent
Natural Language :: English
Programming Language :: Python :: 3
Topic :: Scientific/Engineering
Topic :: Scientific/Engineering :: Information Analysis

[options]
install_requires = file: requirements.txt
python_requires = >= 3.8
setup_requires = setuptools
6 changes: 6 additions & 0 deletions src/jobstruct/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: CC-BY-NC-4.0

from .jobstruct import JobStruct

# Package version string; setup.cfg reads it through `attr: jobstruct.__version__`.
__version__ = "0.0.1"
184 changes: 184 additions & 0 deletions src/jobstruct/jobstruct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: CC-BY-NC-4.0

from bs4 import BeautifulSoup

class JobStruct:
    """
    A class that represents a parsed HTML job posting, starting
    from either a filename, HTML text, or a BeautifulSoup object.
    The parsed segments of the job posting, available as attributes, are:
    * description
    * benefits
    * qualifications
    * responsibilities
    * requirements
    * eeo (Equal Employment Opportunity)
    * other

    Every attribute is a list of text lines. The same lists are also
    reachable through the `segments` dictionary, keyed by segment name.
    """

    # Elements that are inspected while walking the posting, either as a
    # potential heading or as a block of body text.
    __TAGS = [
        "p",
        "div",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
    ]

    # Heading keywords for each segment type. Dictionary order matters:
    # the first segment with a matching keyword wins.
    __SEGMENTS = {
        "description": frozenset((
            "description",
            "overview",
            "glance",
            "summary",
            "posting"
        )),
        "benefits": frozenset((
            "perks",
            "benefits",
            "offer"
        )),
        "qualifications": frozenset((
            "experience",
            "qualification",
            "qualifications",
            "skills",
        )),
        "responsibilities": frozenset((
            "responsibilities",
            "duties",
            "functions",
            "function(s)"
        )),
        "requirements": frozenset((
            "requirements",
            "required",
            "requirement"
        )),
        "eeo": frozenset((
            "equal",
            "opportunity",
            "employer"
        ))
    }

    # Text of at most this many words is treated as a heading.
    __MAX_HEADING_WORDS = 5

    # A line containing this phrase (case-insensitively) always goes to the
    # "eeo" segment, whatever heading precedes it.
    __EEO_PHRASE = "equal opportunity employer"

    def __init__(self, soup: "BeautifulSoup" = None):
        """
        Segments the HTML job posting `soup` that has been parsed by BeautifulSoup,
        and provides the segments as attributes. If no `soup` is provided, returns
        an empty structure whose segment attributes are all empty lists.
        """
        self._init_segments()
        self.soup: BeautifulSoup = soup
        if soup is not None:
            self._segment()
        # Always expose the attributes so an empty structure has the same
        # shape as a parsed one.
        self._add_attributes()

    @classmethod
    def from_file(cls, filename: str) -> "JobStruct":
        """
        Creates a JobStruct object from the HTML in `filename`.
        """
        with open(filename) as f:
            soup: BeautifulSoup = BeautifulSoup(f.read(), "html.parser")
        return cls(soup)

    @classmethod
    def from_string(cls, html: str) -> "JobStruct":
        """
        Creates a JobStruct object from the string `html` containing HTML.
        """
        soup: BeautifulSoup = BeautifulSoup(html, "html.parser")
        return cls(soup)

    @classmethod
    def from_soup(cls, soup: "BeautifulSoup") -> "JobStruct":
        """
        Creates a JobStruct object from BeautifulSoup-parsed HTML in `soup`.
        """
        return cls(soup)

    def to_dict(self) -> dict:
        """
        Convert the JobStruct object to a dictionary containing the segment
        attributes. The lists are copies, so mutating the result does not
        affect this object.
        """
        return {segment: list(values) for segment, values in self.segments.items()}

    def _init_segments(self):
        """
        Initial empty list for each segment type.
        """
        self.segments = {segment: [] for segment in JobStruct.__SEGMENTS}
        # Other is the catch-all type for segments that don't match a keyword.
        self.segments["other"] = []

    def _segment(self):
        """
        Loop over HTML elements to find headings for each segment type and
        append the elements following the heading to the segment lists.

        Text of at most five words is treated as a heading and selects the
        current segment. Longer text is split into lines and stored under
        the current segment, but only for elements that contain no nested
        block element, so that a wrapper and its children are not both
        recorded.
        """
        # html.parser does not synthesize a <body> for HTML fragments, in
        # which case `soup.body` is None; search from the root instead.
        root = self.soup.body if self.soup.body is not None else self.soup
        segment = "other"
        for element in root.find_all(JobStruct.__TAGS):
            text = element.get_text(separator="\n").strip()
            if not text:
                continue
            if len(text.split()) <= JobStruct.__MAX_HEADING_WORDS:
                segment = self._classify_segment(text)
            elif self._is_terminal(element):
                for raw_line in text.split("\n"):
                    line = raw_line.strip()
                    if not line:
                        # Markup whitespace between inline elements yields
                        # blank lines that carry no content.
                        continue
                    if JobStruct.__EEO_PHRASE in line.lower():
                        self.segments["eeo"].append(line)
                    else:
                        self.segments[segment].append(line)

    def _classify_segment(self, text: str) -> str:
        """
        Classify `text` into one of the segment types using keywords.
        Matching is case-insensitive and ignores colons around a word.
        Defaults to "other" if no keywords were found.
        """
        words = [word.strip(":") for word in text.lower().split()]
        for segment, keywords in JobStruct.__SEGMENTS.items():
            if any(word in keywords for word in words):
                return segment
        return "other"

    def _is_terminal(self, element) -> bool:
        """
        Return True if `element` has no descendant that is itself one of
        the inspected tags, i.e. it is an innermost block of text.
        """
        # `find` accepts a list of tag names and matches any of them.
        return element.find(JobStruct.__TAGS) is None

    def _add_attributes(self):
        """
        Add attributes for each segment type to the returned object.
        Each attribute refers to the same list object stored in `segments`.
        """
        for segment, values in self.segments.items():
            # Internal invariant: a segment name must never shadow an
            # existing attribute such as `soup` or `segments`.
            assert not hasattr(self, segment), f"attribute clash: {segment}"
            setattr(self, segment, values)

    def __str__(self):
        """
        Render every segment except "other", one value per line between
        brackets; empty segments are rendered as `name: []`.
        """
        output = []
        for segment, values in self.segments.items():
            if segment == "other":
                continue
            if values:
                output.append(f"{segment}: [")
                output.extend(values)
                output.append("]")
            else:
                output.append(f"{segment}: []")
        return "\n".join(output)
1 change: 1 addition & 0 deletions src/jobstruct/py.typed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Marker file that indicates this package supports typing
Loading

0 comments on commit 9417976

Please sign in to comment.