v0.0.1

amazon-science · Sep 6, 2023 · 9417976 · 9417976
1 parent 402727b
commit 9417976
Show file tree

Hide file tree

Showing 11 changed files with 851 additions and 10 deletions.
diff --git a/LICENSE.txt b/LICENSE.txt
diff --git a/README.md b/README.md
@@ -1,18 +1,31 @@
-## My Project
+# job-posting-structure
 
-TODO: Fill this README out!
+Parses structured information from HTML-formatted job postings.
 
-Be sure to:
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 
-* Change the title in this README
-* Edit your repository description on GitHub
-* Write in your license below and create a LICENSE file
+## JobStruct class
 
-## Security
+The primary class is called JobStruct and can be initialized from
+a filename, an HTML string, or an existing BeautifulSoup object that
+contains parsed HTML:
 
-See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
+    j = JobStruct.from_file("myJobPosting.html")
 
-## License
+    with open("myJobPosting.html") as f:
+        posting_html_str = f.read()
+    j = JobStruct.from_string(posting_html_str)
 
-This library is licensed under the LICENSE NAME HERE License.
+    posting_soup_obj = BeautifulSoup(posting_html_str, "html.parser")
+    j = JobStruct.from_soup(posting_soup_obj)
 
+Once initialized, the JobStruct object has attributes for each segment
+that was parsed from the job posting:
+
+* description
+* benefits
+* qualifications
+* responsibilities
+* requirements
+* eeo (Equal Employment Opportunity)
+* other
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1 @@
+beautifulsoup4 >= 4.0.0
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,24 @@
+[metadata]
+name = jobstruct
+author = Suraj Maharjan
+author_email = [email protected]
+url = https://github.com/amazon-science/job-posting-structure
+version = attr: jobstruct.__version__
+license = Creative Commons Non-Commercial 4.0
+description = Parses structured information from HTML-formatted job postings.
+long_description = file: README.md
+long_description_content_type = text/markdown
+classifiers =
+    Development Status :: 3 - Alpha
+    License :: Free for non-commercial use
+    Intended Audience :: Science/Research
+    Operating System :: OS Independent
+    Natural Language :: English
+    Programming Language :: Python :: 3
+    Topic :: Scientific/Engineering
+    Topic :: Scientific/Engineering :: Information Analysis
+
+[options]
+install_requires = file: requirements.txt
+python_requires = >= 3.8
+setup_requires = setuptools
diff --git a/src/jobstruct/__init__.py b/src/jobstruct/__init__.py
@@ -0,0 +1,6 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.  
+# SPDX-License-Identifier: CC-BY-NC-4.0
+
+from .jobstruct import JobStruct
+
+__version__ = "0.0.1"
diff --git a/src/jobstruct/jobstruct.py b/src/jobstruct/jobstruct.py
@@ -0,0 +1,184 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.  
+# SPDX-License-Identifier: CC-BY-NC-4.0
+
+from bs4 import BeautifulSoup
+
+class JobStruct:
+    """
+    A class that represents a parsed HTML job posting, starting
+    from either a filename, HTML text, or a BeautifulSoup object.
+    The parsed segments of the job posting, available as attributes, are:
+    * description
+    * benefits
+    * qualifications
+    * responsibilities
+    * requirements
+    * eeo (Equal Employment Opportunity)
+    * other
+
+    Each attribute is the same list object that is stored under that name
+    in `self.segments`; every list holds the text lines assigned to the
+    segment, in document order.
+    """
+
+    # Elements that are inspected while walking the document. Short texts in
+    # these elements are treated as headings, longer ones as content.
+    __TAGS = [
+        "p",
+        "div",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+    ]
+
+    # Heading keywords (lower case) that select each segment type. The dict
+    # order matters: the first segment with a matching keyword wins.
+    __SEGMENTS = {
+        "description": frozenset((
+            "description",
+            "overview",
+            "glance",
+            "summary",
+            "posting"
+        )),
+        "benefits": frozenset((
+            "perks",
+            "benefits",
+            "offer"
+        )),
+        "qualifications": frozenset((
+            "experience",
+            "qualification",
+            "qualifications",
+            "skills",
+        )),
+        "responsibilities": frozenset((
+            "responsibilities",
+            "duties",
+            "functions",
+            "function(s)"
+        )),
+        "requirements": frozenset((
+            "requirements",
+            "required",
+            "requirement"
+        )),
+        "eeo": frozenset((
+            "equal",
+            "opportunity",
+            "employer"
+        ))
+    }
+
+    def __init__(self, soup: BeautifulSoup = None):
+        """
+        Segments the HTML job posting `soup` that has been parsed by BeautifulSoup,
+        and provides the segments as attributes. If no `soup` is provided, returns
+        an empty structure (every segment is an empty list and `self.soup` is None).
+        """
+        self._init_segments()
+        # Always define the attribute so that an empty structure can be
+        # inspected without raising AttributeError.
+        self.soup: BeautifulSoup = soup
+        if soup is not None:
+            self._segment()
+        self._add_attributes()
+
+
+    @classmethod
+    def from_file(cls, filename: str, encoding: str = None) -> "JobStruct":
+        """
+        Creates a JobStruct object from the HTML in `filename`.
+
+        `encoding` is passed to `open()`; the default of None keeps the
+        platform's default text encoding. Pass e.g. "utf-8" when the file's
+        encoding is known.
+        """
+        with open(filename, encoding=encoding) as f:
+            soup: BeautifulSoup = BeautifulSoup(f.read(), "html.parser")
+        return cls(soup)
+
+
+    @classmethod
+    def from_string(cls, html: str) -> "JobStruct":
+        """
+        Creates a JobStruct object from the string `html` containing HTML.
+        """
+        soup: BeautifulSoup = BeautifulSoup(html, "html.parser")
+        return cls(soup)
+
+
+    @classmethod
+    def from_soup(cls, soup: BeautifulSoup) -> "JobStruct":
+        """
+        Creates a JobStruct object from BeautifulSoup-parsed HTML in `soup`.
+        """
+        return cls(soup)
+
+
+    def to_dict(self):
+        """
+        Convert the JobStruct object to a dictionary containing the segment
+        attributes. The lists are shallow copies, so modifying the result
+        does not modify the JobStruct object.
+        """
+        return {segment: list(values) for segment, values in self.segments.items()}
+
+
+    def _init_segments(self):
+        """
+        Initial empty list for each segment type.
+        """
+        self.segments = {segment: [] for segment in JobStruct.__SEGMENTS}
+        # Other is the catch-all type for segments that don't match a keyword.
+        self.segments["other"] = []
+
+
+    def _segment(self):
+        """
+        Loop over HTML elements to find headings for each segment type and
+        append the elements following the heading to the segment lists.
+
+        A text of at most five words is treated as a heading and selects the
+        current segment; a longer text in a terminal element (see
+        `_is_terminal`) is split into lines that are appended to the current
+        segment. Lines containing "equal opportunity employer" (in any
+        letter case) always go to the "eeo" segment.
+        """
+        # "html.parser" does not add a <body> around HTML fragments, so fall
+        # back to searching the whole document when there is no <body>.
+        body = self.soup.body
+        root = body if body is not None else self.soup
+
+        segment = "other"
+        for element in root.find_all(JobStruct.__TAGS):
+            text = element.get_text(separator="\n").strip()
+            if not text:
+                continue
+            if len(text.split()) <= 5:
+                segment = self._classify_segment(text.lower())
+            elif self._is_terminal(element):
+                for line in text.split("\n"):
+                    # Whitespace-only text nodes yield blank lines; skip them.
+                    if not line.strip():
+                        continue
+                    # Compare in lower case: postings usually capitalize
+                    # "Equal Opportunity Employer".
+                    if "equal opportunity employer" in line.lower():
+                        self.segments["eeo"].append(line)
+                    else:
+                        self.segments[segment].append(line)
+
+
+    def _classify_segment(self, text: str):
+        """
+        Classify `text` into one of the segment types using keywords.
+        Defaults to "other" if no keywords were found.
+
+        `text` is expected to be lower case already, because the keywords
+        are lower case. Colons around each word are ignored.
+        """
+        words = [word.strip(":") for word in text.split()]
+        for segment, keywords in JobStruct.__SEGMENTS.items():
+            if any(word in keywords for word in words):
+                return segment
+        return "other"
+
+
+    def _is_terminal(self, element):
+        """
+        Return True if `element` has no descendant that is itself one of the
+        inspected tags, i.e. its text is not going to be visited again
+        through a nested element.
+        """
+        return all(
+            element.find(tag) is None
+            for tag in JobStruct.__TAGS
+        )
+
+
+    def _add_attributes(self):
+        """
+        Add attributes for each segment type to the returned object.
+        """
+        for segment, values in self.segments.items():
+            # Internal invariant: a segment name must never shadow an
+            # existing attribute or method of the object.
+            assert not hasattr(self, segment)
+            setattr(self, segment, values)
+
+
+    def __str__(self):
+        """
+        Render every segment except "other" as `name: [` / lines / `]`,
+        or `name: []` when the segment is empty.
+        """
+        output = []
+        for segment, values in self.segments.items():
+            if segment == "other":
+                continue
+            if values:
+                output.append(f"{segment}: [")
+                output.extend(values)
+                output.append("]")
+            else:
+                output.append(f"{segment}: []")
+        return "\n".join(output)
diff --git a/src/jobstruct/py.typed b/src/jobstruct/py.typed
@@ -0,0 +1 @@
+# Marker file that indicates this package supports typing
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# Marker file that indicates this package supports typing