Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

escape domain regexes #141

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@ jobs:

- name: Validate
run: |
isort --check-only src/ tests/
isort --version
isort --check-only --diff src/ tests/
black --version
black --check --line-length 100 .
mypy --version
mypy

build-and-test:
Expand Down
16 changes: 14 additions & 2 deletions src/packageurl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,12 @@ def normalize(
subpath: Optional[AnyStr],
encode: Optional[bool] = ...,
) -> Tuple[
str, Optional[str], str, Optional[str], Union[str, Dict[str, str], None], Optional[str]
str,
Optional[str],
str,
Optional[str],
Union[str, Dict[str, str], None],
Optional[str],
]: ...


Expand Down Expand Up @@ -335,7 +340,14 @@ def normalize(
version_norm = normalize_version(version, encode)
qualifiers_norm = normalize_qualifiers(qualifiers, encode)
subpath_norm = normalize_subpath(subpath, encode)
return type_norm, namespace_norm, name_norm, version_norm, qualifiers_norm, subpath_norm
return (
type_norm,
namespace_norm,
name_norm,
version_norm,
qualifiers_norm,
subpath_norm,
)


class PackageURL(
Expand Down
104 changes: 61 additions & 43 deletions src/packageurl/contrib/url2purl.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,10 @@ def build_generic_purl(uri):


@purl_router.route(
"https?://registry.npmjs.*/.*",
"https?://registry.yarnpkg.com/.*",
"https?://(www\\.)?npmjs.*/package.*",
"https?://(www\\.)?yarnpkg.com/package.*",
r"https?://registry\.npmjs\.(com|org)/.*",
r"https?://registry\.yarnpkg\.com/.*",
r"https?://(www\.)?npmjs\.(com|org)/package.*",
r"https?://(www\.)?yarnpkg\.com/package.*",
)
def build_npm_purl(uri):
# npm URLs are difficult to disambiguate with regex
Expand Down Expand Up @@ -221,9 +221,9 @@ def build_npm_web_purl(uri):


@purl_router.route(
"https?://repo1.maven.org/maven2/.*",
"https?://central.maven.org/maven2/.*",
"maven-index://repo1.maven.org/.*",
r"https?://repo1\.maven\.org/maven2/.*",
r"https?://central\.maven\.org/maven2/.*",
r"maven-index://repo1\.maven\.org/.*",
)
def build_maven_purl(uri):
path = unquote_plus(urlparse(uri).path)
Expand Down Expand Up @@ -253,15 +253,26 @@ def build_maven_purl(uri):

qualifiers["classifier"] = classifier.strip("-")

valid_types = ("aar", "ear", "mar", "pom", "rar", "rpm", "sar", "tar.gz", "war", "zip")
valid_types = (
"aar",
"ear",
"mar",
"pom",
"rar",
"rpm",
"sar",
"tar.gz",
"war",
"zip",
)
if extension in valid_types:
qualifiers["type"] = extension

return PackageURL("maven", namespace, name, version, qualifiers)


# https://rubygems.org/gems/i18n-js-3.0.11.gem
@purl_router.route("https?://rubygems.org/(downloads|gems)/.*")
@purl_router.route(r"https?://rubygems\.org/(downloads|gems)/.*")
def build_rubygems_purl(uri):
# We use a more general route pattern instead of using `rubygems_pattern`
# below by itself because we want to capture all rubygems download URLs,
Expand All @@ -272,7 +283,7 @@ def build_rubygems_purl(uri):
# https://rubygems.org/downloads/jwt-0.1.8.gem
# https://rubygems.org/gems/i18n-js-3.0.11.gem
rubygems_pattern = (
r"^https?://rubygems.org/(downloads|gems)/(?P<name>.+)-(?P<version>.+)(\.gem)$"
r"^https?://rubygems\.org/(downloads|gems)/(?P<name>.+)-(?P<version>.+)(\.gem)$"
)
return purl_from_pattern("gem", rubygems_pattern, uri)

Expand All @@ -296,7 +307,7 @@ def build_cran_purl(uri):
# https://pypi.python.org/packages/2.6/t/threadpool/threadpool-1.2.7-py2.6.egg
# https://pypi.python.org/packages/any/s/setuptools/setuptools-0.6c11-1.src.rpm
# https://files.pythonhosted.org/packages/84/d8/451842a5496844bb5c7634b231a2e4caf0d867d2e25f09b840d3b07f3d4b/multi_key_dict-2.0.win32.exe
pypi_pattern = r"(?P<name>(\w\.?)+(-\w+)*)-(?P<version>.+)\.(zip|tar.gz|tar.bz2|tgz|egg|rpm|exe)$"
pypi_pattern = r"(?P<name>(\w\.?)+(-\w+)*)-(?P<version>.+)\.(zip|tar\.gz|tar\.bz2|tgz|egg|rpm|exe)$"

# This pattern can be found in the following locations:
# - wheel.wheelfile.WHEEL_INFO_RE
Expand All @@ -312,8 +323,9 @@ def build_cran_purl(uri):


@purl_router.route(
"https?://pypi.org/(packages|project)/.+",
"https?://.+python.+org/(packages|project)/.*",
r"https?://pypi\.org/(packages|project)/.+",
r"https?://pypi\.python\.org/(packages|project)/.*",
r"https?://files\.pythonhosted\.org/(packages|project)/.*",
)
def build_pypi_purl(uri):
path = unquote_plus(urlparse(uri).path)
Expand Down Expand Up @@ -357,14 +369,16 @@ def build_composer_purl(uri):

# http://nuget.org/packages/EntityFramework/4.2.0.0
# https://www.nuget.org/api/v2/package/Newtonsoft.Json/11.0.1
nuget_www_pattern = r"^https?://.*nuget.org/(api/v2/)?packages?/(?P<name>.+)/(?P<version>.+)$"
nuget_www_pattern = (
r"^https?://(www\.)?nuget\.org/(api/v2/)?packages?/(?P<name>.+)/(?P<version>.+)$"
)

register_pattern("nuget", nuget_www_pattern)


# https://api.nuget.org/v3-flatcontainer/newtonsoft.json/10.0.1/newtonsoft.json.10.0.1.nupkg
nuget_api_pattern = (
r"^https?://api.nuget.org/v3-flatcontainer/"
r"^https?://api\.nuget\.org/v3-flatcontainer/"
r"(?P<name>.+)/"
r"(?P<version>.+)/"
r".*(nupkg)$" # ends with "nupkg"
Expand All @@ -378,7 +392,7 @@ def build_composer_purl(uri):
# https://sourceforge.net/projects/ventoy/files/v1.0.96/Ventoy%201.0.96%20release%20source%20code.tar.gz/download
# https://sourceforge.net/projects/geoserver/files/GeoServer/2.23.4/geoserver-2.23.4-war.zip/download
sourceforge_download_pattern = (
r"^https?://.*sourceforge.net/projects/"
r"^https?://((master|iweb)\.dl\.)?sourceforge\.net/projects/"
r"(?P<name>.+)/"
r"files/"
r"(?i:(?P=name)/)?" # optional case-insensitive name segment repeated
Expand All @@ -392,7 +406,7 @@ def build_composer_purl(uri):

# https://sourceforge.net/projects/spacesniffer/files/spacesniffer_1_3_0_2.zip/download
sourceforge_download_pattern_bis = (
r"^https?://.*sourceforge.net/projects/"
r"^https?://((master|iweb)\.dl\.)?sourceforge\.net/projects/"
r"(?P<name>.+)/"
r"files/"
r"(?i:(?P=name))_*(?P<version>[0-9_]+).*"
Expand All @@ -402,7 +416,7 @@ def build_composer_purl(uri):
register_pattern("sourceforge", sourceforge_download_pattern_bis)


@purl_router.route("https?://.*sourceforge.net/project/.*")
@purl_router.route(r"https?://((master|iweb)\.dl\.)?sourceforge\.net/projects?/.*")
def build_sourceforge_purl(uri):
# We use a more general route pattern instead of using `sourceforge_pattern`
# below by itself because we want to capture all sourceforge download URLs,
Expand All @@ -412,7 +426,7 @@ def build_sourceforge_purl(uri):

# http://master.dl.sourceforge.net/project/libpng/zlib/1.2.3/zlib-1.2.3.tar.bz2
sourceforge_pattern = (
r"^https?://.*sourceforge.net/projects?/"
r"^https?://((master|iweb)\.dl\.)?sourceforge\.net/projects?/"
r"(?P<namespace>([^/]+))/" # do not allow more "/" segments
r"(OldFiles/)?"
r"(?P<name>.+)/"
Expand All @@ -437,27 +451,29 @@ def build_sourceforge_purl(uri):
if remaining_uri_path_segments:
project_name = remaining_uri_path_segments[0] # aloyscore
sourceforge_purl = PackageURL(
type="sourceforge", name=project_name, qualifiers={"download_url": uri}
type="sourceforge",
name=project_name,
qualifiers={"download_url": uri},
)
return sourceforge_purl


# https://crates.io/api/v1/crates/rand/0.7.2/download
cargo_pattern = r"^https?://crates.io/api/v1/crates/(?P<name>.+)/(?P<version>.+)(\/download)$"
cargo_pattern = r"^https?://crates\.io/api/v1/crates/(?P<name>.+)/(?P<version>.+)(\/download)$"

register_pattern("cargo", cargo_pattern)


# https://raw.githubusercontent.com/volatilityfoundation/dwarf2json/master/LICENSE.txt
github_raw_content_pattern = (
r"https?://raw.githubusercontent.com/(?P<namespace>[^/]+)/(?P<name>[^/]+)/"
r"https?://raw\.githubusercontent\.com/(?P<namespace>[^/]+)/(?P<name>[^/]+)/"
r"(?P<version>[^/]+)/(?P<subpath>.*)$"
)

register_pattern("github", github_raw_content_pattern)


@purl_router.route("https?://api.github\\.com/repos/.*")
@purl_router.route(r"https?://api\.github\.com/repos/.*")
def build_github_api_purl(url):
"""
Return a PackageURL object from GitHub API `url`.
Expand Down Expand Up @@ -488,54 +504,54 @@ def build_github_api_purl(url):
# https://codeload.github.com/nexB/scancode-toolkit/tar.gz/v3.1.1
# https://codeload.github.com/berngp/grails-rest/zip/release/0.7
github_codeload_pattern = (
r"https?://codeload.github.com/(?P<namespace>.+)/(?P<name>.+)/"
r"(zip|tar.gz|tar.bz2|tgz)/(.*/)*"
r"https?://codeload\.github\.com/(?P<namespace>.+)/(?P<name>.+)/"
r"(zip|tar\.gz|tar\.bz2|tgz)/([^/]*/)*"
Copy link
Author

@ds-cbo ds-cbo Jan 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

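The reason for changing (.*/)* to ([^/]*/)* is to avoid catastrophic backtracking (ReDoS): inside the repeated group, .* can itself match "/", so the same run of path segments can be split up in exponentially many ways when the overall match fails. Restricting each repetition to [^/]*/ makes every segment match unambiguously. Background can be found here: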

https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS

r"(?P<version_prefix>v|V?)(?P<version>.+)$"
)

register_pattern("github", github_codeload_pattern)


@purl_router.route("https?://github\\.com/.*")
@purl_router.route(r"https?://github\.com/.*")
def build_github_purl(url):
"""
Return a PackageURL object from GitHub `url`.
"""

# https://github.com/nexB/scancode-toolkit/archive/v3.1.1.zip
archive_pattern = (
r"https?://github.com/(?P<namespace>.+)/(?P<name>.+)"
r"/archive/(.*/)*"
r"https?://github\.com/(?P<namespace>.+)/(?P<name>.+)"
r"/archive/([^/]*/)*"
r"((?P=name)(-|_|@))?"
r"(?P<version_prefix>v|V?)(?P<version>.+).(zip|tar.gz|tar.bz2|.tgz)"
r"(?P<version_prefix>v|V?)(?P<version>.+)\.(zip|tar\.gz|tar\.bz2|tgz)"
)

# https://github.com/downloads/mozilla/rhino/rhino1_7R4.zip
download_pattern = (
r"https?://github.com/downloads/(?P<namespace>.+)/(?P<name>.+)/"
r"https?://github\.com/downloads/(?P<namespace>.+)/(?P<name>.+)/"
r"((?P=name)(-|@)?)?"
r"(?P<version_prefix>v|V?)(?P<version>.+).(zip|tar.gz|tar.bz2|.tgz)"
r"(?P<version_prefix>v|V?)(?P<version>.+)\.(zip|tar\.gz|tar\.bz2|tgz)"
Copy link
Author

@ds-cbo ds-cbo Jan 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

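This also fixes a bug with .tgz files. Previously, the pattern would match

v1.2.3.tgz as { version="1.2.", extension=".tgz" }
instead of { version="1.2.3", extension="tgz" }

because it contained two unescaped dots here: the separator before the extension group and the leading dot of the ".tgz" alternative. Each unescaped dot matches any character, so together they consumed the final "3" and the literal "." of the file name.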

See for clarification: https://regex101.com/r/luaRIy/1

)

# https://github.com/pypa/get-virtualenv/raw/20.0.31/public/virtualenv.pyz
raw_pattern = (
r"https?://github.com/(?P<namespace>.+)/(?P<name>.+)"
r"https?://github\.com/(?P<namespace>.+)/(?P<name>.+)"
r"/raw/(?P<version_prefix>v|V?)(?P<version>[^/]+)/(?P<subpath>.*)$"
)

# https://github.com/fanf2/unifdef/blob/master/unifdef.c
blob_pattern = (
r"https?://github.com/(?P<namespace>.+)/(?P<name>.+)"
r"https?://github\.com/(?P<namespace>.+)/(?P<name>.+)"
r"/blob/(?P<version>[^/]+)/(?P<subpath>.*)$"
)

releases_download_pattern = (
r"https?://github.com/(?P<namespace>.+)/(?P<name>.+)"
r"https?://github\.com/(?P<namespace>.+)/(?P<name>.+)"
r"/releases/download/(?P<version_prefix>v|V?)(?P<version>[^/]+)/.*$"
)

# https://github.com/pombredanne/schematics.git
git_pattern = r"https?://github.com/(?P<namespace>.+)/(?P<name>.+).(git)"
git_pattern = r"https?://github\.com/(?P<namespace>.+)/(?P<name>.+)\.(git)"

patterns = (
archive_pattern,
Expand Down Expand Up @@ -584,7 +600,7 @@ def build_github_purl(url):
)


@purl_router.route("https?://bitbucket\\.org/.*")
@purl_router.route(r"https?://bitbucket\.org/.*")
def build_bitbucket_purl(url):
"""
Return a PackageURL object from BitBucket `url`.
Expand All @@ -602,9 +618,9 @@ def build_bitbucket_purl(url):
name = segments[1]

bitbucket_download_pattern = (
r"https?://bitbucket.org/"
r"https?://bitbucket\.org/"
r"(?P<namespace>.+)/(?P<name>.+)/downloads/"
r"(?P<version>.+).(zip|tar.gz|tar.bz2|.tgz|exe|msi)"
r"(?P<version>.+)\.(zip|tar\.gz|tar\.bz2|tgz|exe|msi)"
)
matches = re.search(bitbucket_download_pattern, url)

Expand Down Expand Up @@ -635,7 +651,7 @@ def build_bitbucket_purl(url):
)


@purl_router.route("https?://gitlab\\.com/(?!.*/archive/).*")
@purl_router.route(r"https?://gitlab\.com/(?!.*/archive/).*")
def build_gitlab_purl(url):
"""
Return a PackageURL object from Gitlab `url`.
Expand Down Expand Up @@ -675,7 +691,7 @@ def build_gitlab_purl(url):

# https://gitlab.com/hoppr/hoppr/-/archive/v1.11.1-dev.2/hoppr-v1.11.1-dev.2.tar.gz
gitlab_archive_pattern = (
r"^https?://gitlab.com/"
r"^https?://gitlab\.com/"
r"(?P<namespace>.+)/(?P<name>.+)/-/archive/(?P<version>.+)/"
r"(?P=name)-(?P=version).*"
r"[^/]$"
Expand All @@ -686,7 +702,7 @@ def build_gitlab_purl(url):

# https://hackage.haskell.org/package/cli-extras-0.2.0.0/cli-extras-0.2.0.0.tar.gz
hackage_download_pattern = (
r"^https?://hackage.haskell.org/package/"
r"^https?://hackage\.haskell\.org/package/"
r"(?P<name>.+)-(?P<version>.+)/"
r"(?P=name)-(?P=version).*"
r"[^/]$"
Expand All @@ -696,13 +712,15 @@ def build_gitlab_purl(url):


# https://hackage.haskell.org/package/cli-extras-0.2.0.0/
hackage_project_pattern = r"^https?://hackage.haskell.org/package/(?P<name>.+)-(?P<version>[^/]+)/"
hackage_project_pattern = (
r"^https?://hackage\.haskell\.org/package/(?P<name>.+)-(?P<version>[^/]+)/"
)

register_pattern("hackage", hackage_project_pattern)


@purl_router.route(
"https?://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/.*"
r"https?://storage\.googleapis\.com/google-code-archive-downloads/v2/code\.google\.com/.*"
)
def build_generic_google_code_archive_purl(uri):
# https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com
Expand Down