Skip to content

Commit

Permalink
codemod safe lxml parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
clavedeluna committed Oct 6, 2023
1 parent 0790cc2 commit d833a92
Show file tree
Hide file tree
Showing 6 changed files with 207 additions and 0 deletions.
27 changes: 27 additions & 0 deletions integration_tests/test_lxml_safe_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from core_codemods.lxml_safe_parsing import LxmlSafeParsing
from integration_tests.base_test import (
BaseIntegrationTest,
original_and_expected_from_code_path,
)


class TestLxmlSafeParsing(BaseIntegrationTest):
codemod = LxmlSafeParsing
code_path = "tests/samples/lxml_parsing.py"
original_code, expected_new_code = original_and_expected_from_code_path(
code_path,
[
(
1,
'lxml.etree.parse("path_to_file", parser=lxml.etree.XMLParser(resolve_entities=False))\n',
),
(
2,
'lxml.etree.fromstring("xml_str", parser=lxml.etree.XMLParser(resolve_entities=False))\n',
),
],
)
expected_diff = '--- \n+++ \n@@ -1,3 +1,3 @@\n import lxml\n-lxml.etree.parse("path_to_file")\n-lxml.etree.fromstring("xml_str")\n+lxml.etree.parse("path_to_file", parser=lxml.etree.XMLParser(resolve_entities=False))\n+lxml.etree.fromstring("xml_str", parser=lxml.etree.XMLParser(resolve_entities=False))\n'
expected_line_change = "2"
num_changes = 2
change_description = LxmlSafeParsing.CHANGE_DESCRIPTION
2 changes: 2 additions & 0 deletions src/core_codemods/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .jwt_decode_verify import JwtDecodeVerify
from .limit_readline import LimitReadline
from .lxml_safe_parser_defaults import LxmlSafeParserDefaults
from .lxml_safe_parsing import LxmlSafeParsing
from .order_imports import OrderImports
from .process_creation_sandbox import ProcessSandbox
from .remove_unnecessary_f_str import RemoveUnnecessaryFStr
Expand Down Expand Up @@ -37,6 +38,7 @@
JwtDecodeVerify,
LimitReadline,
LxmlSafeParserDefaults,
LxmlSafeParsing,
OrderImports,
ProcessSandbox,
RemoveUnnecessaryFStr,
Expand Down
14 changes: 14 additions & 0 deletions src/core_codemods/docs/pixee_python_safe-lxml-parsing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
This codemod sets the `parser` parameter in calls to `lxml.etree.parse` and `lxml.etree.fromstring`
if omitted or set to `None` (the default value). Unfortunately, the default `parser=None` means `lxml`
will rely on an unsafe parser, making your code potentially vulnerable to entity expansion
attacks and external entity (XXE) attacks.

The changes look as follows:

```diff
import lxml
- lxml.etree.parse("path_to_file")
- lxml.etree.fromstring("xml_str")
+ lxml.etree.parse("path_to_file", parser=lxml.etree.XMLParser(resolve_entities=False))
+ lxml.etree.fromstring("xml_str", parser=lxml.etree.XMLParser(resolve_entities=False))
```
52 changes: 52 additions & 0 deletions src/core_codemods/lxml_safe_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from codemodder.codemods.base_codemod import ReviewGuidance
from codemodder.codemods.api import SemgrepCodemod
from codemodder.codemods.api.helpers import NewArg


class LxmlSafeParsing(SemgrepCodemod):
NAME = "safe-lxml-parsing"
REVIEW_GUIDANCE = ReviewGuidance.MERGE_WITHOUT_REVIEW
SUMMARY = "Use safe parsers in lxml parsing functions"
DESCRIPTION = (
"Call `lxml.etree.parse` and `lxml.etree.fromstring` with a safe parser"
)

@classmethod
def rule(cls):
return """
rules:
- pattern-either:
- patterns:
- pattern: lxml.etree.$FUNC(...)
- pattern-not: lxml.etree.$FUNC(...,parser=..., ...)
- metavariable-pattern:
metavariable: $FUNC
patterns:
- pattern-either:
- pattern: parse
- pattern: fromstring
- pattern-inside: |
import lxml
...
- patterns:
- pattern: lxml.etree.$FUNC(..., parser=None, ...)
- metavariable-pattern:
metavariable: $FUNC
patterns:
- pattern-either:
- pattern: parse
- pattern: fromstring
- pattern-inside: |
import lxml
...
"""

def on_result_found(self, original_node, updated_node):
self.remove_unused_import(original_node)
self.add_needed_import("lxml")
safe_parser = "lxml.etree.XMLParser(resolve_entities=False)"
new_args = self.replace_args(
original_node,
[NewArg(name="parser", value=safe_parser, add_if_missing=True)],
)
return self.update_arg_target(updated_node, new_args)
109 changes: 109 additions & 0 deletions tests/codemods/test_lxml_safe_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import pytest
from core_codemods.lxml_safe_parsing import LxmlSafeParsing
from tests.codemods.base_codemod_test import BaseSemgrepCodemodTest

each_func = pytest.mark.parametrize("func", ["parse", "fromstring"])


class TestLxmlSafeParsing(BaseSemgrepCodemodTest):
codemod = LxmlSafeParsing

def test_name(self):
assert self.codemod.name() == "safe-lxml-parsing"

@each_func
def test_import(self, tmpdir, func):
input_code = f"""import lxml
lxml.etree.{func}("path_to_file")
var = "hello"
"""
expexted_output = f"""import lxml
lxml.etree.{func}("path_to_file", parser=lxml.etree.XMLParser(resolve_entities=False))
var = "hello"
"""

self.run_and_assert(tmpdir, input_code, expexted_output)

@each_func
def test_from_import(self, tmpdir, func):
input_code = f"""from lxml.etree import {func}
{func}("path_to_file")
var = "hello"
"""
expexted_output = f"""from lxml.etree import {func}
import lxml
{func}("path_to_file", parser=lxml.etree.XMLParser(resolve_entities=False))
var = "hello"
"""

self.run_and_assert(tmpdir, input_code, expexted_output)

@each_func
def test_from_import_module(self, tmpdir, func):
input_code = f"""from lxml import etree
etree.{func}("path_to_file")
var = "hello"
"""
expexted_output = f"""from lxml import etree
import lxml
etree.{func}("path_to_file", parser=lxml.etree.XMLParser(resolve_entities=False))
var = "hello"
"""

self.run_and_assert(tmpdir, input_code, expexted_output)

@each_func
def test_import_alias(self, tmpdir, func):
input_code = f"""from lxml.etree import {func} as func
func("path_to_file")
var = "hello"
"""
expexted_output = f"""from lxml.etree import {func} as func
import lxml
func("path_to_file", parser=lxml.etree.XMLParser(resolve_entities=False))
var = "hello"
"""

self.run_and_assert(tmpdir, input_code, expexted_output)

@pytest.mark.parametrize(
"input_args,expected_args",
[
(
"'str', parser=None",
"'str', parser=lxml.etree.XMLParser(resolve_entities=False)",
),
(
"source, base_url='url', parser=None",
"source, base_url='url', parser=lxml.etree.XMLParser(resolve_entities=False)",
),
(
"source, parser=lxml.etree.XMLParser(resolve_entities=False)",
"source, parser=lxml.etree.XMLParser(resolve_entities=False)",
),
# This case would be changed by `safe-lxml-parser-defaults` codemod
(
"source, parser=lxml.etree.XMLParser()",
"source, parser=lxml.etree.XMLParser()",
),
],
)
@each_func
def test_verify_variations(self, tmpdir, func, input_args, expected_args):
input_code = f"""import lxml
lxml.etree.{func}({input_args})
var = "hello"
"""
expexted_output = f"""import lxml
lxml.etree.{func}({expected_args})
var = "hello"
"""
self.run_and_assert(tmpdir, input_code, expexted_output)
3 changes: 3 additions & 0 deletions tests/samples/lxml_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import lxml
lxml.etree.parse("path_to_file")
lxml.etree.fromstring("xml_str")

0 comments on commit d833a92

Please sign in to comment.