MobSF · ajinabraham · Nov 5, 2024 · Sep 10, 2024 · Sep 11, 2024 · Sep 11, 2024
diff --git a/action.md b/action.md
@@ -2,7 +2,7 @@
 
 Update Pipfile
 ```bash
-PIPENV_IGNORE_VIRTUALENVS=1 pipenv lock 
+PIPENV_IGNORE_VIRTUALENVS=1 pipenv lock
 PIPENV_IGNORE_VIRTUALENVS=1 pipenv sync
 PIPENV_IGNORE_VIRTUALENVS=1 pipenv run pip freeze > requirements.txt
 ```
diff --git a/mobsfscan/__init__.py b/mobsfscan/__init__.py
@@ -6,7 +6,7 @@
 __title__ = 'mobsfscan'
 __authors__ = 'Ajin Abraham'
 __copyright__ = f'Copyright {datetime.now().year} Ajin Abraham, OpenSecurity'
-__version__ = '0.4.0'
+__version__ = '0.4.1'
 __version_info__ = tuple(int(i) for i in __version__.split('.'))
 __all__ = [
     '__title__',

diff --git a/mobsfscan/formatters/sarif.py b/mobsfscan/formatters/sarif.py
@@ -1,138 +1,109 @@
 # -*- coding: utf_8 -*-
-"""Sarif output format.
+"""SARIF output formatter for MobSF scan results.
 
-Based on https://github.com/microsoft/bandit-sarif-formatter/
-blob/master/bandit_sarif_formatter/formatter.py
+Based on https://github.com/microsoft/
+bandit-sarif-formatter/blob/master/
+bandit_sarif_formatter/formatter.py
+MIT License, Copyright (c) Microsoft Corporation.
 
-Copyright (c) Microsoft.  All Rights Reserved.
-MIT License
-
-Copyright (c) Microsoft Corporation.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE
 """
-from datetime import datetime
+from datetime import datetime, timezone
 from pathlib import PurePath
 import urllib.parse as urlparse
 
 import sarif_om as om
 
 from jschema_to_python.to_json import to_json
 
-
 TS_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
 
 
 def level_from_severity(severity):
-    if severity == 'ERROR':
-        return 'error'
-    elif severity == 'WARNING':
-        return 'warning'
-    elif severity == 'INFO':
-        return 'note'
-    else:
-        return 'none'
+    return {
+        'ERROR': 'error',
+        'WARNING': 'warning',
+        'INFO': 'note',
+    }.get(severity, 'none')
 
 
 def to_uri(file_path):
     pure_path = PurePath(file_path)
     if pure_path.is_absolute():
         return pure_path.as_uri()
     else:
-        posix_path = pure_path.as_posix()  # Replace backslashes with slashes.
-        return urlparse.quote(posix_path)  # %-encode special characters.
+        return urlparse.quote(pure_path.as_posix())
 
 
-def get_rule_name(rule_id):
-    normalized = []
-    noms = rule_id.split('_')
-    for nom in noms:
-        normalized.append(nom.capitalize())
-    return ''.join(normalized)
+def format_rule_name(rule_id):
+    return ''.join(word.capitalize() for word in rule_id.split('_'))
 
 
 def add_results(path, scan_results, run):
     if run.results is None:
         run.results = []
-    res = {}
-    res.update(scan_results.get('results', []))
+    res = scan_results.get('results', {})
     rules = {}
     rule_indices = {}
 
     for rule_id, issue_dict in res.items():
-        result = create_result(path, rule_id, issue_dict, rules, rule_indices)
-        run.results.append(result)
+        rule_results = create_rule_results(
+            path, rule_id, issue_dict, rules, rule_indices)
+        run.results.extend(rule_results)
 
-    if len(rules) > 0:
+    if rules:
         run.tool.driver.rules = list(rules.values())
 
 
-def create_result(path, rule_id, issue_dict, rules, rule_indices):
-    if rule_id in rules:
-        rule = rules[rule_id]
-        rule_index = rule_indices[rule_id]
-    else:
-        doc = issue_dict['metadata'].get('reference')
-        if not doc:
-            doc = ('https://mobile-security.gitbook.io/'
-                   'mobile-security-testing-guide/')
+def create_rule_results(path, rule_id, issue_dict, rules, rule_indices):
+    rule_results = []
+    rule, rule_index = rules.get(rule_id), rule_indices.get(rule_id)
+    ref_url = ('https://mobile-security.gitbook.io/'
+               'mobile-security-testing-guide/')
+    if not rule:
+        doc = issue_dict['metadata'].get('reference') or ref_url
         cwe_id = issue_dict['metadata']['cwe'].split(':')[0].lower()
         rule = om.ReportingDescriptor(
             id=rule_id,
-            name=get_rule_name(rule_id),
+            name=format_rule_name(rule_id),
             help_uri=doc,
-            properties={
-                'tags': ['security', f'external/cwe/{cwe_id}'],
-            },
-        )
+            properties={'tags': ['security', f'external/cwe/{cwe_id}']})
         rule_index = len(rules)
         rules[rule_id] = rule
         rule_indices[rule_id] = rule_index
 
-    locations = []
     for item in issue_dict.get('files', []):
-        physical_location = om.PhysicalLocation(
-            artifact_location=om.ArtifactLocation(
-                uri=to_uri(item['file_path'])),
-        )
-        physical_location.region = om.Region(
-            start_line=item['match_lines'][0],
-            end_line=item['match_lines'][1],
-            start_column=item['match_position'][0],
-            end_column=item['match_position'][1],
-            snippet=om.ArtifactContent(text=item['match_string']),
-        )
-        locations.append(om.Location(physical_location=physical_location))
-    if not locations:
-        artifact = om.PhysicalLocation(
-            artifact_location=om.ArtifactLocation(
-                uri=path[0]),
-        )
-        artifact.region = om.Region(
-            start_line=1,
-            end_line=1,
-            start_column=1,
-            end_column=1,
-            snippet=om.ArtifactContent(text='Missing Best Practice'),
-        )
-        locations.append(om.Location(physical_location=artifact))
-
+        location = create_location(item)
+        rule_results.append(create_result(rule, rule_index, issue_dict, [location]))
+
+    if not issue_dict.get('files'):
+        default_location = om.Location(
+            physical_location=om.PhysicalLocation(
+                artifact_location=om.ArtifactLocation(uri=path[0]),
+                region=om.Region(
+                    start_line=1,
+                    end_line=1,
+                    start_column=1,
+                    end_column=1,
+                    snippet=om.ArtifactContent(text='Missing Best Practice'))))
+        rule_results.append(create_result(
+            rule, rule_index, issue_dict, [default_location]))
+
+    return rule_results
+
+
+def create_location(item):
+    return om.Location(
+        physical_location=om.PhysicalLocation(
+            artifact_location=om.ArtifactLocation(uri=to_uri(item['file_path'])),
+            region=om.Region(
+                start_line=item['match_lines'][0],
+                end_line=item['match_lines'][1],
+                start_column=item['match_position'][0],
+                end_column=item['match_position'][1],
+                snippet=om.ArtifactContent(text=item['match_string']))))
+
+
+def create_result(rule, rule_index, issue_dict, locations):
     return om.Result(
         rule_id=rule.id,
         rule_index=rule_index,
@@ -144,38 +115,34 @@ def create_result(path, rule_id, issue_dict, rules, rule_indices):
             'masvs': issue_dict['metadata']['masvs'],
             'cwe': issue_dict['metadata']['cwe'],
             'reference': issue_dict['metadata']['reference'],
-        },
-    )
+        })
 
 
 def sarif_output(outfile, scan_results, mobsfscan_version, path):
     log = om.SarifLog(
-        schema_uri=('https://raw.githubusercontent.com/oasis-tcs/'
-                    'sarif-spec/master/Schemata/sarif-schema-2.1.0.json'),
+        schema_uri=('https://raw.githubusercontent.com/'
+                    'oasis-tcs/sarif-spec/master/Schemata/'
+                    'sarif-schema-2.1.0.json'),
         version='2.1.0',
-        runs=[
-            om.Run(
-                tool=om.Tool(driver=om.ToolComponent(
-                    name='mobsfscan',
-                    information_uri='https://github.com/MobSF/mobsfscan',
-                    semantic_version=mobsfscan_version,
-                    version=mobsfscan_version),
-                ),
-                invocations=[
-                    om.Invocation(
-                        end_time_utc=datetime.utcnow().strftime(TS_FORMAT),
-                        execution_successful=True,
-                    ),
-                ],
-            ),
-        ],
-    )
+        runs=[om.Run(
+            tool=om.Tool(driver=om.ToolComponent(
+                name='mobsfscan',
+                information_uri='https://github.com/MobSF/mobsfscan',
+                semantic_version=mobsfscan_version,
+                version=mobsfscan_version,
+            )),
+            invocations=[om.Invocation(
+                end_time_utc=datetime.now(timezone.utc).strftime(TS_FORMAT),
+                execution_successful=True,
+            )])])
     run = log.runs[0]
     add_results(path, scan_results, run)
     json_out = to_json(log)
+
     if outfile:
         with open(outfile, 'w') as of:
             of.write(json_out)
     else:
         print(json_out)
+
     return json_out
diff --git a/mobsfscan/mobsfscan.py b/mobsfscan/mobsfscan.py
@@ -127,6 +127,28 @@ def format_output(self, results) -> dict:
         self.post_ignore_rules()
         self.post_ignore_rules_by_severity()
         self.post_ignore_files()
+        self.deduplicate_files()
+
+    def deduplicate_files(self):
+        """Deduplicate files."""
+        for _, details in self.result['results'].items():
+            files = details.get('files')
+            # some results don't have any files,
+            # so we need to check before we continue
+            if files:
+                # "file" here refers to the dictionary containing
+                # the file_path, match_lines, etc.
+                # For each file we create a tuple of its sorted items,
+                # then, using those tuples as keys and
+                # "file" as values, we create a dictionary.
+                # This means that for each unique "file"
+                # we will get only one entry, as we
+                # can't have duplicate keys.
+                # Once this is done, convert the dictionary
+                # back to a list by passing its values to list().
+                unique_files = list(
+                    {tuple(sorted(f.items())): f for f in files}.values())
+                details['files'] = unique_files
 
     def format_semgrep(self, sgrep_output):
         """Format semgrep output."""

diff --git a/tox.ini b/tox.ini
@@ -16,6 +16,7 @@ setenv =
 skip_install = true
 deps =
     pydocstyle
+    autopep8
     flake8
     flake8-broken-line
     flake8-bugbear
@@ -32,6 +33,7 @@ deps =
     pep8-naming
     radon
 commands =
+    autopep8 --recursive --in-place setup.py mobsfscan tests
     flake8 setup.py mobsfscan tests
 
 [testenv:bandit]
@@ -99,3 +101,4 @@ ignore =
     R701,
     # Too complex
 radon_max_cc = 10
+max-line-length = 88